diff --git a/0001-cmd-link-internal-add-support-for-internal-linking-o.patch b/0001-cmd-link-internal-add-support-for-internal-linking-o.patch new file mode 100644 index 0000000000000000000000000000000000000000..8f49bc5378a4e86ad82b6008e3e5688252365a39 --- /dev/null +++ b/0001-cmd-link-internal-add-support-for-internal-linking-o.patch @@ -0,0 +1,457 @@ +From 2730907e506ac1fdcc25fbb263df89a03c12b309 Mon Sep 17 00:00:00 2001 +From: limeidan +Date: Mon, 9 Oct 2023 17:31:14 +0800 +Subject: [PATCH 01/44] cmd/link/internal: add support for internal linking on + loong64 + +Change-Id: Ic0d36f27481ac707d04aaf7001f26061e510dd8f +--- + src/cmd/link/internal/loadelf/ldelf.go | 24 ++ + src/cmd/link/internal/loong64/asm.go | 356 ++++++++++++++++++++++++- + 2 files changed, 375 insertions(+), 5 deletions(-) + +diff --git a/src/cmd/link/internal/loadelf/ldelf.go b/src/cmd/link/internal/loadelf/ldelf.go +index e0363b5535..be14cc3bb2 100644 +--- a/src/cmd/link/internal/loadelf/ldelf.go ++++ b/src/cmd/link/internal/loadelf/ldelf.go +@@ -602,6 +602,11 @@ func Load(l *loader.Loader, arch *sys.Arch, localSymVersion int, f *bio.Reader, + // See https://sourceware.org/bugzilla/show_bug.cgi?id=21809 + continue + } ++ ++ if arch.Family == sys.Loong64 && (strings.HasPrefix(elfsym.name, ".L") || elfsym.name == "L0\001") { ++ // Symbols generated by the relax feature of gcc and binutils on loong64. ++ continue ++ } + } + + if strings.HasPrefix(elfsym.name, ".Linfo_string") { +@@ -682,6 +687,12 @@ func Load(l *loader.Loader, arch *sys.Arch, localSymVersion int, f *bio.Reader, + l.SetAttrOnList(s, true) + textp = append(textp, s) + for ss := l.SubSym(s); ss != 0; ss = l.SubSym(ss) { ++ if arch.Family == sys.Loong64 && (strings.HasPrefix(l.SymName(ss), ".L") || l.SymName(ss) == "L0\001") { ++ // Symbols generated by the relax feature of gcc and binutils on loong64. ++ // We ignore them here because there are too many symbols of this type, ++ // resulting in insufficient space in findfunctable. ++ continue ++ } + if l.AttrOnList(ss) { + return errorf("symbol %s listed multiple times", + l.SymName(ss)) +@@ -1018,7 +1029,14 @@ func relSize(arch *sys.Arch, pn string, elftype uint32) (uint8, uint8, error) { + MIPS64 | uint32(elf.R_MIPS_PC32)<<16: + return 4, 4, nil + ++ // These are informational annotations to assist linker optimizations. 
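++	// They carry no payload to apply here, so a zero size is reported
++	// and the relocations are effectively skipped by the loader.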
++	case LOONG64 | uint32(elf.R_LARCH_ALIGN)<<16,
++		LOONG64 | uint32(elf.R_LARCH_RELAX)<<16:
++		return 0, 0, nil
++
+	case LOONG64 | uint32(elf.R_LARCH_ADD8)<<16,
++		LOONG64 | uint32(elf.R_LARCH_ADD6)<<16,
++		LOONG64 | uint32(elf.R_LARCH_SUB6)<<16,
+		LOONG64 | uint32(elf.R_LARCH_SUB8)<<16:
+		return 1, 1, nil
+
+@@ -1032,7 +1050,13 @@ func relSize(arch *sys.Arch, pn string, elftype uint32) (uint8, uint8, error) {
+		LOONG64 | uint32(elf.R_LARCH_ADD32)<<16,
+		LOONG64 | uint32(elf.R_LARCH_SUB24)<<16,
+		LOONG64 | uint32(elf.R_LARCH_SUB32)<<16,
++		LOONG64 | uint32(elf.R_LARCH_B16)<<16,
++		LOONG64 | uint32(elf.R_LARCH_B21)<<16,
+		LOONG64 | uint32(elf.R_LARCH_B26)<<16,
++		LOONG64 | uint32(elf.R_LARCH_PCALA_HI20)<<16,
++		LOONG64 | uint32(elf.R_LARCH_PCALA_LO12)<<16,
++		LOONG64 | uint32(elf.R_LARCH_GOT_PC_HI20)<<16,
++		LOONG64 | uint32(elf.R_LARCH_GOT_PC_LO12)<<16,
+		LOONG64 | uint32(elf.R_LARCH_32_PCREL)<<16:
+		return 4, 4, nil
+
+diff --git a/src/cmd/link/internal/loong64/asm.go b/src/cmd/link/internal/loong64/asm.go
+index 2e69594f92..3a83f1a5ad 100644
+--- a/src/cmd/link/internal/loong64/asm.go
++++ b/src/cmd/link/internal/loong64/asm.go
+@@ -58,10 +58,328 @@ func gentext(ctxt *ld.Link, ldr *loader.Loader) {
+ }
+
+ func adddynrel(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, s loader.Sym, r loader.Reloc, rIdx int) bool {
+-	log.Fatalf("adddynrel not implemented")
++	targ := r.Sym()
++	var targType sym.SymKind
++	if targ != 0 {
++		targType = ldr.SymType(targ)
++	}
++
++	switch r.Type() {
++	default:
++		if r.Type() >= objabi.ElfRelocOffset {
++			ldr.Errorf(s, "adddynrel: unexpected relocation type %d (%s)", r.Type(), sym.RelocName(target.Arch, r.Type()))
++			return false
++		}
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_64):
++		if targType == sym.SDYNIMPORT {
++			ldr.Errorf(s, "unexpected R_LARCH_64 relocation for dynamic symbol %s", ldr.SymName(targ))
++		}
++		su := ldr.MakeSymbolUpdater(s)
++		su.SetRelocType(rIdx, objabi.R_ADDR)
++		if target.IsPIE() && target.IsInternal() {
++			// For internal linking PIE, this R_ADDR relocation cannot
++			// be resolved statically. We need to generate a dynamic
++			// relocation. Let the code below handle it.
++			break
++		}
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_B26):
++		if targType == sym.SDYNIMPORT {
++			addpltsym(target, ldr, syms, targ)
++			su := ldr.MakeSymbolUpdater(s)
++			su.SetRelocSym(rIdx, syms.PLT)
++			su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymPlt(targ)))
++		}
++		if targType == 0 || targType == sym.SXREF {
++			ldr.Errorf(s, "unknown symbol %s in callloong64", ldr.SymName(targ))
++		}
++		su := ldr.MakeSymbolUpdater(s)
++		su.SetRelocType(rIdx, objabi.R_CALLLOONG64)
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_GOT_PC_HI20),
++		objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_GOT_PC_LO12):
++		if targType != sym.SDYNIMPORT {
++			// TODO: turn LDR of GOT entry into ADR of symbol itself
++		}
++
++		ld.AddGotSym(target, ldr, syms, targ, uint32(elf.R_LARCH_64))
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_LARCH_GOT_PC_HI20) {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_HI)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_LO)
++		}
++		su.SetRelocSym(rIdx, syms.GOT)
++		su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymGot(targ)))
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_PCALA_HI20),
++		objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_PCALA_LO12):
++		if targType == sym.SDYNIMPORT {
++			ldr.Errorf(s, "unexpected relocation for dynamic symbol %s", ldr.SymName(targ))
++		}
++		if targType == 0 || targType == sym.SXREF {
++			ldr.Errorf(s, "unknown symbol %s", ldr.SymName(targ))
++		}
++
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_LARCH_PCALA_HI20) {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_HI)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_LO)
++		}
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_ADD64),
++		objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_SUB64):
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_LARCH_ADD64) {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADD64)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_SUB64)
++		}
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_B16),
++		objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_B21):
++		if targType == sym.SDYNIMPORT {
++			addpltsym(target, ldr, syms, targ)
++			su := ldr.MakeSymbolUpdater(s)
++			su.SetRelocSym(rIdx, syms.PLT)
++			su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymPlt(targ)))
++		}
++		if targType == 0 || targType == sym.SXREF {
++			ldr.Errorf(s, "unknown symbol %s in R_JMPxxLOONG64", ldr.SymName(targ))
++		}
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_LARCH_B16) {
++			su.SetRelocType(rIdx, objabi.R_JMP16LOONG64)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_JMP21LOONG64)
++		}
++		return true
++	}
++
++	relocs := ldr.Relocs(s)
++	r = relocs.At(rIdx)
++
++	switch r.Type() {
++	case objabi.R_CALLLOONG64:
++		if targType != sym.SDYNIMPORT {
++			return true
++		}
++		if target.IsExternal() {
++			return true
++		}
++
++		// Internal linking.
++		if r.Add() != 0 {
++			ldr.Errorf(s, "PLT call with non-zero addend (%v)", r.Add())
++		}
++
++		// Build a PLT entry and change the relocation target to that entry.
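++		// (addpltsym below creates the entry on first use; ldr.SymPlt
++		// then yields the entry's offset within .plt.)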
++		addpltsym(target, ldr, syms, targ)
++		su := ldr.MakeSymbolUpdater(s)
++		su.SetRelocSym(rIdx, syms.PLT)
++		su.SetRelocAdd(rIdx, int64(ldr.SymPlt(targ)))
++		return true
++
++	case objabi.R_ADDR:
++		if ldr.SymType(s) == sym.STEXT && target.IsElf() {
++			// The code is asking for the address of an external
++			// function. We provide it with the address of the
++			// corresponding GOT symbol.
++			ld.AddGotSym(target, ldr, syms, targ, uint32(elf.R_LARCH_64))
++			su := ldr.MakeSymbolUpdater(s)
++			su.SetRelocSym(rIdx, syms.GOT)
++			su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymGot(targ)))
++			return true
++		}
++
++		// Process dynamic relocations for the data sections.
++		if target.IsPIE() && target.IsInternal() {
++			// When internally linking, generate dynamic relocations
++			// for all typical R_ADDR relocations. The exception
++			// are those R_ADDR that are created as part of generating
++			// the dynamic relocations and must be resolved statically.
++			//
++			// There are three phases relevant to understanding this:
++			//
++			//	dodata()  // we are here
++			//	address() // symbol address assignment
++			//	reloc()   // resolution of static R_ADDR relocs
++			//
++			// At this point symbol addresses have not been
++			// assigned yet (as the final size of the .rela section
++			// will affect the addresses), and so we cannot write
++			// the Elf64_Rela.r_offset now. Instead we delay it
++			// until after the 'address' phase of the linker is
++			// complete. We do this via Addaddrplus, which creates
++			// a new R_ADDR relocation which will be resolved in
++			// the 'reloc' phase.
++			//
++			// These synthetic static R_ADDR relocs must be skipped
++			// now, or else we will be caught in an infinite loop
++			// of generating synthetic relocs for our synthetic
++			// relocs.
++			//
++			// Furthermore, the rela sections contain dynamic
++			// relocations with R_ADDR relocations on
++			// Elf64_Rela.r_offset. This field should contain the
++			// symbol offset as determined by reloc(), not the
++			// final dynamically linked address as a dynamic
++			// relocation would provide.
++			switch ldr.SymName(s) {
++			case ".dynsym", ".rela", ".rela.plt", ".got.plt", ".dynamic":
++				return false
++			}
++		} else {
++			// Either internally linking a static executable,
++			// in which case we can resolve these relocations
++			// statically in the 'reloc' phase, or externally
++			// linking, in which case the relocation will be
++			// prepared in the 'reloc' phase and passed to the
++			// external linker in the 'asmb' phase.
++			if ldr.SymType(s) != sym.SDATA && ldr.SymType(s) != sym.SRODATA {
++				break
++			}
++		}
++
++		if target.IsElf() {
++			// Generate R_LARCH_RELATIVE relocations for best
++			// efficiency in the dynamic linker.
++			//
++			// As noted above, symbol addresses have not been
++			// assigned yet, so we can't generate the final reloc
++			// entry yet. We ultimately want:
++			//
++			//	r_offset = s + r.Off
++			//	r_info = R_LARCH_RELATIVE
++			//	r_addend = targ + r.Add
++			//
++			// The dynamic linker will set *offset = base address +
++			// addend.
++			//
++			// AddAddrPlus is used for r_offset and r_addend to
++			// generate new R_ADDR relocations that will update
++			// these fields in the 'reloc' phase.
++			rela := ldr.MakeSymbolUpdater(syms.Rela)
++			rela.AddAddrPlus(target.Arch, s, int64(r.Off()))
++			if r.Siz() == 8 {
++				rela.AddUint64(target.Arch, elf.R_INFO(0, uint32(elf.R_LARCH_RELATIVE)))
++			} else {
++				ldr.Errorf(s, "unexpected relocation for dynamic symbol %s", ldr.SymName(targ))
++			}
++			rela.AddAddrPlus(target.Arch, targ, int64(r.Add()))
++			return true
++		}
++
++	case objabi.R_LOONG64_GOT_HI,
++		objabi.R_LOONG64_GOT_LO:
++		ld.AddGotSym(target, ldr, syms, targ, uint32(elf.R_LARCH_64))
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.R_LOONG64_GOT_HI {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_HI)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_LO)
++		}
++		su.SetRelocSym(rIdx, syms.GOT)
++		su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymGot(targ)))
++		return true
++	}
+	return false
+ }
+
++func elfsetupplt(ctxt *ld.Link, ldr *loader.Loader, plt, gotplt *loader.SymbolBuilder, dynamic loader.Sym) {
++	if plt.Size() == 0 {
++		// pcalau12i $r14, imm
++		plt.AddSymRef(ctxt.Arch, gotplt.Sym(), 0, objabi.R_LOONG64_ADDR_HI, 4)
++		plt.SetUint32(ctxt.Arch, plt.Size()-4, 0x1a00000e)
++
++		// sub.d $r13, $r13, $r15
++		plt.AddUint32(ctxt.Arch, 0x0011bdad)
++
++		// ld.d $r15, $r14, imm
++		plt.AddSymRef(ctxt.Arch, gotplt.Sym(), 0, objabi.R_LOONG64_ADDR_LO, 4)
++		plt.SetUint32(ctxt.Arch, plt.Size()-4, 0x28c001cf)
++
++		// addi.d $r13, $r13, -40
++		plt.AddUint32(ctxt.Arch, 0x02ff61ad)
++
++		// addi.d $r12, $r14, imm
++		plt.AddSymRef(ctxt.Arch, gotplt.Sym(), 0, objabi.R_LOONG64_ADDR_LO, 4)
++		plt.SetUint32(ctxt.Arch, plt.Size()-4, 0x2c001cc)
++
++		// srli.d $r13, $r13, 1
++		plt.AddUint32(ctxt.Arch, 0x004505ad)
++
++		// ld.d $r12, $r12, 8
++		plt.AddUint32(ctxt.Arch, 0x28c0218c)
++
++		// jirl $r0, $r15, 0
++		plt.AddUint32(ctxt.Arch, 0x4c0001e0)
++
++		// check gotplt.size == 0
++		if gotplt.Size() != 0 {
++			ctxt.Errorf(gotplt.Sym(), "got.plt is not empty at the very beginning")
++		}
++
++		gotplt.AddUint64(ctxt.Arch, 0)
++		gotplt.AddUint64(ctxt.Arch, 0)
++	}
++}
++
++func addpltsym(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, s loader.Sym) {
++	if ldr.SymPlt(s) >= 0 {
++		return
++	}
++
++	ld.Adddynsym(ldr, target, syms, s)
++
++	if target.IsElf() {
++		plt := ldr.MakeSymbolUpdater(syms.PLT)
++		gotplt := ldr.MakeSymbolUpdater(syms.GOTPLT)
++		rela := ldr.MakeSymbolUpdater(syms.RelaPLT)
++		if plt.Size() == 0 {
++			panic("plt is not set up")
++		}
++
++		// pcalau12i $r15, imm
++		plt.AddAddrPlus4(target.Arch, gotplt.Sym(), gotplt.Size())
++		plt.SetUint32(target.Arch, plt.Size()-4, 0x1a00000f)
++		relocs := plt.Relocs()
++		plt.SetRelocType(relocs.Count()-1, objabi.R_LOONG64_ADDR_HI)
++
++		// ld.d $r15, $r15, imm
++		plt.AddAddrPlus4(target.Arch, gotplt.Sym(), gotplt.Size())
++		plt.SetUint32(target.Arch, plt.Size()-4, 0x28c001ef)
++		relocs = plt.Relocs()
++		plt.SetRelocType(relocs.Count()-1, objabi.R_LOONG64_ADDR_LO)
++
++		// pcaddu12i $r13, 0
++		plt.AddUint32(target.Arch, 0x1c00000d)
++
++		// jirl r0, r15, 0
++		plt.AddUint32(target.Arch, 0x4c0001e0)
++
++		// add to got.plt: pointer to plt[0]
++		gotplt.AddAddrPlus(target.Arch, plt.Sym(), 0)
++
++		// rela
++		rela.AddAddrPlus(target.Arch, gotplt.Sym(), gotplt.Size()-8)
++		sDynid := ldr.SymDynid(s)
++		rela.AddUint64(target.Arch, elf.R_INFO(uint32(sDynid), uint32(elf.R_LARCH_JUMP_SLOT)))
++		rela.AddUint64(target.Arch, 0)
++
++		ldr.SetPlt(s, int32(plt.Size()-16))
++	} else {
++		ldr.Errorf(s, "addpltsym: unsupported binary format")
++	}
++}
++
+ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, r loader.ExtReloc, 
ri int, sectoff int64) bool { + // loong64 ELF relocation (endian neutral) + // offset uint64 +@@ -134,10 +452,6 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, + return true + } + +-func elfsetupplt(ctxt *ld.Link, ldr *loader.Loader, plt, gotplt *loader.SymbolBuilder, dynamic loader.Sym) { +- return +-} +- + func machoreloc1(*sys.Arch, *ld.OutBuf, *loader.Loader, loader.Sym, loader.ExtReloc, int64) bool { + return false + } +@@ -197,6 +511,38 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade + pc := ldr.SymValue(s) + int64(r.Off()) + t := ldr.SymAddr(rs) + r.Add() - pc + return int64(val&0xfc000000 | (((t >> 2) & 0xffff) << 10) | (((t >> 2) & 0x3ff0000) >> 16)), noExtReloc, isOk ++ ++ case objabi.R_JMP16LOONG64, ++ objabi.R_JMP21LOONG64: ++ pc := ldr.SymValue(s) + int64(r.Off()) ++ t := ldr.SymAddr(rs) + r.Add() - pc ++ if r.Type() == objabi.R_JMP16LOONG64 { ++ return int64(val&0xfc0003ff | (((t >> 2) & 0xffff) << 10)), noExtReloc, isOk ++ } ++ return int64(val&0xfc0003e0 | (((t >> 2) & 0xffff) << 10) | (((t >> 2) & 0x1f0000) >> 16)), noExtReloc, isOk ++ ++ case objabi.R_LOONG64_TLS_IE_HI, ++ objabi.R_LOONG64_TLS_IE_LO: ++ if target.IsPIE() && target.IsElf() { ++ if !target.IsLinux() { ++ ldr.Errorf(s, "TLS reloc on unsupported OS %v", target.HeadType) ++ } ++ t := ldr.SymAddr(rs) + r.Add() ++ if r.Type() == objabi.R_LOONG64_TLS_IE_HI { ++ // pcalau12i -> lu12i.w ++ return (0x14000000 | (val & 0x1f) | ((t >> 12) << 5)), noExtReloc, isOk ++ } ++ // ld.d -> ori ++ return (0x03800000 | (val & 0x3ff) | ((t & 0xfff) << 10)), noExtReloc, isOk ++ } else { ++ log.Fatalf("cannot handle R_LOONG64_TLS_IE_x (sym %s) when linking internally", ldr.SymName(rs)) ++ } ++ ++ case objabi.R_LOONG64_ADD64, objabi.R_LOONG64_SUB64: ++ if r.Type() == objabi.R_LOONG64_ADD64 { ++ return int64(val + ldr.SymAddr(rs) + r.Add()), noExtReloc, isOk ++ } ++ return int64(val - (ldr.SymAddr(rs) + r.Add())), noExtReloc, isOk + } + + return val, 0, false +-- +2.38.1 + diff --git a/0002-cmd-dist-internal-platform-enable-internal-linking-f.patch b/0002-cmd-dist-internal-platform-enable-internal-linking-f.patch new file mode 100644 index 0000000000000000000000000000000000000000..8f2e4c171a8fe6328bc2ecfa7c3888b65e22339b --- /dev/null +++ b/0002-cmd-dist-internal-platform-enable-internal-linking-f.patch @@ -0,0 +1,83 @@ +From d404dccc7f089ddbd81b95c3d97f19acc6cb0329 Mon Sep 17 00:00:00 2001 +From: limeidan +Date: Mon, 9 Oct 2023 17:32:03 +0800 +Subject: [PATCH 02/44] cmd/dist, internal/platform: enable internal linking + feature and test on loong64 + +Change-Id: Ifea676e9eb44281465832fc4050f6286e50f4543 +--- + src/cmd/dist/build.go | 4 +++- + src/cmd/dist/test.go | 4 ++-- + src/internal/platform/supported.go | 6 ++++-- + 3 files changed, 9 insertions(+), 5 deletions(-) + +diff --git a/src/cmd/dist/build.go b/src/cmd/dist/build.go +index 1f467647f5..b71d6c393e 100644 +--- a/src/cmd/dist/build.go ++++ b/src/cmd/dist/build.go +@@ -624,10 +624,12 @@ func setup() { + func mustLinkExternal(goos, goarch string, cgoEnabled bool) bool { + if cgoEnabled { + switch goarch { +- case "loong64", "mips", "mipsle", "mips64", "mips64le": ++ case "mips", "mipsle", "mips64", "mips64le": + // Internally linking cgo is incomplete on some architectures. + // https://golang.org/issue/14449 + return true ++ case "loong64": ++ return false + case "arm64": + if goos == "windows" { + // windows/arm64 internal linking is not implemented. 
+diff --git a/src/cmd/dist/test.go b/src/cmd/dist/test.go
+index 0c992118f4..9728ef29cb 100644
+--- a/src/cmd/dist/test.go
++++ b/src/cmd/dist/test.go
+@@ -1164,7 +1164,7 @@ func (t *tester) internalLink() bool {
+ 	// Internally linking cgo is incomplete on some architectures.
+ 	// https://golang.org/issue/10373
+ 	// https://golang.org/issue/14449
+-	if goarch == "loong64" || goarch == "mips64" || goarch == "mips64le" || goarch == "mips" || goarch == "mipsle" || goarch == "riscv64" {
++	if goarch == "mips64" || goarch == "mips64le" || goarch == "mips" || goarch == "mipsle" || goarch == "riscv64" {
+ 		return false
+ 	}
+ 	if goos == "aix" {
+@@ -1185,7 +1185,7 @@ func (t *tester) internalLinkPIE() bool {
+ 	}
+ 	switch goos + "-" + goarch {
+ 	case "darwin-amd64", "darwin-arm64",
+-		"linux-amd64", "linux-arm64", "linux-ppc64le",
++		"linux-amd64", "linux-arm64", "linux-loong64", "linux-ppc64le",
+ 		"android-arm64",
+ 		"windows-amd64", "windows-386", "windows-arm":
+ 		return true
+diff --git a/src/internal/platform/supported.go b/src/internal/platform/supported.go
+index e864c37d68..79ed6d4b1c 100644
+--- a/src/internal/platform/supported.go
++++ b/src/internal/platform/supported.go
+@@ -85,10 +85,12 @@ func FuzzInstrumented(goos, goarch string) bool {
+ func MustLinkExternal(goos, goarch string, withCgo bool) bool {
+ 	if withCgo {
+ 		switch goarch {
+-		case "loong64", "mips", "mipsle", "mips64", "mips64le":
++		case "mips", "mipsle", "mips64", "mips64le":
+ 			// Internally linking cgo is incomplete on some architectures.
+ 			// https://go.dev/issue/14449
+ 			return true
++		case "loong64":
++			return false
+ 		case "arm64":
+ 			if goos == "windows" {
+ 				// windows/arm64 internal linking is not implemented.
+@@ -225,7 +227,7 @@ func InternalLinkPIESupported(goos, goarch string) bool {
+ 	switch goos + "/" + goarch {
+ 	case "android/arm64",
+ 		"darwin/amd64", "darwin/arm64",
+-		"linux/amd64", "linux/arm64", "linux/ppc64le",
++		"linux/amd64", "linux/arm64", "linux/loong64", "linux/ppc64le",
+ 		"windows/386", "windows/amd64", "windows/arm", "windows/arm64":
+ 		return true
+ 	}
+--
+2.38.1
+
diff --git a/0003-cmd-runtime-enable-race-detector-on-loong64.patch b/0003-cmd-runtime-enable-race-detector-on-loong64.patch
new file mode 100644
index 0000000000000000000000000000000000000000..0d61dcc5fcb6091e4e881d6ebe0a579b2ba30792
--- /dev/null
+++ b/0003-cmd-runtime-enable-race-detector-on-loong64.patch
@@ -0,0 +1,626 @@
+From f84142ce620b086cc90f728861a76e5066c22ed9 Mon Sep 17 00:00:00 2001
+From: Guoqi Chen
+Date: Sat, 19 Aug 2023 09:22:34 +0800
+Subject: [PATCH 03/44] cmd,runtime: enable race detector on loong64
+
+The race detector depends on LLVM's TSan runtime. Support for building the
+TSan library on linux/loong64 was added in [1], which has been merged into
+the LLVM main branch and is expected to ship in the upcoming LLVM 18.
+ +[1]: https://github.com/llvm/llvm-project/pull/72819 + +Co-authored-by: Xiaolin Zhao +Change-Id: If389318215476890295ed771297c6c088cfc84b3 +--- + src/cmd/dist/test.go | 2 +- + src/internal/platform/supported.go | 2 +- + src/race.bash | 3 +- + src/runtime/asm_loong64.s | 1 + + src/runtime/race/README | 3 +- + src/runtime/race/race.go | 2 +- + src/runtime/race_loong64.s | 509 +++++++++++++++++++++++ + 8 files changed, 517 insertions(+), 5 deletions(-) + create mode 100644 src/runtime/race_loong64.s + +diff --git a/src/cmd/dist/test.go b/src/cmd/dist/test.go +index 9728ef29cb..044268ada0 100644 +--- a/src/cmd/dist/test.go ++++ b/src/cmd/dist/test.go +@@ -1674,7 +1674,7 @@ func (t *tester) makeGOROOTUnwritable() (undo func()) { + func raceDetectorSupported(goos, goarch string) bool { + switch goos { + case "linux": +- return goarch == "amd64" || goarch == "ppc64le" || goarch == "arm64" || goarch == "s390x" ++ return goarch == "amd64" || goarch == "ppc64le" || goarch == "arm64" || goarch == "s390x" || goarch == "loong64" + case "darwin": + return goarch == "amd64" || goarch == "arm64" + case "freebsd", "netbsd", "windows": +diff --git a/src/internal/platform/supported.go b/src/internal/platform/supported.go +index 79ed6d4b1c..52cad096cb 100644 +--- a/src/internal/platform/supported.go ++++ b/src/internal/platform/supported.go +@@ -23,7 +23,7 @@ func (p OSArch) String() string { + func RaceDetectorSupported(goos, goarch string) bool { + switch goos { + case "linux": +- return goarch == "amd64" || goarch == "ppc64le" || goarch == "arm64" || goarch == "s390x" ++ return goarch == "amd64" || goarch == "ppc64le" || goarch == "arm64" || goarch == "s390x" || goarch == "loong64" + case "darwin": + return goarch == "amd64" || goarch == "arm64" + case "freebsd", "netbsd", "windows": +diff --git a/src/race.bash b/src/race.bash +index f1a168bfbb..ae9f57ffd7 100755 +--- a/src/race.bash ++++ b/src/race.bash +@@ -9,7 +9,7 @@ + set -e + + function usage { +- echo 'race detector is only supported on linux/amd64, linux/ppc64le, linux/arm64, linux/s390x, freebsd/amd64, netbsd/amd64, openbsd/amd64, darwin/amd64, and darwin/arm64' 1>&2 ++ echo 'race detector is only supported on linux/amd64, linux/ppc64le, linux/arm64, linux/loong64, linux/s390x, freebsd/amd64, netbsd/amd64, openbsd/amd64, darwin/amd64, and darwin/arm64' 1>&2 + exit 1 + } + +@@ -19,6 +19,7 @@ case $(uname -s -m) in + "Linux x86_64") ;; + "Linux ppc64le") ;; + "Linux aarch64") ;; ++ "Linux loongarch64") ;; + "Linux s390x") ;; + "FreeBSD amd64") ;; + "NetBSD amd64") ;; +diff --git a/src/runtime/asm_loong64.s b/src/runtime/asm_loong64.s +index 1c5ced4512..1bd8276835 100644 +--- a/src/runtime/asm_loong64.s ++++ b/src/runtime/asm_loong64.s +@@ -37,6 +37,7 @@ TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0 + JAL (R25) + + nocgo: ++ JAL runtime·save_g(SB) + // update stackguard after _cgo_init + MOVV (g_stack+stack_lo)(g), R19 + ADDV $const_stackGuard, R19 +diff --git a/src/runtime/race/README b/src/runtime/race/README +index 47c51ca9c1..06865d2b34 100644 +--- a/src/runtime/race/README ++++ b/src/runtime/race/README +@@ -13,5 +13,6 @@ internal/amd64v1/race_windows.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d + internal/amd64v3/race_linux.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. + race_darwin_arm64.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. 
+ race_linux_arm64.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. +-race_linux_ppc64le.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. ++race_linux_loong64.syso built with LLVM 9d3fbf97bef3f19da4e0a047f017b8142f59b3fd and Go 988b718f4130ab5b3ce5a5774e1a58e83c92a163. ++race_linux_ppc64le.syso built with LLVM 41cb504b7c4b18ac15830107431a0c1eec73a6b2 and Go 851ecea4cc99ab276109493477b2c7e30c253ea8. + race_linux_s390x.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. +diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go +index 9c508ebc2b..9fd75424ca 100644 +--- a/src/runtime/race/race.go ++++ b/src/runtime/race/race.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build race && ((linux && (amd64 || arm64 || ppc64le || s390x)) || ((freebsd || netbsd || openbsd || windows) && amd64)) ++//go:build race && ((linux && (amd64 || arm64 || loong64 || ppc64le || s390x)) || ((freebsd || netbsd || openbsd || windows) && amd64)) + + package race + +diff --git a/src/runtime/race_loong64.s b/src/runtime/race_loong64.s +new file mode 100644 +index 0000000000..0512efc045 +--- /dev/null ++++ b/src/runtime/race_loong64.s +@@ -0,0 +1,509 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build race ++ ++#include "go_asm.h" ++#include "funcdata.h" ++#include "textflag.h" ++#include "cgo/abi_loong64.h" ++ ++// The following thunks allow calling the gcc-compiled race runtime directly ++// from Go code without going all the way through cgo. ++// First, it's much faster (up to 50% speedup for real Go programs). ++// Second, it eliminates race-related special cases from cgocall and scheduler. ++// Third, in long-term it will allow to remove cyclic runtime/race dependency on cmd/go. ++ ++// A brief recap of the loong64 calling convention. ++// Arguments are passed in R4...R11, the rest is on stack. ++// Callee-saved registers are: R23...R30. ++// Temporary registers are: R12...R20 ++// SP must be 16-byte aligned. ++ ++// When calling racecalladdr, R20 is the call target address. ++ ++// The race ctx, ThreadState *thr below, is passed in R4 and loaded in racecalladdr. ++ ++// Load g from TLS. (See tls_loong64.s) ++#define load_g \ ++ MOVV runtime·tls_g(SB), g ++ ++#define RARG0 R4 ++#define RARG1 R5 ++#define RARG2 R6 ++#define RARG3 R7 ++#define RCALL R20 ++ ++// func runtime·raceread(addr uintptr) ++// Called from instrumented code. ++// Defined as ABIInternal so as to avoid introducing a wrapper, ++// which would make caller's PC ineffective. ++TEXT runtime·raceread(SB), NOSPLIT, $0-8 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R4, RARG1 ++#else ++ MOVV addr+0(FP), RARG1 ++#endif ++ MOVV R1, RARG2 ++ // void __tsan_read(ThreadState *thr, void *addr, void *pc); ++ MOVV $__tsan_read(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·RaceRead(addr uintptr) ++TEXT runtime·RaceRead(SB), NOSPLIT, $0-8 ++ // This needs to be a tail call, because raceread reads caller pc. 
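++	// (A JMP rather than JAL leaves R1, the caller's return address,
++	// untouched, so raceread still records the caller of RaceRead.)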
++ JMP runtime·raceread(SB) ++ ++// func runtime·racereadpc(void *addr, void *callpc, void *pc) ++TEXT runtime·racereadpc(SB), NOSPLIT, $0-24 ++ MOVV addr+0(FP), RARG1 ++ MOVV callpc+8(FP), RARG2 ++ MOVV pc+16(FP), RARG3 ++ // void __tsan_read_pc(ThreadState *thr, void *addr, void *callpc, void *pc); ++ MOVV $__tsan_read_pc(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·racewrite(addr uintptr) ++// Called from instrumented code. ++// Defined as ABIInternal so as to avoid introducing a wrapper, ++// which would make caller's PC ineffective. ++TEXT runtime·racewrite(SB), NOSPLIT, $0-8 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R4, RARG1 ++#else ++ MOVV addr+0(FP), RARG1 ++#endif ++ MOVV R1, RARG2 ++ // void __tsan_write(ThreadState *thr, void *addr, void *pc); ++ MOVV $__tsan_write(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·RaceWrite(addr uintptr) ++TEXT runtime·RaceWrite(SB), NOSPLIT, $0-8 ++ // This needs to be a tail call, because racewrite reads caller pc. ++ JMP runtime·racewrite(SB) ++ ++// func runtime·racewritepc(void *addr, void *callpc, void *pc) ++TEXT runtime·racewritepc(SB), NOSPLIT, $0-24 ++ MOVV addr+0(FP), RARG1 ++ MOVV callpc+8(FP), RARG2 ++ MOVV pc+16(FP), RARG3 ++ // void __tsan_write_pc(ThreadState *thr, void *addr, void *callpc, void *pc); ++ MOVV $__tsan_write_pc(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·racereadrange(addr, size uintptr) ++// Called from instrumented code. ++// Defined as ABIInternal so as to avoid introducing a wrapper, ++// which would make caller's PC ineffective. ++TEXT runtime·racereadrange(SB), NOSPLIT, $0-16 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R5, RARG2 ++ MOVV R4, RARG1 ++#else ++ MOVV addr+0(FP), RARG1 ++ MOVV size+8(FP), RARG2 ++#endif ++ MOVV R1, RARG3 ++ // void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc); ++ MOVV $__tsan_read_range(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·RaceReadRange(addr, size uintptr) ++TEXT runtime·RaceReadRange(SB), NOSPLIT, $0-16 ++ // This needs to be a tail call, because racereadrange reads caller pc. ++ JMP runtime·racereadrange(SB) ++ ++// func runtime·racereadrangepc1(void *addr, uintptr sz, void *pc) ++TEXT runtime·racereadrangepc1(SB), NOSPLIT, $0-24 ++ MOVV addr+0(FP), RARG1 ++ MOVV size+8(FP), RARG2 ++ MOVV pc+16(FP), RARG3 ++ ADDV $4, RARG3 // pc is function start, tsan wants return address. ++ // void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc); ++ MOVV $__tsan_read_range(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·racewriterange(addr, size uintptr) ++// Called from instrumented code. ++// Defined as ABIInternal so as to avoid introducing a wrapper, ++// which would make caller's PC ineffective. ++TEXT runtime·racewriterange(SB), NOSPLIT, $0-16 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R5, RARG2 ++ MOVV R4, RARG1 ++#else ++ MOVV addr+0(FP), RARG1 ++ MOVV size+8(FP), RARG2 ++#endif ++ MOVV R1, RARG3 ++ // void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc); ++ MOVV $__tsan_write_range(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·RaceWriteRange(addr, size uintptr) ++TEXT runtime·RaceWriteRange(SB), NOSPLIT, $0-16 ++ // This needs to be a tail call, because racewriterange reads caller pc. 
++ JMP runtime·racewriterange(SB) ++ ++// func runtime·racewriterangepc1(void *addr, uintptr sz, void *pc) ++TEXT runtime·racewriterangepc1(SB), NOSPLIT, $0-24 ++ MOVV addr+0(FP), RARG1 ++ MOVV size+8(FP), RARG2 ++ MOVV pc+16(FP), RARG3 ++ ADDV $4, RARG3 // pc is function start, tsan wants return address. ++ // void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc); ++ MOVV $__tsan_write_range(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// Call a __tsan function from Go code. ++// ++// RCALL = tsan function address ++// RARG0 = *ThreadState a.k.a. g_racectx from g ++// RARG1 = addr passed to __tsan function ++// ++// If addr (RARG1) is out of range, do nothing. Otherwise, setup goroutine ++// context and invoke racecall. Other arguments already set. ++TEXT racecalladdr<>(SB), NOSPLIT, $0-0 ++ // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). ++ MOVV runtime·racearenastart(SB), R12 ++ BLT RARG1, R12, data ++ MOVV runtime·racearenaend(SB), R12 ++ BLT RARG1, R12, call ++data: ++ MOVV runtime·racedatastart(SB), R12 ++ BLT RARG1, R12, ret ++ MOVV runtime·racedataend(SB), R12 ++ BGE RARG1, R12, ret ++call: ++ load_g ++ MOVV g_racectx(g), RARG0 ++ JMP racecall<>(SB) ++ret: ++ RET ++ ++// func runtime·racefuncenter(pc uintptr) ++// Called from instrumented code. ++TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R4, RCALL ++#else ++ MOVV callpc+0(FP), RCALL ++#endif ++ JMP racefuncenter<>(SB) ++ ++// Common code for racefuncenter ++// RCALL = caller's return address ++TEXT racefuncenter<>(SB), NOSPLIT, $0-0 ++ load_g ++ MOVV g_racectx(g), RARG0 // goroutine racectx ++ MOVV RCALL, RARG1 ++ // void __tsan_func_enter(ThreadState *thr, void *pc); ++ MOVV $__tsan_func_enter(SB), RCALL ++ JAL racecall<>(SB) ++ RET ++ ++// func runtime·racefuncexit() ++// Called from instrumented code. ++TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0 ++ load_g ++ MOVV g_racectx(g), RARG0 // race context ++ // void __tsan_func_exit(ThreadState *thr); ++ MOVV $__tsan_func_exit(SB), RCALL ++ JMP racecall<>(SB) ++ ++// Atomic operations for sync/atomic package. 
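++// Each entry point below loads the address of the matching
++// __tsan_go_atomicNN thunk into RCALL and delegates to racecallatomic,
++// which range-checks the target address before calling into TSan.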
++// R7 = addr of arguments passed to this function, it can
++// be fetched at 24(R3) in racecallatomic after two times JAL
++// RARG0, RARG1, RARG2 set in racecallatomic
++
++// Load
++TEXT sync∕atomic·LoadInt32(SB), NOSPLIT, $0-12
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_load(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·LoadInt64(SB), NOSPLIT, $0-16
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_load(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·LoadUint32(SB), NOSPLIT, $0-12
++	GO_ARGS
++	JMP sync∕atomic·LoadInt32(SB)
++
++TEXT sync∕atomic·LoadUint64(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·LoadInt64(SB)
++
++TEXT sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·LoadInt64(SB)
++
++TEXT sync∕atomic·LoadPointer(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·LoadInt64(SB)
++
++// Store
++TEXT sync∕atomic·StoreInt32(SB), NOSPLIT, $0-12
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_store(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·StoreInt64(SB), NOSPLIT, $0-16
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_store(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·StoreUint32(SB), NOSPLIT, $0-12
++	GO_ARGS
++	JMP sync∕atomic·StoreInt32(SB)
++
++TEXT sync∕atomic·StoreUint64(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·StoreInt64(SB)
++
++TEXT sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·StoreInt64(SB)
++
++// Swap
++TEXT sync∕atomic·SwapInt32(SB), NOSPLIT, $0-20
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_exchange(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·SwapInt64(SB), NOSPLIT, $0-24
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_exchange(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·SwapUint32(SB), NOSPLIT, $0-20
++	GO_ARGS
++	JMP sync∕atomic·SwapInt32(SB)
++
++TEXT sync∕atomic·SwapUint64(SB), NOSPLIT, $0-24
++	GO_ARGS
++	JMP sync∕atomic·SwapInt64(SB)
++
++TEXT sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-24
++	GO_ARGS
++	JMP sync∕atomic·SwapInt64(SB)
++
++// Add
++TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0-20
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_fetch_add(SB), RCALL
++	JAL racecallatomic<>(SB)
++	MOVW add+8(FP), RARG0	// convert fetch_add to add_fetch
++	MOVW ret+16(FP), RARG1
++	ADD RARG0, RARG1, RARG0
++	MOVW RARG0, ret+16(FP)
++	RET
++
++TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0-24
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_fetch_add(SB), RCALL
++	JAL racecallatomic<>(SB)
++	MOVV add+8(FP), RARG0	// convert fetch_add to add_fetch
++	MOVV ret+16(FP), RARG1
++	ADDV RARG0, RARG1, RARG0
++	MOVV RARG0, ret+16(FP)
++	RET
++
++TEXT sync∕atomic·AddUint32(SB), NOSPLIT, $0-20
++	GO_ARGS
++	JMP sync∕atomic·AddInt32(SB)
++
++TEXT sync∕atomic·AddUint64(SB), NOSPLIT, $0-24
++	GO_ARGS
++	JMP sync∕atomic·AddInt64(SB)
++
++TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-24
++	GO_ARGS
++	JMP sync∕atomic·AddInt64(SB)
++
++// CompareAndSwap
++TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-17
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_compare_exchange(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-25
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_compare_exchange(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-17
++	GO_ARGS
++	JMP sync∕atomic·CompareAndSwapInt32(SB)
++
++TEXT sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-25
++	GO_ARGS
++	JMP sync∕atomic·CompareAndSwapInt64(SB)
++
++TEXT sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, 
$0-25 ++ GO_ARGS ++ JMP sync∕atomic·CompareAndSwapInt64(SB) ++ ++// Generic atomic operation implementation. ++// RCALL = addr of target function ++TEXT racecallatomic<>(SB), NOSPLIT, $0 ++ // Set up these registers ++ // RARG0 = *ThreadState ++ // RARG1 = caller pc ++ // RARG2 = pc ++ // RARG3 = addr of incoming arg list ++ ++ // Trigger SIGSEGV early. ++ MOVV 24(R3), RARG3 // 1st arg is addr. after two times JAL, get it at 24(R3) ++ MOVB (RARG3), R12 // segv here if addr is bad ++ ++ // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). ++ MOVV runtime·racearenastart(SB), R12 ++ BLT RARG3, R12, racecallatomic_data ++ MOVV runtime·racearenaend(SB), R12 ++ BLT RARG3, R12, racecallatomic_ok ++ ++racecallatomic_data: ++ MOVV runtime·racedatastart(SB), R12 ++ BLT RARG3, R12, racecallatomic_ignore ++ MOVV runtime·racedataend(SB), R12 ++ BGE RARG3, R12, racecallatomic_ignore ++ ++racecallatomic_ok: ++ // Addr is within the good range, call the atomic function. ++ load_g ++ MOVV g_racectx(g), RARG0 // goroutine context ++ MOVV 8(R3), RARG1 // caller pc ++ MOVV RCALL, RARG2 // pc ++ ADDV $24, R3, RARG3 ++ JAL racecall<>(SB) // does not return ++ RET ++ ++racecallatomic_ignore: ++ // Addr is outside the good range. ++ // Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op. ++ // An attempt to synchronize on the address would cause crash. ++ MOVV RCALL, R25 // remember the original function ++ MOVV $__tsan_go_ignore_sync_begin(SB), RCALL ++ load_g ++ MOVV g_racectx(g), RARG0 // goroutine context ++ JAL racecall<>(SB) ++ MOVV R25, RCALL // restore the original function ++ ++ // Call the atomic function. ++ // racecall will call LLVM race code which might clobber R22 (g) ++ load_g ++ MOVV g_racectx(g), RARG0 // goroutine context ++ MOVV 8(R3), RARG1 // caller pc ++ MOVV RCALL, RARG2 // pc ++ ADDV $24, R3, RARG3 // arguments ++ JAL racecall<>(SB) ++ ++ // Call __tsan_go_ignore_sync_end. ++ MOVV $__tsan_go_ignore_sync_end(SB), RCALL ++ MOVV g_racectx(g), RARG0 // goroutine context ++ JAL racecall<>(SB) ++ RET ++ ++// func runtime·racecall(void(*f)(...), ...) ++// Calls C function f from race runtime and passes up to 4 arguments to it. ++// The arguments are never heap-object-preserving pointers, so we pretend there are no arguments. ++TEXT runtime·racecall(SB), NOSPLIT, $0-0 ++ MOVV fn+0(FP), RCALL ++ MOVV arg0+8(FP), RARG0 ++ MOVV arg1+16(FP), RARG1 ++ MOVV arg2+24(FP), RARG2 ++ MOVV arg3+32(FP), RARG3 ++ JMP racecall<>(SB) ++ ++// Switches SP to g0 stack and calls (RCALL). Arguments already set. ++TEXT racecall<>(SB), NOSPLIT|NOFRAME, $0-0 ++ MOVV g_m(g), R12 ++ // Switch to g0 stack. ++ MOVV R3, R23 // callee-saved, preserved across the CALL ++ MOVV R1, R24 // callee-saved, preserved across the CALL ++ MOVV m_g0(R12), R13 ++ BEQ R13, g, call // already on g0 ++ MOVV (g_sched+gobuf_sp)(R13), R3 ++call: ++ JAL (RCALL) ++ MOVV R23, R3 ++ JAL (R24) ++ RET ++ ++// C->Go callback thunk that allows to call runtime·racesymbolize from C code. ++// Direct Go->C race call has only switched SP, finish g->g0 switch by setting correct g. ++// The overall effect of Go->C->Go call chain is similar to that of mcall. ++// RARG0 contains command code. RARG1 contains command-specific context. ++// See racecallback for command codes. ++TEXT runtime·racecallbackthunk(SB), NOSPLIT|NOFRAME, $0 ++ // Handle command raceGetProcCmd (0) here. ++ // First, code below assumes that we are on curg, while raceGetProcCmd ++ // can be executed on g0. 
Second, it is called frequently, so will ++ // benefit from this fast path. ++ BNE RARG0, R0, rest ++ MOVV g, R15 ++ load_g ++ MOVV g_m(g), RARG0 ++ MOVV m_p(RARG0), RARG0 ++ MOVV p_raceprocctx(RARG0), RARG0 ++ MOVV RARG0, (RARG1) ++ MOVV R15, g ++ JMP (R1) ++rest: ++ // Save callee-saved registers (Go code won't respect that). ++ // 8(R3) and 16(R3) are for args passed through racecallback ++ ADDV $-176, R3 ++ MOVV R1, 0(R3) ++ ++ SAVE_R22_TO_R31(8*3) ++ SAVE_F24_TO_F31(8*13) ++ // Set g = g0. ++ load_g ++ MOVV g_m(g), R15 ++ MOVV m_g0(R15), R14 ++ BEQ R14, g, noswitch // branch if already on g0 ++ MOVV R14, g ++ ++#ifdef GOEXPERIMENT_regabiargs ++ JAL runtime·racecallback(SB) ++#else ++ MOVV RARG0, 8(R3) // func arg ++ MOVV RARG1, 16(R3) // func arg ++ JAL runtime·racecallback(SB) ++#endif ++ // All registers are smashed after Go code, reload. ++ MOVV g_m(g), R15 ++ MOVV m_curg(R15), g // g = m->curg ++ret: ++ // Restore callee-saved registers. ++ MOVV 0(R3), R1 ++ RESTORE_F24_TO_F31(8*13) ++ RESTORE_R22_TO_R31(8*3) ++ ADDV $176, R3 ++ JMP (R1) ++ ++noswitch: ++ // already on g0 ++#ifdef GOEXPERIMENT_regabiargs ++ JAL runtime·racecallback(SB) ++#else ++ MOVV RARG0, 8(R3) // func arg ++ MOVV RARG1, 16(R3) // func arg ++ JAL runtime·racecallback(SB) ++#endif ++ JMP ret ++ ++// tls_g, g value for each thread in TLS ++GLOBL runtime·tls_g+0(SB), TLSBSS+DUPOK, $8 +-- +2.38.1 + diff --git a/0004-runtime-delete-on-register-ABI-fallback-path-for-rac.patch b/0004-runtime-delete-on-register-ABI-fallback-path-for-rac.patch new file mode 100644 index 0000000000000000000000000000000000000000..54922d9071f7d3b9635639e6d87b4e67ab1d5546 --- /dev/null +++ b/0004-runtime-delete-on-register-ABI-fallback-path-for-rac.patch @@ -0,0 +1,111 @@ +From 5623cd585fd5891d1f6d6d93256e4252b95b9dae Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Mon, 6 Nov 2023 17:13:43 +0800 +Subject: [PATCH 04/44] runtime: delete on-register ABI fallback path for race + of loong64 + +Co-authored-by: Xiaolin Zhao +Change-Id: Ie8c4a137205e29dd7dc63825f502b1f6b2f1c205 +--- + src/runtime/race_loong64.s | 34 ---------------------------------- + 1 file changed, 34 deletions(-) + +diff --git a/src/runtime/race_loong64.s b/src/runtime/race_loong64.s +index 0512efc045..04f264b21b 100644 +--- a/src/runtime/race_loong64.s ++++ b/src/runtime/race_loong64.s +@@ -40,11 +40,7 @@ + // Defined as ABIInternal so as to avoid introducing a wrapper, + // which would make caller's PC ineffective. + TEXT runtime·raceread(SB), NOSPLIT, $0-8 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R4, RARG1 +-#else +- MOVV addr+0(FP), RARG1 +-#endif + MOVV R1, RARG2 + // void __tsan_read(ThreadState *thr, void *addr, void *pc); + MOVV $__tsan_read(SB), RCALL +@@ -69,11 +65,7 @@ TEXT runtime·racereadpc(SB), NOSPLIT, $0-24 + // Defined as ABIInternal so as to avoid introducing a wrapper, + // which would make caller's PC ineffective. + TEXT runtime·racewrite(SB), NOSPLIT, $0-8 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R4, RARG1 +-#else +- MOVV addr+0(FP), RARG1 +-#endif + MOVV R1, RARG2 + // void __tsan_write(ThreadState *thr, void *addr, void *pc); + MOVV $__tsan_write(SB), RCALL +@@ -98,13 +90,8 @@ TEXT runtime·racewritepc(SB), NOSPLIT, $0-24 + // Defined as ABIInternal so as to avoid introducing a wrapper, + // which would make caller's PC ineffective. 
+ TEXT runtime·racereadrange(SB), NOSPLIT, $0-16 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R5, RARG2 + MOVV R4, RARG1 +-#else +- MOVV addr+0(FP), RARG1 +- MOVV size+8(FP), RARG2 +-#endif + MOVV R1, RARG3 + // void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc); + MOVV $__tsan_read_range(SB), RCALL +@@ -130,13 +117,8 @@ TEXT runtime·racereadrangepc1(SB), NOSPLIT, $0-24 + // Defined as ABIInternal so as to avoid introducing a wrapper, + // which would make caller's PC ineffective. + TEXT runtime·racewriterange(SB), NOSPLIT, $0-16 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R5, RARG2 + MOVV R4, RARG1 +-#else +- MOVV addr+0(FP), RARG1 +- MOVV size+8(FP), RARG2 +-#endif + MOVV R1, RARG3 + // void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc); + MOVV $__tsan_write_range(SB), RCALL +@@ -186,11 +168,7 @@ ret: + // func runtime·racefuncenter(pc uintptr) + // Called from instrumented code. + TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R4, RCALL +-#else +- MOVV callpc+0(FP), RCALL +-#endif + JMP racefuncenter<>(SB) + + // Common code for racefuncenter +@@ -476,13 +454,7 @@ rest: + BEQ R14, g, noswitch // branch if already on g0 + MOVV R14, g + +-#ifdef GOEXPERIMENT_regabiargs + JAL runtime·racecallback(SB) +-#else +- MOVV RARG0, 8(R3) // func arg +- MOVV RARG1, 16(R3) // func arg +- JAL runtime·racecallback(SB) +-#endif + // All registers are smashed after Go code, reload. + MOVV g_m(g), R15 + MOVV m_curg(R15), g // g = m->curg +@@ -496,13 +468,7 @@ ret: + + noswitch: + // already on g0 +-#ifdef GOEXPERIMENT_regabiargs + JAL runtime·racecallback(SB) +-#else +- MOVV RARG0, 8(R3) // func arg +- MOVV RARG1, 16(R3) // func arg +- JAL runtime·racecallback(SB) +-#endif + JMP ret + + // tls_g, g value for each thread in TLS +-- +2.38.1 + diff --git a/0005-cmd-internal-obj-loong64-remove-unused-register-alia.patch b/0005-cmd-internal-obj-loong64-remove-unused-register-alia.patch new file mode 100644 index 0000000000000000000000000000000000000000..34a43a06ff29f3746fc70132c426bacdfed426e9 --- /dev/null +++ b/0005-cmd-internal-obj-loong64-remove-unused-register-alia.patch @@ -0,0 +1,27 @@ +From 2ecb3ca09093ce12b2e47d97cbff223a950de0bb Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Thu, 16 Nov 2023 17:28:46 +0800 +Subject: [PATCH 05/44] cmd/internal/obj/loong64: remove unused register alias + definitions + +Change-Id: Ie788747372cd47cb3780e75b35750bb08bd166fc +--- + src/cmd/internal/obj/loong64/a.out.go | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index e6984dcba7..53b005af4d 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -225,8 +225,6 @@ const ( + REGZERO = REG_R0 // set to zero + REGLINK = REG_R1 + REGSP = REG_R3 +- REGRET = REG_R20 // not use +- REGARG = -1 // -1 disables passing the first argument in register + REGRT1 = REG_R20 // reserved for runtime, duffzero and duffcopy + REGRT2 = REG_R21 // reserved for runtime, duffcopy + REGCTXT = REG_R29 // context for closures +-- +2.38.1 + diff --git a/0006-internal-bytealg-optimize-IndexByte-and-IndexByteStr.patch b/0006-internal-bytealg-optimize-IndexByte-and-IndexByteStr.patch new file mode 100644 index 0000000000000000000000000000000000000000..b295cb6d9be8b060f1b9ce2adf084cc77b189724 --- /dev/null +++ b/0006-internal-bytealg-optimize-IndexByte-and-IndexByteStr.patch @@ -0,0 +1,160 @@ +From 0b580e45412ffc11f3a1c7ed7165f7a81e51adec 
Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Fri, 17 May 2024 17:10:59 +0800 +Subject: [PATCH 06/44] internal/bytealg: optimize IndexByte and + IndexByteString function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_indexbyte.log │ test/new_3c5000_indexbyte.log │ + │ sec/op │ sec/op vs base │ +IndexByte/10 19.72n ± 0% 13.72n ± 0% -30.44% (p=0.000 n=20) +IndexByte/32 58.27n ± 0% 21.54n ± 0% -63.04% (p=0.000 n=20) +IndexByte/4K 5.609µ ± 0% 2.349µ ± 0% -58.13% (p=0.000 n=20) +IndexByte/4M 3.844m ± 2% 2.408m ± 1% -37.36% (p=0.000 n=20) +IndexByte/64M 62.38m ± 0% 41.83m ± 2% -32.94% (p=0.000 n=20) +geomean 17.29µ 9.309µ -46.17% + +Change-Id: I9d60af0196a0078e829669ccd88f93b5f7a5db0a +--- + src/internal/bytealg/indexbyte_loong64.s | 105 ++++++++++++++++++----- + 1 file changed, 82 insertions(+), 23 deletions(-) + +diff --git a/src/internal/bytealg/indexbyte_loong64.s b/src/internal/bytealg/indexbyte_loong64.s +index c9591b3cda..7811741423 100644 +--- a/src/internal/bytealg/indexbyte_loong64.s ++++ b/src/internal/bytealg/indexbyte_loong64.s +@@ -10,41 +10,100 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + // R5 = b_len + // R6 = b_cap (unused) + // R7 = byte to find +- AND $0xff, R7 ++ ADDV R4, R5 // end + MOVV R4, R6 // store base for later ++ AND $0xff, R7 ++ JMP indexbytebody<>(SB) ++ ++TEXT ·IndexByteString(SB),NOSPLIT,$0-32 ++ // R4 = s_base ++ // R5 = s_len ++ // R6 = byte to find ++ AND $0xff, R6, R7 + ADDV R4, R5 // end +- ADDV $-1, R4 ++ MOVV R4, R6 // store base for later ++ JMP indexbytebody<>(SB) + +- PCALIGN $16 ++// input: ++// R4: b_base ++// R5: end ++// R6: store base for later ++// R7: byte to find ++TEXT indexbytebody<>(SB),NOSPLIT,$0 + loop: ++ ADDV $8, R4, R10 ++ BLT R5, R10, tail ++ MOVV (R4), R8 ++ ++ AND $0xff, R8, R9 ++ BEQ R7, R9, found ++ ++ WORD $0xcf2109 // bstrpick.w r9, r8, 15, 8 ++ BEQ R7, R9, byte_1th ++ ++ WORD $0xd74109 // bstrpick.w r9, r8, 23, 16 ++ BEQ R7, R9, byte_2th ++ ++ WORD $0xdf6109 // bstrpick.w r9, r8, 31, 24 ++ BEQ R7, R9, byte_3th ++ ++ WORD $0xe78109 // bstrpick.w r9, r8, 39, 32 ++ BEQ R7, R9, byte_4th ++ ++ WORD $0xefa109 // bstrpick.w r9, r8, 47, 40 ++ BEQ R7, R9, byte_5th ++ ++ WORD $0xf7c109 // bstrpick.w r9, r8, 55, 48 ++ BEQ R7, R9, byte_6th ++ ++ WORD $0xffe109 // bstrpick.w r9, r8, 63, 56 ++ BEQ R7, R9, byte_7th ++ ++ MOVV R10, R4 ++ JMP loop ++ ++tail: ++ BEQ R4, R5, notfound ++ MOVBU (R4), R8 ++ BEQ R7, R8, found + ADDV $1, R4 +- BEQ R4, R5, notfound +- MOVBU (R4), R8 +- BNE R7, R8, loop ++ JMP tail + +- SUBV R6, R4 // remove base ++byte_1th: ++ ADDV $1, R4 ++ SUBV R6, R4 + RET + +-notfound: +- MOVV $-1, R4 ++byte_2th: ++ ADDV $2, R4 ++ SUBV R6, R4 + RET + +-TEXT ·IndexByteString(SB),NOSPLIT,$0-32 +- // R4 = s_base +- // R5 = s_len +- // R6 = byte to find +- MOVV R4, R7 // store base for later +- ADDV R4, R5 // end +- ADDV $-1, R4 ++byte_3th: ++ ADDV $3, R4 ++ SUBV R6, R4 ++ RET + +- PCALIGN $16 +-loop: +- ADDV $1, R4 +- BEQ R4, R5, notfound +- MOVBU (R4), R8 +- BNE R6, R8, loop ++byte_4th: ++ ADDV $4, R4 ++ SUBV R6, R4 ++ RET ++ ++byte_5th: ++ ADDV $5, R4 ++ SUBV R6, R4 ++ RET + +- SUBV R7, R4 // remove base ++byte_6th: ++ ADDV $6, R4 ++ SUBV R6, R4 ++ RET ++ ++byte_7th: ++ ADDV $7, R4 ++ ++found: ++ SUBV R6, R4 // remove base + RET + + notfound: +-- +2.38.1 + diff --git 
a/0007-internal-bytealg-optimize-memequal-and-memequal_varl.patch b/0007-internal-bytealg-optimize-memequal-and-memequal_varl.patch new file mode 100644 index 0000000000000000000000000000000000000000..7e97b4ae39bffab8dbc8df201de06e31b71fee74 --- /dev/null +++ b/0007-internal-bytealg-optimize-memequal-and-memequal_varl.patch @@ -0,0 +1,142 @@ +From 83f497423050707a8cd27152256699ccd7819456 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Sat, 18 May 2024 11:00:57 +0800 +Subject: [PATCH 07/44] internal/bytealg: optimize memequal and memequal_varlen + function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_equal.log │ test/new_3c5000_equal.log │ + │ sec/op │ sec/op vs base │ +Equal/0 0.6824n ± 0% 0.6837n ± 0% +0.20% (p=0.000 n=20) +Equal/1 10.46n ± 0% 12.71n ± 0% +21.46% (p=0.000 n=20) +Equal/6 17.29n ± 0% 19.57n ± 0% +13.22% (p=0.000 n=20) +Equal/9 21.38n ± 0% 13.19n ± 0% -38.31% (p=0.000 n=20) +Equal/15 29.57n ± 0% 21.39n ± 0% -27.68% (p=0.000 n=20) +Equal/16 30.94n ± 0% 10.46n ± 0% -66.19% (p=0.000 n=20) +Equal/20 36.40n ± 0% 16.83n ± 0% -53.76% (p=0.000 n=20) +Equal/32 52.78n ± 0% 12.28n ± 0% -76.73% (p=0.000 n=20) +Equal/4K 5606.0n ± 0% 385.8n ± 0% -93.12% (p=0.000 n=20) +Equal/4M 5728.9µ ± 0% 746.4µ ± 0% -86.97% (p=0.000 n=20) +Equal/64M 92.02m ± 0% 14.13m ± 5% -84.65% (p=0.000 n=20) +EqualBothUnaligned/64_0 98.73n ± 0% 10.04n ± 0% -89.83% (p=0.000 n=20) +EqualBothUnaligned/64_1 98.73n ± 0% 10.29n ± 0% -89.58% (p=0.000 n=20) +EqualBothUnaligned/64_4 98.73n ± 0% 10.29n ± 0% -89.58% (p=0.000 n=20) +EqualBothUnaligned/64_7 98.73n ± 0% 10.28n ± 0% -89.59% (p=0.000 n=20) +EqualBothUnaligned/4096_0 5602.0n ± 0% 365.8n ± 0% -93.47% (p=0.000 n=20) +EqualBothUnaligned/4096_1 5602.0n ± 0% 437.2n ± 0% -92.19% (p=0.000 n=20) +EqualBothUnaligned/4096_4 5602.0n ± 0% 436.4n ± 0% -92.21% (p=0.000 n=20) +EqualBothUnaligned/4096_7 5602.0n ± 0% 439.2n ± 0% -92.16% (p=0.000 n=20) +EqualBothUnaligned/4194304_0 5729.0µ ± 0% 732.4µ ± 0% -87.22% (p=0.000 n=20) +EqualBothUnaligned/4194304_1 5729.2µ ± 0% 781.8µ ± 1% -86.35% (p=0.000 n=20) +EqualBothUnaligned/4194304_4 5729.3µ ± 0% 773.9µ ± 0% -86.49% (p=0.000 n=20) +EqualBothUnaligned/4194304_7 5729.3µ ± 0% 773.9µ ± 5% -86.49% (p=0.000 n=20) +EqualBothUnaligned/67108864_0 92.38m ± 0% 34.61m ± 38% -62.53% (p=0.000 n=20) +EqualBothUnaligned/67108864_1 92.38m ± 0% 33.07m ± 23% -64.20% (p=0.000 n=20) +EqualBothUnaligned/67108864_4 92.38m ± 0% 82.09m ± 32% -11.14% (p=0.000 n=20) +EqualBothUnaligned/67108864_7 92.39m ± 0% 61.47m ± 16% -33.46% (p=0.000 n=20) +geomean 11.86µ 2.654µ -77.62% + +Change-Id: Ib181f532238e6f6d82a3e9e6987abe121688b6eb +--- + src/internal/bytealg/equal_loong64.s | 72 +++++++++++++++++++--------- + 1 file changed, 49 insertions(+), 23 deletions(-) + +diff --git a/src/internal/bytealg/equal_loong64.s b/src/internal/bytealg/equal_loong64.s +index 830b09bd2c..4cc31d5e46 100644 +--- a/src/internal/bytealg/equal_loong64.s ++++ b/src/internal/bytealg/equal_loong64.s +@@ -9,36 +9,62 @@ + + // memequal(a, b unsafe.Pointer, size uintptr) bool + TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 +- BEQ R4, R5, eq +- ADDV R4, R6, R7 +- PCALIGN $16 +-loop: +- BNE R4, R7, test +- MOVV $1, R4 ++ // R4 = a_base ++ // R5 = b_base ++ // R6 = size ++ JMP equalbody<>(SB) ++ ++// memequal_varlen(a, b unsafe.Pointer) bool ++TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17 ++ // R4 = a_base ++ // R5 = b_base ++ MOVV 
8(REGCTXT), R6 // compiler stores size at offset 8 in the closure ++ JMP equalbody<>(SB) ++ ++TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0 ++ BEQ R4, R5, eq ++ ADDV R4, R6, R6 // end ++ ++loop_16byte: ++ ADDV $16, R4, R9 ++ BLT R6, R9, load8byte ++ MOVV (R4), R7 ++ MOVV (R5), R8 ++ MOVV 8(R4), R10 ++ MOVV 8(R5), R11 ++ MOVV R9, R4 ++ XOR R7, R8, R7 ++ XOR R10, R11, R10 ++ OR R10, R7, R7 ++ ADDV $16, R5 ++ BEQ R7, loop_16byte ++ ++ MOVB R0, R4 + RET +-test: +- MOVBU (R4), R9 ++ ++load8byte: ++ ADDV $8, R4, R9 ++ BLT R6, R9, tail ++ MOVV (R4), R7 ++ MOVV (R5), R8 ++ MOVV R9, R4 ++ ADDV $8, R5 ++ BEQ R7, R8, tail ++ ++ MOVB R0, R4 ++ RET ++ ++tail: ++ BEQ R4, R6, eq ++ MOVBU (R4), R7 ++ MOVBU (R5), R8 + ADDV $1, R4 +- MOVBU (R5), R10 + ADDV $1, R5 +- BEQ R9, R10, loop ++ BEQ R7, R8, tail + + MOVB R0, R4 + RET +-eq: +- MOVV $1, R4 +- RET + +-// memequal_varlen(a, b unsafe.Pointer) bool +-TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17 +- BEQ R4, R5, eq +- MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure +- MOVV R4, 8(R3) +- MOVV R5, 16(R3) +- MOVV R6, 24(R3) +- JAL runtime·memequal(SB) +- MOVBU 32(R3), R4 +- RET + eq: + MOVV $1, R4 + RET +-- +2.38.1 + diff --git a/0008-internal-bytealg-optimize-Index-and-IndexString-func.patch b/0008-internal-bytealg-optimize-Index-and-IndexString-func.patch new file mode 100644 index 0000000000000000000000000000000000000000..4fb2113d1dc87b193fc7af7c42f2ab6d9a4a6047 --- /dev/null +++ b/0008-internal-bytealg-optimize-Index-and-IndexString-func.patch @@ -0,0 +1,299 @@ +From 89d740fe5889c558dbb69b6ac3a80ec38cd5765c Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Thu, 23 May 2024 16:25:06 +0800 +Subject: [PATCH 08/44] internal/bytealg: optimize Index and IndexString + function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_index.log │ test/new_3c5000_index.log │ + │ sec/op │ sec/op vs base │ +Index/10 66.42n ± 0% 20.47n ± 0% -69.18% (p=0.000 n=20) +Index/32 196.1n ± 0% 105.7n ± 0% -46.12% (p=0.000 n=20) +Index/4K 13.622µ ± 0% 5.673µ ± 0% -58.35% (p=0.000 n=20) +Index/4M 14.005m ± 0% 5.734m ± 0% -59.06% (p=0.000 n=20) +Index/64M 224.50m ± 0% 91.94m ± 0% -59.05% (p=0.000 n=20) +IndexEasy/10 21.30n ± 0% 18.66n ± 0% -12.41% (p=0.000 n=20) +IndexEasy/32 41.40n ± 0% 33.91n ± 1% -18.09% (p=0.000 n=20) +IndexEasy/4K 4.141µ ± 4% 2.373µ ± 1% -42.70% (p=0.000 n=20) +IndexEasy/4M 3.830m ± 0% 2.392m ± 0% -37.55% (p=0.000 n=20) +IndexEasy/64M 62.54m ± 1% 39.86m ± 0% -36.26% (p=0.000 n=20) +geomean 29.43µ 15.73µ -46.57% + +goos: linux +goarch: loong64 +pkg: strings +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_indexstring.log │ test/new_3c5000_indexstring.log │ + │ sec/op │ sec/op vs base │ +Index 30.54n ± 0% 16.91n ± 0% -44.64% (p=0.000 n=20) + +Change-Id: I92739ada1637356c6d42761a8a596b0bffec405d +--- + src/internal/bytealg/index_generic.go | 2 +- + src/internal/bytealg/index_loong64.go | 23 ++++ + src/internal/bytealg/index_loong64.s | 190 ++++++++++++++++++++++++++ + src/internal/bytealg/index_native.go | 2 +- + 4 files changed, 215 insertions(+), 2 deletions(-) + create mode 100644 src/internal/bytealg/index_loong64.go + create mode 100644 src/internal/bytealg/index_loong64.s + +diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go +index a59e32938e..2d89c41825 100644 +--- a/src/internal/bytealg/index_generic.go ++++ b/src/internal/bytealg/index_generic.go 
+@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64 ++//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64 && !loong64 + + package bytealg + +diff --git a/src/internal/bytealg/index_loong64.go b/src/internal/bytealg/index_loong64.go +new file mode 100644 +index 0000000000..d6f43eb32c +--- /dev/null ++++ b/src/internal/bytealg/index_loong64.go +@@ -0,0 +1,23 @@ ++// Copyright 2018 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++package bytealg ++ ++// Empirical data shows that using Index can get better ++// performance when len(s) <= 16. ++const MaxBruteForce = 16 ++ ++func init() { ++ // Optimize cases where the length of the substring is less than 32 bytes ++ MaxLen = 32 ++} ++ ++// Cutover reports the number of failures of IndexByte we should tolerate ++// before switching over to Index. ++// n is the number of bytes processed so far. ++// See the bytes.Index implementation for details. ++func Cutover(n int) int { ++ // 1 error per 8 characters, plus a few slop to start. ++ return (n + 16) / 8 ++} +diff --git a/src/internal/bytealg/index_loong64.s b/src/internal/bytealg/index_loong64.s +new file mode 100644 +index 0000000000..221d0332a4 +--- /dev/null ++++ b/src/internal/bytealg/index_loong64.s +@@ -0,0 +1,190 @@ ++// Copyright 2018 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "go_asm.h" ++#include "textflag.h" ++ ++TEXT ·Index(SB),NOSPLIT,$0-56 ++ MOVV R7, R6 // R6 = separator pointer ++ MOVV R8, R7 // R7 = separator length ++ JMP indexbody<>(SB) ++ ++TEXT ·IndexString(SB),NOSPLIT,$0-40 ++ JMP indexbody<>(SB) ++ ++// input: ++// R4 = string ++// R5 = length ++// R6 = separator pointer ++// R7 = separator length (2 <= len <= 32) ++TEXT indexbody<>(SB),NOSPLIT,$0 ++ // main idea is to load 'sep' into separate register(s) ++ // to avoid repeatedly re-load it again and again ++ // for sebsequent substring comparisons ++ SUBV R7, R5, R8 ++ ADDV $1, R4, R9 // store base for later ++ MOVV $8, R5 ++ ADDV R4, R8 // end ++ BLT R5, R7, len_gt_8 ++ ++len_le_8: ++ AND $0x8, R7, R5 ++ BNE R5, len_8 ++ AND $0x4, R7, R5 ++ BNE R5, len_4_7 ++ ++len_2_3: ++ AND $0x1, R7, R5 ++ BNE R5, len_3 ++ ++len_2: ++ MOVHU (R6), R10 ++loop_2: ++ BLT R8, R4, not_found ++ MOVHU (R4), R11 ++ ADDV $1, R4 ++ BNE R10, R11, loop_2 ++ JMP found ++ ++len_3: ++ MOVHU (R6), R10 ++ MOVBU 2(R6), R11 ++loop_3: ++ BLT R8, R4, not_found ++ MOVHU (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_3 ++ MOVBU 1(R4), R12 ++ BNE R11, R12, loop_3 ++ JMP found ++ ++len_4_7: ++ AND $0x2, R7, R5 ++ BNE R5, len_6_7 ++ AND $0x1, R7, R5 ++ BNE R5, len_5 ++ ++len_4: ++ MOVWU (R6), R10 ++loop_4: ++ BLT R8, R4, not_found ++ MOVWU (R4), R11 ++ ADDV $1, R4 ++ BNE R10, R11, loop_4 ++ JMP found ++len_5: ++ MOVWU (R6), R10 ++ MOVBU 4(R6), R11 ++loop_5: ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_5 ++ MOVBU 3(R4), R12 ++ BNE R11, R12, loop_5 ++ JMP found ++ ++len_6_7: ++ AND $0x1, R7, R5 ++ BNE R5, len_7 ++ ++len_6: ++ MOVWU (R6), R10 ++ MOVHU 4(R6), R11 ++loop_6: ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_6 ++ MOVHU 3(R4), R12 ++ BNE R11, R12, loop_6 ++ JMP found ++ ++len_7: ++ MOVWU (R6), R10 ++ MOVWU 3(R6), R11 ++loop_7: ++ 
BLT R8, R4, not_found ++ MOVWU (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_7 ++ MOVWU 2(R4), R12 ++ BNE R11, R12, loop_7 ++ JMP found ++ ++len_8: ++ MOVV (R6), R10 ++loop_8: ++ BLT R8, R4, not_found ++ MOVV (R4), R11 ++ ADDV $1, R4 ++ BNE R10, R11, loop_8 ++ JMP found ++ ++len_gt_8: ++ MOVV $16, R5 ++ BLT R5, R7, len_gt_16 ++ ++len_9_16: ++ MOVV (R6), R10 ++ SUBV $8, R7 ++ MOVV (R6)(R7), R11 ++ SUBV $1, R7 ++loop_9_16: ++ BLT R8, R4, not_found ++ MOVV (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_9_16 ++ MOVV (R4)(R7), R12 ++ BNE R11, R12, loop_9_16 ++ JMP found ++ ++len_gt_16: ++ MOVV $24, R5 ++ BLT R5, R7, len_25_32 ++ ++len_17_24: ++ MOVV (R6), R10 ++ SUBV $8, R7 ++ MOVV 8(R6), R11 ++ MOVV (R6)(R7), R12 ++ SUBV $1, R7 ++loop_17_24: ++ BLT R8, R4, not_found ++ MOVV (R4), R13 ++ ADDV $1, R4 ++ BNE R10, R13, loop_17_24 ++ MOVV 7(R4), R13 ++ BNE R11, R13, loop_17_24 ++ MOVV (R4)(R7), R13 ++ BNE R12, R13, loop_17_24 ++ JMP found ++ ++len_25_32: ++ MOVV (R6), R10 ++ SUBV $8, R7 ++ MOVV 8(R6), R11 ++ MOVV 16(R6), R12 ++ MOVV (R6)(R7), R13 ++ SUBV $1, R7 ++loop_25_32: ++ BLT R8, R4, not_found ++ MOVV (R4), R14 ++ ADDV $1, R4 ++ BNE R10, R14, loop_25_32 ++ MOVV 7(R4), R14 ++ BNE R11, R14, loop_25_32 ++ MOVV 15(R4), R14 ++ BNE R12, R14, loop_25_32 ++ MOVV (R4)(R7), R14 ++ BNE R13, R14, loop_25_32 ++ JMP found ++ ++found: ++ SUBV R9, R4 ++ RET ++ ++not_found: ++ MOVV $-1, R4 ++ RET +diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go +index 59c93f9d12..7aadaabe4e 100644 +--- a/src/internal/bytealg/index_native.go ++++ b/src/internal/bytealg/index_native.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build amd64 || arm64 || s390x || ppc64le || ppc64 ++//go:build amd64 || arm64 || s390x || ppc64le || ppc64 || loong64 + + package bytealg + +-- +2.38.1 + diff --git a/0009-internal-bytealg-optimize-Count-and-CountString-func.patch b/0009-internal-bytealg-optimize-Count-and-CountString-func.patch new file mode 100644 index 0000000000000000000000000000000000000000..9b99d37cca346311046f3085e4133441b5f321ae --- /dev/null +++ b/0009-internal-bytealg-optimize-Count-and-CountString-func.patch @@ -0,0 +1,153 @@ +From 37c73e45ea537b7e8662b968b630a2566b25ae59 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Wed, 29 May 2024 10:49:41 +0800 +Subject: [PATCH 09/44] internal/bytealg: optimize Count and CountString + function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_count.log │ test/new_3c5000_count.log │ + │ sec/op │ sec/op vs base │ +CountSingle/10 16.26n ± 0% 16.26n ± 0% ~ (p=0.653 n=20) +CountSingle/32 41.48n ± 0% 27.48n ± 0% -33.75% (p=0.000 n=20) +CountSingle/4K 4.998µ ± 0% 2.961µ ± 0% -40.76% (p=0.000 n=20) +CountSingle/4M 5.076m ± 0% 3.510m ± 8% -30.84% (p=0.000 n=20) +CountSingle/64M 88.70m ± 0% 58.15m ± 1% -34.45% (p=0.000 n=20) +geomean 17.23µ 12.20µ -29.19% + +Change-Id: Ic60d49fea83c9cf4f9b02bae3ce69b81206c7017 +--- + src/internal/bytealg/count_generic.go | 2 +- + src/internal/bytealg/count_loong64.s | 86 +++++++++++++++++++++++++++ + src/internal/bytealg/count_native.go | 2 +- + 3 files changed, 88 insertions(+), 2 deletions(-) + create mode 100644 src/internal/bytealg/count_loong64.s + +diff --git a/src/internal/bytealg/count_generic.go 
b/src/internal/bytealg/count_generic.go +index 932a7c584c..16f974539c 100644 +--- a/src/internal/bytealg/count_generic.go ++++ b/src/internal/bytealg/count_generic.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !arm && !arm64 && !ppc64le && !ppc64 && !riscv64 && !s390x ++//go:build !amd64 && !arm && !arm64 && !loong64 && !ppc64le && !ppc64 && !riscv64 && !s390x + + package bytealg + +diff --git a/src/internal/bytealg/count_loong64.s b/src/internal/bytealg/count_loong64.s +new file mode 100644 +index 0000000000..ca19c5f343 +--- /dev/null ++++ b/src/internal/bytealg/count_loong64.s +@@ -0,0 +1,86 @@ ++// Copyright 2020 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "go_asm.h" ++#include "textflag.h" ++ ++TEXT ·Count(SB),NOSPLIT,$0-40 ++ // R4 = b_base ++ // R5 = b_len ++ // R6 = b_cap (unused) ++ // R7 = byte to count (want in R6) ++ AND $0xff, R7, R6 ++ JMP countbody<>(SB) ++ ++TEXT ·CountString(SB),NOSPLIT,$0-32 ++ // R4 = s_base ++ // R5 = s_len ++ // R6 = byte to count ++ AND $0xff, R6 ++ JMP countbody<>(SB) ++ ++// input: ++// R4 = s_base ++// R5 = s_len ++// R6 = byte to count ++TEXT countbody<>(SB),NOSPLIT,$0 ++ MOVV R0, R7 // count ++ ADDV R4, R5 // end ++ MOVV $1, R17 ++ ++loop: ++ ADDV $8, R4, R9 ++ BLT R5, R9, tail ++ MOVV (R4), R8 ++ ++ AND $0xff, R8, R10 ++ WORD $0xcf210b // bstrpick.w r11, r8, 15, 8 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 ++ ++ WORD $0xd7410a // bstrpick.w r10, r8, 23, 16 ++ WORD $0xdf610b // bstrpick.w r11, r8, 31, 24 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 ++ ++ WORD $0xe7810a // bstrpick.w r10, r8, 39, 32 ++ WORD $0xefa10b // bstrpick.w r11, r8, 47, 40 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 ++ ++ WORD $0xf7c10a // bstrpick.w r10, r8, 55, 48 ++ WORD $0xffe10b // bstrpick.w r11, r8, 63, 56 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 ++ ++ MOVV R9, R4 ++ JMP loop ++ ++tail: ++ BEQ R4, R5, done ++ MOVBU (R4), R8 ++ ADDV $1, R4 ++ BNE R6, R8, tail ++ ADDV $1, R7 ++ JMP tail ++ ++done: ++ MOVV R7, R4 ++ RET +diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go +index 90189c9fe0..eab64e8950 100644 +--- a/src/internal/bytealg/count_native.go ++++ b/src/internal/bytealg/count_native.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
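The countbody loop just shown stays branch-free in its main loop: each 8-byte word is split into byte lanes with bstrpick.w (emitted as WORD constants, apparently because the assembler did not yet know the mnemonic), each lane is XORed with the target byte so the result is zero only on a match, and MASKNEZ turns that zero test into a 0 or 1 added to the running count. A portable Go sketch of the same scheme (not the actual runtime code):

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func count(s []byte, c byte) int {
        n := 0
        i := 0
        for ; i+8 <= len(s); i += 8 {
            w := binary.LittleEndian.Uint64(s[i:])
            for lane := 0; lane < 8; lane++ {
                b := byte(w >> (8 * lane))      // what bstrpick.w extracts on loong64
                x := b ^ c                      // zero iff this byte matches
                n += int((uint32(x) - 1) >> 31) // 1 when x == 0, else 0; what MASKNEZ computes
            }
        }
        for ; i < len(s); i++ { // byte-at-a-time tail, as in the assembly
            if s[i] == c {
                n++
            }
        }
        return n
    }

    func main() { fmt.Println(count([]byte("abracadabra"), 'a')) } // prints 5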
+ +-//go:build amd64 || arm || arm64 || ppc64le || ppc64 || riscv64 || s390x ++//go:build amd64 || arm || arm64 || loong64 || ppc64le || ppc64 || riscv64 || s390x + + package bytealg + +-- +2.38.1 + diff --git a/0010-internal-bytealg-adjust-the-format-of-assembly-files.patch b/0010-internal-bytealg-adjust-the-format-of-assembly-files.patch new file mode 100644 index 0000000000000000000000000000000000000000..85a21d39280a660722421eccf9ed74d018375699 --- /dev/null +++ b/0010-internal-bytealg-adjust-the-format-of-assembly-files.patch @@ -0,0 +1,583 @@ +From 14ffec301d84da6bcd5ef5757d6cd6445351225e Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Mon, 3 Jun 2024 15:43:32 +0800 +Subject: [PATCH 10/44] internal/bytealg: adjust the format of assembly files + {count, equal, index, indexbyte}_loong64.s + +Change-Id: I19e6650e6595148e449da7a82be6e735c6f01ab6 +--- + src/internal/bytealg/count_loong64.s | 92 +++++++------- + src/internal/bytealg/equal_loong64.s | 42 ++++--- + src/internal/bytealg/index_loong64.s | 148 +++++++++++------------ + src/internal/bytealg/indexbyte_loong64.s | 52 ++++---- + 4 files changed, 169 insertions(+), 165 deletions(-) + +diff --git a/src/internal/bytealg/count_loong64.s b/src/internal/bytealg/count_loong64.s +index ca19c5f343..db8ba2cb24 100644 +--- a/src/internal/bytealg/count_loong64.s ++++ b/src/internal/bytealg/count_loong64.s +@@ -1,4 +1,4 @@ +-// Copyright 2020 The Go Authors. All rights reserved. ++// Copyright 2024 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +@@ -9,77 +9,77 @@ TEXT ·Count(SB),NOSPLIT,$0-40 + // R4 = b_base + // R5 = b_len + // R6 = b_cap (unused) +- // R7 = byte to count (want in R6) +- AND $0xff, R7, R6 +- JMP countbody<>(SB) ++ // R7 = byte to count ++ AND $0xff, R7, R6 ++ JMP countbody<>(SB) + + TEXT ·CountString(SB),NOSPLIT,$0-32 + // R4 = s_base + // R5 = s_len + // R6 = byte to count +- AND $0xff, R6 +- JMP countbody<>(SB) ++ AND $0xff, R6 ++ JMP countbody<>(SB) + + // input: + // R4 = s_base + // R5 = s_len + // R6 = byte to count + TEXT countbody<>(SB),NOSPLIT,$0 +- MOVV R0, R7 // count +- ADDV R4, R5 // end +- MOVV $1, R17 ++ MOVV R0, R7 // count ++ ADDV R4, R5 // end ++ MOVV $1, R17 + + loop: + ADDV $8, R4, R9 +- BLT R5, R9, tail ++ BLT R5, R9, tail + MOVV (R4), R8 + +- AND $0xff, R8, R10 +- WORD $0xcf210b // bstrpick.w r11, r8, 15, 8 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 ++ AND $0xff, R8, R10 ++ WORD $0xcf210b // bstrpick.w r11, r8, 15, 8 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 + +- WORD $0xd7410a // bstrpick.w r10, r8, 23, 16 +- WORD $0xdf610b // bstrpick.w r11, r8, 31, 24 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 ++ WORD $0xd7410a // bstrpick.w r10, r8, 23, 16 ++ WORD $0xdf610b // bstrpick.w r11, r8, 31, 24 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 + +- WORD $0xe7810a // bstrpick.w r10, r8, 39, 32 +- WORD $0xefa10b // bstrpick.w r11, r8, 47, 40 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 ++ WORD $0xe7810a // bstrpick.w r10, r8, 39, 32 ++ WORD $0xefa10b // bstrpick.w r11, r8, 47, 40 ++ XOR R6, 
R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 + +- WORD $0xf7c10a // bstrpick.w r10, r8, 55, 48 +- WORD $0xffe10b // bstrpick.w r11, r8, 63, 56 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 ++ WORD $0xf7c10a // bstrpick.w r10, r8, 55, 48 ++ WORD $0xffe10b // bstrpick.w r11, r8, 63, 56 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 + + MOVV R9, R4 +- JMP loop ++ JMP loop + + tail: +- BEQ R4, R5, done ++ BEQ R4, R5, done + MOVBU (R4), R8 + ADDV $1, R4 +- BNE R6, R8, tail ++ BNE R6, R8, tail + ADDV $1, R7 +- JMP tail ++ JMP tail + + done: + MOVV R7, R4 +diff --git a/src/internal/bytealg/equal_loong64.s b/src/internal/bytealg/equal_loong64.s +index 4cc31d5e46..5d5d591a2c 100644 +--- a/src/internal/bytealg/equal_loong64.s ++++ b/src/internal/bytealg/equal_loong64.s +@@ -12,57 +12,61 @@ TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 + // R4 = a_base + // R5 = b_base + // R6 = size +- JMP equalbody<>(SB) ++ JMP equalbody<>(SB) + + // memequal_varlen(a, b unsafe.Pointer) bool + TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17 + // R4 = a_base + // R5 = b_base +- MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure +- JMP equalbody<>(SB) ++ MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure ++ JMP equalbody<>(SB) + ++// input: ++// R4 = a_base ++// R5 = b_base ++// R6 = size + TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0 +- BEQ R4, R5, eq +- ADDV R4, R6, R6 // end ++ BEQ R4, R5, eq ++ ADDV R4, R6, R6 // end + + loop_16byte: +- ADDV $16, R4, R9 +- BLT R6, R9, load8byte ++ ADDV $16, R4, R9 ++ BLT R6, R9, load8byte + MOVV (R4), R7 + MOVV (R5), R8 + MOVV 8(R4), R10 + MOVV 8(R5), R11 + MOVV R9, R4 +- XOR R7, R8, R7 +- XOR R10, R11, R10 +- OR R10, R7, R7 ++ XOR R7, R8, R7 ++ XOR R10, R11, R10 ++ OR R10, R7, R7 + ADDV $16, R5 +- BEQ R7, loop_16byte ++ BEQ R7, loop_16byte + +- MOVB R0, R4 ++ MOVB R0, R4 + RET + + load8byte: +- ADDV $8, R4, R9 +- BLT R6, R9, tail ++ ADDV $8, R4, R9 ++ BLT R6, R9, tail + MOVV (R4), R7 + MOVV (R5), R8 + MOVV R9, R4 + ADDV $8, R5 +- BEQ R7, R8, tail ++ BEQ R7, R8, tail + +- MOVB R0, R4 ++ MOVB R0, R4 + RET + + tail: +- BEQ R4, R6, eq ++ BEQ R4, R6, eq + MOVBU (R4), R7 + MOVBU (R5), R8 + ADDV $1, R4 + ADDV $1, R5 +- BEQ R7, R8, tail ++ BEQ R7, R8, tail + +- MOVB R0, R4 ++ MOVB R0, R4 + RET + + eq: +diff --git a/src/internal/bytealg/index_loong64.s b/src/internal/bytealg/index_loong64.s +index 221d0332a4..7f7190b3be 100644 +--- a/src/internal/bytealg/index_loong64.s ++++ b/src/internal/bytealg/index_loong64.s +@@ -1,4 +1,4 @@ +-// Copyright 2018 The Go Authors. All rights reserved. ++// Copyright 2024 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
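The reformatted equalbody above also shows its core trick clearly: the main loop XORs two pairs of 8-byte words (each XOR is zero only when the pair is equal), ORs the two results, and decides all 16 bytes with a single zero test. A portable Go sketch of the same comparison (not the actual runtime memequal):

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func memequal(a, b []byte) bool {
        if len(a) != len(b) {
            return false
        }
        i := 0
        for ; i+16 <= len(a); i += 16 {
            x := binary.LittleEndian.Uint64(a[i:]) ^ binary.LittleEndian.Uint64(b[i:])
            y := binary.LittleEndian.Uint64(a[i+8:]) ^ binary.LittleEndian.Uint64(b[i+8:])
            if x|y != 0 { // one branch covers all 16 bytes
                return false
            }
        }
        for ; i < len(a); i++ { // the assembly also has an 8-byte step; bytewise here
            if a[i] != b[i] {
                return false
            }
        }
        return true
    }

    func main() { fmt.Println(memequal([]byte("hello, loong64!!"), []byte("hello, loong64!!"))) }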
+ +@@ -6,12 +6,12 @@ + #include "textflag.h" + + TEXT ·Index(SB),NOSPLIT,$0-56 +- MOVV R7, R6 // R6 = separator pointer +- MOVV R8, R7 // R7 = separator length +- JMP indexbody<>(SB) ++ MOVV R7, R6 // R6 = separator pointer ++ MOVV R8, R7 // R7 = separator length ++ JMP indexbody<>(SB) + + TEXT ·IndexString(SB),NOSPLIT,$0-40 +- JMP indexbody<>(SB) ++ JMP indexbody<>(SB) + + // input: + // R4 = string +@@ -23,108 +23,108 @@ TEXT indexbody<>(SB),NOSPLIT,$0 + // to avoid repeatedly re-load it again and again + // for sebsequent substring comparisons + SUBV R7, R5, R8 +- ADDV $1, R4, R9 // store base for later ++ ADDV $1, R4, R9 // store base for later + MOVV $8, R5 +- ADDV R4, R8 // end +- BLT R5, R7, len_gt_8 ++ ADDV R4, R8 // end ++ BLT R5, R7, len_gt_8 + + len_le_8: +- AND $0x8, R7, R5 +- BNE R5, len_8 +- AND $0x4, R7, R5 +- BNE R5, len_4_7 ++ AND $0x8, R7, R5 ++ BNE R5, len_8 ++ AND $0x4, R7, R5 ++ BNE R5, len_4_7 + + len_2_3: +- AND $0x1, R7, R5 +- BNE R5, len_3 ++ AND $0x1, R7, R5 ++ BNE R5, len_3 + + len_2: +- MOVHU (R6), R10 ++ MOVHU (R6), R10 + loop_2: +- BLT R8, R4, not_found +- MOVHU (R4), R11 ++ BLT R8, R4, not_found ++ MOVHU (R4), R11 + ADDV $1, R4 +- BNE R10, R11, loop_2 +- JMP found ++ BNE R10, R11, loop_2 ++ JMP found + + len_3: + MOVHU (R6), R10 + MOVBU 2(R6), R11 + loop_3: +- BLT R8, R4, not_found +- MOVHU (R4), R12 ++ BLT R8, R4, not_found ++ MOVHU (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_3 +- MOVBU 1(R4), R12 +- BNE R11, R12, loop_3 +- JMP found ++ BNE R10, R12, loop_3 ++ MOVBU 1(R4), R12 ++ BNE R11, R12, loop_3 ++ JMP found + + len_4_7: +- AND $0x2, R7, R5 +- BNE R5, len_6_7 +- AND $0x1, R7, R5 +- BNE R5, len_5 ++ AND $0x2, R7, R5 ++ BNE R5, len_6_7 ++ AND $0x1, R7, R5 ++ BNE R5, len_5 + + len_4: +- MOVWU (R6), R10 ++ MOVWU (R6), R10 + loop_4: +- BLT R8, R4, not_found +- MOVWU (R4), R11 ++ BLT R8, R4, not_found ++ MOVWU (R4), R11 + ADDV $1, R4 +- BNE R10, R11, loop_4 +- JMP found ++ BNE R10, R11, loop_4 ++ JMP found + len_5: + MOVWU (R6), R10 + MOVBU 4(R6), R11 + loop_5: +- BLT R8, R4, not_found +- MOVWU (R4), R12 ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_5 +- MOVBU 3(R4), R12 +- BNE R11, R12, loop_5 +- JMP found ++ BNE R10, R12, loop_5 ++ MOVBU 3(R4), R12 ++ BNE R11, R12, loop_5 ++ JMP found + + len_6_7: +- AND $0x1, R7, R5 +- BNE R5, len_7 ++ AND $0x1, R7, R5 ++ BNE R5, len_7 + + len_6: + MOVWU (R6), R10 + MOVHU 4(R6), R11 + loop_6: +- BLT R8, R4, not_found +- MOVWU (R4), R12 ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_6 +- MOVHU 3(R4), R12 +- BNE R11, R12, loop_6 +- JMP found ++ BNE R10, R12, loop_6 ++ MOVHU 3(R4), R12 ++ BNE R11, R12, loop_6 ++ JMP found + + len_7: + MOVWU (R6), R10 + MOVWU 3(R6), R11 + loop_7: +- BLT R8, R4, not_found +- MOVWU (R4), R12 ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_7 +- MOVWU 2(R4), R12 +- BNE R11, R12, loop_7 +- JMP found ++ BNE R10, R12, loop_7 ++ MOVWU 2(R4), R12 ++ BNE R11, R12, loop_7 ++ JMP found + + len_8: + MOVV (R6), R10 + loop_8: +- BLT R8, R4, not_found ++ BLT R8, R4, not_found + MOVV (R4), R11 + ADDV $1, R4 +- BNE R10, R11, loop_8 +- JMP found ++ BNE R10, R11, loop_8 ++ JMP found + + len_gt_8: + MOVV $16, R5 +- BLT R5, R7, len_gt_16 ++ BLT R5, R7, len_gt_16 + + len_9_16: + MOVV (R6), R10 +@@ -132,17 +132,17 @@ len_9_16: + MOVV (R6)(R7), R11 + SUBV $1, R7 + loop_9_16: +- BLT R8, R4, not_found ++ BLT R8, R4, not_found + MOVV (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_9_16 ++ BNE R10, R12, loop_9_16 + MOVV 
(R4)(R7), R12 +- BNE R11, R12, loop_9_16 +- JMP found ++ BNE R11, R12, loop_9_16 ++ JMP found + + len_gt_16: + MOVV $24, R5 +- BLT R5, R7, len_25_32 ++ BLT R5, R7, len_25_32 + + len_17_24: + MOVV (R6), R10 +@@ -151,15 +151,15 @@ len_17_24: + MOVV (R6)(R7), R12 + SUBV $1, R7 + loop_17_24: +- BLT R8, R4, not_found ++ BLT R8, R4, not_found + MOVV (R4), R13 + ADDV $1, R4 +- BNE R10, R13, loop_17_24 ++ BNE R10, R13, loop_17_24 + MOVV 7(R4), R13 +- BNE R11, R13, loop_17_24 ++ BNE R11, R13, loop_17_24 + MOVV (R4)(R7), R13 +- BNE R12, R13, loop_17_24 +- JMP found ++ BNE R12, R13, loop_17_24 ++ JMP found + + len_25_32: + MOVV (R6), R10 +@@ -169,17 +169,17 @@ len_25_32: + MOVV (R6)(R7), R13 + SUBV $1, R7 + loop_25_32: +- BLT R8, R4, not_found ++ BLT R8, R4, not_found + MOVV (R4), R14 + ADDV $1, R4 +- BNE R10, R14, loop_25_32 ++ BNE R10, R14, loop_25_32 + MOVV 7(R4), R14 +- BNE R11, R14, loop_25_32 ++ BNE R11, R14, loop_25_32 + MOVV 15(R4), R14 +- BNE R12, R14, loop_25_32 ++ BNE R12, R14, loop_25_32 + MOVV (R4)(R7), R14 +- BNE R13, R14, loop_25_32 +- JMP found ++ BNE R13, R14, loop_25_32 ++ JMP found + + found: + SUBV R9, R4 +diff --git a/src/internal/bytealg/indexbyte_loong64.s b/src/internal/bytealg/indexbyte_loong64.s +index 7811741423..b5f8f9cdbc 100644 +--- a/src/internal/bytealg/indexbyte_loong64.s ++++ b/src/internal/bytealg/indexbyte_loong64.s +@@ -12,17 +12,17 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + // R7 = byte to find + ADDV R4, R5 // end + MOVV R4, R6 // store base for later +- AND $0xff, R7 +- JMP indexbytebody<>(SB) ++ AND $0xff, R7 ++ JMP indexbytebody<>(SB) + + TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + // R4 = s_base + // R5 = s_len + // R6 = byte to find +- AND $0xff, R6, R7 ++ AND $0xff, R6, R7 + ADDV R4, R5 // end + MOVV R4, R6 // store base for later +- JMP indexbytebody<>(SB) ++ JMP indexbytebody<>(SB) + + // input: + // R4: b_base +@@ -32,42 +32,42 @@ TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + TEXT indexbytebody<>(SB),NOSPLIT,$0 + loop: + ADDV $8, R4, R10 +- BLT R5, R10, tail ++ BLT R5, R10, tail + MOVV (R4), R8 + +- AND $0xff, R8, R9 +- BEQ R7, R9, found ++ AND $0xff, R8, R9 ++ BEQ R7, R9, found + +- WORD $0xcf2109 // bstrpick.w r9, r8, 15, 8 +- BEQ R7, R9, byte_1th ++ WORD $0xcf2109 // bstrpick.w r9, r8, 15, 8 ++ BEQ R7, R9, byte_1th + +- WORD $0xd74109 // bstrpick.w r9, r8, 23, 16 +- BEQ R7, R9, byte_2th ++ WORD $0xd74109 // bstrpick.w r9, r8, 23, 16 ++ BEQ R7, R9, byte_2th + +- WORD $0xdf6109 // bstrpick.w r9, r8, 31, 24 +- BEQ R7, R9, byte_3th ++ WORD $0xdf6109 // bstrpick.w r9, r8, 31, 24 ++ BEQ R7, R9, byte_3th + +- WORD $0xe78109 // bstrpick.w r9, r8, 39, 32 +- BEQ R7, R9, byte_4th ++ WORD $0xe78109 // bstrpick.w r9, r8, 39, 32 ++ BEQ R7, R9, byte_4th + +- WORD $0xefa109 // bstrpick.w r9, r8, 47, 40 +- BEQ R7, R9, byte_5th ++ WORD $0xefa109 // bstrpick.w r9, r8, 47, 40 ++ BEQ R7, R9, byte_5th + +- WORD $0xf7c109 // bstrpick.w r9, r8, 55, 48 +- BEQ R7, R9, byte_6th ++ WORD $0xf7c109 // bstrpick.w r9, r8, 55, 48 ++ BEQ R7, R9, byte_6th + +- WORD $0xffe109 // bstrpick.w r9, r8, 63, 56 +- BEQ R7, R9, byte_7th ++ WORD $0xffe109 // bstrpick.w r9, r8, 63, 56 ++ BEQ R7, R9, byte_7th + + MOVV R10, R4 +- JMP loop ++ JMP loop + + tail: +- BEQ R4, R5, notfound +- MOVBU (R4), R8 +- BEQ R7, R8, found ++ BEQ R4, R5, notfound ++ MOVBU (R4), R8 ++ BEQ R7, R8, found + ADDV $1, R4 +- JMP tail ++ JMP tail + + byte_1th: + ADDV $1, R4 +-- +2.38.1 + diff --git a/0011-cmd-internal-obj-loong64-optimize-immediate-loading.patch b/0011-cmd-internal-obj-loong64-optimize-immediate-loading.patch new 
file mode 100644 index 0000000000000000000000000000000000000000..6136b63a2a511b369295c0585254d7237caab0fc --- /dev/null +++ b/0011-cmd-internal-obj-loong64-optimize-immediate-loading.patch @@ -0,0 +1,776 @@ +From a08a479c526bcc63bf24e69ff7fa1d37a1179e1f Mon Sep 17 00:00:00 2001 +From: limeidan +Date: Thu, 11 Jul 2024 21:03:45 +0800 +Subject: [PATCH 11/44] cmd/internal/obj/loong64: optimize immediate loading +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + + | old | new | + | sec/op | sec/op vs base | +BinaryTree17 11.08 ± 2% 11.16 ± 1% ~ (p=0.529 n=10) +Fannkuch11 2.716 ± 0% 2.737 ± 0% +0.79% (p=0.000 n=10) +FmtFprintfEmpty 67.37n ± 0% 66.42n ± 0% -1.41% (p=0.000 n=10) +FmtFprintfString 95.28n ± 0% 90.85n ± 0% -4.64% (p=0.000 n=10) +FmtFprintfInt 97.69n ± 0% 98.06n ± 0% +0.38% (p=0.000 n=10) +FmtFprintfIntInt 149.1n ± 0% 147.4n ± 0% -1.14% (p=0.000 n=10) +FmtFprintfPrefixedInt 223.6n ± 0% 196.5n ± 0% -12.10% (p=0.000 n=10) +FmtFprintfFloat 290.9n ± 0% 281.6n ± 1% -3.21% (p=0.000 n=10) +FmtManyArgs 670.6n ± 0% 642.6n ± 0% -4.18% (p=0.000 n=10) +GobDecode 10.26m ± 1% 10.23m ± 1% ~ (p=0.105 n=10) +GobEncode 12.09m ± 1% 11.94m ± 1% -1.24% (p=0.000 n=10) +Gzip 316.9m ± 0% 315.9m ± 0% -0.32% (p=0.001 n=10) +Gunzip 65.48m ± 0% 59.77m ± 0% -8.72% (p=0.000 n=10) +HTTPClientServer 70.36µ ± 0% 68.72µ ± 0% -2.34% (p=0.000 n=10) +JSONEncode 13.61m ± 1% 13.19m ± 1% -3.13% (p=0.000 n=10) +JSONDecode 57.52m ± 1% 54.15m ± 1% -5.86% (p=0.000 n=10) +Mandelbrot200 4.577m ± 0% 4.572m ± 0% -0.10% (p=0.002 n=10) +GoParse 6.466m ± 0% 6.363m ± 0% -1.58% (p=0.000 n=10) +RegexpMatchEasy0_32 89.20n ± 0% 87.72n ± 0% -1.65% (p=0.000 n=10) +RegexpMatchEasy0_1K 748.6n ± 0% 907.6n ± 0% +21.22% (p=0.000 n=10) +RegexpMatchEasy1_32 94.14n ± 0% 93.81n ± 0% -0.35% (p=0.000 n=10) +RegexpMatchEasy1_1K 832.1n ± 0% 953.6n ± 0% +14.59% (p=0.000 n=10) +RegexpMatchMedium_32 982.7n ± 0% 1018.0n ± 0% +3.59% (p=0.000 n=10) +RegexpMatchMedium_1K 30.51µ ± 0% 30.00µ ± 0% -1.65% (p=0.000 n=10) +RegexpMatchHard_32 1.721µ ± 0% 1.664µ ± 0% -3.34% (p=0.000 n=10) +RegexpMatchHard_1K 50.76µ ± 0% 50.92µ ± 0% +0.32% (p=0.000 n=10) +Revcomp 870.5m ± 0% 710.5m ± 0% -18.38% (p=0.000 n=10) +Template 93.18m ± 1% 93.67m ± 1% ~ (p=0.123 n=10) +TimeParse 309.2n ± 0% 307.8n ± 0% -0.45% (p=0.000 n=10) +TimeFormat 401.5n ± 0% 394.2n ± 0% -1.82% (p=0.000 n=10) +geomean 72.73µ 71.70µ -1.41% + +Change-Id: Id8d342ef3bb82a420434b2b841674683efef67be +--- + src/cmd/asm/internal/asm/endtoend_test.go | 2 + + .../asm/internal/asm/testdata/loong64enc1.s | 24 ++ + .../asm/internal/asm/testdata/loong64enc2.s | 46 +++ + .../asm/internal/asm/testdata/loong64enc3.s | 65 ++++ + .../asm/internal/asm/testdata/loong64enc4.s | 42 +++ + .../asm/internal/asm/testdata/loong64enc5.s | 17 + + src/cmd/internal/obj/loong64/a.out.go | 54 ++- + src/cmd/internal/obj/loong64/asm.go | 321 +++++++++++++++++- + src/cmd/internal/obj/loong64/cnames.go | 14 + + 9 files changed, 579 insertions(+), 6 deletions(-) + create mode 100644 src/cmd/asm/internal/asm/testdata/loong64enc4.s + create mode 100644 src/cmd/asm/internal/asm/testdata/loong64enc5.s + +diff --git a/src/cmd/asm/internal/asm/endtoend_test.go b/src/cmd/asm/internal/asm/endtoend_test.go +index 6e1aa1cd95..3760b77625 100644 +--- a/src/cmd/asm/internal/asm/endtoend_test.go ++++ b/src/cmd/asm/internal/asm/endtoend_test.go +@@ -465,6 +465,8 @@ func TestLOONG64Encoder(t *testing.T) { + testEndToEnd(t, "loong64", "loong64enc1") + testEndToEnd(t, "loong64", "loong64enc2") + testEndToEnd(t, 
"loong64", "loong64enc3") ++ testEndToEnd(t, "loong64", "loong64enc4") ++ testEndToEnd(t, "loong64", "loong64enc5") + testEndToEnd(t, "loong64", "loong64") + } + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 4a88aca031..3a3eb10a74 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -516,3 +516,27 @@ lable2: + XVPCNTH X3, X2 // 62249c76 + XVPCNTW X3, X2 // 62289c76 + XVPCNTV X3, X2 // 622c9c76 ++ ++ // MOVV C_DCON12_0, r ++ MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 ++ MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 ++ ++ // MOVV C_UCON, r ++ MOVV $0x54321000, R4 // MOVV $1412567040, R4 // 2464a814 ++ MOVV $0xffffffff8432f000, R4 // MOVV $-2077036544, R4 // e4650815 ++ ++ // MOVV C_ADDCON, r ++ MOVV $0xfffffffffffff821, R4 // MOVV $-2015, R4 // 0484e002 ++ ++ // MOVV C_ANDCON, r ++ MOVV $0x821, R4 // MOVV $2081, R4 // 0484a003 ++ ++ // ADDV C_SCON, [r1], r2 ++ ADDV $0x321, R4 // ADDV $801, R4 // 8484cc02 ++ ADDV $0x321, R5, R4 // ADDV $801, R5, R4 // a484cc02 ++ ADDV $0xfffffffffffffc21, R4 // ADDV $-991, R4 // 8484f002 ++ ADDV $0xfffffffffffffc21, R5, R4 // ADDV $-991, R5, R4 // a484f002 ++ ++ // AND C_SCON, [r1], r2 ++ AND $0x321, R4 // AND $801, R4 // 84844c03 ++ AND $0x321, R5, R4 // AND $801, R5, R4 // a4844c03 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc2.s b/src/cmd/asm/internal/asm/testdata/loong64enc2.s +index e497b83627..ee3bad74b1 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc2.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc2.s +@@ -77,3 +77,49 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 + MOVH name(SB), R4 // 1e00001ac4034028 + MOVHU R4, name(SB) // 1e00001ac4034029 + MOVHU name(SB), R4 // 1e00001ac403402a ++ ++ // MOVV C_DCON12_20S, r ++ MOVV $0x273fffff80000000, R4 // MOVV $2828260563841187840, R4 // 0400001584cc0903 ++ MOVV $0xf73fffff80000000, R4 // MOVV $-630503949979353088, R4 // 0400001584cc3d03 ++ ++ // MOVV C_DCON20S_20, r ++ MOVV $0xfff800000f000000, R4 // MOVV $-2251799562027008, R4 // 04001e1404000017 ++ ++ // MOVV C_DCON12_12S, r ++ MOVV $0x273ffffffffff800, R4 // MOVV $2828260565988669440, R4 // 0400e00284cc0903 ++ MOVV $0xf73ffffffffff800, R4 // MOVV $-630503947831871488, R4 // 0400e00284cc3d03 ++ ++ // MOVV C_DCON20S_12S, r ++ MOVV $0xfff80000fffff800, R4 // MOVV $-2251795518720000, R4 // 0400a00204000017 ++ MOVV $0xfff8000000000000, R4 // MOVV $-2251799813685248, R4 // 0400800204000017 ++ ++ // MOVV C_DCON12_12U, r ++ MOVV $0x2730000000000800, R4 // MOVV $2823756966361303040, R4 // 0400a00384cc0903 ++ MOVV $0xf730000000000800, R4 // MOVV $-635007547459237888, R4 // 0400a00384cc3d03 ++ ++ // MOVV C_DCON20S_12U, r ++ MOVV $0xfff8000000000800, R4 // MOVV $-2251799813683200, R4 // 0400a00304000017 ++ ++ // ADDV/AND C_DCON12_0, [r1], r2 ++ ADDV $0x3210000000000000, R4 // ADDV $3607383301523767296, R4 // 1e840c0384f81000 ++ ADDV $0x3210000000000000, R5, R4 // ADDV $3607383301523767296, R5, R4 // 1e840c03a4f81000 ++ ADDV $0xc210000000000000, R4 // ADDV $-4463067230724161536, R4 // 1e84300384f81000 ++ ADDV $0xc210000000000000, R5, R4 // ADDV $-4463067230724161536, R5, R4 // 1e843003a4f81000 ++ AND $0x3210000000000000, R4 // AND $3607383301523767296, R4 // 1e840c0384f81400 ++ AND $0x3210000000000000, R5, R4 // AND $3607383301523767296, R5, R4 // 1e840c03a4f81400 ++ AND $0xc210000000000000, R4 // AND $-4463067230724161536, R4 // 1e84300384f81400 ++ 
AND $0xc210000000000000, R5, R4 // AND $-4463067230724161536, R5, R4 // 1e843003a4f81400 ++ ++ // ADDV/AND C_UCON, [r1], r2 ++ ADDV $0x43210000, R4 // ADDV $1126236160, R4 // 1e42861484f81000 ++ ADDV $0x43210000, R5, R4 // ADDV $1126236160, R5, R4 // 1e428614a4f81000 ++ ADDV $0xffffffffc3210000, R4 // ADDV $-1021247488, R4 // 1e42861584f81000 ++ ADDV $0xffffffffc3210000, R5, R4 // ADDV $-1021247488, R5, R4 // 1e428615a4f81000 ++ AND $0x43210000, R4 // AND $1126236160, R4 // 1e42861484f81400 ++ AND $0x43210000, R5, R4 // AND $1126236160, R5, R4 // 1e428614a4f81400 ++ AND $0xffffffffc3210000, R4 // AND $-1021247488, R4 // 1e42861584f81400 ++ AND $0xffffffffc3210000, R5, R4 // AND $-1021247488, R5, R4 // 1e428615a4f81400 ++ ++ // AND C_ADDCON, [r1], r2 ++ AND $0xfffffffffffffc21, R4 // AND $-991, R4 // 1e84b00284f81400 ++ AND $0xfffffffffffffc21, R5, R4 // AND $-991, R5, R4 // 1e84b002a4f81400 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc3.s b/src/cmd/asm/internal/asm/testdata/loong64enc3.s +index 2600884309..2d83bd719a 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc3.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc3.s +@@ -121,3 +121,68 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 + XOR $74565, R4, R5 // 5e020014de178d0385f81500 + XOR $4097, R4 // 3e000014de07800384f81500 + XOR $4097, R4, R5 // 3e000014de07800385f81500 ++ ++ // MOVV C_DCON32_12S, r ++ MOVV $0x27312345fffff800, R4 // MOVV $2824077224892692480, R4 // 0400a002a468241684cc0903 ++ MOVV $0xf7312345fffff800, R4 // MOVV $-634687288927848448, R4 // 0400a002a468241684cc3d03 ++ ++ // MOVV C_DCON32_0, r ++ MOVV $0x2731234500000000, R4 // MOVV $2824077220597727232, R4 // 04008002a468241684cc0903 ++ MOVV $0xf731234500000000, R4 // MOVV $-634687293222813696, R4 // 04008002a468241684cc3d03 ++ ++ // MOVV C_DCON32_20, r ++ MOVV $0x2731234512345000, R4 // MOVV $2824077220903145472, R4 // a4682414a468241684cc0903 ++ MOVV $0xf731234512345000, R4 // MOVV $-634687292917395456, R4 // a4682414a468241684cc3d03 ++ ++ // MOVV C_DCON12_32S, r ++ MOVV $0x273fffff80000800, R4 // MOVV $2828260563841189888, R4 // 040000158400a00384cc0903 ++ MOVV $0xf73fffff80000800, R4 // MOVV $-630503949979351040, R4 // 040000158400a00384cc3d03 ++ ++ // MOVV C_DCON20S_32, r ++ MOVV $0xfff8000080000800, R4 // MOVV $-2251797666199552, R4 // 040000158400a00304000017 ++ ++ // MOVV C_DCON32_12U, r ++ MOVV $0x2731234500000800, R4 // MOVV $2824077220597729280, R4 // 0400a003a468241684cc0903 ++ MOVV $0xf731234500000800, R4 // MOVV $-634687293222811648, R4 // 0400a003a468241684cc3d03 ++ ++ // ADDV/AND C_DCON12_20S, [r1], r2 ++ ADDV $0x273fffff80000000, R4 // ADDV $2828260563841187840, R4 // 1e000015decf090384f81000 ++ ADDV $0x273fffff80000000, R4, R5 // ADDV $2828260563841187840, R4, R5 // 1e000015decf090385f81000 ++ AND $0x273fffff80000000, R4 // AND $2828260563841187840, R4 // 1e000015decf090384f81400 ++ AND $0x273fffff80000000, R4, R5 // AND $2828260563841187840, R4, R5 // 1e000015decf090385f81400 ++ ++ // ADDV/AND C_DCON20S_20, [r1], r2 ++ ADDV $0xfff800000f000000, R4 // ADDV $-2251799562027008, R4 // 1e001e141e00001784f81000 ++ ADDV $0xfff800000f000000, R4, R5 // ADDV $-2251799562027008, R4, R5 // 1e001e141e00001785f81000 ++ AND $0xfff800000f000000, R4 // AND $-2251799562027008, R4 // 1e001e141e00001784f81400 ++ AND $0xfff800000f000000, R4, R5 // AND $-2251799562027008, R4, R5 // 1e001e141e00001785f81400 ++ ++ // ADDV/AND C_DCON12_12S, [r1], r2 ++ ADDV $0x273ffffffffff800, R4 // ADDV $2828260565988669440, R4 // 1e00e002decf090384f81000 ++ ADDV 
$0x273ffffffffff800, R4, R5 // ADDV $2828260565988669440, R4, R5 // 1e00e002decf090385f81000 ++ AND $0x273ffffffffff800, R4 // AND $2828260565988669440, R4 // 1e00e002decf090384f81400 ++ AND $0x273ffffffffff800, R4, R5 // AND $2828260565988669440, R4, R5 // 1e00e002decf090385f81400 ++ ++ // ADDV/AND C_DCON20S_12S, [r1], r2 ++ ADDV $0xfff80000fffff800, R4 // ADDV $-2251795518720000, R4 // 1e00a0021e00001784f81000 ++ ADDV $0xfff80000fffff800, R4, R5 // ADDV $-2251795518720000, R4, R5 // 1e00a0021e00001785f81000 ++ AND $0xfff80000fffff800, R4 // AND $-2251795518720000, R4 // 1e00a0021e00001784f81400 ++ AND $0xfff80000fffff800, R4, R5 // AND $-2251795518720000, R4, R5 // 1e00a0021e00001785f81400 ++ ++ // ADDV/AND C_DCON20S_0, [r1], r2 ++ ADDV $0xfff8000000000000, R4 // ADDV $-2251799813685248, R4 // 1e0080021e00001784f81000 ++ ADDV $0xfff8000000000000, R4, R5 // ADDV $-2251799813685248, R4, R5 // 1e0080021e00001785f81000 ++ AND $0xfff8000000000000, R4 // AND $-2251799813685248, R4 // 1e0080021e00001784f81400 ++ AND $0xfff8000000000000, R4, R5 // AND $-2251799813685248, R4, R5 // 1e0080021e00001785f81400 ++ ++ // ADDV/AND C_DCON12_12U, [r1], r2 ++ ADDV $0x2730000000000800, R4 // ADDV $2823756966361303040, R4 // 1e00a003decf090384f81000 ++ ADDV $0x2730000000000800, R4, R5 // ADDV $2823756966361303040, R4, R5 // 1e00a003decf090385f81000 ++ AND $0x2730000000000800, R4 // AND $2823756966361303040, R4 // 1e00a003decf090384f81400 ++ AND $0x2730000000000800, R4, R5 // AND $2823756966361303040, R4, R5 // 1e00a003decf090385f81400 ++ ++ // ADDV/AND C_DCON20S_12U, [r1], r2 ++ ADDV $0xfff8000000000800, R4 // ADDV $-2251799813683200, R4 // 1e00a0031e00001784f81000 ++ ADDV $0xfff8000000000800, R4, R5 // ADDV $-2251799813683200, R4, R5 // 1e00a0031e00001785f81000 ++ AND $0xfff8000000000800, R4 // AND $-2251799813683200, R4 // 1e00a0031e00001784f81400 ++ AND $0xfff8000000000800, R4, R5 // AND $-2251799813683200, R4, R5 // 1e00a0031e00001785f81400 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc4.s b/src/cmd/asm/internal/asm/testdata/loong64enc4.s +new file mode 100644 +index 0000000000..16c06a3501 +--- /dev/null ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc4.s +@@ -0,0 +1,42 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++#include "../../../../../runtime/textflag.h" ++ ++TEXT asmtest(SB),DUPOK|NOSPLIT,$0 ++ // ADDV/AND C_DCON32_12S, [r1], r2 ++ ADDV $0x27312345fffff800, R4 // ADDV $2824077224892692480, R4 // 1e00a002be682416decf090384f81000 ++ ADDV $0x27312345fffff800, R4, R5 // ADDV $2824077224892692480, R4, R5 // 1e00a002be682416decf090385f81000 ++ AND $0x27312345fffff800, R4 // AND $2824077224892692480, R4 // 1e00a002be682416decf090384f81400 ++ AND $0x27312345fffff800, R4, R5 // AND $2824077224892692480, R4, R5 // 1e00a002be682416decf090385f81400 ++ ++ // ADDV/AND C_DCON32_0, [r1], r2 ++ ADDV $0x2731234500000000, R4 // ADDV $2824077220597727232, R4 // 1e008002be682416decf090384f81000 ++ ADDV $0x2731234500000000, R4, R5 // ADDV $2824077220597727232, R4, R5 // 1e008002be682416decf090385f81000 ++ AND $0x2731234500000000, R4 // AND $2824077220597727232, R4 // 1e008002be682416decf090384f81400 ++ AND $0x2731234500000000, R4, R5 // AND $2824077220597727232, R4, R5 // 1e008002be682416decf090385f81400 ++ ++ // ADDV/AND C_DCON32_20, [r1], r2 ++ ADDV $0x2731234512345000, R4 // ADDV $2824077220903145472, R4 // be682414be682416decf090384f81000 ++ ADDV $0x2731234512345000, R4, R5 // ADDV $2824077220903145472, R4, R5 // be682414be682416decf090385f81000 ++ AND $0x2731234512345000, R4 // AND $2824077220903145472, R4 // be682414be682416decf090384f81400 ++ AND $0x2731234512345000, R4, R5 // AND $2824077220903145472, R4, R5 // be682414be682416decf090385f81400 ++ ++ // ADDV/AND C_DCON12_32S, [r1], r2 ++ ADDV $0x273fffff80000800, R4 // ADDV $2828260563841189888, R4 // 1e000015de03a003decf090384f81000 ++ ADDV $0x273fffff80000800, R4, R5 // ADDV $2828260563841189888, R4, R5 // 1e000015de03a003decf090385f81000 ++ AND $0x273fffff80000800, R4 // AND $2828260563841189888, R4 // 1e000015de03a003decf090384f81400 ++ AND $0x273fffff80000800, R4, R5 // AND $2828260563841189888, R4, R5 // 1e000015de03a003decf090385f81400 ++ ++ // ADDV/AND C_DCON20S_32, [r1], r2 ++ ADDV $0xfff8000080000800, R4 // ADDV $-2251797666199552, R4 // 1e000015de03a0031e00001784f81000 ++ ADDV $0xfff8000080000800, R4, R5 // ADDV $-2251797666199552, R4, R5 // 1e000015de03a0031e00001785f81000 ++ AND $0xfff8000080000800, R4 // AND $-2251797666199552, R4 // 1e000015de03a0031e00001784f81400 ++ AND $0xfff8000080000800, R4, R5 // AND $-2251797666199552, R4, R5 // 1e000015de03a0031e00001785f81400 ++ ++ // ADDV/AND C_DCON32_12U, [r1], r2 ++ ADDV $0x2731234500000800, R4 // ADDV $2824077220597729280, R4 // 1e00a003be682416decf090384f81000 ++ ADDV $0x2731234500000800, R4, R5 // ADDV $2824077220597729280, R4, R5 // 1e00a003be682416decf090385f81000 ++ AND $0x2731234500000800, R4 // AND $2824077220597729280, R4 // 1e00a003be682416decf090384f81400 ++ AND $0x2731234500000800, R4, R5 // AND $2824077220597729280, R4, R5 // 1e00a003be682416decf090385f81400 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc5.s b/src/cmd/asm/internal/asm/testdata/loong64enc5.s +new file mode 100644 +index 0000000000..423e5c3b01 +--- /dev/null ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc5.s +@@ -0,0 +1,17 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++#include "../../../../../runtime/textflag.h" ++ ++TEXT asmtest(SB),DUPOK|NOSPLIT,$0 ++ // ADDV/AND C_DCON, [r1], r2 ++ ADDV $0xfedcba9876543210, R4 // ADDV $-81985529216486896, R4 // 7ea8ec14de4388031e539717deb73f0384f81000 ++ ADDV $0xfedcba9876543210, R5, R4 // ADDV $-81985529216486896, R5, R4 // 7ea8ec14de4388031e539717deb73f03a4f81000 ++ ADDV $0x4edcba9876543210, R4 // ADDV $5682621993817747984, R4 // 7ea8ec14de4388031e539717deb7130384f81000 ++ ADDV $0x4edcba9876543210, R5, R4 // ADDV $5682621993817747984, R5, R4 // 7ea8ec14de4388031e539717deb71303a4f81000 ++ AND $0x4edcba9876543210, R4 // AND $5682621993817747984, R4 // 7ea8ec14de4388031e539717deb7130384f81400 ++ AND $0x4edcba9876543210, R5, R4 // AND $5682621993817747984, R5, R4 // 7ea8ec14de4388031e539717deb71303a4f81400 ++ AND $0xfedcba9876543210, R4 // AND $-81985529216486896, R4 // 7ea8ec14de4388031e539717deb73f0384f81400 ++ AND $0xfedcba9876543210, R5, R4 // AND $-81985529216486896, R5, R4 // 7ea8ec14de4388031e539717deb73f03a4f81400 ++ +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 53b005af4d..b2207c2523 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -328,12 +328,58 @@ const ( + C_ZCON + C_SCON // 12 bit signed + C_UCON // 32 bit signed, low 12 bits 0 ++ ++ // When the immediate value is SCON, it can choose either the ADDCON implementation ++ // or the ANDCON implementation, using ADD0CON/AND0CON to distinguish them, so that ++ // the program can choose the implementation with fewer instructions. + C_ADD0CON + C_AND0CON +- C_ADDCON // -0x800 <= v < 0 +- C_ANDCON // 0 < v <= 0xFFF +- C_LCON // other 32 +- C_DCON // other 64 (could subdivide further) ++ ++ C_ADDCON // -0x800 <= v < 0 ++ C_ANDCON // 0 < v <= 0xFFF ++ C_LCON // other 32 ++ ++ // 64 bit signed, lo32 bits 0, hi20 bits are not 0, hi12 bits can ++ // be obtained by sign extension of the hi20 bits. ++ C_DCON20S_0 ++ // 64 bit signed, lo52 bits 0, hi12 bits are not 0. ++ C_DCON12_0 ++ // 64 bit signed, lo32 bits 0, hi32 bits are not 0. ++ C_DCON32_0 ++ // 64 bit signed, lo12 bits 0, lo20 bits are not 0, hi20 bits can be ++ // obtained by sign extension of the lo20 bits, other bits are not 0. ++ C_DCON12_20S ++ // 64 bit signed, lo12 bits 0, hi20 bits are not 0, hi12 bits can be ++ // obtained by sign extension of the hi20 bits, other bits are not 0. ++ C_DCON20S_20 ++ // 64 bit signed, lo12 bits 0, other bits are not 0. ++ C_DCON32_20 ++ // 64 bit signed, lo12 bits are not 0, 12~51 bits can be obtained ++ // by sign extension of the lo12 bits, other bits are not 0. ++ C_DCON12_12S ++ // 64 bit signed, hi20 bits and lo12 bits are not 0, hi12 bits can ++ // be obtained by sign extension of the hi20 bits, lo20 bits can ++ // be obtained by sign extension of the lo12 bits. ++ C_DCON20S_12S ++ // 64 bit signed, lo12 bits are not 0, lo20 bits can be obtained by sign ++ // extension of the lo12 bits, other bits are not 0. ++ C_DCON32_12S ++ // 64 bit signed, lo20 and lo12 bits are not 0, hi20 bits can be obtained by sign ++ // extension of the lo20 bits. other bits are not 0. ++ C_DCON12_32S ++ // 64 bit signed, hi20 bits are not 0, hi12 bits can be obtained by sign ++ // extension of the hi20 bits, lo32 bits are not 0. ++ C_DCON20S_32 ++ // 64 bit signed, 12~51 bits 0, other bits are not 0. ++ C_DCON12_12U ++ // 64 bit signed, lo20 bits 0, hi20 bits are not 0, hi12 bits can be ++ // obtained by sign extension of the hi20 bits, lo12 bits are not 0. 
++ C_DCON20S_12U ++ // 64 bit signed, lo20 bits 0, other bits are not 0. ++ C_DCON32_12U ++ // other 64 ++ C_DCON ++ + C_SACON // $n(REG) where n <= int12 + C_LACON // $n(REG) where int12 < n <= int32 + C_DACON // $n(REG) where int32 < n +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 9024c5e53e..5757c3c452 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -9,6 +9,7 @@ import ( + "cmd/internal/objabi" + "fmt" + "log" ++ "math/bits" + "slices" + ) + +@@ -192,6 +193,9 @@ var optab = []Optab{ + {AMOVV, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, + {AMOVW, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, + {AMOVV, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, ++ {AMOVV, C_DCON12_0, C_NONE, C_NONE, C_REG, C_NONE, 67, 4, 0, NOTUSETMP}, ++ {AMOVV, C_DCON12_20S, C_NONE, C_NONE, C_REG, C_NONE, 68, 8, 0, NOTUSETMP}, ++ {AMOVV, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 69, 12, 0, NOTUSETMP}, + {AMOVV, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 59, 16, 0, NOTUSETMP}, + + {AADD, C_ADD0CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +@@ -225,6 +229,20 @@ var optab = []Optab{ + + {AADDV, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, + {AADDV, C_DCON, C_REG, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, ++ {AAND, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, ++ {AAND, C_DCON, C_REG, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, ++ {AADDV, C_DCON12_0, C_NONE, C_NONE, C_REG, C_NONE, 70, 8, 0, 0}, ++ {AADDV, C_DCON12_0, C_REG, C_NONE, C_REG, C_NONE, 70, 8, 0, 0}, ++ {AAND, C_DCON12_0, C_NONE, C_NONE, C_REG, C_NONE, 70, 8, 0, 0}, ++ {AAND, C_DCON12_0, C_REG, C_NONE, C_REG, C_NONE, 70, 8, 0, 0}, ++ {AADDV, C_DCON12_20S, C_NONE, C_NONE, C_REG, C_NONE, 71, 12, 0, 0}, ++ {AADDV, C_DCON12_20S, C_REG, C_NONE, C_REG, C_NONE, 71, 12, 0, 0}, ++ {AAND, C_DCON12_20S, C_NONE, C_NONE, C_REG, C_NONE, 71, 12, 0, 0}, ++ {AAND, C_DCON12_20S, C_REG, C_NONE, C_REG, C_NONE, 71, 12, 0, 0}, ++ {AADDV, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, ++ {AADDV, C_DCON32_12S, C_REG, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, ++ {AAND, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, ++ {AAND, C_DCON32_12S, C_REG, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, + + {ASLL, C_SCON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, + {ASLL, C_SCON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, +@@ -790,7 +808,7 @@ func (c *ctxt0) aclass(a *obj.Addr) int { + } + + if c.instoffset != int64(int32(c.instoffset)) { +- return C_DCON ++ return dconClass(c.instoffset) + } + + if c.instoffset >= 0 { +@@ -830,6 +848,159 @@ func (c *ctxt0) aclass(a *obj.Addr) int { + return C_GOK + } + ++// The constants here define the data characteristics within the bit field range. ++// ++// ALL1: The data in the bit field is all 1 ++// ALL0: The data in the bit field is all 0 ++// ST1: The data in the bit field starts with 1, but not all 1 ++// ST0: The data in the bit field starts with 0, but not all 0 ++const ( ++ ALL1 = iota ++ ALL0 ++ ST1 ++ ST0 ++) ++ ++// mask returns the mask of the specified bit field, which is used to help determine ++// the data characteristics of the immediate value at the specified bit. 
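Before the helpers below, it may help to see concretely the four instruction fields that this classification reasons about: ori covers bits 0-11, lu12i.w bits 12-31, lu32i.d bits 32-51, and lu52i.d bits 52-63, and each class records which fields sign or zero extension already fills for free. A standalone sketch (hypothetical code, not part of the patch):

    package main

    import "fmt"

    // fields splits a 64-bit constant into the ori / lu12i.w / lu32i.d / lu52i.d
    // immediates used when materializing it on loong64.
    func fields(v uint64) (lo12, lo20, hi20, hi12 uint64) {
        lo12 = v & 0xfff
        lo20 = (v >> 12) & 0xfffff
        hi20 = (v >> 32) & 0xfffff
        hi12 = v >> 52
        return
    }

    func main() {
        // $0x7a90000000000000 from the loong64enc1.s tests: only the top 12-bit
        // field is non-zero, so a single lu52i.d suffices (class C_DCON12_0).
        lo12, lo20, hi20, hi12 := fields(0x7a90000000000000)
        fmt.Printf("ori=%#x lu12i.w=%#x lu32i.d=%#x lu52i.d=%#x\n", lo12, lo20, hi20, hi12)
    }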
++func mask(suf int8, len int8) (uint64, uint64) { ++ if len == 12 { ++ if suf == 0 { ++ return 0xfff, 0x800 ++ } else { // suf == 52 ++ return 0xfff0000000000000, 0x8000000000000000 ++ } ++ } else { // len == 20 ++ if suf == 12 { ++ return 0xfffff000, 0x80000000 ++ } else { // suf == 32 ++ return 0xfffff00000000, 0x8000000000000 ++ } ++ } ++} ++ ++// bitField return a number represent status of val in bit field ++// ++// suf: The starting bit of the bit field ++// len: The length of the bit field ++func bitField(val int64, suf int8, len int8) int8 { ++ mask1, mask2 := mask(suf, len) ++ if uint64(val)&mask1 == mask1 { ++ return ALL1 ++ } else if uint64(val)&mask1 == 0x0 { ++ return ALL0 ++ } else if uint64(val)&mask2 == mask2 { ++ return ST1 ++ } else { ++ return ST0 ++ } ++} ++ ++// Loading an immediate value larger than 32 bits requires four instructions ++// on loong64 (lu12i.w + ori + lu32i.d + lu52i.d), but in some special cases, ++// we can use the sign extension and zero extension features of the instruction ++// to fill in the high-order data (all 0 or all 1), which can save one to ++// three instructions. ++// ++// | 63 ~ 52 | 51 ~ 32 | 31 ~ 12 | 11 ~ 0 | ++// | lu52i.d | lu32i.d | lu12i.w | ori | ++func dconClass(offset int64) int { ++ tzb := bits.TrailingZeros64(uint64(offset)) ++ hi12 := bitField(offset, 52, 12) ++ hi20 := bitField(offset, 32, 20) ++ lo20 := bitField(offset, 12, 20) ++ lo12 := bitField(offset, 0, 12) ++ if tzb >= 52 { ++ return C_DCON12_0 // lu52i.d ++ } ++ if tzb >= 32 { ++ if ((hi20 == ALL1 || hi20 == ST1) && hi12 == ALL1) || ((hi20 == ALL0 || hi20 == ST0) && hi12 == ALL0) { ++ return C_DCON20S_0 // addi.w + lu32i.d ++ } ++ return C_DCON32_0 // addi.w + lu32i.d + lu52i.d ++ } ++ if tzb >= 12 { ++ if lo20 == ST1 || lo20 == ALL1 { ++ if hi20 == ALL1 { ++ return C_DCON12_20S // lu12i.w + lu52i.d ++ } ++ if (hi20 == ST1 && hi12 == ALL1) || ((hi20 == ST0 || hi20 == ALL0) && hi12 == ALL0) { ++ return C_DCON20S_20 // lu12i.w + lu32i.d ++ } ++ return C_DCON32_20 // lu12i.w + lu32i.d + lu52i.d ++ } ++ if hi20 == ALL0 { ++ return C_DCON12_20S // lu12i.w + lu52i.d ++ } ++ if (hi20 == ST0 && hi12 == ALL0) || ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) { ++ return C_DCON20S_20 // lu12i.w + lu32i.d ++ } ++ return C_DCON32_20 // lu12i.w + lu32i.d + lu52i.d ++ } ++ if lo12 == ST1 || lo12 == ALL1 { ++ if lo20 == ALL1 { ++ if hi20 == ALL1 { ++ return C_DCON12_12S // addi.d + lu52i.d ++ } ++ if (hi20 == ST1 && hi12 == ALL1) || ((hi20 == ST0 || hi20 == ALL0) && hi12 == ALL0) { ++ return C_DCON20S_12S // addi.w + lu32i.d ++ } ++ return C_DCON32_12S // addi.w + lu32i.d + lu52i.d ++ } ++ if lo20 == ST1 { ++ if hi20 == ALL1 { ++ ++ return C_DCON12_32S // lu12i.w + ori + lu52i.d ++ } ++ if (hi20 == ST1 && hi12 == ALL1) || ((hi20 == ST0 || hi20 == ALL0) && hi12 == ALL0) { ++ return C_DCON20S_32 // lu12i.w + ori + lu32i.d ++ } ++ return C_DCON // lu12i.w + ori + lu32i.d + lu52i.d ++ } ++ if lo20 == ALL0 { ++ if hi20 == ALL0 { ++ return C_DCON12_12U // ori + lu52i.d ++ } ++ if ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) || (hi20 == ST0 && hi12 == ALL0) { ++ return C_DCON20S_12U // ori + lu32i.d ++ } ++ return C_DCON32_12U // ori + lu32i.d + lu52i.d ++ } ++ if hi20 == ALL0 { ++ return C_DCON12_32S // lu12i.w + ori + lu52i.d ++ } ++ if ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) || (hi20 == ST0 && hi12 == ALL0) { ++ return C_DCON20S_32 // lu12i.w + ori + lu32i.d ++ } ++ return C_DCON // lu12i.w + ori + lu32i.d + lu52i.d ++ } ++ if lo20 == ALL0 { ++ if hi20 == ALL0 { ++ return 
C_DCON12_12U // ori + lu52i.d ++ } ++ if ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) || (hi20 == ST0 && hi12 == ALL0) { ++ return C_DCON20S_12U // ori + lu32i.d ++ } ++ return C_DCON32_12U // ori + lu32i.d + lu52i.d ++ } ++ if lo20 == ST1 || lo20 == ALL1 { ++ if hi20 == ALL1 { ++ return C_DCON12_32S // lu12i.w + ori + lu52i.d ++ } ++ if (hi20 == ST1 && hi12 == ALL1) || ((hi20 == ST0 || hi20 == ALL0) && hi12 == ALL0) { ++ return C_DCON20S_32 // lu12i.w + ori + lu32i.d ++ } ++ return C_DCON ++ } ++ if hi20 == ALL0 { ++ return C_DCON12_32S // lu12i.w + ori + lu52i.d ++ } ++ if ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) || (hi20 == ST0 && hi12 == ALL0) { ++ return C_DCON20S_32 // lu12i.w + ori + lu32i.d ++ } ++ return C_DCON ++} ++ + // In Loong64,there are 8 CFRs, denoted as fcc0-fcc7. + // There are 4 FCSRs, denoted as fcsr0-fcsr3. + func (c *ctxt0) rclass(r int16) int { +@@ -935,7 +1106,14 @@ func cmp(a int, b int) bool { + } + switch a { + case C_DCON: +- if b == C_LCON { ++ if b == C_LCON || b == C_DCON32_0 || ++ b == C_DCON12_0 || b == C_DCON20S_0 || ++ b == C_DCON12_20S || b == C_DCON12_12S || ++ b == C_DCON20S_20 || b == C_DCON32_20 || ++ b == C_DCON20S_12S || b == C_DCON32_12S || ++ b == C_DCON12_32S || b == C_DCON20S_32 || ++ b == C_DCON12_12U || b == C_DCON20S_12U || ++ b == C_DCON32_12U { + return true + } + fallthrough +@@ -944,6 +1122,22 @@ func cmp(a int, b int) bool { + return true + } + ++ case C_DCON12_0: ++ ++ case C_DCON12_20S: ++ if b == C_DCON20S_20 || b == C_DCON12_12S || ++ b == C_DCON20S_12S || b == C_DCON12_12U || ++ b == C_DCON20S_12U || b == C_DCON20S_0 { ++ return true ++ } ++ ++ case C_DCON32_12S: ++ if b == C_DCON32_20 || b == C_DCON12_32S || ++ b == C_DCON20S_32 || b == C_DCON32_12U || ++ b == C_DCON32_0 { ++ return true ++ } ++ + case C_ADD0CON: + if b == C_ADDCON { + return true +@@ -2015,6 +2209,129 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + c.ctxt.Diag("illegal register combination: %v\n", p) + } + o1 = OP_RRR(atomicInst[p.As], uint32(rk), uint32(rj), uint32(rd)) ++ ++ case 67: // mov $dcon12_0, r ++ v := c.vregoff(&p.From) ++ o1 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(0), uint32(p.To.Reg)) ++ ++ case 68: // mov $dcon12_20S, r ++ v := c.vregoff(&p.From) ++ contype := c.aclass(&p.From) ++ switch contype { ++ default: // C_DCON12_20S ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON20S_20: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ case C_DCON12_12S: ++ o1 = OP_12IRR(c.opirr(AADDV), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON20S_12S, C_DCON20S_0: ++ o1 = OP_12IRR(c.opirr(AADD), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ case C_DCON12_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON20S_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ } ++ ++ case 69: // mov $dcon32_12S, r ++ v := c.vregoff(&p.From) ++ contype := c.aclass(&p.From) ++ switch contype { ++ default: // C_DCON32_12S, C_DCON32_0 ++ o1 = OP_12IRR(c.opirr(AADD), uint32(v), uint32(0), 
uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON32_20: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON12_32S: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(p.To.Reg), uint32(p.To.Reg)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON20S_32: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(p.To.Reg), uint32(p.To.Reg)) ++ o3 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ case C_DCON32_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ } ++ ++ case 70: // add $dcon12_0,[r1],r2 ++ v := c.vregoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ o1 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(0), uint32(REGTMP)) ++ o2 = OP_RRR(c.oprrr(p.As), uint32(REGTMP), uint32(r), uint32(p.To.Reg)) ++ ++ case 71: // add $dcon12_20S,[r1],r2 ++ v := c.vregoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ contype := c.aclass(&p.From) ++ switch contype { ++ default: // C_DCON12_20S ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON20S_20: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ case C_DCON12_12S: ++ o1 = OP_12IRR(c.opirr(AADDV), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON20S_12S, C_DCON20S_0: ++ o1 = OP_12IRR(c.opirr(AADD), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ case C_DCON12_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON20S_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ } ++ o3 = OP_RRR(c.oprrr(p.As), uint32(REGTMP), uint32(r), uint32(p.To.Reg)) ++ ++ case 72: // add $dcon32_12S,[r1],r2 ++ v := c.vregoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ contype := c.aclass(&p.From) ++ switch contype { ++ default: // C_DCON32_12S, C_DCON32_0 ++ o1 = OP_12IRR(c.opirr(AADD), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON32_20: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON12_32S: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(REGTMP), uint32(REGTMP)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), 
uint32(REGTMP)) ++ case C_DCON20S_32: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(REGTMP), uint32(REGTMP)) ++ o3 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ case C_DCON32_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ } ++ o4 = OP_RRR(c.oprrr(p.As), uint32(REGTMP), uint32(r), uint32(p.To.Reg)) + } + + out[0] = o1 +diff --git a/src/cmd/internal/obj/loong64/cnames.go b/src/cmd/internal/obj/loong64/cnames.go +index ce76109d2a..a2f04a22ee 100644 +--- a/src/cmd/internal/obj/loong64/cnames.go ++++ b/src/cmd/internal/obj/loong64/cnames.go +@@ -21,6 +21,20 @@ var cnames0 = []string{ + "ADDCON", + "ANDCON", + "LCON", ++ "DCON20S_0", ++ "DCON12_0", ++ "DCON32_0", ++ "DCON12_20S", ++ "DCON20S_20", ++ "DCON32_20", ++ "DCON12_12S", ++ "DCON20S_12S", ++ "DCON32_12S", ++ "DCON12_32S", ++ "DCON20S_32", ++ "DCON12_12U", ++ "DCON20S_12U", ++ "DCON32_12U", + "DCON", + "SACON", + "LACON", +-- +2.38.1 + diff --git a/0012-math-big-optimize-addVV-function-for-loong64.patch b/0012-math-big-optimize-addVV-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..8d91ab457176ba0293acd53bab799992c13f77ce --- /dev/null +++ b/0012-math-big-optimize-addVV-function-for-loong64.patch @@ -0,0 +1,85 @@ +From a7a4eb8120aaf7d5f8d2146f190c64118c7e1235 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Thu, 6 Jun 2024 15:30:20 +0800 +Subject: [PATCH 12/44] math/big: optimize addVV function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_addvv.log │ test/new_3c5000_addvv.log │ + │ sec/op │ sec/op vs base │ +AddVV/1 10.920n ± 0% 7.671n ± 0% -29.75% (p=0.000 n=20) +AddVV/2 14.100n ± 0% 8.849n ± 0% -37.24% (p=0.000 n=20) +AddVV/3 16.38n ± 0% 11.07n ± 0% -32.42% (p=0.000 n=20) +AddVV/4 18.65n ± 0% 12.86n ± 0% -31.05% (p=0.000 n=20) +AddVV/5 20.93n ± 0% 15.01n ± 0% -28.28% (p=0.000 n=20) +AddVV/10 31.84n ± 0% 22.75n ± 0% -28.53% (p=0.000 n=20) +AddVV/100 242.4n ± 0% 149.7n ± 0% -38.24% (p=0.000 n=20) +AddVV/1000 2.290µ ± 0% 1.378µ ± 0% -39.83% (p=0.000 n=20) +AddVV/10000 32.73µ ± 0% 19.36µ ± 0% -40.84% (p=0.000 n=20) +AddVV/100000 340.9µ ± 0% 238.5µ ± 0% -30.03% (p=0.000 n=20) +geomean 213.2n 141.2n -33.79% + +Change-Id: I7983a93d9b97d4e9ebe96a49107ec6db9194b013 +--- + src/math/big/arith_loong64.s | 31 +++++++++++++++++++++++++++++-- + 1 file changed, 29 insertions(+), 2 deletions(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index 847e3127fb..bd7204cf06 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -2,15 +2,42 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !math_big_pure_go && loong64 ++//go:build !math_big_pure_go + + #include "textflag.h" + + // This file provides fast assembly versions for the elementary + // arithmetic operations on vectors implemented in arith.go. 
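++//
++// Loong64 has no flags register and no add-with-carry instruction, so
++// the routines below materialize each carry with SGTU (set on unsigned
++// greater-than). A rough sketch of one addVV step in Go, assuming
++// 64-bit Words (names are illustrative, not the actual code):
++//
++//	s := x + y          // may wrap around
++//	c1 := Word(0)
++//	if s < x {          // unsigned compare, what SGTU computes
++//		c1 = 1
++//	}
++//	z := s + cin        // add the carry-in from the previous word
++//	c2 := Word(0)
++//	if z < s {
++//		c2 = 1
++//	}
++//	cout := c1 | c2     // at most one of c1, c2 can be set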
+ ++// func addVV(z, x, y []Word) (c Word) + TEXT ·addVV(SB),NOSPLIT,$0 +- JMP ·addVV_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++ MOVV $0, R8 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R9 ++ MOVV (R6)(R10), R11 ++ ADDV R9, R11, R11 // x1 + y1 = z1', if z1' < x1 then z1' overflow ++ ADDV R8, R11, R12 // z1' + c0 = z1, if z1 < z1' then z1 overflow ++ SGTU R9, R11, R9 ++ SGTU R11, R12, R11 ++ MOVV R12, (R6)(R4) ++ OR R9, R11, R8 ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R8, c+72(FP) ++ RET + + TEXT ·subVV(SB),NOSPLIT,$0 + JMP ·subVV_g(SB) +-- +2.38.1 + diff --git a/0013-math-big-optimize-addVW-function-for-loong64.patch b/0013-math-big-optimize-addVW-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..853a0d43914daf3d1d20664bb8ea8f31f137c34e --- /dev/null +++ b/0013-math-big-optimize-addVW-function-for-loong64.patch @@ -0,0 +1,82 @@ +From 94a6bdcacffb17b8adf57ce0919a3d31ac70b646 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Tue, 11 Jun 2024 16:09:10 +0800 +Subject: [PATCH 13/44] math/big: optimize addVW function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_addvw.log │ test/new_3c5000_addvw.log │ + │ sec/op │ sec/op vs base │ +AddVW/1 9.555n ± 0% 5.915n ± 0% -38.09% (p=0.000 n=20) +AddVW/2 11.370n ± 0% 6.825n ± 0% -39.97% (p=0.000 n=20) +AddVW/3 12.485n ± 0% 7.970n ± 0% -36.16% (p=0.000 n=20) +AddVW/4 14.980n ± 0% 9.718n ± 0% -35.13% (p=0.000 n=20) +AddVW/5 16.73n ± 0% 10.63n ± 0% -36.46% (p=0.000 n=20) +AddVW/10 24.57n ± 0% 15.18n ± 0% -38.23% (p=0.000 n=20) +AddVW/100 184.9n ± 0% 102.4n ± 0% -44.62% (p=0.000 n=20) +AddVW/1000 1721.0n ± 0% 921.4n ± 0% -46.46% (p=0.000 n=20) +AddVW/10000 16.83µ ± 0% 11.68µ ± 0% -30.58% (p=0.000 n=20) +AddVW/100000 184.7µ ± 0% 131.3µ ± 0% -28.93% (p=0.000 n=20) +AddVWext/1 9.554n ± 0% 5.915n ± 0% -38.09% (p=0.000 n=20) +AddVWext/2 11.370n ± 0% 6.825n ± 0% -39.97% (p=0.000 n=20) +AddVWext/3 12.505n ± 0% 7.969n ± 0% -36.27% (p=0.000 n=20) +AddVWext/4 14.980n ± 0% 9.718n ± 0% -35.13% (p=0.000 n=20) +AddVWext/5 16.70n ± 0% 10.63n ± 0% -36.33% (p=0.000 n=20) +AddVWext/10 24.54n ± 0% 15.18n ± 0% -38.13% (p=0.000 n=20) +AddVWext/100 185.0n ± 0% 102.4n ± 0% -44.65% (p=0.000 n=20) +AddVWext/1000 1721.0n ± 0% 921.4n ± 0% -46.46% (p=0.000 n=20) +AddVWext/10000 16.83µ ± 0% 11.68µ ± 0% -30.60% (p=0.000 n=20) +AddVWext/100000 184.9µ ± 0% 130.4µ ± 0% -29.51% (p=0.000 n=20) +geomean 155.5n 96.87n -37.70% + +Change-Id: I824a90cb365e09d7d0d4a2c53ff4b30cf057a75e +--- + src/math/big/arith_loong64.s | 24 +++++++++++++++++++++++- + 1 file changed, 23 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index bd7204cf06..bd6fec1b8d 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -42,8 +42,30 @@ done: + TEXT ·subVV(SB),NOSPLIT,$0 + JMP ·subVV_g(SB) + ++// func addVW(z, x []Word, y Word) (c Word) + TEXT ·addVW(SB),NOSPLIT,$0 +- JMP ·addVW_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), 
R8 ++ ADDV R8, R10, R9 // x1 + c = z1, if z1 < x1 then z1 overflow ++ SGTU R8, R9, R10 ++ MOVV R9, (R6)(R4) ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R10, c+56(FP) ++ RET + + TEXT ·subVW(SB),NOSPLIT,$0 + JMP ·subVW_g(SB) +-- +2.38.1 + diff --git a/0014-math-big-optimize-subVV-function-for-loong64.patch b/0014-math-big-optimize-subVV-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..1a9c5dd8394467656a88cc5f1288e506562532a8 --- /dev/null +++ b/0014-math-big-optimize-subVV-function-for-loong64.patch @@ -0,0 +1,77 @@ +From 7939ebdcaa1156ef4e9d8f896f4877df88d7636c Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Tue, 11 Jun 2024 19:06:29 +0800 +Subject: [PATCH 14/44] math/big: optimize subVV function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_subvv.log │ test/new_3c5000_subvv.log │ + │ sec/op │ sec/op vs base │ +SubVV/1 10.920n ± 0% 7.657n ± 0% -29.88% (p=0.000 n=20) +SubVV/2 14.100n ± 0% 8.841n ± 0% -37.30% (p=0.000 n=20) +SubVV/3 16.38n ± 0% 11.06n ± 0% -32.48% (p=0.000 n=20) +SubVV/4 18.65n ± 0% 12.85n ± 0% -31.10% (p=0.000 n=20) +SubVV/5 20.93n ± 0% 14.79n ± 0% -29.34% (p=0.000 n=20) +SubVV/10 32.30n ± 0% 22.29n ± 0% -30.99% (p=0.000 n=20) +SubVV/100 244.3n ± 0% 149.2n ± 0% -38.93% (p=0.000 n=20) +SubVV/1000 2.292µ ± 0% 1.378µ ± 0% -39.88% (p=0.000 n=20) +SubVV/10000 26.26µ ± 0% 25.64µ ± 0% -2.33% (p=0.000 n=20) +SubVV/100000 341.3µ ± 0% 238.0µ ± 0% -30.26% (p=0.000 n=20) +geomean 209.1n 144.5n -30.86% + +Change-Id: I3863c2c6728f1b0f8fecbf77de13254299c5b1cb +--- + src/math/big/arith_loong64.s | 29 ++++++++++++++++++++++++++++- + 1 file changed, 28 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index bd6fec1b8d..8016c25207 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -39,8 +39,35 @@ done: + MOVV R8, c+72(FP) + RET + ++// func subVV(z, x, y []Word) (c Word) + TEXT ·subVV(SB),NOSPLIT,$0 +- JMP ·subVV_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++ MOVV $0, R8 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R9 ++ MOVV (R6)(R10), R11 ++ SUBV R11, R9, R11 // x1 - y1 = z1', if z1' > x1 then overflow ++ SUBV R8, R11, R12 // z1' - c0 = z1, if z1 > z1' then overflow ++ SGTU R11, R9, R9 ++ SGTU R12, R11, R11 ++ MOVV R12, (R6)(R4) ++ OR R9, R11, R8 ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R8, c+72(FP) ++ RET + + // func addVW(z, x []Word, y Word) (c Word) + TEXT ·addVW(SB),NOSPLIT,$0 +-- +2.38.1 + diff --git a/0015-math-big-optimize-subVW-function-for-loong64.patch b/0015-math-big-optimize-subVW-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..82c8a1c43ae0fc4c6ba1edd1e62c907c855baf55 --- /dev/null +++ b/0015-math-big-optimize-subVW-function-for-loong64.patch @@ -0,0 +1,82 @@ +From b8516483f552400ef8708645b8a10bed5f666dba Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Tue, 11 Jun 2024 20:33:50 +0800 +Subject: [PATCH 15/44] math/big: optimize subVW function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + 
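+The per-word borrow is computed without a flags register: after
+z = x - c, the borrow out is exactly z > x as an unsigned compare,
+which a single SGTU instruction produces. A rough sketch of one loop
+step in Go (names are illustrative, not the actual implementation):
+
+	z := x - c          // may wrap below zero
+	if z > x {          // unsigned compare, what SGTU computes
+		c = 1       // borrow into the next word
+	} else {
+		c = 0
+	}
+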
+goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_subvw.log │ test/new_3c5000_subvw.log │ + │ sec/op │ sec/op vs base │ +SubVW/1 8.564n ± 0% 5.915n ± 0% -30.93% (p=0.000 n=20) +SubVW/2 11.675n ± 0% 6.825n ± 0% -41.54% (p=0.000 n=20) +SubVW/3 13.410n ± 0% 7.969n ± 0% -40.57% (p=0.000 n=20) +SubVW/4 15.300n ± 0% 9.740n ± 0% -36.34% (p=0.000 n=20) +SubVW/5 17.34n ± 1% 10.66n ± 0% -38.55% (p=0.000 n=20) +SubVW/10 26.55n ± 0% 15.21n ± 0% -42.70% (p=0.000 n=20) +SubVW/100 199.2n ± 0% 102.5n ± 0% -48.52% (p=0.000 n=20) +SubVW/1000 1866.5n ± 1% 924.6n ± 0% -50.46% (p=0.000 n=20) +SubVW/10000 17.67µ ± 2% 12.04µ ± 2% -31.83% (p=0.000 n=20) +SubVW/100000 186.4µ ± 0% 132.0µ ± 0% -29.17% (p=0.000 n=20) +SubVWext/1 8.616n ± 0% 5.949n ± 0% -30.95% (p=0.000 n=20) +SubVWext/2 11.410n ± 0% 7.008n ± 1% -38.58% (p=0.000 n=20) +SubVWext/3 13.255n ± 1% 8.073n ± 0% -39.09% (p=0.000 n=20) +SubVWext/4 15.095n ± 0% 9.893n ± 0% -34.47% (p=0.000 n=20) +SubVWext/5 16.87n ± 0% 10.86n ± 0% -35.63% (p=0.000 n=20) +SubVWext/10 26.00n ± 0% 15.54n ± 0% -40.22% (p=0.000 n=20) +SubVWext/100 196.0n ± 0% 104.3n ± 1% -46.76% (p=0.000 n=20) +SubVWext/1000 1847.0n ± 0% 923.7n ± 0% -49.99% (p=0.000 n=20) +SubVWext/10000 17.30µ ± 1% 11.71µ ± 1% -32.31% (p=0.000 n=20) +SubVWext/100000 187.5µ ± 0% 131.6µ ± 0% -29.82% (p=0.000 n=20) +geomean 159.7n 97.79n -38.79% + +Change-Id: I21a6903e79b02cb22282e80c9bfe2ae9f1a87589 +--- + src/math/big/arith_loong64.s | 24 +++++++++++++++++++++++- + 1 file changed, 23 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index 8016c25207..02d8262129 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -94,8 +94,30 @@ done: + MOVV R10, c+56(FP) + RET + ++// func subVW(z, x []Word, y Word) (c Word) + TEXT ·subVW(SB),NOSPLIT,$0 +- JMP ·subVW_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R8 ++ SUBV R10, R8, R11 // x1 - c = z1, if z1 > x1 then overflow ++ SGTU R11, R8, R10 ++ MOVV R11, (R6)(R4) ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R10, c+56(FP) ++ RET + + TEXT ·shlVU(SB),NOSPLIT,$0 + JMP ·shlVU_g(SB) +-- +2.38.1 + diff --git a/0016-math-big-optimize-shlVU-function-for-loong64.patch b/0016-math-big-optimize-shlVU-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..a7fb046e70251547d0934928b6f9000eb7ac700a --- /dev/null +++ b/0016-math-big-optimize-shlVU-function-for-loong64.patch @@ -0,0 +1,92 @@ +From 3d520765bbff022132512b918379fe1a5e788f2e Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Thu, 13 Jun 2024 11:36:30 +0800 +Subject: [PATCH 16/44] math/big: optimize shlVU function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3A5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3A5000-HV @ 2500.00MHz + │ old_3a5000_shlvu.log │ new_3a5000_shlvu_1st.log │ + │ sec/op │ sec/op vs base │ +NonZeroShifts/1/shlVU 7.606n ± 0% 5.304n ± 0% -30.27% (p=0.000 n=20) +NonZeroShifts/2/shlVU 9.608n ± 0% 6.164n ± 0% -35.85% (p=0.000 n=20) +NonZeroShifts/3/shlVU 11.610n ± 0% 6.984n ± 0% -39.84% (p=0.000 n=20) +NonZeroShifts/4/shlVU 12.210n ± 0% 8.869n ± 0% -27.36% (p=0.000 n=20) +NonZeroShifts/5/shlVU 14.11n ± 0% 10.41n ± 
0% -26.22% (p=0.000 n=20) +NonZeroShifts/10/shlVU 22.02n ± 0% 14.77n ± 0% -32.92% (p=0.000 n=20) +NonZeroShifts/100/shlVU 161.30n ± 0% 91.15n ± 0% -43.49% (p=0.000 n=20) +NonZeroShifts/1000/shlVU 1514.0n ± 0% 811.7n ± 0% -46.39% (p=0.000 n=20) +NonZeroShifts/10000/shlVU 21.53µ ± 0% 10.54µ ± 0% -51.04% (p=0.000 n=20) +NonZeroShifts/100000/shlVU 208.1µ ± 0% 113.0µ ± 0% -45.69% (p=0.000 n=20) +geomean 142.8n 87.87n -38.46% + +Change-Id: I8e13eb0af27ac3d6846e559cdb61d2b544b05353 +--- + src/math/big/arith_loong64.s | 44 +++++++++++++++++++++++++++++++++++- + 1 file changed, 43 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index 02d8262129..1820988d3f 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -119,8 +119,50 @@ done: + MOVV R10, c+56(FP) + RET + ++// func shlVU(z, x []Word, s uint) (c Word) + TEXT ·shlVU(SB),NOSPLIT,$0 +- JMP ·shlVU_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: s ++ MOVV z_len+8(FP), R5 ++ MOVV s+48(FP), R10 ++ MOVV z+0(FP), R4 ++ MOVV x+24(FP), R7 ++ BEQ R5, len0 ++ SLLV $3, R5 ++ BEQ R10, copy ++ MOVV $64, R9 ++ ADDV $-8, R7 // &x[-1] ++ SUB R10, R9 // ŝ = 64 - s ++ MOVV (R5)(R7), R6 ++ SRLV R9, R6, R8 // c = x[len(z)-1] >> ŝ ++loop: ++ ADDV $-8, R5 ++ BEQ R5, done ++ SLLV R10, R6, R12 ++ MOVV (R5)(R7), R6 ++ SRLV R9, R6, R11 ++ OR R11, R12 ++ MOVV R12, (R5)(R4) // z[i] = x[i]<>ŝ ++ JMP loop ++done: ++ SLLV R10, R6 ++ MOVV R8, c+56(FP) ++ MOVV R6, 0(R4) // z[0] = x[0] << s ++ RET ++copy: ++ BEQ R7, R4, len0 ++copyloop: ++ ADDV $-8, R5 ++ BLT R5, R0, len0 ++ MOVV (R5)(R7), R9 ++ MOVV R9, (R5)(R4) ++ JMP copyloop ++len0: ++ MOVV R0, c+56(FP) ++ RET + + TEXT ·shrVU(SB),NOSPLIT,$0 + JMP ·shrVU_g(SB) +-- +2.38.1 + diff --git a/0017-math-big-optimize-shrVU-function-for-loong64.patch b/0017-math-big-optimize-shrVU-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..504501900a094f24b89d68fa1e33f51664a5575e --- /dev/null +++ b/0017-math-big-optimize-shrVU-function-for-loong64.patch @@ -0,0 +1,92 @@ +From 14d44d92f1d59c42e85bd89797a3730f48699dc6 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Tue, 18 Jun 2024 02:00:38 +0000 +Subject: [PATCH 17/44] math/big: optimize shrVU function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3A5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3A5000-HV @ 2500.00MHz + │ test/old_3a5000_shrvu.log │ test/new_3a5000_shrvu.log │ + │ sec/op │ sec/op vs base │ +NonZeroShifts/1/shrVU 7.968n ± 0% 5.210n ± 0% -34.62% (p=0.000 n=20) +NonZeroShifts/2/shrVU 9.608n ± 0% 6.178n ± 0% -35.70% (p=0.000 n=20) +NonZeroShifts/3/shrVU 11.400n ± 0% 7.419n ± 0% -34.92% (p=0.000 n=20) +NonZeroShifts/4/shrVU 13.350n ± 0% 9.159n ± 0% -31.39% (p=0.000 n=20) +NonZeroShifts/5/shrVU 15.93n ± 0% 10.58n ± 0% -33.58% (p=0.000 n=20) +NonZeroShifts/10/shrVU 24.42n ± 0% 15.70n ± 0% -35.71% (p=0.000 n=20) +NonZeroShifts/100/shrVU 190.60n ± 0% 90.87n ± 0% -52.32% (p=0.000 n=20) +NonZeroShifts/1000/shrVU 1782.0n ± 0% 811.5n ± 0% -54.46% (p=0.000 n=20) +NonZeroShifts/10000/shrVU 21.54µ ± 0% 12.55µ ± 0% -41.76% (p=0.000 n=20) +NonZeroShifts/100000/shrVU 224.1µ ± 0% 126.2µ ± 0% -43.71% (p=0.000 n=20) +geomean 153.9n 91.78n -40.35% + +Change-Id: I86f1f3ac44d60ad8dc2e77bdb9b541f55eb18e74 +--- + src/math/big/arith_loong64.s | 45 +++++++++++++++++++++++++++++++++++- + 1 file changed, 44 
insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index 1820988d3f..bdaaf14821 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -165,7 +165,50 @@ len0: + RET + + TEXT ·shrVU(SB),NOSPLIT,$0 +- JMP ·shrVU_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: s ++ MOVV z_len+8(FP), R5 ++ MOVV s+48(FP), R10 ++ MOVV z+0(FP), R4 ++ MOVV x+24(FP), R7 ++ BEQ R5, len0 ++ SLLV $3, R5 ++ BEQ R10, copy ++ MOVV 0(R7), R6 ++ MOVV $64, R9 ++ MOVV $8, R8 ++ SUB R10, R9 // ŝ = 64 - s ++ ADDV $-8, R4 // &z[-1] ++ SLLV R9, R6, R13 // c = x[0] << ŝ ++loop: ++ BEQ R5, R8, done ++ SRLV R10, R6, R12 ++ MOVV (R8)(R7), R6 ++ SLLV R9, R6, R11 ++ OR R11, R12 ++ MOVV R12, (R8)(R4) // z[i-1] = x[i-1]>>s | x[i]<<ŝ ++ ADDV $8, R8 ++ JMP loop ++done: ++ SRLV R10, R6 ++ MOVV R13, c+56(FP) ++ MOVV R6, (R8)(R4) // z[len(z)-1] = x[len(z)-1] >> s ++ RET ++copy: ++ MOVV $0, R8 ++ BEQ R7, R4, len0 ++copyloop: ++ BEQ R5, R8, len0 ++ MOVV (R8)(R7), R9 ++ MOVV R9, (R8)(R4) ++ ADDV $8, R8 ++ JMP copyloop ++len0: ++ MOVV R0, c+56(FP) ++ RET + + TEXT ·mulAddVWW(SB),NOSPLIT,$0 + JMP ·mulAddVWW_g(SB) +-- +2.38.1 + diff --git a/0018-math-big-optimize-mulAddVWW-function-for-loong64.patch b/0018-math-big-optimize-mulAddVWW-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..0ad375fa32d09ecea23d0d692da2b3adc95add16 --- /dev/null +++ b/0018-math-big-optimize-mulAddVWW-function-for-loong64.patch @@ -0,0 +1,77 @@ +From b956f69c885cd7fdf5305fd4047fd939000c9745 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Wed, 19 Jun 2024 06:31:00 +0000 +Subject: [PATCH 18/44] math/big: optimize mulAddVWW function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3A5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3A5000-HV @ 2500.00MHz + │ test/old_3a5000_muladdvww.log │ test/new_3a5000_muladdvww.log │ + │ sec/op │ sec/op vs base │ +MulAddVWW/1 7.606n ± 0% 6.987n ± 0% -8.14% (p=0.000 n=20) +MulAddVWW/2 9.207n ± 0% 8.567n ± 0% -6.95% (p=0.000 n=20) +MulAddVWW/3 10.810n ± 0% 9.223n ± 0% -14.68% (p=0.000 n=20) +MulAddVWW/4 13.01n ± 0% 12.41n ± 0% -4.61% (p=0.000 n=20) +MulAddVWW/5 15.79n ± 0% 12.99n ± 0% -17.73% (p=0.000 n=20) +MulAddVWW/10 25.62n ± 0% 20.02n ± 0% -21.86% (p=0.000 n=20) +MulAddVWW/100 217.0n ± 0% 170.9n ± 0% -21.24% (p=0.000 n=20) +MulAddVWW/1000 2.064µ ± 0% 1.612µ ± 0% -21.90% (p=0.000 n=20) +MulAddVWW/10000 24.50µ ± 0% 16.74µ ± 0% -31.66% (p=0.000 n=20) +MulAddVWW/100000 239.1µ ± 0% 171.1µ ± 0% -28.45% (p=0.000 n=20) +geomean 159.2n 130.3n -18.18% + +Change-Id: I063434bc382f4f1234f879172ab671a3d6f2eb80 +--- + src/math/big/arith_loong64.s | 29 ++++++++++++++++++++++++++++- + 1 file changed, 28 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index bdaaf14821..fe7c971120 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -210,8 +210,35 @@ len0: + MOVV R0, c+56(FP) + RET + ++// func mulAddVWW(z, x []Word, y, r Word) (c Word) + TEXT ·mulAddVWW(SB),NOSPLIT,$0 +- JMP ·mulAddVWW_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ // R11: r ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV r+56(FP), R11 ++ SLLV $3, R5 ++ MOVV $0, R6 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R8 ++ MULV R8, R10, R9 ++ 
MULHVU R8, R10, R12 ++ ADDV R9, R11, R8 ++ SGTU R9, R8, R11 // if (c' = lo + c) < lo then overflow ++ MOVV R8, (R6)(R4) ++ ADDV R12, R11 ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R11, c+64(FP) ++ RET + + TEXT ·addMulVVW(SB),NOSPLIT,$0 + JMP ·addMulVVW_g(SB) +-- +2.38.1 + diff --git a/0019-math-big-optimize-addMulVVW-function-for-loong64.patch b/0019-math-big-optimize-addMulVVW-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..965a89c81ebc046ba70e60c87a94fc970c1005c4 --- /dev/null +++ b/0019-math-big-optimize-addMulVVW-function-for-loong64.patch @@ -0,0 +1,77 @@ +From e7a6135d5c0fc4685ad18a82e770acf9f226b08e Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Wed, 19 Jun 2024 08:05:24 +0000 +Subject: [PATCH 19/44] math/big: optimize addMulVVW function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3A5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3A5000-HV @ 2500.00MHz + │ test/old_3a5000_addmulvvw.log │ test/new_3a5000_addmulvvw.log │ + │ sec/op │ sec/op vs base │ +AddMulVVW/1 9.208n ± 0% 5.777n ± 0% -37.26% (p=0.000 n=20) +AddMulVVW/2 11.950n ± 0% 7.763n ± 0% -35.04% (p=0.000 n=20) +AddMulVVW/3 14.01n ± 0% 10.41n ± 0% -25.70% (p=0.000 n=20) +AddMulVVW/4 16.01n ± 0% 13.21n ± 0% -17.49% (p=0.000 n=20) +AddMulVVW/5 18.01n ± 0% 14.12n ± 0% -21.57% (p=0.000 n=20) +AddMulVVW/10 29.60n ± 0% 23.35n ± 0% -21.11% (p=0.000 n=20) +AddMulVVW/100 273.4n ± 0% 173.8n ± 0% -36.43% (p=0.000 n=20) +AddMulVVW/1000 2.516µ ± 0% 1.615µ ± 0% -35.81% (p=0.000 n=20) +AddMulVVW/10000 30.31µ ± 0% 21.54µ ± 0% -28.93% (p=0.000 n=20) +AddMulVVW/100000 322.5µ ± 0% 234.1µ ± 0% -27.41% (p=0.000 n=20) +geomean 197.1n 139.9n -29.00% + +Change-Id: Ib7e95b50f7af893abee72ec26948a65115455692 +--- + src/math/big/arith_loong64.s | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index fe7c971120..012af94f5c 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -240,5 +240,35 @@ done: + MOVV R11, c+64(FP) + RET + ++// func addMulVVW(z, x []Word, y Word) (c Word) + TEXT ·addMulVVW(SB),NOSPLIT,$0 +- JMP ·addMulVVW_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV z+0(FP), R4 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++ MOVV $0, R11 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R8 ++ MOVV (R6)(R4), R9 ++ MULV R8, R10, R12 ++ MULHVU R8, R10, R13 ++ ADDV R12, R9, R8 ++ SGTU R12, R8, R9 ++ ADDV R13, R9 ++ ADDV R8, R11, R12 ++ SGTU R8, R12, R11 ++ MOVV R12, (R6)(R4) ++ ADDV $8, R6 ++ ADDV R9, R11 ++ JMP loop ++done: ++ MOVV R11, c+56(FP) ++ RET +-- +2.38.1 + diff --git a/0020-cmd-compile-fold-constant-shift-with-extension-on-lo.patch b/0020-cmd-compile-fold-constant-shift-with-extension-on-lo.patch new file mode 100644 index 0000000000000000000000000000000000000000..48553defe786efdc60d7eafbccac34ccc458148f --- /dev/null +++ b/0020-cmd-compile-fold-constant-shift-with-extension-on-lo.patch @@ -0,0 +1,376 @@ +From f10d1a3db9650a738d0254a58aadb62ec89eaca9 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Tue, 24 Sep 2024 16:59:06 +0800 +Subject: [PATCH 20/44] cmd/compile: fold constant shift with extension on + loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux 
+goarch: loong64 +pkg: test/bench/go1 +cpu: Loongson-3A6000 @ 2500.00MHz + │ bench.old │ bench.new │ + │ sec/op │ sec/op vs base │ +BinaryTree17 7.775 ± 1% 7.747 ± 1% ~ (p=0.713 n=15) +Fannkuch11 2.645 ± 0% 2.646 ± 0% +0.05% (p=0.002 n=15) +FmtFprintfEmpty 35.87n ± 0% 35.85n ± 0% -0.06% (p=0.000 n=15) +FmtFprintfString 59.50n ± 0% 59.17n ± 0% -0.55% (p=0.000 n=15) +FmtFprintfInt 62.03n ± 0% 62.38n ± 0% +0.56% (p=0.000 n=15) +FmtFprintfIntInt 97.73n ± 0% 96.51n ± 0% -1.25% (p=0.000 n=15) +FmtFprintfPrefixedInt 116.6n ± 0% 118.8n ± 0% +1.89% (p=0.000 n=15) +FmtFprintfFloat 204.1n ± 0% 200.3n ± 0% -1.86% (p=0.000 n=15) +FmtManyArgs 455.1n ± 0% 464.8n ± 0% +2.13% (p=0.000 n=15) +GobDecode 7.127m ± 1% 7.063m ± 1% -0.89% (p=0.033 n=15) +GobEncode 8.061m ± 1% 8.069m ± 5% ~ (p=0.870 n=15) +Gzip 279.8m ± 0% 271.4m ± 0% -3.00% (p=0.000 n=15) +Gunzip 32.63m ± 0% 31.68m ± 0% -2.93% (p=0.000 n=15) +HTTPClientServer 53.39µ ± 0% 53.12µ ± 0% -0.51% (p=0.000 n=15) +JSONEncode 9.323m ± 0% 8.990m ± 1% -3.57% (p=0.000 n=15) +JSONDecode 46.65m ± 1% 46.58m ± 0% ~ (p=0.050 n=15) +Mandelbrot200 4.600m ± 0% 4.603m ± 0% +0.06% (p=0.000 n=15) +GoParse 4.651m ± 0% 4.765m ± 1% +2.45% (p=0.000 n=15) +RegexpMatchEasy0_32 59.64n ± 0% 58.26n ± 0% -2.31% (p=0.000 n=15) +RegexpMatchEasy0_1K 457.3n ± 0% 458.0n ± 0% +0.15% (p=0.002 n=15) +RegexpMatchEasy1_32 59.24n ± 0% 60.12n ± 0% +1.49% (p=0.000 n=15) +RegexpMatchEasy1_1K 556.6n ± 0% 556.9n ± 0% +0.05% (p=0.002 n=15) +RegexpMatchMedium_32 801.5n ± 0% 799.5n ± 0% -0.25% (p=0.000 n=15) +RegexpMatchMedium_1K 27.25µ ± 0% 27.21µ ± 0% -0.15% (p=0.001 n=15) +RegexpMatchHard_32 1.382µ ± 0% 1.412µ ± 0% +2.17% (p=0.000 n=15) +RegexpMatchHard_1K 40.84µ ± 0% 40.91µ ± 0% +0.18% (p=0.000 n=15) +Revcomp 474.5m ± 0% 473.9m ± 0% ~ (p=0.081 n=15) +Template 76.85m ± 1% 74.71m ± 1% -2.79% (p=0.000 n=15) +TimeParse 271.1n ± 0% 269.1n ± 0% -0.74% (p=0.000 n=15) +TimeFormat 289.5n ± 0% 287.5n ± 0% -0.69% (p=0.000 n=15) +geomean 51.59µ 51.40µ -0.38% + +Change-Id: I721e930c30b3d1cb88a79306ec51990505d850f1 +--- + .../internal/ssa/_gen/LOONG64latelower.rules | 19 ++ + src/cmd/compile/internal/ssa/config.go | 2 + + .../internal/ssa/rewriteLOONG64latelower.go | 246 ++++++++++++++++++ + test/codegen/shift.go | 3 + + 4 files changed, 270 insertions(+) + create mode 100644 src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules + create mode 100644 src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go + +diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules +new file mode 100644 +index 0000000000..1158f84422 +--- /dev/null ++++ b/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules +@@ -0,0 +1,19 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++// Fold constant shift with extension. 
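++//
++// The extension can be dropped if the narrow value is first shifted to
++// the top of the 64-bit register. Roughly, for a byte in the low 8
++// bits of x and a constant c < 8 (a sketch of the identity, not rule
++// syntax):
++//
++//	signExt8(x) >> c == (x << 56) >> (56+c) // arithmetic shift right
++//	zeroExt8(x) >> c == (x << 56) >> (56+c) // logical shift right
++//
++// since the left shift by 56 moves the byte's top bit to bit 63, the
++// right shift re-extends it while discarding the low c bits.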
++(SRAVconst (MOVBreg x) [c]) && c < 8 => (SRAVconst (SLLVconst x [56]) [56+c]) ++(SRAVconst (MOVHreg x) [c]) && c < 16 => (SRAVconst (SLLVconst x [48]) [48+c]) ++(SRAVconst (MOVWreg x) [c]) && c < 32 => (SRAVconst (SLLVconst x [32]) [32+c]) ++(SRLVconst (MOVBUreg x) [c]) && c < 8 => (SRLVconst (SLLVconst x [56]) [56+c]) ++(SRLVconst (MOVHUreg x) [c]) && c < 16 => (SRLVconst (SLLVconst x [48]) [48+c]) ++(SRLVconst (MOVWUreg x) [c]) && c < 32 => (SRLVconst (SLLVconst x [32]) [32+c]) ++(SLLVconst (MOVBUreg x) [c]) && c <= 56 => (SRLVconst (SLLVconst x [56]) [56-c]) ++(SLLVconst (MOVHUreg x) [c]) && c <= 48 => (SRLVconst (SLLVconst x [48]) [48-c]) ++(SLLVconst (MOVWUreg x) [c]) && c <= 32 => (SRLVconst (SLLVconst x [32]) [32-c]) ++ ++// Shift by zero. ++(SRAVconst x [0]) => x ++(SRLVconst x [0]) => x ++(SLLVconst x [0]) => x +diff --git a/src/cmd/compile/internal/ssa/config.go b/src/cmd/compile/internal/ssa/config.go +index d674cca009..9c4f60f613 100644 +--- a/src/cmd/compile/internal/ssa/config.go ++++ b/src/cmd/compile/internal/ssa/config.go +@@ -280,6 +280,8 @@ func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat boo + c.RegSize = 8 + c.lowerBlock = rewriteBlockLOONG64 + c.lowerValue = rewriteValueLOONG64 ++ c.lateLowerBlock = rewriteBlockLOONG64latelower ++ c.lateLowerValue = rewriteValueLOONG64latelower + c.registers = registersLOONG64[:] + c.gpRegMask = gpRegMaskLOONG64 + c.fpRegMask = fpRegMaskLOONG64 +diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go b/src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go +new file mode 100644 +index 0000000000..f092b0a1ef +--- /dev/null ++++ b/src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go +@@ -0,0 +1,246 @@ ++// Code generated from _gen/LOONG64latelower.rules using 'go generate'; DO NOT EDIT. 
++ ++package ssa ++ ++func rewriteValueLOONG64latelower(v *Value) bool { ++ switch v.Op { ++ case OpLOONG64SLLVconst: ++ return rewriteValueLOONG64latelower_OpLOONG64SLLVconst(v) ++ case OpLOONG64SRAVconst: ++ return rewriteValueLOONG64latelower_OpLOONG64SRAVconst(v) ++ case OpLOONG64SRLVconst: ++ return rewriteValueLOONG64latelower_OpLOONG64SRLVconst(v) ++ } ++ return false ++} ++func rewriteValueLOONG64latelower_OpLOONG64SLLVconst(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (SLLVconst (MOVBUreg x) [c]) ++ // cond: c <= 56 ++ // result: (SRLVconst (SLLVconst x [56]) [56-c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVBUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c <= 56) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(56 - c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SLLVconst (MOVHUreg x) [c]) ++ // cond: c <= 48 ++ // result: (SRLVconst (SLLVconst x [48]) [48-c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVHUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c <= 48) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(48 - c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SLLVconst (MOVWUreg x) [c]) ++ // cond: c <= 32 ++ // result: (SRLVconst (SLLVconst x [32]) [32-c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c <= 32) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(32 - c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SLLVconst x [0]) ++ // result: x ++ for { ++ if auxIntToInt64(v.AuxInt) != 0 { ++ break ++ } ++ x := v_0 ++ v.copyOf(x) ++ return true ++ } ++ return false ++} ++func rewriteValueLOONG64latelower_OpLOONG64SRAVconst(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (SRAVconst (MOVBreg x) [c]) ++ // cond: c < 8 ++ // result: (SRAVconst (SLLVconst x [56]) [56+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVBreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 8) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(56 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.Int64) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst (MOVHreg x) [c]) ++ // cond: c < 16 ++ // result: (SRAVconst (SLLVconst x [48]) [48+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVHreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 16) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(48 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.Int64) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst (MOVWreg x) [c]) ++ // cond: c < 32 ++ // result: (SRAVconst (SLLVconst x [32]) [32+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 32) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(32 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.Int64) ++ 
v0.AuxInt = int64ToAuxInt(32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst x [0]) ++ // result: x ++ for { ++ if auxIntToInt64(v.AuxInt) != 0 { ++ break ++ } ++ x := v_0 ++ v.copyOf(x) ++ return true ++ } ++ return false ++} ++func rewriteValueLOONG64latelower_OpLOONG64SRLVconst(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (SRLVconst (MOVBUreg x) [c]) ++ // cond: c < 8 ++ // result: (SRLVconst (SLLVconst x [56]) [56+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVBUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 8) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(56 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRLVconst (MOVHUreg x) [c]) ++ // cond: c < 16 ++ // result: (SRLVconst (SLLVconst x [48]) [48+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVHUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 16) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(48 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRLVconst (MOVWUreg x) [c]) ++ // cond: c < 32 ++ // result: (SRLVconst (SLLVconst x [32]) [32+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 32) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(32 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRLVconst x [0]) ++ // result: x ++ for { ++ if auxIntToInt64(v.AuxInt) != 0 { ++ break ++ } ++ x := v_0 ++ v.copyOf(x) ++ return true ++ } ++ return false ++} ++func rewriteBlockLOONG64latelower(b *Block) bool { ++ return false ++} +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 2d8cf86857..ad69d69aa5 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -61,18 +61,21 @@ func rshConst64x64Overflow8(v int8) int64 { + func lshConst32x64(v int32) int32 { + // ppc64x:"SLW" + // riscv64:"SLLI",-"AND",-"SLTIU", -"MOVW" ++ // loong64:"SLLV" + return v << uint64(29) + } + + func rshConst32Ux64(v uint32) uint32 { + // ppc64x:"SRW" + // riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW" ++ // loong64:"SLLV","SRLV",-"MOVWU" + return v >> uint64(29) + } + + func rshConst32x64(v int32) int32 { + // ppc64x:"SRAW" + // riscv64:"SRAIW",-"OR",-"SLTIU", -"MOVW" ++ // loong64:"SLLV","SRAV",-"MOVW" + return v >> uint64(29) + } + +-- +2.38.1 + diff --git a/0021-test-codegen-fix-the-matching-instructions-inside-pl.patch b/0021-test-codegen-fix-the-matching-instructions-inside-pl.patch new file mode 100644 index 0000000000000000000000000000000000000000..f9e0b7be338df01040c2c024600d5efd4d90c443 --- /dev/null +++ b/0021-test-codegen-fix-the-matching-instructions-inside-pl.patch @@ -0,0 +1,31 @@ +From 53fc992fd2ba2f64eb436c5cf210e31e70282fc0 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Tue, 8 Oct 2024 16:23:56 +0800 +Subject: [PATCH 21/44] test/codegen: fix the matching instructions inside + plain comments for func rshConst32Ux64 on loong64 + +after add rules for (x << lc) >> rc in commit "cmd/compile: add patterns +for bitfield opcodes on loong64", the generated assembly from func +rshConst32Ux64 matches 
BSTRPICKV, not SLLV and SRLV. + +Change-Id: I4348716156abc3410134495edb977a88727139f8 +--- + test/codegen/shift.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index ad69d69aa5..6112a989b9 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -68,7 +68,7 @@ func lshConst32x64(v int32) int32 { + func rshConst32Ux64(v uint32) uint32 { + // ppc64x:"SRW" + // riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW" +- // loong64:"SLLV","SRLV",-"MOVWU" ++ // loong64:"BSTRPICKV",-"SLLV",-"SRLV",-"MOVWU" + return v >> uint64(29) + } + +-- +2.38.1 + diff --git a/0022-cmd-compile-optimize-shifts-of-int32-and-uint32-on-l.patch b/0022-cmd-compile-optimize-shifts-of-int32-and-uint32-on-l.patch new file mode 100644 index 0000000000000000000000000000000000000000..c119f2e94194934fc073159898e1795f4b99b70a --- /dev/null +++ b/0022-cmd-compile-optimize-shifts-of-int32-and-uint32-on-l.patch @@ -0,0 +1,1064 @@ +From 2ab1123adf4a080d91ef549b76572bf4b22f907f Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Thu, 24 Oct 2024 17:41:01 +0800 +Subject: [PATCH 22/44] cmd/compile: optimize shifts of int32 and uint32 on + loong64 + +Change-Id: I6b8d110cfed8d55e2b753259a45f55e09b8f759d +--- + src/cmd/compile/internal/loong64/ssa.go | 6 + + .../compile/internal/ssa/_gen/LOONG64.rules | 39 +- + .../compile/internal/ssa/_gen/LOONG64Ops.go | 6 + + src/cmd/compile/internal/ssa/opGen.go | 90 ++++ + .../compile/internal/ssa/rewriteLOONG64.go | 431 +++++++++++++----- + test/codegen/shift.go | 20 +- + 6 files changed, 462 insertions(+), 130 deletions(-) + +diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go +index 0ba9efa1d3..bd761c407e 100644 +--- a/src/cmd/compile/internal/loong64/ssa.go ++++ b/src/cmd/compile/internal/loong64/ssa.go +@@ -165,8 +165,11 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpLOONG64OR, + ssa.OpLOONG64XOR, + ssa.OpLOONG64NOR, ++ ssa.OpLOONG64SLL, + ssa.OpLOONG64SLLV, ++ ssa.OpLOONG64SRL, + ssa.OpLOONG64SRLV, ++ ssa.OpLOONG64SRA, + ssa.OpLOONG64SRAV, + ssa.OpLOONG64ROTR, + ssa.OpLOONG64ROTRV, +@@ -274,8 +277,11 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpLOONG64ORconst, + ssa.OpLOONG64XORconst, + ssa.OpLOONG64NORconst, ++ ssa.OpLOONG64SLLconst, + ssa.OpLOONG64SLLVconst, ++ ssa.OpLOONG64SRLconst, + ssa.OpLOONG64SRLVconst, ++ ssa.OpLOONG64SRAconst, + ssa.OpLOONG64SRAVconst, + ssa.OpLOONG64ROTRconst, + ssa.OpLOONG64ROTRVconst, +diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +index 00a0a84f33..014cd6fb05 100644 +--- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +@@ -62,10 +62,10 @@ + (Lsh64x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + (Lsh64x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + +-(Lsh32x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) +-(Lsh32x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Lsh32x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Lsh32x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++(Lsh32x64 x y) => (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) ++(Lsh32x32 x y) => (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) ++(Lsh32x16 x y) => (MASKEQZ (SLL x (ZeroExt16to64 
y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) ++(Lsh32x8 x y) => (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + + (Lsh16x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) + (Lsh16x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +@@ -82,10 +82,10 @@ + (Rsh64Ux16 x y) => (MASKEQZ (SRLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + (Rsh64Ux8 x y) => (MASKEQZ (SRLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + +-(Rsh32Ux64 x y) => (MASKEQZ (SRLV (ZeroExt32to64 x) y) (SGTU (MOVVconst [64]) y)) +-(Rsh32Ux32 x y) => (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Rsh32Ux16 x y) => (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Rsh32Ux8 x y) => (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++(Rsh32Ux64 x y) => (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) ++(Rsh32Ux32 x y) => (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) ++(Rsh32Ux16 x y) => (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) ++(Rsh32Ux8 x y) => (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + + (Rsh16Ux64 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) y) (SGTU (MOVVconst [64]) y)) + (Rsh16Ux32 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +@@ -102,10 +102,10 @@ + (Rsh64x16 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) + (Rsh64x8 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) + +-(Rsh32x64 x y) => (SRAV (SignExt32to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) +-(Rsh32x32 x y) => (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +-(Rsh32x16 x y) => (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) +-(Rsh32x8 x y) => (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) ++(Rsh32x64 x y) => (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y)) ++(Rsh32x32 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y))) ++(Rsh32x16 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [31]))) (ZeroExt16to64 y))) ++(Rsh32x8 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y))) + + (Rsh16x64 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) + (Rsh16x32 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +@@ -683,15 +683,30 @@ + (XOR x (MOVVconst [c])) && is32Bit(c) => (XORconst [c] x) + (NOR x (MOVVconst [c])) && is32Bit(c) => (NORconst [c] x) + ++(SLL _ (MOVVconst [c])) && uint64(c)>=32 => (MOVVconst [0]) + (SLLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0]) ++(SRL _ (MOVVconst [c])) && uint64(c)>=32 => (MOVVconst [0]) + (SRLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0]) ++(SRA x (MOVVconst [c])) && uint64(c)>=32 => (SRAconst x [31]) + (SRAV x (MOVVconst [c])) && uint64(c)>=64 => (SRAVconst x [63]) ++(SLL x (MOVVconst [c])) && uint64(c) >=0 && uint64(c) <= 31 => (SLLconst x [c]) + (SLLV x (MOVVconst [c])) => (SLLVconst x [c]) ++(SRL x (MOVVconst [c])) && uint64(c) >=0 && uint64(c) <= 31 => (SRLconst x [c]) + (SRLV x (MOVVconst [c])) => (SRLVconst x [c]) ++(SRA x (MOVVconst 
[c])) && uint64(c) >=0 && uint64(c) <= 31 => (SRAconst x [c]) + (SRAV x (MOVVconst [c])) => (SRAVconst x [c]) + (ROTR x (MOVVconst [c])) => (ROTRconst x [c&31]) + (ROTRV x (MOVVconst [c])) => (ROTRVconst x [c&63]) + ++// Avoid unnecessary zero and sign extension when right shifting. ++(SRLVconst [rc] (MOVWUreg y)) && rc >= 0 && rc <= 31 => (SRLconst [int64(rc)] y) ++(SRAVconst [rc] (MOVWreg y)) && rc >= 0 && rc <= 31 => (SRAconst [int64(rc)] y) ++ ++// Replace right shifts that exceed size of signed type. ++(SRAVconst [rc] (MOVBreg y)) && rc >= 8 => (SRAVconst [63] (SLLVconst [56] y)) ++(SRAVconst [rc] (MOVHreg y)) && rc >= 16 => (SRAVconst [63] (SLLVconst [48] y)) ++(SRAVconst [rc] (MOVWreg y)) && rc >= 32 => (SRAconst [31] y) ++ + // If the shift amount is larger than the datasize(32, 16, 8), we can optimize to constant 0. + (MOVWUreg (SLLVconst [lc] x)) && lc >= 32 => (MOVVconst [0]) + (MOVHUreg (SLLVconst [lc] x)) && lc >= 16 => (MOVVconst [0]) +diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go +index 8f17158b64..4b3f1fd689 100644 +--- a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go +@@ -240,11 +240,17 @@ func init() { + {name: "FCOPYSGD", argLength: 2, reg: fp21, asm: "FCOPYSGD"}, // float64 + + // shifts ++ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << arg1, shift amount is mod 32 + {name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64 ++ {name: "SLLconst", argLength: 1, reg: gp11, asm: "SLL", aux: "Int64"}, // arg0 << auxInt + {name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt ++ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> arg1, unsigned, shift amount is mod 32 + {name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64 ++ {name: "SRLconst", argLength: 1, reg: gp11, asm: "SRL", aux: "Int64"}, // arg0 >> auxInt, unsigned + {name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned ++ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> arg1, signed, shift amount is mod 32 + {name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64 ++ {name: "SRAconst", argLength: 1, reg: gp11, asm: "SRA", aux: "Int64"}, // arg0 >> auxInt, signed + {name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed + {name: "ROTR", argLength: 2, reg: gp21, asm: "ROTR"}, // arg0 right rotate by (arg1 mod 32) bits + {name: "ROTRV", argLength: 2, reg: gp21, asm: "ROTRV"}, // arg0 right rotate by (arg1 mod 64) bits +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index df1ddfa69e..643d012ca1 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -1824,11 +1824,17 @@ const ( + OpLOONG64MASKEQZ + OpLOONG64MASKNEZ + OpLOONG64FCOPYSGD ++ OpLOONG64SLL + OpLOONG64SLLV ++ OpLOONG64SLLconst + OpLOONG64SLLVconst ++ OpLOONG64SRL + OpLOONG64SRLV ++ OpLOONG64SRLconst + OpLOONG64SRLVconst ++ OpLOONG64SRA + OpLOONG64SRAV ++ OpLOONG64SRAconst + OpLOONG64SRAVconst + OpLOONG64ROTR + OpLOONG64ROTRV +@@ -24541,6 +24547,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SLL", ++ argLen: 2, ++ asm: loong64.ASLL, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 
R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ {1, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SLLV", + argLen: 2, +@@ -24555,6 +24575,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SLLconst", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: loong64.ASLL, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SLLVconst", + auxType: auxInt64, +@@ -24569,6 +24603,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRL", ++ argLen: 2, ++ asm: loong64.ASRL, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ {1, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SRLV", + argLen: 2, +@@ -24583,6 +24631,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRLconst", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: loong64.ASRL, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SRLVconst", + auxType: auxInt64, +@@ -24597,6 +24659,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRA", ++ argLen: 2, ++ asm: loong64.ASRA, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ {1, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SRAV", + argLen: 2, +@@ -24611,6 +24687,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRAconst", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: loong64.ASRA, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SRAVconst", + auxType: auxInt64, +diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go +index ab39040de1..93bf95eb51 100644 +--- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go ++++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go +@@ -440,14 +440,20 @@ func rewriteValueLOONG64(v *Value) bool { + 
return rewriteValueLOONG64_OpLOONG64SGTUconst(v) + case OpLOONG64SGTconst: + return rewriteValueLOONG64_OpLOONG64SGTconst(v) ++ case OpLOONG64SLL: ++ return rewriteValueLOONG64_OpLOONG64SLL(v) + case OpLOONG64SLLV: + return rewriteValueLOONG64_OpLOONG64SLLV(v) + case OpLOONG64SLLVconst: + return rewriteValueLOONG64_OpLOONG64SLLVconst(v) ++ case OpLOONG64SRA: ++ return rewriteValueLOONG64_OpLOONG64SRA(v) + case OpLOONG64SRAV: + return rewriteValueLOONG64_OpLOONG64SRAV(v) + case OpLOONG64SRAVconst: + return rewriteValueLOONG64_OpLOONG64SRAVconst(v) ++ case OpLOONG64SRL: ++ return rewriteValueLOONG64_OpLOONG64SRL(v) + case OpLOONG64SRLV: + return rewriteValueLOONG64_OpLOONG64SRLV(v) + case OpLOONG64SRLVconst: +@@ -5953,6 +5959,43 @@ func rewriteValueLOONG64_OpLOONG64SGTconst(v *Value) bool { + } + return false + } ++func rewriteValueLOONG64_OpLOONG64SLL(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SLL _ (MOVVconst [c])) ++ // cond: uint64(c)>=32 ++ // result: (MOVVconst [0]) ++ for { ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 32) { ++ break ++ } ++ v.reset(OpLOONG64MOVVconst) ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } ++ // match: (SLL x (MOVVconst [c])) ++ // cond: uint64(c) >=0 && uint64(c) <= 31 ++ // result: (SLLconst x [c]) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 0 && uint64(c) <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SLLconst) ++ v.AuxInt = int64ToAuxInt(c) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueLOONG64_OpLOONG64SLLV(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6002,6 +6045,45 @@ func rewriteValueLOONG64_OpLOONG64SLLVconst(v *Value) bool { + } + return false + } ++func rewriteValueLOONG64_OpLOONG64SRA(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SRA x (MOVVconst [c])) ++ // cond: uint64(c)>=32 ++ // result: (SRAconst x [31]) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 32) { ++ break ++ } ++ v.reset(OpLOONG64SRAconst) ++ v.AuxInt = int64ToAuxInt(31) ++ v.AddArg(x) ++ return true ++ } ++ // match: (SRA x (MOVVconst [c])) ++ // cond: uint64(c) >=0 && uint64(c) <= 31 ++ // result: (SRAconst x [c]) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 0 && uint64(c) <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SRAconst) ++ v.AuxInt = int64ToAuxInt(c) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueLOONG64_OpLOONG64SRAV(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6039,6 +6121,85 @@ func rewriteValueLOONG64_OpLOONG64SRAV(v *Value) bool { + } + func rewriteValueLOONG64_OpLOONG64SRAVconst(v *Value) bool { + v_0 := v.Args[0] ++ b := v.Block ++ // match: (SRAVconst [rc] (MOVWreg y)) ++ // cond: rc >= 0 && rc <= 31 ++ // result: (SRAconst [int64(rc)] y) ++ for { ++ t := v.Type ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(rc >= 0 && rc <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SRAconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(int64(rc)) ++ v.AddArg(y) ++ return true ++ } ++ // match: (SRAVconst [rc] (MOVBreg y)) ++ // cond: rc >= 8 ++ // result: (SRAVconst [63] (SLLVconst [56] y)) ++ for { ++ t := v.Type ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVBreg { ++ break ++ 
} ++ y := v_0.Args[0] ++ if !(rc >= 8) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(63) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, t) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(y) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst [rc] (MOVHreg y)) ++ // cond: rc >= 16 ++ // result: (SRAVconst [63] (SLLVconst [48] y)) ++ for { ++ t := v.Type ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVHreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(rc >= 16) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(63) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, t) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(y) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst [rc] (MOVWreg y)) ++ // cond: rc >= 32 ++ // result: (SRAconst [31] y) ++ for { ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(rc >= 32) { ++ break ++ } ++ v.reset(OpLOONG64SRAconst) ++ v.AuxInt = int64ToAuxInt(31) ++ v.AddArg(y) ++ return true ++ } + // match: (SRAVconst [c] (MOVVconst [d])) + // result: (MOVVconst [d>>uint64(c)]) + for { +@@ -6053,6 +6214,43 @@ func rewriteValueLOONG64_OpLOONG64SRAVconst(v *Value) bool { + } + return false + } ++func rewriteValueLOONG64_OpLOONG64SRL(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SRL _ (MOVVconst [c])) ++ // cond: uint64(c)>=32 ++ // result: (MOVVconst [0]) ++ for { ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 32) { ++ break ++ } ++ v.reset(OpLOONG64MOVVconst) ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } ++ // match: (SRL x (MOVVconst [c])) ++ // cond: uint64(c) >=0 && uint64(c) <= 31 ++ // result: (SRLconst x [c]) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 0 && uint64(c) <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SRLconst) ++ v.AuxInt = int64ToAuxInt(c) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueLOONG64_OpLOONG64SRLV(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6157,6 +6355,25 @@ func rewriteValueLOONG64_OpLOONG64SRLVconst(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRLVconst [rc] (MOVWUreg y)) ++ // cond: rc >= 0 && rc <= 31 ++ // result: (SRLconst [int64(rc)] y) ++ for { ++ t := v.Type ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWUreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(rc >= 0 && rc <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SRLconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(int64(rc)) ++ v.AddArg(y) ++ return true ++ } + // match: (SRLVconst [rc] (MOVWUreg x)) + // cond: rc >= 32 + // result: (MOVVconst [0]) +@@ -7262,19 +7479,19 @@ func rewriteValueLOONG64_OpLsh32x16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Lsh32x16 x y) +- // result: (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++ // result: (MASKEQZ (SLL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) + v1.AddArg(y) + v0.AddArg2(x, v1) + v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v2.AddArg2(v3, 
v1) + v.AddArg2(v0, v2) + return true +@@ -7286,19 +7503,19 @@ func rewriteValueLOONG64_OpLsh32x32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Lsh32x32 x y) +- // result: (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++ // result: (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) + v1.AddArg(y) + v0.AddArg2(x, v1) + v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v2.AddArg2(v3, v1) + v.AddArg2(v0, v2) + return true +@@ -7310,17 +7527,17 @@ func rewriteValueLOONG64_OpLsh32x64(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Lsh32x64 x y) +- // result: (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) ++ // result: (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v0.AddArg2(x, y) + v1 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) + v2 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v2.AuxInt = int64ToAuxInt(64) ++ v2.AuxInt = int64ToAuxInt(32) + v1.AddArg2(v2, y) + v.AddArg2(v0, v1) + return true +@@ -7332,19 +7549,19 @@ func rewriteValueLOONG64_OpLsh32x8(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Lsh32x8 x y) +- // result: (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ // result: (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) + v1.AddArg(y) + v0.AddArg2(x, v1) + v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v2.AddArg2(v3, v1) + v.AddArg2(v0, v2) + return true +@@ -8694,23 +8911,21 @@ func rewriteValueLOONG64_OpRsh32Ux16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32Ux16 x y) +- // result: (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++ // result: (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v4.AuxInt = int64ToAuxInt(64) +- v3.AddArg2(v4, v2) +- v.AddArg2(v0, v3) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) ++ v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v1.AddArg(y) ++ v0.AddArg2(x, v1) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v3.AuxInt = int64ToAuxInt(32) ++ v2.AddArg2(v3, v1) ++ v.AddArg2(v0, v2) + return true + } + } +@@ -8720,23 
+8935,21 @@ func rewriteValueLOONG64_OpRsh32Ux32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32Ux32 x y) +- // result: (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++ // result: (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v4.AuxInt = int64ToAuxInt(64) +- v3.AddArg2(v4, v2) +- v.AddArg2(v0, v3) ++ v1.AddArg(y) ++ v0.AddArg2(x, v1) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v3.AuxInt = int64ToAuxInt(32) ++ v2.AddArg2(v3, v1) ++ v.AddArg2(v0, v2) + return true + } + } +@@ -8746,21 +8959,19 @@ func rewriteValueLOONG64_OpRsh32Ux64(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32Ux64 x y) +- // result: (MASKEQZ (SRLV (ZeroExt32to64 x) y) (SGTU (MOVVconst [64]) y)) ++ // result: (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(64) +- v2.AddArg2(v3, y) +- v.AddArg2(v0, v2) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v2 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v2.AuxInt = int64ToAuxInt(32) ++ v1.AddArg2(v2, y) ++ v.AddArg2(v0, v1) + return true + } + } +@@ -8770,23 +8981,21 @@ func rewriteValueLOONG64_OpRsh32Ux8(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32Ux8 x y) +- // result: (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ // result: (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v4.AuxInt = int64ToAuxInt(64) +- v3.AddArg2(v4, v2) +- v.AddArg2(v0, v3) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) ++ v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v1.AddArg(y) ++ v0.AddArg2(x, v1) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v3.AuxInt = int64ToAuxInt(32) ++ v2.AddArg2(v3, v1) ++ v.AddArg2(v0, v2) + return true + } + } +@@ -8796,25 +9005,23 @@ func rewriteValueLOONG64_OpRsh32x16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32x16 x y) +- // result: (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) ++ // result: (SRA x (OR (NEGV (SGTU (ZeroExt16to64 
y) (MOVVconst [31]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 +- v.reset(OpLOONG64SRAV) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpLOONG64OR, t) +- v2 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v4.AddArg(y) +- v5 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v5.AuxInt = int64ToAuxInt(63) +- v3.AddArg2(v4, v5) +- v2.AddArg(v3) +- v1.AddArg2(v2, v4) +- v.AddArg2(v0, v1) ++ v.reset(OpLOONG64SRA) ++ v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) ++ v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v3.AddArg(y) ++ v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v4.AuxInt = int64ToAuxInt(31) ++ v2.AddArg2(v3, v4) ++ v1.AddArg(v2) ++ v0.AddArg2(v1, v3) ++ v.AddArg2(x, v0) + return true + } + } +@@ -8824,25 +9031,23 @@ func rewriteValueLOONG64_OpRsh32x32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32x32 x y) +- // result: (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) ++ // result: (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 +- v.reset(OpLOONG64SRAV) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpLOONG64OR, t) +- v2 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v4.AddArg(y) +- v5 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v5.AuxInt = int64ToAuxInt(63) +- v3.AddArg2(v4, v5) +- v2.AddArg(v3) +- v1.AddArg2(v2, v4) +- v.AddArg2(v0, v1) ++ v.reset(OpLOONG64SRA) ++ v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) ++ v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v3.AddArg(y) ++ v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v4.AuxInt = int64ToAuxInt(31) ++ v2.AddArg2(v3, v4) ++ v1.AddArg(v2) ++ v0.AddArg2(v1, v3) ++ v.AddArg2(x, v0) + return true + } + } +@@ -8852,23 +9057,21 @@ func rewriteValueLOONG64_OpRsh32x64(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32x64 x y) +- // result: (SRAV (SignExt32to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) ++ // result: (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 +- v.reset(OpLOONG64SRAV) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpLOONG64OR, t) +- v2 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v4.AuxInt = int64ToAuxInt(63) +- v3.AddArg2(y, v4) +- v2.AddArg(v3) +- v1.AddArg2(v2, y) +- v.AddArg2(v0, v1) ++ v.reset(OpLOONG64SRA) ++ v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) ++ v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v3.AuxInt = int64ToAuxInt(31) ++ v2.AddArg2(y, v3) ++ v1.AddArg(v2) ++ v0.AddArg2(v1, y) ++ v.AddArg2(x, v0) + return true + } + } +@@ -8878,25 +9081,23 @@ func rewriteValueLOONG64_OpRsh32x8(v *Value) bool { + b := v.Block + typ := 
&b.Func.Config.Types + // match: (Rsh32x8 x y) +- // result: (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) ++ // result: (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 +- v.reset(OpLOONG64SRAV) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpLOONG64OR, t) +- v2 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v4.AddArg(y) +- v5 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v5.AuxInt = int64ToAuxInt(63) +- v3.AddArg2(v4, v5) +- v2.AddArg(v3) +- v1.AddArg2(v2, v4) +- v.AddArg2(v0, v1) ++ v.reset(OpLOONG64SRA) ++ v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) ++ v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v3.AddArg(y) ++ v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v4.AuxInt = int64ToAuxInt(31) ++ v2.AddArg2(v3, v4) ++ v1.AddArg(v2) ++ v0.AddArg2(v1, v3) ++ v.AddArg2(x, v0) + return true + } + } +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 6112a989b9..3c669edcb2 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -11,87 +11,99 @@ package codegen + // ------------------ // + + func lshConst64x64(v int64) int64 { ++ // loong64:"SLLV" + // ppc64x:"SLD" + // riscv64:"SLLI",-"AND",-"SLTIU" + return v << uint64(33) + } + + func rshConst64Ux64(v uint64) uint64 { ++ // loong64:"SRLV" + // ppc64x:"SRD" + // riscv64:"SRLI\t",-"AND",-"SLTIU" + return v >> uint64(33) + } + + func rshConst64Ux64Overflow32(v uint32) uint64 { ++ // loong64:"MOVV\t\\$0,",-"SRL\t" + // riscv64:"MOV\t\\$0,",-"SRL" + return uint64(v) >> 32 + } + + func rshConst64Ux64Overflow16(v uint16) uint64 { ++ // loong64:"MOVV\t\\$0,",-"SRLV" + // riscv64:"MOV\t\\$0,",-"SRL" + return uint64(v) >> 16 + } + + func rshConst64Ux64Overflow8(v uint8) uint64 { ++ // loong64:"MOVV\t\\$0,",-"SRLV" + // riscv64:"MOV\t\\$0,",-"SRL" + return uint64(v) >> 8 + } + + func rshConst64x64(v int64) int64 { ++ // loong64:"SRAV" + // ppc64x:"SRAD" + // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint64(33) + } + + func rshConst64x64Overflow32(v int32) int64 { ++ // loong64:"SRA\t\\$31" + // riscv64:"SRAIW",-"SLLI",-"SRAI\t" + return int64(v) >> 32 + } + + func rshConst64x64Overflow16(v int16) int64 { ++ // loong64:"SLLV\t\\$48","SRAV\t\\$63" + // riscv64:"SLLI","SRAI",-"SRAIW" + return int64(v) >> 16 + } + + func rshConst64x64Overflow8(v int8) int64 { ++ // loong64:"SLLV\t\\$56","SRAV\t\\$63" + // riscv64:"SLLI","SRAI",-"SRAIW" + return int64(v) >> 8 + } + + func lshConst32x64(v int32) int32 { ++ // loong64:"SLL\t" + // ppc64x:"SLW" + // riscv64:"SLLI",-"AND",-"SLTIU", -"MOVW" +- // loong64:"SLLV" + return v << uint64(29) + } + + func rshConst32Ux64(v uint32) uint32 { ++ // loong64:"SRL\t" + // ppc64x:"SRW" + // riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW" +- // loong64:"BSTRPICKV",-"SLLV",-"SRLV",-"MOVWU" + return v >> uint64(29) + } + + func rshConst32x64(v int32) int32 { ++ // loong64:"SRA\t" + // ppc64x:"SRAW" + // riscv64:"SRAIW",-"OR",-"SLTIU", -"MOVW" +- // loong64:"SLLV","SRAV",-"MOVW" + return v >> uint64(29) + } + + func lshConst64x32(v int64) int64 { ++ // loong64:"SLLV" + // ppc64x:"SLD" + // riscv64:"SLLI",-"AND",-"SLTIU" + return v << uint32(33) + } + + func rshConst64Ux32(v uint64) uint64 { ++ // 
loong64:"SRLV" + // ppc64x:"SRD" + // riscv64:"SRLI\t",-"AND",-"SLTIU" + return v >> uint32(33) + } + + func rshConst64x32(v int64) int64 { ++ // loong64:"SRAV" + // ppc64x:"SRAD" + // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint32(33) +@@ -253,6 +265,7 @@ func rshGuarded64U(v uint64, s uint) uint64 { + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" + // arm64:"LSR",-"CSEL" ++ // loong64:"SRLV" + return v >> s + } + panic("shift too large") +@@ -264,6 +277,7 @@ func rshGuarded64(v int64, s uint) int64 { + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" + // arm64:"ASR",-"CSEL" ++ // loong64:"SRAV" + return v >> s + } + panic("shift too large") +-- +2.38.1 + diff --git a/0023-cmd-compile-simplify-bounded-shift-on-loong64.patch b/0023-cmd-compile-simplify-bounded-shift-on-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..c4ba508b2422f8ec4217d0e36810d86d60d1d763 --- /dev/null +++ b/0023-cmd-compile-simplify-bounded-shift-on-loong64.patch @@ -0,0 +1,2206 @@ +From 03f91ceb084274b0840d7c2cf7a7cb83a7fb2ed0 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Fri, 15 Nov 2024 17:28:07 +0800 +Subject: [PATCH 23/44] cmd/compile: simplify bounded shift on loong64 + +Use the shiftIsBounded function to generate more efficient shift instructions. +Also optimize shift ops when the shift value is v&63 and v&31. + +Change-Id: I12548101a7cea6bca7f5fef2b12c4b8af8a20bb3 +--- + .../compile/internal/ssa/_gen/LOONG64.rules | 146 +-- + .../compile/internal/ssa/rewriteLOONG64.go | 968 ++++++++++++++++++ + test/codegen/shift.go | 16 + + 3 files changed, 1071 insertions(+), 59 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +index 014cd6fb05..9d0435f434 100644 +--- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +@@ -57,65 +57,84 @@ + // shifts + // hardware instruction uses only the low 6 bits of the shift + // we compare to 64 to ensure Go semantics for large shifts +-(Lsh64x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) +-(Lsh64x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Lsh64x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Lsh64x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Lsh32x64 x y) => (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) +-(Lsh32x32 x y) => (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) +-(Lsh32x16 x y) => (MASKEQZ (SLL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) +-(Lsh32x8 x y) => (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) +- +-(Lsh16x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) +-(Lsh16x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Lsh16x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Lsh16x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Lsh8x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) +-(Lsh8x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Lsh8x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Lsh8x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Rsh64Ux64 x y) => 
(MASKEQZ (SRLV x y) (SGTU (MOVVconst [64]) y)) +-(Rsh64Ux32 x y) => (MASKEQZ (SRLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Rsh64Ux16 x y) => (MASKEQZ (SRLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Rsh64Ux8 x y) => (MASKEQZ (SRLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Rsh32Ux64 x y) => (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) +-(Rsh32Ux32 x y) => (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) +-(Rsh32Ux16 x y) => (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) +-(Rsh32Ux8 x y) => (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) +- +-(Rsh16Ux64 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) y) (SGTU (MOVVconst [64]) y)) +-(Rsh16Ux32 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Rsh16Ux16 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Rsh16Ux8 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Rsh8Ux64 x y) => (MASKEQZ (SRLV (ZeroExt8to64 x) y) (SGTU (MOVVconst [64]) y)) +-(Rsh8Ux32 x y) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Rsh8Ux16 x y) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Rsh8Ux8 x y) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Rsh64x64 x y) => (SRAV x (OR (NEGV (SGTU y (MOVVconst [63]))) y)) +-(Rsh64x32 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +-(Rsh64x16 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) +-(Rsh64x8 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) +- +-(Rsh32x64 x y) => (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y)) +-(Rsh32x32 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y))) +-(Rsh32x16 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [31]))) (ZeroExt16to64 y))) +-(Rsh32x8 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y))) +- +-(Rsh16x64 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) +-(Rsh16x32 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +-(Rsh16x16 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) +-(Rsh16x8 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) +- +-(Rsh8x64 x y) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) +-(Rsh8x32 x y) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +-(Rsh8x16 x y) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) +-(Rsh8x8 x y) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) ++ ++// left shift ++(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLLV x y) ++(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) ++(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLLV x y) ++(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLLV x y) ++ ++(Lsh64x64 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) ++(Lsh64x32 x y) && 
!shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Lsh64x16 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Lsh64x8 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++(Lsh32x64 x y) && !shiftIsBounded(v) => (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) ++(Lsh32x32 x y) && !shiftIsBounded(v) => (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) ++(Lsh32x16 x y) && !shiftIsBounded(v) => (MASKEQZ (SLL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) ++(Lsh32x8 x y) && !shiftIsBounded(v) => (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) ++ ++(Lsh16x64 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) ++(Lsh16x32 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Lsh16x16 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Lsh16x8 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++(Lsh8x64 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) ++(Lsh8x32 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Lsh8x16 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Lsh8x8 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++// unsigned right shift ++(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRLV x y) ++(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y) ++(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRLV (ZeroExt16to64 x) y) ++(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRLV (ZeroExt8to64 x) y) ++ ++(Rsh64Ux64 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV x y) (SGTU (MOVVconst [64]) y)) ++(Rsh64Ux32 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Rsh64Ux16 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Rsh64Ux8 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++(Rsh32Ux64 x y) && !shiftIsBounded(v) => (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) ++(Rsh32Ux32 x y) && !shiftIsBounded(v) => (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) ++(Rsh32Ux16 x y) && !shiftIsBounded(v) => (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) ++(Rsh32Ux8 x y) && !shiftIsBounded(v) => (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) ++ ++(Rsh16Ux64 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt16to64 x) y) (SGTU (MOVVconst [64]) y)) ++(Rsh16Ux32 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Rsh16Ux16 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Rsh16Ux8 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++(Rsh8Ux64 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt8to64 x) y) (SGTU (MOVVconst [64]) y)) ++(Rsh8Ux32 x y) && 
!shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y)))
++(Rsh8Ux16 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y)))
++(Rsh8Ux8 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y)))
++
++// signed right shift
++(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAV x y)
++(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y)
++(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAV (SignExt16to64 x) y)
++(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAV (SignExt8to64 x) y)
++
++(Rsh64x64 x y) && !shiftIsBounded(v) => (SRAV x (OR (NEGV (SGTU y (MOVVconst [63]))) y))
++(Rsh64x32 x y) && !shiftIsBounded(v) => (SRAV x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y)))
++(Rsh64x16 x y) && !shiftIsBounded(v) => (SRAV x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y)))
++(Rsh64x8 x y) && !shiftIsBounded(v) => (SRAV x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y)))
++
++(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y))
++(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y)))
++(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [31]))) (ZeroExt16to64 y)))
++(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y)))
++
++(Rsh16x64 x y) && !shiftIsBounded(v) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y))
++(Rsh16x32 x y) && !shiftIsBounded(v) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y)))
++(Rsh16x16 x y) && !shiftIsBounded(v) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y)))
++(Rsh16x8 x y) && !shiftIsBounded(v) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y)))
++
++(Rsh8x64 x y) && !shiftIsBounded(v) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y))
++(Rsh8x32 x y) && !shiftIsBounded(v) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y)))
++(Rsh8x16 x y) && !shiftIsBounded(v) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y)))
++(Rsh8x8 x y) && !shiftIsBounded(v) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y)))
+ 
+ // bitfield ops
+ 
+@@ -698,6 +717,15 @@
+ (ROTR x (MOVVconst [c])) => (ROTRconst x [c&31])
+ (ROTRV x (MOVVconst [c])) => (ROTRVconst x [c&63])
+ 
++// SLLV/SRLV/SRAV only consider the bottom 6 bits of y; similarly, SLL/SRL/SRA only consider the
++// bottom 5 bits of y.
++(SLL x (ANDconst [31] y)) => (SLL x y)
++(SRL x (ANDconst [31] y)) => (SRL x y)
++(SRA x (ANDconst [31] y)) => (SRA x y)
++(SLLV x (ANDconst [63] y)) => (SLLV x y)
++(SRLV x (ANDconst [63] y)) => (SRLV x y)
++(SRAV x (ANDconst [63] y)) => (SRAV x y)
++
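As a concrete illustration of the bounded-shift and ANDconst rules above — a minimal Go sketch, with invented function names, describing the lowering these rules are intended to produce:

	func shrMasked(x uint64, s uint) uint64 {
		// The mask matches (SRLV x (ANDconst [63] y)) => (SRLV x y): the
		// explicit AND is folded away because SRLV itself uses only the
		// low 6 bits of the shift amount.
		return x >> (s & 63)
	}

	func shrGuarded(x uint64, s uint) uint64 {
		if s < 64 {
			// shiftIsBounded(v) holds here, so the bounded rule emits a
			// bare SRLV with no MASKEQZ/SGTU range check.
			return x >> s
		}
		return 0
	}

Without the mask or the guard, x >> s keeps the (MASKEQZ (SRLV x y) (SGTU (MOVVconst [64]) y)) form, so that shifts of 64 or more still yield 0 as Go semantics require.
+ // Avoid unnecessary zero and sign extension when right shifting.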
+ (SRLVconst [rc] (MOVWUreg y)) && rc >= 0 && rc <= 31 => (SRLconst [int64(rc)] y) + (SRAVconst [rc] (MOVWreg y)) && rc >= 0 && rc <= 31 => (SRAconst [int64(rc)] y) +diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go +index 93bf95eb51..9efdca9c9c 100644 +--- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go ++++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go +@@ -5994,6 +5994,18 @@ func rewriteValueLOONG64_OpLOONG64SLL(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SLL x (ANDconst [31] y)) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 31 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SLLV(v *Value) bool { +@@ -6027,6 +6039,18 @@ func rewriteValueLOONG64_OpLOONG64SLLV(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SLLV x (ANDconst [63] y)) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 63 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SLLVconst(v *Value) bool { +@@ -6082,6 +6106,18 @@ func rewriteValueLOONG64_OpLOONG64SRA(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRA x (ANDconst [31] y)) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 31 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SRAV(v *Value) bool { +@@ -6117,6 +6153,18 @@ func rewriteValueLOONG64_OpLOONG64SRAV(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRAV x (ANDconst [63] y)) ++ // result: (SRAV x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 63 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SRAVconst(v *Value) bool { +@@ -6249,6 +6297,18 @@ func rewriteValueLOONG64_OpLOONG64SRL(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRL x (ANDconst [31] y)) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 31 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SRLV(v *Value) bool { +@@ -6282,6 +6342,18 @@ func rewriteValueLOONG64_OpLOONG64SRLV(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRLV x (ANDconst [63] y)) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 63 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SRLVconst(v *Value) bool { +@@ -7384,12 +7456,29 @@ func rewriteValueLOONG64_OpLsh16x16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh16x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh16x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x 
(ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -7402,18 +7491,36 @@ func rewriteValueLOONG64_OpLsh16x16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh16x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh16x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh16x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -7426,18 +7533,36 @@ func rewriteValueLOONG64_OpLsh16x32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh16x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh16x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh16x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v0.AddArg2(x, y) +@@ -7448,18 +7573,36 @@ func rewriteValueLOONG64_OpLsh16x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh16x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh16x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -7472,18 +7615,36 @@ func rewriteValueLOONG64_OpLsh16x8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh32x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh32x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + 
v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -7496,18 +7657,36 @@ func rewriteValueLOONG64_OpLsh32x16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh32x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh32x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh32x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -7520,18 +7699,36 @@ func rewriteValueLOONG64_OpLsh32x32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh32x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh32x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh32x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v0.AddArg2(x, y) +@@ -7542,18 +7739,36 @@ func rewriteValueLOONG64_OpLsh32x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh32x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh32x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -7566,18 +7781,36 @@ func rewriteValueLOONG64_OpLsh32x8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh64x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh64x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh64x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -7590,18 +7823,36 @@ func 
rewriteValueLOONG64_OpLsh64x16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh64x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh64x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh64x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -7614,18 +7865,36 @@ func rewriteValueLOONG64_OpLsh64x32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh64x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh64x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh64x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v0.AddArg2(x, y) +@@ -7636,18 +7905,36 @@ func rewriteValueLOONG64_OpLsh64x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh64x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh64x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -7660,18 +7947,36 @@ func rewriteValueLOONG64_OpLsh64x8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh8x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh8x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -7684,18 +7989,36 @@ func rewriteValueLOONG64_OpLsh8x16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh8x32(v *Value) bool { + v_1 := 
v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh8x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh8x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -7708,18 +8031,36 @@ func rewriteValueLOONG64_OpLsh8x32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh8x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh8x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh8x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v0.AddArg2(x, y) +@@ -7730,18 +8071,36 @@ func rewriteValueLOONG64_OpLsh8x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh8x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh8x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh8x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -7754,6 +8113,7 @@ func rewriteValueLOONG64_OpLsh8x8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpMod16(v *Value) bool { + v_1 := v.Args[1] +@@ -8698,12 +9058,31 @@ func rewriteValueLOONG64_OpRsh16Ux16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16Ux16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16Ux16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8718,18 +9097,38 @@ func rewriteValueLOONG64_OpRsh16Ux16(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16Ux32(v *Value) bool { + v_1 
:= v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16Ux32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16Ux32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8744,18 +9143,38 @@ func rewriteValueLOONG64_OpRsh16Ux32(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16Ux64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16Ux64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt16to64 x) y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8768,18 +9187,38 @@ func rewriteValueLOONG64_OpRsh16Ux64(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16Ux8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16Ux8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16Ux8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8794,18 +9233,38 @@ func rewriteValueLOONG64_OpRsh16Ux8(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + 
v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) + v0.AddArg(x) +@@ -8822,18 +9281,38 @@ func rewriteValueLOONG64_OpRsh16x16(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) + v0.AddArg(x) +@@ -8850,18 +9329,38 @@ func rewriteValueLOONG64_OpRsh16x32(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt16to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) + v0.AddArg(x) +@@ -8876,18 +9375,38 @@ func rewriteValueLOONG64_OpRsh16x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) + v0.AddArg(x) +@@ -8904,18 +9423,36 @@ func rewriteValueLOONG64_OpRsh16x8(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32Ux16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32Ux16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32Ux16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y 
:= v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8928,18 +9465,36 @@ func rewriteValueLOONG64_OpRsh32Ux16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32Ux32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32Ux32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32Ux32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -8952,18 +9507,36 @@ func rewriteValueLOONG64_OpRsh32Ux32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32Ux64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32Ux64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v0.AddArg2(x, y) +@@ -8974,18 +9547,36 @@ func rewriteValueLOONG64_OpRsh32Ux64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32Ux8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32Ux8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32Ux8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -8998,18 +9589,36 @@ func rewriteValueLOONG64_OpRsh32Ux8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRA x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [31]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRA) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ 
-9024,18 +9633,36 @@ func rewriteValueLOONG64_OpRsh32x16(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRA) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9050,18 +9677,36 @@ func rewriteValueLOONG64_OpRsh32x32(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRA) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9074,18 +9719,36 @@ func rewriteValueLOONG64_OpRsh32x64(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRA) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9100,18 +9763,36 @@ func rewriteValueLOONG64_OpRsh32x8(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64Ux16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64Ux16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64Ux16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -9124,18 +9805,36 @@ func rewriteValueLOONG64_OpRsh64Ux16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64Ux32(v *Value) bool 
{ + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64Ux32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64Ux32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -9148,18 +9847,36 @@ func rewriteValueLOONG64_OpRsh64Ux32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64Ux64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64Ux64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV x y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v0.AddArg2(x, y) +@@ -9170,18 +9887,36 @@ func rewriteValueLOONG64_OpRsh64Ux64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64Ux8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64Ux8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64Ux8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9194,18 +9929,36 @@ func rewriteValueLOONG64_OpRsh64Ux8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9220,18 +9973,36 @@ func rewriteValueLOONG64_OpRsh64x16(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV x y) ++ for 
{ ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9246,18 +10017,36 @@ func rewriteValueLOONG64_OpRsh64x32(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV x (OR (NEGV (SGTU y (MOVVconst [63]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9270,18 +10059,36 @@ func rewriteValueLOONG64_OpRsh64x64(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9296,18 +10103,38 @@ func rewriteValueLOONG64_OpRsh64x8(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8Ux16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8Ux16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8Ux16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9322,18 +10149,38 @@ func rewriteValueLOONG64_OpRsh8Ux16(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8Ux32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8Ux32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { 
++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8Ux32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9348,18 +10195,38 @@ func rewriteValueLOONG64_OpRsh8Ux32(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8Ux64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8Ux64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt8to64 x) y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9372,18 +10239,38 @@ func rewriteValueLOONG64_OpRsh8Ux64(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8Ux8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8Ux8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8Ux8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9398,18 +10285,38 @@ func rewriteValueLOONG64_OpRsh8Ux8(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) + v0.AddArg(x) +@@ -9426,18 +10333,38 @@ func rewriteValueLOONG64_OpRsh8x16(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8x32(v *Value) bool { + 
v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) + v0.AddArg(x) +@@ -9454,18 +10381,38 @@ func rewriteValueLOONG64_OpRsh8x32(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt8to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) + v0.AddArg(x) +@@ -9480,18 +10427,38 @@ func rewriteValueLOONG64_OpRsh8x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) + v0.AddArg(x) +@@ -9508,6 +10475,7 @@ func rewriteValueLOONG64_OpRsh8x8(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpSelect0(v *Value) bool { + v_0 := v.Args[0] +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 3c669edcb2..db4e6409a8 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -115,6 +115,7 @@ func rshConst64x32(v int64) int64 { + + func lshMask64x64(v int64, s uint64) int64 { + // arm64:"LSL",-"AND" ++ // loong64:"SLLV",-"AND" + // ppc64x:"RLDICL",-"ORN",-"ISEL" + // riscv64:"SLL",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -123,6 +124,7 @@ func lshMask64x64(v int64, s uint64) int64 { + + func rshMask64Ux64(v uint64, s uint64) uint64 { + // arm64:"LSR",-"AND",-"CSEL" ++ // loong64:"SRLV",-"AND" + // ppc64x:"RLDICL",-"ORN",-"ISEL" + // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -131,6 +133,7 @@ func rshMask64Ux64(v uint64, s uint64) uint64 { + 
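For illustration (a minimal sketch, not part of the patch): the loong64 annotations in this test exercise the shiftIsBounded split added by the rewrite rules above. When the shift amount is pre-masked below the type width, the bounded rule fires and a bare shift is emitted; otherwise the clamp sequence survives. Hypothetical function names:

func boundedShift(v int64, s uint64) int64 {
	// s&63 < 64 always holds, so shiftIsBounded proves the shift is in
	// range and loong64 emits a single SRAV, with no AND or clamp.
	return v >> (s & 63)
}

func unboundedShift(v int64, s uint64) int64 {
	// s may be >= 64; Go defines the result as the sign fill (v >> 63),
	// so the OR/NEGV/SGTU clamp from the rewrite rules is kept.
	return v >> s
}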
+ func rshMask64x64(v int64, s uint64) int64 { + // arm64:"ASR",-"AND",-"CSEL" ++ // loong64:"SRAV",-"AND" + // ppc64x:"RLDICL",-"ORN",-"ISEL" + // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -139,14 +142,21 @@ func rshMask64x64(v int64, s uint64) int64 { + + func lshMask32x64(v int32, s uint64) int32 { + // arm64:"LSL",-"AND" ++ // loong64:"SLL\t","AND","SGTU","MASKEQZ" + // ppc64x:"ISEL",-"ORN" + // riscv64:"SLL",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v << (s & 63) + } + ++func lsh5Mask32x64(v int32, s uint64) int32 { ++ // loong64:"SLL\t",-"AND" ++ return v << (s & 31) ++} ++ + func rshMask32Ux64(v uint32, s uint64) uint32 { + // arm64:"LSR",-"AND" ++ // loong64:"SRL\t","AND","SGTU","MASKEQZ" + // ppc64x:"ISEL",-"ORN" + // riscv64:"SRLW","SLTIU","NEG","AND\t",-"SRL\t" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -154,12 +164,14 @@ func rshMask32Ux64(v uint32, s uint64) uint32 { + } + + func rsh5Mask32Ux64(v uint32, s uint64) uint32 { ++ // loong64:"SRL\t",-"AND" + // riscv64:"SRLW",-"AND\t",-"SLTIU",-"SRL\t" + return v >> (s & 31) + } + + func rshMask32x64(v int32, s uint64) int32 { + // arm64:"ASR",-"AND" ++ // loong64:"SRA\t","AND","SGTU","SUBVU","OR" + // ppc64x:"ISEL",-"ORN" + // riscv64:"SRAW","OR","SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -167,12 +179,14 @@ func rshMask32x64(v int32, s uint64) int32 { + } + + func rsh5Mask32x64(v int32, s uint64) int32 { ++ // loong64:"SRA\t",-"AND" + // riscv64:"SRAW",-"OR",-"SLTIU" + return v >> (s & 31) + } + + func lshMask64x32(v int64, s uint32) int64 { + // arm64:"LSL",-"AND" ++ // loong64:"SLLV",-"AND" + // ppc64x:"RLDICL",-"ORN" + // riscv64:"SLL",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -181,6 +195,7 @@ func lshMask64x32(v int64, s uint32) int64 { + + func rshMask64Ux32(v uint64, s uint32) uint64 { + // arm64:"LSR",-"AND",-"CSEL" ++ // loong64:"SRLV",-"AND" + // ppc64x:"RLDICL",-"ORN" + // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -189,6 +204,7 @@ func rshMask64Ux32(v uint64, s uint32) uint64 { + + func rshMask64x32(v int64, s uint32) int64 { + // arm64:"ASR",-"AND",-"CSEL" ++ // loong64:"SRAV",-"AND" + // ppc64x:"RLDICL",-"ORN",-"ISEL" + // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +-- +2.38.1 + diff --git a/0024-runtime-use-ABIInternal-on-syscall-and-other-sys.stu.patch b/0024-runtime-use-ABIInternal-on-syscall-and-other-sys.stu.patch new file mode 100644 index 0000000000000000000000000000000000000000..bc81e40b22cf501e51b804355e9f185b57a9dd79 --- /dev/null +++ b/0024-runtime-use-ABIInternal-on-syscall-and-other-sys.stu.patch @@ -0,0 +1,505 @@ +From 7e54d3bbc1af00ca94819f9c1bbb61f822d37439 Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Tue, 26 Nov 2024 15:44:28 +0800 +Subject: [PATCH 24/44] runtime: use ABIInternal on syscall and other sys.stuff + for loong64 + +Change-Id: Ieeb3f2af02c55a9ad62a19d0085b0e082a182db4 +--- + src/runtime/sys_linux_loong64.s | 227 +++++++++++--------------------- + 1 file changed, 79 insertions(+), 148 deletions(-) + +diff --git a/src/runtime/sys_linux_loong64.s b/src/runtime/sys_linux_loong64.s +index 57cee99da7..b4e9930755 100644 +--- a/src/runtime/sys_linux_loong64.s ++++ b/src/runtime/sys_linux_loong64.s +@@ -47,8 +47,7 @@ + #define SYS_timer_delete 111 + + // func exit(code int32) +-TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4 +- MOVW code+0(FP), R4 ++TEXT runtime·exit(SB),NOSPLIT,$0 + MOVV $SYS_exit_group, R11 + SYSCALL + RET +@@ -67,48 +66,49 @@ TEXT 
runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8 + JMP 0(PC) + + // func open(name *byte, mode, perm int32) int32 +-TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20 ++TEXT runtime·open(SB),NOSPLIT,$0 ++ // before: ++ // R4: name ++ // R5: mode ++ // R6: perm ++ ++ // after: ++ // R4: AT_FDCWD ++ // R5: name ++ // R6: mode ++ // R7: perm ++ ++ MOVW R6, R7 ++ MOVW R5, R6 ++ MOVV R4, R5 + MOVW $AT_FDCWD, R4 // AT_FDCWD, so this acts like open +- MOVV name+0(FP), R5 +- MOVW mode+8(FP), R6 +- MOVW perm+12(FP), R7 ++ + MOVV $SYS_openat, R11 + SYSCALL + MOVW $-4096, R5 + BGEU R5, R4, 2(PC) + MOVW $-1, R4 +- MOVW R4, ret+16(FP) + RET + + // func closefd(fd int32) int32 +-TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0-12 +- MOVW fd+0(FP), R4 ++TEXT runtime·closefd(SB),NOSPLIT,$0 + MOVV $SYS_close, R11 + SYSCALL + MOVW $-4096, R5 + BGEU R5, R4, 2(PC) + MOVW $-1, R4 +- MOVW R4, ret+8(FP) + RET + + // func write1(fd uintptr, p unsafe.Pointer, n int32) int32 +-TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0-28 +- MOVV fd+0(FP), R4 +- MOVV p+8(FP), R5 +- MOVW n+16(FP), R6 ++TEXT runtime·write1(SB),NOSPLIT,$0 + MOVV $SYS_write, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func read(fd int32, p unsafe.Pointer, n int32) int32 +-TEXT runtime·read(SB),NOSPLIT|NOFRAME,$0-28 +- MOVW fd+0(FP), R4 +- MOVV p+8(FP), R5 +- MOVW n+16(FP), R6 ++TEXT runtime·read(SB),NOSPLIT,$0 + MOVV $SYS_read, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func pipe2(flags int32) (r, w int32, errno int32) +@@ -121,16 +121,15 @@ TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20 + RET + + // func usleep(usec uint32) +-TEXT runtime·usleep(SB),NOSPLIT,$16-4 +- MOVWU usec+0(FP), R7 ++TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVV $1000, R6 +- MULVU R6, R7, R7 ++ MULVU R6, R4, R4 + MOVV $1000000000, R6 + +- DIVVU R6, R7, R5 // ts->tv_sec +- REMVU R6, R7, R4 // ts->tv_nsec ++ DIVVU R6, R4, R5 // ts->tv_sec ++ REMVU R6, R4, R8 // ts->tv_nsec + MOVV R5, 8(R3) +- MOVV R4, 16(R3) ++ MOVV R8, 16(R3) + + // nanosleep(&ts, 0) + ADDV $8, R3, R4 +@@ -140,14 +139,14 @@ TEXT runtime·usleep(SB),NOSPLIT,$16-4 + RET + + // func gettid() uint32 +-TEXT runtime·gettid(SB),NOSPLIT,$0-4 ++TEXT runtime·gettid(SB),NOSPLIT,$0 + MOVV $SYS_gettid, R11 + SYSCALL +- MOVW R4, ret+0(FP) + RET + + // func raise(sig uint32) +-TEXT runtime·raise(SB),NOSPLIT|NOFRAME,$0 ++TEXT runtime·raise(SB),NOSPLIT,$0 ++ MOVW R4, R24 // backup sig + MOVV $SYS_getpid, R11 + SYSCALL + MOVW R4, R23 +@@ -155,87 +154,66 @@ TEXT runtime·raise(SB),NOSPLIT|NOFRAME,$0 + SYSCALL + MOVW R4, R5 // arg 2 tid + MOVW R23, R4 // arg 1 pid +- MOVW sig+0(FP), R6 // arg 3 ++ MOVW R24, R6 // arg 3 + MOVV $SYS_tgkill, R11 + SYSCALL + RET + + // func raiseproc(sig uint32) +-TEXT runtime·raiseproc(SB),NOSPLIT|NOFRAME,$0 ++TEXT runtime·raiseproc(SB),NOSPLIT,$0 ++ MOVW R4, R24 // backup sig + MOVV $SYS_getpid, R11 + SYSCALL + //MOVW R4, R4 // arg 1 pid +- MOVW sig+0(FP), R5 // arg 2 ++ MOVW R24, R5 // arg 2 + MOVV $SYS_kill, R11 + SYSCALL + RET + + // func getpid() int +-TEXT ·getpid(SB),NOSPLIT|NOFRAME,$0-8 ++TEXT ·getpid(SB),NOSPLIT,$0 + MOVV $SYS_getpid, R11 + SYSCALL +- MOVV R4, ret+0(FP) + RET + + // func tgkill(tgid, tid, sig int) +-TEXT ·tgkill(SB),NOSPLIT|NOFRAME,$0-24 +- MOVV tgid+0(FP), R4 +- MOVV tid+8(FP), R5 +- MOVV sig+16(FP), R6 ++TEXT ·tgkill(SB),NOSPLIT,$0 + MOVV $SYS_tgkill, R11 + SYSCALL + RET + + // func setitimer(mode int32, new, old *itimerval) +-TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24 +- MOVW mode+0(FP), R4 +- MOVV new+8(FP), R5 +- MOVV old+16(FP), R6 ++TEXT runtime·setitimer(SB),NOSPLIT,$0 + 
MOVV $SYS_setitimer, R11 + SYSCALL + RET + + // func timer_create(clockid int32, sevp *sigevent, timerid *int32) int32 +-TEXT runtime·timer_create(SB),NOSPLIT,$0-28 +- MOVW clockid+0(FP), R4 +- MOVV sevp+8(FP), R5 +- MOVV timerid+16(FP), R6 ++TEXT runtime·timer_create(SB),NOSPLIT,$0 + MOVV $SYS_timer_create, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func timer_settime(timerid int32, flags int32, new, old *itimerspec) int32 +-TEXT runtime·timer_settime(SB),NOSPLIT,$0-28 +- MOVW timerid+0(FP), R4 +- MOVW flags+4(FP), R5 +- MOVV new+8(FP), R6 +- MOVV old+16(FP), R7 ++TEXT runtime·timer_settime(SB),NOSPLIT,$0 + MOVV $SYS_timer_settime, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func timer_delete(timerid int32) int32 +-TEXT runtime·timer_delete(SB),NOSPLIT,$0-12 +- MOVW timerid+0(FP), R4 ++TEXT runtime·timer_delete(SB),NOSPLIT,$0 + MOVV $SYS_timer_delete, R11 + SYSCALL +- MOVW R4, ret+8(FP) + RET + + // func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32 +-TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 +- MOVV dst+16(FP), R6 ++TEXT runtime·mincore(SB),NOSPLIT,$0 + MOVV $SYS_mincore, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func walltime() (sec int64, nsec int32) +-TEXT runtime·walltime(SB),NOSPLIT,$24-12 ++TEXT runtime·walltime(SB),NOSPLIT,$24 + MOVV R3, R23 // R23 is unchanged by C code + MOVV R3, R25 + +@@ -291,7 +269,7 @@ nosaveg: + JAL (R20) + + finish: +- MOVV 0(R3), R7 // sec ++ MOVV 0(R3), R4 // sec + MOVV 8(R3), R5 // nsec + + MOVV R23, R3 // restore SP +@@ -304,9 +282,6 @@ finish: + MOVV R25, m_vdsoSP(R24) + MOVV 8(R3), R25 + MOVV R25, m_vdsoPC(R24) +- +- MOVV R7, sec+0(FP) +- MOVW R5, nsec+8(FP) + RET + + fallback: +@@ -315,7 +290,7 @@ fallback: + JMP finish + + // func nanotime1() int64 +-TEXT runtime·nanotime1(SB),NOSPLIT,$16-8 ++TEXT runtime·nanotime1(SB),NOSPLIT,$24 + MOVV R3, R23 // R23 is unchanged by C code + MOVV R3, R25 + +@@ -389,8 +364,7 @@ finish: + // return nsec in R7 + MOVV $1000000000, R4 + MULVU R4, R7, R7 +- ADDVU R5, R7 +- MOVV R7, ret+0(FP) ++ ADDVU R5, R7, R4 + RET + + fallback: +@@ -399,11 +373,7 @@ fallback: + JMP finish + + // func rtsigprocmask(how int32, new, old *sigset, size int32) +-TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28 +- MOVW how+0(FP), R4 +- MOVV new+8(FP), R5 +- MOVV old+16(FP), R6 +- MOVW size+24(FP), R7 ++TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0 + MOVV $SYS_rt_sigprocmask, R11 + SYSCALL + MOVW $-4096, R5 +@@ -412,22 +382,21 @@ TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28 + RET + + // func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32 +-TEXT runtime·rt_sigaction(SB),NOSPLIT|NOFRAME,$0-36 +- MOVV sig+0(FP), R4 +- MOVV new+8(FP), R5 +- MOVV old+16(FP), R6 +- MOVV size+24(FP), R7 ++TEXT runtime·rt_sigaction(SB),NOSPLIT,$0 + MOVV $SYS_rt_sigaction, R11 + SYSCALL +- MOVW R4, ret+32(FP) + RET + + // func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer) +-TEXT runtime·sigfwd(SB),NOSPLIT,$0-32 +- MOVW sig+8(FP), R4 +- MOVV info+16(FP), R5 +- MOVV ctx+24(FP), R6 +- MOVV fn+0(FP), R20 ++TEXT runtime·sigfwd(SB),NOSPLIT,$0 ++ // before: ++ // R4: fn, R5: sig, R6: info, R7: ctx ++ // after: ++ // R20: fn, R4: sig, R5: info, R6: ctx ++ MOVV R4, R20 ++ MOVV R5, R4 ++ MOVV R6, R5 ++ MOVV R7, R6 + JAL (R20) + RET + +@@ -460,48 +429,31 @@ TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 + JMP runtime·sigtramp(SB) + + // func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (p unsafe.Pointer, err int) +-TEXT 
runtime·sysMmap(SB),NOSPLIT|NOFRAME,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 +- MOVW prot+16(FP), R6 +- MOVW flags+20(FP), R7 +- MOVW fd+24(FP), R8 +- MOVW off+28(FP), R9 +- ++TEXT runtime·sysMmap(SB),NOSPLIT,$0 + MOVV $SYS_mmap, R11 + SYSCALL + MOVW $-4096, R5 + BGEU R5, R4, ok +- MOVV $0, p+32(FP) +- SUBVU R4, R0, R4 +- MOVV R4, err+40(FP) ++ SUBVU R4, R0, R5 ++ MOVV $0, R4 + RET + ok: +- MOVV R4, p+32(FP) +- MOVV $0, err+40(FP) ++ MOVV $0, R5 + RET + + // Call the function stored in _cgo_mmap using the GCC calling convention. + // This must be called on the system stack. + // func callCgoMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) uintptr +-TEXT runtime·callCgoMmap(SB),NOSPLIT,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 +- MOVW prot+16(FP), R6 +- MOVW flags+20(FP), R7 +- MOVW fd+24(FP), R8 +- MOVW off+28(FP), R9 ++TEXT runtime·callCgoMmap(SB),NOSPLIT,$0 + MOVV _cgo_mmap(SB), R13 + SUBV $16, R3 // reserve 16 bytes for sp-8 where fp may be saved. + JAL (R13) + ADDV $16, R3 +- MOVV R4, ret+32(FP) ++ MOVV R4, R4 + RET + + // func sysMunmap(addr unsafe.Pointer, n uintptr) +-TEXT runtime·sysMunmap(SB),NOSPLIT|NOFRAME,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 ++TEXT runtime·sysMunmap(SB),NOSPLIT,$0 + MOVV $SYS_munmap, R11 + SYSCALL + MOVW $-4096, R5 +@@ -512,9 +464,7 @@ TEXT runtime·sysMunmap(SB),NOSPLIT|NOFRAME,$0 + // Call the function stored in _cgo_munmap using the GCC calling convention. + // This must be called on the system stack. + // func callCgoMunmap(addr unsafe.Pointer, n uintptr) +-TEXT runtime·callCgoMunmap(SB),NOSPLIT,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 ++TEXT runtime·callCgoMunmap(SB),NOSPLIT,$0 + MOVV _cgo_munmap(SB), R13 + SUBV $16, R3 // reserve 16 bytes for sp-8 where fp may be saved. + JAL (R13) +@@ -522,38 +472,24 @@ TEXT runtime·callCgoMunmap(SB),NOSPLIT,$0 + RET + + // func madvise(addr unsafe.Pointer, n uintptr, flags int32) +-TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 +- MOVW flags+16(FP), R6 ++TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVV $SYS_madvise, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32 +-TEXT runtime·futex(SB),NOSPLIT|NOFRAME,$0 +- MOVV addr+0(FP), R4 +- MOVW op+8(FP), R5 +- MOVW val+12(FP), R6 +- MOVV ts+16(FP), R7 +- MOVV addr2+24(FP), R8 +- MOVW val3+32(FP), R9 ++TEXT runtime·futex(SB),NOSPLIT,$0 + MOVV $SYS_futex, R11 + SYSCALL +- MOVW R4, ret+40(FP) + RET + + // int64 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void)); +-TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0 +- MOVW flags+0(FP), R4 +- MOVV stk+8(FP), R5 +- ++TEXT runtime·clone(SB),NOSPLIT,$0 + // Copy mp, gp, fn off parent stack for use by child. + // Careful: Linux system call clobbers ???. +- MOVV mp+16(FP), R23 +- MOVV gp+24(FP), R24 +- MOVV fn+32(FP), R25 ++ MOVV R6, R23 ++ MOVV R7, R24 ++ MOVV R8, R25 + + MOVV R23, -8(R5) + MOVV R24, -16(R5) +@@ -565,8 +501,7 @@ TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0 + SYSCALL + + // In parent, return. +- BEQ R4, 3(PC) +- MOVW R4, ret+40(FP) ++ BEQ R4, 2(PC) + RET + + // In child, on new stack. 
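For illustration (a sketch, not part of this patch): the rewrite pattern through this file is uniform. Under the internal ABI on loong64, integer arguments arrive in R4-R11 and the integer result is returned in R4, which lines up with the Linux syscall convention, so the old FP loads and stores become dead and most wrappers shrink to loading the syscall number. A hypothetical one-argument wrapper, assuming the SYS_close constant defined at the top of the file (error normalization elided):

// func closefdLike(fd int32) int32
TEXT ·closefdLike(SB),NOSPLIT,$0
	// fd is already in R4, the first ABIInternal argument register.
	MOVV	$SYS_close, R11
	SYSCALL
	// The kernel leaves its return value in R4, which is also the
	// first ABIInternal result register, so nothing is stored back.
	RET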
+@@ -606,9 +541,7 @@ nog: + JMP -3(PC) // keep exiting + + // func sigaltstack(new, old *stackt) +-TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0 +- MOVV new+0(FP), R4 +- MOVV old+8(FP), R5 ++TEXT runtime·sigaltstack(SB),NOSPLIT,$0 + MOVV $SYS_sigaltstack, R11 + SYSCALL + MOVW $-4096, R5 +@@ -617,42 +550,40 @@ TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0 + RET + + // func osyield() +-TEXT runtime·osyield(SB),NOSPLIT|NOFRAME,$0 ++TEXT runtime·osyield(SB),NOSPLIT,$0 + MOVV $SYS_sched_yield, R11 + SYSCALL + RET + + // func sched_getaffinity(pid, len uintptr, buf *uintptr) int32 +-TEXT runtime·sched_getaffinity(SB),NOSPLIT|NOFRAME,$0 +- MOVV pid+0(FP), R4 +- MOVV len+8(FP), R5 +- MOVV buf+16(FP), R6 ++TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 + MOVV $SYS_sched_getaffinity, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func sbrk0() uintptr +-TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0-8 ++TEXT runtime·sbrk0(SB),NOSPLIT,$0 + // Implemented as brk(NULL). + MOVV $0, R4 + MOVV $SYS_brk, R11 + SYSCALL +- MOVV R4, ret+0(FP) + RET + ++// unimplemented, only needed for android; declared in stubs_linux.go + TEXT runtime·access(SB),$0-20 +- MOVV R0, 2(R0) // unimplemented, only needed for android; declared in stubs_linux.go ++ MOVV R0, 2(R0) + MOVW R0, ret+16(FP) // for vet + RET + ++// unimplemented, only needed for android; declared in stubs_linux.go + TEXT runtime·connect(SB),$0-28 +- MOVV R0, 2(R0) // unimplemented, only needed for android; declared in stubs_linux.go ++ MOVV R0, 2(R0) + MOVW R0, ret+24(FP) // for vet + RET + ++// unimplemented, only needed for android; declared in stubs_linux.go + TEXT runtime·socket(SB),$0-20 +- MOVV R0, 2(R0) // unimplemented, only needed for android; declared in stubs_linux.go ++ MOVV R0, 2(R0) + MOVW R0, ret+16(FP) // for vet + RET + +-- +2.38.1 + diff --git a/0025-runtime-use-correct-memory-barrier-in-exitThread-fun.patch b/0025-runtime-use-correct-memory-barrier-in-exitThread-fun.patch new file mode 100644 index 0000000000000000000000000000000000000000..0b81cb190f22ff450cd050c9bcc0f3cfc1827f41 --- /dev/null +++ b/0025-runtime-use-correct-memory-barrier-in-exitThread-fun.patch @@ -0,0 +1,34 @@ +From 5bb6b8ebb22faf46a01ff292c45a7dc72f2b5022 Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Tue, 26 Nov 2024 17:10:32 +0800 +Subject: [PATCH 25/44] runtime: use correct memory barrier in exitThread + function on loong64 + +In the runtime.exitThread function, a storeRelease barrier +is required instead of a full barrier. + +Change-Id: I614c6f74e8c9fd56c3badf3bf450b3314e3f377c +--- + src/runtime/sys_linux_loong64.s | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/src/runtime/sys_linux_loong64.s b/src/runtime/sys_linux_loong64.s +index b4e9930755..830eb9d099 100644 +--- a/src/runtime/sys_linux_loong64.s ++++ b/src/runtime/sys_linux_loong64.s +@@ -56,10 +56,8 @@ TEXT runtime·exit(SB),NOSPLIT,$0 + TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8 + MOVV wait+0(FP), R19 + // We're done using the stack. 
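// Illustration, not part of the patch: the write below hands the stack
// back to the parent, so every prior write to that stack must become
// visible first. That is a release store, which a single DBAR with the
// StoreRelease hint provides; the old code paid for two full barriers.
// A rough Go-level analogue, assuming sync/atomic:
//
//	var stackInUse atomic.Uint32
//	stackInUse.Store(0) // publish: prior writes ordered before the flag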
+- MOVW $0, R11
+- DBAR
+- MOVW R11, (R19)
+- DBAR
++ DBAR $0x12 // StoreRelease barrier
++ MOVW R0, (R19)
+ MOVW $0, R4 // exit code
+ MOVV $SYS_exit, R11
+ SYSCALL
+--
+2.38.1
+
diff --git a/0026-cmd-internal-obj-loong64-add-V-XV-SEQI-V-XV-.-AND-OR.patch b/0026-cmd-internal-obj-loong64-add-V-XV-SEQI-V-XV-.-AND-OR.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d73cc6515c410c16cd99184bc242d0799d477852
--- /dev/null
+++ b/0026-cmd-internal-obj-loong64-add-V-XV-SEQI-V-XV-.-AND-OR.patch
@@ -0,0 +1,410 @@
+From 38ab8bc5eb69cb2746b32fd4a6ca7931adb7722b Mon Sep 17 00:00:00 2001
+From: Guoqi Chen
+Date: Fri, 29 Nov 2024 15:41:33 +0800
+Subject: [PATCH 26/44] cmd/internal/obj/loong64: add {V,XV}SEQI,
+ {V,XV}.{AND,OR,XOR,NOR} instructions support
+
+Go asm syntax:
+ VSEQB $1, V2, V3
+ XVSEQB $2, X2, X3
+ V{AND,OR,XOR,NOR}B $1, V2, V3
+ XV{AND,OR,XOR,NOR}B $1, V2, V3
+ V{AND,OR,XOR,NOR,ANDN,ORN}V V1, V2, V3
+ XV{AND,OR,XOR,NOR,ANDN,ORN}V V1, V2, V3
+
+Equivalent platform assembler syntax:
+ vseqi.b v3, v2, $1
+ xvseqi.b x3, x2, $2
+ v{and,or,xor,nor}i.b v3, v2, $1
+ xv{and,or,xor,nor}i.b x3, x2, $1
+ v{and,or,xor,nor,andn,orn}.v v3, v2, v1
+ xv{and,or,xor,nor,andn,orn}.v x3, x2, x1
+
+Change-Id: I56ae0db72c7f473755cbdc7f7171c1058a9def97
+---
+ .../asm/internal/asm/testdata/loong64enc1.s | 38 ++++
+ src/cmd/internal/obj/loong64/a.out.go | 21 +++
+ src/cmd/internal/obj/loong64/anames.go | 20 ++
+ src/cmd/internal/obj/loong64/asm.go | 173 ++++++++++++++++--
+ 4 files changed, 238 insertions(+), 14 deletions(-)
+
+diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+index 3a3eb10a74..2418412a3a 100644
+--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s
++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+@@ -506,6 +506,16 @@ lable2:
+ XVSEQH X3, X2, X4 // 448c0074
+ XVSEQW X3, X2, X4 // 440c0174
+ XVSEQV X3, X2, X4 // 448c0174
++ VSEQB $0, V2, V3 // 43008072
++ VSEQH $1, V2, V3 // 43848072
++ VSEQW $8, V2, V3 // 43208172
++ VSEQV $15, V2, V3 // 43bc8172
++ VSEQV $-15, V2, V3 // 43c48172
++ XVSEQB $0, X2, X4 // 44008076
++ XVSEQH $3, X2, X4 // 448c8076
++ XVSEQW $12, X2, X4 // 44308176
++ XVSEQV $15, X2, X4 // 44bc8176
++ XVSEQV $-15, X2, X4 // 44c48176
+
+ // VPCNT{B,H,W,V}, XVPCNT{B,H,W,V} instruction
+ VPCNTB V1, V2 // 22209c72
+ VPCNTH V1, V2 // 22249c72
+ VPCNTW V1, V2 // 22289c72
+ VPCNTV V1, V2 // 222c9c72
+ XVPCNTB X3, X2 // 62209c76
+ XVPCNTH X3, X2 // 62249c76
+ XVPCNTW X3, X2 // 62289c76
+ XVPCNTV X3, X2 // 622c9c76
+
++ // VANDV,VORV,VXORV,VNORV,VANDNV,VORNV
++ VANDV V1, V2, V3 // 43042671
++ VORV V1, V2, V3 // 43842671
++ VXORV V1, V2, V3 // 43042771
++ VNORV V1, V2, V3 // 43842771
++ VANDNV V1, V2, V3 // 43042871
++ VORNV V1, V2, V3 // 43842871
++
++ // VANDB,VORB,VXORB,VNORB
++ VANDB $0, V2, V3 // 4300d073
++ VORB $64, V2, V3 // 4300d573
++ VXORB $128, V2, V3 // 4300da73
++ VNORB $255, V2, V3 // 43fcdf73
++
++ // XVANDV,XVORV,XVXORV,XVNORV,XVANDNV,XVORNV
++ XVANDV X1, X2, X3 // 43042675
++ XVORV X1, X2, X3 // 43842675
++ XVXORV X1, X2, X3 // 43042775
++ XVNORV X1, X2, X3 // 43842775
++ XVANDNV X1, X2, X3 // 43042875
++ XVORNV X1, X2, X3 // 43842875
++
++ // XVANDB,XVORB,XVXORB,XVNORB
++ XVANDB $0, X2, X3 // 4300d077
++ XVORB $1, X2, X3 // 4304d477
++ XVXORB $127, X2, X3 // 43fcd977
++ XVNORB $255, X2, X3 // 43fcdf77
++
+ // MOVV C_DCON12_0, r
+ MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03
+ MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03
+diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go
+index 
b2207c2523..bd3ce61826 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -726,6 +726,27 @@ const ( + AXVMOVQ + + // LSX and LASX Bit-manipulation Instructions ++ AVANDB ++ AVORB ++ AVXORB ++ AVNORB ++ AXVANDB ++ AXVORB ++ AXVXORB ++ AXVNORB ++ AVANDV ++ AVORV ++ AVXORV ++ AVNORV ++ AVANDNV ++ AVORNV ++ AXVANDV ++ AXVORV ++ AXVXORV ++ AXVNORV ++ AXVANDNV ++ AXVORNV ++ + AVPCNTB + AVPCNTH + AVPCNTW +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 3d2f329917..6c1537d123 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -257,6 +257,26 @@ var Anames = []string{ + "FTINTRNEVD", + "VMOVQ", + "XVMOVQ", ++ "VANDB", ++ "VORB", ++ "VXORB", ++ "VNORB", ++ "XVANDB", ++ "XVORB", ++ "XVXORB", ++ "XVNORB", ++ "VANDV", ++ "VORV", ++ "VXORV", ++ "VNORV", ++ "VANDNV", ++ "VORNV", ++ "XVANDV", ++ "XVORV", ++ "XVXORV", ++ "XVNORV", ++ "XVANDNV", ++ "XVORNV", + "VPCNTB", + "VPCNTH", + "VPCNTW", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 5757c3c452..7247193c95 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -51,6 +51,8 @@ const ( + // branchLoopHead marks loop entry. + // Used to insert padding for under-aligned loops. + branchLoopHead ++ immFiledSi5 // The encoding of the immediate field in the instruction is 5-bits ++ immFiledUi8 // The encoding of the immediate field in the instruction is 8-bits + ) + + var optab = []Optab{ +@@ -88,6 +90,17 @@ var optab = []Optab{ + {ACMPEQF, C_FREG, C_FREG, C_NONE, C_FCCREG, C_NONE, 2, 4, 0, 0}, + {AVSEQB, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSEQB, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSEQB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ {AXVSEQB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ {AVSEQB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ {AXVSEQB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ ++ {AVANDV, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVANDV, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVANDB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, ++ {AXVANDB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, ++ {AVANDB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, ++ {AXVANDB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, + + {ACLOW, C_REG, C_NONE, C_NONE, C_REG, C_NONE, 9, 4, 0, 0}, + {AABSF, C_FREG, C_NONE, C_NONE, C_FREG, C_NONE, 9, 4, 0, 0}, +@@ -1499,6 +1512,7 @@ func buildop(ctxt *obj.Link) { + } + opset(i, r0) + } ++ + case AVSEQB: + opset(AVSEQH, r0) + opset(AVSEQW, r0) +@@ -1509,6 +1523,30 @@ func buildop(ctxt *obj.Link) { + opset(AXVSEQW, r0) + opset(AXVSEQV, r0) + ++ case AVANDB: ++ opset(AVORB, r0) ++ opset(AVXORB, r0) ++ opset(AVNORB, r0) ++ ++ case AXVANDB: ++ opset(AXVORB, r0) ++ opset(AXVXORB, r0) ++ opset(AXVNORB, r0) ++ ++ case AVANDV: ++ opset(AVORV, r0) ++ opset(AVXORV, r0) ++ opset(AVNORV, r0) ++ opset(AVANDNV, r0) ++ opset(AVORNV, r0) ++ ++ case AXVANDV: ++ opset(AXVORV, r0) ++ opset(AXVXORV, r0) ++ opset(AXVNORV, r0) ++ opset(AXVANDNV, r0) ++ opset(AXVORNV, r0) ++ + case AVPCNTB: + opset(AVPCNTH, r0) + opset(AVPCNTW, r0) +@@ -1551,6 +1589,14 @@ func OP_12IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { + return op | (i&0xFFF)<<10 | (r2&0x1F)<<5 | 
(r3&0x1F)<<0
+ }
+
++func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
++ return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
++}
++
++func OP_5IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
++ return op | (i&0x1F)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
++}
++
+ func OP_IR(op uint32, i uint32, r2 uint32) uint32 {
+ return op | (i&0xFFFFF)<<5 | (r2&0x1F)<<0 // ui20, rd5
+ }
+@@ -1623,12 +1669,10 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
+
+ case 4: // add $scon,[r1],r2
+ v := c.regoff(&p.From)
+-
+ r := int(p.Reg)
+ if r == 0 {
+ r = int(p.To.Reg)
+ }
+-
+ o1 = OP_12IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg))
+
+ case 5: // syscall
+@@ -1738,6 +1782,36 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
+ c.ctxt.Diag("unexpected encoding\n%v", p)
+ }
+
++ case 13: // add $si5,[r1],r2
++ v := c.regoff(&p.From)
++ r := int(p.Reg)
++ if r == 0 {
++ r = int(p.To.Reg)
++ }
++
++ switch o.flag {
++ case immFiledSi5:
++ c.checkimmFiled(p, v, 5, true)
++ o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg))
++ default:
++ c.ctxt.Diag("Invalid immediate value type\n%v", p)
++ }
++
++ case 14: // add $ui8,[r1],r2
++ v := c.regoff(&p.From)
++ r := int(p.Reg)
++ if r == 0 {
++ r = int(p.To.Reg)
++ }
++
++ switch o.flag {
++ case immFiledUi8:
++ c.checkimmFiled(p, v, 8, false)
++ o1 = OP_8IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg))
++ default:
++ c.ctxt.Diag("Invalid immediate value type\n%v", p)
++ }
++
+ case 15: // teq $c r,r
+ v := c.regoff(&p.From)
+ r := int(p.Reg)
+@@ -1760,18 +1834,18 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
+ o2 = OP_15I(c.opi(ABREAK), uint32(v))
+
+ case 16: // sll $c,[r1],r2
+- v := c.regoff(&p.From)
+- r := int(p.Reg)
+- if r == 0 {
+- r = int(p.To.Reg)
+- }
+-
+- // instruction ending with V:6-digit immediate, others:5-digit immediate
+- if v >= 32 && vshift(p.As) {
+- o1 = OP_16IRR(c.opirr(p.As), uint32(v)&0x3f, uint32(r), uint32(p.To.Reg))
+- } else {
+- o1 = OP_16IRR(c.opirr(p.As), uint32(v)&0x1f, uint32(r), uint32(p.To.Reg))
+- }
++ v := c.regoff(&p.From)
++ r := int(p.Reg)
++ if r == 0 {
++ r = int(p.To.Reg)
++ }
++
++ // instruction ending with V:6-digit immediate, others:5-digit immediate
++ if v >= 32 && vshift(p.As) {
++ o1 = OP_16IRR(c.opirr(p.As), uint32(v)&0x3f, uint32(r), uint32(p.To.Reg))
++ } else {
++ o1 = OP_16IRR(c.opirr(p.As), uint32(v)&0x1f, uint32(r), uint32(p.To.Reg))
++ }
+
+ case 17: // bstrpickw $msbw, r1, $lsbw, r2
+ rd, rj := p.To.Reg, p.Reg
+@@ -2348,6 +2422,21 @@ func (c *ctxt0) checkindex(p *obj.Prog, index uint32, mask uint32) {
+ }
+ }
+
++// checkimmFiled checks whether the immediate value exceeds the valid encoding range
++func (c *ctxt0) checkimmFiled(p *obj.Prog, imm int32, bits uint8, isSigned bool) {
++ if isSigned {
++ bound := int32(1 << (bits - 1))
++ if imm < -bound || imm >= bound {
++ c.ctxt.Diag("signed immediate %v exceeds the %d-bit range: %v", imm, bits, p)
++ }
++ } else {
++ mask := uint32(0xffffffff) << bits
++ if uint32(imm) != (uint32(imm) & ^mask) {
++ c.ctxt.Diag("unsigned immediate %v exceeds the %d-bit range: %v", imm, bits, p)
++ }
++ }
++}
++
+ func (c *ctxt0) vregoff(a *obj.Addr) int64 {
+ c.instoffset = 0
+ c.aclass(a)
+@@ -2588,6 +2677,30 @@ func (c *ctxt0) oprrr(a obj.As) uint32 {
+ return 0x0e003 << 15 // vseq.d
+ case AXVSEQV:
+ return 0x0e803 << 15 // xvseq.d
++ case AVANDV:
++ return 0x0E24C << 15 // vand.v
++ case AVORV:
++ return 0x0E24D << 15 // vor.v
++ case AVXORV:
++ return 0x0E24E << 15 // vxor.v
++ case AVNORV:
++ return 0x0E24F << 15 // vnor.v
++ case AVANDNV:
++ return 0x0E250 << 15 // vandn.v
++ case AVORNV:
++ return 0x0E251 << 15 // vorn.v
++ case AXVANDV:
++ return 0x0EA4C << 15 // xvand.v
++ case AXVORV:
++ return 0x0EA4D << 15 // xvor.v
++ case AXVXORV:
++ return 0x0EA4E << 15 // xvxor.v
++ case AXVNORV:
++ return 0x0EA4F << 15 // xvnor.v
++ case AXVANDNV:
++ return 0x0EA50 << 15 // xvandn.v
++ case AXVORNV:
++ return 0x0EA51 << 15 // xvorn.v
+ }
+
+ if a < 0 {
+@@ -2915,6 +3028,38 @@ func (c *ctxt0) opirr(a obj.As) uint32 {
+ return 0x021 << 24
+ case ASCV:
+ return 0x023 << 24
++ case AVANDB:
++ return 0x1CF4 << 18 // vandi.b
++ case AVORB:
++ return 0x1CF5 << 18 // vori.b
++ case AVXORB:
++ return 0x1CF6 << 18 // vxori.b
++ case AVNORB:
++ return 0x1CF7 << 18 // vnori.b
++ case AXVANDB:
++ return 0x1DF4 << 18 // xvandi.b
++ case AXVORB:
++ return 0x1DF5 << 18 // xvori.b
++ case AXVXORB:
++ return 0x1DF6 << 18 // xvxori.b
++ case AXVNORB:
++ return 0x1DF7 << 18 // xvnori.b
++ case AVSEQB:
++ return 0x0E500 << 15 // vseqi.b
++ case AVSEQH:
++ return 0x0E501 << 15 // vseqi.h
++ case AVSEQW:
++ return 0x0E502 << 15 // vseqi.w
++ case AVSEQV:
++ return 0x0E503 << 15 // vseqi.d
++ case AXVSEQB:
++ return 0x0ED00 << 15 // xvseqi.b
++ case AXVSEQH:
++ return 0x0ED01 << 15 // xvseqi.h
++ case AXVSEQW:
++ return 0x0ED02 << 15 // xvseqi.w
++ case AXVSEQV:
++ return 0x0ED03 << 15 // xvseqi.d
+ }
+
+ if a < 0 {
+--
+2.38.1
+
diff --git a/0027-cmd-internal-obj-loong64-add-V-XV-ADD-SUB-.-B-H-W-D-.patch b/0027-cmd-internal-obj-loong64-add-V-XV-ADD-SUB-.-B-H-W-D-.patch
new file mode 100644
index 0000000000000000000000000000000000000000..2192272d7bb79100579ca52591c8b18158fec4a7
--- /dev/null
+++ b/0027-cmd-internal-obj-loong64-add-V-XV-ADD-SUB-.-B-H-W-D-.patch
@@ -0,0 +1,207 @@
+From f5bbb15710944ebcc7d2c808fe9087892a690bc4 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao
+Date: Wed, 11 Dec 2024 09:26:38 +0800
+Subject: [PATCH 27/44] cmd/internal/obj/loong64: add
+ {V,XV}{ADD/SUB}.{B,H,W,D,Q} instructions support
+
+Go asm syntax:
+ V{ADD/SUB}{B,H,W,V,Q} VK, VJ, VD
+ XV{ADD/SUB}{B,H,W,V,Q} XK, XJ, XD
+
+Equivalent platform assembler syntax:
+ v{add/sub}.{b,h,w,d,q} vd, vj, vk
+ xv{add/sub}.{b,h,w,d,q} xd, xj, xk
+
+Change-Id: Iadc28100c93d6d6c69e9641bfea78fa85d75bddf
+---
+ .../asm/internal/asm/testdata/loong64enc1.s | 22 +++++++
+ src/cmd/internal/obj/loong64/a.out.go | 22 +++++++
+ src/cmd/internal/obj/loong64/anames.go | 20 +++++++
+ src/cmd/internal/obj/loong64/asm.go | 60 +++++++++++++++++++
+ 4 files changed, 124 insertions(+)
+
+diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+index 2418412a3a..76faf2d3cb 100644
+--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s
++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+@@ -555,6 +555,28 @@ lable2:
+ XVXORB $127, X2, X3 // 43fcd977
+ XVNORB $255, X2, X3 // 43fcdf77
+
++ // [X]VADD{B,H,W,V,Q}, [X]VSUB{B,H,W,V,Q} instructions
++ VADDB V1, V2, V3 // 43040a70
++ VADDH V1, V2, V3 // 43840a70
++ VADDW V1, V2, V3 // 43040b70
++ VADDV V1, V2, V3 // 43840b70
++ VADDQ V1, V2, V3 // 43042d71
++ VSUBB V1, V2, V3 // 43040c70
++ VSUBH V1, V2, V3 // 43840c70
++ VSUBW V1, V2, V3 // 43040d70
++ VSUBV V1, V2, V3 // 43840d70
++ VSUBQ V1, V2, V3 // 43842d71
++ XVADDB X3, X2, X1 // 410c0a74
++ XVADDH X3, X2, X1 // 418c0a74
++ XVADDW X3, X2, X1 // 410c0b74
++ XVADDV X3, X2, X1 // 418c0b74
++ XVADDQ X3, X2, X1 // 410c2d75
++ XVSUBB X3, X2, X1 // 410c0c74
++ XVSUBH X3, X2, X1 
// 418c0c74 ++ XVSUBW X3, X2, X1 // 410c0d74 ++ XVSUBV X3, X2, X1 // 418c0d74 ++ XVSUBQ X3, X2, X1 // 418c2d75 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index bd3ce61826..3bef0da869 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -725,6 +725,28 @@ const ( + AVMOVQ + AXVMOVQ + ++ // LSX and LASX arithmetic instructions ++ AVADDB ++ AVADDH ++ AVADDW ++ AVADDV ++ AVADDQ ++ AXVADDB ++ AXVADDH ++ AXVADDW ++ AXVADDV ++ AXVADDQ ++ AVSUBB ++ AVSUBH ++ AVSUBW ++ AVSUBV ++ AVSUBQ ++ AXVSUBB ++ AXVSUBH ++ AXVSUBW ++ AXVSUBV ++ AXVSUBQ ++ + // LSX and LASX Bit-manipulation Instructions + AVANDB + AVORB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 6c1537d123..194021219e 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -257,6 +257,26 @@ var Anames = []string{ + "FTINTRNEVD", + "VMOVQ", + "XVMOVQ", ++ "VADDB", ++ "VADDH", ++ "VADDW", ++ "VADDV", ++ "VADDQ", ++ "XVADDB", ++ "XVADDH", ++ "XVADDW", ++ "XVADDV", ++ "XVADDQ", ++ "VSUBB", ++ "VSUBH", ++ "VSUBW", ++ "VSUBV", ++ "VSUBQ", ++ "XVSUBB", ++ "XVSUBH", ++ "XVSUBW", ++ "XVSUBV", ++ "XVSUBQ", + "VANDB", + "VORB", + "VXORB", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 7247193c95..7489b4dbf6 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1539,6 +1539,16 @@ func buildop(ctxt *obj.Link) { + opset(AVNORV, r0) + opset(AVANDNV, r0) + opset(AVORNV, r0) ++ opset(AVADDB, r0) ++ opset(AVADDH, r0) ++ opset(AVADDW, r0) ++ opset(AVADDV, r0) ++ opset(AVADDQ, r0) ++ opset(AVSUBB, r0) ++ opset(AVSUBH, r0) ++ opset(AVSUBW, r0) ++ opset(AVSUBV, r0) ++ opset(AVSUBQ, r0) + + case AXVANDV: + opset(AXVORV, r0) +@@ -1546,6 +1556,16 @@ func buildop(ctxt *obj.Link) { + opset(AXVNORV, r0) + opset(AXVANDNV, r0) + opset(AXVORNV, r0) ++ opset(AXVADDB, r0) ++ opset(AXVADDH, r0) ++ opset(AXVADDW, r0) ++ opset(AXVADDV, r0) ++ opset(AXVADDQ, r0) ++ opset(AXVSUBB, r0) ++ opset(AXVSUBH, r0) ++ opset(AXVSUBW, r0) ++ opset(AXVSUBV, r0) ++ opset(AXVSUBQ, r0) + + case AVPCNTB: + opset(AVPCNTH, r0) +@@ -2701,6 +2721,46 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0x0EA50 << 15 // xvandn.v + case AXVORNV: + return 0x0EA51 << 15 // xvorn.v ++ case AVADDB: ++ return 0xE014 << 15 // vadd.b ++ case AVADDH: ++ return 0xE015 << 15 // vadd.h ++ case AVADDW: ++ return 0xE016 << 15 // vadd.w ++ case AVADDV: ++ return 0xE017 << 15 // vadd.d ++ case AVADDQ: ++ return 0xE25A << 15 // vadd.q ++ case AVSUBB: ++ return 0xE018 << 15 // vsub.b ++ case AVSUBH: ++ return 0xE019 << 15 // vsub.h ++ case AVSUBW: ++ return 0xE01A << 15 // vsub.w ++ case AVSUBV: ++ return 0xE01B << 15 // vsub.d ++ case AVSUBQ: ++ return 0xE25B << 15 // vsub.q ++ case AXVADDB: ++ return 0xE814 << 15 // xvadd.b ++ case AXVADDH: ++ return 0xE815 << 15 // xvadd.h ++ case AXVADDW: ++ return 0xE816 << 15 // xvadd.w ++ case AXVADDV: ++ return 0xE817 << 15 // xvadd.d ++ case AXVADDQ: ++ return 0xEA5A << 15 // xvadd.q ++ case AXVSUBB: ++ return 0xE818 << 15 // xvsub.b ++ case AXVSUBH: ++ return 0xE819 << 15 // xvsub.h ++ case AXVSUBW: ++ return 0xE81A << 15 // xvsub.w ++ case AXVSUBV: ++ return 0xE81B << 15 // xvsub.d ++ case AXVSUBQ: ++ return 
0xEA5B << 15 // xvsub.q + } + + if a < 0 { +-- +2.38.1 + diff --git a/0028-cmd-internal-obj-loong64-add-V-XV-ILV-L-H-.-B-H-W-D-.patch b/0028-cmd-internal-obj-loong64-add-V-XV-ILV-L-H-.-B-H-W-D-.patch new file mode 100644 index 0000000000000000000000000000000000000000..316746628a9aa2ec11178b65fa685d12a159372e --- /dev/null +++ b/0028-cmd-internal-obj-loong64-add-V-XV-ILV-L-H-.-B-H-W-D-.patch @@ -0,0 +1,181 @@ +From db7ccba69b0c246434a610f3be2ab31c8406b163 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 10:24:13 +0800 +Subject: [PATCH 28/44] cmd/internal/obj/loong64: add {V,XV}ILV{L/H}.{B/H/W/D} + instructions support + +Go asm syntax: + VILV{L/H}{B/H/W/V} VK, VJ, VD + XVILV{L/H}{B/H/W/V} XK, XJ, XD +Equivalent platform assembler syntax: + vilv{l/h}.{b/h/w/d} vd, vj, vk + xvilv{l/h}.{b/h/w/d} xd, xj, xk + +Change-Id: If1f146fd5e049281494026bf4c24d302bcad1373 +--- + .../asm/internal/asm/testdata/loong64enc1.s | 18 +++++++ + src/cmd/internal/obj/loong64/a.out.go | 18 +++++++ + src/cmd/internal/obj/loong64/anames.go | 16 +++++++ + src/cmd/internal/obj/loong64/asm.go | 48 +++++++++++++++++++ + 4 files changed, 100 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 76faf2d3cb..419f257c4a 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -577,6 +577,24 @@ lable2: + XVSUBV X3, X2, X1 // 418c0d74 + XVSUBQ X3, X2, X1 // 418c2d75 + ++ // [X]VILV{L/H}{B,H,W,V} instructions ++ VILVLB V1, V2, V3 // 43041a71 ++ VILVLH V1, V2, V3 // 43841a71 ++ VILVLW V1, V2, V3 // 43041b71 ++ VILVLV V1, V2, V3 // 43841b71 ++ VILVHB V1, V2, V3 // 43041c71 ++ VILVHH V1, V2, V3 // 43841c71 ++ VILVHW V1, V2, V3 // 43041d71 ++ VILVHV V1, V2, V3 // 43841d71 ++ XVILVLB X3, X2, X1 // 410c1a75 ++ XVILVLH X3, X2, X1 // 418c1a75 ++ XVILVLW X3, X2, X1 // 410c1b75 ++ XVILVLV X3, X2, X1 // 418c1b75 ++ XVILVHB X3, X2, X1 // 410c1c75 ++ XVILVHH X3, X2, X1 // 418c1c75 ++ XVILVHW X3, X2, X1 // 410c1d75 ++ XVILVHV X3, X2, X1 // 418c1d75 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 3bef0da869..c7f4769395 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -788,6 +788,24 @@ const ( + AVSEQV + AXVSEQV + ++ // LSX and LASX move and shuffle instructions ++ AVILVLB ++ AVILVLH ++ AVILVLW ++ AVILVLV ++ AVILVHB ++ AVILVHH ++ AVILVHW ++ AVILVHV ++ AXVILVLB ++ AXVILVLH ++ AXVILVLW ++ AXVILVLV ++ AXVILVHB ++ AXVILVHH ++ AXVILVHW ++ AXVILVHV ++ + ALAST + + // aliases +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 194021219e..485940e19c 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -313,5 +313,21 @@ var Anames = []string{ + "XVSEQW", + "VSEQV", + "XVSEQV", ++ "VILVLB", ++ "VILVLH", ++ "VILVLW", ++ "VILVLV", ++ "VILVHB", ++ "VILVHH", ++ "VILVHW", ++ "VILVHV", ++ "XVILVLB", ++ "XVILVLH", ++ "XVILVLW", ++ "XVILVLV", ++ "XVILVHB", ++ "XVILVHH", ++ "XVILVHW", ++ "XVILVHV", + "LAST", + } +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 7489b4dbf6..9ef414a132 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ 
-1549,6 +1549,14 @@ func buildop(ctxt *obj.Link) { + opset(AVSUBW, r0) + opset(AVSUBV, r0) + opset(AVSUBQ, r0) ++ opset(AVILVLB, r0) ++ opset(AVILVLH, r0) ++ opset(AVILVLW, r0) ++ opset(AVILVLV, r0) ++ opset(AVILVHB, r0) ++ opset(AVILVHH, r0) ++ opset(AVILVHW, r0) ++ opset(AVILVHV, r0) + + case AXVANDV: + opset(AXVORV, r0) +@@ -1566,6 +1574,14 @@ func buildop(ctxt *obj.Link) { + opset(AXVSUBW, r0) + opset(AXVSUBV, r0) + opset(AXVSUBQ, r0) ++ opset(AXVILVLB, r0) ++ opset(AXVILVLH, r0) ++ opset(AXVILVLW, r0) ++ opset(AXVILVLV, r0) ++ opset(AXVILVHB, r0) ++ opset(AXVILVHH, r0) ++ opset(AXVILVHW, r0) ++ opset(AXVILVHV, r0) + + case AVPCNTB: + opset(AVPCNTH, r0) +@@ -2761,6 +2777,38 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xE81B << 15 // xvsub.d + case AXVSUBQ: + return 0xEA5B << 15 // xvsub.q ++ case AVILVLB: ++ return 0xE234 << 15 // vilvl.b ++ case AVILVLH: ++ return 0xE235 << 15 // vilvl.h ++ case AVILVLW: ++ return 0xE236 << 15 // vilvl.w ++ case AVILVLV: ++ return 0xE237 << 15 // vilvl.d ++ case AVILVHB: ++ return 0xE238 << 15 // vilvh.b ++ case AVILVHH: ++ return 0xE239 << 15 // vilvh.h ++ case AVILVHW: ++ return 0xE23A << 15 // vilvh.w ++ case AVILVHV: ++ return 0xE23B << 15 // vilvh.d ++ case AXVILVLB: ++ return 0xEA34 << 15 // xvilvl.b ++ case AXVILVLH: ++ return 0xEA35 << 15 // xvilvl.h ++ case AXVILVLW: ++ return 0xEA36 << 15 // xvilvl.w ++ case AXVILVLV: ++ return 0xEA37 << 15 // xvilvl.d ++ case AXVILVHB: ++ return 0xEA38 << 15 // xvilvh.b ++ case AXVILVHH: ++ return 0xEA39 << 15 // xvilvh.h ++ case AXVILVHW: ++ return 0xEA3A << 15 // xvilvh.w ++ case AXVILVHV: ++ return 0xEA3B << 15 // xvilvh.d + } + + if a < 0 { +-- +2.38.1 + diff --git a/0029-cmd-internal-obj-loong64-add-V-XV-SLL-SRL-SRA-ROTR-I.patch b/0029-cmd-internal-obj-loong64-add-V-XV-SLL-SRL-SRA-ROTR-I.patch new file mode 100644 index 0000000000000000000000000000000000000000..71ebec387f8fd8b8c24ca42ab3e6d936640ccb99 --- /dev/null +++ b/0029-cmd-internal-obj-loong64-add-V-XV-SLL-SRL-SRA-ROTR-I.patch @@ -0,0 +1,599 @@ +From d765027e47dec10f8869d04b0bf52661ac63f302 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 14:19:04 +0800 +Subject: [PATCH 29/44] cmd/internal/obj/loong64: add + {V,XV}{SLL/SRL/SRA/ROTR}[I].{B/H/W/D} instructions support + +Go asm syntax: + V{SLL/SRL/SRA/ROTR}{B/H/W/V} $1, V2, V3 + XV{SLL/SRL/SRA/ROTR}{B/H/W/V} $1, X2, X3 + V{SLL/SRL/SRA/ROTR}{B/H/W/V} VK, VJ, VD + XV{SLL/SRL/SRA/ROTR}{B/H/W/V} XK, XJ, XD + +Equivalent platform assembler syntax: + v{sll/srl/sra/rotr}i.{b/h/w/d} v3, v2, $1 + xv{sll/srl/sra/rotr}i.{b/h/w/d} x3, x2, $1 + v{sll/srl/sra/rotr}.{b/h/w/d} vd, vj, vk + xv{sll/srl/sra/rotr}.{b/h/w/d} xd, xj, xk + +Change-Id: I8693e15f3778057e5a1e636d618c6f46acc5042b +--- + .../asm/internal/asm/testdata/loong64enc1.s | 130 +++++++++ + src/cmd/internal/obj/loong64/a.out.go | 33 +++ + src/cmd/internal/obj/loong64/anames.go | 32 ++ + src/cmd/internal/obj/loong64/asm.go | 274 +++++++++++++++++- + 4 files changed, 468 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 419f257c4a..79012784dc 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -595,6 +595,136 @@ lable2: + XVILVHW X3, X2, X1 // 410c1d75 + XVILVHV X3, X2, X1 // 418c1d75 + ++ // [X]{VSLL/VSRL/VSRA/VROTR}{B,H,W,V} instructions ++ VSLLB V1, V2, V3 // 4304e870 ++ VSLLH V1, V2, V3 // 4384e870 ++ VSLLW V1, V2, V3 // 4304e970 ++ VSLLV V1, V2, V3 // 
4384e970 ++ VSRLB V1, V2, V3 // 4304ea70 ++ VSRLH V1, V2, V3 // 4384ea70 ++ VSRLW V1, V2, V3 // 4304eb70 ++ VSRLV V1, V2, V3 // 4384eb70 ++ VSRAB V1, V2, V3 // 4304ec70 ++ VSRAH V1, V2, V3 // 4384ec70 ++ VSRAW V1, V2, V3 // 4304ed70 ++ VSRAV V1, V2, V3 // 4384ed70 ++ VROTRB V1, V2, V3 // 4304ee70 ++ VROTRH V1, V2, V3 // 4384ee70 ++ VROTRW V1, V2, V3 // 4304ef70 ++ VROTRV V1, V2, V3 // 4384ef70 ++ XVSLLB X3, X2, X1 // 410ce874 ++ XVSLLH X3, X2, X1 // 418ce874 ++ XVSLLW X3, X2, X1 // 410ce974 ++ XVSLLV X3, X2, X1 // 418ce974 ++ XVSRLB X3, X2, X1 // 410cea74 ++ XVSRLH X3, X2, X1 // 418cea74 ++ XVSRLW X3, X2, X1 // 410ceb74 ++ XVSRLV X3, X2, X1 // 418ceb74 ++ XVSRAB X3, X2, X1 // 410cec74 ++ XVSRAH X3, X2, X1 // 418cec74 ++ XVSRAW X3, X2, X1 // 410ced74 ++ XVSRAV X3, X2, X1 // 418ced74 ++ XVROTRB X3, X2, X1 // 410cee74 ++ XVROTRH X3, X2, X1 // 418cee74 ++ XVROTRW X3, X2, X1 // 410cef74 ++ XVROTRV X3, X2, X1 // 418cef74 ++ VSLLB $0, V1, V2 // 22202c73 ++ VSLLB $7, V1, V2 // 223c2c73 ++ VSLLB $5, V1 // 21342c73 ++ VSLLH $0, V1, V2 // 22402c73 ++ VSLLH $15, V1, V2 // 227c2c73 ++ VSLLH $10, V1 // 21682c73 ++ VSLLW $0, V1, V2 // 22802c73 ++ VSLLW $31, V1, V2 // 22fc2c73 ++ VSLLW $11, V1 // 21ac2c73 ++ VSLLV $0, V1, V2 // 22002d73 ++ VSLLV $63, V1, V2 // 22fc2d73 ++ VSLLV $30, V1 // 21782d73 ++ VSRLB $0, V1, V2 // 22203073 ++ VSRLB $7, V1, V2 // 223c3073 ++ VSRLB $4, V1 // 21303073 ++ VSRLH $0, V1, V2 // 22403073 ++ VSRLH $15, V1, V2 // 227c3073 ++ VSRLH $9, V1 // 21643073 ++ VSRLW $0, V1, V2 // 22803073 ++ VSRLW $31, V1, V2 // 22fc3073 ++ VSRLW $16, V1 // 21c03073 ++ VSRLV $0, V1, V2 // 22003173 ++ VSRLV $63, V1, V2 // 22fc3173 ++ VSRLV $40, V1 // 21a03173 ++ VSRAB $0, V1, V2 // 22203473 ++ VSRAB $7, V1, V2 // 223c3473 ++ VSRAB $6, V1 // 21383473 ++ VSRAH $0, V1, V2 // 22403473 ++ VSRAH $15, V1, V2 // 227c3473 ++ VSRAH $8, V1 // 21603473 ++ VSRAW $0, V1, V2 // 22803473 ++ VSRAW $31, V1, V2 // 22fc3473 ++ VSRAW $12, V1 // 21b03473 ++ VSRAV $0, V1, V2 // 22003573 ++ VSRAV $63, V1, V2 // 22fc3573 ++ VSRAV $50, V1 // 21c83573 ++ VROTRB $0, V1, V2 // 2220a072 ++ VROTRB $7, V1, V2 // 223ca072 ++ VROTRB $3, V1 // 212ca072 ++ VROTRH $0, V1, V2 // 2240a072 ++ VROTRH $15, V1, V2 // 227ca072 ++ VROTRH $5, V1 // 2154a072 ++ VROTRW $0, V1, V2 // 2280a072 ++ VROTRW $31, V1, V2 // 22fca072 ++ VROTRW $18, V1 // 21c8a072 ++ VROTRV $0, V1, V2 // 2200a172 ++ VROTRV $63, V1, V2 // 22fca172 ++ VROTRV $52, V1 // 21d0a172 ++ XVSLLB $0, X2, X1 // 41202c77 ++ XVSLLB $7, X2, X1 // 413c2c77 ++ XVSLLB $4, X2 // 42302c77 ++ XVSLLH $0, X2, X1 // 41402c77 ++ XVSLLH $15, X2, X1 // 417c2c77 ++ XVSLLH $8, X2 // 42602c77 ++ XVSLLW $0, X2, X1 // 41802c77 ++ XVSLLW $31, X2, X1 // 41fc2c77 ++ XVSLLW $13, X2 // 42b42c77 ++ XVSLLV $0, X2, X1 // 41002d77 ++ XVSLLV $63, X2, X1 // 41fc2d77 ++ XVSLLV $36, X2 // 42902d77 ++ XVSRLB $0, X2, X1 // 41203077 ++ XVSRLB $7, X2, X1 // 413c3077 ++ XVSRLB $5, X2 // 42343077 ++ XVSRLH $0, X2, X1 // 41403077 ++ XVSRLH $15, X2, X1 // 417c3077 ++ XVSRLH $9, X2 // 42643077 ++ XVSRLW $0, X2, X1 // 41803077 ++ XVSRLW $31, X2, X1 // 41fc3077 ++ XVSRLW $14, X2 // 42b83077 ++ XVSRLV $0, X2, X1 // 41003177 ++ XVSRLV $63, X2, X1 // 41fc3177 ++ XVSRLV $45, X2 // 42b43177 ++ XVSRAB $0, X2, X1 // 41203477 ++ XVSRAB $7, X2, X1 // 413c3477 ++ XVSRAB $6, X2 // 42383477 ++ XVSRAH $0, X2, X1 // 41403477 ++ XVSRAH $15, X2, X1 // 417c3477 ++ XVSRAH $10, X2 // 42683477 ++ XVSRAW $0, X2, X1 // 41803477 ++ XVSRAW $31, X2, X1 // 41fc3477 ++ XVSRAW $16, X2 // 42c03477 ++ XVSRAV $0, X2, X1 // 41003577 ++ XVSRAV $63, X2, X1 // 
41fc3577 ++ XVSRAV $48, X2 // 42c03577 ++ XVROTRB $0, X2, X1 // 4120a076 ++ XVROTRB $7, X2, X1 // 413ca076 ++ XVROTRB $3, X2 // 422ca076 ++ XVROTRH $0, X2, X1 // 4140a076 ++ XVROTRH $15, X2, X1 // 417ca076 ++ XVROTRH $13, X2 // 4274a076 ++ XVROTRW $0, X2, X1 // 4180a076 ++ XVROTRW $31, X2, X1 // 41fca076 ++ XVROTRW $24, X2 // 42e0a076 ++ XVROTRV $0, X2, X1 // 4100a176 ++ XVROTRV $63, X2, X1 // 41fca176 ++ XVROTRV $52, X2 // 42d0a176 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index c7f4769395..3257d376b4 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -769,6 +769,39 @@ const ( + AXVANDNV + AXVORNV + ++ AVSLLB ++ AVSLLH ++ AVSLLW ++ AVSLLV ++ AVSRLB ++ AVSRLH ++ AVSRLW ++ AVSRLV ++ AVSRAB ++ AVSRAH ++ AVSRAW ++ AVSRAV ++ AVROTRB ++ AVROTRH ++ AVROTRW ++ AVROTRV ++ AXVSLLB ++ AXVSLLH ++ AXVSLLW ++ AXVSLLV ++ AXVSRLB ++ AXVSRLH ++ AXVSRLW ++ AXVSRLV ++ AXVSRAB ++ AXVSRAH ++ AXVSRAW ++ AXVSRAV ++ AXVROTRB ++ AXVROTRH ++ AXVROTRW ++ AXVROTRV ++ + AVPCNTB + AVPCNTH + AVPCNTW +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 485940e19c..776e272a0b 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -297,6 +297,38 @@ var Anames = []string{ + "XVNORV", + "XVANDNV", + "XVORNV", ++ "VSLLB", ++ "VSLLH", ++ "VSLLW", ++ "VSLLV", ++ "VSRLB", ++ "VSRLH", ++ "VSRLW", ++ "VSRLV", ++ "VSRAB", ++ "VSRAH", ++ "VSRAW", ++ "VSRAV", ++ "VROTRB", ++ "VROTRH", ++ "VROTRW", ++ "VROTRV", ++ "XVSLLB", ++ "XVSLLH", ++ "XVSLLW", ++ "XVSLLV", ++ "XVSRLB", ++ "XVSRLH", ++ "XVSRLW", ++ "XVSRLV", ++ "XVSRAB", ++ "XVSRAH", ++ "XVSRAW", ++ "XVSRAV", ++ "XVROTRB", ++ "XVROTRH", ++ "XVROTRW", ++ "XVROTRV", + "VPCNTB", + "VPCNTH", + "VPCNTW", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 9ef414a132..25a40d736e 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -52,6 +52,10 @@ const ( + // Used to insert padding for under-aligned loops. 
+ branchLoopHead + immFiledSi5 // The encoding of the immediate field in the instruction is 5-bits ++ immFiledUi3 // The encoding of the immediate field in the instruction is 3-bits ++ immFiledUi4 // The encoding of the immediate field in the instruction is 4-bits ++ immFiledUi5 // The encoding of the immediate field in the instruction is 5-bits ++ immFiledUi6 // The encoding of the immediate field in the instruction is 6-bits + immFiledUi8 // The encoding of the immediate field in the instruction is 8-bits + ) + +@@ -102,6 +106,34 @@ var optab = []Optab{ + {AVANDB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, + {AXVANDB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, + ++ {AVSLLB, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVSLLB, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSLLB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ {AXVSLLB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ {AVSLLB, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ {AXVSLLB, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ ++ {AVSLLH, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVSLLH, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSLLH, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ {AXVSLLH, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ {AVSLLH, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ {AXVSLLH, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ ++ {AVSLLW, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVSLLW, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSLLW, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ {AXVSLLW, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ {AVSLLW, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ {AXVSLLW, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ ++ {AVSLLV, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVSLLV, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSLLV, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ {AXVSLLV, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ {AVSLLV, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ {AXVSLLV, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ + {ACLOW, C_REG, C_NONE, C_NONE, C_REG, C_NONE, 9, 4, 0, 0}, + {AABSF, C_FREG, C_NONE, C_NONE, C_FREG, C_NONE, 9, 4, 0, 0}, + {AMOVVF, C_FREG, C_NONE, C_NONE, C_FREG, C_NONE, 9, 4, 0, 0}, +@@ -1521,7 +1553,7 @@ func buildop(ctxt *obj.Link) { + case AXVSEQB: + opset(AXVSEQH, r0) + opset(AXVSEQW, r0) +- opset(AXVSEQV, r0) ++ opset(AXVSEQV, r0) + + case AVANDB: + opset(AVORB, r0) +@@ -1583,6 +1615,46 @@ func buildop(ctxt *obj.Link) { + opset(AXVILVHW, r0) + opset(AXVILVHV, r0) + ++ case AVSLLB: ++ opset(AVSRLB, r0) ++ opset(AVSRAB, r0) ++ opset(AVROTRB, r0) ++ ++ case AXVSLLB: ++ opset(AXVSRLB, r0) ++ opset(AXVSRAB, r0) ++ opset(AXVROTRB, r0) ++ ++ case AVSLLH: ++ opset(AVSRLH, r0) ++ opset(AVSRAH, r0) ++ opset(AVROTRH, r0) ++ ++ case AXVSLLH: ++ opset(AXVSRLH, r0) ++ opset(AXVSRAH, r0) ++ opset(AXVROTRH, r0) ++ ++ case AVSLLW: ++ opset(AVSRLW, r0) ++ opset(AVSRAW, r0) ++ opset(AVROTRW, r0) ++ ++ case AXVSLLW: ++ opset(AXVSRLW, r0) ++ opset(AXVSRAW, r0) ++ opset(AXVROTRW, r0) ++ ++ case AVSLLV: ++ opset(AVSRLV, r0) ++ opset(AVSRAV, r0) ++ opset(AVROTRV, r0) ++ 
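As a usage sketch for the mnemonics registered above (each AVSLL*/AXVSLL* case also covers its SRL/SRA/ROTR variants via opset), Go assembly writes them with the operand order given in this patch's commit message; the TEXT symbol and register choices below are hypothetical, for illustration only:

	#include "textflag.h"

	// shiftDemo: a minimal sketch of LSX/LASX per-element shifts (illustrative).
	TEXT ·shiftDemo(SB), NOSPLIT, $0
		VSLLW   $3, V1, V2   // immediate form: each 32-bit lane of V2 = V1 << 3
		VSRAV   V1, V2, V3   // register form: each 64-bit lane of V3 = V2 >> V1
		XVROTRH $5, X2, X1   // LASX form: each 16-bit lane of X1 = X2 rotated right by 5
		RET
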
++ case AXVSLLV: ++ opset(AXVSRLV, r0) ++ opset(AXVSRAV, r0) ++ opset(AXVROTRV, r0) ++ + case AVPCNTB: + opset(AVPCNTH, r0) + opset(AVPCNTW, r0) +@@ -1629,10 +1701,22 @@ func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { + return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 + } + ++func OP_6IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { ++ return op | (i&0x3F)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 ++} ++ + func OP_5IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { + return op | (i&0x1F)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 + } + ++func OP_4IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { ++ return op | (i&0xF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 ++} ++ ++func OP_3IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { ++ return op | (i&0x7)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 ++} ++ + func OP_IR(op uint32, i uint32, r2 uint32) uint32 { + return op | (i&0xFFFFF)<<5 | (r2&0x1F)<<0 // ui20, rd5 + } +@@ -1994,10 +2078,70 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + o1 = OP_12IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.From.Reg)) + } + ++ case 29: // add $ui3,[r1],r2 ++ v := c.regoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ ++ switch o.flag { ++ case immFiledUi3: ++ c.checkimmFiled(p, v, 3, false) ++ o1 = OP_3IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ default: ++ c.ctxt.Diag("Invalid immediate value type\n%v", p) ++ } ++ + case 30: // mov gr/fr/fcc/fcsr, fr/fcc/fcsr/gr + a := c.specialFpMovInst(p.As, oclass(&p.From), oclass(&p.To)) + o1 = OP_RR(a, uint32(p.From.Reg), uint32(p.To.Reg)) + ++ case 31: // add $ui4,[r1],r2 ++ v := c.regoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ ++ switch o.flag { ++ case immFiledUi4: ++ c.checkimmFiled(p, v, 4, false) ++ o1 = OP_4IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ default: ++ c.ctxt.Diag("Invalid immediate value type\n%v", p) ++ } ++ ++ case 32: // add $ui5,[r1],r2 ++ v := c.regoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ ++ switch o.flag { ++ case immFiledUi5: ++ c.checkimmFiled(p, v, 5, false) ++ o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ default: ++ c.ctxt.Diag("Invalid immediate value type\n%v", p) ++ } ++ ++ case 33: // add $ui6,[r1],r2 ++ v := c.regoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ ++ switch o.flag { ++ case immFiledUi6: ++ c.checkimmFiled(p, v, 6, false) ++ o1 = OP_6IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ default: ++ c.ctxt.Diag("Invalid immediate value type\n%v", p) ++ } ++ + case 34: // mov $con,fr + v := c.regoff(&p.From) + a := AADDU +@@ -2809,6 +2953,70 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xEA3A << 15 // xvilvh.w + case AXVILVHV: + return 0xEA3B << 15 // xvilvh.d ++ case AVSLLB: ++ return 0xE1D0 << 15 // vsll.b ++ case AVSLLH: ++ return 0xE1D1 << 15 // vsll.h ++ case AVSLLW: ++ return 0xE1D2 << 15 // vsll.w ++ case AVSLLV: ++ return 0xE1D3 << 15 // vsll.d ++ case AVSRLB: ++ return 0xE1D4 << 15 // vsrl.b ++ case AVSRLH: ++ return 0xE1D5 << 15 // vsrl.h ++ case AVSRLW: ++ return 0xE1D6 << 15 // vsrl.w ++ case AVSRLV: ++ return 0xE1D7 << 15 // vsrl.d ++ case AVSRAB: ++ return 0xE1D8 << 15 // vsra.b ++ case AVSRAH: ++ return 0xE1D9 << 15 // vsra.h ++ case AVSRAW: ++ return 0xE1DA << 15 // vsra.w ++ case AVSRAV: ++ return 0xE1DB << 15 // vsra.d ++ case AVROTRB: ++ return 0xE1DC << 15 // vrotr.b ++ case AVROTRH: ++ return 0xE1DD << 15 // vrotr.h ++ case 
AVROTRW: ++ return 0xE1DE << 15 // vrotr.w ++ case AVROTRV: ++ return 0xE1DF << 15 // vrotr.d ++ case AXVSLLB: ++ return 0xE9D0 << 15 // xvsll.b ++ case AXVSLLH: ++ return 0xE9D1 << 15 // xvsll.h ++ case AXVSLLW: ++ return 0xE9D2 << 15 // xvsll.w ++ case AXVSLLV: ++ return 0xE9D3 << 15 // xvsll.d ++ case AXVSRLB: ++ return 0xE9D4 << 15 // xvsrl.b ++ case AXVSRLH: ++ return 0xE9D5 << 15 // xvsrl.h ++ case AXVSRLW: ++ return 0xE9D6 << 15 // xvsrl.w ++ case AXVSRLV: ++ return 0xE9D7 << 15 // xvsrl.d ++ case AXVSRAB: ++ return 0xE9D8 << 15 // xvsra.b ++ case AXVSRAH: ++ return 0xE9D9 << 15 // xvsra.h ++ case AXVSRAW: ++ return 0xE9DA << 15 // xvsra.w ++ case AXVSRAV: ++ return 0xE9DB << 15 // xvsra.d ++ case AXVROTRB: ++ return 0xE9DC << 15 // xvrotr.b ++ case AXVROTRH: ++ return 0xE9DD << 15 // xvrotr.h ++ case AXVROTRW: ++ return 0xE9DE << 15 // xvrotr.w ++ case AXVROTRV: ++ return 0xE9DF << 15 // xvrotr.d + } + + if a < 0 { +@@ -3168,6 +3376,70 @@ func (c *ctxt0) opirr(a obj.As) uint32 { + return 0x0ED02 << 15 // xvseqi.w + case AXVSEQV: + return 0x0ED03 << 15 // xvseqi.d ++ case AVROTRB: ++ return 0x1CA8<<18 | 0x1<<13 // vrotri.b ++ case AVROTRH: ++ return 0x1CA8<<18 | 0x1<<14 // vrotri.h ++ case AVROTRW: ++ return 0x1CA8<<18 | 0x1<<15 // vrotri.w ++ case AVROTRV: ++ return 0x1CA8<<18 | 0x1<<16 // vrotri.d ++ case AXVROTRB: ++ return 0x1DA8<<18 | 0x1<<13 // xvrotri.b ++ case AXVROTRH: ++ return 0x1DA8<<18 | 0x1<<14 // xvrotri.h ++ case AXVROTRW: ++ return 0x1DA8<<18 | 0x1<<15 // xvrotri.w ++ case AXVROTRV: ++ return 0x1DA8<<18 | 0x1<<16 // xvrotri.d ++ case AVSLLB: ++ return 0x1CCB<<18 | 0x1<<13 // vslli.b ++ case AVSLLH: ++ return 0x1CCB<<18 | 0x1<<14 // vslli.h ++ case AVSLLW: ++ return 0x1CCB<<18 | 0x1<<15 // vslli.w ++ case AVSLLV: ++ return 0x1CCB<<18 | 0x1<<16 // vslli.d ++ case AVSRLB: ++ return 0x1CCC<<18 | 0x1<<13 // vsrli.b ++ case AVSRLH: ++ return 0x1CCC<<18 | 0x1<<14 // vsrli.h ++ case AVSRLW: ++ return 0x1CCC<<18 | 0x1<<15 // vsrli.w ++ case AVSRLV: ++ return 0x1CCC<<18 | 0x1<<16 // vsrli.d ++ case AVSRAB: ++ return 0x1CCD<<18 | 0x1<<13 // vsrai.b ++ case AVSRAH: ++ return 0x1CCD<<18 | 0x1<<14 // vsrai.h ++ case AVSRAW: ++ return 0x1CCD<<18 | 0x1<<15 // vsrai.w ++ case AVSRAV: ++ return 0x1CCD<<18 | 0x1<<16 // vsrai.d ++ case AXVSLLB: ++ return 0x1DCB<<18 | 0x1<<13 // xvslli.b ++ case AXVSLLH: ++ return 0x1DCB<<18 | 0x1<<14 // xvslli.h ++ case AXVSLLW: ++ return 0x1DCB<<18 | 0x1<<15 // xvslli.w ++ case AXVSLLV: ++ return 0x1DCB<<18 | 0x1<<16 // xvslli.d ++ case AXVSRLB: ++ return 0x1DCC<<18 | 0x1<<13 // xvsrli.b ++ case AXVSRLH: ++ return 0x1DCC<<18 | 0x1<<14 // xvsrli.h ++ case AXVSRLW: ++ return 0x1DCC<<18 | 0x1<<15 // xvsrli.w ++ case AXVSRLV: ++ return 0x1DCC<<18 | 0x1<<16 // xvsrli.d ++ case AXVSRAB: ++ return 0x1DCD<<18 | 0x1<<13 // xvsrai.b ++ case AXVSRAH: ++ return 0x1DCD<<18 | 0x1<<14 // xvsrai.h ++ case AXVSRAW: ++ return 0x1DCD<<18 | 0x1<<15 // xvsrai.w ++ case AXVSRAV: ++ return 0x1DCD<<18 | 0x1<<16 // xvsrai.d + } + + if a < 0 { +-- +2.38.1 + diff --git a/0030-cmd-internal-obj-loong64-add-V-XV-FSQRT-FRECIP-FRSQR.patch b/0030-cmd-internal-obj-loong64-add-V-XV-FSQRT-FRECIP-FRSQR.patch new file mode 100644 index 0000000000000000000000000000000000000000..ba201937d9220018abcfc3a9c8eb7ef92b211b0b --- /dev/null +++ b/0030-cmd-internal-obj-loong64-add-V-XV-FSQRT-FRECIP-FRSQR.patch @@ -0,0 +1,166 @@ +From 344852ff0ccb2b948dc77e0934f246cc5ddf9506 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 16:49:08 +0800 +Subject: [PATCH 30/44] 
cmd/internal/obj/loong64: add + {V,XV}{FSQRT/FRECIP/FRSQRT}.{S/D} instructions support + +Go asm syntax: + V{FSQRT/FRECIP/FRSQRT}{F/D} VJ, VD + XV{FSQRT/FRECIP/FRSQRT}{F/D} XJ, XD + +Equivalent platform assembler syntax: + v{fsqrt/frecip/frsqrt}.{s/d} vd, vj + xv{fsqrt/frecip/frsqrt}.{s/d} xd, xj + +Change-Id: Ied0b959e703d2199939c9ac0608eb3408ea249fa +--- + .../asm/internal/asm/testdata/loong64enc1.s | 14 +++++++ + src/cmd/internal/obj/loong64/a.out.go | 14 +++++++ + src/cmd/internal/obj/loong64/anames.go | 12 ++++++ + src/cmd/internal/obj/loong64/asm.go | 38 ++++++++++++++++++- + 4 files changed, 77 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 79012784dc..e2e8a6de6c 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -725,6 +725,20 @@ lable2: + XVROTRV $63, X2, X1 // 41fca176 + XVROTRV $52, X2 // 42d0a176 + ++ // [X]VF{SQRT/RECIP/RSQRT}{F/D} instructions ++ VFSQRTF V1, V2 // 22e49c72 ++ VFSQRTD V1, V2 // 22e89c72 ++ VFRECIPF V1, V2 // 22f49c72 ++ VFRECIPD V1, V2 // 22f89c72 ++ VFRSQRTF V1, V2 // 22049d72 ++ VFRSQRTD V1, V2 // 22089d72 ++ XVFSQRTF X2, X1 // 41e49c76 ++ XVFSQRTD X2, X1 // 41e89c76 ++ XVFRECIPF X2, X1 // 41f49c76 ++ XVFRECIPD X2, X1 // 41f89c76 ++ XVFRSQRTF X2, X1 // 41049d76 ++ XVFRSQRTD X2, X1 // 41089d76 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 3257d376b4..bd2b1e8300 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -811,6 +811,20 @@ const ( + AXVPCNTW + AXVPCNTV + ++ // LSX and LASX floating point instructions ++ AVFSQRTF ++ AVFSQRTD ++ AVFRECIPF ++ AVFRECIPD ++ AVFRSQRTF ++ AVFRSQRTD ++ AXVFSQRTF ++ AXVFSQRTD ++ AXVFRECIPF ++ AXVFRECIPD ++ AXVFRSQRTF ++ AXVFRSQRTD ++ + // LSX and LASX integer comparison instruction + AVSEQB + AXVSEQB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 776e272a0b..7dbe9b92e6 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -337,6 +337,18 @@ var Anames = []string{ + "XVPCNTH", + "XVPCNTW", + "XVPCNTV", ++ "VFSQRTF", ++ "VFSQRTD", ++ "VFRECIPF", ++ "VFRECIPD", ++ "VFRSQRTF", ++ "VFRSQRTD", ++ "XVFSQRTF", ++ "XVFSQRTD", ++ "XVFRECIPF", ++ "XVFRECIPD", ++ "XVFRSQRTF", ++ "XVFRSQRTD", + "VSEQB", + "XVSEQB", + "VSEQH", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 25a40d736e..af38bef3aa 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1553,7 +1553,7 @@ func buildop(ctxt *obj.Link) { + case AXVSEQB: + opset(AXVSEQH, r0) + opset(AXVSEQW, r0) +- opset(AXVSEQV, r0) ++ opset(AXVSEQV, r0) + + case AVANDB: + opset(AVORB, r0) +@@ -1659,11 +1659,23 @@ func buildop(ctxt *obj.Link) { + opset(AVPCNTH, r0) + opset(AVPCNTW, r0) + opset(AVPCNTV, r0) ++ opset(AVFSQRTF, r0) ++ opset(AVFSQRTD, r0) ++ opset(AVFRECIPF, r0) ++ opset(AVFRECIPD, r0) ++ opset(AVFRSQRTF, r0) ++ opset(AVFRSQRTD, r0) + + case AXVPCNTB: + opset(AXVPCNTH, r0) + opset(AXVPCNTW, r0) + opset(AXVPCNTV, r0) ++ opset(AXVFSQRTF, r0) ++ opset(AXVFSQRTD, r0) ++ opset(AXVFRECIPF, r0) ++ opset(AXVFRECIPD, r0) ++ opset(AXVFRSQRTF, r0) ++ 
opset(AXVFRSQRTD, r0) + } + } + } +@@ -3193,6 +3205,30 @@ func (c *ctxt0) oprr(a obj.As) uint32 { + return 0x1da70a << 10 // xvpcnt.w + case AXVPCNTV: + return 0x1da70b << 10 // xvpcnt.v ++ case AVFSQRTF: ++ return 0x1ca739 << 10 // vfsqrt.s ++ case AVFSQRTD: ++ return 0x1ca73a << 10 // vfsqrt.d ++ case AVFRECIPF: ++ return 0x1ca73d << 10 // vfrecip.s ++ case AVFRECIPD: ++ return 0x1ca73e << 10 // vfrecip.d ++ case AVFRSQRTF: ++ return 0x1ca741 << 10 // vfrsqrt.s ++ case AVFRSQRTD: ++ return 0x1ca742 << 10 // vfrsqrt.d ++ case AXVFSQRTF: ++ return 0x1da739 << 10 // xvfsqrt.s ++ case AXVFSQRTD: ++ return 0x1da73a << 10 // xvfsqrt.d ++ case AXVFRECIPF: ++ return 0x1da73d << 10 // xvfrecip.s ++ case AXVFRECIPD: ++ return 0x1da73e << 10 // xvfrecip.d ++ case AXVFRSQRTF: ++ return 0x1da741 << 10 // xvfrsqrt.s ++ case AXVFRSQRTD: ++ return 0x1da742 << 10 // xvfrsqrt.d + } + + c.ctxt.Diag("bad rr opcode %v", a) +-- +2.38.1 + diff --git a/0031-cmd-internal-obj-loong64-add-V-XV-NEG-B-H-W-V-instru.patch b/0031-cmd-internal-obj-loong64-add-V-XV-NEG-B-H-W-V-instru.patch new file mode 100644 index 0000000000000000000000000000000000000000..40e749c6f39030fd62a0f682dc3cc8486c1577e6 --- /dev/null +++ b/0031-cmd-internal-obj-loong64-add-V-XV-NEG-B-H-W-V-instru.patch @@ -0,0 +1,135 @@ +From 6849aaa3deb1fec44bb7625a70ecc2a19f86a389 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 17:19:04 +0800 +Subject: [PATCH 31/44] cmd/internal/obj/loong64: add {V,XV}NEG{B/H/W/V} + instructions support + +Go asm syntax: + VNEG{B/H/W/V} VJ, VD + XVNEG{B/H/W/V} XJ, XD + +Equivalent platform assembler syntax: + vneg.{b/h/w/d} vd, vj + xvneg.{b/h/w/d} xd, xj + +Change-Id: Ib2df46b5386149efb44fe12e2485c01826339a5d +--- + .../asm/internal/asm/testdata/loong64enc1.s | 10 ++++++++ + src/cmd/internal/obj/loong64/a.out.go | 10 ++++++++ + src/cmd/internal/obj/loong64/anames.go | 8 +++++++ + src/cmd/internal/obj/loong64/asm.go | 24 +++++++++++++++++++ + 4 files changed, 52 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index e2e8a6de6c..9deb3cbafd 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -739,6 +739,16 @@ lable2: + XVFRSQRTF X2, X1 // 41049d76 + XVFRSQRTD X2, X1 // 41089d76 + ++ // [X]VNEG{B/H/W/V} instructions ++ VNEGB V1, V2 // 22309c72 ++ VNEGH V1, V2 // 22349c72 ++ VNEGW V1, V2 // 22389c72 ++ VNEGV V1, V2 // 223c9c72 ++ XVNEGB X2, X1 // 41309c76 ++ XVNEGH X2, X1 // 41349c76 ++ XVNEGW X2, X1 // 41389c76 ++ XVNEGV X2, X1 // 413c9c76 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index bd2b1e8300..486dc9fa89 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -747,6 +747,16 @@ const ( + AXVSUBV + AXVSUBQ + ++ // LSX and LASX integer neg instructions ++ AVNEGB ++ AVNEGH ++ AVNEGW ++ AVNEGV ++ AXVNEGB ++ AXVNEGH ++ AXVNEGW ++ AXVNEGV ++ + // LSX and LASX Bit-manipulation Instructions + AVANDB + AVORB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 7dbe9b92e6..d697b73e71 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -277,6 +277,14 @@ var Anames = []string{ + "XVSUBW", + "XVSUBV", 
+ "XVSUBQ", ++ "VNEGB", ++ "VNEGH", ++ "VNEGW", ++ "VNEGV", ++ "XVNEGB", ++ "XVNEGH", ++ "XVNEGW", ++ "XVNEGV", + "VANDB", + "VORB", + "VXORB", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index af38bef3aa..e2c7afd82d 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1665,6 +1665,10 @@ func buildop(ctxt *obj.Link) { + opset(AVFRECIPD, r0) + opset(AVFRSQRTF, r0) + opset(AVFRSQRTD, r0) ++ opset(AVNEGB, r0) ++ opset(AVNEGH, r0) ++ opset(AVNEGW, r0) ++ opset(AVNEGV, r0) + + case AXVPCNTB: + opset(AXVPCNTH, r0) +@@ -1676,6 +1680,10 @@ func buildop(ctxt *obj.Link) { + opset(AXVFRECIPD, r0) + opset(AXVFRSQRTF, r0) + opset(AXVFRSQRTD, r0) ++ opset(AXVNEGB, r0) ++ opset(AXVNEGH, r0) ++ opset(AXVNEGW, r0) ++ opset(AXVNEGV, r0) + } + } + } +@@ -3229,6 +3237,22 @@ func (c *ctxt0) oprr(a obj.As) uint32 { + return 0x1da741 << 10 // xvfrsqrt.s + case AXVFRSQRTD: + return 0x1da742 << 10 // xvfrsqrt.d ++ case AVNEGB: ++ return 0x1ca70c << 10 // vneg.b ++ case AVNEGH: ++ return 0x1ca70d << 10 // vneg.h ++ case AVNEGW: ++ return 0x1ca70e << 10 // vneg.w ++ case AVNEGV: ++ return 0x1ca70f << 10 // vneg.d ++ case AXVNEGB: ++ return 0x1da70c << 10 // xvneg.b ++ case AXVNEGH: ++ return 0x1da70d << 10 // xvneg.h ++ case AXVNEGW: ++ return 0x1da70e << 10 // xvneg.w ++ case AXVNEGV: ++ return 0x1da70f << 10 // xvneg.d + } + + c.ctxt.Diag("bad rr opcode %v", a) +-- +2.38.1 + diff --git a/0032-cmd-internal-obj-loong64-add-V-XV-MUL-B-H-W-V-and-V-.patch b/0032-cmd-internal-obj-loong64-add-V-XV-MUL-B-H-W-V-and-V-.patch new file mode 100644 index 0000000000000000000000000000000000000000..67724ac532b420e34fd99dc4c3ea21173f284099 --- /dev/null +++ b/0032-cmd-internal-obj-loong64-add-V-XV-MUL-B-H-W-V-and-V-.patch @@ -0,0 +1,235 @@ +From 984f12cbb1763c855882b3c8e89727ad560b38c1 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 17:46:09 +0800 +Subject: [PATCH 32/44] cmd/internal/obj/loong64: add {V,XV}MUL{B/H/W/V} and + {V,XV}MUH{B/H/W/V}[U] instructions support + +Go asm syntax: + VMUL{B/H/W/V} VK, VJ, VD + VMUH{B/H/W/V}[U] VK, VJ, VD + XVMUL{B/H/W/V} XK, XJ, XD + XVMUH{B/H/W/V}[U] XK, XJ, XD + +Equivalent platform assembler syntax: + vmul.{b/h/w/d} vd, vj, vk + vmuh.{b/h/w/d}[u] vd, vj, vk + xvmul.{b/h/w/d} xd, xj, xk + xvmuh.{b/h/w/d}[u] xd, xj, xk + +Change-Id: I8890f8a41100e4681a833c27067f0f76b593f731 +--- + .../asm/internal/asm/testdata/loong64enc1.s | 26 +++++++ + src/cmd/internal/obj/loong64/a.out.go | 26 +++++++ + src/cmd/internal/obj/loong64/anames.go | 24 +++++++ + src/cmd/internal/obj/loong64/asm.go | 72 +++++++++++++++++++ + 4 files changed, 148 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 9deb3cbafd..c8b490234f 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -749,6 +749,32 @@ lable2: + XVNEGW X2, X1 // 41389c76 + XVNEGV X2, X1 // 413c9c76 + ++ // [X]VMUL{B/H/W/V} and [X]VMUH{B/H/W/V}[U] instructions ++ VMULB V1, V2, V3 // 43048470 ++ VMULH V1, V2, V3 // 43848470 ++ VMULW V1, V2, V3 // 43048570 ++ VMULV V1, V2, V3 // 43848570 ++ VMUHB V1, V2, V3 // 43048670 ++ VMUHH V1, V2, V3 // 43848670 ++ VMUHW V1, V2, V3 // 43048770 ++ VMUHV V1, V2, V3 // 43848770 ++ VMUHBU V1, V2, V3 // 43048870 ++ VMUHHU V1, V2, V3 // 43848870 ++ VMUHWU V1, V2, V3 // 43048970 ++ VMUHVU V1, V2, V3 // 43848970 ++ XVMULB X3, X2, X1 // 410c8474 ++ XVMULH X3, X2, X1 // 
418c8474 ++ XVMULW X3, X2, X1 // 410c8574 ++ XVMULV X3, X2, X1 // 418c8574 ++ XVMUHB X3, X2, X1 // 410c8674 ++ XVMUHH X3, X2, X1 // 418c8674 ++ XVMUHW X3, X2, X1 // 410c8774 ++ XVMUHV X3, X2, X1 // 418c8774 ++ XVMUHBU X3, X2, X1 // 410c8874 ++ XVMUHHU X3, X2, X1 // 418c8874 ++ XVMUHWU X3, X2, X1 // 410c8974 ++ XVMUHVU X3, X2, X1 // 418c8974 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 486dc9fa89..95744e77a1 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -757,6 +757,32 @@ const ( + AXVNEGW + AXVNEGV + ++ // LSX and LASX integer mul instructions ++ AVMULB ++ AVMULH ++ AVMULW ++ AVMULV ++ AVMUHB ++ AVMUHH ++ AVMUHW ++ AVMUHV ++ AVMUHBU ++ AVMUHHU ++ AVMUHWU ++ AVMUHVU ++ AXVMULB ++ AXVMULH ++ AXVMULW ++ AXVMULV ++ AXVMUHB ++ AXVMUHH ++ AXVMUHW ++ AXVMUHV ++ AXVMUHBU ++ AXVMUHHU ++ AXVMUHWU ++ AXVMUHVU ++ + // LSX and LASX Bit-manipulation Instructions + AVANDB + AVORB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index d697b73e71..d0cd3a26fa 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -285,6 +285,30 @@ var Anames = []string{ + "XVNEGH", + "XVNEGW", + "XVNEGV", ++ "VMULB", ++ "VMULH", ++ "VMULW", ++ "VMULV", ++ "VMUHB", ++ "VMUHH", ++ "VMUHW", ++ "VMUHV", ++ "VMUHBU", ++ "VMUHHU", ++ "VMUHWU", ++ "VMUHVU", ++ "XVMULB", ++ "XVMULH", ++ "XVMULW", ++ "XVMULV", ++ "XVMUHB", ++ "XVMUHH", ++ "XVMUHW", ++ "XVMUHV", ++ "XVMUHBU", ++ "XVMUHHU", ++ "XVMUHWU", ++ "XVMUHVU", + "VANDB", + "VORB", + "VXORB", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index e2c7afd82d..7fb99f66e6 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1589,6 +1589,18 @@ func buildop(ctxt *obj.Link) { + opset(AVILVHH, r0) + opset(AVILVHW, r0) + opset(AVILVHV, r0) ++ opset(AVMULB, r0) ++ opset(AVMULH, r0) ++ opset(AVMULW, r0) ++ opset(AVMULV, r0) ++ opset(AVMUHB, r0) ++ opset(AVMUHH, r0) ++ opset(AVMUHW, r0) ++ opset(AVMUHV, r0) ++ opset(AVMUHBU, r0) ++ opset(AVMUHHU, r0) ++ opset(AVMUHWU, r0) ++ opset(AVMUHVU, r0) + + case AXVANDV: + opset(AXVORV, r0) +@@ -1614,6 +1626,18 @@ func buildop(ctxt *obj.Link) { + opset(AXVILVHH, r0) + opset(AXVILVHW, r0) + opset(AXVILVHV, r0) ++ opset(AXVMULB, r0) ++ opset(AXVMULH, r0) ++ opset(AXVMULW, r0) ++ opset(AXVMULV, r0) ++ opset(AXVMUHB, r0) ++ opset(AXVMUHH, r0) ++ opset(AXVMUHW, r0) ++ opset(AXVMUHV, r0) ++ opset(AXVMUHBU, r0) ++ opset(AXVMUHHU, r0) ++ opset(AXVMUHWU, r0) ++ opset(AXVMUHVU, r0) + + case AVSLLB: + opset(AVSRLB, r0) +@@ -3037,6 +3061,54 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xE9DE << 15 // xvrotr.w + case AXVROTRV: + return 0xE9DF << 15 // xvrotr.d ++ case AVMULB: ++ return 0xe108 << 15 // vmul.b ++ case AVMULH: ++ return 0xe109 << 15 // vmul.h ++ case AVMULW: ++ return 0xe10a << 15 // vmul.w ++ case AVMULV: ++ return 0xe10b << 15 // vmul.d ++ case AVMUHB: ++ return 0xe10c << 15 // vmuh.b ++ case AVMUHH: ++ return 0xe10d << 15 // vmuh.h ++ case AVMUHW: ++ return 0xe10e << 15 // vmuh.w ++ case AVMUHV: ++ return 0xe10f << 15 // vmuh.d ++ case AVMUHBU: ++ return 0xe110 << 15 // vmuh.bu ++ case AVMUHHU: ++ return 0xe111 << 15 // vmuh.hu ++ case AVMUHWU: ++ return 0xe112 << 15 // vmuh.wu 
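The 17-bit constants in these oprrr cases occupy bits 31:15 of the instruction word; the assembler then ORs the vk, vj and vd register numbers into bits 14:10, 9:5 and 4:0, the same field positions the OP_*IRR helpers in this file use. A standalone sanity check of that layout against the testdata line `VMULW V1, V2, V3 // 43048570` (illustrative; not part of the patch):

	package main

	import "fmt"

	func main() {
		op := uint32(0xe10a) << 15 // vmul.w, as returned by oprrr
		vk, vj, vd := uint32(1), uint32(2), uint32(3)
		insn := op | vk<<10 | vj<<5 | vd
		fmt.Printf("%08x\n", insn) // 70850443, i.e. little-endian bytes 43 04 85 70
	}

Note that the testdata comments list the encoding in memory (little-endian) byte order, which is why 0x70850443 appears there as 43048570.
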
++ case AVMUHVU: ++ return 0xe113 << 15 // vmuh.du ++ case AXVMULB: ++ return 0xe908 << 15 // xvmul.b ++ case AXVMULH: ++ return 0xe909 << 15 // xvmul.h ++ case AXVMULW: ++ return 0xe90a << 15 // xvmul.w ++ case AXVMULV: ++ return 0xe90b << 15 // xvmul.d ++ case AXVMUHB: ++ return 0xe90c << 15 // xvmuh.b ++ case AXVMUHH: ++ return 0xe90d << 15 // xvmuh.h ++ case AXVMUHW: ++ return 0xe90e << 15 // xvmuh.w ++ case AXVMUHV: ++ return 0xe90f << 15 // xvmuh.d ++ case AXVMUHBU: ++ return 0xe910 << 15 // xvmuh.bu ++ case AXVMUHHU: ++ return 0xe911 << 15 // xvmuh.hu ++ case AXVMUHWU: ++ return 0xe912 << 15 // xvmuh.wu ++ case AXVMUHVU: ++ return 0xe913 << 15 // xvmuh.du + } + + if a < 0 { +-- +2.38.1 + diff --git a/0033-cmd-internal-obj-loong64-add-V-XV-DIV-B-H-W-V-U-and-.patch b/0033-cmd-internal-obj-loong64-add-V-XV-DIV-B-H-W-V-U-and-.patch new file mode 100644 index 0000000000000000000000000000000000000000..967f56d10b28d942f5868f28bd56209e5a7fbeb9 --- /dev/null +++ b/0033-cmd-internal-obj-loong64-add-V-XV-DIV-B-H-W-V-U-and-.patch @@ -0,0 +1,283 @@ +From 116a2261b3a110e6ff4f9608f447e6f07156d55f Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 18:08:16 +0800 +Subject: [PATCH 33/44] cmd/internal/obj/loong64: add {V,XV}DIV{B/H/W/V}[U] and + {V,XV}MOD{B/H/W/V}[U] instructions support + +Go asm syntax: + VDIV{B/H/W/V}[U] VK, VJ, VD + XVDIV{B/H/W/V}[U] XK, XJ, XD + VMOD{B/H/W/V}[U] VK, VJ, VD + XVMOD{B/H/W/V}[U] XK, XJ, XD + +Equivalent platform assembler syntax: + vdiv.{b/h/w/d}[u] vd, vj, vk + xvdiv.{b/h/w/d}[u] xd, xj, xk + vmod.{b/h/w/d}[u] vd, vj, vk + xvmod.{b/h/w/d}[u] xd, xj, xk + +Change-Id: I27e9bc8999e6525a27f0bf12b21cc896c5a2a69c +--- + .../asm/internal/asm/testdata/loong64enc1.s | 34 +++++++ + src/cmd/internal/obj/loong64/a.out.go | 34 +++++++ + src/cmd/internal/obj/loong64/anames.go | 32 +++++++ + src/cmd/internal/obj/loong64/asm.go | 96 +++++++++++++++++++ + 4 files changed, 196 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index c8b490234f..bbac6036cf 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -775,6 +775,40 @@ lable2: + XVMUHWU X3, X2, X1 // 410c8974 + XVMUHVU X3, X2, X1 // 418c8974 + ++ // [X]VDIV{B/H/W/V}[U] and [X]VMOD{B/H/W/V}[U] instructions ++ VDIVB V1, V2, V3 // 4304e070 ++ VDIVH V1, V2, V3 // 4384e070 ++ VDIVW V1, V2, V3 // 4304e170 ++ VDIVV V1, V2, V3 // 4384e170 ++ VDIVBU V1, V2, V3 // 4304e470 ++ VDIVHU V1, V2, V3 // 4384e470 ++ VDIVWU V1, V2, V3 // 4304e570 ++ VDIVVU V1, V2, V3 // 4384e570 ++ VMODB V1, V2, V3 // 4304e270 ++ VMODH V1, V2, V3 // 4384e270 ++ VMODW V1, V2, V3 // 4304e370 ++ VMODV V1, V2, V3 // 4384e370 ++ VMODBU V1, V2, V3 // 4304e670 ++ VMODHU V1, V2, V3 // 4384e670 ++ VMODWU V1, V2, V3 // 4304e770 ++ VMODVU V1, V2, V3 // 4384e770 ++ XVDIVB X3, X2, X1 // 410ce074 ++ XVDIVH X3, X2, X1 // 418ce074 ++ XVDIVW X3, X2, X1 // 410ce174 ++ XVDIVV X3, X2, X1 // 418ce174 ++ XVDIVBU X3, X2, X1 // 410ce474 ++ XVDIVHU X3, X2, X1 // 418ce474 ++ XVDIVWU X3, X2, X1 // 410ce574 ++ XVDIVVU X3, X2, X1 // 418ce574 ++ XVMODB X3, X2, X1 // 410ce274 ++ XVMODH X3, X2, X1 // 418ce274 ++ XVMODW X3, X2, X1 // 410ce374 ++ XVMODV X3, X2, X1 // 418ce374 ++ XVMODBU X3, X2, X1 // 410ce674 ++ XVMODHU X3, X2, X1 // 418ce674 ++ XVMODWU X3, X2, X1 // 410ce774 ++ XVMODVU X3, X2, X1 // 418ce774 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV 
$0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 95744e77a1..9164e9d59f 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -783,6 +783,40 @@ const ( + AXVMUHWU + AXVMUHVU + ++ // LSX and LASX integer div and mod instructions ++ AVDIVB ++ AVDIVH ++ AVDIVW ++ AVDIVV ++ AVDIVBU ++ AVDIVHU ++ AVDIVWU ++ AVDIVVU ++ AVMODB ++ AVMODH ++ AVMODW ++ AVMODV ++ AVMODBU ++ AVMODHU ++ AVMODWU ++ AVMODVU ++ AXVDIVB ++ AXVDIVH ++ AXVDIVW ++ AXVDIVV ++ AXVDIVBU ++ AXVDIVHU ++ AXVDIVWU ++ AXVDIVVU ++ AXVMODB ++ AXVMODH ++ AXVMODW ++ AXVMODV ++ AXVMODBU ++ AXVMODHU ++ AXVMODWU ++ AXVMODVU ++ + // LSX and LASX Bit-manipulation Instructions + AVANDB + AVORB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index d0cd3a26fa..15a264c8e2 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -309,6 +309,38 @@ var Anames = []string{ + "XVMUHHU", + "XVMUHWU", + "XVMUHVU", ++ "VDIVB", ++ "VDIVH", ++ "VDIVW", ++ "VDIVV", ++ "VDIVBU", ++ "VDIVHU", ++ "VDIVWU", ++ "VDIVVU", ++ "VMODB", ++ "VMODH", ++ "VMODW", ++ "VMODV", ++ "VMODBU", ++ "VMODHU", ++ "VMODWU", ++ "VMODVU", ++ "XVDIVB", ++ "XVDIVH", ++ "XVDIVW", ++ "XVDIVV", ++ "XVDIVBU", ++ "XVDIVHU", ++ "XVDIVWU", ++ "XVDIVVU", ++ "XVMODB", ++ "XVMODH", ++ "XVMODW", ++ "XVMODV", ++ "XVMODBU", ++ "XVMODHU", ++ "XVMODWU", ++ "XVMODVU", + "VANDB", + "VORB", + "VXORB", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 7fb99f66e6..7a14137374 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1601,6 +1601,22 @@ func buildop(ctxt *obj.Link) { + opset(AVMUHHU, r0) + opset(AVMUHWU, r0) + opset(AVMUHVU, r0) ++ opset(AVDIVB, r0) ++ opset(AVDIVH, r0) ++ opset(AVDIVW, r0) ++ opset(AVDIVV, r0) ++ opset(AVMODB, r0) ++ opset(AVMODH, r0) ++ opset(AVMODW, r0) ++ opset(AVMODV, r0) ++ opset(AVDIVBU, r0) ++ opset(AVDIVHU, r0) ++ opset(AVDIVWU, r0) ++ opset(AVDIVVU, r0) ++ opset(AVMODBU, r0) ++ opset(AVMODHU, r0) ++ opset(AVMODWU, r0) ++ opset(AVMODVU, r0) + + case AXVANDV: + opset(AXVORV, r0) +@@ -1638,6 +1654,22 @@ func buildop(ctxt *obj.Link) { + opset(AXVMUHHU, r0) + opset(AXVMUHWU, r0) + opset(AXVMUHVU, r0) ++ opset(AXVDIVB, r0) ++ opset(AXVDIVH, r0) ++ opset(AXVDIVW, r0) ++ opset(AXVDIVV, r0) ++ opset(AXVMODB, r0) ++ opset(AXVMODH, r0) ++ opset(AXVMODW, r0) ++ opset(AXVMODV, r0) ++ opset(AXVDIVBU, r0) ++ opset(AXVDIVHU, r0) ++ opset(AXVDIVWU, r0) ++ opset(AXVDIVVU, r0) ++ opset(AXVMODBU, r0) ++ opset(AXVMODHU, r0) ++ opset(AXVMODWU, r0) ++ opset(AXVMODVU, r0) + + case AVSLLB: + opset(AVSRLB, r0) +@@ -3109,6 +3141,70 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xe912 << 15 // xvmuh.wu + case AXVMUHVU: + return 0xe913 << 15 // xvmuh.du ++ case AVDIVB: ++ return 0xe1c0 << 15 // vdiv.b ++ case AVDIVH: ++ return 0xe1c1 << 15 // vdiv.h ++ case AVDIVW: ++ return 0xe1c2 << 15 // vdiv.w ++ case AVDIVV: ++ return 0xe1c3 << 15 // vdiv.d ++ case AVMODB: ++ return 0xe1c4 << 15 // vmod.b ++ case AVMODH: ++ return 0xe1c5 << 15 // vmod.h ++ case AVMODW: ++ return 0xe1c6 << 15 // vmod.w ++ case AVMODV: ++ return 0xe1c7 << 15 // vmod.d ++ case AVDIVBU: ++ return 0xe1c8 << 15 // vdiv.bu ++ case AVDIVHU: ++ return 0xe1c9 << 15 // vdiv.hu ++ case AVDIVWU: ++ return 0xe1ca << 15 // vdiv.wu ++ case AVDIVVU: ++ return 0xe1cb << 15 // vdiv.du ++ case 
AVMODBU: ++ return 0xe1cc << 15 // vmod.bu ++ case AVMODHU: ++ return 0xe1cd << 15 // vmod.hu ++ case AVMODWU: ++ return 0xe1ce << 15 // vmod.wu ++ case AVMODVU: ++ return 0xe1cf << 15 // vmod.du ++ case AXVDIVB: ++ return 0xe9c0 << 15 // xvdiv.b ++ case AXVDIVH: ++ return 0xe9c1 << 15 // xvdiv.h ++ case AXVDIVW: ++ return 0xe9c2 << 15 // xvdiv.w ++ case AXVDIVV: ++ return 0xe9c3 << 15 // xvdiv.d ++ case AXVMODB: ++ return 0xe9c4 << 15 // xvmod.b ++ case AXVMODH: ++ return 0xe9c5 << 15 // xvmod.h ++ case AXVMODW: ++ return 0xe9c6 << 15 // xvmod.w ++ case AXVMODV: ++ return 0xe9c7 << 15 // xvmod.d ++ case AXVDIVBU: ++ return 0xe9c8 << 15 // xvdiv.bu ++ case AXVDIVHU: ++ return 0xe9c9 << 15 // xvdiv.hu ++ case AXVDIVWU: ++ return 0xe9ca << 15 // xvdiv.wu ++ case AXVDIVVU: ++ return 0xe9cb << 15 // xvdiv.du ++ case AXVMODBU: ++ return 0xe9cc << 15 // xvmod.bu ++ case AXVMODHU: ++ return 0xe9cd << 15 // xvmod.hu ++ case AXVMODWU: ++ return 0xe9ce << 15 // xvmod.wu ++ case AXVMODVU: ++ return 0xe9cf << 15 // xvmod.du + } + + if a < 0 { +-- +2.38.1 + diff --git a/0034-cmd-internal-obj-loong64-add-V-XV-BITCLR-BITSET-BITR.patch b/0034-cmd-internal-obj-loong64-add-V-XV-BITCLR-BITSET-BITR.patch new file mode 100644 index 0000000000000000000000000000000000000000..2a2d4b13d8efbdfe73ea616c881cc4dc7968ad49 --- /dev/null +++ b/0034-cmd-internal-obj-loong64-add-V-XV-BITCLR-BITSET-BITR.patch @@ -0,0 +1,341 @@ +From 054df785d79675c02f6bd2ad3ace9f1ce5874e84 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Thu, 12 Dec 2024 10:54:00 +0800 +Subject: [PATCH 34/44] cmd/internal/obj/loong64: add + {V,XV}{BITCLR/BITSET/BITREV}[I].{B/H/W/D} instructions support + +Go asm syntax: + V{BITCLR/BITSET/BITREV}{B/H/W/V} $1, V2, V3 + XV{BITCLR/BITSET/BITREV}{B/H/W/V} $1, X2, X3 + V{BITCLR/BITSET/BITREV}{B/H/W/V} VK, VJ, VD + XV{BITCLR/BITSET/BITREV}{B/H/W/V} XK, XJ, XD + +Equivalent platform assembler syntax: + v{bitclr/bitset/bitrev}i.{b/h/w/d} v3, v2, $1 + xv{bitclr/bitset/bitrev}i.{b/h/w/d} x3, x2, $1 + v{bitclr/bitset/bitrev}.{b/h/w/d} vd, vj, vk + xv{bitclr/bitset/bitrev}.{b/h/w/d} xd, xj, xk + +Change-Id: Id44e6cb7c22d650bb6b4d9f6faee5dcda4edb24e +--- + .../asm/internal/asm/testdata/loong64enc1.s | 50 ++++++++ + src/cmd/internal/obj/loong64/a.out.go | 25 ++++ + src/cmd/internal/obj/loong64/anames.go | 24 ++++ + src/cmd/internal/obj/loong64/asm.go | 120 ++++++++++++++++++ + 4 files changed, 219 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index bbac6036cf..19070c89ef 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -809,6 +809,56 @@ lable2: + XVMODWU X3, X2, X1 // 410ce774 + XVMODVU X3, X2, X1 // 418ce774 + ++ // [X]{VBITCLR/VBITSET/VBITREV}{B,H,W,V} instructions ++ VBITCLRB V1, V2, V3 // 43040c71 ++ VBITCLRH V1, V2, V3 // 43840c71 ++ VBITCLRW V1, V2, V3 // 43040d71 ++ VBITCLRV V1, V2, V3 // 43840d71 ++ VBITSETB V1, V2, V3 // 43040e71 ++ VBITSETH V1, V2, V3 // 43840e71 ++ VBITSETW V1, V2, V3 // 43040f71 ++ VBITSETV V1, V2, V3 // 43840f71 ++ VBITREVB V1, V2, V3 // 43041071 ++ VBITREVH V1, V2, V3 // 43841071 ++ VBITREVW V1, V2, V3 // 43041171 ++ VBITREVV V1, V2, V3 // 43841171 ++ XVBITCLRB X3, X2, X1 // 410c0c75 ++ XVBITCLRH X3, X2, X1 // 418c0c75 ++ XVBITCLRW X3, X2, X1 // 410c0d75 ++ XVBITCLRV X3, X2, X1 // 418c0d75 ++ XVBITSETB X3, X2, X1 // 410c0e75 ++ XVBITSETH X3, X2, X1 // 418c0e75 ++ XVBITSETW X3, X2, X1 // 410c0f75 ++ XVBITSETV X3, X2, X1 // 418c0f75 ++ 
XVBITREVB X3, X2, X1 // 410c1075 ++ XVBITREVH X3, X2, X1 // 418c1075 ++ XVBITREVW X3, X2, X1 // 410c1175 ++ XVBITREVV X3, X2, X1 // 418c1175 ++ VBITCLRB $7, V2, V3 // 433c1073 ++ VBITCLRH $15, V2, V3 // 437c1073 ++ VBITCLRW $31, V2, V3 // 43fc1073 ++ VBITCLRV $63, V2, V3 // 43fc1173 ++ VBITSETB $7, V2, V3 // 433c1473 ++ VBITSETH $15, V2, V3 // 437c1473 ++ VBITSETW $31, V2, V3 // 43fc1473 ++ VBITSETV $63, V2, V3 // 43fc1573 ++ VBITREVB $7, V2, V3 // 433c1873 ++ VBITREVH $15, V2, V3 // 437c1873 ++ VBITREVW $31, V2, V3 // 43fc1873 ++ VBITREVV $63, V2, V3 // 43fc1973 ++ XVBITCLRB $7, X2, X1 // 413c1077 ++ XVBITCLRH $15, X2, X1 // 417c1077 ++ XVBITCLRW $31, X2, X1 // 41fc1077 ++ XVBITCLRV $63, X2, X1 // 41fc1177 ++ XVBITSETB $7, X2, X1 // 413c1477 ++ XVBITSETH $15, X2, X1 // 417c1477 ++ XVBITSETW $31, X2, X1 // 41fc1477 ++ XVBITSETV $63, X2, X1 // 41fc1577 ++ XVBITREVB $7, X2, X1 // 413c1877 ++ XVBITREVH $15, X2, X1 // 417c1877 ++ XVBITREVW $31, X2, X1 // 41fc1877 ++ XVBITREVV $63, X2, X1 // 41fc1977 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 9164e9d59f..1fadbc648a 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -881,6 +881,31 @@ const ( + AXVPCNTW + AXVPCNTV + ++ AVBITCLRB ++ AVBITCLRH ++ AVBITCLRW ++ AVBITCLRV ++ AVBITSETB ++ AVBITSETH ++ AVBITSETW ++ AVBITSETV ++ AVBITREVB ++ AVBITREVH ++ AVBITREVW ++ AVBITREVV ++ AXVBITCLRB ++ AXVBITCLRH ++ AXVBITCLRW ++ AXVBITCLRV ++ AXVBITSETB ++ AXVBITSETH ++ AXVBITSETW ++ AXVBITSETV ++ AXVBITREVB ++ AXVBITREVH ++ AXVBITREVW ++ AXVBITREVV ++ + // LSX and LASX floating point instructions + AVFSQRTF + AVFSQRTD +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 15a264c8e2..aee0da0a6e 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -401,6 +401,30 @@ var Anames = []string{ + "XVPCNTH", + "XVPCNTW", + "XVPCNTV", ++ "VBITCLRB", ++ "VBITCLRH", ++ "VBITCLRW", ++ "VBITCLRV", ++ "VBITSETB", ++ "VBITSETH", ++ "VBITSETW", ++ "VBITSETV", ++ "VBITREVB", ++ "VBITREVH", ++ "VBITREVW", ++ "VBITREVV", ++ "XVBITCLRB", ++ "XVBITCLRH", ++ "XVBITCLRW", ++ "XVBITCLRV", ++ "XVBITSETB", ++ "XVBITSETH", ++ "XVBITSETW", ++ "XVBITSETV", ++ "XVBITREVB", ++ "XVBITREVH", ++ "XVBITREVW", ++ "XVBITREVV", + "VFSQRTF", + "VFSQRTD", + "VFRECIPF", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 7a14137374..657d32ae81 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1675,41 +1675,65 @@ func buildop(ctxt *obj.Link) { + opset(AVSRLB, r0) + opset(AVSRAB, r0) + opset(AVROTRB, r0) ++ opset(AVBITCLRB, r0) ++ opset(AVBITSETB, r0) ++ opset(AVBITREVB, r0) + + case AXVSLLB: + opset(AXVSRLB, r0) + opset(AXVSRAB, r0) + opset(AXVROTRB, r0) ++ opset(AXVBITCLRB, r0) ++ opset(AXVBITSETB, r0) ++ opset(AXVBITREVB, r0) + + case AVSLLH: + opset(AVSRLH, r0) + opset(AVSRAH, r0) + opset(AVROTRH, r0) ++ opset(AVBITCLRH, r0) ++ opset(AVBITSETH, r0) ++ opset(AVBITREVH, r0) + + case AXVSLLH: + opset(AXVSRLH, r0) + opset(AXVSRAH, r0) + opset(AXVROTRH, r0) ++ opset(AXVBITCLRH, r0) ++ opset(AXVBITSETH, r0) ++ opset(AXVBITREVH, r0) + + case AVSLLW: + opset(AVSRLW, r0) + opset(AVSRAW, r0) + opset(AVROTRW, r0) ++ opset(AVBITCLRW, r0) ++ 
opset(AVBITSETW, r0) ++ opset(AVBITREVW, r0) + + case AXVSLLW: + opset(AXVSRLW, r0) + opset(AXVSRAW, r0) + opset(AXVROTRW, r0) ++ opset(AXVBITCLRW, r0) ++ opset(AXVBITSETW, r0) ++ opset(AXVBITREVW, r0) + + case AVSLLV: + opset(AVSRLV, r0) + opset(AVSRAV, r0) + opset(AVROTRV, r0) ++ opset(AVBITCLRV, r0) ++ opset(AVBITSETV, r0) ++ opset(AVBITREVV, r0) + + case AXVSLLV: + opset(AXVSRLV, r0) + opset(AXVSRAV, r0) + opset(AXVROTRV, r0) ++ opset(AXVBITCLRV, r0) ++ opset(AXVBITSETV, r0) ++ opset(AXVBITREVV, r0) + + case AVPCNTB: + opset(AVPCNTH, r0) +@@ -3205,6 +3229,54 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xe9ce << 15 // xvmod.wu + case AXVMODVU: + return 0xe9cf << 15 // xvmod.du ++ case AVBITCLRB: ++ return 0xe218 << 15 // vbitclr.b ++ case AVBITCLRH: ++ return 0xe219 << 15 // vbitclr.h ++ case AVBITCLRW: ++ return 0xe21a << 15 // vbitclr.w ++ case AVBITCLRV: ++ return 0xe21b << 15 // vbitclr.d ++ case AVBITSETB: ++ return 0xe21c << 15 // vbitset.b ++ case AVBITSETH: ++ return 0xe21d << 15 // vbitset.h ++ case AVBITSETW: ++ return 0xe21e << 15 // vbitset.w ++ case AVBITSETV: ++ return 0xe21f << 15 // vbitset.d ++ case AVBITREVB: ++ return 0xe220 << 15 // vbitrev.b ++ case AVBITREVH: ++ return 0xe221 << 15 // vbitrev.h ++ case AVBITREVW: ++ return 0xe222 << 15 // vbitrev.w ++ case AVBITREVV: ++ return 0xe223 << 15 // vbitrev.d ++ case AXVBITCLRB: ++ return 0xea18 << 15 // xvbitclr.b ++ case AXVBITCLRH: ++ return 0xea19 << 15 // xvbitclr.h ++ case AXVBITCLRW: ++ return 0xea1a << 15 // xvbitclr.w ++ case AXVBITCLRV: ++ return 0xea1b << 15 // xvbitclr.d ++ case AXVBITSETB: ++ return 0xea1c << 15 // xvbitset.b ++ case AXVBITSETH: ++ return 0xea1d << 15 // xvbitset.h ++ case AXVBITSETW: ++ return 0xea1e << 15 // xvbitset.w ++ case AXVBITSETV: ++ return 0xea1f << 15 // xvbitset.d ++ case AXVBITREVB: ++ return 0xea20 << 15 // xvbitrev.b ++ case AXVBITREVH: ++ return 0xea21 << 15 // xvbitrev.h ++ case AXVBITREVW: ++ return 0xea22 << 15 // xvbitrev.w ++ case AXVBITREVV: ++ return 0xea23 << 15 // xvbitrev.d + } + + if a < 0 { +@@ -3668,6 +3740,54 @@ func (c *ctxt0) opirr(a obj.As) uint32 { + return 0x1DCD<<18 | 0x1<<15 // xvsrai.w + case AXVSRAV: + return 0x1DCD<<18 | 0x1<<16 // xvsrai.d ++ case AVBITCLRB: ++ return 0x1CC4<<18 | 0x1<<13 // vbitclri.b ++ case AVBITCLRH: ++ return 0x1CC4<<18 | 0x1<<14 // vbitclri.h ++ case AVBITCLRW: ++ return 0x1CC4<<18 | 0x1<<15 // vbitclri.w ++ case AVBITCLRV: ++ return 0x1CC4<<18 | 0x1<<16 // vbitclri.d ++ case AVBITSETB: ++ return 0x1CC5<<18 | 0x1<<13 // vbitseti.b ++ case AVBITSETH: ++ return 0x1CC5<<18 | 0x1<<14 // vbitseti.h ++ case AVBITSETW: ++ return 0x1CC5<<18 | 0x1<<15 // vbitseti.w ++ case AVBITSETV: ++ return 0x1CC5<<18 | 0x1<<16 // vbitseti.d ++ case AVBITREVB: ++ return 0x1CC6<<18 | 0x1<<13 // vbitrevi.b ++ case AVBITREVH: ++ return 0x1CC6<<18 | 0x1<<14 // vbitrevi.h ++ case AVBITREVW: ++ return 0x1CC6<<18 | 0x1<<15 // vbitrevi.w ++ case AVBITREVV: ++ return 0x1CC6<<18 | 0x1<<16 // vbitrevi.d ++ case AXVBITCLRB: ++ return 0x1DC4<<18 | 0x1<<13 // xvbitclri.b ++ case AXVBITCLRH: ++ return 0x1DC4<<18 | 0x1<<14 // xvbitclri.h ++ case AXVBITCLRW: ++ return 0x1DC4<<18 | 0x1<<15 // xvbitclri.w ++ case AXVBITCLRV: ++ return 0x1DC4<<18 | 0x1<<16 // xvbitclri.d ++ case AXVBITSETB: ++ return 0x1DC5<<18 | 0x1<<13 // xvbitseti.b ++ case AXVBITSETH: ++ return 0x1DC5<<18 | 0x1<<14 // xvbitseti.h ++ case AXVBITSETW: ++ return 0x1DC5<<18 | 0x1<<15 // xvbitseti.w ++ case AXVBITSETV: ++ return 0x1DC5<<18 | 0x1<<16 // xvbitseti.d ++ case AXVBITREVB: ++ return 
0x1DC6<<18 | 0x1<<13 // xvbitrevi.b ++ case AXVBITREVH: ++ return 0x1DC6<<18 | 0x1<<14 // xvbitrevi.h ++ case AXVBITREVW: ++ return 0x1DC6<<18 | 0x1<<15 // xvbitrevi.w ++ case AXVBITREVV: ++ return 0x1DC6<<18 | 0x1<<16 // xvbitrevi.d + } + + if a < 0 { +-- +2.38.1 + diff --git a/0035-crypto-chacha20-add-loong64-SIMD-implementation.patch b/0035-crypto-chacha20-add-loong64-SIMD-implementation.patch new file mode 100644 index 0000000000000000000000000000000000000000..2c8eb2b11612f20bef6bf8f08996bedaa193b3b8 --- /dev/null +++ b/0035-crypto-chacha20-add-loong64-SIMD-implementation.patch @@ -0,0 +1,490 @@ +From d6bdc012b1c105a007d0fb5d7d1642f1a5653b1d Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Fri, 13 Dec 2024 17:09:31 +0800 +Subject: [PATCH 35/44] crypto/chacha20: add loong64 SIMD implementation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The performance of chacha20 has been greatly improved. + +goos: linux +goarch: loong64 +pkg: golang.org/x/crypto/chacha20 +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +ChaCha20/64 171.9n ± 0% 159.3n ± 0% -7.33% (p=0.000 n=20) +ChaCha20/256 592.2n ± 0% 142.8n ± 0% -75.89% (p=0.000 n=20) +ChaCha20/10x25 981.5n ± 0% 518.8n ± 0% -47.14% (p=0.000 n=20) +ChaCha20/4096 8.991µ ± 0% 1.732µ ± 0% -80.74% (p=0.000 n=20) +ChaCha20/100x40 10.651µ ± 0% 5.135µ ± 0% -51.79% (p=0.000 n=20) +ChaCha20/65536 143.43µ ± 0% 28.76µ ± 0% -79.95% (p=0.000 n=20) +ChaCha20/1000x65 146.17µ ± 0% 37.13µ ± 0% -74.60% (p=0.000 n=20) +geomean 5.721µ 1.962µ -65.70% + + | bench.old | bench.new | + | B/s | B/s vs base | +ChaCha20/64 355.1Mi ± 0% 383.1Mi ± 0% +7.89% (p=0.000 n=20) +ChaCha20/256 412.2Mi ± 0% 1710.2Mi ± 0% +314.86% (p=0.000 n=20) +ChaCha20/10x25 242.9Mi ± 0% 459.6Mi ± 0% +89.19% (p=0.000 n=20) +ChaCha20/4096 434.5Mi ± 0% 2255.8Mi ± 0% +419.22% (p=0.000 n=20) +ChaCha20/100x40 358.1Mi ± 0% 742.9Mi ± 0% +107.44% (p=0.000 n=20) +ChaCha20/65536 435.8Mi ± 0% 2173.2Mi ± 0% +398.72% (p=0.000 n=20) +ChaCha20/1000x65 424.1Mi ± 0% 1669.4Mi ± 0% +293.64% (p=0.000 n=20) +geomean 373.9Mi 1.065Gi +191.55% + +goos: linux +goarch: loong64 +pkg: golang.org/x/crypto/chacha20 +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +ChaCha20/64 234.5n ± 0% 295.8n ± 0% +26.14% (p=0.000 n=20) +ChaCha20/256 782.0n ± 0% 274.6n ± 0% -64.88% (p=0.000 n=20) +ChaCha20/10x25 1340.0n ± 0% 752.7n ± 0% -43.83% (p=0.000 n=20) +ChaCha20/4096 11.744µ ± 0% 3.455µ ± 0% -70.58% (p=0.000 n=20) +ChaCha20/100x40 14.151µ ± 0% 7.435µ ± 0% -47.46% (p=0.000 n=20) +ChaCha20/65536 188.05µ ± 0% 54.33µ ± 0% -71.11% (p=0.000 n=20) +ChaCha20/1000x65 191.44µ ± 0% 66.29µ ± 0% -65.37% (p=0.000 n=20) +geomean 7.604µ 3.436µ -54.81% + + | bench.old | bench.new | + | B/s | B/s vs base | +ChaCha20/64 260.3Mi ± 0% 206.3Mi ± 0% -20.73% (p=0.000 n=20) +ChaCha20/256 312.2Mi ± 0% 888.9Mi ± 0% +184.75% (p=0.000 n=20) +ChaCha20/10x25 177.9Mi ± 0% 316.8Mi ± 0% +78.08% (p=0.000 n=20) +ChaCha20/4096 332.6Mi ± 0% 1130.8Mi ± 0% +239.95% (p=0.000 n=20) +ChaCha20/100x40 269.6Mi ± 0% 513.1Mi ± 0% +90.34% (p=0.000 n=20) +ChaCha20/65536 332.4Mi ± 0% 1150.5Mi ± 0% +246.16% (p=0.000 n=20) +ChaCha20/1000x65 323.8Mi ± 0% 935.2Mi ± 0% +188.81% (p=0.000 n=20) +geomean 281.3Mi 622.6Mi +121.31% + +Change-Id: Iab4934d78b845e3b248bd5d0a9a62e4e9c516831 +--- + .../x/crypto/chacha20/chacha_loong64.go | 17 + + .../x/crypto/chacha20/chacha_loong64.s | 374 ++++++++++++++++++ + .../x/crypto/chacha20/chacha_noasm.go | 2 +- + 3 files 
changed, 392 insertions(+), 1 deletion(-) + create mode 100644 src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.go + create mode 100644 src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.s + +diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.go +new file mode 100644 +index 0000000000..d0f5d909fc +--- /dev/null ++++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.go +@@ -0,0 +1,17 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build gc && !purego ++ ++package chacha20 ++ ++const bufSize = 256 ++ ++//go:noescape ++func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) ++ ++func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { ++ // add cpu.Loong64.HasLSX check TODO ++ xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) ++} +diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.s b/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.s +new file mode 100644 +index 0000000000..883c8d992a +--- /dev/null ++++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.s +@@ -0,0 +1,374 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++// derived from chacha_arm64.s ++ ++//go:build gc && !purego ++ ++#include "textflag.h" ++ ++DATA ·constants+0x00(SB)/4, $0x61707865 ++DATA ·constants+0x04(SB)/4, $0x3320646e ++DATA ·constants+0x08(SB)/4, $0x79622d32 ++DATA ·constants+0x0c(SB)/4, $0x6b206574 ++GLOBL ·constants(SB), NOPTR|RODATA, $32 ++ ++DATA ·incRotMatrix+0x00(SB)/4, $0x00000000 ++DATA ·incRotMatrix+0x04(SB)/4, $0x00000001 ++DATA ·incRotMatrix+0x08(SB)/4, $0x00000002 ++DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003 ++GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32 ++ ++#define NUM_ROUNDS 10 ++ ++// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) ++TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 ++ MOVV dst+0(FP), R4 ++ MOVV src+24(FP), R5 ++ MOVV src_len+32(FP), R6 ++ MOVV key+48(FP), R7 ++ MOVV nonce+56(FP), R8 ++ MOVV counter+64(FP), R9 ++ ++ MOVV $·constants(SB), R10 ++ MOVV $·incRotMatrix(SB), R11 ++ ++ MOVW (R9), R12 ++ ++loop: ++ MOVV $NUM_ROUNDS, R15 ++ // load 4-32bit data from incRotMatrix added to counter ++ VMOVQ (R11), V30 ++ ++ // load contants ++ // VLDREPL.W $0, R10, V0 ++ WORD $0x30200140 ++ // VLDREPL.W $1, R10, V1 ++ WORD $0x30200541 ++ // VLDREPL.W $2, R10, V2 ++ WORD $0x30200942 ++ // VLDREPL.W $3, R10, V3 ++ WORD $0x30200d43 ++ ++ // load keys ++ // VLDREPL.W $0, R7, V4 ++ WORD $0x302000e4 ++ // VLDREPL.W $1, R7, V5 ++ WORD $0x302004e5 ++ // VLDREPL.W $2, R7, V6 ++ WORD $0x302008e6 ++ // VLDREPL.W $3, R7, V7 ++ WORD $0x30200ce7 ++ // VLDREPL.W $4, R7, V8 ++ WORD $0x302010e8 ++ // VLDREPL.W $5, R7, V9 ++ WORD $0x302014e9 ++ // VLDREPL.W $6, R7, V10 ++ WORD $0x302018ea ++ // VLDREPL.W $7, R7, V11 ++ WORD $0x30201ceb ++ ++ // load counter + nonce ++ // VLDREPL.W $0, R9, V12 ++ WORD $0x3020012c ++ ++ // VLDREPL.W $0, R8, V13 ++ WORD $0x3020010d ++ // VLDREPL.W $1, R8, V14 ++ WORD $0x3020050e ++ // VLDREPL.W $2, R8, V15 ++ WORD $0x3020090f ++ ++ // update counter ++ VADDW V30, V12, V12 ++ ++chacha: ++ // V0..V3 += V4..V7 ++ // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16) ++ VADDW V0, V4, V0 ++ VADDW V1, V5, V1 ++ VADDW V2, V6, V2 ++ VADDW V3, V7, V3 ++ VXORV V12, V0, V12 
++ VXORV V13, V1, V13 ++ VXORV V14, V2, V14 ++ VXORV V15, V3, V15 ++ VROTRW $16, V12, V12 ++ VROTRW $16, V13, V13 ++ VROTRW $16, V14, V14 ++ VROTRW $16, V15, V15 ++ ++ // V8..V11 += V12..V15 ++ // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12) ++ VADDW V8, V12, V8 ++ VADDW V9, V13, V9 ++ VADDW V10, V14, V10 ++ VADDW V11, V15, V11 ++ VXORV V4, V8, V4 ++ VXORV V5, V9, V5 ++ VXORV V6, V10, V6 ++ VXORV V7, V11, V7 ++ VROTRW $20, V4, V4 ++ VROTRW $20, V5, V5 ++ VROTRW $20, V6, V6 ++ VROTRW $20, V7, V7 ++ ++ // V0..V3 += V4..V7 ++ // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8) ++ VADDW V0, V4, V0 ++ VADDW V1, V5, V1 ++ VADDW V2, V6, V2 ++ VADDW V3, V7, V3 ++ VXORV V12, V0, V12 ++ VXORV V13, V1, V13 ++ VXORV V14, V2, V14 ++ VXORV V15, V3, V15 ++ VROTRW $24, V12, V12 ++ VROTRW $24, V13, V13 ++ VROTRW $24, V14, V14 ++ VROTRW $24, V15, V15 ++ ++ // V8..V11 += V12..V15 ++ // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7) ++ VADDW V12, V8, V8 ++ VADDW V13, V9, V9 ++ VADDW V14, V10, V10 ++ VADDW V15, V11, V11 ++ VXORV V4, V8, V4 ++ VXORV V5, V9, V5 ++ VXORV V6, V10, V6 ++ VXORV V7, V11, V7 ++ VROTRW $25, V4, V4 ++ VROTRW $25, V5, V5 ++ VROTRW $25, V6, V6 ++ VROTRW $25, V7, V7 ++ ++ // V0..V3 += V5..V7, V4 ++ // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16) ++ VADDW V0, V5, V0 ++ VADDW V1, V6, V1 ++ VADDW V2, V7, V2 ++ VADDW V3, V4, V3 ++ VXORV V15, V0, V15 ++ VXORV V12, V1, V12 ++ VXORV V13, V2, V13 ++ VXORV V14, V3, V14 ++ VROTRW $16, V15, V15 ++ VROTRW $16, V12, V12 ++ VROTRW $16, V13, V13 ++ VROTRW $16, V14, V14 ++ ++ // V10,V11,V8,V9 += V15,V12,V13,V14 ++ // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 12) ++ VADDW V10, V15, V10 ++ VADDW V11, V12, V11 ++ VADDW V8, V13, V8 ++ VADDW V9, V14, V9 ++ VXORV V5, V10, V5 ++ VXORV V6, V11, V6 ++ VXORV V7, V8, V7 ++ VXORV V4, V9, V4 ++ VROTRW $20, V5, V5 ++ VROTRW $20, V6, V6 ++ VROTRW $20, V7, V7 ++ VROTRW $20, V4, V4 ++ ++ // V0..V3 += V5..V7, V4 ++ // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 8) ++ VADDW V5, V0, V0 ++ VADDW V6, V1, V1 ++ VADDW V7, V2, V2 ++ VADDW V4, V3, V3 ++ VXORV V15, V0, V15 ++ VXORV V12, V1, V12 ++ VXORV V13, V2, V13 ++ VXORV V14, V3, V14 ++ VROTRW $24, V15, V15 ++ VROTRW $24, V12, V12 ++ VROTRW $24, V13, V13 ++ VROTRW $24, V14, V14 ++ ++ // V10,V11,V8,V9 += V15,V12,V13,V14 ++ // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 7) ++ VADDW V15, V10, V10 ++ VADDW V12, V11, V11 ++ VADDW V13, V8, V8 ++ VADDW V14, V9, V9 ++ VXORV V5, V10, V5 ++ VXORV V6, V11, V6 ++ VXORV V7, V8, V7 ++ VXORV V4, V9, V4 ++ VROTRW $25, V5, V5 ++ VROTRW $25, V6, V6 ++ VROTRW $25, V7, V7 ++ VROTRW $25, V4, V4 ++ ++ SUBV $1, R15 ++ BNE R15, R0, chacha ++ ++ // load origin contants ++ // VLDREPL.W $0, R10, V16 ++ WORD $0x30200150 ++ // VLDREPL.W $1, R10, V17 ++ WORD $0x30200551 ++ // VLDREPL.W $2, R10, V18 ++ WORD $0x30200952 ++ // VLDREPL.W $3, R10, V19 ++ WORD $0x30200d53 ++ ++ // load origin keys ++ // VLDREPL.W $0, R7, V20 ++ WORD $0x302000f4 ++ // VLDREPL.W $1, R7, V21 ++ WORD $0x302004f5 ++ // VLDREPL.W $2, R7, V22 ++ WORD $0x302008f6 ++ // VLDREPL.W $3, R7, V23 ++ WORD $0x30200cf7 ++ // VLDREPL.W $4, R7, V24 ++ WORD $0x302010f8 ++ // VLDREPL.W $5, R7, V25 ++ WORD $0x302014f9 ++ // VLDREPL.W $6, R7, V26 ++ WORD $0x302018fa ++ // VLDREPL.W $7, R7, V27 ++ WORD $0x30201cfb ++ ++ // add back the initial state to generate the key stream ++ VADDW V30, V12, V12 // update counter in advance to prevent V30 from being overwritten ++ VADDW V16, V0, V0 ++ VADDW V17, V1, V1 ++ VADDW V18, V2, V2 ++ VADDW V19, V3, V3 ++ ++ // load origin counter + nonce ++ // VLDREPL.W $0, R9, 
V28 ++ WORD $0x3020013c ++ // VLDREPL.W $0, R8, V29 ++ WORD $0x3020011d ++ // VLDREPL.W $1, R8, V30 ++ WORD $0x3020051e ++ // VLDREPL.W $2, R8, V31 ++ WORD $0x3020091f ++ ++ VADDW V20, V4, V4 ++ VADDW V21, V5, V5 ++ VADDW V22, V6, V6 ++ VADDW V23, V7, V7 ++ VADDW V24, V8, V8 ++ VADDW V25, V9, V9 ++ VADDW V26, V10, V10 ++ VADDW V27, V11, V11 ++ VADDW V28, V12, V12 ++ VADDW V29, V13, V13 ++ VADDW V30, V14, V14 ++ VADDW V31, V15, V15 ++ ++ // shuffle ++ VILVLW V0, V1, V16 ++ VILVHW V0, V1, V17 ++ VILVLW V2, V3, V18 ++ VILVHW V2, V3, V19 ++ VILVLW V4, V5 ,V20 ++ VILVHW V4, V5, V21 ++ VILVLW V6, V7, V22 ++ VILVHW V6, V7, V23 ++ VILVLW V8, V9, V24 ++ VILVHW V8, V9, V25 ++ VILVLW V10, V11, V26 ++ VILVHW V10, V11, V27 ++ VILVLW V12, V13, V28 ++ VILVHW V12, V13, V29 ++ VILVLW V14, V15, V30 ++ VILVHW V14, V15, V31 ++ VILVLV V16, V18, V0 ++ VILVHV V16, V18, V4 ++ VILVLV V17, V19, V8 ++ VILVHV V17, V19, V12 ++ ++ // load src data from R5 ++ VMOVQ 0(R5), V16 ++ VMOVQ 16(R5), V17 ++ VMOVQ 32(R5), V18 ++ VMOVQ 48(R5), V19 ++ ++ VILVLV V20, V22, V1 ++ VILVHV V20, V22, V5 ++ VILVLV V21, V23, V9 ++ VILVHV V21, V23, V13 ++ ++ VMOVQ 64(R5), V20 ++ VMOVQ 80(R5), V21 ++ VMOVQ 96(R5), V22 ++ VMOVQ 112(R5), V23 ++ ++ VILVLV V24, V26, V2 ++ VILVHV V24, V26, V6 ++ VILVLV V25, V27, V10 ++ VILVHV V25, V27, V14 ++ ++ VMOVQ 128(R5), V24 ++ VMOVQ 144(R5), V25 ++ VMOVQ 160(R5), V26 ++ VMOVQ 176(R5), V27 ++ ++ VILVLV V28, V30, V3 ++ VILVHV V28, V30, V7 ++ VILVLV V29, V31, V11 ++ VILVHV V29, V31, V15 ++ ++ VMOVQ 192(R5), V28 ++ VMOVQ 208(R5), V29 ++ VMOVQ 224(R5), V30 ++ VMOVQ 240(R5), V31 ++ ++ VXORV V0, V16, V16 ++ VXORV V1, V17, V17 ++ VXORV V2, V18, V18 ++ VXORV V3, V19, V19 ++ ++ VMOVQ V16, 0(R4) ++ VMOVQ V17, 16(R4) ++ VMOVQ V18, 32(R4) ++ VMOVQ V19, 48(R4) ++ ++ VXORV V4, V20, V20 ++ VXORV V5, V21, V21 ++ VXORV V6, V22, V22 ++ VXORV V7, V23, V23 ++ ++ VMOVQ V20, 64(R4) ++ VMOVQ V21, 80(R4) ++ VMOVQ V22, 96(R4) ++ VMOVQ V23, 112(R4) ++ ++ VXORV V8, V24, V24 ++ VXORV V9, V25, V25 ++ VXORV V10, V26, V26 ++ VXORV V11, V27, V27 ++ ++ VMOVQ V24, 128(R4) ++ VMOVQ V25, 144(R4) ++ VMOVQ V26, 160(R4) ++ VMOVQ V27, 176(R4) ++ ++ VXORV V12, V28, V28 ++ VXORV V13, V29, V29 ++ VXORV V14, V30, V30 ++ VXORV V15, V31, V31 ++ ++ VMOVQ V28, 192(R4) ++ VMOVQ V29, 208(R4) ++ VMOVQ V30, 224(R4) ++ VMOVQ V31, 240(R4) ++ ++ ADD $4, R12, R12 ++ MOVW R12, (R9) // update counter ++ ++ ADDV $256, R4, R4 ++ ADDV $256, R5, R5 ++ SUBV $256, R6, R6 ++ BNE R6, R0, loop ++ ++ RET +diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go +index c709b72847..3853cc0e0b 100644 +--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go ++++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
+ +-//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego ++//go:build (!arm64 && !loong64 && !s390x && !ppc64 && !ppc64le) || !gc || purego + + package chacha20 + +-- +2.38.1 + diff --git a/0036-internal-bytealg-optimize-Count-String-in-loong64.patch b/0036-internal-bytealg-optimize-Count-String-in-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..3a513c22d1714506707ce1b4ebb7eeff0af434cd --- /dev/null +++ b/0036-internal-bytealg-optimize-Count-String-in-loong64.patch @@ -0,0 +1,268 @@ +From 1698704d825764d2cbdbbf2718c582cf45d66fb0 Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Tue, 10 Dec 2024 21:06:28 +0800 +Subject: [PATCH 36/44] internal/bytealg: optimize Count{,String} in loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark on Loongson 3A6000 and 3A5000: + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +CountSingle/10 12.81n ± 0% 10.74n ± 0% -16.16% (p=0.000 n=10) +CountSingle/32 33.135n ± 0% 8.007n ± 0% -75.84% (p=0.000 n=10) +CountSingle/4K 4057.0n ± 0% 207.5n ± 0% -94.89% (p=0.000 n=10) +CountSingle/4M 4161.7µ ± 0% 217.1µ ± 0% -94.78% (p=0.000 n=10) +CountSingle/64M 68.722m ± 0% 3.717m ± 11% -94.59% (p=0.000 n=10) +geomean 13.76µ 1.705µ -87.61% + + | bench.old | bench.new | + | B/s | B/s vs base | +CountSingle/10 744.4Mi ± 0% 887.8Mi ± 0% +19.26% (p=0.000 n=10) +CountSingle/32 921.0Mi ± 0% 3811.5Mi ± 0% +313.84% (p=0.000 n=10) +CountSingle/4K 962.7Mi ± 0% 18825.3Mi ± 0% +1855.40% (p=0.000 n=10) +CountSingle/4M 961.2Mi ± 0% 18425.4Mi ± 0% +1817.02% (p=0.000 n=10) +CountSingle/64M 931.3Mi ± 0% 17216.0Mi ± 10% +1748.62% (p=0.000 n=10) +geomean 900.1Mi 7.092Gi +706.88% + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3A5000-HV @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +CountSingle/10 14.03n ± 1% 14.82n ± 0% +5.67% (p=0.000 n=10) +CountSingle/32 36.23n ± 0% 11.61n ± 0% -67.95% (p=0.000 n=10) +CountSingle/4K 4367.0n ± 0% 323.5n ± 0% -92.59% (p=0.000 n=10) +CountSingle/4M 4538.6µ ± 0% 381.2µ ± 0% -91.60% (p=0.000 n=10) +CountSingle/64M 76.575m ± 22% 7.971m ± 0% -89.59% (p=0.000 n=10) +geomean 15.05µ 2.790µ -81.46% + + | bench.old | bench.new | + | B/s | B/s vs base | +CountSingle/10 680.0Mi ± 1% 643.7Mi ± 0% -5.34% (p=0.000 n=10) +CountSingle/32 842.2Mi ± 0% 2628.4Mi ± 0% +212.07% (p=0.000 n=10) +CountSingle/4K 894.5Mi ± 0% 12075.4Mi ± 0% +1249.95% (p=0.000 n=10) +CountSingle/4M 881.3Mi ± 0% 10492.9Mi ± 0% +1090.57% (p=0.000 n=10) +CountSingle/64M 835.8Mi ± 18% 8028.7Mi ± 0% +860.61% (p=0.000 n=10) +geomean 822.9Mi 4.334Gi +439.27% + +Change-Id: I0a45139965b3e5eb09ab22be75145302f88a1915 +--- + src/internal/bytealg/bytealg.go | 3 + + src/internal/bytealg/count_loong64.s | 110 ++++++++++++++++++-------- + src/internal/cpu/cpu.go | 1 + + src/internal/cpu/cpu_loong64.go | 1 + + src/internal/cpu/cpu_loong64_hwcap.go | 2 + + 5 files changed, 85 insertions(+), 32 deletions(-) + +diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go +index 6b79a2e1fa..a5f71ce342 100644 +--- a/src/internal/bytealg/bytealg.go ++++ b/src/internal/bytealg/bytealg.go +@@ -18,6 +18,9 @@ const ( + offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX) + + offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9) ++ ++ offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) ++ offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX) + ) + + // MaxLen is the 
maximum length of the string to be searched for (argument b) in Index. +diff --git a/src/internal/bytealg/count_loong64.s b/src/internal/bytealg/count_loong64.s +index db8ba2cb24..5c9dfeb0eb 100644 +--- a/src/internal/bytealg/count_loong64.s ++++ b/src/internal/bytealg/count_loong64.s +@@ -25,17 +25,81 @@ TEXT ·CountString(SB),NOSPLIT,$0-32 + // R5 = s_len + // R6 = byte to count + TEXT countbody<>(SB),NOSPLIT,$0 +- MOVV R0, R7 // count +- ADDV R4, R5 // end ++ MOVV R0, R7 // count ++ ++ // short path to handle 0-byte case ++ BEQ R5, done ++ ++ // jump directly to tail length < 4 ++ MOVV $4, R8 ++ BLT R5, R8, tail ++ ++ // jump directly to genericCountBody if length < 16 ++ MOVV $16, R8 ++ BLT R5, R8, genericCountBody ++ ++ // jump directly to lsxCountBody if length < 64 ++ MOVV $64, R8 ++ BLT R5, R8, lsxCountBody ++lasxCountBody: ++ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R8 ++ BEQ R8, lsxCountBody ++ MOVV $32, R9 ++ XVMOVQ R6, X0.B32 ++ ++ PCALIGN $16 ++lasxLoop: ++ XVMOVQ (R4), X1 ++ XVSEQB X0, X1, X2 ++ XVANDB $1, X2, X2 ++ XVPCNTV X2, X3 ++ XVMOVQ X3.V[0], R8 ++ ADDV R8, R7 ++ XVMOVQ X3.V[1], R8 ++ ADDV R8, R7 ++ XVMOVQ X3.V[2], R8 ++ ADDV R8, R7 ++ XVMOVQ X3.V[3], R8 ++ ADDV R8, R7 ++ ADDV $-32, R5 ++ ADDV $32, R4 ++ BGE R5, R9, lasxLoop ++ ++lsxCountBody: ++ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R8 ++ BEQ R8, genericCountBody ++ // jump directly to genericCountBody if length < 16 ++ MOVV $16, R9 ++ BLT R5, R9, genericCountBody ++ VMOVQ R6, V0.B16 ++ ++ PCALIGN $16 ++lsxLoop: ++ VMOVQ (R4), V1 ++ VSEQB V0, V1, V2 ++ VANDB $1, V2, V2 ++ VPCNTV V2, V3 ++ VMOVQ V3.V[0], R8 ++ ADDV R8, R7 ++ VMOVQ V3.V[1], R8 ++ ADDV R8, R7 ++ ADDV $-16, R5 ++ ADDV $16, R4 ++ BGE R5, R9, lsxLoop ++ ++ // Work with genericCountBody shorter than 16 bytes ++genericCountBody: ++ MOVV $4, R9 + MOVV $1, R17 + +-loop: +- ADDV $8, R4, R9 ++ PCALIGN $16 ++genericLoop: + BLT R5, R9, tail +- MOVV (R4), R8 ++ ADDV $-4, R5 ++ MOVWU (R4)(R5), R8 + + AND $0xff, R8, R10 +- WORD $0xcf210b // bstrpick.w r11, r8, 15, 8 ++ BSTRPICKW $15, R8, $8, R11 + XOR R6, R10, R10 + XOR R6, R11, R11 + MASKNEZ R10, R17, R12 +@@ -43,8 +107,8 @@ loop: + ADDV R7, R12, R7 + ADDV R7, R13, R7 + +- WORD $0xd7410a // bstrpick.w r10, r8, 23, 16 +- WORD $0xdf610b // bstrpick.w r11, r8, 31, 24 ++ BSTRPICKW $23, R8, $16, R10 ++ BSTRPICKW $31, R8, $24, R11 + XOR R6, R10, R10 + XOR R6, R11, R11 + MASKNEZ R10, R17, R12 +@@ -52,35 +116,17 @@ loop: + ADDV R7, R12, R7 + ADDV R7, R13, R7 + +- WORD $0xe7810a // bstrpick.w r10, r8, 39, 32 +- WORD $0xefa10b // bstrpick.w r11, r8, 47, 40 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 +- +- WORD $0xf7c10a // bstrpick.w r10, r8, 55, 48 +- WORD $0xffe10b // bstrpick.w r11, r8, 63, 56 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 +- +- MOVV R9, R4 +- JMP loop ++ JMP genericLoop + ++ // Work with tail shorter than 4 bytes ++ PCALIGN $16 + tail: +- BEQ R4, R5, done +- MOVBU (R4), R8 +- ADDV $1, R4 ++ BEQ R5, done ++ ADDV $-1, R5 ++ MOVBU (R4)(R5), R8 + BNE R6, R8, tail + ADDV $1, R7 + JMP tail +- + done: + MOVV R7, R4 + RET +diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go +index cd3db10523..2443b31fc8 100644 +--- a/src/internal/cpu/cpu.go ++++ b/src/internal/cpu/cpu.go +@@ -83,6 +83,7 @@ var ARM64 struct { + var Loong64 struct { + _ CacheLinePad + HasLSX bool // support 128-bit vector extension ++ 
HasLASX bool // support 256-bit vector extension + HasCRC32 bool // support CRC instruction + HasLAMCAS bool // support AMCAS[_DB].{B/H/W/D} + HasLAM_BH bool // support AM{SWAP/ADD}[_DB].{B/H} instruction +diff --git a/src/internal/cpu/cpu_loong64.go b/src/internal/cpu/cpu_loong64.go +index 92583d0bca..9a58ea251c 100644 +--- a/src/internal/cpu/cpu_loong64.go ++++ b/src/internal/cpu/cpu_loong64.go +@@ -27,6 +27,7 @@ func get_cpucfg(reg uint32) uint32 + func doinit() { + options = []option{ + {Name: "lsx", Feature: &Loong64.HasLSX}, ++ {Name: "lasx", Feature: &Loong64.HasLASX}, + {Name: "crc32", Feature: &Loong64.HasCRC32}, + {Name: "lamcas", Feature: &Loong64.HasLAMCAS}, + {Name: "lam_bh", Feature: &Loong64.HasLAM_BH}, +diff --git a/src/internal/cpu/cpu_loong64_hwcap.go b/src/internal/cpu/cpu_loong64_hwcap.go +index 58397adae8..6c6b8a81f2 100644 +--- a/src/internal/cpu/cpu_loong64_hwcap.go ++++ b/src/internal/cpu/cpu_loong64_hwcap.go +@@ -13,12 +13,14 @@ var HWCap uint + // HWCAP bits. These are exposed by the Linux kernel. + const ( + hwcap_LOONGARCH_LSX = 1 << 4 ++ hwcap_LOONGARCH_LASX = 1 << 5 + ) + + func hwcapInit() { + // TODO: Features that require kernel support like LSX and LASX can + // be detected here once needed in std library or by the compiler. + Loong64.HasLSX = hwcIsSet(HWCap, hwcap_LOONGARCH_LSX) ++ Loong64.HasLASX = hwcIsSet(HWCap, hwcap_LOONGARCH_LASX) + } + + func hwcIsSet(hwc uint, val uint) bool { +-- +2.38.1 + diff --git a/0037-cmd-internal-obj-cmd-asm-reclassify-32-bit-immediate.patch b/0037-cmd-internal-obj-cmd-asm-reclassify-32-bit-immediate.patch new file mode 100644 index 0000000000000000000000000000000000000000..ccad194868f90528439f26426f941baa445ffaca --- /dev/null +++ b/0037-cmd-internal-obj-cmd-asm-reclassify-32-bit-immediate.patch @@ -0,0 +1,690 @@ +From a713105842cd7b88dbb573980731062c218a8310 Mon Sep 17 00:00:00 2001 +From: limeidan +Date: Mon, 16 Dec 2024 16:31:37 +0800 +Subject: [PATCH 37/44] cmd/internal/obj, cmd/asm: reclassify 32-bit immediate + value + +Change-Id: If9fd257ca0837a8c8597889c4f5ed3d4edc602c1 +--- + .../asm/internal/asm/testdata/loong64enc1.s | 4 +- + .../asm/internal/asm/testdata/loong64enc2.s | 2 +- + src/cmd/internal/obj/loong64/a.out.go | 31 +- + src/cmd/internal/obj/loong64/asm.go | 376 +++++++----------- + src/cmd/internal/obj/loong64/cnames.go | 25 +- + 5 files changed, 186 insertions(+), 252 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 19070c89ef..b40d86e596 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -107,8 +107,8 @@ lable2: + MOVV $4(R4), R5 // 8510c002 + MOVW $-1, R4 // 04fcff02 + MOVV $-1, R4 // 04fcff02 +- MOVW $1, R4 // 0404c002 +- MOVV $1, R4 // 0404c002 ++ MOVW $1, R4 // 04048003 ++ MOVV $1, R4 // 04048003 + ADD $-1, R4, R5 // 85fcbf02 + ADD $-1, R4 // 84fcbf02 + ADDV $-1, R4, R5 // 85fcff02 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc2.s b/src/cmd/asm/internal/asm/testdata/loong64enc2.s +index ee3bad74b1..91aed4e2c7 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc2.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc2.s +@@ -12,7 +12,7 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 + AND $-1, R4, R5 // 1efcbf0285f81400 + AND $-1, R4 // 1efcbf0284f81400 + MOVW $-1, F4 // 1efcbf02c4a71401 +- MOVW $1, F4 // 1e048002c4a71401 ++ MOVW $1, F4 // 1e048003c4a71401 + TEQ $4, R4, R5 // 8508005c04002a00 + TEQ $4, R4 // 0408005c04002a00 + TNE $4, R4, R5 
// 8508005804002a00 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 1fadbc648a..f2d4c41d68 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -325,19 +325,26 @@ const ( + C_XREG + C_ARNG // Vn. + C_ELEM // Vn.[index] ++ + C_ZCON +- C_SCON // 12 bit signed +- C_UCON // 32 bit signed, low 12 bits 0 +- +- // When the immediate value is SCON, it can choose either the ADDCON implementation +- // or the ANDCON implementation, using ADD0CON/AND0CON to distinguish them, so that +- // the program can choose the implementation with fewer instructions. +- C_ADD0CON +- C_AND0CON +- +- C_ADDCON // -0x800 <= v < 0 +- C_ANDCON // 0 < v <= 0xFFF +- C_LCON // other 32 ++ C_U1CON // 1 bit unsigned constant ++ C_U2CON // 2 bit unsigned constant ++ C_U3CON // 3 bit unsigned constant ++ C_U4CON // 4 bit unsigned constant ++ C_U5CON // 5 bit unsigned constant ++ C_U6CON // 6 bit unsigned constant ++ C_U7CON // 7 bit unsigned constant ++ C_U8CON // 8 bit unsigned constant ++ C_S5CON // 5 bit signed constant ++ C_US12CON // same as C_S12CON, increase the priority of C_S12CON in special cases. ++ C_UU12CON // same as C_U12CON, increase the priority of C_U12CON in special cases. ++ C_S12CON // 12 bit signed constant, -0x800 < v <= 0x7ff ++ C_U12CON // 12 bit unsigned constant, 0 < v <= 0xfff ++ C_12CON // 12 bit signed constant, or 12 bit unsigned constant ++ C_U15CON // 15 bit unsigned constant ++ C_15CON20_0 // 15 bit unsigned constant, low 12 bits 0 ++ C_32CON20_0 // 32 bit signed, low 12 bits 0 ++ C_32CON // other 32 bit signed + + // 64 bit signed, lo32 bits 0, hi20 bits are not 0, hi12 bits can + // be obtained by sign extension of the hi20 bits. +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 657d32ae81..2480cf9382 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -51,12 +51,6 @@ const ( + // branchLoopHead marks loop entry. + // Used to insert padding for under-aligned loops. 
+ branchLoopHead +- immFiledSi5 // The encoding of the immediate field in the instruction is 5-bits +- immFiledUi3 // The encoding of the immediate field in the instruction is 3-bits +- immFiledUi4 // The encoding of the immediate field in the instruction is 4-bits +- immFiledUi5 // The encoding of the immediate field in the instruction is 5-bits +- immFiledUi6 // The encoding of the immediate field in the instruction is 6-bits +- immFiledUi8 // The encoding of the immediate field in the instruction is 8-bits + ) + + var optab = []Optab{ +@@ -94,45 +88,41 @@ var optab = []Optab{ + {ACMPEQF, C_FREG, C_FREG, C_NONE, C_FCCREG, C_NONE, 2, 4, 0, 0}, + {AVSEQB, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSEQB, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSEQB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, immFiledSi5}, +- {AXVSEQB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, immFiledSi5}, +- {AVSEQB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, immFiledSi5}, +- {AXVSEQB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ {AVSEQB, C_S5CON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, 0}, ++ {AXVSEQB, C_S5CON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, 0}, + + {AVANDV, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVANDV, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVANDB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, +- {AXVANDB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, +- {AVANDB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, +- {AXVANDB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, ++ {AVANDB, C_U8CON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, 0}, ++ {AXVANDB, C_U8CON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, 0}, + + {AVSLLB, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSLLB, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSLLB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 29, 4, 0, immFiledUi3}, +- {AXVSLLB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 29, 4, 0, immFiledUi3}, +- {AVSLLB, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 29, 4, 0, immFiledUi3}, +- {AXVSLLB, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ {AVSLLB, C_U3CON, C_VREG, C_NONE, C_VREG, C_NONE, 29, 4, 0, 0}, ++ {AXVSLLB, C_U3CON, C_XREG, C_NONE, C_XREG, C_NONE, 29, 4, 0, 0}, ++ {AVSLLB, C_U3CON, C_NONE, C_NONE, C_VREG, C_NONE, 29, 4, 0, 0}, ++ {AXVSLLB, C_U3CON, C_NONE, C_NONE, C_XREG, C_NONE, 29, 4, 0, 0}, + + {AVSLLH, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSLLH, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSLLH, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 31, 4, 0, immFiledUi4}, +- {AXVSLLH, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 31, 4, 0, immFiledUi4}, +- {AVSLLH, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 31, 4, 0, immFiledUi4}, +- {AXVSLLH, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ {AVSLLH, C_U4CON, C_VREG, C_NONE, C_VREG, C_NONE, 31, 4, 0, 0}, ++ {AXVSLLH, C_U4CON, C_XREG, C_NONE, C_XREG, C_NONE, 31, 4, 0, 0}, ++ {AVSLLH, C_U4CON, C_NONE, C_NONE, C_VREG, C_NONE, 31, 4, 0, 0}, ++ {AXVSLLH, C_U4CON, C_NONE, C_NONE, C_XREG, C_NONE, 31, 4, 0, 0}, + + {AVSLLW, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSLLW, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSLLW, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 32, 4, 0, immFiledUi5}, +- {AXVSLLW, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 32, 4, 0, immFiledUi5}, +- {AVSLLW, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 
32, 4, 0, immFiledUi5}, +- {AXVSLLW, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ {AVSLLW, C_U5CON, C_VREG, C_NONE, C_VREG, C_NONE, 32, 4, 0, 0}, ++ {AXVSLLW, C_U5CON, C_XREG, C_NONE, C_XREG, C_NONE, 32, 4, 0, 0}, ++ {AVSLLW, C_U5CON, C_NONE, C_NONE, C_VREG, C_NONE, 32, 4, 0, 0}, ++ {AXVSLLW, C_U5CON, C_NONE, C_NONE, C_XREG, C_NONE, 32, 4, 0, 0}, + + {AVSLLV, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSLLV, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSLLV, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 33, 4, 0, immFiledUi6}, +- {AXVSLLV, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 33, 4, 0, immFiledUi6}, +- {AVSLLV, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 33, 4, 0, immFiledUi6}, +- {AXVSLLV, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ {AVSLLV, C_U6CON, C_VREG, C_NONE, C_VREG, C_NONE, 33, 4, 0, 0}, ++ {AXVSLLV, C_U6CON, C_XREG, C_NONE, C_XREG, C_NONE, 33, 4, 0, 0}, ++ {AVSLLV, C_U6CON, C_NONE, C_NONE, C_VREG, C_NONE, 33, 4, 0, 0}, ++ {AXVSLLV, C_U6CON, C_NONE, C_NONE, C_XREG, C_NONE, 33, 4, 0, 0}, + + {ACLOW, C_REG, C_NONE, C_NONE, C_REG, C_NONE, 9, 4, 0, 0}, + {AABSF, C_FREG, C_NONE, C_NONE, C_FREG, C_NONE, 9, 4, 0, 0}, +@@ -229,48 +219,46 @@ var optab = []Optab{ + + {AMOVW, C_LACON, C_NONE, C_NONE, C_REG, C_NONE, 26, 12, REGSP, 0}, + {AMOVV, C_LACON, C_NONE, C_NONE, C_REG, C_NONE, 26, 12, REGSP, 0}, +- {AMOVW, C_ADDCON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, +- {AMOVV, C_ADDCON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, +- {AMOVW, C_ANDCON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, +- {AMOVV, C_ANDCON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, +- +- {AMOVW, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, +- {AMOVV, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, +- {AMOVW, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, +- {AMOVV, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, ++ {AMOVW, C_12CON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, ++ {AMOVV, C_12CON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, ++ ++ {AMOVW, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, ++ {AMOVV, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, ++ {AMOVW, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, ++ {AMOVV, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, + {AMOVV, C_DCON12_0, C_NONE, C_NONE, C_REG, C_NONE, 67, 4, 0, NOTUSETMP}, + {AMOVV, C_DCON12_20S, C_NONE, C_NONE, C_REG, C_NONE, 68, 8, 0, NOTUSETMP}, + {AMOVV, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 69, 12, 0, NOTUSETMP}, + {AMOVV, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 59, 16, 0, NOTUSETMP}, + +- {AADD, C_ADD0CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AADD, C_ADD0CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AADD, C_ANDCON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- {AADD, C_ANDCON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- +- {AADDV, C_ADD0CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AADDV, C_ADD0CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AADDV, C_ANDCON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- {AADDV, C_ANDCON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- +- {AAND, C_AND0CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AAND, C_AND0CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AAND, C_ADDCON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- {AAND, C_ADDCON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- +- {AADD, C_UCON, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AADD, C_UCON, C_NONE, 
C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AADDV, C_UCON, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AADDV, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AAND, C_UCON, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AAND, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- +- {AADD, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AADDV, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AAND, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AADD, C_LCON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AADDV, C_LCON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AAND, C_LCON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AADD, C_US12CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AADD, C_US12CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AADD, C_U12CON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ {AADD, C_U12CON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ ++ {AADDV, C_US12CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AADDV, C_US12CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AADDV, C_U12CON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ {AADDV, C_U12CON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ ++ {AAND, C_UU12CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AAND, C_UU12CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AAND, C_S12CON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ {AAND, C_S12CON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ ++ {AADD, C_32CON20_0, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AADD, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AADDV, C_32CON20_0, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AADDV, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AAND, C_32CON20_0, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AAND, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ ++ {AADD, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AADDV, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AAND, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AADD, C_32CON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AADDV, C_32CON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AAND, C_32CON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, + + {AADDV, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, + {AADDV, C_DCON, C_REG, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, +@@ -289,18 +277,18 @@ var optab = []Optab{ + {AAND, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, + {AAND, C_DCON32_12S, C_REG, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, + +- {ASLL, C_SCON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, +- {ASLL, C_SCON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, ++ {ASLL, C_U5CON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, ++ {ASLL, C_U5CON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, + +- {ASLLV, C_SCON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, +- {ASLLV, C_SCON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, ++ {ASLLV, C_U6CON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, ++ {ASLLV, C_U6CON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, + +- {ABSTRPICKW, C_SCON, C_REG, C_SCON, C_REG, C_NONE, 17, 4, 0, 0}, +- {ABSTRPICKW, C_SCON, C_REG, C_ZCON, C_REG, C_NONE, 17, 4, 0, 0}, ++ {ABSTRPICKW, C_U6CON, C_REG, C_U6CON, C_REG, C_NONE, 17, 4, 0, 0}, ++ {ABSTRPICKW, C_U6CON, C_REG, C_ZCON, C_REG, C_NONE, 17, 4, 0, 0}, + {ABSTRPICKW, C_ZCON, C_REG, C_ZCON, C_REG, C_NONE, 17, 4, 0, 0}, + + {ASYSCALL, C_NONE, C_NONE, C_NONE, C_NONE, C_NONE, 5, 4, 0, 0}, +- {ASYSCALL, C_ANDCON, C_NONE, C_NONE, C_NONE, C_NONE, 5, 
4, 0, 0}, ++ {ASYSCALL, C_U15CON, C_NONE, C_NONE, C_NONE, C_NONE, 5, 4, 0, 0}, + + {ABEQ, C_REG, C_REG, C_NONE, C_BRAN, C_NONE, 6, 4, 0, 0}, + {ABEQ, C_REG, C_NONE, C_NONE, C_BRAN, C_NONE, 6, 4, 0, 0}, +@@ -348,8 +336,7 @@ var optab = []Optab{ + {AMOVV, C_FREG, C_NONE, C_NONE, C_FCCREG, C_NONE, 30, 4, 0, 0}, + {AMOVV, C_FCCREG, C_NONE, C_NONE, C_FREG, C_NONE, 30, 4, 0, 0}, + +- {AMOVW, C_ADDCON, C_NONE, C_NONE, C_FREG, C_NONE, 34, 8, 0, 0}, +- {AMOVW, C_ANDCON, C_NONE, C_NONE, C_FREG, C_NONE, 34, 8, 0, 0}, ++ {AMOVW, C_12CON, C_NONE, C_NONE, C_FREG, C_NONE, 34, 8, 0, 0}, + + {AMOVB, C_REG, C_NONE, C_NONE, C_TLS_IE, C_NONE, 56, 16, 0, 0}, + {AMOVW, C_REG, C_NONE, C_NONE, C_TLS_IE, C_NONE, 56, 16, 0, 0}, +@@ -363,13 +350,13 @@ var optab = []Optab{ + {AMOVBU, C_TLS_IE, C_NONE, C_NONE, C_REG, C_NONE, 57, 16, 0, 0}, + {AMOVWU, C_TLS_IE, C_NONE, C_NONE, C_REG, C_NONE, 57, 16, 0, 0}, + +- {AWORD, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 38, 4, 0, 0}, ++ {AWORD, C_32CON, C_NONE, C_NONE, C_NONE, C_NONE, 38, 4, 0, 0}, + {AWORD, C_DCON, C_NONE, C_NONE, C_NONE, C_NONE, 61, 4, 0, 0}, + + {AMOVV, C_GOTADDR, C_NONE, C_NONE, C_REG, C_NONE, 65, 8, 0, 0}, + +- {ATEQ, C_SCON, C_REG, C_NONE, C_REG, C_NONE, 15, 8, 0, 0}, +- {ATEQ, C_SCON, C_NONE, C_NONE, C_REG, C_NONE, 15, 8, 0, 0}, ++ {ATEQ, C_US12CON, C_REG, C_NONE, C_REG, C_NONE, 15, 8, 0, 0}, ++ {ATEQ, C_US12CON, C_NONE, C_NONE, C_REG, C_NONE, 15, 8, 0, 0}, + + {ARDTIMELW, C_NONE, C_NONE, C_NONE, C_REG, C_REG, 62, 4, 0, 0}, + {AAMSWAPW, C_REG, C_NONE, C_NONE, C_ZOREG, C_REG, 66, 4, 0, 0}, +@@ -409,12 +396,12 @@ var optab = []Optab{ + + {AVMOVQ, C_ELEM, C_NONE, C_NONE, C_ARNG, C_NONE, 45, 4, 0, 0}, + +- {obj.APCALIGN, C_SCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, +- {obj.APCDATA, C_LCON, C_NONE, C_NONE, C_LCON, C_NONE, 0, 0, 0, 0}, ++ {obj.APCALIGN, C_U12CON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, ++ {obj.APCDATA, C_32CON, C_NONE, C_NONE, C_32CON, C_NONE, 0, 0, 0, 0}, + {obj.APCDATA, C_DCON, C_NONE, C_NONE, C_DCON, C_NONE, 0, 0, 0, 0}, +- {obj.AFUNCDATA, C_SCON, C_NONE, C_NONE, C_ADDR, C_NONE, 0, 0, 0, 0}, ++ {obj.AFUNCDATA, C_U12CON, C_NONE, C_NONE, C_ADDR, C_NONE, 0, 0, 0, 0}, + {obj.ANOP, C_NONE, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, +- {obj.ANOP, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, // nop variants, see #40689 ++ {obj.ANOP, C_32CON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, // nop variants, see #40689 + {obj.ANOP, C_DCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, // nop variants, see #40689 + {obj.ANOP, C_REG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, + {obj.ANOP, C_FREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, +@@ -857,34 +844,35 @@ func (c *ctxt0) aclass(a *obj.Addr) int { + } + + if c.instoffset >= 0 { +- if c.instoffset == 0 { +- return C_ZCON +- } +- if c.instoffset <= 0x7ff { +- return C_SCON +- } +- if c.instoffset <= 0xfff { +- return C_ANDCON +- } +- if c.instoffset&0xfff == 0 && isuint32(uint64(c.instoffset)) { // && ((instoffset & (1<<31)) == 0) +- return C_UCON ++ sbits := bits.Len64(uint64(c.instoffset)) ++ switch { ++ case sbits <=8: ++ return C_ZCON + sbits ++ case sbits <= 12: ++ if c.instoffset <= 0x7ff { ++ return C_US12CON ++ } ++ return C_U12CON ++ case sbits <= 15: ++ if c.instoffset & 0xfff == 0 { ++ return C_15CON20_0 ++ } ++ return C_U15CON + } +- if isint32(c.instoffset) || isuint32(uint64(c.instoffset)) { +- return C_LCON ++ } else { ++ sbits := bits.Len64(uint64(^c.instoffset)) ++ switch { ++ case sbits < 5: ++ return C_S5CON ++ case sbits < 12: ++ return C_S12CON + } +- return 
C_LCON + } + +- if c.instoffset >= -0x800 { +- return C_ADDCON ++ if c.instoffset&0xfff == 0 { ++ return C_32CON20_0 + } +- if c.instoffset&0xfff == 0 && isint32(c.instoffset) { +- return C_UCON +- } +- if isint32(c.instoffset) { +- return C_LCON +- } +- return C_LCON ++ return C_32CON + + case obj.TYPE_BRANCH: + return C_BRAN +@@ -1130,10 +1118,11 @@ func (c *ctxt0) oplook(p *obj.Prog) *Optab { + + ops := oprange[p.As&obj.AMask] + c1 := &xcmp[a1] ++ c3 := &xcmp[a3] + c4 := &xcmp[a4] + for i := range ops { + op := &ops[i] +- if (int(op.reg) == a2) && int(op.from3) == a3 && c1[op.from1] && c4[op.to1] && (int(op.to2) == a5) { ++ if (int(op.reg) == a2) && c3[op.from3] && c1[op.from1] && c4[op.to1] && (int(op.to2) == a5) { + p.Optab = uint16(cap(optab) - cap(ops) + i + 1) + return op + } +@@ -1151,21 +1140,41 @@ func cmp(a int, b int) bool { + } + switch a { + case C_DCON: +- if b == C_LCON || b == C_DCON32_0 || +- b == C_DCON12_0 || b == C_DCON20S_0 || +- b == C_DCON12_20S || b == C_DCON12_12S || +- b == C_DCON20S_20 || b == C_DCON32_20 || +- b == C_DCON20S_12S || b == C_DCON32_12S || +- b == C_DCON12_32S || b == C_DCON20S_32 || +- b == C_DCON12_12U || b == C_DCON20S_12U || +- b == C_DCON32_12U { +- return true +- } +- fallthrough +- case C_LCON: +- if b == C_ZCON || b == C_SCON || b == C_UCON || b == C_ADDCON || b == C_ANDCON { +- return true +- } ++ return cmp(C_32CON, b) || cmp(C_DCON12_20S, b) || cmp(C_DCON32_12S, b) || b == C_DCON12_0 ++ case C_32CON: ++ return cmp(C_32CON20_0, b) || cmp(C_U15CON, b) || cmp(C_S12CON, b) ++ case C_32CON20_0: ++ return b == C_15CON20_0 || b == C_ZCON ++ case C_U15CON: ++ return cmp(C_U12CON, b) || b == C_15CON20_0 ++ case C_12CON: ++ return cmp(C_U12CON, b) || cmp(C_S12CON, b) ++ case C_UU12CON: ++ return cmp(C_U12CON, b) ++ case C_U12CON: ++ return cmp(C_U8CON, b) || b == C_US12CON ++ case C_U8CON: ++ return cmp(C_U7CON, b) ++ case C_U7CON: ++ return cmp(C_U6CON, b) ++ case C_U6CON: ++ return cmp(C_U5CON, b) ++ case C_U5CON: ++ return cmp(C_U4CON, b) ++ case C_U4CON: ++ return cmp(C_U3CON, b) ++ case C_U3CON: ++ return cmp(C_U2CON, b) ++ case C_U2CON: ++ return cmp(C_U1CON, b) ++ case C_U1CON: ++ return cmp(C_ZCON, b) ++ case C_US12CON: ++ return cmp(C_S12CON, b) ++ case C_S12CON: ++ return cmp(C_S5CON, b) || cmp(C_U8CON, b) || b == C_US12CON ++ case C_S5CON: ++ return cmp(C_ZCON, b) || cmp(C_U4CON, b) + + case C_DCON12_0: + +@@ -1183,62 +1192,20 @@ func cmp(a int, b int) bool { + return true + } + +- case C_ADD0CON: +- if b == C_ADDCON { +- return true +- } +- fallthrough +- +- case C_ADDCON: +- if b == C_ZCON || b == C_SCON { +- return true +- } +- +- case C_AND0CON: +- if b == C_ANDCON { +- return true +- } +- fallthrough +- +- case C_ANDCON: +- if b == C_ZCON || b == C_SCON { +- return true +- } +- +- case C_UCON: +- if b == C_ZCON { +- return true +- } +- +- case C_SCON: +- if b == C_ZCON { +- return true +- } +- + case C_LACON: +- if b == C_SACON { +- return true +- } ++ return b == C_SACON + + case C_LAUTO: +- if b == C_SAUTO { +- return true +- } ++ return b == C_SAUTO + + case C_REG: +- if b == C_ZCON { +- return true +- } ++ return b == C_ZCON + + case C_LOREG: +- if b == C_ZOREG || b == C_SOREG { +- return true +- } ++ return b == C_ZOREG || b == C_SOREG + + case C_SOREG: +- if b == C_ZOREG { +- return true +- } ++ return b == C_ZOREG + } + + return false +@@ -1881,7 +1848,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(o.param) + } + a := add +- if o.from1 == C_ANDCON { ++ if o.from1 == C_12CON && v > 0 { + a = AOR 
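++ // ORI is used here because it zero-extends its 12-bit immediate,
++ // while ADD would sign-extend it: a positive constant in the range
++ // 0x800-0xfff must not be materialized as a negative value.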
+ } + +@@ -2008,15 +1975,9 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + if r == 0 { + r = int(p.To.Reg) + } +- +- switch o.flag { +- case immFiledSi5: +- c.checkimmFiled(p, v, 5, true) +- o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } +- ++ ++ o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ + case 14: // add $ui8,[r1],r2 + v := c.regoff(&p.From) + r := int(p.Reg) +@@ -2024,13 +1985,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi8: +- c.checkimmFiled(p, v, 8, false) +- o1 = OP_8IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_8IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 15: // teq $c r,r + v := c.regoff(&p.From) +@@ -2185,13 +2140,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi3: +- c.checkimmFiled(p, v, 3, false) +- o1 = OP_3IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_3IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 30: // mov gr/fr/fcc/fcsr, fr/fcc/fcsr/gr + a := c.specialFpMovInst(p.As, oclass(&p.From), oclass(&p.To)) +@@ -2204,13 +2153,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi4: +- c.checkimmFiled(p, v, 4, false) +- o1 = OP_4IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_4IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 32: // add $ui5,[r1],r2 + v := c.regoff(&p.From) +@@ -2219,13 +2162,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi5: +- c.checkimmFiled(p, v, 5, false) +- o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 33: // add $ui6,[r1],r2 + v := c.regoff(&p.From) +@@ -2234,18 +2171,12 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi6: +- c.checkimmFiled(p, v, 6, false) +- o1 = OP_6IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_6IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 34: // mov $con,fr + v := c.regoff(&p.From) + a := AADDU +- if o.from1 == C_ANDCON { ++ if v > 0 { + a = AOR + } + a2 := c.specialFpMovInst(p.As, C_REG, oclass(&p.To)) +@@ -2702,21 +2633,6 @@ func (c *ctxt0) checkindex(p *obj.Prog, index uint32, mask uint32) { + } + } + +-// checkimmFiled checks whether the immediate value exceeds the valid encoding range +-func (c *ctxt0) checkimmFiled(p *obj.Prog, imm int32, bits uint8, isSigned bool) { +- if isSigned { +- bound := int32(1 << (bits - 1)) +- if imm < -bound || imm > bound { +- c.ctxt.Diag("signed immediate %v exceeds the %d-bit range: %v", imm, bits, p) +- } +- } else { +- mask := uint32(0xffffffff) << bits +- if uint32(imm) != (uint32(imm) & ^mask) { +- c.ctxt.Diag("unsigned immediate %v exceeds 
the %d-bit range: %v", imm, bits, p) +- } +- } +-} +- + func (c *ctxt0) vregoff(a *obj.Addr) int64 { + c.instoffset = 0 + c.aclass(a) +diff --git a/src/cmd/internal/obj/loong64/cnames.go b/src/cmd/internal/obj/loong64/cnames.go +index a2f04a22ee..1d38f1ee36 100644 +--- a/src/cmd/internal/obj/loong64/cnames.go ++++ b/src/cmd/internal/obj/loong64/cnames.go +@@ -14,13 +14,24 @@ var cnames0 = []string{ + "ARNG", + "ELEM", + "ZCON", +- "SCON", +- "UCON", +- "ADD0CON", +- "AND0CON", +- "ADDCON", +- "ANDCON", +- "LCON", ++ "U1CON", ++ "U2CON", ++ "U3CON", ++ "U4CON", ++ "U5CON", ++ "U6CON", ++ "U7CON", ++ "U8CON", ++ "S5CON", ++ "US12CON", ++ "UU12CON", ++ "S12CON", ++ "U12CON", ++ "12CON", ++ "U15CON", ++ "15CON20_0", ++ "32CON20_0", ++ "32CON", + "DCON20S_0", + "DCON12_0", + "DCON32_0", +-- +2.38.1 + diff --git a/0038-crypto-internal-poly1305-implement-function-update-i.patch b/0038-crypto-internal-poly1305-implement-function-update-i.patch new file mode 100644 index 0000000000000000000000000000000000000000..e18caf23e121c1bd51c48e07b0a441cfffe140d6 --- /dev/null +++ b/0038-crypto-internal-poly1305-implement-function-update-i.patch @@ -0,0 +1,298 @@ +From 9e01e315f3ea08fc01854bf8beb2cdeb9ff6dddc Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Thu, 19 Dec 2024 15:38:48 +0800 +Subject: [PATCH 38/44] crypto/internal/poly1305: implement function update in + assembly on loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +There is some improvement in performance on Loongson 3A5000 and 3A6000. + +goos: linux +goarch: loong64 +pkg: golang.org/x/crypto/internal/poly1305 +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +64 122.8n ± 0% 101.2n ± 0% -17.59% (p=0.000 n=10) +1K 1152.0n ± 0% 779.4n ± 0% -32.34% (p=0.000 n=10) +2M 2.356m ± 0% 1.556m ± 0% -33.94% (p=0.000 n=10) +64Unaligned 122.7n ± 0% 102.5n ± 0% -16.46% (p=0.000 n=10) +1KUnaligned 1152.0n ± 0% 802.4n ± 0% -30.35% (p=0.000 n=10) +2MUnaligned 2.336m ± 0% 1.582m ± 0% -32.26% (p=0.000 n=10) +Write64 77.92n ± 0% 57.45n ± 0% -26.27% (p=0.000 n=10) +Write1K 1106.0n ± 0% 736.2n ± 0% -33.44% (p=0.000 n=10) +Write2M 2.356m ± 0% 1.562m ± 0% -33.69% (p=0.000 n=10) +Write64Unaligned 77.87n ± 0% 59.71n ± 0% -23.33% (p=0.000 n=10) +Write1KUnaligned 1106.0n ± 0% 749.5n ± 0% -32.23% (p=0.000 n=10) +Write2MUnaligned 2.335m ± 0% 1.580m ± 0% -32.34% (p=0.000 n=10) +geomean 6.373µ 4.530µ -28.93% + + | bench.old | bench.new | + | B/s | B/s vs base | +64 497.1Mi ± 0% 603.3Mi ± 0% +21.37% (p=0.000 n=10) +1K 847.6Mi ± 0% 1252.9Mi ± 0% +47.82% (p=0.000 n=10) +2M 849.0Mi ± 0% 1285.3Mi ± 0% +51.39% (p=0.000 n=10) +64Unaligned 497.4Mi ± 0% 595.5Mi ± 0% +19.73% (p=0.000 n=10) +1KUnaligned 847.6Mi ± 0% 1217.1Mi ± 0% +43.59% (p=0.000 n=10) +2MUnaligned 856.3Mi ± 0% 1264.0Mi ± 0% +47.61% (p=0.000 n=10) +Write64 783.3Mi ± 0% 1062.4Mi ± 0% +35.64% (p=0.000 n=10) +Write1K 882.8Mi ± 0% 1326.5Mi ± 0% +50.25% (p=0.000 n=10) +Write2M 849.0Mi ± 0% 1280.3Mi ± 0% +50.80% (p=0.000 n=10) +Write64Unaligned 783.8Mi ± 0% 1022.3Mi ± 0% +30.43% (p=0.000 n=10) +Write1KUnaligned 882.8Mi ± 0% 1303.0Mi ± 0% +47.59% (p=0.000 n=10) +Write2MUnaligned 856.5Mi ± 0% 1266.0Mi ± 0% +47.81% (p=0.000 n=10) +geomean 772.2Mi 1.061Gi +40.72% + +goos: linux +goarch: loong64 +pkg: golang.org/x/crypto/internal/poly1305 +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +64 92.06n ± 0% 77.56n ± 0% -15.75% (p=0.000 n=10) +1K 998.4n ± 0% 683.0n ± 0% -31.59% (p=0.000 n=10) +2M 
1.978m ± 0% 1.323m ± 0% -33.11% (p=0.000 n=10) +64Unaligned 92.06n ± 0% 77.56n ± 0% -15.75% (p=0.000 n=10) +1KUnaligned 998.4n ± 0% 683.0n ± 0% -31.59% (p=0.000 n=10) +2MUnaligned 1.979m ± 0% 1.369m ± 0% -30.82% (p=0.000 n=10) +Write64 65.25n ± 0% 50.39n ± 0% -22.77% (p=0.000 n=10) +Write1K 970.7n ± 0% 656.8n ± 0% -32.34% (p=0.000 n=10) +Write2M 1.966m ± 0% 1.323m ± 0% -32.73% (p=0.000 n=10) +Write64Unaligned 65.24n ± 0% 50.37n ± 0% -22.79% (p=0.000 n=10) +Write1KUnaligned 970.8n ± 0% 656.8n ± 0% -32.34% (p=0.000 n=10) +Write2MUnaligned 1.966m ± 0% 1.368m ± 0% -30.42% (p=0.000 n=10) +geomean 5.319µ 3.834µ -27.93% + + | bench.old | bench.new | + | B/s | B/s vs base | +64 663.0Mi ± 0% 786.9Mi ± 0% +18.69% (p=0.000 n=10) +1K 978.1Mi ± 0% 1429.8Mi ± 0% +46.18% (p=0.000 n=10) +2M 1011.0Mi ± 0% 1511.4Mi ± 0% +49.50% (p=0.000 n=10) +64Unaligned 663.0Mi ± 0% 786.9Mi ± 0% +18.69% (p=0.000 n=10) +1KUnaligned 978.1Mi ± 0% 1429.8Mi ± 0% +46.18% (p=0.000 n=10) +2MUnaligned 1010.6Mi ± 0% 1460.9Mi ± 0% +44.56% (p=0.000 n=10) +Write64 935.4Mi ± 0% 1211.3Mi ± 0% +29.49% (p=0.000 n=10) +Write1K 1006.0Mi ± 0% 1486.9Mi ± 0% +47.81% (p=0.000 n=10) +Write2M 1017.3Mi ± 0% 1512.1Mi ± 0% +48.64% (p=0.000 n=10) +Write64Unaligned 935.5Mi ± 0% 1211.7Mi ± 0% +29.53% (p=0.000 n=10) +Write1KUnaligned 1005.9Mi ± 0% 1486.9Mi ± 0% +47.81% (p=0.000 n=10) +Write2MUnaligned 1017.1Mi ± 0% 1461.8Mi ± 0% +43.71% (p=0.000 n=10) +geomean 925.3Mi 1.254Gi +38.75% + +Change-Id: Iec990384a7be9a89a019c2b3b546d9fc59a2d58e +--- + .../x/crypto/internal/poly1305/mac_noasm.go | 2 +- + .../x/crypto/internal/poly1305/sum_loong64.go | 47 +++++++ + .../x/crypto/internal/poly1305/sum_loong64.s | 131 ++++++++++++++++++ + 3 files changed, 179 insertions(+), 1 deletion(-) + create mode 100644 src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.go + create mode 100644 src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s + +diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go b/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go +index bd896bdc76..8d99551fee 100644 +--- a/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go ++++ b/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego ++//go:build (!amd64 && !loong64 && !ppc64le && !ppc64 && !s390x) || !gc || purego + + package poly1305 + +diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.go b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.go +new file mode 100644 +index 0000000000..d4dc8f91ec +--- /dev/null ++++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.go +@@ -0,0 +1,47 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build gc && !purego ++ ++package poly1305 ++ ++//go:noescape ++func update(state *macState, msg []byte) ++ ++// mac is a wrapper for macGeneric that redirects calls that would have gone to ++// updateGeneric to update. ++// ++// Its Write and Sum methods are otherwise identical to the macGeneric ones, but ++// using function pointers would carry a major performance cost. 
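++//
++// As a rough sketch of the resulting fast path: a Write of 100 bytes on an
++// empty buffer hands the first 96 bytes (the largest multiple of TagSize)
++// to the assembly update in a single call, and buffers the remaining 4
++// bytes for the next Write or for Sum.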
++type mac struct{ macGeneric } ++ ++func (h *mac) Write(p []byte) (int, error) { ++ nn := len(p) ++ if h.offset > 0 { ++ n := copy(h.buffer[h.offset:], p) ++ if h.offset+n < TagSize { ++ h.offset += n ++ return nn, nil ++ } ++ p = p[n:] ++ h.offset = 0 ++ update(&h.macState, h.buffer[:]) ++ } ++ if n := len(p) - (len(p) % TagSize); n > 0 { ++ update(&h.macState, p[:n]) ++ p = p[n:] ++ } ++ if len(p) > 0 { ++ h.offset += copy(h.buffer[h.offset:], p) ++ } ++ return nn, nil ++} ++ ++func (h *mac) Sum(out *[16]byte) { ++ state := h.macState ++ if h.offset > 0 { ++ update(&state, h.buffer[:h.offset]) ++ } ++ finalize(out, &state.h, &state.s) ++} +diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s +new file mode 100644 +index 0000000000..baf0c95333 +--- /dev/null ++++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s +@@ -0,0 +1,131 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build gc && !purego ++ ++// func update(state *macState, msg []byte) ++TEXT ·update(SB), $0-32 ++ MOVV state+0(FP), R4 ++ MOVV msg_base+8(FP), R5 ++ MOVV msg_len+16(FP), R6 ++ MOVV $16, R7 ++ MOVV (R4), R8 // h0 ++ MOVV 8(R4), R9 // h1 ++ MOVV 16(R4), R10 // h2 ++ MOVV 24(R4), R11 // r0 ++ MOVV 32(R4), R12 // r1 ++ ++ BLT R6, R7, bytes_between_0_and_15 ++ ++loop: ++ MOVV (R5), R14 // msg[0:8] ++ MOVV 8(R5), R16 // msg[8:16] ++ ADDV R14, R8, R8 // h0 ++ ADDV R9, R16, R27 ++ SGTU R14, R8, R24 // h0.carry ++ SGTU R9, R27, R28 ++ ADDV R27, R24, R9 // h1 ++ SGTU R27, R9, R24 ++ OR R24, R28, R24 // h1.carry ++ ADDV $1, R24, R24 ++ ADDV R10, R24, R10 // h2 ++ ++ ADDV $16, R5, R5 // msg = msg[16:] ++ ++multiply: ++ MULV R8, R11, R13 // h0r0.lo ++ MULHVU R8, R11, R16 // h0r0.hi ++ MOVV R13, R14 ++ MOVV R16, R15 ++ MULV R9, R11, R13 // h1r0.lo ++ MULHVU R9, R11, R16 // h1r0.hi ++ ADDV R13, R15, R15 ++ SGTU R13, R15, R24 ++ ADDV R24, R16, R16 ++ MULV R10, R11, R25 ++ ADDV R16, R25, R25 ++ MULV R8, R12, R13 // h0r1.lo ++ MULHVU R8, R12, R16 // h0r1.hi ++ ADDV R13, R15, R15 ++ SGTU R13, R15, R24 ++ ADDV R24, R16, R16 ++ MOVV R16, R8 ++ MULV R10, R12, R26 // h2r1 ++ MULV R9, R12, R13 // h1r1.lo ++ MULHVU R9, R12, R16 // h1r1.hi ++ ADDV R13, R25, R25 ++ ADDV R16, R26, R27 ++ SGTU R13, R25, R24 ++ SGTU R16, R27, R28 ++ ADDV R27, R24, R26 ++ SGTU R27, R26, R24 ++ OR R24, R28, R24 ++ ADDV R8, R25, R25 ++ SGTU R8, R25, R24 ++ ADDV R24, R26, R26 ++ MOVV R14, R8 ++ MOVV R15, R9 ++ MOVV R25, R10 ++ MOVV R25, R14 ++ AND $3, R10, R10 ++ AND $-4, R14, R14 ++ ADDV R14, R8, R8 ++ ADDV R26, R9, R27 ++ SGTU R14, R8, R24 ++ SGTU R26, R27, R28 ++ ADDV R27, R24, R9 ++ SGTU R27, R9, R24 ++ OR R24, R28, R24 ++ ADDV R24, R10, R10 ++ SLLV $62, R26, R27 ++ SRLV $2, R25, R28 ++ SRLV $2, R26, R26 ++ OR R27, R28, R25 ++ ADDV R25, R8, R8 ++ ADDV R26, R9, R27 ++ SGTU R25, R8, R24 ++ SGTU R26, R27, R28 ++ ADDV R27, R24, R9 ++ SGTU R27, R9, R24 ++ OR R24, R28, R24 ++ ADDV R24, R10, R10 ++ ++ SUBV $16, R6, R6 ++ BGE R6, R7, loop ++ ++bytes_between_0_and_15: ++ BEQ R6, R0, done ++ MOVV $1, R14 ++ XOR R15, R15 ++ XOR R25, R25 ++ ADDV R6, R5, R5 ++ ++flush_buffer: ++ SRLV $56, R14, R24 ++ SLLV $8, R15, R28 ++ OR R24, R28, R15 ++ SLLV $8, R14, R14 ++ MOVBU -1(R5), R25 ++ XOR R25, R14, R14 ++ SUBV $1, R5, R5 ++ SUBV $1, R6, R6 ++ BNE R6, R0, flush_buffer ++ ++ ADDV R14, R8, R8 ++ SGTU R14, R8, R24 ++ ADDV R15, R9, R27 ++ SGTU R15, R27, 
R28 ++ ADDV R27, R24, R9 ++ SGTU R27, R9, R24 ++ OR R24, R28, R24 ++ ADDV R10, R24, R10 ++ ++ MOVV $16, R6 ++ JMP multiply ++ ++done: ++ MOVV R8, (R4) ++ MOVV R9, 8(R4) ++ MOVV R10, 16(R4) ++ RET +-- +2.38.1 + diff --git a/0039-runtime-optimize-the-implementation-of-memclrNoHeapP.patch b/0039-runtime-optimize-the-implementation-of-memclrNoHeapP.patch new file mode 100644 index 0000000000000000000000000000000000000000..289a1f5ac28f28b391aab13473136c3cdf82e036 --- /dev/null +++ b/0039-runtime-optimize-the-implementation-of-memclrNoHeapP.patch @@ -0,0 +1,374 @@ +From 0e94e34886a3632315e444c5fd0ba448239c500e Mon Sep 17 00:00:00 2001 +From: chenguoqi +Date: Tue, 31 Dec 2024 18:31:50 +0800 +Subject: [PATCH 39/44] runtime: optimize the implementation of + memclrNoHeapPointers on loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: runtime +cpu: Loongson-3A6000 @ 2500.00MHz + │ bench.old │ bench.new │ + │ sec/op │ sec/op vs base │ +Memclr/5 2.456n ± 0% 3.202n ± 0% +30.37% (p=0.000 n=10) +Memclr/16 2.806n ± 0% 2.810n ± 1% +0.14% (p=0.002 n=10) +Memclr/64 5.053n ± 1% 5.045n ± 1% ~ (p=0.591 n=10) +Memclr/256 10.240n ± 0% 6.027n ± 0% -41.14% (p=0.000 n=10) +Memclr/4096 107.00n ± 0% 30.46n ± 0% -71.53% (p=0.000 n=10) +Memclr/65536 1676.0n ± 0% 431.3n ± 0% -74.26% (p=0.000 n=10) +Memclr/1M 52.52µ ± 0% 32.81µ ± 0% -37.54% (p=0.000 n=10) +Memclr/4M 210.0µ ± 0% 131.3µ ± 0% -37.48% (p=0.000 n=10) +Memclr/8M 420.0µ ± 0% 262.8µ ± 1% -37.43% (p=0.000 n=10) +Memclr/16M 846.7µ ± 0% 528.8µ ± 0% -37.55% (p=0.000 n=10) +Memclr/64M 3.388m ± 0% 2.180m ± 1% -35.66% (p=0.000 n=10) +MemclrUnaligned/0_5 4.382n ± 0% 4.006n ± 0% -8.59% (p=0.000 n=10) +MemclrUnaligned/0_16 4.600n ± 0% 4.204n ± 0% -8.60% (p=0.000 n=10) +MemclrUnaligned/0_64 5.604n ± 0% 5.005n ± 0% -10.69% (p=0.000 n=10) +MemclrUnaligned/0_256 10.340n ± 0% 6.808n ± 0% -34.16% (p=0.000 n=10) +MemclrUnaligned/0_4096 107.10n ± 0% 33.81n ± 0% -68.43% (p=0.000 n=10) +MemclrUnaligned/0_65536 1701.0n ± 0% 441.6n ± 0% -74.04% (p=0.000 n=10) +MemclrUnaligned/1_5 4.386n ± 0% 4.004n ± 0% -8.71% (p=0.000 n=10) +MemclrUnaligned/1_16 4.597n ± 0% 4.203n ± 0% -8.56% (p=0.000 n=10) +MemclrUnaligned/1_64 7.204n ± 0% 7.106n ± 0% -1.36% (p=0.000 n=10) +MemclrUnaligned/1_256 12.580n ± 0% 9.796n ± 0% -22.13% (p=0.000 n=10) +MemclrUnaligned/1_4096 115.60n ± 0% 38.63n ± 0% -66.58% (p=0.000 n=10) +MemclrUnaligned/1_65536 1709.0n ± 0% 446.5n ± 0% -73.87% (p=0.000 n=10) +MemclrUnaligned/4_5 4.386n ± 0% 4.005n ± 0% -8.69% (p=0.000 n=10) +MemclrUnaligned/4_16 4.597n ± 0% 4.203n ± 0% -8.57% (p=0.000 n=10) +MemclrUnaligned/4_64 7.204n ± 0% 7.104n ± 0% -1.39% (p=0.000 n=10) +MemclrUnaligned/4_256 12.58n ± 0% 10.66n ± 0% -15.22% (p=0.000 n=10) +MemclrUnaligned/4_4096 114.30n ± 0% 39.99n ± 0% -65.01% (p=0.000 n=10) +MemclrUnaligned/4_65536 1709.0n ± 0% 449.8n ± 0% -73.68% (p=0.000 n=10) +MemclrUnaligned/7_5 4.381n ± 0% 4.002n ± 0% -8.64% (p=0.000 n=10) +MemclrUnaligned/7_16 4.597n ± 0% 4.202n ± 0% -8.59% (p=0.000 n=10) +MemclrUnaligned/7_64 7.204n ± 0% 7.104n ± 0% -1.39% (p=0.000 n=10) +MemclrUnaligned/7_256 12.58n ± 0% 10.60n ± 0% -15.74% (p=0.000 n=10) +MemclrUnaligned/7_4096 115.50n ± 0% 39.75n ± 0% -65.58% (p=0.000 n=10) +MemclrUnaligned/7_65536 1709.0n ± 0% 447.1n ± 0% -73.84% (p=0.000 n=10) +MemclrUnaligned/0_1M 52.52µ ± 0% 32.80µ ± 0% -37.56% (p=0.000 n=10) +MemclrUnaligned/0_4M 210.0µ ± 0% 131.2µ ± 0% -37.53% (p=0.000 n=10) +MemclrUnaligned/0_8M 419.9µ ± 0% 262.5µ ± 0% -37.48% (p=0.000 n=10) 
+MemclrUnaligned/0_16M 845.0µ ± 0% 528.1µ ± 0% -37.51% (p=0.000 n=10) +MemclrUnaligned/0_64M 3.406m ± 0% 2.165m ± 1% -36.44% (p=0.000 n=10) +MemclrUnaligned/1_1M 52.53µ ± 0% 32.80µ ± 0% -37.55% (p=0.000 n=10) +MemclrUnaligned/1_4M 210.2µ ± 0% 131.3µ ± 0% -37.55% (p=0.000 n=10) +MemclrUnaligned/1_8M 419.9µ ± 0% 262.4µ ± 0% -37.50% (p=0.000 n=10) +MemclrUnaligned/1_16M 844.2µ ± 0% 528.0µ ± 0% -37.46% (p=0.000 n=10) +MemclrUnaligned/1_64M 3.369m ± 0% 2.161m ± 5% -35.84% (p=0.000 n=10) +MemclrUnaligned/4_1M 52.53µ ± 0% 32.80µ ± 0% -37.55% (p=0.000 n=10) +MemclrUnaligned/4_4M 210.2µ ± 0% 131.2µ ± 0% -37.59% (p=0.000 n=10) +MemclrUnaligned/4_8M 419.9µ ± 0% 262.4µ ± 0% -37.52% (p=0.000 n=10) +MemclrUnaligned/4_16M 844.5µ ± 0% 527.9µ ± 0% -37.49% (p=0.000 n=10) +MemclrUnaligned/4_64M 3.366m ± 0% 2.173m ± 0% -35.46% (p=0.000 n=10) +MemclrUnaligned/7_1M 52.52µ ± 0% 32.80µ ± 0% -37.55% (p=0.000 n=10) +MemclrUnaligned/7_4M 210.2µ ± 0% 131.5µ ± 0% -37.45% (p=0.000 n=10) +MemclrUnaligned/7_8M 419.9µ ± 0% 262.6µ ± 0% -37.47% (p=0.000 n=10) +MemclrUnaligned/7_16M 844.4µ ± 0% 529.0µ ± 0% -37.36% (p=0.000 n=10) +MemclrUnaligned/7_64M 3.372m ± 1% 2.201m ± 0% -34.72% (p=0.000 n=10) +MemclrRange/1K_2K 2703.0n ± 0% 948.1n ± 0% -64.93% (p=0.000 n=10) +MemclrRange/2K_8K 8.826µ ± 0% 2.458µ ± 0% -72.15% (p=0.000 n=10) +MemclrRange/4K_16K 8.325µ ± 0% 2.210µ ± 0% -73.45% (p=0.000 n=10) +MemclrRange/160K_228K 83.40µ ± 0% 31.27µ ± 0% -62.50% (p=0.000 n=10) +MemclrKnownSize1 0.4003n ± 0% 0.4002n ± 0% -0.02% (p=0.027 n=10) +MemclrKnownSize2 0.4003n ± 0% 0.4002n ± 0% -0.02% (p=0.000 n=10) +MemclrKnownSize4 0.4003n ± 0% 0.4002n ± 0% -0.02% (p=0.000 n=10) +MemclrKnownSize8 0.4003n ± 0% 0.4002n ± 0% -0.02% (p=0.000 n=10) +MemclrKnownSize16 0.4213n ± 1% 0.8007n ± 0% +90.03% (p=0.000 n=10) +MemclrKnownSize32 2.001n ± 0% 1.602n ± 0% -19.94% (p=0.000 n=10) +MemclrKnownSize64 2.010n ± 0% 2.402n ± 0% +19.47% (p=0.000 n=10) +MemclrKnownSize112 3.202n ± 0% 2.803n ± 0% -12.46% (p=0.000 n=10) +MemclrKnownSize128 3.442n ± 0% 3.236n ± 0% -6.00% (p=0.000 n=10) +MemclrKnownSize192 5.204n ± 0% 5.205n ± 0% ~ (p=0.279 n=10) +MemclrKnownSize248 6.301n ± 0% 6.299n ± 0% -0.03% (p=0.000 n=10) +MemclrKnownSize256 6.707n ± 0% 6.704n ± 0% -0.04% (p=0.018 n=10) +MemclrKnownSize512 13.610n ± 0% 6.989n ± 0% -48.65% (p=0.000 n=10) +MemclrKnownSize1024 26.420n ± 0% 8.458n ± 0% -67.99% (p=0.000 n=10) +MemclrKnownSize4096 103.30n ± 0% 28.02n ± 0% -72.88% (p=0.000 n=10) +MemclrKnownSize512KiB 26.28µ ± 0% 16.41µ ± 0% -37.53% (p=0.000 n=10) +geomean 624.0n 397.1n -36.37% + +Change-Id: I702b9c1991cf13f9338c189c5ef59cb2c6f279de +--- + src/runtime/cpuflags.go | 3 +- + src/runtime/memclr_loong64.s | 214 ++++++++++++++++++++++++----------- + 2 files changed, 152 insertions(+), 65 deletions(-) + +diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go +index e81e50f5df..06424642c7 100644 +--- a/src/runtime/cpuflags.go ++++ b/src/runtime/cpuflags.go +@@ -20,7 +20,8 @@ const ( + + offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA) + +- offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) ++ offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) ++ offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX) + ) + + var ( +diff --git a/src/runtime/memclr_loong64.s b/src/runtime/memclr_loong64.s +index 346b210c8d..0d0d9f0cbb 100644 +--- a/src/runtime/memclr_loong64.s ++++ b/src/runtime/memclr_loong64.s +@@ -11,6 +11,7 @@ + // R5: n + // R6: ptrend + // R7: tmp ++// R8: tmp + + // Algorithm: + // +@@ -38,44 +39,129 @@ + + // func 
memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) + TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16 +- BEQ R5, clr_0 ++ // <=64 bytes, clear directly, not check aligned ++generic_small: + ADDV R4, R5, R6 ++ BEQ R4, R6, clr_0 ++ MOVV $2, R7 ++ BLT R5, R7, clr_1 ++ MOVV $3, R7 ++ BLT R5, R7, clr_2 ++ MOVV $4, R7 ++ BLT R5, R7, clr_3 ++ MOVV $5, R7 ++ BLT R5, R7, clr_4 ++ MOVV $8, R7 ++ BLT R5, R7, clr_5_7 ++ MOVV $9, R7 ++ BLT R5, R7, clr_8 ++ MOVV $17, R7 ++ BLT R5, R7, clr_9_16 ++ MOVV $33, R7 ++ BLT R5, R7, clr_17_32 ++ MOVV $65, R7 ++ BLT R5, R7, clr_33_64 + +-tail: +- // <=64 bytes, clear directly, not check aligned +- SGTU $2, R5, R7 +- BNE R7, clr_1 +- SGTU $3, R5, R7 +- BNE R7, clr_2 +- SGTU $4, R5, R7 +- BNE R7, clr_3 +- SGTU $5, R5, R7 +- BNE R7, clr_4 +- SGTU $8, R5, R7 +- BNE R7, clr_5through7 +- SGTU $9, R5, R7 +- BNE R7, clr_8 +- SGTU $17, R5, R7 +- BNE R7, clr_9through16 +- SGTU $33, R5, R7 +- BNE R7, clr_17through32 +- SGTU $65, R5, R7 +- BNE R7, clr_33through64 ++lasx_large: ++ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7 ++ BEQ R7, lsx_large ++ ++ // X0 = 0 ++ XVMOVQ R0, X0.V4 ++ ++ // check 32-byte alignment ++ AND $31, R4, R7 ++ BEQ R7, lasx_large_aligned ++ XVMOVQ X0, (R4) ++ SUBV R7, R4 ++ ADDV R7, R5 ++ SUBV $32, R5 // newn = n - (32 - (ptr & 31)) ++ ADDV $32, R4 // newptr = ptr + (32 - (ptr & 31)) ++ ++lasx_large_aligned: ++ MOVV $256, R8 ++ BLT R5, R8, lasx_small ++lasx_large_body: ++ XVMOVQ X0, 0(R4) ++ XVMOVQ X0, 32(R4) ++ XVMOVQ X0, 64(R4) ++ XVMOVQ X0, 96(R4) ++ XVMOVQ X0, 128(R4) ++ XVMOVQ X0, 160(R4) ++ XVMOVQ X0, 192(R4) ++ XVMOVQ X0, 224(R4) ++ SUBV $256, R5 ++ ADDV $256, R4 ++ BGE R5, R8, lasx_large_body ++ ++lasx_small: ++ MOVV $32, R8 ++ BLT R5, R8, generic_small ++lasx_small_body: ++ XVMOVQ X0, (R4) ++ SUBV $32, R5 ++ ADDV $32, R4 ++ BGE R5, R8, lasx_small_body ++lasx_tail: ++ JMP generic_small ++ ++lsx_large: ++ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7 ++ BEQ R7, generic_large ++ ++ // V0 = 0 ++ VMOVQ R0, V0.V2 + ++ // check 16-byte alignment ++ AND $15, R4, R7 ++ BEQ R7, lsx_large_aligned ++ VMOVQ V0, (R4) ++ SUBV R7, R4 ++ ADDV R7, R5 ++ SUBV $16, R5 // newn = n - (16 - (ptr & 15)) ++ ADDV $16, R4 // newptr = ptr + (16 - (ptr & 15)) ++ ++lsx_large_aligned: ++ MOVV $128, R8 ++ BLT R5, R8, lsx_small ++lsx_large_body: ++ VMOVQ V0, 0(R4) ++ VMOVQ V0, 16(R4) ++ VMOVQ V0, 32(R4) ++ VMOVQ V0, 48(R4) ++ VMOVQ V0, 64(R4) ++ VMOVQ V0, 80(R4) ++ VMOVQ V0, 96(R4) ++ VMOVQ V0, 112(R4) ++ SUBV $128, R5 ++ ADDV $128, R4 ++ BGE R5, R8, lsx_large_body ++ ++lsx_small: ++ MOVV $16, R8 ++ BLT R5, R8, generic_small ++lsx_small_body: ++ VMOVQ V0, (R4) ++ SUBV $16, R5 ++ ADDV $16, R4 ++ BGE R5, R8, lsx_small_body ++lsx_tail: ++ JMP generic_small ++ ++generic_large: + // n > 64 bytes, check aligned + AND $7, R4, R7 +- BEQ R7, body +- +-head: ++ BEQ R7, generic_large_aligned + MOVV R0, (R4) +- SUBV R7, R4 + ADDV R7, R5 +- ADDV $8, R4 // newptr = ptr + (8 - (ptr & 7)) ++ SUBV R7, R4 + SUBV $8, R5 // newn = n - (8 - (ptr & 7)) +- SGTU $65, R5, R7 +- BNE R7, clr_33through64 ++ ADDV $8, R4 // newptr = ptr + (8 - (ptr & 7)) + +-body: ++generic_large_aligned: ++ MOVV $65, R7 ++ BLT R5, R7, generic_small ++generic_large_body: + MOVV R0, (R4) + MOVV R0, 8(R4) + MOVV R0, 16(R4) +@@ -84,52 +170,52 @@ body: + MOVV R0, 40(R4) + MOVV R0, 48(R4) + MOVV R0, 56(R4) +- ADDV $-64, R5 ++ SUBV $64, R5 + ADDV $64, R4 +- SGTU $65, R5, R7 +- BEQ R7, body +- BEQ R5, clr_0 +- JMP tail ++ BGE R5, R7, generic_large_body ++generic_tail: ++ JMP generic_small + 
+-clr_0: ++clr_33_64: ++ MOVV R0, (R4) ++ MOVV R0, 8(R4) ++ MOVV R0, 16(R4) ++ MOVV R0, 24(R4) ++ MOVV R0, -32(R6) ++ MOVV R0, -24(R6) ++ MOVV R0, -16(R6) ++ MOVV R0, -8(R6) + RET +-clr_1: +- MOVB R0, (R4) ++ ++clr_17_32: ++ MOVV R0, (R4) ++ MOVV R0, 8(R4) ++ MOVV R0, -16(R6) ++ MOVV R0, -8(R6) + RET +-clr_2: +- MOVH R0, (R4) ++clr_9_16: ++ MOVV R0, (R4) ++ MOVV R0, -8(R6) + RET +-clr_3: +- MOVH R0, (R4) +- MOVB R0, 2(R4) ++clr_8: ++ MOVV R0, (R4) + RET +-clr_4: ++clr_5_7: + MOVW R0, (R4) ++ MOVW R0, -4(R6) + RET +-clr_5through7: ++clr_4: + MOVW R0, (R4) +- MOVW R0, -4(R6) + RET +-clr_8: +- MOVV R0, (R4) ++clr_3: ++ MOVH R0, (R4) ++ MOVB R0, 2(R4) + RET +-clr_9through16: +- MOVV R0, (R4) +- MOVV R0, -8(R6) ++clr_2: ++ MOVH R0, (R4) + RET +-clr_17through32: +- MOVV R0, (R4) +- MOVV R0, 8(R4) +- MOVV R0, -16(R6) +- MOVV R0, -8(R6) ++clr_1: ++ MOVB R0, (R4) + RET +-clr_33through64: +- MOVV R0, (R4) +- MOVV R0, 8(R4) +- MOVV R0, 16(R4) +- MOVV R0, 24(R4) +- MOVV R0, -32(R6) +- MOVV R0, -24(R6) +- MOVV R0, -16(R6) +- MOVV R0, -8(R6) ++clr_0: + RET +-- +2.38.1 + diff --git a/0040-runtime-race-add-the-implementation-of-atomic.-Or-An.patch b/0040-runtime-race-add-the-implementation-of-atomic.-Or-An.patch new file mode 100644 index 0000000000000000000000000000000000000000..39a261b130f8233e6a4ef7081b817fd0c5160f66 --- /dev/null +++ b/0040-runtime-race-add-the-implementation-of-atomic.-Or-An.patch @@ -0,0 +1,75 @@ +From 88b165cf7d4cb6a77f47d3c291d3ee7e1f13695e Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Fri, 10 Jan 2025 10:31:47 +0800 +Subject: [PATCH 40/44] runtime/race: add the implementation of atomic.{Or,And} + on loong64 + +Change-Id: Ia4298a4d92fce210e3c743b2d5ce2b28b82d4971 +--- + src/runtime/race_loong64.s | 50 +++++++++++++++++++++++ + 2 files changed, 50 insertions(+) + +diff --git a/src/runtime/race_loong64.s b/src/runtime/race_loong64.s +index 04f264b21b..e6c11d44f7 100644 +--- a/src/runtime/race_loong64.s ++++ b/src/runtime/race_loong64.s +@@ -308,6 +308,56 @@ TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-24 + GO_ARGS + JMP sync∕atomic·AddInt64(SB) + ++// And ++TEXT sync∕atomic·AndInt32(SB), NOSPLIT, $0-20 ++ GO_ARGS ++ MOVV $__tsan_go_atomic32_fetch_and(SB), RCALL ++ JAL racecallatomic<>(SB) ++ RET ++ ++TEXT sync∕atomic·AndInt64(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ MOVV $__tsan_go_atomic64_fetch_and(SB), RCALL ++ JAL racecallatomic<>(SB) ++ RET ++ ++TEXT sync∕atomic·AndUint32(SB), NOSPLIT, $0-20 ++ GO_ARGS ++ JMP sync∕atomic·AndInt32(SB) ++ ++TEXT sync∕atomic·AndUint64(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ JMP sync∕atomic·AndInt64(SB) ++ ++TEXT sync∕atomic·AndUintptr(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ JMP sync∕atomic·AndInt64(SB) ++ ++// Or ++TEXT sync∕atomic·OrInt32(SB), NOSPLIT, $0-20 ++ GO_ARGS ++ MOVV $__tsan_go_atomic32_fetch_or(SB), RCALL ++ JAL racecallatomic<>(SB) ++ RET ++ ++TEXT sync∕atomic·OrInt64(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ MOVV $__tsan_go_atomic64_fetch_or(SB), RCALL ++ JAL racecallatomic<>(SB) ++ RET ++ ++TEXT sync∕atomic·OrUint32(SB), NOSPLIT, $0-20 ++ GO_ARGS ++ JMP sync∕atomic·OrInt32(SB) ++ ++TEXT sync∕atomic·OrUint64(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ JMP sync∕atomic·OrInt64(SB) ++ ++TEXT sync∕atomic·OrUintptr(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ JMP sync∕atomic·OrInt64(SB) ++ + // CompareAndSwap + TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-17 + GO_ARGS +-- +2.38.1 + diff --git a/0041-cmd-internal-obj-loong64-add-F-MAXA-MINA-.-S-D-instr.patch b/0041-cmd-internal-obj-loong64-add-F-MAXA-MINA-.-S-D-instr.patch new file mode 100644 index 
0000000000000000000000000000000000000000..847b4b1f3f22a9f0cb249f5e573e8f6c950ac84f --- /dev/null +++ b/0041-cmd-internal-obj-loong64-add-F-MAXA-MINA-.-S-D-instr.patch @@ -0,0 +1,107 @@ +From e652e32e37bfd898af333a32b73cfde6ab2116fa Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Mon, 30 Dec 2024 10:08:58 +0800 +Subject: [PATCH 41/44] cmd/internal/obj/loong64: add F{MAXA/MINA}.{S/D} + instructions + +Go asm syntax: + F{MAXA/MINA}{F/D} FK, FJ, FD + +Equivalent platform assembler syntax: + f{maxa/mina}.{s/d} fd, fj, fk + +Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html + +Change-Id: I6790657d2f36bdf5e6818b6c0aaa48117e782b8d +--- + src/cmd/asm/internal/asm/testdata/loong64enc1.s | 9 +++++++++ + src/cmd/internal/obj/loong64/a.out.go | 6 ++++++ + src/cmd/internal/obj/loong64/anames.go | 4 ++++ + src/cmd/internal/obj/loong64/asm.go | 12 ++++++++++++ + 4 files changed, 31 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index b40d86e596..32d3b3f0a2 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -346,6 +346,15 @@ lable2: + FTINTVF F0, F1 // 01241b01 + FTINTVD F0, F1 // 01281b01 + ++ FMAXAF F4, F5, F6 // a6900c01 ++ FMAXAF F4, F5 // a5900c01 ++ FMAXAD F4, F5, F6 // a6100d01 ++ FMAXAD F4, F5 // a5100d01 ++ FMINAF F4, F5, F6 // a6900e01 ++ FMINAF F4, F5 // a5900e01 ++ FMINAD F4, F5, F6 // a6100f01 ++ FMINAD F4, F5 // a5100f01 ++ + FTINTRMWF F0, F2 // 02041a01 + FTINTRMWD F0, F2 // 02081a01 + FTINTRMVF F0, F2 // 02241a01 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index f2d4c41d68..857ea649e7 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -688,6 +688,12 @@ const ( + AFMAXF + AFMAXD + ++ // 3.2.1.4 ++ AFMAXAF ++ AFMAXAD ++ AFMINAF ++ AFMINAD ++ + // 3.2.1.7 + AFCOPYSGF + AFCOPYSGD +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index aee0da0a6e..d2acdf7042 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -223,6 +223,10 @@ var Anames = []string{ + "FMIND", + "FMAXF", + "FMAXD", ++ "FMAXAF", ++ "FMAXAD", ++ "FMINAF", ++ "FMINAD", + "FCOPYSGF", + "FCOPYSGD", + "FSCALEBF", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 2480cf9382..31f5376f8e 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1347,6 +1347,10 @@ func buildop(ctxt *obj.Link) { + opset(AFCOPYSGD, r0) + opset(AFSCALEBF, r0) + opset(AFSCALEBD, r0) ++ opset(AFMAXAF, r0) ++ opset(AFMAXAD, r0) ++ opset(AFMINAF, r0) ++ opset(AFMINAD, r0) + + case AFMADDF: + opset(AFMADDD, r0) +@@ -2811,6 +2815,14 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0x211 << 15 // fmax.s + case AFMAXD: + return 0x212 << 15 // fmax.d ++ case AFMAXAF: ++ return 0x219 << 15 // fmaxa.s ++ case AFMAXAD: ++ return 0x21a << 15 // fmaxa.d ++ case AFMINAF: ++ return 0x21d << 15 // fmina.s ++ case AFMINAD: ++ return 0x21e << 15 // fmina.d + case AFSCALEBF: + return 0x221 << 15 // fscaleb.s + case AFSCALEBD: +-- +2.38.1 + diff --git a/0042-math-implement-func-archExp-and-archExp2-in-assembly.patch b/0042-math-implement-func-archExp-and-archExp2-in-assembly.patch new file mode 100644 index 0000000000000000000000000000000000000000..a9303c24b03f58fa52421ec53e6a08ff1d6c7e5f --- /dev/null +++ 
b/0042-math-implement-func-archExp-and-archExp2-in-assembly.patch @@ -0,0 +1,358 @@ +From f463c4a1db9ac0e4be9d67bc53f4ddb8515232d3 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Tue, 31 Dec 2024 21:02:47 +0800 +Subject: [PATCH 42/44] math: implement func archExp and archExp2 in assembly + on loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: math +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Exp 26.30n ± 0% 12.93n ± 0% -50.85% (p=0.000 n=10) +ExpGo 26.86n ± 0% 26.92n ± 0% +0.22% (p=0.000 n=10) +Expm1 16.76n ± 0% 16.75n ± 0% ~ (p=0.060 n=10) +Exp2 23.05n ± 0% 12.12n ± 0% -47.42% (p=0.000 n=10) +Exp2Go 23.41n ± 0% 23.47n ± 0% +0.28% (p=0.000 n=10) +geomean 22.97n 17.54n -23.64% + +goos: linux +goarch: loong64 +pkg: math/cmplx +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Exp 51.32n ± 0% 35.41n ± 0% -30.99% (p=0.000 n=10) + +goos: linux +goarch: loong64 +pkg: math +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Exp 50.27n ± 0% 48.75n ± 1% -3.01% (p=0.000 n=10) +ExpGo 50.72n ± 0% 50.44n ± 0% -0.55% (p=0.000 n=10) +Expm1 28.40n ± 0% 28.32n ± 0% ~ (p=0.360 n=10) +Exp2 50.09n ± 0% 21.49n ± 1% -57.10% (p=0.000 n=10) +Exp2Go 50.05n ± 0% 49.69n ± 0% -0.72% (p=0.000 n=10) +geomean 44.85n 37.52n -16.35% + +goos: linux +goarch: loong64 +pkg: math/cmplx +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Exp 88.56n ± 0% 67.29n ± 0% -24.03% (p=0.000 n=10) + +Change-Id: I89e456d26fc075d83335ee4a31227d2aface5714 +--- + src/math/exp2_asm.go | 2 +- + src/math/exp2_noasm.go | 2 +- + src/math/exp_asm.go | 2 +- + src/math/exp_loong64.s | 236 +++++++++++++++++++++++++++++++++++++++++ + src/math/exp_noasm.go | 2 +- + 5 files changed, 240 insertions(+), 4 deletions(-) + create mode 100644 src/math/exp_loong64.s + +diff --git a/src/math/exp2_asm.go b/src/math/exp2_asm.go +index c26b2c3fab..1e78759374 100644 +--- a/src/math/exp2_asm.go ++++ b/src/math/exp2_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build arm64 ++//go:build arm64 || loong64 + + package math + +diff --git a/src/math/exp2_noasm.go b/src/math/exp2_noasm.go +index c2b409329f..847138b622 100644 +--- a/src/math/exp2_noasm.go ++++ b/src/math/exp2_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !arm64 ++//go:build !arm64 && !loong64 + + package math + +diff --git a/src/math/exp_asm.go b/src/math/exp_asm.go +index 424442845b..125529fca3 100644 +--- a/src/math/exp_asm.go ++++ b/src/math/exp_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build amd64 || arm64 || s390x ++//go:build amd64 || arm64 || loong64 || s390x + + package math + +diff --git a/src/math/exp_loong64.s b/src/math/exp_loong64.s +new file mode 100644 +index 0000000000..3d24214289 +--- /dev/null ++++ b/src/math/exp_loong64.s +@@ -0,0 +1,236 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
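++//
++// Both archExp and archExp2 below follow the same shape: filter out
++// NaN, overflow and underflow (and, for Exp, near-zero inputs),
++// reduce the argument to a small r, evaluate the P1..P5 polynomial
++// in t = r*r to get a correction c, reconstruct y, and scale the
++// result by 2**k via an inlined Ldexp.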
++ ++#include "textflag.h" ++ ++#define NearZero 0x3e30000000000000 // 2**-28 ++#define PosInf 0x7ff0000000000000 ++#define FracMask 0x000fffffffffffff ++#define C1 0x3cb0000000000000 // 2**-52 ++ ++DATA exprodata<>+0(SB)/8, $0.0 ++DATA exprodata<>+8(SB)/8, $0.5 ++DATA exprodata<>+16(SB)/8, $1.0 ++DATA exprodata<>+24(SB)/8, $2.0 ++DATA exprodata<>+32(SB)/8, $6.93147180369123816490e-01 // Ln2Hi ++DATA exprodata<>+40(SB)/8, $1.90821492927058770002e-10 // Ln2Lo ++DATA exprodata<>+48(SB)/8, $1.44269504088896338700e+00 // Log2e ++DATA exprodata<>+56(SB)/8, $7.09782712893383973096e+02 // Overflow ++DATA exprodata<>+64(SB)/8, $-7.45133219101941108420e+02 // Underflow ++DATA exprodata<>+72(SB)/8, $1.0239999999999999e+03 // Overflow2 ++DATA exprodata<>+80(SB)/8, $-1.0740e+03 // Underflow2 ++DATA exprodata<>+88(SB)/8, $3.7252902984619141e-09 // NearZero ++GLOBL exprodata<>+0(SB), NOPTR|RODATA, $96 ++ ++DATA expmultirodata<>+0(SB)/8, $1.66666666666666657415e-01 // P1 ++DATA expmultirodata<>+8(SB)/8, $-2.77777777770155933842e-03 // P2 ++DATA expmultirodata<>+16(SB)/8, $6.61375632143793436117e-05 // P3 ++DATA expmultirodata<>+24(SB)/8, $-1.65339022054652515390e-06 // P4 ++DATA expmultirodata<>+32(SB)/8, $4.13813679705723846039e-08 // P5 ++GLOBL expmultirodata<>+0(SB), NOPTR|RODATA, $40 ++ ++// Exp returns e**x, the base-e exponential of x. ++// This is an assembly implementation of the method used for function Exp in file exp.go. ++// ++// func Exp(x float64) float64 ++TEXT ·archExp(SB),$0-16 ++ MOVD x+0(FP), F0 // F0 = x ++ ++ MOVV $exprodata<>+0(SB), R10 ++ MOVD 56(R10), F1 // Overflow ++ MOVD 64(R10), F2 // Underflow ++ MOVD 88(R10), F3 // NearZero ++ MOVD 16(R10), F17 // 1.0 ++ ++ CMPEQD F0, F0, FCC0 ++ BFPF isNaN // x = NaN, return NaN ++ ++ CMPGTD F0, F1, FCC0 ++ BFPT overflow // x > Overflow, return PosInf ++ ++ CMPGTD F2, F0, FCC0 ++ BFPT underflow // x < Underflow, return 0 ++ ++ ABSD F0, F5 ++ CMPGTD F3, F5, FCC0 ++ BFPT nearzero // fabs(x) < NearZero, return 1 + x ++ ++ // argument reduction, x = k*ln2 + r, |r| <= 0.5*ln2 ++ // computed as r = hi - lo for extra precision. 
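++	// With k = int(Log2e*x ± 0.5) (rounded to nearest), the code
++	// below computes hi = x - float64(k)*Ln2Hi and lo = float64(k)*Ln2Lo,
++	// so r = hi - lo and e**x = 2**k * e**r; the 2**k factor is
++	// applied at the end through the inlined Ldexp.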
++ MOVD 0(R10), F5 // 0.0 ++ MOVD 8(R10), F3 // 0.5 ++ MOVD 48(R10), F2 // Log2e ++ CMPGTD F0, F5, FCC0 ++ BFPT add // x > 0 ++sub: ++ FMSUBD F3, F2, F0, F3 // Log2e*x - 0.5 ++ JMP 2(PC) ++add: ++ FMADDD F3, F2, F0, F3 // Log2e*x + 0.5 ++ ++ FTINTRZVD F3, F4 // float64 -> int64 ++ MOVV F4, R5 // R5 = int(k) ++ FFINTDV F4, F3 // int64 -> float64 ++ ++ MOVD 32(R10), F4 // F4 = Ln2Hi ++ MOVD 40(R10), F5 // F5 = Ln2Lo ++ FNMSUBD F0, F3, F4, F4 // F4 = hi = x - float64(int(k))*Ln2Hi ++ MULD F3, F5, F5 // F5 = lo = float64(int(k)) * Ln2Lo ++ SUBD F5, F4, F6 // F6 = r = hi - lo ++ MULD F6, F6, F7 // F7 = t = r * r ++ ++ // compute c ++ MOVV $expmultirodata<>+0(SB), R11 ++ MOVD 32(R11), F8 // F8 = P5 ++ MOVD 24(R11), F9 // F9 = P4 ++ FMADDD F9, F8, F7, F13 // P4+t*P5 ++ MOVD 16(R11), F10 // F10 = P3 ++ FMADDD F10, F13, F7, F13 // P3+t*(P4+t*P5) ++ MOVD 8(R11), F11 // F11 = P2 ++ FMADDD F11, F13, F7, F13 // P2+t*(P3+t*(P4+t*P5)) ++ MOVD 0(R11), F12 // F12 = P1 ++ FMADDD F12, F13, F7, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5))) ++ FNMSUBD F6, F13, F7, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))) ++ ++ // compute y ++ MOVD 24(R10), F14 // F14 = 2.0 ++ SUBD F13, F14, F14 // F14 = 2 - c ++ MULD F6, F13, F15 // F15 = r*c ++ DIVD F14, F15, F15 // F15 = (r*c)/(2-c) ++ SUBD F15, F5, F15 // F15 = lo-(r*c)/(2-c) ++ SUBD F4, F15, F15 // F15 = (lo-(r*c)/(2-c))-hi ++ SUBD F15, F17, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi) ++ ++ // inline Ldexp(y, k), benefit: ++ // 1, no parameter pass overhead. ++ // 2, skip unnecessary checks for Inf/NaN/Zero ++ MOVV F16, R4 ++ MOVV $FracMask, R9 ++ AND R9, R4, R6 // fraction ++ SRLV $52, R4, R7 // exponent ++ ADDV R5, R7 // R5 = int(k) ++ MOVV $1, R12 ++ BGE R7, R12, normal ++ ADDV $52, R7 // denormal ++ MOVV $C1, R8 ++ MOVV R8, F17 // m = 2**-52 ++normal: ++ SLLV $52, R7 ++ OR R7, R6, R4 ++ MOVV R4, F0 ++ MULD F17, F0 // return m * x ++ MOVD F0, ret+8(FP) ++ RET ++nearzero: ++ ADDD F17, F0, F0 ++isNaN: ++ MOVD F0, ret+8(FP) ++ RET ++underflow: ++ MOVV R0, ret+8(FP) ++ RET ++overflow: ++ MOVV $PosInf, R4 ++ MOVV R4, ret+8(FP) ++ RET ++ ++ ++// Exp2 returns 2**x, the base-2 exponential of x. ++// This is an assembly implementation of the method used for function Exp2 in file exp.go. ++// ++// func Exp2(x float64) float64 ++TEXT ·archExp2(SB),$0-16 ++ MOVD x+0(FP), F0 // F0 = x ++ ++ MOVV $exprodata<>+0(SB), R10 ++ MOVD 72(R10), F1 // Overflow2 ++ MOVD 80(R10), F2 // Underflow2 ++ MOVD 88(R10), F3 // NearZero ++ ++ CMPEQD F0, F0, FCC0 ++ BFPF isNaN // x = NaN, return NaN ++ ++ CMPGTD F0, F1, FCC0 ++ BFPT overflow // x > Overflow, return PosInf ++ ++ CMPGTD F2, F0, FCC0 ++ BFPT underflow // x < Underflow, return 0 ++ ++ // argument reduction; x = r*lg(e) + k with |r| <= ln(2)/2 ++ // computed as r = hi - lo for extra precision. 
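++	// With k = int(x ± 0.5) and t = x - float64(k), the code below
++	// computes hi = t*Ln2Hi and lo = -t*Ln2Lo, so r = hi - lo ~= t*ln2
++	// and 2**x = 2**k * e**r, sharing the polynomial with archExp.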
++ MOVD 0(R10), F10 // 0.0 ++ MOVD 8(R10), F2 // 0.5 ++ CMPGTD F0, F10, FCC0 ++ BFPT add ++sub: ++ SUBD F2, F0, F3 // x - 0.5 ++ JMP 2(PC) ++add: ++ ADDD F2, F0, F3 // x + 0.5 ++ ++ FTINTRZVD F3, F4 // float64 -> int64 ++ MOVV F4, R5 // R5 = int(k) ++ FFINTDV F4, F3 // F3 = float64(int(k)) ++ ++ MOVD 32(R10), F4 // F4 = Ln2Hi ++ MOVD 40(R10), F5 // F5 = Ln2Lo ++ SUBD F3, F0, F3 // t = x - float64(int(k)) ++ MULD F3, F4 // F4 = hi = t * Ln2Hi ++ FNMSUBD F10, F3, F5, F5 // F5 = lo = -t * Ln2Lo ++ SUBD F5, F4, F6 // F6 = r = hi - lo ++ MULD F6, F6, F7 // F7 = t = r * r ++ ++ // compute c ++ MOVV $expmultirodata<>+0(SB), R11 ++ MOVD 32(R11), F8 // F8 = P5 ++ MOVD 24(R11), F9 // F9 = P4 ++ FMADDD F9, F8, F7, F13 // P4+t*P5 ++ MOVD 16(R11), F10 // F10 = P3 ++ FMADDD F10, F13, F7, F13 // P3+t*(P4+t*P5) ++ MOVD 8(R11), F11 // F11 = P2 ++ FMADDD F11, F13, F7, F13 // P2+t*(P3+t*(P4+t*P5)) ++ MOVD 0(R11), F12 // F12 = P1 ++ FMADDD F12, F13, F7, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5))) ++ FNMSUBD F6, F13, F7, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))) ++ ++ // compute y ++ MOVD 24(R10), F14 // F14 = 2.0 ++ SUBD F13, F14, F14 // F14 = 2 - c ++ MULD F6, F13, F15 // F15 = r*c ++ DIVD F14, F15 // F15 = (r*c)/(2-c) ++ ++ MOVD 16(R10), F17 // 1.0 ++ SUBD F15, F5, F15 // lo-(r*c)/(2-c) ++ SUBD F4, F15, F15 // (lo-(r*c)/(2-c))-hi ++ SUBD F15, F17, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi) ++ ++ // inline Ldexp(y, k), benefit: ++ // 1, no parameter pass overhead. ++ // 2, skip unnecessary checks for Inf/NaN/Zero ++ MOVV F16, R4 ++ MOVV $FracMask, R9 ++ SRLV $52, R4, R7 // exponent ++ AND R9, R4, R6 // fraction ++ ADDV R5, R7 // R5 = int(k) ++ MOVV $1, R12 ++ BGE R7, R12, normal ++ ++ ADDV $52, R7 // denormal ++ MOVV $C1, R8 ++ MOVV R8, F17 // m = 2**-52 ++normal: ++ SLLV $52, R7 ++ OR R7, R6, R4 ++ MOVV R4, F0 ++ MULD F17, F0 // return m * x ++isNaN: ++ MOVD F0, ret+8(FP) ++ RET ++underflow: ++ MOVV R0, ret+8(FP) ++ RET ++overflow: ++ MOVV $PosInf, R4 ++ MOVV R4, ret+8(FP) ++ RET +diff --git a/src/math/exp_noasm.go b/src/math/exp_noasm.go +index bd3f02412a..bf5e84b736 100644 +--- a/src/math/exp_noasm.go ++++ b/src/math/exp_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
+ +-//go:build !amd64 && !arm64 && !s390x ++//go:build !amd64 && !arm64 && !loong64 && !s390x + + package math + +-- +2.38.1 + diff --git a/0043-math-implement-func-archLog-in-assembly-on-loong64.patch b/0043-math-implement-func-archLog-in-assembly-on-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..f01c831480e7521902cef28d0a507e0a1dfbf614 --- /dev/null +++ b/0043-math-implement-func-archLog-in-assembly-on-loong64.patch @@ -0,0 +1,217 @@ +From 066bd3bf1a03e21cc27b463164461a56ce107d59 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Mon, 6 Jan 2025 15:40:06 +0800 +Subject: [PATCH 43/44] math: implement func archLog in assembly on loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: math +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Log 18.87n ± 0% 12.85n ± 0% -31.90% (p=0.000 n=10) +Logb 5.203n ± 0% 5.604n ± 0% +7.71% (p=0.000 n=10) +Log1p 16.78n ± 0% 16.78n ± 0% ~ (p=0.450 n=10) +Log10 20.47n ± 0% 13.59n ± 0% -33.61% (p=0.000 n=10) +Log2 6.804n ± 0% 8.805n ± 0% +29.40% (p=0.000 n=10) +geomean 11.81n 10.77n -8.82% + +goos: linux +goarch: loong64 +pkg: math +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Log 28.28n ± 0% 24.95n ± 1% -11.78% (p=0.000 n=10) +Logb 7.609n ± 0% 7.207n ± 0% -5.29% (p=0.000 n=10) +Log1p 27.27n ± 0% 27.18n ± 1% ~ (p=0.078 n=10) +Log10 29.56n ± 0% 26.56n ± 0% -10.16% (p=0.000 n=10) +Log2 11.43n ± 0% 10.41n ± 0% -8.92% (p=0.000 n=10) +geomean 18.17n 16.83n -7.38% + +Change-Id: I42a17280874c28b31a3b5c75fc19ddac90c92f32 +--- + src/math/log_asm.go | 2 +- + src/math/log_loong64.s | 140 +++++++++++++++++++++++++++++++++++++++++ + src/math/log_stub.go | 2 +- + 3 files changed, 142 insertions(+), 2 deletions(-) + create mode 100644 src/math/log_loong64.s + +diff --git a/src/math/log_asm.go b/src/math/log_asm.go +index 848cce13b2..82372d1e64 100644 +--- a/src/math/log_asm.go ++++ b/src/math/log_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build amd64 || s390x ++//go:build amd64 || loong64 || s390x + + package math + +diff --git a/src/math/log_loong64.s b/src/math/log_loong64.s +new file mode 100644 +index 0000000000..534295cb53 +--- /dev/null ++++ b/src/math/log_loong64.s +@@ -0,0 +1,140 @@ ++// Copyright 2025 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
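++//
++// The method matches Log in log.go: split x = 2**k * f1 with
++// sqrt(2)/2 <= f1 < sqrt(2), set f = f1 - 1 and s = f/(2+f); then
++// log(x) = k*ln2 + f - 0.5*f*f + s*(0.5*f*f + R(s*s)),
++// where R is the L1..L7 polynomial evaluated below.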
++ ++#include "textflag.h" ++ ++DATA logrodata<>+0(SB)/8, $0.5 ++DATA logrodata<>+8(SB)/8, $1.0 ++DATA logrodata<>+16(SB)/8, $2.0 ++DATA logrodata<>+24(SB)/8, $7.07106781186547524401e-01 // sqrt(2)/2 ++DATA logrodata<>+32(SB)/8, $6.93147180369123816490e-01 // Ln2Hi ++DATA logrodata<>+40(SB)/8, $1.90821492927058770002e-10 // Ln2Lo ++DATA logrodata<>+48(SB)/8, $6.666666666666735130e-01 // L1 ++DATA logrodata<>+56(SB)/8, $3.999999999940941908e-01 // L2 ++DATA logrodata<>+64(SB)/8, $2.857142874366239149e-01 // L3 ++DATA logrodata<>+72(SB)/8, $2.222219843214978396e-01 // L4 ++DATA logrodata<>+80(SB)/8, $1.818357216161805012e-01 // L5 ++DATA logrodata<>+88(SB)/8, $1.531383769920937332e-01 // L6 ++DATA logrodata<>+96(SB)/8, $1.479819860511658591e-01 // L7 ++DATA logrodata<>+104(SB)/8, $2.2250738585072014e-308 // 2**-1022 ++GLOBL logrodata<>+0(SB), NOPTR|RODATA, $112 ++ ++#define NaN 0x7FF8000000000001 ++#define NegInf 0xFFF0000000000000 ++#define PosInf 0x7FF0000000000000 ++ ++// func Log(x float64) float64 ++TEXT ·archLog(SB),NOSPLIT,$0 ++ // test bits for special cases ++ MOVD x+0(FP), F0 ++ MOVV x+0(FP), R4 ++ MOVV $logrodata<>+0(SB), R10 ++ FCLASSD F0, F4 ++ MOVV F4, R5 ++ AND $67, R5, R6 // NaN or +Inf ++ AND $544, R5, R7 // +0 or -0 ++ AND $28, R5, R8 // <0 ++ BNE R6, R0, isInfOrNaN ++ BNE R7, R0, isZero ++ BNE R8, R0, isNegative ++ ++ // reduce ++ // f1, ki := Frexp(x) FIXME ++ MOVD 104(R10), F4 ++ ABSD F0, F1 ++ CMPGED F1, F4, FCC0 ++ BFPT direct_return ++ MOVV $0x10000000000000, R5 // 1 << 52 ++ MULV R4, R5, R4 // R4 = y ++ MOVV $-52, R15 // R15 = ki (exp) ++ JMP 2(PC) ++direct_return: ++ MOVV $0, R15 // R15 = ki (exp) F0 = y ++ ++ MOVV $0x000FFFFFFFFFFFFF, R5 ++ AND R4, R5, R7 // x &^= mask << shift ++ MOVV $0x3FE0000000000000, R6 // (-1 + bias) << shift ++ OR R6, R7 // x |= (-1 + bias) << shift ++ MOVV R7, F2 // F2 = f1 ++ SRLV $52, R4 // x >> shift ++ AND $0x7FF, R4 // (x>>shift)&mask ++ SUBV $0x3FE, R4 // int((x>>shift)&mask) - bias + 1 ++ ADDV R4, R15, R4 // R4 = exp ++ ++ // if f1 < math.Sqrt2/2 { k -= 1; f1 *= 2 } ++ MOVD 0(R10), F10 // 0.5 ++ MOVD 8(R10), F3 // 1.0 ++ MOVD 16(R10), F4 // 2.0 ++ MOVD 24(R10), F0 // sqrt(2)/2 ++ CMPGED F2, F0, FCC0 // if f1 >= Sqrt2/2 ++ BFPT next ++ MULD F4, F2, F2 // f1 *= 2 ++ SUBV $1, R4, R4 ++next: ++ MOVV R4, F1 // k-- ++ FFINTDV F1, F1 // F1 = k ++ // f := f1 - 1 ++ SUBD F3, F2, F2 ++ ++ // compute ++ MOVD 96(R10), F17 // L7 ++ MOVD 80(R10), F15 // L5 ++ MOVD 64(R10), F13 // L3 ++ MOVD 48(R10), F11 // L1 ++ ADDD F4, F2, F3 // 2 + f ++ DIVD F3, F2, F4 // s := f / (2 + f) ++ MULD F4, F4, F5 // s2 := s * s ++ MULD F5, F5, F6 // s4 := s2 * s2 ++ // t1 := s2 * (L1 + s4*(L3+s4*(L5+s4*L7))) ++ MULD F17, F6, F7 // s4*L7 ++ ADDD F15, F7 // L5+s4*L7 ++ MULD F6, F7 // s4*(L5+s4*L7) ++ ADDD F13, F7 // L3+s4*(L5+s4*L7) ++ MULD F6, F7 // s4*(L3+s4*(L5+s4*L7)) ++ ADDD F11, F7 // L1 + s4*(L3+s4*(L5+s4*L7)) ++ MULD F5, F7 // s2 * (L1 + s4*(L3+s4*(L5+s4*L7))) ++ ++ MOVD 88(R10), F16 // L6 ++ MOVD 72(R10), F14 // L4 ++ MOVD 56(R10), F12 // L2 ++ // t2 := s4 * (L2 + s4*(L4+s4*L6)) ++ MULD F6, F16, F8 // s4*L6 ++ ADDD F14, F8 // L4+s4*L6 ++ MULD F6, F8 // s4*(L4+s4*L6) ++ ADDD F12, F8 // L2 + s4*(L4+s4*L6) ++ MULD F6, F8 // s4 * (L2 + s4*(L4+s4*L6)) ++ ++ // R := t1 + t2 ++ ADDD F7, F8 ++ ++ // hfsq := 0.5 * f * f ++ MULD F2, F2, F12 // f * f ++ MULD F10, F12, F9 // 0.5 * f * f ++ ++ // return k*Ln2Hi - ((hfsq - (s*(hfsq+R) + k*Ln2Lo)) - f) ++ MOVD 40(R10), F19 // Ln2Lo ++ MOVD 32(R10), F18 // Ln2Hi ++ // f9=hfsq, f1=k, f4=s, f8=R, f2=f ++ ADDD F9, F8, F10 // 
F10 = hfsq+R ++ MULD F1, F19, F11 // F11 = k*Ln2Lo ++ MULD F10, F4, F12 // F12 = s*(hfsq+R) ++ MULD F1, F18, F15 // F15 = k*Ln2Hi ++ ADDD F12, F11, F13 // F13 = s*(hfsq+R) + k*Ln2Lo ++ SUBD F13, F9, F14 // F14 = hfsq - (s*(hfsq+R) + k*Ln2Lo) ++ SUBD F2, F14, F14 // F14 = (hfsq - (s*(hfsq+R) + k*Ln2Lo)) - f ++ SUBD F14, F15, F0 ++ MOVD F0, ret+8(FP) ++ RET ++ ++isInfOrNaN: ++ MOVV R4, ret+8(FP) // +Inf or NaN, return x ++ RET ++isNegative: ++ MOVV $NaN, R4 ++ MOVV R4, ret+8(FP) // return NaN ++ RET ++isZero: ++ MOVV $NegInf, R4 ++ MOVV R4, ret+8(FP) // return -Inf ++ RET +diff --git a/src/math/log_stub.go b/src/math/log_stub.go +index d35992bf37..1dd4058435 100644 +--- a/src/math/log_stub.go ++++ b/src/math/log_stub.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !s390x ++//go:build !amd64 && !loong64 && !s390x + + package math + +-- +2.38.1 + diff --git a/0044-cmd-go-internal-work-allow-a-bunch-of-loong64-specif.patch b/0044-cmd-go-internal-work-allow-a-bunch-of-loong64-specif.patch new file mode 100644 index 0000000000000000000000000000000000000000..40422df348c4672d8c6836dd29ad3fe19709142c --- /dev/null +++ b/0044-cmd-go-internal-work-allow-a-bunch-of-loong64-specif.patch @@ -0,0 +1,126 @@ +From fc3470aafbb3facc619e4813eaf0ea10d5c7eda9 Mon Sep 17 00:00:00 2001 +From: WANG Xuerui +Date: Sun, 9 Feb 2025 18:57:49 +0800 +Subject: [PATCH 44/44] cmd/go/internal/work: allow a bunch of loong64-specific + flags + +Recognize and allow all LoongArch-specific CFLAGS as standardized +in the LoongArch Toolchain Conventions v1.1, and implemented in current +versions of GCC and Clang, to enable advanced cgo use cases on loong64. +These flags are also allowed for linker invocations in case of possible +LTO. + +See: https://github.com/loongson/la-toolchain-conventions/blob/releases/v1.1/LoongArch-toolchain-conventions-EN.adoc#list + +While at it, also add support for -mtls-dialect as some C programs +may benefit performance-wise from the optional TLSDESC usage. This flag +is not specific to loong64 though; it is available for amd64, arm, +arm64, loong64, riscv64 and x86. + +Fixes #71597. 
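+
+For example, after this change a cgo build on loong64 can pass the
+newly allowed options straight through (an illustrative invocation,
+not taken from the change itself):
+
+    CGO_CFLAGS="-mlasx -mtls-dialect=desc" go build ./...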
+ +Change-Id: I35d2507edb71fa324ae429a3ae3c739644a9cac1 +--- + src/cmd/go/internal/work/security.go | 13 ++++++++-- + src/cmd/go/internal/work/security_test.go | 31 +++++++++++++++++++++++ + 2 files changed, 42 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/go/internal/work/security.go b/src/cmd/go/internal/work/security.go +index 50bfd0ab70..c3d62ddc23 100644 +--- a/src/cmd/go/internal/work/security.go ++++ b/src/cmd/go/internal/work/security.go +@@ -96,17 +96,21 @@ var validCompilerFlags = []*lazyregexp.Regexp{ + re(`-g([^@\-].*)?`), + re(`-m32`), + re(`-m64`), +- re(`-m(abi|arch|cpu|fpu|tune)=([^@\-].*)`), ++ re(`-m(abi|arch|cpu|fpu|simd|tls-dialect|tune)=([^@\-].*)`), + re(`-m(no-)?v?aes`), + re(`-marm`), + re(`-m(no-)?avx[0-9a-z]*`), + re(`-mcmodel=[0-9a-z-]+`), + re(`-mfloat-abi=([^@\-].*)`), ++ re(`-m(soft|single|double)-float`), + re(`-mfpmath=[0-9a-z,+]*`), + re(`-m(no-)?avx[0-9a-z.]*`), + re(`-m(no-)?ms-bitfields`), + re(`-m(no-)?stack-(.+)`), + re(`-mmacosx-(.+)`), ++ re(`-m(no-)?relax`), ++ re(`-m(no-)?strict-align`), ++ re(`-m(no-)?(lsx|lasx|frecipe|div32|lam-bh|lamcas|ld-seq-sa)`), + re(`-mios-simulator-version-min=(.+)`), + re(`-miphoneos-version-min=(.+)`), + re(`-mlarge-data-threshold=[0-9]+`), +@@ -166,8 +170,13 @@ var validLinkerFlags = []*lazyregexp.Regexp{ + re(`-flat_namespace`), + re(`-g([^@\-].*)?`), + re(`-headerpad_max_install_names`), +- re(`-m(abi|arch|cpu|fpu|tune)=([^@\-].*)`), ++ re(`-m(abi|arch|cpu|fpu|simd|tls-dialect|tune)=([^@\-].*)`), ++ re(`-mcmodel=[0-9a-z-]+`), + re(`-mfloat-abi=([^@\-].*)`), ++ re(`-m(soft|single|double)-float`), ++ re(`-m(no-)?relax`), ++ re(`-m(no-)?strict-align`), ++ re(`-m(no-)?(lsx|lasx|frecipe|div32|lam-bh|lamcas|ld-seq-sa)`), + re(`-mmacosx-(.+)`), + re(`-mios-simulator-version-min=(.+)`), + re(`-miphoneos-version-min=(.+)`), +diff --git a/src/cmd/go/internal/work/security_test.go b/src/cmd/go/internal/work/security_test.go +index 35af621764..48f98100a5 100644 +--- a/src/cmd/go/internal/work/security_test.go ++++ b/src/cmd/go/internal/work/security_test.go +@@ -50,10 +50,35 @@ var goodCompilerFlags = [][]string{ + {"-ftls-model=local-dynamic"}, + {"-g"}, + {"-ggdb"}, ++ {"-mabi=lp64d"}, + {"-march=souza"}, + {"-mcmodel=medium"}, + {"-mcpu=123"}, + {"-mfpu=123"}, ++ {"-mtls-dialect=gnu"}, ++ {"-mtls-dialect=gnu2"}, ++ {"-mtls-dialect=trad"}, ++ {"-mtls-dialect=desc"}, ++ {"-mtls-dialect=xyz"}, ++ {"-msimd=lasx"}, ++ {"-msimd=xyz"}, ++ {"-mdouble-float"}, ++ {"-mrelax"}, ++ {"-mstrict-align"}, ++ {"-mlsx"}, ++ {"-mlasx"}, ++ {"-mfrecipe"}, ++ {"-mlam-bh"}, ++ {"-mlamcas"}, ++ {"-mld-seq-sa"}, ++ {"-mno-relax"}, ++ {"-mno-strict-align"}, ++ {"-mno-lsx"}, ++ {"-mno-lasx"}, ++ {"-mno-frecipe"}, ++ {"-mno-lam-bh"}, ++ {"-mno-lamcas"}, ++ {"-mno-ld-seq-sa"}, + {"-mlarge-data-threshold=16"}, + {"-mtune=happybirthday"}, + {"-mstack-overflow"}, +@@ -96,7 +121,13 @@ var badCompilerFlags = [][]string{ + {"-march=@dawn"}, + {"-march=-dawn"}, + {"-mcmodel=@model"}, ++ {"-mfpu=@0"}, ++ {"-mfpu=-0"}, + {"-mlarge-data-threshold=@12"}, ++ {"-mtls-dialect=@gnu"}, ++ {"-mtls-dialect=-gnu"}, ++ {"-msimd=@none"}, ++ {"-msimd=-none"}, + {"-std=@c99"}, + {"-std=-c99"}, + {"-x@c"}, +-- +2.38.1 + diff --git a/golang.spec b/golang.spec index b3d2c41ba6c04a2bcbfae5b0e5decfe1743793d0..7bc84e369af3f698df833d4a8800ddbab3ab4497 100644 --- a/golang.spec +++ b/golang.spec @@ -1,4 +1,4 @@ -%define anolis_release 1 +%define anolis_release 2 # Disable debuginfo packages %global debug_package %{nil} @@ -42,7 +42,11 @@ %endif # Build golang shared objects 
for stdlib +%ifarch loongarch64 +%bcond_with shared +%else %bcond_without shared +%endif # Pre build std lib with -race enabled # Disabled due to 1.20 new cache usage, see 1.20 upstream release notes @@ -73,8 +77,50 @@ Source0: https://go.dev/dl/go%{go_api}%{?go_patch:.%{go_patch}}.src.tar.g # make possible to override default traceback level at build time by setting build tag rpm_crashtraceback Source1: anolis.go -# Exclude for temporary -ExcludeArch: loongarch64 +Patch1: 0001-cmd-link-internal-add-support-for-internal-linking-o.patch +Patch2: 0002-cmd-dist-internal-platform-enable-internal-linking-f.patch +Patch3: 0003-cmd-runtime-enable-race-detector-on-loong64.patch +Patch4: 0004-runtime-delete-on-register-ABI-fallback-path-for-rac.patch +Patch5: 0005-cmd-internal-obj-loong64-remove-unused-register-alia.patch +Patch6: 0006-internal-bytealg-optimize-IndexByte-and-IndexByteStr.patch +Patch7: 0007-internal-bytealg-optimize-memequal-and-memequal_varl.patch +Patch8: 0008-internal-bytealg-optimize-Index-and-IndexString-func.patch +Patch9: 0009-internal-bytealg-optimize-Count-and-CountString-func.patch +Patch10: 0010-internal-bytealg-adjust-the-format-of-assembly-files.patch +Patch11: 0011-cmd-internal-obj-loong64-optimize-immediate-loading.patch +Patch12: 0012-math-big-optimize-addVV-function-for-loong64.patch +Patch13: 0013-math-big-optimize-addVW-function-for-loong64.patch +Patch14: 0014-math-big-optimize-subVV-function-for-loong64.patch +Patch15: 0015-math-big-optimize-subVW-function-for-loong64.patch +Patch16: 0016-math-big-optimize-shlVU-function-for-loong64.patch +Patch17: 0017-math-big-optimize-shrVU-function-for-loong64.patch +Patch18: 0018-math-big-optimize-mulAddVWW-function-for-loong64.patch +Patch19: 0019-math-big-optimize-addMulVVW-function-for-loong64.patch +Patch20: 0020-cmd-compile-fold-constant-shift-with-extension-on-lo.patch +Patch21: 0021-test-codegen-fix-the-matching-instructions-inside-pl.patch +Patch22: 0022-cmd-compile-optimize-shifts-of-int32-and-uint32-on-l.patch +Patch23: 0023-cmd-compile-simplify-bounded-shift-on-loong64.patch +Patch24: 0024-runtime-use-ABIInternal-on-syscall-and-other-sys.stu.patch +Patch25: 0025-runtime-use-correct-memory-barrier-in-exitThread-fun.patch +Patch26: 0026-cmd-internal-obj-loong64-add-V-XV-SEQI-V-XV-.-AND-OR.patch +Patch27: 0027-cmd-internal-obj-loong64-add-V-XV-ADD-SUB-.-B-H-W-D-.patch +Patch28: 0028-cmd-internal-obj-loong64-add-V-XV-ILV-L-H-.-B-H-W-D-.patch +Patch29: 0029-cmd-internal-obj-loong64-add-V-XV-SLL-SRL-SRA-ROTR-I.patch +Patch30: 0030-cmd-internal-obj-loong64-add-V-XV-FSQRT-FRECIP-FRSQR.patch +Patch31: 0031-cmd-internal-obj-loong64-add-V-XV-NEG-B-H-W-V-instru.patch +Patch32: 0032-cmd-internal-obj-loong64-add-V-XV-MUL-B-H-W-V-and-V-.patch +Patch33: 0033-cmd-internal-obj-loong64-add-V-XV-DIV-B-H-W-V-U-and-.patch +Patch34: 0034-cmd-internal-obj-loong64-add-V-XV-BITCLR-BITSET-BITR.patch +Patch35: 0035-crypto-chacha20-add-loong64-SIMD-implementation.patch +Patch36: 0036-internal-bytealg-optimize-Count-String-in-loong64.patch +Patch37: 0037-cmd-internal-obj-cmd-asm-reclassify-32-bit-immediate.patch +Patch38: 0038-crypto-internal-poly1305-implement-function-update-i.patch +Patch39: 0039-runtime-optimize-the-implementation-of-memclrNoHeapP.patch +Patch40: 0040-runtime-race-add-the-implementation-of-atomic.-Or-An.patch +Patch41: 0041-cmd-internal-obj-loong64-add-F-MAXA-MINA-.-S-D-instr.patch +Patch42: 0042-math-implement-func-archExp-and-archExp2-in-assembly.patch +Patch43: 
0043-math-implement-func-archLog-in-assembly-on-loong64.patch +Patch44: 0044-cmd-go-internal-work-allow-a-bunch-of-loong64-specif.patch # The compiler is written in Go. Needs go(1.4+) compiler for build. %if %{with bootstrap} @@ -545,6 +591,13 @@ fi %files docs -f go-docs.list %changelog +* Mon Feb 24 2025 limeidan - 1.24.0-2 +- add internal linker support on loong64 +- optimize the internal/bytealg package on loong64 +- optimize the math/big package on loong64 +- add new instructions support on loong64 +- optimize memory operation function of runtime on loong64 + * Tue Feb 18 2025 gaochang - 1.24.0-1 - update to 1.24.0 @@ -554,7 +607,7 @@ fi * Wed Jul 10 2024 yangxinyu - 1.21.11-1 - update to 1.21.11 fix cve-2024-24789 -* Thu Mon 13 2024 chenguoqi - 1.21.10-2 +* Thu Jun 13 2024 chenguoqi - 1.21.10-2 - add buildmode={plugin,shared} support on linux/loong64 - asan and msan support on linux/loong64 - loong64 disassembler support diff --git a/race_linux_loong64.syso b/race_linux_loong64.syso index 6fdb3bad77751956e4c1ee6c0732ddcc3a7fc3dc..0d2b4946fbf31abc042ea4ee852785cb13cce5a6 100644 Binary files a/race_linux_loong64.syso and b/race_linux_loong64.syso differ