From 881ac8a472c301bf423a50d36ef6b7c55e51962b Mon Sep 17 00:00:00 2001
From: Julian Zhu
Date: Fri, 26 Sep 2025 17:24:06 +0800
Subject: [PATCH] - Backport RISC-V RVA23 support for RISC-V 64

Signed-off-by: Julian Zhu
---
 ...rnal-obj-riscv-cmd-link-improve-TLS-.patch | 335 +++
 ...-most-repetitive-operations-to-simpl.patch | 209 ++
 ...bigmod-provide-assembly-addMulVVW-fo.patch | 169 ++
 ...-or-zero-extend-for-32-bit-equality-.patch | 242 ++
 ...mprove-FP-FMA-performance-on-riscv64.patch | 276 ++
 ...single-precision-FMA-code-generation.patch | 512 ++++
 ...T-cmd-internal-obj-riscv-cmd-link-ad.patch | 180 ++
 ...-riscv-clean-up-error-checking-for-e.patch | 41 +
 ...-riscv-correct-message-in-regVal-pan.patch | 34 +
 ...bj-riscv-simplify-instructionsForMOV.patch | 56 +
 ...fix-wrong-cache-line-size-of-riscv64.patch | 34 +
 ...bj-riscv-clean-up-immediate-checking.patch | 206 ++
 ...rnal-intrinsify-publicationBarrier-o.patch | 145 +
 ...rnal-stop-lowering-OpConvert-on-risc.patch | 117 +
 ...mize-right-shifts-of-uint32-on-riscv.patch | 558 ++++
 ...l-ld-assign-temporary-addresses-to-p.patch | 267 ++
 ...mize-right-shifts-of-int32-on-riscv6.patch | 540 ++++
 ...-riscv-support-subtraction-with-a-co.patch | 72 +
 ...-riscv-fix-the-offset-of-JALR-transf.patch | 119 +
 ...-riscv-improve-handling-of-invalid-a.patch | 376 +++
 ...ition-of-constants-in-riscv64-assemb.patch | 555 ++++
 ...-riscv-add-support-of-PCALIGN-direct.patch | 152 +
 ...-optimize-Count-with-PCALIGN-in-risc.patch | 94 +
 ...ect-code-generation-for-right-shifts.patch | 980 +++++++
 ...ovide-optimised-assembly-for-riscv64.patch | 380 +++
 ...o-add-GORISCV64-environment-variable.patch | 396 +++
 ...ement-float-min-max-in-hardware-for-.patch | 520 ++++
 ...ement-float-min-max-in-hardware-for-.patch | 348 +++
 ...ompile-improve-rotations-for-riscv64.patch | 596 ++++
 ...rnal-obj-enable-rounding-mode-suffix.patch | 308 +++
 ...-assembly-implementations-on-riscv64.patch | 113 +
 ...l-riscv64-generate-local-text-symbol.patch | 47 +
 ...internal-obj-provide-rotation-pseudo.patch | 864 ++++++
 ...-support-Zba-Zbb-Zbs-extensions-in-r.patch | 617 +++++
 ...-riscv-improve-register-MOVB-MOVH-MO.patch | 118 +
 ...-riscv-use-native-rotation-instructi.patch | 62 +
 ...-riscv-check-immediate-for-rotation-.patch | 102 +
 ...est-codegen-add-Mul-test-for-riscv64.patch | 31 +
 ...v64-assembly-implementations-of-roun.patch | 127 +
 ...le-drop-TODO-in-NilCheck-for-riscv64.patch | 49 +
 ...-implement-addVV-in-riscv64-assembly.patch | 148 +
 ...-implement-subVV-in-riscv64-assembly.patch | 148 +
 ...integer-min-max-instructions-on-risc.patch | 360 +++
 ...-implement-addVW-in-riscv64-assembly.patch | 146 +
 ...-implement-subVW-in-riscv64-assembly.patch | 146 +
 ...ovide-optimised-assembly-for-riscv64.patch | 349 +++
 ...lement-mulAddVWW-in-riscv64-assembly.patch | 141 +
 ...lement-addMulVVW-in-riscv64-assembly.patch | 158 ++
 ...-initial-codegen-tests-for-integer-m.patch | 63 +
 ...rnal-ssa-combine-shift-and-addition-.patch | 232 ++
 ...-assembly-implementations-on-riscv64.patch | 125 +
 ...-codegen-add-Rotate-test-for-riscv64.patch | 62 +
 2052-runtime-add-asm_riscv64.h.patch          | 67 +
 ...internal-obj-riscv-always-provide-AN.patch | 387 +++
 ...ovide-optimised-assembly-for-riscv64.patch | 385 +++
 ...rnal-obj-riscv-rename-the-iIEncoding.patch | 200 ++
 ...-riscv-add-vector-instruction-encodi.patch | 2444 +++++++++++++++++
 ...-cmd-asm-add-vector-registers-to-ris.patch | 139 +
 ...-riscv-update-references-to-RISC-V-s.patch | 578 ++++
 ...-add-prologue_end-DWARF-stmt-for-ris.patch | 58 +
 ...-riscv-update-RISC-V-instruction-tab.patch | 371 +++
 ...prove-performance-of-riscv64-assembl.patch | 110 +
 ...tealg-optimize-IndexByte-for-riscv64.patch | 466 ++++
 ...-riscv-rework-instruction-encoding-i.patch | 624 +++++
 ...vide-runtime-detection-of-RISC-V-ext.patch | 255 ++
 ...23u64-as-a-valid-value-for-GORISCV64.patch | 190 ++
 ...-riscv-update-references-to-RISC-V-s.patch | 671 +++++
 ...t-merge-symbols-on-riscv64-when-dyna.patch | 589 ++++
 ...-riscv-support-MOVD-with-floating-po.patch | 83 +
 ...rnal-obj-riscv-implement-vector-conf.patch | 618 +++++
 ...-clean-up-and-simplify-the-riscv64-e.patch | 160 ++
 ...ytealg-eliminate-HashStrBytes-HashSt.patch | 126 +
 ...-riscv-implement-vector-load-store-i.patch | 539 ++++
 ...ternal-obj-riscv-add-riscv64-CSR-map.patch | 363 +++
 ...hten-the-TrailingZeros64-test-on-386.patch | 36 +
 ...-riscv64-codegen-for-arithmetic-test.patch | 102 +
 ...-riscv64-rva23u64-specifiers-to-exis.patch | 84 +
 ...-a-test-for-negation-and-conversion-.patch | 39 +
 ...ine-negation-and-word-sign-extension.patch | 80 +
 ...rnal-ssa-remove-double-negation-with.patch | 97 +
 ...-riscv-prevent-duplicate-error-repor.patch | 189 ++
 ...-riscv-prevent-panics-on-bad-branche.patch | 74 +
 ...-riscv-fix-the-encoding-for-REV8-and.patch | 41 +
 ...-riscv-factor-out-shift-constant-cod.patch | 151 +
 ...-asm-add-additional-tests-for-consta.patch | 75 +
 ...-combined-conversion-and-shift-tests.patch | 95 +
 ...-riscv-internal-bytealg-synthesize-M.patch | 453 +++
 ...-riscv-improve-constant-construction.patch | 239 ++
 ...rnal-ssa-optimise-more-branches-with.patch | 125 +
 ...-riscv-add-support-for-vector-intege.patch | 1327 +++++++++
 ...-riscv-add-support-for-vector-fixed-.patch | 266 ++
 ...move-unnecessary-move-op-replace-wit.patch | 66 +
 ...prove-performance-of-riscv64-assembl.patch | 120 +
 ...-on-riscv64-when-building-with-gcc-1.patch | 81 +
 ...-deduplicate-code-between-Count-Coun.patch | 63 +
 ...-riscv-add-support-for-vector-floati.patch | 1735 ++++++++++++
 ...-riscv-add-support-for-vector-reduct.patch | 176 ++
 ...-riscv-add-support-for-vector-mask-i.patch | 269 ++
 ...-riscv-add-support-for-vector-permut.patch | 287 ++
 ...-vector-implementation-of-equal-for-.patch | 186 ++
 ...-vector-implementation-of-indexbyte-.patch | 156 ++
 ...-riscv-reject-invalid-vadc-vsbc-enco.patch | 123 +
 ...-riscv-fix-LMUL-encoding-for-MF2-and.patch | 68 +
 ...d-generic-simplifications-on-riscv64.patch | 203 ++
 ...-riscv-fix-vector-integer-multiply-a.patch | 187 ++
 ...mise-float-int-register-moves-on-ris.patch | 663 +++++
 ...-vector-implementation-of-compare-fo.patch | 163 ++
 ...rnal-ssagen-improve-intrinsic-archit.patch | 101 +
 ...rnal-ssagen-factor-out-intrinsics-co.patch | 2066 ++++++++++++++
 ...rnal-ssagen-add-initial-test-coverag.patch | 1254 +++++++++
 ...nal-add-GOARM64-environment-variable.patch | 232 ++
 ...rnal-ssagen-provide-intrinsicBuilder.patch | 706 +++++
 ...ternal-ssagen-improve-intrinsic-test.patch | 155 ++
 ...lify-intrinsification-of-BitLen16-an.patch | 582 ++++
 ...lify-intrinsification-of-TrailingZer.patch | 563 ++++
 ...insify-math-bits.TrailingZeros-on-ri.patch | 375 +++
 ...rnal-ssagen-use-an-alias-for-math-bi.patch | 86 +
 ...-intrinsify-math-bits.Len-on-riscv64.patch | 446 +++
 ...ntrinsify-math-bits.Bswap-on-riscv64.patch | 331 +++
 golang.spec                                   | 132 +-
 120 files changed, 37495 insertions(+), 1 deletion(-)
 create mode 100644 2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch
 create mode 100644 2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch
 create mode 100644
2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch create mode 100644 2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch create mode 100644 2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch create mode 100644 2005-cmd-compile-add-single-precision-FMA-code-generation.patch create mode 100644 2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch create mode 100644 2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch create mode 100644 2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch create mode 100644 2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch create mode 100644 2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch create mode 100644 2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch create mode 100644 2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch create mode 100644 2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch create mode 100644 2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch create mode 100644 2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch create mode 100644 2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch create mode 100644 2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch create mode 100644 2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch create mode 100644 2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch create mode 100644 2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch create mode 100644 2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch create mode 100644 2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch create mode 100644 2023-cmd-compile-correct-code-generation-for-right-shifts.patch create mode 100644 2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch create mode 100644 2025-cmd-go-add-GORISCV64-environment-variable.patch create mode 100644 2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch create mode 100644 2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch create mode 100644 2028-cmd-compile-improve-rotations-for-riscv64.patch create mode 100644 2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch create mode 100644 2030-math-add-round-assembly-implementations-on-riscv64.patch create mode 100644 2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch create mode 100644 2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch create mode 100644 2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch create mode 100644 2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch create mode 100644 2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch create mode 100644 2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch create mode 100644 2037-test-codegen-add-Mul-test-for-riscv64.patch create mode 100644 2038-math-remove-riscv64-assembly-implementations-of-roun.patch create mode 100644 2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch create mode 100644 2040-math-big-implement-addVV-in-riscv64-assembly.patch create mode 100644 2041-math-big-implement-subVV-in-riscv64-assembly.patch create mode 100644 2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch create mode 100644 2043-math-big-implement-addVW-in-riscv64-assembly.patch create mode 100644 2044-math-big-implement-subVW-in-riscv64-assembly.patch create mode 100644 2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch create 
mode 100644 2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch create mode 100644 2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch create mode 100644 2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch create mode 100644 2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch create mode 100644 2050-math-add-round-assembly-implementations-on-riscv64.patch create mode 100644 2051-test-codegen-add-Rotate-test-for-riscv64.patch create mode 100644 2052-runtime-add-asm_riscv64.h.patch create mode 100644 2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch create mode 100644 2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch create mode 100644 2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch create mode 100644 2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch create mode 100644 2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch create mode 100644 2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch create mode 100644 2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch create mode 100644 2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch create mode 100644 2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch create mode 100644 2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch create mode 100644 2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch create mode 100644 2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch create mode 100644 2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch create mode 100644 2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch create mode 100644 2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch create mode 100644 2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch create mode 100644 2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch create mode 100644 2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch create mode 100644 2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch create mode 100644 2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch create mode 100644 2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch create mode 100644 2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch create mode 100644 2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch create mode 100644 2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch create mode 100644 2077-test-codegen-add-a-test-for-negation-and-conversion-.patch create mode 100644 2078-cmd-compile-combine-negation-and-word-sign-extension.patch create mode 100644 2079-cmd-compile-internal-ssa-remove-double-negation-with.patch create mode 100644 2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch create mode 100644 2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch create mode 100644 2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch create mode 100644 2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch create mode 100644 2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch create mode 100644 2085-test-codegen-add-combined-conversion-and-shift-tests.patch create mode 100644 2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch create mode 100644 2087-cmd-internal-obj-riscv-improve-constant-construction.patch create mode 100644 2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch create mode 100644 
2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch create mode 100644 2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch create mode 100644 2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch create mode 100644 2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch create mode 100644 2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch create mode 100644 2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch create mode 100644 2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch create mode 100644 2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch create mode 100644 2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch create mode 100644 2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch create mode 100644 2099-internal-bytealg-vector-implementation-of-equal-for-.patch create mode 100644 2100-internal-bytealg-vector-implementation-of-indexbyte-.patch create mode 100644 2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch create mode 100644 2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch create mode 100644 2103-cmd-compile-add-generic-simplifications-on-riscv64.patch create mode 100644 2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch create mode 100644 2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch create mode 100644 2106-internal-bytealg-vector-implementation-of-compare-fo.patch create mode 100644 2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch create mode 100644 2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch create mode 100644 2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch create mode 100644 2110-cmd-dist-internal-add-GOARM64-environment-variable.patch create mode 100644 2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch create mode 100644 2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch create mode 100644 2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch create mode 100644 2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch create mode 100644 2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch create mode 100644 2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch create mode 100644 2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch create mode 100644 2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch diff --git a/2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch b/2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch new file mode 100644 index 0000000..e3da792 --- /dev/null +++ b/2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch @@ -0,0 +1,335 @@ +From 4c97a50488b7e40651b55e440792a2840a6269db Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 001/120] cmd/asm,cmd/internal/obj/riscv,cmd/link: improve TLS + handling on riscv64 + +The existing Thread Local Storage (TLS) implementation for riscv64 uses +initial-exec (IE) mode, however a MOV of a TLS symbol currently loads the +thread pointer offset and not the actual address or memory location. + +Rework TLS on riscv64 to generate the full instruction sequence needed to +load from or store to a TLS symbol. Additionally, provide support for both +initial-exec (IE) and local-exec (LE) TLS - in many cases we can use LE, +which is slightly more efficient and easier to support in the linker. 
+ +Change-Id: I1b43f8888b3b6b10354bbb79d604771e64d92645 +Reviewed-on: https://go-review.googlesource.com/c/go/+/431103 +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: David Chase +Run-TryBot: Joel Sing +--- + src/cmd/asm/internal/asm/endtoend_test.go | 5 ++ + src/cmd/asm/internal/asm/testdata/riscv64.s | 10 ++++ + src/cmd/internal/obj/riscv/obj.go | 63 +++++++++++++++++++-- + src/cmd/internal/objabi/reloctype.go | 16 +++--- + src/cmd/internal/objabi/reloctype_string.go | 8 +-- + src/cmd/link/internal/riscv64/asm.go | 22 ++++--- + src/runtime/tls_riscv64.s | 11 +--- + 7 files changed, 103 insertions(+), 32 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/endtoend_test.go b/src/cmd/asm/internal/asm/endtoend_test.go +index ef41667c8e..02bc6b7923 100644 +--- a/src/cmd/asm/internal/asm/endtoend_test.go ++++ b/src/cmd/asm/internal/asm/endtoend_test.go +@@ -68,6 +68,11 @@ Diff: + continue + } + ++ // Ignore GLOBL. ++ if strings.HasPrefix(line, "GLOBL ") { ++ continue ++ } ++ + // The general form of a test input line is: + // // comment + // INST args [// printed form] [// hex encoding] +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 53538320f0..9899ec9e7b 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -354,6 +354,14 @@ start: + MOVD F0, 4(X5) // 27b20200 + MOVD F0, F1 // d3000022 + ++ // TLS load with local-exec (LUI + ADDIW + ADD of TP + load) ++ MOV tls(SB), X5 // b70f00009b8f0f00b38f4f0083b20f00 ++ MOVB tls(SB), X5 // b70f00009b8f0f00b38f4f0083820f00 ++ ++ // TLS store with local-exec (LUI + ADDIW + ADD of TP + store) ++ MOV X5, tls(SB) // b70f00009b8f0f00b38f4f0023b05f00 ++ MOVB X5, tls(SB) // b70f00009b8f0f00b38f4f0023805f00 ++ + // NOT pseudo-instruction + NOT X5 // 93c2f2ff + NOT X5, X6 // 13c3f2ff +@@ -407,3 +415,5 @@ start: + FLTD F0, F1, X5 // d39200a2 + FLED F0, F1, X5 // d38200a2 + FEQD F0, F1, X5 // d3a200a2 ++ ++GLOBL tls(SB), TLSBSS, $8 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 43fa7351bf..2e55fac812 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1827,6 +1827,53 @@ func instructionsForStore(p *obj.Prog, as obj.As, rd int16) []*instruction { + return []*instruction{insLUI, insADD, ins} + } + ++func instructionsForTLS(p *obj.Prog, ins *instruction) []*instruction { ++ insAddTP := &instruction{as: AADD, rd: REG_TMP, rs1: REG_TMP, rs2: REG_TP} ++ ++ var inss []*instruction ++ if p.Ctxt.Flag_shared { ++ // TLS initial-exec mode - load TLS offset from GOT, add the thread pointer ++ // register, then load from or store to the resulting memory location. ++ insAUIPC := &instruction{as: AAUIPC, rd: REG_TMP} ++ insLoadTLSOffset := &instruction{as: ALD, rd: REG_TMP, rs1: REG_TMP} ++ inss = []*instruction{insAUIPC, insLoadTLSOffset, insAddTP, ins} ++ } else { ++ // TLS local-exec mode - load upper TLS offset, add the lower TLS offset, ++ // add the thread pointer register, then load from or store to the resulting ++ // memory location. Note that this differs from the suggested three ++ // instruction sequence, as the Go linker does not currently have an ++ // easy way to handle relocation across 12 bytes of machine code. 
++ insLUI := &instruction{as: ALUI, rd: REG_TMP} ++ insADDIW := &instruction{as: AADDIW, rd: REG_TMP, rs1: REG_TMP} ++ inss = []*instruction{insLUI, insADDIW, insAddTP, ins} ++ } ++ return inss ++} ++ ++func instructionsForTLSLoad(p *obj.Prog) []*instruction { ++ if p.From.Sym.Type != objabi.STLSBSS { ++ p.Ctxt.Diag("%v: %v is not a TLS symbol", p, p.From.Sym) ++ return nil ++ } ++ ++ ins := instructionForProg(p) ++ ins.as, ins.rs1, ins.rs2, ins.imm = movToLoad(p.As), REG_TMP, obj.REG_NONE, 0 ++ ++ return instructionsForTLS(p, ins) ++} ++ ++func instructionsForTLSStore(p *obj.Prog) []*instruction { ++ if p.To.Sym.Type != objabi.STLSBSS { ++ p.Ctxt.Diag("%v: %v is not a TLS symbol", p, p.To.Sym) ++ return nil ++ } ++ ++ ins := instructionForProg(p) ++ ins.as, ins.rd, ins.rs1, ins.rs2, ins.imm = movToStore(p.As), REG_TMP, uint32(p.From.Reg), obj.REG_NONE, 0 ++ ++ return instructionsForTLS(p, ins) ++} ++ + // instructionsForMOV returns the machine instructions for an *obj.Prog that + // uses a MOV pseudo-instruction. + func instructionsForMOV(p *obj.Prog) []*instruction { +@@ -1939,6 +1986,10 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + inss = instructionsForLoad(p, movToLoad(p.As), addrToReg(p.From)) + + case obj.NAME_EXTERN, obj.NAME_STATIC: ++ if p.From.Sym.Type == objabi.STLSBSS { ++ return instructionsForTLSLoad(p) ++ } ++ + // Note that the values for $off_hi and $off_lo are currently + // zero and will be assigned during relocation. + // +@@ -1966,6 +2017,10 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + inss = instructionsForStore(p, movToStore(p.As), addrToReg(p.To)) + + case obj.NAME_EXTERN, obj.NAME_STATIC: ++ if p.To.Sym.Type == objabi.STLSBSS { ++ return instructionsForTLSStore(p) ++ } ++ + // Note that the values for $off_hi and $off_lo are currently + // zero and will be assigned during relocation. + // +@@ -2244,10 +2299,10 @@ func assemble(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + break + } + if addr.Sym.Type == objabi.STLSBSS { +- if rt == objabi.R_RISCV_PCREL_ITYPE { +- rt = objabi.R_RISCV_TLS_IE_ITYPE +- } else if rt == objabi.R_RISCV_PCREL_STYPE { +- rt = objabi.R_RISCV_TLS_IE_STYPE ++ if ctxt.Flag_shared { ++ rt = objabi.R_RISCV_TLS_IE ++ } else { ++ rt = objabi.R_RISCV_TLS_LE + } + } + +diff --git a/src/cmd/internal/objabi/reloctype.go b/src/cmd/internal/objabi/reloctype.go +index 996c300d95..3eaa5824e6 100644 +--- a/src/cmd/internal/objabi/reloctype.go ++++ b/src/cmd/internal/objabi/reloctype.go +@@ -269,21 +269,21 @@ const ( + // only used by the linker and are not emitted by the compiler or assembler. + R_RISCV_CALL_TRAMP + +- // R_RISCV_PCREL_ITYPE resolves a 32-bit PC-relative address using an ++ // R_RISCV_PCREL_ITYPE resolves a 32 bit PC-relative address using an + // AUIPC + I-type instruction pair. + R_RISCV_PCREL_ITYPE + +- // R_RISCV_PCREL_STYPE resolves a 32-bit PC-relative address using an ++ // R_RISCV_PCREL_STYPE resolves a 32 bit PC-relative address using an + // AUIPC + S-type instruction pair. + R_RISCV_PCREL_STYPE + +- // R_RISCV_TLS_IE_ITYPE resolves a 32-bit TLS initial-exec TOC offset +- // address using an AUIPC + I-type instruction pair. +- R_RISCV_TLS_IE_ITYPE ++ // R_RISCV_TLS_IE resolves a 32 bit TLS initial-exec address using an ++ // AUIPC + I-type instruction pair. ++ R_RISCV_TLS_IE + +- // R_RISCV_TLS_IE_STYPE resolves a 32-bit TLS initial-exec TOC offset +- // address using an AUIPC + S-type instruction pair. 
+- R_RISCV_TLS_IE_STYPE ++ // R_RISCV_TLS_LE resolves a 32 bit TLS local-exec address using an ++ // LUI + I-type instruction sequence. ++ R_RISCV_TLS_LE + + // R_PCRELDBL relocates s390x 2-byte aligned PC-relative addresses. + // TODO(mundaym): remove once variants can be serialized - see issue 14218. +diff --git a/src/cmd/internal/objabi/reloctype_string.go b/src/cmd/internal/objabi/reloctype_string.go +index c7441efa28..bc8fb6b73c 100644 +--- a/src/cmd/internal/objabi/reloctype_string.go ++++ b/src/cmd/internal/objabi/reloctype_string.go +@@ -71,8 +71,8 @@ func _() { + _ = x[R_RISCV_CALL_TRAMP-61] + _ = x[R_RISCV_PCREL_ITYPE-62] + _ = x[R_RISCV_PCREL_STYPE-63] +- _ = x[R_RISCV_TLS_IE_ITYPE-64] +- _ = x[R_RISCV_TLS_IE_STYPE-65] ++ _ = x[R_RISCV_TLS_IE-64] ++ _ = x[R_RISCV_TLS_LE-65] + _ = x[R_PCRELDBL-66] + _ = x[R_ADDRLOONG64-67] + _ = x[R_ADDRLOONG64U-68] +@@ -91,9 +91,9 @@ func _() { + _ = x[R_INITORDER-81] + } + +-const _RelocType_name = "R_ADDRR_ADDRPOWERR_ADDRARM64R_ADDRMIPSR_ADDROFFR_SIZER_CALLR_CALLARMR_CALLARM64R_CALLINDR_CALLPOWERR_CALLMIPSR_CONSTR_PCRELR_TLS_LER_TLS_IER_GOTOFFR_PLT0R_PLT1R_PLT2R_USEFIELDR_USETYPER_USEIFACER_USEIFACEMETHODR_USEGENERICIFACEMETHODR_METHODOFFR_KEEPR_POWER_TOCR_GOTPCRELR_JMPMIPSR_DWARFSECREFR_DWARFFILEREFR_ARM64_TLS_LER_ARM64_TLS_IER_ARM64_GOTPCRELR_ARM64_GOTR_ARM64_PCRELR_ARM64_PCREL_LDST8R_ARM64_PCREL_LDST16R_ARM64_PCREL_LDST32R_ARM64_PCREL_LDST64R_ARM64_LDST8R_ARM64_LDST16R_ARM64_LDST32R_ARM64_LDST64R_ARM64_LDST128R_POWER_TLS_LER_POWER_TLS_IER_POWER_TLSR_POWER_TLS_IE_PCREL34R_POWER_TLS_LE_TPREL34R_ADDRPOWER_DSR_ADDRPOWER_GOTR_ADDRPOWER_GOT_PCREL34R_ADDRPOWER_PCRELR_ADDRPOWER_TOCRELR_ADDRPOWER_TOCREL_DSR_ADDRPOWER_D34R_ADDRPOWER_PCREL34R_RISCV_CALLR_RISCV_CALL_TRAMPR_RISCV_PCREL_ITYPER_RISCV_PCREL_STYPER_RISCV_TLS_IE_ITYPER_RISCV_TLS_IE_STYPER_PCRELDBLR_ADDRLOONG64R_ADDRLOONG64UR_ADDRLOONG64TLSR_ADDRLOONG64TLSUR_CALLLOONG64R_LOONG64_TLS_IE_PCREL_HIR_LOONG64_TLS_IE_LOR_JMPLOONG64R_ADDRMIPSUR_ADDRMIPSTLSR_ADDRCUOFFR_WASMIMPORTR_XCOFFREFR_PEIMAGEOFFR_INITORDER" ++const _RelocType_name = "R_ADDRR_ADDRPOWERR_ADDRARM64R_ADDRMIPSR_ADDROFFR_SIZER_CALLR_CALLARMR_CALLARM64R_CALLINDR_CALLPOWERR_CALLMIPSR_CONSTR_PCRELR_TLS_LER_TLS_IER_GOTOFFR_PLT0R_PLT1R_PLT2R_USEFIELDR_USETYPER_USEIFACER_USEIFACEMETHODR_USEGENERICIFACEMETHODR_METHODOFFR_KEEPR_POWER_TOCR_GOTPCRELR_JMPMIPSR_DWARFSECREFR_DWARFFILEREFR_ARM64_TLS_LER_ARM64_TLS_IER_ARM64_GOTPCRELR_ARM64_GOTR_ARM64_PCRELR_ARM64_PCREL_LDST8R_ARM64_PCREL_LDST16R_ARM64_PCREL_LDST32R_ARM64_PCREL_LDST64R_ARM64_LDST8R_ARM64_LDST16R_ARM64_LDST32R_ARM64_LDST64R_ARM64_LDST128R_POWER_TLS_LER_POWER_TLS_IER_POWER_TLSR_POWER_TLS_IE_PCREL34R_POWER_TLS_LE_TPREL34R_ADDRPOWER_DSR_ADDRPOWER_GOTR_ADDRPOWER_GOT_PCREL34R_ADDRPOWER_PCRELR_ADDRPOWER_TOCRELR_ADDRPOWER_TOCREL_DSR_ADDRPOWER_D34R_ADDRPOWER_PCREL34R_RISCV_CALLR_RISCV_CALL_TRAMPR_RISCV_PCREL_ITYPER_RISCV_PCREL_STYPER_RISCV_TLS_IER_RISCV_TLS_LER_PCRELDBLR_ADDRLOONG64R_ADDRLOONG64UR_ADDRLOONG64TLSR_ADDRLOONG64TLSUR_CALLLOONG64R_LOONG64_TLS_IE_PCREL_HIR_LOONG64_TLS_IE_LOR_JMPLOONG64R_ADDRMIPSUR_ADDRMIPSTLSR_ADDRCUOFFR_WASMIMPORTR_XCOFFREFR_PEIMAGEOFFR_INITORDER" + +-var _RelocType_index = [...]uint16{0, 6, 17, 28, 38, 47, 53, 59, 68, 79, 88, 99, 109, 116, 123, 131, 139, 147, 153, 159, 165, 175, 184, 194, 210, 233, 244, 250, 261, 271, 280, 293, 307, 321, 335, 351, 362, 375, 394, 414, 434, 454, 467, 481, 495, 509, 524, 538, 552, 563, 585, 607, 621, 636, 659, 676, 694, 715, 730, 749, 761, 779, 798, 817, 837, 857, 867, 880, 894, 910, 927, 940, 965, 984, 996, 1007, 1020, 1031, 1043, 
1053, 1065, 1076} ++var _RelocType_index = [...]uint16{0, 6, 17, 28, 38, 47, 53, 59, 68, 79, 88, 99, 109, 116, 123, 131, 139, 147, 153, 159, 165, 175, 184, 194, 210, 233, 244, 250, 261, 271, 280, 293, 307, 321, 335, 351, 362, 375, 394, 414, 434, 454, 467, 481, 495, 509, 524, 538, 552, 563, 585, 607, 621, 636, 659, 676, 694, 715, 730, 749, 761, 779, 798, 817, 831, 845, 855, 868, 882, 898, 915, 928, 953, 972, 984, 995, 1008, 1019, 1031, 1041, 1053, 1064} + + func (i RelocType) String() string { + i -= 1 +diff --git a/src/cmd/link/internal/riscv64/asm.go b/src/cmd/link/internal/riscv64/asm.go +index 6b5c0cbe5a..f3186398eb 100644 +--- a/src/cmd/link/internal/riscv64/asm.go ++++ b/src/cmd/link/internal/riscv64/asm.go +@@ -39,7 +39,7 @@ func genSymsLate(ctxt *ld.Link, ldr *loader.Loader) { + for ri := 0; ri < relocs.Count(); ri++ { + r := relocs.At(ri) + if r.Type() != objabi.R_RISCV_PCREL_ITYPE && r.Type() != objabi.R_RISCV_PCREL_STYPE && +- r.Type() != objabi.R_RISCV_TLS_IE_ITYPE && r.Type() != objabi.R_RISCV_TLS_IE_STYPE { ++ r.Type() != objabi.R_RISCV_TLS_IE { + continue + } + if r.Off() == 0 && ldr.SymType(s) == sym.STEXT { +@@ -101,7 +101,7 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, + out.Write64(uint64(elf.R_RISCV_JAL) | uint64(elfsym)<<32) + out.Write64(uint64(r.Xadd)) + +- case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE_ITYPE, objabi.R_RISCV_TLS_IE_STYPE: ++ case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE: + // Find the text symbol for the AUIPC instruction targeted + // by this relocation. + relocs := ldr.Relocs(s) +@@ -127,10 +127,8 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, + hiRel, loRel = elf.R_RISCV_PCREL_HI20, elf.R_RISCV_PCREL_LO12_I + case objabi.R_RISCV_PCREL_STYPE: + hiRel, loRel = elf.R_RISCV_PCREL_HI20, elf.R_RISCV_PCREL_LO12_S +- case objabi.R_RISCV_TLS_IE_ITYPE: ++ case objabi.R_RISCV_TLS_IE: + hiRel, loRel = elf.R_RISCV_TLS_GOT_HI20, elf.R_RISCV_PCREL_LO12_I +- case objabi.R_RISCV_TLS_IE_STYPE: +- hiRel, loRel = elf.R_RISCV_TLS_GOT_HI20, elf.R_RISCV_PCREL_LO12_S + } + out.Write64(uint64(sectoff)) + out.Write64(uint64(hiRel) | uint64(elfsym)<<32) +@@ -139,6 +137,14 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, + out.Write64(uint64(loRel) | uint64(hi20ElfSym)<<32) + out.Write64(uint64(0)) + ++ case objabi.R_RISCV_TLS_LE: ++ out.Write64(uint64(sectoff)) ++ out.Write64(uint64(elf.R_RISCV_TPREL_HI20) | uint64(elfsym)<<32) ++ out.Write64(uint64(r.Xadd)) ++ out.Write64(uint64(sectoff + 4)) ++ out.Write64(uint64(elf.R_RISCV_TPREL_LO12_I) | uint64(elfsym)<<32) ++ out.Write64(uint64(r.Xadd)) ++ + default: + return false + } +@@ -189,7 +195,7 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade + case objabi.R_RISCV_CALL, objabi.R_RISCV_CALL_TRAMP: + return val, 1, true + +- case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE_ITYPE, objabi.R_RISCV_TLS_IE_STYPE: ++ case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE, objabi.R_RISCV_TLS_LE: + return val, 2, true + } + +@@ -211,7 +217,7 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade + + return val, 0, true + +- case objabi.R_RISCV_TLS_IE_ITYPE, objabi.R_RISCV_TLS_IE_STYPE: ++ case objabi.R_RISCV_TLS_IE, objabi.R_RISCV_TLS_LE: + // TLS relocations are not currently handled for internal linking. 
+ // For now, TLS is only used when cgo is in use and cgo currently + // requires external linking. However, we need to accept these +@@ -273,7 +279,7 @@ func extreloc(target *ld.Target, ldr *loader.Loader, r loader.Reloc, s loader.Sy + case objabi.R_RISCV_CALL, objabi.R_RISCV_CALL_TRAMP: + return ld.ExtrelocSimple(ldr, r), true + +- case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE_ITYPE, objabi.R_RISCV_TLS_IE_STYPE: ++ case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE, objabi.R_RISCV_TLS_LE: + return ld.ExtrelocViaOuterSym(ldr, r, s), true + } + return loader.ExtReloc{}, false +diff --git a/src/runtime/tls_riscv64.s b/src/runtime/tls_riscv64.s +index 397919aeba..a0a58ea4a0 100644 +--- a/src/runtime/tls_riscv64.s ++++ b/src/runtime/tls_riscv64.s +@@ -12,19 +12,14 @@ + // NOTE: mcall() assumes this clobbers only X31 (REG_TMP). + TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0 + MOVB runtime·iscgo(SB), X31 +- BEQ X0, X31, nocgo +- +- MOV runtime·tls_g(SB), X31 +- ADD TP, X31 // add offset to thread pointer (X4) +- MOV g, (X31) ++ BEQZ X31, nocgo + ++ MOV g, runtime·tls_g(SB) + nocgo: + RET + + TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0 +- MOV runtime·tls_g(SB), X31 +- ADD TP, X31 // add offset to thread pointer (X4) +- MOV (X31), g ++ MOV runtime·tls_g(SB), g + RET + + GLOBL runtime·tls_g(SB), TLSBSS, $8 +-- +2.39.5 + diff --git a/2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch b/2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch new file mode 100644 index 0000000..cca54e3 --- /dev/null +++ b/2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch @@ -0,0 +1,209 @@ +From f1ab206096dedb2c0920ae2aa154323d443b2c65 Mon Sep 17 00:00:00 2001 +From: Junxian Zhu +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 001/119] cmd/compile: fold most repetitive operations to + simplify riscv64 rules + +Most of repetitive rules in riscv64 are simple, so that we can simplify and fold it with | without losting rules readability. + +No change in the actual compiler code after running rulegen. + +Change-Id: Id0bbfd93e63b49b7f66ecb62eb9440b4900c7938 +Reviewed-on: https://go-review.googlesource.com/c/go/+/498455 +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Michael Knyszek +Reviewed-by: M Zhuo +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 109 +++++------------- + 1 file changed, 28 insertions(+), 81 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 9a6fcebdc5..d90427132c 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -3,21 +3,11 @@ + // license that can be found in the LICENSE file. + + // Lowering arithmetic +-(Add64 ...) => (ADD ...) +-(AddPtr ...) => (ADD ...) +-(Add32 ...) => (ADD ...) +-(Add16 ...) => (ADD ...) +-(Add8 ...) => (ADD ...) +-(Add32F ...) => (FADDS ...) +-(Add64F ...) => (FADDD ...) +- +-(Sub64 ...) => (SUB ...) +-(SubPtr ...) => (SUB ...) +-(Sub32 ...) => (SUB ...) +-(Sub16 ...) => (SUB ...) +-(Sub8 ...) => (SUB ...) +-(Sub32F ...) => (FSUBS ...) +-(Sub64F ...) => (FSUBD ...) ++(Add(Ptr|64|32|16|8) ...) => (ADD ...) ++(Add(64|32)F ...) => (FADD(D|S) ...) ++ ++(Sub(Ptr|64|32|16|8) ...) => (SUB ...) ++(Sub(64|32)F ...) => (FSUB(D|S) ...) + + (Mul64 ...) => (MUL ...) + (Mul64uhilo ...) => (LoweredMuluhilo ...) +@@ -25,11 +15,9 @@ + (Mul32 ...) => (MULW ...) 
+ (Mul16 x y) => (MULW (SignExt16to32 x) (SignExt16to32 y)) + (Mul8 x y) => (MULW (SignExt8to32 x) (SignExt8to32 y)) +-(Mul32F ...) => (FMULS ...) +-(Mul64F ...) => (FMULD ...) ++(Mul(64|32)F ...) => (FMUL(D|S) ...) + +-(Div32F ...) => (FDIVS ...) +-(Div64F ...) => (FDIVD ...) ++(Div(64|32)F ...) => (FDIV(D|S) ...) + + (Div64 x y [false]) => (DIV x y) + (Div64u ...) => (DIVU ...) +@@ -65,32 +53,15 @@ + (Mod8 x y) => (REMW (SignExt8to32 x) (SignExt8to32 y)) + (Mod8u x y) => (REMUW (ZeroExt8to32 x) (ZeroExt8to32 y)) + +-(And64 ...) => (AND ...) +-(And32 ...) => (AND ...) +-(And16 ...) => (AND ...) +-(And8 ...) => (AND ...) +- +-(Or64 ...) => (OR ...) +-(Or32 ...) => (OR ...) +-(Or16 ...) => (OR ...) +-(Or8 ...) => (OR ...) +- +-(Xor64 ...) => (XOR ...) +-(Xor32 ...) => (XOR ...) +-(Xor16 ...) => (XOR ...) +-(Xor8 ...) => (XOR ...) +- +-(Neg64 ...) => (NEG ...) +-(Neg32 ...) => (NEG ...) +-(Neg16 ...) => (NEG ...) +-(Neg8 ...) => (NEG ...) +-(Neg32F ...) => (FNEGS ...) +-(Neg64F ...) => (FNEGD ...) +- +-(Com64 ...) => (NOT ...) +-(Com32 ...) => (NOT ...) +-(Com16 ...) => (NOT ...) +-(Com8 ...) => (NOT ...) ++(And(64|32|16|8) ...) => (AND ...) ++(Or(64|32|16|8) ...) => (OR ...) ++(Xor(64|32|16|8) ...) => (XOR ...) ++ ++(Neg(64|32|16|8) ...) => (NEG ...) ++(Neg(64|32)F ...) => (FNEG(D|S) ...) ++ ++(Com(64|32|16|8) ...) => (NOT ...) ++ + + (Sqrt ...) => (FSQRTD ...) + (Sqrt32 ...) => (FSQRTS ...) +@@ -132,8 +103,7 @@ + + (CvtBoolToUint8 ...) => (Copy ...) + +-(Round32F ...) => (Copy ...) +-(Round64F ...) => (Copy ...) ++(Round(64|32)F ...) => (Copy ...) + + (Slicemask x) => (SRAI [63] (NEG x)) + +@@ -250,36 +220,26 @@ + (Less32U x y) => (SLTU (ZeroExt32to64 x) (ZeroExt32to64 y)) + (Less16U x y) => (SLTU (ZeroExt16to64 x) (ZeroExt16to64 y)) + (Less8U x y) => (SLTU (ZeroExt8to64 x) (ZeroExt8to64 y)) +-(Less64F ...) => (FLTD ...) +-(Less32F ...) => (FLTS ...) ++(Less(64|32)F ...) => (FLT(D|S) ...) + + // Convert x <= y to !(y > x). +-(Leq64 x y) => (Not (Less64 y x)) +-(Leq32 x y) => (Not (Less32 y x)) +-(Leq16 x y) => (Not (Less16 y x)) +-(Leq8 x y) => (Not (Less8 y x)) +-(Leq64U x y) => (Not (Less64U y x)) +-(Leq32U x y) => (Not (Less32U y x)) +-(Leq16U x y) => (Not (Less16U y x)) +-(Leq8U x y) => (Not (Less8U y x)) +-(Leq64F ...) => (FLED ...) +-(Leq32F ...) => (FLES ...) ++(Leq(64|32|16|8) x y) => (Not (Less(64|32|16|8) y x)) ++(Leq(64|32|16|8)U x y) => (Not (Less(64|32|16|8)U y x)) ++(Leq(64|32)F ...) => (FLE(D|S) ...) + + (EqPtr x y) => (SEQZ (SUB x y)) + (Eq64 x y) => (SEQZ (SUB x y)) + (Eq32 x y) => (SEQZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) + (Eq16 x y) => (SEQZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) + (Eq8 x y) => (SEQZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) +-(Eq64F ...) => (FEQD ...) +-(Eq32F ...) => (FEQS ...) ++(Eq(64|32)F ...) => (FEQ(D|S) ...) + + (NeqPtr x y) => (SNEZ (SUB x y)) + (Neq64 x y) => (SNEZ (SUB x y)) + (Neq32 x y) => (SNEZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) + (Neq16 x y) => (SNEZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) + (Neq8 x y) => (SNEZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) +-(Neq64F ...) => (FNED ...) +-(Neq32F ...) => (FNES ...) ++(Neq(64|32)F ...) => (FNE(D|S) ...) 
+ + // Loads + (Load ptr mem) && t.IsBoolean() => (MOVBUload ptr mem) +@@ -537,10 +497,7 @@ + (OffPtr [off] ptr) && is32Bit(off) => (ADDI [off] ptr) + (OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr) + +-(Const8 [val]) => (MOVDconst [int64(val)]) +-(Const16 [val]) => (MOVDconst [int64(val)]) +-(Const32 [val]) => (MOVDconst [int64(val)]) +-(Const64 [val]) => (MOVDconst [int64(val)]) ++(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)]) + (Const32F [val]) => (FMVSX (MOVDconst [int64(math.Float32bits(val))])) + (Const64F [val]) => (FMVDX (MOVDconst [int64(math.Float64bits(val))])) + (ConstNil) => (MOVDconst [0]) +@@ -557,18 +514,9 @@ + (TailCall ...) => (CALLtail ...) + + // Atomic Intrinsics +-(AtomicLoad8 ...) => (LoweredAtomicLoad8 ...) +-(AtomicLoad32 ...) => (LoweredAtomicLoad32 ...) +-(AtomicLoad64 ...) => (LoweredAtomicLoad64 ...) +-(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...) +- +-(AtomicStore8 ...) => (LoweredAtomicStore8 ...) +-(AtomicStore32 ...) => (LoweredAtomicStore32 ...) +-(AtomicStore64 ...) => (LoweredAtomicStore64 ...) +-(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...) +- +-(AtomicAdd32 ...) => (LoweredAtomicAdd32 ...) +-(AtomicAdd64 ...) => (LoweredAtomicAdd64 ...) ++(AtomicLoad(Ptr|64|32|8) ...) => (LoweredAtomicLoad(64|64|32|8) ...) ++(AtomicStore(PtrNoWB|64|32|8) ...) => (LoweredAtomicStore(64|64|32|8) ...) ++(AtomicAdd(64|32) ...) => (LoweredAtomicAdd(64|32) ...) + + // AtomicAnd8(ptr,val) => LoweredAtomicAnd32(ptr&^3, ^((uint8(val) ^ 0xff) << ((ptr & 3) * 8))) + (AtomicAnd8 ptr val mem) => +@@ -581,8 +529,7 @@ + (AtomicCompareAndSwap32 ptr old new mem) => (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem) + (AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...) + +-(AtomicExchange32 ...) => (LoweredAtomicExchange32 ...) +-(AtomicExchange64 ...) => (LoweredAtomicExchange64 ...) ++(AtomicExchange(64|32) ...) => (LoweredAtomicExchange(64|32) ...) 
+ + // AtomicOr8(ptr,val) => LoweredAtomicOr32(ptr&^3, uint32(val)<<((ptr&3)*8)) + (AtomicOr8 ptr val mem) => +-- +2.39.5 + diff --git a/2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch b/2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch new file mode 100644 index 0000000..49c6a49 --- /dev/null +++ b/2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch @@ -0,0 +1,169 @@ +From c63ac393ef890036d861a284e7404e1758b40113 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 002/119] crypto/internal/bigmod: provide assembly addMulVVW* + for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of addMulVVW* for riscv64, +processing four words per loop, resulting in a performance gain +of 23%+ for RSA decryption/signing on a StarFive VisionFive 2: + + │ rsa1 │ rsa2 │ + │ sec/op │ sec/op vs base │ +DecryptPKCS1v15/2048-4 24.29m ± 0% 18.65m ± 0% -23.24% (p=0.000 n=10) +DecryptPKCS1v15/3072-4 73.28m ± 0% 54.08m ± 0% -26.20% (p=0.000 n=10) +DecryptPKCS1v15/4096-4 163.5m ± 0% 119.1m ± 0% -27.17% (p=0.000 n=10) +EncryptPKCS1v15/2048-4 1.505m ± 0% 1.446m ± 0% -3.93% (p=0.000 n=10) +DecryptOAEP/2048-4 24.37m ± 0% 18.72m ± 0% -23.17% (p=0.000 n=10) +EncryptOAEP/2048-4 1.570m ± 0% 1.510m ± 0% -3.84% (p=0.000 n=10) +SignPKCS1v15/2048-4 24.52m ± 0% 18.80m ± 0% -23.36% (p=0.000 n=10) +VerifyPKCS1v15/2048-4 1.491m ± 0% 1.431m ± 0% -4.00% (p=0.000 n=10) +SignPSS/2048-4 24.60m ± 0% 18.89m ± 0% -23.21% (p=0.000 n=10) +VerifyPSS/2048-4 1.565m ± 0% 1.504m ± 0% -3.87% (p=0.000 n=10) +geomean 10.90m 9.066m -16.79% + +Change-Id: I8414ba0028b0781a945610abe02c285d2387aef3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/516536 +Reviewed-by: Mark Ryan +Reviewed-by: Filippo Valsorda +Reviewed-by: Dmitri Shuralyov +Reviewed-by: M Zhuo +Reviewed-by: Michael Knyszek +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +--- + src/crypto/internal/bigmod/nat_asm.go | 2 +- + src/crypto/internal/bigmod/nat_noasm.go | 2 +- + src/crypto/internal/bigmod/nat_riscv64.s | 91 ++++++++++++++++++++++++ + 3 files changed, 93 insertions(+), 2 deletions(-) + create mode 100644 src/crypto/internal/bigmod/nat_riscv64.s + +diff --git a/src/crypto/internal/bigmod/nat_asm.go b/src/crypto/internal/bigmod/nat_asm.go +index 5eb91e1c6c..0283b07e68 100644 +--- a/src/crypto/internal/bigmod/nat_asm.go ++++ b/src/crypto/internal/bigmod/nat_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !purego && (386 || amd64 || arm || arm64 || ppc64 || ppc64le || s390x) ++//go:build !purego && (386 || amd64 || arm || arm64 || ppc64 || ppc64le || riscv64 || s390x) + + package bigmod + +diff --git a/src/crypto/internal/bigmod/nat_noasm.go b/src/crypto/internal/bigmod/nat_noasm.go +index eff12536f9..71f38da754 100644 +--- a/src/crypto/internal/bigmod/nat_noasm.go ++++ b/src/crypto/internal/bigmod/nat_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
+ +-//go:build purego || !(386 || amd64 || arm || arm64 || ppc64 || ppc64le || s390x) ++//go:build purego || !(386 || amd64 || arm || arm64 || ppc64 || ppc64le || riscv64 || s390x) + + package bigmod + +diff --git a/src/crypto/internal/bigmod/nat_riscv64.s b/src/crypto/internal/bigmod/nat_riscv64.s +new file mode 100644 +index 0000000000..1d8c8c8900 +--- /dev/null ++++ b/src/crypto/internal/bigmod/nat_riscv64.s +@@ -0,0 +1,91 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build !purego ++ ++#include "textflag.h" ++ ++// func addMulVVW1024(z, x *uint, y uint) (c uint) ++TEXT ·addMulVVW1024(SB),$0-32 ++ MOV $16, X30 ++ JMP addMulVVWx(SB) ++ ++// func addMulVVW1536(z, x *uint, y uint) (c uint) ++TEXT ·addMulVVW1536(SB),$0-32 ++ MOV $24, X30 ++ JMP addMulVVWx(SB) ++ ++// func addMulVVW2048(z, x *uint, y uint) (c uint) ++TEXT ·addMulVVW2048(SB),$0-32 ++ MOV $32, X30 ++ JMP addMulVVWx(SB) ++ ++TEXT addMulVVWx(SB),NOFRAME|NOSPLIT,$0 ++ MOV z+0(FP), X5 ++ MOV x+8(FP), X7 ++ MOV y+16(FP), X6 ++ MOV $0, X29 ++ ++ BEQZ X30, done ++loop: ++ MOV 0*8(X5), X10 // z[0] ++ MOV 1*8(X5), X13 // z[1] ++ MOV 2*8(X5), X16 // z[2] ++ MOV 3*8(X5), X19 // z[3] ++ ++ MOV 0*8(X7), X8 // x[0] ++ MOV 1*8(X7), X11 // x[1] ++ MOV 2*8(X7), X14 // x[2] ++ MOV 3*8(X7), X17 // x[3] ++ ++ MULHU X8, X6, X9 // z_hi[0] = x[0] * y ++ MUL X8, X6, X8 // z_lo[0] = x[0] * y ++ ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0] ++ SLTU X8, X21, X22 ++ ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0] ++ ADD X21, X29, X10 // z_lo[0] = x[0] * y + z[0] + c ++ SLTU X21, X10, X22 ++ ADD X9, X22, X29 // next c ++ ++ MULHU X11, X6, X12 // z_hi[1] = x[1] * y ++ MUL X11, X6, X11 // z_lo[1] = x[1] * y ++ ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1] ++ SLTU X11, X21, X22 ++ ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1] ++ ADD X21, X29, X13 // z_lo[1] = x[1] * y + z[1] + c ++ SLTU X21, X13, X22 ++ ADD X12, X22, X29 // next c ++ ++ MULHU X14, X6, X15 // z_hi[2] = x[2] * y ++ MUL X14, X6, X14 // z_lo[2] = x[2] * y ++ ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2] ++ SLTU X14, X21, X22 ++ ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2] ++ ADD X21, X29, X16 // z_lo[2] = x[2] * y + z[2] + c ++ SLTU X21, X16, X22 ++ ADD X15, X22, X29 // next c ++ ++ MULHU X17, X6, X18 // z_hi[3] = x[3] * y ++ MUL X17, X6, X17 // z_lo[3] = x[3] * y ++ ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3] ++ SLTU X17, X21, X22 ++ ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3] ++ ADD X21, X29, X19 // z_lo[3] = x[3] * y + z[3] + c ++ SLTU X21, X19, X22 ++ ADD X18, X22, X29 // next c ++ ++ MOV X10, 0*8(X5) // z[0] ++ MOV X13, 1*8(X5) // z[1] ++ MOV X16, 2*8(X5) // z[2] ++ MOV X19, 3*8(X5) // z[3] ++ ++ ADDI $32, X5 ++ ADDI $32, X7 ++ ++ ADDI $-4, X30 ++ BNEZ X30, loop ++ ++done: ++ MOV X29, c+24(FP) ++ RET +-- +2.39.5 + diff --git a/2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch b/2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch new file mode 100644 index 0000000..f42c95c --- /dev/null +++ b/2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch @@ -0,0 +1,242 @@ +From 8d8ed2bb0d3c76380a641adec7ff5ee9a26e000e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 003/119] cmd/compile: sign or zero extend for 32 bit equality + on riscv64 + +For 32 bit equality (Eq32), rather than always zero extending to 64 bits, +sign extend for signed types and zero extend for unsigned types. 
This makes +no difference to the equality test (via SUB), however it increases the +likelihood of avoiding unnecessary sign or zero extension simply for the +purpose of equality testing. + +While here, replace the Neq* rules with (Not (Eq*)) - this makes no +difference to the generated code (as the intermediates get expanded and +eliminated), however it means that changes to the equality rules also +reflect in the inequality rules. + +As an example, the following: + + lw t0,956(t0) + slli t0,t0,0x20 + srli t0,t0,0x20 + li t1,1 + bne t1,t0,278fc + +Becomes: + + lw t0,1024(t0) + li t1,1 + bne t1,t0,278b0 + +Removes almost 1000 instructions from the Go binary on riscv64. + +Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f +Reviewed-on: https://go-review.googlesource.com/c/go/+/516595 +Run-TryBot: Joel Sing +Reviewed-by: Dmitri Shuralyov +TryBot-Result: Gopher Robot +Reviewed-by: M Zhuo +Reviewed-by: Michael Knyszek +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 13 +-- + .../compile/internal/ssa/rewriteRISCV64.go | 101 +++++++++++------- + 2 files changed, 67 insertions(+), 47 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index d90427132c..181b46a7ce 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -229,16 +229,17 @@ + + (EqPtr x y) => (SEQZ (SUB x y)) + (Eq64 x y) => (SEQZ (SUB x y)) +-(Eq32 x y) => (SEQZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) ++(Eq32 x y) && x.Type.IsSigned() => (SEQZ (SUB (SignExt32to64 x) (SignExt32to64 y))) ++(Eq32 x y) && !x.Type.IsSigned() => (SEQZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) + (Eq16 x y) => (SEQZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) + (Eq8 x y) => (SEQZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) + (Eq(64|32)F ...) => (FEQ(D|S) ...) + +-(NeqPtr x y) => (SNEZ (SUB x y)) +-(Neq64 x y) => (SNEZ (SUB x y)) +-(Neq32 x y) => (SNEZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) +-(Neq16 x y) => (SNEZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) +-(Neq8 x y) => (SNEZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) ++(NeqPtr x y) => (Not (EqPtr x y)) ++(Neq64 x y) => (Not (Eq64 x y)) ++(Neq32 x y) => (Not (Eq32 x y)) ++(Neq16 x y) => (Not (Eq16 x y)) ++(Neq8 x y) => (Not (Eq8 x y)) + (Neq(64|32)F ...) => (FNE(D|S) ...) 
+ + // Loads +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index ffbeb1df47..e8002599ef 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -1081,20 +1081,50 @@ func rewriteValueRISCV64_OpEq32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Eq32 x y) ++ // cond: x.Type.IsSigned() ++ // result: (SEQZ (SUB (SignExt32to64 x) (SignExt32to64 y))) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ y := v_1 ++ if !(x.Type.IsSigned()) { ++ continue ++ } ++ v.reset(OpRISCV64SEQZ) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) ++ v1 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) ++ v1.AddArg(x) ++ v2 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) ++ v2.AddArg(y) ++ v0.AddArg2(v1, v2) ++ v.AddArg(v0) ++ return true ++ } ++ break ++ } ++ // match: (Eq32 x y) ++ // cond: !x.Type.IsSigned() + // result: (SEQZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) + for { +- x := v_0 +- y := v_1 +- v.reset(OpRISCV64SEQZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) +- v.AddArg(v0) +- return true ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ y := v_1 ++ if !(!x.Type.IsSigned()) { ++ continue ++ } ++ v.reset(OpRISCV64SEQZ) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) ++ v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v1.AddArg(x) ++ v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v2.AddArg(y) ++ v0.AddArg2(v1, v2) ++ v.AddArg(v0) ++ return true ++ } ++ break + } ++ return false + } + func rewriteValueRISCV64_OpEq64(v *Value) bool { + v_1 := v.Args[1] +@@ -2942,17 +2972,13 @@ func rewriteValueRISCV64_OpNeq16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Neq16 x y) +- // result: (SNEZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) ++ // result: (Not (Eq16 x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) +- v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEq16, typ.Bool) ++ v0.AddArg2(x, y) + v.AddArg(v0) + return true + } +@@ -2963,17 +2989,13 @@ func rewriteValueRISCV64_OpNeq32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Neq32 x y) +- // result: (SNEZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) ++ // result: (Not (Eq32 x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEq32, typ.Bool) ++ v0.AddArg2(x, y) + v.AddArg(v0) + return true + } +@@ -2982,13 +3004,14 @@ func rewriteValueRISCV64_OpNeq64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block ++ typ := &b.Func.Config.Types + // match: (Neq64 x y) +- // result: (SNEZ (SUB x y)) ++ // result: (Not (Eq64 x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEq64, typ.Bool) + 
v0.AddArg2(x, y) + v.AddArg(v0) + return true +@@ -3000,17 +3023,13 @@ func rewriteValueRISCV64_OpNeq8(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Neq8 x y) +- // result: (SNEZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) ++ // result: (Not (Eq8 x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) +- v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEq8, typ.Bool) ++ v0.AddArg2(x, y) + v.AddArg(v0) + return true + } +@@ -3038,12 +3057,12 @@ func rewriteValueRISCV64_OpNeqPtr(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (NeqPtr x y) +- // result: (SNEZ (SUB x y)) ++ // result: (Not (EqPtr x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, typ.Uintptr) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEqPtr, typ.Bool) + v0.AddArg2(x, y) + v.AddArg(v0) + return true +-- +2.39.5 + diff --git a/2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch b/2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch new file mode 100644 index 0000000..a0f302c --- /dev/null +++ b/2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch @@ -0,0 +1,276 @@ +From a2f69cbaaae63c86b4e8f29085414a237c24def4 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 004/119] cmd/compile: improve FP FMA performance on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +FMADD/FMSUB/FNSUB are an efficient FP FMA instructions, which can +be used by the compiler to improve FP performance. + +Erf 188.0n ± 2% 139.5n ± 2% -25.82% (p=0.000 n=10) +Erfc 193.6n ± 1% 143.2n ± 1% -26.01% (p=0.000 n=10) +Erfinv 244.4n ± 2% 172.6n ± 0% -29.40% (p=0.000 n=10) +Erfcinv 244.7n ± 2% 173.0n ± 1% -29.31% (p=0.000 n=10) +geomean 216.0n 156.3n -27.65% + +Ref: The RISC-V Instruction Set Manual Volume I: Unprivileged ISA +11.6 Single-Precision Floating-Point Computational Instructions + +Change-Id: I89aa3a4df7576fdd47f4a6ee608ac16feafd093c +Reviewed-on: https://go-review.googlesource.com/c/go/+/506036 +Reviewed-by: Joel Sing +Run-TryBot: M Zhuo +Reviewed-by: David Chase +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +TryBot-Result: Gopher Robot +--- + src/cmd/compile/internal/riscv64/ssa.go | 3 + + .../compile/internal/ssa/_gen/RISCV64.rules | 5 +- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 4 ++ + src/cmd/compile/internal/ssa/opGen.go | 28 ++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 72 ++++++++++++++++++- + test/codegen/floats.go | 3 + + 6 files changed, 112 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 2eb1e7ffa0..143e7c525a 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -694,6 +694,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Sym = ir.Syms.Duffcopy + p.To.Offset = v.AuxInt + ++ case ssa.OpRISCV64LoweredRound32F, ssa.OpRISCV64LoweredRound64F: ++ // input is already rounded ++ + case ssa.OpClobber, ssa.OpClobberReg: + // TODO: implement for clobberdead experiment. Nop is ok for now. 
+ +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 181b46a7ce..ac68dfed76 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -103,7 +103,7 @@ + + (CvtBoolToUint8 ...) => (Copy ...) + +-(Round(64|32)F ...) => (Copy ...) ++(Round(32|64)F ...) => (LoweredRound(32|64)F ...) + + (Slicemask x) => (SRAI [63] (NEG x)) + +@@ -780,6 +780,9 @@ + (Select0 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MULHU x y) + (Select1 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MUL x y) + ++(FADDD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FMADDD x y a) ++(FSUBD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FNMSUBD x y a) ++(FSUBD (FMULD x y) a) && a.Block.Func.useFMA(v) => (FMSUBD x y a) + // Merge negation into fused multiply-add and multiply-subtract. + // + // Key: +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 52e87cbe72..69f2950a88 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -237,6 +237,10 @@ func init() { + // gets correctly ordered with respect to GC safepoints. + {name: "MOVconvert", argLength: 2, reg: gp11, asm: "MOV"}, // arg0, but converted to int/ptr as appropriate; arg1=mem + ++ // Round ops to block fused-multiply-add extraction. ++ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true}, ++ {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true}, ++ + // Calls + {name: "CALLstatic", argLength: -1, reg: call, aux: "CallOff", call: true}, // call static function aux.(*gc.Sym). last arg=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: -1, reg: call, aux: "CallOff", call: true, tailCall: true}, // tail call static function aux.(*gc.Sym). 
last arg=mem, auxint=argsize, returns mem +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 1480fcf45b..e838a26f79 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2382,6 +2382,8 @@ const ( + OpRISCV64SLTU + OpRISCV64SLTIU + OpRISCV64MOVconvert ++ OpRISCV64LoweredRound32F ++ OpRISCV64LoweredRound64F + OpRISCV64CALLstatic + OpRISCV64CALLtail + OpRISCV64CALLclosure +@@ -31916,6 +31918,32 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "LoweredRound32F", ++ argLen: 1, ++ resultInArg0: true, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "LoweredRound64F", ++ argLen: 1, ++ resultInArg0: true, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + { + name: "CALLstatic", + auxType: auxCallOff, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index e8002599ef..17af023db3 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -440,6 +440,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64AND(v) + case OpRISCV64ANDI: + return rewriteValueRISCV64_OpRISCV64ANDI(v) ++ case OpRISCV64FADDD: ++ return rewriteValueRISCV64_OpRISCV64FADDD(v) + case OpRISCV64FMADDD: + return rewriteValueRISCV64_OpRISCV64FMADDD(v) + case OpRISCV64FMSUBD: +@@ -448,6 +450,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64FNMADDD(v) + case OpRISCV64FNMSUBD: + return rewriteValueRISCV64_OpRISCV64FNMSUBD(v) ++ case OpRISCV64FSUBD: ++ return rewriteValueRISCV64_OpRISCV64FSUBD(v) + case OpRISCV64MOVBUload: + return rewriteValueRISCV64_OpRISCV64MOVBUload(v) + case OpRISCV64MOVBUreg: +@@ -541,10 +545,10 @@ func rewriteValueRISCV64(v *Value) bool { + case OpRotateLeft8: + return rewriteValueRISCV64_OpRotateLeft8(v) + case OpRound32F: +- v.Op = OpCopy ++ v.Op = OpRISCV64LoweredRound32F + return true + case OpRound64F: +- v.Op = OpCopy ++ v.Op = OpRISCV64LoweredRound64F + return true + case OpRsh16Ux16: + return rewriteValueRISCV64_OpRsh16Ux16(v) +@@ -3335,6 +3339,31 @@ func rewriteValueRISCV64_OpRISCV64ANDI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FADDD(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FADDD a (FMULD x y)) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FMADDD x y a) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ a := v_0 ++ if v_1.Op != OpRISCV64FMULD { ++ continue ++ } ++ y := v_1.Args[1] ++ x := v_1.Args[0] ++ if !(a.Block.Func.useFMA(v)) { ++ continue ++ } ++ v.reset(OpRISCV64FMADDD) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ break ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FMADDD(v *Value) bool { + v_2 := v.Args[2] + v_1 := 
v.Args[1] +@@ -3515,6 +3544,45 @@ func rewriteValueRISCV64_OpRISCV64FNMSUBD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FSUBD(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FSUBD a (FMULD x y)) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FNMSUBD x y a) ++ for { ++ a := v_0 ++ if v_1.Op != OpRISCV64FMULD { ++ break ++ } ++ y := v_1.Args[1] ++ x := v_1.Args[0] ++ if !(a.Block.Func.useFMA(v)) { ++ break ++ } ++ v.reset(OpRISCV64FNMSUBD) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ // match: (FSUBD (FMULD x y) a) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FMSUBD x y a) ++ for { ++ if v_0.Op != OpRISCV64FMULD { ++ break ++ } ++ y := v_0.Args[1] ++ x := v_0.Args[0] ++ a := v_1 ++ if !(a.Block.Func.useFMA(v)) { ++ break ++ } ++ v.reset(OpRISCV64FMSUBD) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64MOVBUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/test/codegen/floats.go b/test/codegen/floats.go +index 9cb62e031a..1c5fc8a31a 100644 +--- a/test/codegen/floats.go ++++ b/test/codegen/floats.go +@@ -88,17 +88,20 @@ func FusedAdd64(x, y, z float64) float64 { + // s390x:"FMADD\t" + // ppc64x:"FMADD\t" + // arm64:"FMADDD" ++ // riscv64:"FMADDD\t" + return x*y + z + } + + func FusedSub64_a(x, y, z float64) float64 { + // s390x:"FMSUB\t" + // ppc64x:"FMSUB\t" ++ // riscv64:"FMSUBD\t" + return x*y - z + } + + func FusedSub64_b(x, y, z float64) float64 { + // arm64:"FMSUBD" ++ // riscv64:"FNMSUBD\t" + return z - x*y + } + +-- +2.39.5 + diff --git a/2005-cmd-compile-add-single-precision-FMA-code-generation.patch b/2005-cmd-compile-add-single-precision-FMA-code-generation.patch new file mode 100644 index 0000000..d5a7daa --- /dev/null +++ b/2005-cmd-compile-add-single-precision-FMA-code-generation.patch @@ -0,0 +1,512 @@ +From 567178ee5e574611b048418b56905c63e98c9658 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 005/119] cmd/compile: add single-precision FMA code generation + for riscv64 + +This CL adds FMADDS,FMSUBS,FNMADDS,FNMSUBS SSA support for riscv + +Change-Id: I1e7dd322b46b9e0f4923dbba256303d69ed12066 +Reviewed-on: https://go-review.googlesource.com/c/go/+/506616 +Reviewed-by: Joel Sing +Reviewed-by: David Chase +TryBot-Result: Gopher Robot +Reviewed-by: Keith Randall +Run-TryBot: M Zhuo +--- + src/cmd/compile/internal/riscv64/ssa.go | 3 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 9 +- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 4 + + src/cmd/compile/internal/ssa/opGen.go | 68 +++++ + .../compile/internal/ssa/rewriteRISCV64.go | 256 ++++++++++++++++++ + test/codegen/floats.go | 3 + + 6 files changed, 339 insertions(+), 4 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 143e7c525a..f8cf786920 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -332,7 +332,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p2.From.Reg = v.Reg1() + p2.To.Type = obj.TYPE_REG + p2.To.Reg = v.Reg1() +- case ssa.OpRISCV64FMADDD, ssa.OpRISCV64FMSUBD, ssa.OpRISCV64FNMADDD, ssa.OpRISCV64FNMSUBD: ++ case ssa.OpRISCV64FMADDD, ssa.OpRISCV64FMSUBD, ssa.OpRISCV64FNMADDD, ssa.OpRISCV64FNMSUBD, ++ ssa.OpRISCV64FMADDS, ssa.OpRISCV64FMSUBS, ssa.OpRISCV64FNMADDS, ssa.OpRISCV64FNMSUBS: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() +diff --git 
a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index ac68dfed76..e0bf00d45d 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -780,9 +780,10 @@ + (Select0 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MULHU x y) + (Select1 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MUL x y) + +-(FADDD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FMADDD x y a) +-(FSUBD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FNMSUBD x y a) +-(FSUBD (FMULD x y) a) && a.Block.Func.useFMA(v) => (FMSUBD x y a) ++(FADD(S|D) a (FMUL(S|D) x y)) && a.Block.Func.useFMA(v) => (FMADD(S|D) x y a) ++(FSUB(S|D) a (FMUL(S|D) x y)) && a.Block.Func.useFMA(v) => (FNMSUB(S|D) x y a) ++(FSUB(S|D) (FMUL(S|D) x y) a) && a.Block.Func.useFMA(v) => (FMSUB(S|D) x y a) ++ + // Merge negation into fused multiply-add and multiply-subtract. + // + // Key: +@@ -793,5 +794,7 @@ + // D B + // + // Note: multiplication commutativity handled by rule generator. ++(F(MADD|NMADD|MSUB|NMSUB)S neg:(FNEGS x) y z) && neg.Uses == 1 => (F(NMSUB|MSUB|NMADD|MADD)S x y z) ++(F(MADD|NMADD|MSUB|NMSUB)S x y neg:(FNEGS z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)S x y z) + (F(MADD|NMADD|MSUB|NMSUB)D neg:(FNEGD x) y z) && neg.Uses == 1 => (F(NMSUB|MSUB|NMADD|MADD)D x y z) + (F(MADD|NMADD|MSUB|NMSUB)D x y neg:(FNEGD z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)D x y z) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 69f2950a88..317e9150c9 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -411,6 +411,10 @@ func init() { + {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS", commutative: false, typ: "Float32"}, // arg0 - arg1 + {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, typ: "Float32"}, // arg0 * arg1 + {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", commutative: false, typ: "Float32"}, // arg0 / arg1 ++ {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS", commutative: true, typ: "Float32"}, // (arg0 * arg1) + arg2 ++ {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS", commutative: true, typ: "Float32"}, // (arg0 * arg1) - arg2 ++ {name: "FNMADDS", argLength: 3, reg: fp31, asm: "FNMADDS", commutative: true, typ: "Float32"}, // -(arg0 * arg1) + arg2 ++ {name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS", commutative: true, typ: "Float32"}, // -(arg0 * arg1) - arg2 + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS", typ: "Float32"}, // sqrt(arg0) + {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS", typ: "Float32"}, // -arg0 + {name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index e838a26f79..5af047c38f 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2418,6 +2418,10 @@ const ( + OpRISCV64FSUBS + OpRISCV64FMULS + OpRISCV64FDIVS ++ OpRISCV64FMADDS ++ OpRISCV64FMSUBS ++ OpRISCV64FNMADDS ++ OpRISCV64FNMSUBS + OpRISCV64FSQRTS + OpRISCV64FNEGS + OpRISCV64FMVSX +@@ -32391,6 +32395,70 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "FMADDS", ++ argLen: 3, ++ commutative: true, ++ asm: riscv.AFMADDS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 
F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FMSUBS", ++ argLen: 3, ++ commutative: true, ++ asm: riscv.AFMSUBS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FNMADDS", ++ argLen: 3, ++ commutative: true, ++ asm: riscv.AFNMADDS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FNMSUBS", ++ argLen: 3, ++ commutative: true, ++ asm: riscv.AFNMSUBS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + { + name: "FSQRTS", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 17af023db3..0ad6433bf4 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -442,16 +442,28 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64ANDI(v) + case OpRISCV64FADDD: + return rewriteValueRISCV64_OpRISCV64FADDD(v) ++ case OpRISCV64FADDS: ++ return rewriteValueRISCV64_OpRISCV64FADDS(v) + case OpRISCV64FMADDD: + return rewriteValueRISCV64_OpRISCV64FMADDD(v) ++ case OpRISCV64FMADDS: ++ return 
rewriteValueRISCV64_OpRISCV64FMADDS(v) + case OpRISCV64FMSUBD: + return rewriteValueRISCV64_OpRISCV64FMSUBD(v) ++ case OpRISCV64FMSUBS: ++ return rewriteValueRISCV64_OpRISCV64FMSUBS(v) + case OpRISCV64FNMADDD: + return rewriteValueRISCV64_OpRISCV64FNMADDD(v) ++ case OpRISCV64FNMADDS: ++ return rewriteValueRISCV64_OpRISCV64FNMADDS(v) + case OpRISCV64FNMSUBD: + return rewriteValueRISCV64_OpRISCV64FNMSUBD(v) ++ case OpRISCV64FNMSUBS: ++ return rewriteValueRISCV64_OpRISCV64FNMSUBS(v) + case OpRISCV64FSUBD: + return rewriteValueRISCV64_OpRISCV64FSUBD(v) ++ case OpRISCV64FSUBS: ++ return rewriteValueRISCV64_OpRISCV64FSUBS(v) + case OpRISCV64MOVBUload: + return rewriteValueRISCV64_OpRISCV64MOVBUload(v) + case OpRISCV64MOVBUreg: +@@ -3364,6 +3376,31 @@ func rewriteValueRISCV64_OpRISCV64FADDD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FADDS(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FADDS a (FMULS x y)) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FMADDS x y a) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ a := v_0 ++ if v_1.Op != OpRISCV64FMULS { ++ continue ++ } ++ y := v_1.Args[1] ++ x := v_1.Args[0] ++ if !(a.Block.Func.useFMA(v)) { ++ continue ++ } ++ v.reset(OpRISCV64FMADDS) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ break ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FMADDD(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] +@@ -3409,6 +3446,51 @@ func rewriteValueRISCV64_OpRISCV64FMADDD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FMADDS(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FMADDS neg:(FNEGS x) y z) ++ // cond: neg.Uses == 1 ++ // result: (FNMSUBS x y z) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ neg := v_0 ++ if neg.Op != OpRISCV64FNEGS { ++ continue ++ } ++ x := neg.Args[0] ++ y := v_1 ++ z := v_2 ++ if !(neg.Uses == 1) { ++ continue ++ } ++ v.reset(OpRISCV64FNMSUBS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ break ++ } ++ // match: (FMADDS x y neg:(FNEGS z)) ++ // cond: neg.Uses == 1 ++ // result: (FMSUBS x y z) ++ for { ++ x := v_0 ++ y := v_1 ++ neg := v_2 ++ if neg.Op != OpRISCV64FNEGS { ++ break ++ } ++ z := neg.Args[0] ++ if !(neg.Uses == 1) { ++ break ++ } ++ v.reset(OpRISCV64FMSUBS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FMSUBD(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] +@@ -3454,6 +3536,51 @@ func rewriteValueRISCV64_OpRISCV64FMSUBD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FMSUBS(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FMSUBS neg:(FNEGS x) y z) ++ // cond: neg.Uses == 1 ++ // result: (FNMADDS x y z) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ neg := v_0 ++ if neg.Op != OpRISCV64FNEGS { ++ continue ++ } ++ x := neg.Args[0] ++ y := v_1 ++ z := v_2 ++ if !(neg.Uses == 1) { ++ continue ++ } ++ v.reset(OpRISCV64FNMADDS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ break ++ } ++ // match: (FMSUBS x y neg:(FNEGS z)) ++ // cond: neg.Uses == 1 ++ // result: (FMADDS x y z) ++ for { ++ x := v_0 ++ y := v_1 ++ neg := v_2 ++ if neg.Op != OpRISCV64FNEGS { ++ break ++ } ++ z := neg.Args[0] ++ if !(neg.Uses == 1) { ++ break ++ } ++ v.reset(OpRISCV64FMADDS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FNMADDD(v *Value) bool { + v_2 := 
v.Args[2] + v_1 := v.Args[1] +@@ -3499,6 +3626,51 @@ func rewriteValueRISCV64_OpRISCV64FNMADDD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FNMADDS(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FNMADDS neg:(FNEGS x) y z) ++ // cond: neg.Uses == 1 ++ // result: (FMSUBS x y z) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ neg := v_0 ++ if neg.Op != OpRISCV64FNEGS { ++ continue ++ } ++ x := neg.Args[0] ++ y := v_1 ++ z := v_2 ++ if !(neg.Uses == 1) { ++ continue ++ } ++ v.reset(OpRISCV64FMSUBS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ break ++ } ++ // match: (FNMADDS x y neg:(FNEGS z)) ++ // cond: neg.Uses == 1 ++ // result: (FNMSUBS x y z) ++ for { ++ x := v_0 ++ y := v_1 ++ neg := v_2 ++ if neg.Op != OpRISCV64FNEGS { ++ break ++ } ++ z := neg.Args[0] ++ if !(neg.Uses == 1) { ++ break ++ } ++ v.reset(OpRISCV64FNMSUBS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FNMSUBD(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] +@@ -3544,6 +3716,51 @@ func rewriteValueRISCV64_OpRISCV64FNMSUBD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FNMSUBS(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FNMSUBS neg:(FNEGS x) y z) ++ // cond: neg.Uses == 1 ++ // result: (FMADDS x y z) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ neg := v_0 ++ if neg.Op != OpRISCV64FNEGS { ++ continue ++ } ++ x := neg.Args[0] ++ y := v_1 ++ z := v_2 ++ if !(neg.Uses == 1) { ++ continue ++ } ++ v.reset(OpRISCV64FMADDS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ break ++ } ++ // match: (FNMSUBS x y neg:(FNEGS z)) ++ // cond: neg.Uses == 1 ++ // result: (FNMADDS x y z) ++ for { ++ x := v_0 ++ y := v_1 ++ neg := v_2 ++ if neg.Op != OpRISCV64FNEGS { ++ break ++ } ++ z := neg.Args[0] ++ if !(neg.Uses == 1) { ++ break ++ } ++ v.reset(OpRISCV64FNMADDS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FSUBD(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -3583,6 +3800,45 @@ func rewriteValueRISCV64_OpRISCV64FSUBD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FSUBS(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FSUBS a (FMULS x y)) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FNMSUBS x y a) ++ for { ++ a := v_0 ++ if v_1.Op != OpRISCV64FMULS { ++ break ++ } ++ y := v_1.Args[1] ++ x := v_1.Args[0] ++ if !(a.Block.Func.useFMA(v)) { ++ break ++ } ++ v.reset(OpRISCV64FNMSUBS) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ // match: (FSUBS (FMULS x y) a) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FMSUBS x y a) ++ for { ++ if v_0.Op != OpRISCV64FMULS { ++ break ++ } ++ y := v_0.Args[1] ++ x := v_0.Args[0] ++ a := v_1 ++ if !(a.Block.Func.useFMA(v)) { ++ break ++ } ++ v.reset(OpRISCV64FMSUBS) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64MOVBUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/test/codegen/floats.go b/test/codegen/floats.go +index 1c5fc8a31a..7991174b66 100644 +--- a/test/codegen/floats.go ++++ b/test/codegen/floats.go +@@ -70,17 +70,20 @@ func FusedAdd32(x, y, z float32) float32 { + // s390x:"FMADDS\t" + // ppc64x:"FMADDS\t" + // arm64:"FMADDS" ++ // riscv64:"FMADDS\t" + return x*y + z + } + + func FusedSub32_a(x, y, z float32) float32 { + // s390x:"FMSUBS\t" + // 
ppc64x:"FMSUBS\t" ++ // riscv64:"FMSUBS\t" + return x*y - z + } + + func FusedSub32_b(x, y, z float32) float32 { + // arm64:"FMSUBS" ++ // riscv64:"FNMSUBS\t" + return z - x*y + } + +-- +2.39.5 + diff --git a/2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch b/2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch new file mode 100644 index 0000000..9b7f6a0 --- /dev/null +++ b/2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch @@ -0,0 +1,180 @@ +From 40c441967b1e581a63ab416802def74d5af22c5d Mon Sep 17 00:00:00 2001 +From: Julian Zhu +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 006/119] [NOT FULL BACKPORT] cmd/internal/obj/riscv,cmd/link: + add support for internal cgo linking on riscv64 + +--- + src/cmd/internal/obj/riscv/cpu.go | 18 +++++-- + src/cmd/internal/obj/riscv/obj.go | 82 +++++++++++++++++++++++++++++-- + 2 files changed, 92 insertions(+), 8 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index dde1231e15..bfd5153da4 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -619,14 +619,26 @@ var unaryDst = map[obj.As]bool{ + + // Instruction encoding masks. + const ( +- // JTypeImmMask is a mask including only the immediate portion of +- // J-type instructions. +- JTypeImmMask = 0xfffff000 ++ // BTypeImmMask is a mask including only the immediate portion of ++ // B-type instructions. ++ BTypeImmMask = 0xfe000f80 ++ ++ // CBTypeImmMask is a mask including only the immediate portion of ++ // CB-type instructions. ++ CBTypeImmMask = 0x1c7c ++ ++ // CJTypeImmMask is a mask including only the immediate portion of ++ // CJ-type instructions. ++ CJTypeImmMask = 0x1f7c + + // ITypeImmMask is a mask including only the immediate portion of + // I-type instructions. + ITypeImmMask = 0xfff00000 + ++ // JTypeImmMask is a mask including only the immediate portion of ++ // J-type instructions. ++ JTypeImmMask = 0xfffff000 ++ + // STypeImmMask is a mask including only the immediate portion of + // S-type instructions. + STypeImmMask = 0xfe000f80 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 2e55fac812..776c3a8df6 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1181,6 +1181,12 @@ func validateRaw(ctxt *obj.Link, ins *instruction) { + } + } + ++// extractBitAndShift extracts the specified bit from the given immediate, ++// before shifting it to the requested position and returning it. ++func extractBitAndShift(imm uint32, bit, pos int) uint32 { ++ return ((imm >> bit) & 1) << pos ++} ++ + // encodeR encodes an R-type RISC-V instruction. + func encodeR(as obj.As, rs1, rs2, rd, funct3, funct7 uint32) uint32 { + enc := encode(as) +@@ -1272,6 +1278,11 @@ func encodeSF(ins *instruction) uint32 { + return encodeS(ins.as, regI(ins.rd), regF(ins.rs1), uint32(ins.imm)) + } + ++// encodeBImmediate encodes an immediate for a B-type RISC-V instruction. ++func encodeBImmediate(imm uint32) uint32 { ++ return (imm>>12)<<31 | ((imm>>5)&0x3f)<<25 | ((imm>>1)&0xf)<<8 | ((imm>>11)&0x1)<<7 ++} ++ + // encodeB encodes a B-type RISC-V instruction. 
+ func encodeB(ins *instruction) uint32 { + imm := immI(ins.as, ins.imm, 13) +@@ -1281,7 +1292,7 @@ func encodeB(ins *instruction) uint32 { + if enc == nil { + panic("encodeB: could not encode instruction") + } +- return (imm>>12)<<31 | ((imm>>5)&0x3f)<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | ((imm>>1)&0xf)<<8 | ((imm>>11)&0x1)<<7 | enc.opcode ++ return encodeBImmediate(imm) | rs2<<20 | rs1<<15 | enc.funct3<<12 | enc.opcode + } + + // encodeU encodes a U-type RISC-V instruction. +@@ -1315,6 +1326,37 @@ func encodeJ(ins *instruction) uint32 { + return encodeJImmediate(imm) | rd<<7 | enc.opcode + } + ++// encodeCBImmediate encodes an immediate for a CB-type RISC-V instruction. ++func encodeCBImmediate(imm uint32) uint32 { ++ // Bit order - [8|4:3|7:6|2:1|5] ++ bits := extractBitAndShift(imm, 8, 7) ++ bits |= extractBitAndShift(imm, 4, 6) ++ bits |= extractBitAndShift(imm, 3, 5) ++ bits |= extractBitAndShift(imm, 7, 4) ++ bits |= extractBitAndShift(imm, 6, 3) ++ bits |= extractBitAndShift(imm, 2, 2) ++ bits |= extractBitAndShift(imm, 1, 1) ++ bits |= extractBitAndShift(imm, 5, 0) ++ return (bits>>5)<<10 | (bits&0x1f)<<2 ++} ++ ++// encodeCJImmediate encodes an immediate for a CJ-type RISC-V instruction. ++func encodeCJImmediate(imm uint32) uint32 { ++ // Bit order - [11|4|9:8|10|6|7|3:1|5] ++ bits := extractBitAndShift(imm, 11, 10) ++ bits |= extractBitAndShift(imm, 4, 9) ++ bits |= extractBitAndShift(imm, 9, 8) ++ bits |= extractBitAndShift(imm, 8, 7) ++ bits |= extractBitAndShift(imm, 10, 6) ++ bits |= extractBitAndShift(imm, 6, 5) ++ bits |= extractBitAndShift(imm, 7, 4) ++ bits |= extractBitAndShift(imm, 3, 3) ++ bits |= extractBitAndShift(imm, 2, 2) ++ bits |= extractBitAndShift(imm, 1, 1) ++ bits |= extractBitAndShift(imm, 5, 0) ++ return bits << 2 ++} ++ + func encodeRawIns(ins *instruction) uint32 { + // Treat the raw value specially as a 32-bit unsigned integer. + // Nobody wants to enter negative machine code. 
+@@ -1324,14 +1366,34 @@ func encodeRawIns(ins *instruction) uint32 { + return uint32(ins.imm) + } + +-func EncodeJImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 21) { +- return 0, fmt.Errorf("immediate %#x does not fit in 21 bits", imm) ++func EncodeBImmediate(imm int64) (int64, error) { ++ if !immIFits(imm, 13) { ++ return 0, fmt.Errorf("immediate %#x does not fit in 13 bits", imm) + } + if imm&1 != 0 { + return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) + } +- return int64(encodeJImmediate(uint32(imm))), nil ++ return int64(encodeBImmediate(uint32(imm))), nil ++} ++ ++func EncodeCBImmediate(imm int64) (int64, error) { ++ if !immIFits(imm, 9) { ++ return 0, fmt.Errorf("immediate %#x does not fit in 9 bits", imm) ++ } ++ if imm&1 != 0 { ++ return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ } ++ return int64(encodeCBImmediate(uint32(imm))), nil ++} ++ ++func EncodeCJImmediate(imm int64) (int64, error) { ++ if !immIFits(imm, 12) { ++ return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) ++ } ++ if imm&1 != 0 { ++ return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ } ++ return int64(encodeCJImmediate(uint32(imm))), nil + } + + func EncodeIImmediate(imm int64) (int64, error) { +@@ -1341,6 +1403,16 @@ func EncodeIImmediate(imm int64) (int64, error) { + return imm << 20, nil + } + ++func EncodeJImmediate(imm int64) (int64, error) { ++ if !immIFits(imm, 21) { ++ return 0, fmt.Errorf("immediate %#x does not fit in 21 bits", imm) ++ } ++ if imm&1 != 0 { ++ return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ } ++ return int64(encodeJImmediate(uint32(imm))), nil ++} ++ + func EncodeSImmediate(imm int64) (int64, error) { + if !immIFits(imm, 12) { + return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) +-- +2.39.5 + diff --git a/2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch b/2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch new file mode 100644 index 0000000..1a725e2 --- /dev/null +++ b/2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch @@ -0,0 +1,41 @@ +From b74c6eef59b684a8c9b65084399050aaaa6ac162 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 007/119] cmd/internal/obj/riscv: clean up error checking for + encoding + +Replace a "fixme" with a more appropriate error. Also invert the condition +so that the error returns early, which is more Go idiomatic. 
+ +Change-Id: I03006572c4010fb47037bed3ee1fd7f92bfc20d3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/523457 +TryBot-Result: Gopher Robot +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Run-TryBot: Joel Sing +Reviewed-by: M Zhuo +--- + src/cmd/internal/obj/riscv/obj.go | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 776c3a8df6..4a386eb1fc 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1722,10 +1722,10 @@ func (ins *instruction) encode() (uint32, error) { + if err != nil { + return 0, err + } +- if enc.length > 0 { +- return enc.encode(ins), nil ++ if enc.length <= 0 { ++ return 0, fmt.Errorf("%v: encoding called for a pseudo instruction", ins.as) + } +- return 0, fmt.Errorf("fixme") ++ return enc.encode(ins), nil + } + + func (ins *instruction) length() int { +-- +2.39.5 + diff --git a/2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch b/2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch new file mode 100644 index 0000000..b01e6ea --- /dev/null +++ b/2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch @@ -0,0 +1,34 @@ +From 0d075e31f49e99beab462ae9115f6a6438e38b61 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 008/119] cmd/internal/obj/riscv: correct message in regVal + panic + +Change-Id: I68be4110216145ad1fb2e5095e1f2b143f9e69ac +Reviewed-on: https://go-review.googlesource.com/c/go/+/523456 +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +TryBot-Result: Gopher Robot +Reviewed-by: Mark Ryan +Reviewed-by: M Zhuo +Run-TryBot: Joel Sing +--- + src/cmd/internal/obj/riscv/obj.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 4a386eb1fc..cf80c82f79 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -973,7 +973,7 @@ func Split32BitImmediate(imm int64) (low, high int64, err error) { + + func regVal(r, min, max uint32) uint32 { + if r < min || r > max { +- panic(fmt.Sprintf("register out of range, want %d < %d < %d", min, r, max)) ++ panic(fmt.Sprintf("register out of range, want %d <= %d <= %d", min, r, max)) + } + return r - min + } +-- +2.39.5 + diff --git a/2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch b/2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch new file mode 100644 index 0000000..f291cae --- /dev/null +++ b/2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch @@ -0,0 +1,56 @@ +From 6f5dfd0c04b3433056eea4d6193a1b04423dd43f Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 009/119] cmd/internal/obj/riscv: simplify instructionsForMOV + +Rather than handling shift based scaling in two locations, rework logic +so there is a single exit path. 
+ +Change-Id: I832b4932d53183736050059a11019ced08281b3b +Reviewed-on: https://go-review.googlesource.com/c/go/+/523455 +Reviewed-by: M Zhuo +Reviewed-by: Cherry Mui +Run-TryBot: Joel Sing +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Mark Ryan +TryBot-Result: Gopher Robot +--- + src/cmd/internal/obj/riscv/obj.go | 21 ++++++++------------- + 1 file changed, 8 insertions(+), 13 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index cf80c82f79..7b5621f650 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1992,20 +1992,15 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + ins.as, ins.rs1, ins.rs2, ins.imm = AADDI, REG_ZERO, obj.REG_NONE, low + + // LUI is only necessary if the constant does not fit in 12 bits. +- if high == 0 { +- if insSLLI != nil { +- inss = append(inss, insSLLI) ++ if high != 0 { ++ // LUI top20bits(c), R ++ // ADD bottom12bits(c), R, R ++ insLUI := &instruction{as: ALUI, rd: ins.rd, imm: high} ++ inss = []*instruction{insLUI} ++ if low != 0 { ++ ins.as, ins.rs1 = AADDIW, ins.rd ++ inss = append(inss, ins) + } +- break +- } +- +- // LUI top20bits(c), R +- // ADD bottom12bits(c), R, R +- insLUI := &instruction{as: ALUI, rd: ins.rd, imm: high} +- inss = []*instruction{insLUI} +- if low != 0 { +- ins.as, ins.rs1 = AADDIW, ins.rd +- inss = append(inss, ins) + } + if insSLLI != nil { + inss = append(inss, insSLLI) +-- +2.39.5 + diff --git a/2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch b/2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch new file mode 100644 index 0000000..97a69b7 --- /dev/null +++ b/2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch @@ -0,0 +1,34 @@ +From 8b2675740c122a84bf74d387aaa595d76d5b3192 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 010/119] internal/cpu: fix wrong cache line size of riscv64 + +All of riscv CPU using 64B for cache-line size. +i.e. U540 of Hifive Unleashed (https://www.sifive.com/boards/hifive-unleashed) + +Change-Id: I0d72d88ac026f45383c3b3eb3a77233d3c2e4004 +Reviewed-on: https://go-review.googlesource.com/c/go/+/526659 +Run-TryBot: M Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Heschi Kreinick +TryBot-Result: Gopher Robot +--- + src/internal/cpu/cpu_riscv64.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/internal/cpu/cpu_riscv64.go b/src/internal/cpu/cpu_riscv64.go +index 54b8c3378b..2173fe8886 100644 +--- a/src/internal/cpu/cpu_riscv64.go ++++ b/src/internal/cpu/cpu_riscv64.go +@@ -4,7 +4,7 @@ + + package cpu + +-const CacheLinePadSize = 32 ++const CacheLinePadSize = 64 + + func doinit() { + } +-- +2.39.5 + diff --git a/2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch b/2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch new file mode 100644 index 0000000..9d56a30 --- /dev/null +++ b/2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch @@ -0,0 +1,206 @@ +From e7c39e53ac0c6aeb715ddeb724f4324cecc19ef5 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 011/119] cmd/internal/obj/riscv: clean up immediate checking + +Change immIFits to return an error in the case that it does not fit. +This allows for deduplication and consistency of error messages. +Additionally, since we've already calculated the min and max values, +we can easily include these in the message. Also provide and use +immEven, for the same reasons. 
+ +Change-Id: Ie680558744f3e9bc19d6913c4144ce9ddbd0429c +Reviewed-on: https://go-review.googlesource.com/c/go/+/523458 +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: M Zhuo +Reviewed-by: Matthew Dempsky +--- + src/cmd/internal/obj/riscv/obj.go | 93 ++++++++++++++++++------------- + 1 file changed, 54 insertions(+), 39 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 7b5621f650..ab41e53b8c 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -941,12 +941,12 @@ func signExtend(val int64, bit uint) int64 { + // result. For example, high may be used in LUI and low in a following ADDI to + // generate a full 32-bit constant. + func Split32BitImmediate(imm int64) (low, high int64, err error) { +- if !immIFits(imm, 32) { +- return 0, 0, fmt.Errorf("immediate does not fit in 32 bits: %d", imm) ++ if err := immIFits(imm, 32); err != nil { ++ return 0, 0, err + } + + // Nothing special needs to be done if the immediate fits in 12 bits. +- if immIFits(imm, 12) { ++ if err := immIFits(imm, 12); err == nil { + return imm, 0, nil + } + +@@ -1006,26 +1006,41 @@ func regFAddr(a obj.Addr) uint32 { + return regAddr(a, REG_F0, REG_F31) + } + +-// immIFits reports whether immediate value x fits in nbits bits +-// as a signed integer. +-func immIFits(x int64, nbits uint) bool { ++// immEven checks that the immediate is a multiple of two. If it ++// is not, an error is returned. ++func immEven(x int64) error { ++ if x&1 != 0 { ++ return fmt.Errorf("immediate %#x is not a multiple of two", x) ++ } ++ return nil ++} ++ ++// immIFits checks whether the immediate value x fits in nbits bits ++// as a signed integer. If it does not, an error is returned. ++func immIFits(x int64, nbits uint) error { + nbits-- +- var min int64 = -1 << nbits +- var max int64 = 1< max { ++ if nbits <= 16 { ++ return fmt.Errorf("signed immediate %d must be in range [%d, %d] (%d bits)", x, min, max, nbits) ++ } ++ return fmt.Errorf("signed immediate %#x must be in range [%#x, %#x] (%d bits)", x, min, max, nbits) ++ } ++ return nil + } + + // immI extracts the signed integer of the specified size from an immediate. + func immI(as obj.As, imm int64, nbits uint) uint32 { +- if !immIFits(imm, nbits) { +- panic(fmt.Sprintf("%v: signed immediate %d cannot fit in %d bits", as, imm, nbits)) ++ if err := immIFits(imm, nbits); err != nil { ++ panic(fmt.Sprintf("%v: %v", as, err)) + } + return uint32(imm) + } + + func wantImmI(ctxt *obj.Link, as obj.As, imm int64, nbits uint) { +- if !immIFits(imm, nbits) { +- ctxt.Diag("%v: signed immediate %d cannot be larger than %d bits", as, imm, nbits) ++ if err := immIFits(imm, nbits); err != nil { ++ ctxt.Diag("%v: %v", as, err) + } + } + +@@ -1057,8 +1072,8 @@ func wantFloatReg(ctxt *obj.Link, as obj.As, pos string, r uint32) { + + // wantEvenOffset checks that the offset is a multiple of two. 
+ func wantEvenOffset(ctxt *obj.Link, as obj.As, offset int64) { +- if offset%1 != 0 { +- ctxt.Diag("%v: jump offset %d must be a multiple of two", as, offset) ++ if err := immEven(offset); err != nil { ++ ctxt.Diag("%v: %v", as, err) + } + } + +@@ -1367,62 +1382,62 @@ func encodeRawIns(ins *instruction) uint32 { + } + + func EncodeBImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 13) { +- return 0, fmt.Errorf("immediate %#x does not fit in 13 bits", imm) ++ if err := immIFits(imm, 13); err != nil { ++ return 0, err + } +- if imm&1 != 0 { +- return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ if err := immEven(imm); err != nil { ++ return 0, err + } + return int64(encodeBImmediate(uint32(imm))), nil + } + + func EncodeCBImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 9) { +- return 0, fmt.Errorf("immediate %#x does not fit in 9 bits", imm) ++ if err := immIFits(imm, 9); err != nil { ++ return 0, err + } +- if imm&1 != 0 { +- return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ if err := immEven(imm); err != nil { ++ return 0, err + } + return int64(encodeCBImmediate(uint32(imm))), nil + } + + func EncodeCJImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 12) { +- return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) ++ if err := immIFits(imm, 12); err != nil { ++ return 0, err + } +- if imm&1 != 0 { +- return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ if err := immEven(imm); err != nil { ++ return 0, err + } + return int64(encodeCJImmediate(uint32(imm))), nil + } + + func EncodeIImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 12) { +- return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) ++ if err := immIFits(imm, 12); err != nil { ++ return 0, err + } + return imm << 20, nil + } + + func EncodeJImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 21) { +- return 0, fmt.Errorf("immediate %#x does not fit in 21 bits", imm) ++ if err := immIFits(imm, 21); err != nil { ++ return 0, err + } +- if imm&1 != 0 { +- return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ if err := immEven(imm); err != nil { ++ return 0, err + } + return int64(encodeJImmediate(uint32(imm))), nil + } + + func EncodeSImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 12) { +- return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) ++ if err := immIFits(imm, 12); err != nil { ++ return 0, err + } + return ((imm >> 5) << 25) | ((imm & 0x1f) << 7), nil + } + + func EncodeUImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 20) { +- return 0, fmt.Errorf("immediate %#x does not fit in 20 bits", imm) ++ if err := immIFits(imm, 20); err != nil { ++ return 0, err + } + return imm << 12, nil + } +@@ -1974,9 +1989,9 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + // MOV $1, X10 + // SLLI $63, X10, X10 + var insSLLI *instruction +- if !immIFits(ins.imm, 32) { ++ if err := immIFits(ins.imm, 32); err != nil { + ctz := bits.TrailingZeros64(uint64(ins.imm)) +- if immIFits(ins.imm>>ctz, 32) { ++ if err := immIFits(ins.imm>>ctz, 32); err == nil { + ins.imm = ins.imm >> ctz + insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(ctz)} + } +-- +2.39.5 + diff --git a/2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch b/2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch new file mode 100644 index 0000000..9e690b0 --- /dev/null +++ b/2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch @@ -0,0 +1,145 @@ 
+From b0843e50edcaff87cfd59e6f70433faf00321bc9 Mon Sep 17 00:00:00 2001 +From: Xianmiao Qu +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 012/119] cmd/compile/internal: intrinsify publicationBarrier + on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This enables publicationBarrier to be used as an intrinsic +on riscv64, optimizing the required function call and return +instructions for invoking the "runtime.publicationBarrier" +function. + +This function is called by mallocgc. The benchmark results for malloc tested on Lichee-Pi-4A(TH1520, RISC-V 2.0G C910 x4) are as follows. + +goos: linux +goarch: riscv64 +pkg: runtime + │ old.txt │ new.txt │ + │ sec/op │ sec/op vs base │ +Malloc8-4 92.78n ± 1% 90.77n ± 1% -2.17% (p=0.001 n=10) +Malloc16-4 156.5n ± 1% 151.7n ± 2% -3.10% (p=0.000 n=10) +MallocTypeInfo8-4 131.7n ± 1% 130.6n ± 2% ~ (p=0.165 n=10) +MallocTypeInfo16-4 186.5n ± 2% 186.2n ± 1% ~ (p=0.956 n=10) +MallocLargeStruct-4 1.345µ ± 1% 1.355µ ± 1% ~ (p=0.093 n=10) +geomean 216.9n 214.5n -1.10% + + +Change-Id: Ieab6c02309614bac5c1b12b5ee3311f988ff644d +Reviewed-on: https://go-review.googlesource.com/c/go/+/531719 +Reviewed-by: Michael Pratt +Auto-Submit: Michael Pratt +Reviewed-by: Cherry Mui +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Joel Sing +--- + src/cmd/compile/internal/riscv64/ssa.go | 4 ++++ + src/cmd/compile/internal/ssa/_gen/RISCV64.rules | 3 +++ + src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go | 3 +++ + src/cmd/compile/internal/ssa/opGen.go | 8 ++++++++ + src/cmd/compile/internal/ssa/rewriteRISCV64.go | 3 +++ + src/cmd/compile/internal/ssagen/ssa.go | 2 +- + 6 files changed, 22 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index f8cf786920..1100878794 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -695,6 +695,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Sym = ir.Syms.Duffcopy + p.To.Offset = v.AuxInt + ++ case ssa.OpRISCV64LoweredPubBarrier: ++ // FENCE ++ s.Prog(v.Op.Asm()) ++ + case ssa.OpRISCV64LoweredRound32F, ssa.OpRISCV64LoweredRound64F: + // input is already rounded + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index e0bf00d45d..e498218c60 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -412,6 +412,9 @@ + // Write barrier. + (WB ...) => (LoweredWB ...) + ++// Publication barrier as intrinsic ++(PubBarrier ...) => (LoweredPubBarrier ...) ++ + (PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) + (PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) + (PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 317e9150c9..741769f036 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -399,6 +399,9 @@ func init() { + // Returns a pointer to a write barrier buffer in X24. + {name: "LoweredWB", argLength: 1, reg: regInfo{clobbers: (callerSave &^ (gpMask | regNamed["g"])) | regNamed["X1"], outputs: []regMask{regNamed["X24"]}}, clobberFlags: true, aux: "Int64"}, + ++ // Do data barrier. 
arg0=memorys ++ {name: "LoweredPubBarrier", argLength: 1, asm: "FENCE", hasSideEffects: true}, ++ + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 5af047c38f..1e99c2bc07 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2411,6 +2411,7 @@ const ( + OpRISCV64LoweredGetCallerSP + OpRISCV64LoweredGetCallerPC + OpRISCV64LoweredWB ++ OpRISCV64LoweredPubBarrier + OpRISCV64LoweredPanicBoundsA + OpRISCV64LoweredPanicBoundsB + OpRISCV64LoweredPanicBoundsC +@@ -32301,6 +32302,13 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "LoweredPubBarrier", ++ argLen: 1, ++ hasSideEffects: true, ++ asm: riscv.AFENCE, ++ reg: regInfo{}, ++ }, + { + name: "LoweredPanicBoundsA", + auxType: auxInt64, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 0ad6433bf4..1ca03a58a9 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -432,6 +432,9 @@ func rewriteValueRISCV64(v *Value) bool { + return true + case OpPanicBounds: + return rewriteValueRISCV64_OpPanicBounds(v) ++ case OpPubBarrier: ++ v.Op = OpRISCV64LoweredPubBarrier ++ return true + case OpRISCV64ADD: + return rewriteValueRISCV64_OpRISCV64ADD(v) + case OpRISCV64ADDI: +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index d243ebb7cd..cc70dc8f7d 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -4108,7 +4108,7 @@ func InitTables() { + s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem()) + return nil + }, +- sys.ARM64, sys.PPC64) ++ sys.ARM64, sys.PPC64, sys.RISCV64) + + brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} + if buildcfg.GOPPC64 >= 10 { +-- +2.39.5 + diff --git a/2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch b/2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch new file mode 100644 index 0000000..c670ed2 --- /dev/null +++ b/2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch @@ -0,0 +1,117 @@ +From 1d9850b6b89b6a5ca5558bfc44b1c1ff7777b4ca Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 013/119] cmd/compile/internal: stop lowering OpConvert on + riscv64 + +Lowering for OpConvert was removed for all architectures in CL#108496, +prior to the riscv64 port being upstreamed. Remove lowering of OpConvert +on riscv64, which brings it inline with all other architectures. This +results in 1,600+ instructions being removed from the riscv64 go binary. 
+ +Change-Id: Iaaf1f8b397875926604048b66ad8ac91a98c871e +Reviewed-on: https://go-review.googlesource.com/c/go/+/533335 +Run-TryBot: Joel Sing +Reviewed-by: Cherry Mui +TryBot-Result: Gopher Robot +Reviewed-by: Michael Pratt +--- + src/cmd/compile/internal/riscv64/ssa.go | 2 +- + src/cmd/compile/internal/ssa/_gen/RISCV64.rules | 2 -- + src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go | 6 ------ + src/cmd/compile/internal/ssa/opGen.go | 14 -------------- + src/cmd/compile/internal/ssa/rewriteRISCV64.go | 3 --- + 5 files changed, 1 insertion(+), 26 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 1100878794..182cd8690e 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -193,7 +193,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + // input args need no code + case ssa.OpPhi: + ssagen.CheckLoweredPhi(v) +- case ssa.OpCopy, ssa.OpRISCV64MOVconvert, ssa.OpRISCV64MOVDreg: ++ case ssa.OpCopy, ssa.OpRISCV64MOVDreg: + if v.Type.IsMemory() { + return + } +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index e498218c60..031c68c8a0 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -396,8 +396,6 @@ + (ADD ptr (MOVDconst [s-moveSize(t.Alignment(), config)])) + mem) + +-(Convert ...) => (MOVconvert ...) +- + // Checks + (IsNonNil ...) => (SNEZ ...) + (IsInBounds ...) => (Less64U ...) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 741769f036..e8194be1df 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -231,12 +231,6 @@ func init() { + {name: "SLTU", argLength: 2, reg: gp21, asm: "SLTU"}, // arg0 < arg1, unsigned, result is 0 or 1 + {name: "SLTIU", argLength: 1, reg: gp11, asm: "SLTIU", aux: "Int64"}, // arg0 < auxint, unsigned, result is 0 or 1 + +- // MOVconvert converts between pointers and integers. +- // We have a special op for this so as to not confuse GC +- // (particularly stack maps). It takes a memory arg so it +- // gets correctly ordered with respect to GC safepoints. +- {name: "MOVconvert", argLength: 2, reg: gp11, asm: "MOV"}, // arg0, but converted to int/ptr as appropriate; arg1=mem +- + // Round ops to block fused-multiply-add extraction. 
+ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true}, + {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true}, +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 1e99c2bc07..14453a4532 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2381,7 +2381,6 @@ const ( + OpRISCV64SLTI + OpRISCV64SLTU + OpRISCV64SLTIU +- OpRISCV64MOVconvert + OpRISCV64LoweredRound32F + OpRISCV64LoweredRound64F + OpRISCV64CALLstatic +@@ -31910,19 +31909,6 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, +- { +- name: "MOVconvert", +- argLen: 2, +- asm: riscv.AMOV, +- reg: regInfo{ +- inputs: []inputInfo{ +- {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +- }, +- outputs: []outputInfo{ +- {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +- }, +- }, +- }, + { + name: "LoweredRound32F", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 1ca03a58a9..e71102d27e 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -132,9 +132,6 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpConstBool(v) + case OpConstNil: + return rewriteValueRISCV64_OpConstNil(v) +- case OpConvert: +- v.Op = OpRISCV64MOVconvert +- return true + case OpCopysign: + v.Op = OpRISCV64FSGNJD + return true +-- +2.39.5 + diff --git a/2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch b/2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch new file mode 100644 index 0000000..c6c5189 --- /dev/null +++ b/2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch @@ -0,0 +1,558 @@ +From 1e9c0a4876d93e28c0f078d5f9194628c4ed0470 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 014/119] cmd/compile: optimize right shifts of uint32 on riscv +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The compiler is currently zero extending 32 bit unsigned integers to +64 bits before right shifting them using a 64 bit shift instruction. +There's no need to do this as RISC-V has instructions for right +shifting 32 bit unsigned values (srlw and srliw) which zero extend +the result of the shift to 64 bits. Change the compiler so that +it uses srlw and srliw for 32 bit unsigned shifts reducing in most +cases the number of instructions needed to perform the shift. 
+ +Here are some examples of code sequences that are changed by this +patch: + +uint32(a) >> 2 + + before: + + sll x5,x10,0x20 + srl x10,x5,0x22 + + after: + + srlw x10,x10,0x2 + +uint32(a) >> int(b) + + before: + + sll x5,x10,0x20 + srl x5,x5,0x20 + srl x5,x5,x11 + sltiu x6,x11,64 + neg x6,x6 + and x10,x5,x6 + + after: + + srlw x5,x10,x11 + sltiu x6,x11,32 + neg x6,x6 + and x10,x5,x6 + +bits.RotateLeft32(uint32(a), 1) + + before: + + sll x5,x10,0x1 + sll x6,x10,0x20 + srl x7,x6,0x3f + or x5,x5,x7 + + after: + + sll x5,x10,0x1 + srlw x6,x10,0x1f + or x10,x5,x6 + +bits.RotateLeft32(uint32(a), int(b)) + + before: + and x6,x11,31 + sll x7,x10,x6 + sll x8,x10,0x20 + srl x8,x8,0x20 + add x6,x6,-32 + neg x6,x6 + srl x9,x8,x6 + sltiu x6,x6,64 + neg x6,x6 + and x6,x9,x6 + or x6,x6,x7 + + after: + + and x5,x11,31 + sll x6,x10,x5 + add x5,x5,-32 + neg x5,x5 + srlw x7,x10,x5 + sltiu x5,x5,32 + neg x5,x5 + and x5,x7,x5 + or x10,x6,x5 + +The one regression observed is the following case, an unbounded right +shift of a uint32 where the value we're shifting by is known to be +< 64 but > 31. As this is an unusual case this commit does not +optimize for it, although the existing code does. + +uint32(a) >> (b & 63) + + before: + + sll x5,x10,0x20 + srl x5,x5,0x20 + and x6,x11,63 + srl x10,x5,x6 + + after + + and x5,x11,63 + srlw x6,x10,x5 + sltiu x5,x5,32 + neg x5,x5 + and x10,x6,x5 + +Here we have one extra instruction. + +Some benchmark highlights, generated on a VisionFive2 8GB running +Ubuntu 23.04. + +pkg: math/bits +LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10) +LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10) +TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10) +TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10) +TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10) +OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10) +RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10) +RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10) +Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10) +ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10) +Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10) +geomean 11.50n 11.33n -1.45% + +pkg: crypto/md5 +Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10) +Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10) +Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10) +Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10) +Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10) +Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10) +Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10) +Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10) +Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10) +Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10) +Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10) +Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10) +geomean 28.32µ 26.42µ -6.72% + +Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693 +Reviewed-on: https://go-review.googlesource.com/c/go/+/528975 +Reviewed-by: Michael Pratt +Reviewed-by: Cherry Mui +Reviewed-by: Joel Sing +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +--- + src/cmd/compile/internal/riscv64/ssa.go | 4 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 18 +++-- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 14 ++-- + src/cmd/compile/internal/ssa/opGen.go | 30 +++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 65 ++++++++++++++++--- + 
test/codegen/shift.go | 21 +++--- + 6 files changed, 122 insertions(+), 30 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 182cd8690e..332f5841b7 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -278,7 +278,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = rd + case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XOR, ssa.OpRISCV64OR, ssa.OpRISCV64AND, +- ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRL, ++ ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, + ssa.OpRISCV64SLT, ssa.OpRISCV64SLTU, ssa.OpRISCV64MUL, ssa.OpRISCV64MULW, ssa.OpRISCV64MULH, + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, + ssa.OpRISCV64DIVUW, ssa.OpRISCV64REM, ssa.OpRISCV64REMU, ssa.OpRISCV64REMW, +@@ -356,7 +356,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpRISCV64ADDI, ssa.OpRISCV64ADDIW, ssa.OpRISCV64XORI, ssa.OpRISCV64ORI, ssa.OpRISCV64ANDI, +- ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRLI, ssa.OpRISCV64SLTI, ++ ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, + ssa.OpRISCV64SLTIU: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 031c68c8a0..4cacabb236 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -150,8 +150,9 @@ + (Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) + (Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) + +-// SRL only considers the bottom 6 bits of y. If y > 64, the result should +-// always be 0. See Lsh above for a detailed description. ++// SRL only considers the bottom 6 bits of y, similarly SRLW only considers the ++// bottom 5 bits of y. Ensure that the result is always zero if the shift exceeds ++// the maximum value. See Lsh above for a detailed description. 
+ (Rsh8Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt8to64 y)))) + (Rsh8Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt16to64 y)))) + (Rsh8Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt32to64 y)))) +@@ -160,10 +161,10 @@ + (Rsh16Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt16to64 y)))) + (Rsh16Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt32to64 y)))) + (Rsh16Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] y))) +-(Rsh32Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt8to64 y)))) +-(Rsh32Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt16to64 y)))) +-(Rsh32Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt32to64 y)))) +-(Rsh32Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] y))) ++(Rsh32Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) ++(Rsh32Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) ++(Rsh32Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) ++(Rsh32Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] y))) + (Rsh64Ux8 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt8to64 y)))) + (Rsh64Ux16 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt16to64 y)))) + (Rsh64Ux32 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt32to64 y)))) +@@ -705,6 +706,10 @@ + // But for now, this is enough to get rid of lots of them. + (MOVDnop (MOVDconst [c])) => (MOVDconst [c]) + ++// Avoid unnecessary zero extension when right shifting. ++(SRL (MOVWUreg x) y) => (SRLW x y) ++(SRLI [x] (MOVWUreg y)) => (SRLIW [int64(x&31)] y) ++ + // Fold constant into immediate instructions where possible. 
+ (ADD (MOVDconst [val]) x) && is32Bit(val) && !t.IsPtr() => (ADDI [val] x) + (AND (MOVDconst [val]) x) && is32Bit(val) => (ANDI [val] x) +@@ -712,6 +717,7 @@ + (XOR (MOVDconst [val]) x) && is32Bit(val) => (XORI [val] x) + (SLL x (MOVDconst [val])) => (SLLI [int64(val&63)] x) + (SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x) ++(SRLW x (MOVDconst [val])) => (SRLIW [int64(val&31)] x) + (SRA x (MOVDconst [val])) => (SRAI [int64(val&63)] x) + (SLT x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTI [val] x) + (SLTU x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTIU [val] x) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index e8194be1df..360eff6bcf 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -207,12 +207,14 @@ func init() { + {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}, resultInArg0: true}, // nop, return arg0 in same register + + // Shift ops +- {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63) +- {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed +- {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned +- {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63 +- {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63 +- {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63 ++ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63) ++ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed ++ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned ++ {name: "SRLW", argLength: 2, reg: gp21, asm: "SRLW"}, // arg0 >> (aux1 & 31), unsigned ++ {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63 ++ {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63 ++ {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63 ++ {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-31 + + // Bitwise ops + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 14453a4532..dadf1f20c7 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2365,9 +2365,11 @@ const ( + OpRISCV64SLL + OpRISCV64SRA + OpRISCV64SRL ++ OpRISCV64SRLW + OpRISCV64SLLI + OpRISCV64SRAI + OpRISCV64SRLI ++ OpRISCV64SRLIW + OpRISCV64XOR + OpRISCV64XORI + OpRISCV64OR +@@ -31685,6 +31687,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRLW", ++ argLen: 2, ++ asm: riscv.ASRLW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 
X29 X30 ++ }, ++ }, ++ }, + { + name: "SLLI", + auxType: auxInt64, +@@ -31727,6 +31743,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRLIW", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.ASRLIW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "XOR", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index e71102d27e..7d16fe887f 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -542,6 +542,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64SRL(v) + case OpRISCV64SRLI: + return rewriteValueRISCV64_OpRISCV64SRLI(v) ++ case OpRISCV64SRLW: ++ return rewriteValueRISCV64_OpRISCV64SRLW(v) + case OpRISCV64SUB: + return rewriteValueRISCV64_OpRISCV64SUB(v) + case OpRISCV64SUBW: +@@ -6290,6 +6292,20 @@ func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ // match: (SRL (MOVWUreg x) y) ++ // result: (SRLW x y) ++ for { ++ t := v.Type ++ if v_0.Op != OpRISCV64MOVWUreg { ++ break ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ v.reset(OpRISCV64SRLW) ++ v.Type = t ++ v.AddArg2(x, y) ++ return true ++ } + // match: (SRL x (MOVDconst [val])) + // result: (SRLI [int64(val&63)] x) + for { +@@ -6307,6 +6323,21 @@ func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + } + func rewriteValueRISCV64_OpRISCV64SRLI(v *Value) bool { + v_0 := v.Args[0] ++ // match: (SRLI [x] (MOVWUreg y)) ++ // result: (SRLIW [x] y) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVWUreg { ++ break ++ } ++ y := v_0.Args[0] ++ v.reset(OpRISCV64SRLIW) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(x) ++ v.AddArg(y) ++ return true ++ } + // match: (SRLI [x] (MOVDconst [y])) + // result: (MOVDconst [int64(uint64(y) >> uint32(x))]) + for { +@@ -6321,6 +6352,24 @@ func rewriteValueRISCV64_OpRISCV64SRLI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64SRLW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SRLW x (MOVDconst [val])) ++ // result: (SRLIW [int64(val&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64SRLIW) ++ v.AuxInt = int64ToAuxInt(int64(val & 31)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64SUB(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6937,7 +6986,7 @@ func rewriteValueRISCV64_OpRsh32Ux16(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux16 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt16to64 y)))) ++ // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) + for { + t := v.Type + x := v_0 +@@ -6952,7 +7001,7 @@ func rewriteValueRISCV64_OpRsh32Ux16(v *Value) bool { + v0.AddArg2(v1, y) + v2 := b.NewValue0(v.Pos, OpNeg32, t) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +@@ 
-6984,7 +7033,7 @@ func rewriteValueRISCV64_OpRsh32Ux32(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux32 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt32to64 y)))) ++ // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) + for { + t := v.Type + x := v_0 +@@ -6999,7 +7048,7 @@ func rewriteValueRISCV64_OpRsh32Ux32(v *Value) bool { + v0.AddArg2(v1, y) + v2 := b.NewValue0(v.Pos, OpNeg32, t) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +@@ -7031,7 +7080,7 @@ func rewriteValueRISCV64_OpRsh32Ux64(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux64 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] y))) ++ // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] y))) + for { + t := v.Type + x := v_0 +@@ -7046,7 +7095,7 @@ func rewriteValueRISCV64_OpRsh32Ux64(v *Value) bool { + v0.AddArg2(v1, y) + v2 := b.NewValue0(v.Pos, OpNeg32, t) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v3.AddArg(y) + v2.AddArg(v3) + v.AddArg2(v0, v2) +@@ -7076,7 +7125,7 @@ func rewriteValueRISCV64_OpRsh32Ux8(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux8 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt8to64 y)))) ++ // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) + for { + t := v.Type + x := v_0 +@@ -7091,7 +7140,7 @@ func rewriteValueRISCV64_OpRsh32Ux8(v *Value) bool { + v0.AddArg2(v1, y) + v2 := b.NewValue0(v.Pos, OpNeg32, t) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index d34ff9b428..302560d5b0 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -18,7 +18,7 @@ func lshConst64x64(v int64) int64 { + + func rshConst64Ux64(v uint64) uint64 { + // ppc64x:"SRD" +- // riscv64:"SRLI",-"AND",-"SLTIU" ++ // riscv64:"SRLI\t",-"AND",-"SLTIU" + return v >> uint64(33) + } + +@@ -36,7 +36,7 @@ func lshConst32x64(v int32) int32 { + + func rshConst32Ux64(v uint32) uint32 { + // ppc64x:"SRW" +- // riscv64:"SRLI",-"AND",-"SLTIU", -"MOVW" ++ // riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW" + return v >> uint64(29) + } + +@@ -54,7 +54,7 @@ func lshConst64x32(v int64) int64 { + + func rshConst64Ux32(v uint64) uint64 { + // ppc64x:"SRD" +- // riscv64:"SRLI",-"AND",-"SLTIU" ++ // riscv64:"SRLI\t",-"AND",-"SLTIU" + return v >> uint32(33) + } + +@@ -79,7 +79,7 @@ func lshMask64x64(v int64, s uint64) int64 { + func rshMask64Ux64(v uint64, s uint64) uint64 { + // arm64:"LSR",-"AND",-"CSEL" + // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRL",-"AND\t",-"SLTIU" ++ // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } +@@ -103,11 +103,16 @@ func lshMask32x64(v int32, s uint64) int32 { + func rshMask32Ux64(v uint32, s uint64) uint32 { + // arm64:"LSR",-"AND" + // ppc64x:"ISEL",-"ORN" +- // riscv64:"SRL",-"AND\t",-"SLTIU" ++ // riscv64:"SRLW","SLTIU","NEG","AND\t",-"SRL\t" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } + ++func rsh5Mask32Ux64(v uint32, s 
uint64) uint32 { ++ // riscv64:"SRLW",-"AND\t",-"SLTIU",-"SRL\t" ++ return v >> (s & 31) ++} ++ + func rshMask32x64(v int32, s uint64) int32 { + // arm64:"ASR",-"AND" + // ppc64x:"ISEL",-"ORN" +@@ -127,7 +132,7 @@ func lshMask64x32(v int64, s uint32) int64 { + func rshMask64Ux32(v uint64, s uint32) uint64 { + // arm64:"LSR",-"AND",-"CSEL" + // ppc64x:"ANDCC",-"ORN" +- // riscv64:"SRL",-"AND\t",-"SLTIU" ++ // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } +@@ -149,7 +154,7 @@ func lshMask64x32Ext(v int64, s int32) int64 { + + func rshMask64Ux32Ext(v uint64, s int32) uint64 { + // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRL",-"AND\t",-"SLTIU" ++ // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> uint(s&63) + } +@@ -206,7 +211,7 @@ func lshGuarded64(v int64, s uint) int64 { + + func rshGuarded64U(v uint64, s uint) uint64 { + if s < 64 { +- // riscv64:"SRL",-"AND",-"SLTIU" ++ // riscv64:"SRL\t",-"AND",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" + // arm64:"LSR",-"CSEL" +-- +2.39.5 + diff --git a/2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch b/2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch new file mode 100644 index 0000000..fd56038 --- /dev/null +++ b/2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch @@ -0,0 +1,267 @@ +From b8f30343204b5a19577d25b00614d08e26a77947 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 015/119] cmd/link/internal/ld: assign temporary addresses to + per-package text + +If trampolines may be required, the current text addressing second +pass resets all assigned addresses, before assigning addresses and +laying down trampolines in a linear fashion. However, this approach +means that intra-package calls are to a symbol that has not yet +been assigned an address, when the symbol is ahead of the current +function. + +In the case of RISC-V the JAL instruction is limited to +/-1MiB. +As such, if a call is to a symbol with no address currently assigned, +we have to assume that a trampoline will be required. During the +relocation phase we can fix up and avoid trampolines in some cases, +however this results in unused trampolines that are still present +in the binary (since removing them would change text addresses). + +In order to significantly reduce the number of unused trampolines, +assign temporary addresses to functions within the same package, +based on the maximum number of trampolines that may be required by +a function. This allows for better decisions to be made regarding +the requirement for intra-package trampolines, as we reset the +addressing for a function, assign its final address and lay down +any resulting trampolines. + +This results in ~2,300 unused trampolines being removed from the +Go binary and ~5,600 unused trampolines being removed from the +compile binary, on linux/riscv64. + +This reapplies CL 349650, however does not pass big to assignAddress +when assigning temporary addresses, as this can result in side +effects such as section splitting. 
+ +Change-Id: Id7febdb65d962d6b1297a91294a8dc27c94d8696 +Reviewed-on: https://go-review.googlesource.com/c/go/+/534760 +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +Reviewed-by: Than McIntosh +--- + src/cmd/link/internal/ld/data.go | 89 +++++++++++++++++++++-------- + src/cmd/link/internal/ld/ld_test.go | 68 ++++++++++++++++++++++ + 2 files changed, 132 insertions(+), 25 deletions(-) + +diff --git a/src/cmd/link/internal/ld/data.go b/src/cmd/link/internal/ld/data.go +index 0550f07d5c..02905f9f42 100644 +--- a/src/cmd/link/internal/ld/data.go ++++ b/src/cmd/link/internal/ld/data.go +@@ -84,14 +84,15 @@ func maxSizeTrampolines(ctxt *Link, ldr *loader.Loader, s loader.Sym, isTramp bo + } + } + +- if ctxt.IsARM() { ++ switch { ++ case ctxt.IsARM(): + return n * 20 // Trampolines in ARM range from 3 to 5 instructions. +- } +- if ctxt.IsPPC64() { +- return n * 16 // Trampolines in PPC64 are 4 instructions. +- } +- if ctxt.IsARM64() { ++ case ctxt.IsARM64(): + return n * 12 // Trampolines in ARM64 are 3 instructions. ++ case ctxt.IsPPC64(): ++ return n * 16 // Trampolines in PPC64 are 4 instructions. ++ case ctxt.IsRISCV64(): ++ return n * 8 // Trampolines in RISCV64 are 2 instructions. + } + panic("unreachable") + } +@@ -118,18 +119,21 @@ func trampoline(ctxt *Link, s loader.Sym) { + continue // something is wrong. skip it here and we'll emit a better error later + } + +- // RISC-V is only able to reach +/-1MiB via a JAL instruction, +- // which we can readily exceed in the same package. As such, we +- // need to generate trampolines when the address is unknown. +- if ldr.SymValue(rs) == 0 && !ctxt.Target.IsRISCV64() && ldr.SymType(rs) != sym.SDYNIMPORT && ldr.SymType(rs) != sym.SUNDEFEXT { ++ if ldr.SymValue(rs) == 0 && ldr.SymType(rs) != sym.SDYNIMPORT && ldr.SymType(rs) != sym.SUNDEFEXT { ++ // Symbols in the same package are laid out together. ++ // Except that if SymPkg(s) == "", it is a host object symbol ++ // which may call an external symbol via PLT. + if ldr.SymPkg(s) != "" && ldr.SymPkg(rs) == ldr.SymPkg(s) { +- // Symbols in the same package are laid out together. +- // Except that if SymPkg(s) == "", it is a host object symbol +- // which may call an external symbol via PLT. +- continue ++ // RISC-V is only able to reach +/-1MiB via a JAL instruction. ++ // We need to generate a trampoline when an address is ++ // currently unknown. ++ if !ctxt.Target.IsRISCV64() { ++ continue ++ } + } ++ // Runtime packages are laid out together. + if isRuntimeDepPkg(ldr.SymPkg(s)) && isRuntimeDepPkg(ldr.SymPkg(rs)) { +- continue // runtime packages are laid out together ++ continue + } + } + thearch.Trampoline(ctxt, ldr, ri, rs, s) +@@ -2419,8 +2423,8 @@ func (ctxt *Link) textaddress() { + limit = 1 + } + +- // First pass: assign addresses assuming the program is small and +- // don't generate trampolines. ++ // First pass: assign addresses assuming the program is small and will ++ // not require trampoline generation. 
+ big := false + for _, s := range ctxt.Textp { + sect, n, va = assignAddress(ctxt, sect, n, s, va, false, big) +@@ -2435,21 +2439,45 @@ func (ctxt *Link) textaddress() { + if big { + // reset addresses + for _, s := range ctxt.Textp { +- if ldr.OuterSym(s) != 0 || s == text { +- continue +- } +- oldv := ldr.SymValue(s) +- for sub := s; sub != 0; sub = ldr.SubSym(sub) { +- ldr.SetSymValue(sub, ldr.SymValue(sub)-oldv) ++ if s != text { ++ resetAddress(ctxt, s) + } + } + va = start + + ntramps := 0 +- for _, s := range ctxt.Textp { ++ var curPkg string ++ for i, s := range ctxt.Textp { ++ // When we find the first symbol in a package, perform a ++ // single iteration that assigns temporary addresses to all ++ // of the text in the same package, using the maximum possible ++ // number of trampolines. This allows for better decisions to ++ // be made regarding reachability and the need for trampolines. ++ if symPkg := ldr.SymPkg(s); symPkg != "" && curPkg != symPkg { ++ curPkg = symPkg ++ vaTmp := va ++ for j := i; j < len(ctxt.Textp); j++ { ++ curSym := ctxt.Textp[j] ++ if symPkg := ldr.SymPkg(curSym); symPkg == "" || curPkg != symPkg { ++ break ++ } ++ // We do not pass big to assignAddress here, as this ++ // can result in side effects such as section splitting. ++ sect, n, vaTmp = assignAddress(ctxt, sect, n, curSym, vaTmp, false, false) ++ vaTmp += maxSizeTrampolines(ctxt, ldr, curSym, false) ++ } ++ } ++ ++ // Reset address for current symbol. ++ if s != text { ++ resetAddress(ctxt, s) ++ } ++ ++ // Assign actual address for current symbol. + sect, n, va = assignAddress(ctxt, sect, n, s, va, false, big) + +- trampoline(ctxt, s) // resolve jumps, may add trampolines if jump too far ++ // Resolve jumps, adding trampolines if they are needed. ++ trampoline(ctxt, s) + + // lay down trampolines after each function + for ; ntramps < len(ctxt.tramps); ntramps++ { +@@ -2597,6 +2625,17 @@ func assignAddress(ctxt *Link, sect *sym.Section, n int, s loader.Sym, va uint64 + return sect, n, va + } + ++func resetAddress(ctxt *Link, s loader.Sym) { ++ ldr := ctxt.loader ++ if ldr.OuterSym(s) != 0 { ++ return ++ } ++ oldv := ldr.SymValue(s) ++ for sub := s; sub != 0; sub = ldr.SubSym(sub) { ++ ldr.SetSymValue(sub, ldr.SymValue(sub)-oldv) ++ } ++} ++ + // Return whether we may need to split text sections. + // + // On PPC64x, when external linking, a text section should not be +diff --git a/src/cmd/link/internal/ld/ld_test.go b/src/cmd/link/internal/ld/ld_test.go +index a7a6082f54..1767667759 100644 +--- a/src/cmd/link/internal/ld/ld_test.go ++++ b/src/cmd/link/internal/ld/ld_test.go +@@ -344,3 +344,71 @@ func main() { + }) + } + } ++ ++func TestRISCVTrampolines(t *testing.T) { ++ testenv.MustHaveGoBuild(t) ++ t.Parallel() ++ ++ tmpDir := t.TempDir() ++ tmpFile := filepath.Join(tmpDir, "x.s") ++ ++ // Calling b from a or c should not use trampolines, however ++ // calling from d to a will require one. 
++ buf := new(bytes.Buffer) ++ fmt.Fprintf(buf, "TEXT a(SB),$0-0\n") ++ for i := 0; i < 1<<17; i++ { ++ fmt.Fprintf(buf, "\tADD $0, X0, X0\n") ++ } ++ fmt.Fprintf(buf, "\tCALL b(SB)\n") ++ fmt.Fprintf(buf, "\tRET\n") ++ fmt.Fprintf(buf, "TEXT b(SB),$0-0\n") ++ fmt.Fprintf(buf, "\tRET\n") ++ fmt.Fprintf(buf, "TEXT c(SB),$0-0\n") ++ fmt.Fprintf(buf, "\tCALL b(SB)\n") ++ fmt.Fprintf(buf, "\tRET\n") ++ fmt.Fprintf(buf, "TEXT ·d(SB),0,$0-0\n") ++ for i := 0; i < 1<<17; i++ { ++ fmt.Fprintf(buf, "\tADD $0, X0, X0\n") ++ } ++ fmt.Fprintf(buf, "\tCALL a(SB)\n") ++ fmt.Fprintf(buf, "\tCALL c(SB)\n") ++ fmt.Fprintf(buf, "\tRET\n") ++ if err := os.WriteFile(tmpFile, buf.Bytes(), 0644); err != nil { ++ t.Fatalf("Failed to write assembly file: %v", err) ++ } ++ ++ if err := os.WriteFile(filepath.Join(tmpDir, "go.mod"), []byte("module riscvtramp"), 0644); err != nil { ++ t.Fatalf("Failed to write file: %v\n", err) ++ } ++ main := `package main ++func main() { ++ d() ++} ++ ++func d() ++` ++ if err := os.WriteFile(filepath.Join(tmpDir, "x.go"), []byte(main), 0644); err != nil { ++ t.Fatalf("failed to write main: %v\n", err) ++ } ++ cmd := testenv.Command(t, testenv.GoToolPath(t), "build", "-ldflags=-linkmode=internal") ++ cmd.Dir = tmpDir ++ cmd.Env = append(os.Environ(), "GOARCH=riscv64", "GOOS=linux") ++ out, err := cmd.CombinedOutput() ++ if err != nil { ++ t.Fatalf("Build failed: %v, output: %s", err, out) ++ } ++ ++ // Check what trampolines exist. ++ cmd = testenv.Command(t, testenv.GoToolPath(t), "tool", "nm", filepath.Join(tmpDir, "riscvtramp")) ++ cmd.Env = append(os.Environ(), "GOARCH=riscv64", "GOOS=linux") ++ out, err = cmd.CombinedOutput() ++ if err != nil { ++ t.Fatalf("nm failure: %s\n%s\n", err, string(out)) ++ } ++ if !bytes.Contains(out, []byte(" T a-tramp0")) { ++ t.Errorf("Trampoline a-tramp0 is missing") ++ } ++ if bytes.Contains(out, []byte(" T b-tramp0")) { ++ t.Errorf("Trampoline b-tramp0 exists unnecessarily") ++ } ++} +-- +2.39.5 + diff --git a/2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch b/2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch new file mode 100644 index 0000000..42867e1 --- /dev/null +++ b/2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch @@ -0,0 +1,540 @@ +From 3018460e65fe8c0a0afe5b0bf09db3c9b0d909c9 Mon Sep 17 00:00:00 2001 +From: Ubuntu +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 016/119] cmd/compile: optimize right shifts of int32 on + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The compiler is currently sign extending 32 bit signed integers to +64 bits before right shifting them using a 64 bit shift instruction. +There's no need to do this as RISC-V has instructions for right +shifting 32 bit signed values (sraw and sraiw) which sign extend +the result of the shift to 64 bits. Change the compiler so that +it uses sraw and sraiw for shifts of signed 32 bit integers reducing +in most cases the number of instructions needed to perform the shift. 
+ +Here are some examples of code sequences that are changed by this +patch: + +int32(a) >> 2 + + before: + + sll x5,x10,0x20 + sra x10,x5,0x22 + + after: + + sraw x10,x10,0x2 + +int32(v) >> int(s) + + before: + + sext.w x5,x10 + sltiu x6,x11,64 + add x6,x6,-1 + or x6,x11,x6 + sra x10,x5,x6 + + after: + + sltiu x5,x11,32 + add x5,x5,-1 + or x5,x11,x5 + sraw x10,x10,x5 + +int32(v) >> (int(s) & 31) + + before: + + sext.w x5,x10 + and x6,x11,63 + sra x10,x5,x6 + +after: + + and x5,x11,31 + sraw x10,x10,x5 + +int32(100) >> int(a) + + before: + + bltz x10, + sltiu x5,x10,64 + add x5,x5,-1 + or x5,x10,x5 + li x6,100 + sra x10,x6,x5 + + after: + + bltz x10, + sltiu x5,x10,32 + add x5,x5,-1 + or x5,x10,x5 + li x6,100 + sraw x10,x6,x5 + +int32(v) >> (int(s) & 63) + + before: + + sext.w x5,x10 + and x6,x11,63 + sra x10,x5,x6 + + after: + + and x5,x11,63 + sltiu x6,x5,32 + add x6,x6,-1 + or x5,x5,x6 + sraw x10,x10,x5 + +In most cases we eliminate one instruction. In the case where +we shift a int32 constant by a variable the number of instructions +generated is identical. A sra is simply replaced by a sraw. In the +unusual case where we shift right by a variable anded with a constant +> 31 but < 64, we generate two additional instructions. As this is +an unusual case we do not try to optimize for it. + +Some improvements can be seen in some of the existing benchmarks, +notably in the utf8 package which performs right shifts of runes +which are signed 32 bit integers. + + | utf8-old | utf8-new | + | sec/op | sec/op vs base | +EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10) +EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10) +AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10) +AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10) +DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10) +DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10) + +Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2 +Reviewed-on: https://go-review.googlesource.com/c/go/+/535115 +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +Reviewed-by: Joel Sing +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +Reviewed-by: David Chase +--- + src/cmd/compile/internal/riscv64/ssa.go | 4 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 26 +++++--- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 2 + + src/cmd/compile/internal/ssa/opGen.go | 30 +++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 65 ++++++++++++++++--- + test/codegen/shift.go | 23 ++++--- + 6 files changed, 121 insertions(+), 29 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 332f5841b7..22338188e5 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -278,7 +278,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = rd + case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XOR, ssa.OpRISCV64OR, ssa.OpRISCV64AND, +- ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, ++ ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRAW, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, + ssa.OpRISCV64SLT, ssa.OpRISCV64SLTU, ssa.OpRISCV64MUL, ssa.OpRISCV64MULW, ssa.OpRISCV64MULH, + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, + ssa.OpRISCV64DIVUW, ssa.OpRISCV64REM, ssa.OpRISCV64REMU, ssa.OpRISCV64REMW, +@@ -356,7 +356,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + 
p.To.Reg = v.Reg() + case ssa.OpRISCV64ADDI, ssa.OpRISCV64ADDIW, ssa.OpRISCV64XORI, ssa.OpRISCV64ORI, ssa.OpRISCV64ANDI, +- ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, ++ ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRAIW, ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, + ssa.OpRISCV64SLTIU: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 4cacabb236..9afe5995ae 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -175,16 +175,19 @@ + (Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt32to64 x) y) + (Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y) + +-// SRA only considers the bottom 6 bits of y. If y > 64, the result should +-// be either 0 or -1 based on the sign bit. ++// SRA only considers the bottom 6 bits of y, similarly SRAW only considers the ++// bottom 5 bits. If y is greater than the maximum value (either 63 or 31 ++// depending on the instruction), the result of the shift should be either 0 ++// or -1 based on the sign bit of x. + // +-// We implement this by performing the max shift (-1) if y >= 64. ++// We implement this by performing the max shift (-1) if y > the maximum value. + // + // We OR (uint64(y < 64) - 1) into y before passing it to SRA. This leaves +-// us with -1 (0xffff...) if y >= 64. ++// us with -1 (0xffff...) if y >= 64. Similarly, we OR (uint64(y < 32) - 1) into y ++// before passing it to SRAW. + // + // We don't need to sign-extend the OR result, as it will be at minimum 8 bits, +-// more than the 6 bits SRA cares about. ++// more than the 5 or 6 bits SRAW and SRA care about. 
+ (Rsh8x8 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) + (Rsh8x16 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) + (Rsh8x32 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +@@ -193,10 +196,10 @@ + (Rsh16x16 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) + (Rsh16x32 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) + (Rsh16x64 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) +-(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) +-(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) +-(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +-(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) ++(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) ++(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) ++(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) ++(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] y)))) + (Rsh64x8 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) + (Rsh64x16 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) + (Rsh64x32 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +@@ -706,9 +709,11 @@ + // But for now, this is enough to get rid of lots of them. + (MOVDnop (MOVDconst [c])) => (MOVDconst [c]) + +-// Avoid unnecessary zero extension when right shifting. ++// Avoid unnecessary zero and sign extension when right shifting. + (SRL (MOVWUreg x) y) => (SRLW x y) + (SRLI [x] (MOVWUreg y)) => (SRLIW [int64(x&31)] y) ++(SRA (MOVWreg x) y) => (SRAW x y) ++(SRAI [x] (MOVWreg y)) => (SRAIW [int64(x&31)] y) + + // Fold constant into immediate instructions where possible. 
+ (ADD (MOVDconst [val]) x) && is32Bit(val) && !t.IsPtr() => (ADDI [val] x) +@@ -719,6 +724,7 @@ + (SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x) + (SRLW x (MOVDconst [val])) => (SRLIW [int64(val&31)] x) + (SRA x (MOVDconst [val])) => (SRAI [int64(val&63)] x) ++(SRAW x (MOVDconst [val])) => (SRAIW [int64(val&31)] x) + (SLT x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTI [val] x) + (SLTU x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTIU [val] x) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 360eff6bcf..93f20f8a99 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -209,10 +209,12 @@ func init() { + // Shift ops + {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63) + {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed ++ {name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >> (aux1 & 31), signed + {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned + {name: "SRLW", argLength: 2, reg: gp21, asm: "SRLW"}, // arg0 >> (aux1 & 31), unsigned + {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63 + {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63 ++ {name: "SRAIW", argLength: 1, reg: gp11, asm: "SRAIW", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-31 + {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63 + {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-31 + +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index dadf1f20c7..62b516ce61 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2364,10 +2364,12 @@ const ( + OpRISCV64MOVDnop + OpRISCV64SLL + OpRISCV64SRA ++ OpRISCV64SRAW + OpRISCV64SRL + OpRISCV64SRLW + OpRISCV64SLLI + OpRISCV64SRAI ++ OpRISCV64SRAIW + OpRISCV64SRLI + OpRISCV64SRLIW + OpRISCV64XOR +@@ -31673,6 +31675,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRAW", ++ argLen: 2, ++ asm: riscv.ASRAW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SRL", + argLen: 2, +@@ -31729,6 +31745,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRAIW", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.ASRAIW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SRLI", + auxType: auxInt64, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 7d16fe887f..572dac249e 100644 +--- 
a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -538,6 +538,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64SRA(v) + case OpRISCV64SRAI: + return rewriteValueRISCV64_OpRISCV64SRAI(v) ++ case OpRISCV64SRAW: ++ return rewriteValueRISCV64_OpRISCV64SRAW(v) + case OpRISCV64SRL: + return rewriteValueRISCV64_OpRISCV64SRL(v) + case OpRISCV64SRLI: +@@ -6258,6 +6260,20 @@ func rewriteValueRISCV64_OpRISCV64SNEZ(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRA(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ // match: (SRA (MOVWreg x) y) ++ // result: (SRAW x y) ++ for { ++ t := v.Type ++ if v_0.Op != OpRISCV64MOVWreg { ++ break ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ v.reset(OpRISCV64SRAW) ++ v.Type = t ++ v.AddArg2(x, y) ++ return true ++ } + // match: (SRA x (MOVDconst [val])) + // result: (SRAI [int64(val&63)] x) + for { +@@ -6275,6 +6291,21 @@ func rewriteValueRISCV64_OpRISCV64SRA(v *Value) bool { + } + func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + v_0 := v.Args[0] ++ // match: (SRAI [x] (MOVWreg y)) ++ // result: (SRAIW [int64(x&31)] y) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVWreg { ++ break ++ } ++ y := v_0.Args[0] ++ v.reset(OpRISCV64SRAIW) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(int64(x & 31)) ++ v.AddArg(y) ++ return true ++ } + // match: (SRAI [x] (MOVDconst [y])) + // result: (MOVDconst [int64(y) >> uint32(x)]) + for { +@@ -6289,6 +6320,24 @@ func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64SRAW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SRAW x (MOVDconst [val])) ++ // result: (SRAIW [int64(val&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64SRAIW) ++ v.AuxInt = int64ToAuxInt(int64(val & 31)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -7172,7 +7221,7 @@ func rewriteValueRISCV64_OpRsh32x16(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x16 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) ++ // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7188,7 +7237,7 @@ func rewriteValueRISCV64_OpRsh32x16(v *Value) bool { + v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) + v2.AuxInt = int64ToAuxInt(-1) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +@@ -7221,7 +7270,7 @@ func rewriteValueRISCV64_OpRsh32x32(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x32 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) ++ // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7237,7 +7286,7 @@ func rewriteValueRISCV64_OpRsh32x32(v *Value) bool { + v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) + v2.AuxInt = int64ToAuxInt(-1) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt32to64, 
typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +@@ -7270,7 +7319,7 @@ func rewriteValueRISCV64_OpRsh32x64(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x64 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) ++ // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] y)))) + for { + t := v.Type + x := v_0 +@@ -7286,7 +7335,7 @@ func rewriteValueRISCV64_OpRsh32x64(v *Value) bool { + v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) + v2.AuxInt = int64ToAuxInt(-1) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v3.AddArg(y) + v2.AddArg(v3) + v1.AddArg2(y, v2) +@@ -7317,7 +7366,7 @@ func rewriteValueRISCV64_OpRsh32x8(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x8 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) ++ // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7333,7 +7382,7 @@ func rewriteValueRISCV64_OpRsh32x8(v *Value) bool { + v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) + v2.AuxInt = int64ToAuxInt(-1) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 302560d5b0..b9d888ca6c 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -24,7 +24,7 @@ func rshConst64Ux64(v uint64) uint64 { + + func rshConst64x64(v int64) int64 { + // ppc64x:"SRAD" +- // riscv64:"SRAI",-"OR",-"SLTIU" ++ // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint64(33) + } + +@@ -42,7 +42,7 @@ func rshConst32Ux64(v uint32) uint32 { + + func rshConst32x64(v int32) int32 { + // ppc64x:"SRAW" +- // riscv64:"SRAI",-"OR",-"SLTIU", -"MOVW" ++ // riscv64:"SRAIW",-"OR",-"SLTIU", -"MOVW" + return v >> uint64(29) + } + +@@ -60,7 +60,7 @@ func rshConst64Ux32(v uint64) uint64 { + + func rshConst64x32(v int64) int64 { + // ppc64x:"SRAD" +- // riscv64:"SRAI",-"OR",-"SLTIU" ++ // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint32(33) + } + +@@ -87,7 +87,7 @@ func rshMask64Ux64(v uint64, s uint64) uint64 { + func rshMask64x64(v int64, s uint64) int64 { + // arm64:"ASR",-"AND",-"CSEL" + // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } +@@ -116,11 +116,16 @@ func rsh5Mask32Ux64(v uint32, s uint64) uint32 { + func rshMask32x64(v int32, s uint64) int32 { + // arm64:"ASR",-"AND" + // ppc64x:"ISEL",-"ORN" +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // riscv64:"SRAW","OR","SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } + ++func rsh5Mask32x64(v int32, s uint64) int32 { ++ // riscv64:"SRAW",-"OR",-"SLTIU" ++ return v >> (s & 31) ++} ++ + func lshMask64x32(v int64, s uint32) int64 { + // arm64:"LSL",-"AND" + // ppc64x:"ANDCC",-"ORN" +@@ -139,8 +144,8 @@ func rshMask64Ux32(v uint64, s uint32) uint64 { + + func rshMask64x32(v int64, s uint32) int64 { + // arm64:"ASR",-"AND",-"CSEL" +- // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // ppc64x:"RLDICL",-"ORN",-"ISEL" ++ // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } +@@ -161,7 +166,7 @@ func rshMask64Ux32Ext(v uint64, s int32) uint64 { + + func 
rshMask64x32Ext(v int64, s int32) int64 { + // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> uint(s&63) + } +@@ -222,7 +227,7 @@ func rshGuarded64U(v uint64, s uint) uint64 { + + func rshGuarded64(v int64, s uint) int64 { + if s < 64 { +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" + // arm64:"ASR",-"CSEL" +-- +2.39.5 + diff --git a/2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch b/2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch new file mode 100644 index 0000000..df82012 --- /dev/null +++ b/2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch @@ -0,0 +1,72 @@ +From 2a7bd73050a6314f4ba9ab19ce6b85764612f5e6 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 017/119] cmd/internal/obj/riscv: support subtraction with a + constant + +Allow SUB and SUBW to be specified with a constant, which are mapped +to ADDI and ADDIW with negated values. + +Change-Id: I7dc55692febc81ea87393b0a3a7d23a43c30313b +Reviewed-on: https://go-review.googlesource.com/c/go/+/538915 +Run-TryBot: Joel Sing +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Heschi Kreinick +Reviewed-by: Mark Ryan +Reviewed-by: Wang Yaduo +Reviewed-by: Mauri de Souza Meneguzzo +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 5 +++++ + src/cmd/internal/obj/riscv/obj.go | 4 ++++ + 2 files changed, 9 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 9899ec9e7b..11a9e30080 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -94,6 +94,10 @@ start: + + SUB X6, X5, X7 // b3836240 + SUB X5, X6 // 33035340 ++ SUB $-2047, X5, X6 // 1383f27f ++ SUB $2048, X5, X6 // 13830280 ++ SUB $-2047, X5 // 9382f27f ++ SUB $2048, X5 // 93820280 + + SRA X6, X5, X7 // b3d36240 + SRA X5, X6 // 33535340 +@@ -157,6 +161,7 @@ start: + ADDW $1, X6 // 1b031300 + SLLW $1, X6 // 1b131300 + SRLW $1, X6 // 1b531300 ++ SUBW $1, X6 // 1b03f3ff + SRAW $1, X6 // 1b531340 + + // 5.3: Load and Store Instructions (RV64I) +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index ab41e53b8c..997c962bdd 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -69,6 +69,8 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + switch p.As { + case AADD: + p.As = AADDI ++ case ASUB: ++ p.As, p.From.Offset = AADDI, -p.From.Offset + case ASLT: + p.As = ASLTI + case ASLTU: +@@ -87,6 +89,8 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.As = ASRAI + case AADDW: + p.As = AADDIW ++ case ASUBW: ++ p.As, p.From.Offset = AADDIW, -p.From.Offset + case ASLLW: + p.As = ASLLIW + case ASRLW: +-- +2.39.5 + diff --git a/2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch b/2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch new file mode 100644 index 0000000..03e335e --- /dev/null +++ b/2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch @@ -0,0 +1,119 @@ +From 2ca7a420f327934e43c481827c30348ff2dc2340 Mon Sep 17 00:00:00 2001 +From: Wang Yaduo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 018/119] cmd/internal/obj/riscv: fix the offset of JALR + transformed from JAL +MIME-Version: 1.0 
+Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Currently, the offset of JALR is zero all the time, which is transformed +from JAL with over ±1MB offset. This causes the segment fault for the +wrong address. + +Change-Id: I4dcb3eb13bd1ea71e9eb27f07c03ffec376608ab +Reviewed-on: https://go-review.googlesource.com/c/go/+/538135 +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Heschi Kreinick +Reviewed-by: M Zhuo +Reviewed-by: Joel Sing +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/asm_test.go | 66 ++++++++++++++++++++++++++ + src/cmd/internal/obj/riscv/obj.go | 2 +- + 2 files changed, 67 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/internal/obj/riscv/asm_test.go b/src/cmd/internal/obj/riscv/asm_test.go +index c22428cdc5..afe0525532 100644 +--- a/src/cmd/internal/obj/riscv/asm_test.go ++++ b/src/cmd/internal/obj/riscv/asm_test.go +@@ -126,6 +126,72 @@ func genLargeCall(buf *bytes.Buffer) { + fmt.Fprintln(buf, "RET") + } + ++// TestLargeJump generates a large jump (>1MB of text) with a JMP to the ++// end of the function, in order to ensure that it assembles correctly. ++func TestLargeJump(t *testing.T) { ++ if testing.Short() { ++ t.Skip("Skipping test in short mode") ++ } ++ if runtime.GOARCH != "riscv64" { ++ t.Skip("Require riscv64 to run") ++ } ++ testenv.MustHaveGoBuild(t) ++ ++ dir := t.TempDir() ++ ++ if err := os.WriteFile(filepath.Join(dir, "go.mod"), []byte("module largejump"), 0644); err != nil { ++ t.Fatalf("Failed to write file: %v\n", err) ++ } ++ main := `package main ++ ++import "fmt" ++ ++func main() { ++ fmt.Print(x()) ++} ++ ++func x() uint64 ++` ++ if err := os.WriteFile(filepath.Join(dir, "x.go"), []byte(main), 0644); err != nil { ++ t.Fatalf("failed to write main: %v\n", err) ++ } ++ ++ // Generate a very large jump instruction. ++ buf := bytes.NewBuffer(make([]byte, 0, 7000000)) ++ genLargeJump(buf) ++ ++ if err := os.WriteFile(filepath.Join(dir, "x.s"), buf.Bytes(), 0644); err != nil { ++ t.Fatalf("Failed to write file: %v\n", err) ++ } ++ ++ // Build generated files. ++ cmd := testenv.Command(t, testenv.GoToolPath(t), "build", "-o", "x.exe") ++ cmd.Dir = dir ++ out, err := cmd.CombinedOutput() ++ if err != nil { ++ t.Errorf("Build failed: %v, output: %s", err, out) ++ } ++ ++ cmd = testenv.Command(t, filepath.Join(dir, "x.exe")) ++ out, err = cmd.CombinedOutput() ++ if string(out) != "1" { ++ t.Errorf(`Got test output %q, want "1"`, string(out)) ++ } ++} ++ ++func genLargeJump(buf *bytes.Buffer) { ++ fmt.Fprintln(buf, "TEXT ·x(SB),0,$0-8") ++ fmt.Fprintln(buf, "MOV X0, X10") ++ fmt.Fprintln(buf, "JMP end") ++ for i := 0; i < 1<<18; i++ { ++ fmt.Fprintln(buf, "ADD $1, X10, X10") ++ } ++ fmt.Fprintln(buf, "end:") ++ fmt.Fprintln(buf, "ADD $1, X10, X10") ++ fmt.Fprintln(buf, "MOV X10, r+0(FP)") ++ fmt.Fprintln(buf, "RET") ++} ++ + // Issue 20348. 
+ func TestNoRet(t *testing.T) { + dir, err := os.MkdirTemp("", "testnoret") +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 997c962bdd..3ab1ae94b9 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -731,7 +731,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + ctxt.Diag("%v: jump displacement %d too large", p, p.To.Target().Pc-p.Pc) + } + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: high, Sym: cursym} +- p.Link.From.Offset = low ++ p.Link.To.Offset = low + } + } + } +-- +2.39.5 + diff --git a/2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch b/2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch new file mode 100644 index 0000000..06ad9c9 --- /dev/null +++ b/2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch @@ -0,0 +1,376 @@ +From 74d9867e13eaa1dea10c5eddedc88bedb4fbf865 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 019/119] cmd/internal/obj/riscv: improve handling of invalid + assembly + +Currently, instruction validation failure will result in a panic during +encoding. Furthermore, the errors generated do not include the PC or +file/line information that is normally present. + +Fix this by: + +- Tracking and printing the *obj.Prog associated with the instruction, + including the assembly instruction/opcode if it differs. This provides + the standard PC and file/line prefix, which is also expected by assembly + error end-to-end tests. + +- Not proceeding with assembly if errors exist - with the current design, + errors are identified during validation, which is run via preprocess. + Attempts to encode invalid instructions will intentionally panic. + +Add some additional riscv64 encoding errors, now that we can actually do so. 
+ +Change-Id: I64a7b83680c4d12aebdc96c67f9df625b5ef90d3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/523459 +Run-TryBot: Joel Sing +Reviewed-by: Mark Ryan +Reviewed-by: Heschi Kreinick +TryBot-Result: Gopher Robot +Run-TryBot: M Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +--- + .../asm/internal/asm/testdata/riscv64error.s | 5 +- + src/cmd/internal/obj/riscv/obj.go | 201 ++++++++++-------- + 2 files changed, 116 insertions(+), 90 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index cdb8a028bd..2dc9db3fb1 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -38,5 +38,8 @@ TEXT errors(SB),$0 + SLLIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" + SRLIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" + SRAIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" +- ++ SD X5, 4294967296(X6) // ERROR "constant 4294967296 too large" ++ SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" ++ SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" ++ FNES F1, (X5) // ERROR "needs an integer register output" + RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 3ab1ae94b9..195cd26413 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1042,154 +1042,154 @@ func immI(as obj.As, imm int64, nbits uint) uint32 { + return uint32(imm) + } + +-func wantImmI(ctxt *obj.Link, as obj.As, imm int64, nbits uint) { ++func wantImmI(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) { + if err := immIFits(imm, nbits); err != nil { +- ctxt.Diag("%v: %v", as, err) ++ ctxt.Diag("%v: %v", ins, err) + } + } + +-func wantReg(ctxt *obj.Link, as obj.As, pos string, descr string, r, min, max uint32) { ++func wantReg(ctxt *obj.Link, ins *instruction, pos string, descr string, r, min, max uint32) { + if r < min || r > max { + var suffix string + if r != obj.REG_NONE { + suffix = fmt.Sprintf(" but got non-%s register %s", descr, RegName(int(r))) + } +- ctxt.Diag("%v: expected %s register in %s position%s", as, descr, pos, suffix) ++ ctxt.Diag("%v: expected %s register in %s position%s", ins, descr, pos, suffix) + } + } + +-func wantNoneReg(ctxt *obj.Link, as obj.As, pos string, r uint32) { ++func wantNoneReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) { + if r != obj.REG_NONE { +- ctxt.Diag("%v: expected no register in %s but got register %s", as, pos, RegName(int(r))) ++ ctxt.Diag("%v: expected no register in %s but got register %s", ins, pos, RegName(int(r))) + } + } + + // wantIntReg checks that r is an integer register. +-func wantIntReg(ctxt *obj.Link, as obj.As, pos string, r uint32) { +- wantReg(ctxt, as, pos, "integer", r, REG_X0, REG_X31) ++func wantIntReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) { ++ wantReg(ctxt, ins, pos, "integer", r, REG_X0, REG_X31) + } + + // wantFloatReg checks that r is a floating-point register. +-func wantFloatReg(ctxt *obj.Link, as obj.As, pos string, r uint32) { +- wantReg(ctxt, as, pos, "float", r, REG_F0, REG_F31) ++func wantFloatReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) { ++ wantReg(ctxt, ins, pos, "float", r, REG_F0, REG_F31) + } + + // wantEvenOffset checks that the offset is a multiple of two. 
+-func wantEvenOffset(ctxt *obj.Link, as obj.As, offset int64) { ++func wantEvenOffset(ctxt *obj.Link, ins *instruction, offset int64) { + if err := immEven(offset); err != nil { +- ctxt.Diag("%v: %v", as, err) ++ ctxt.Diag("%v: %v", ins, err) + } + } + + func validateRIII(ctxt *obj.Link, ins *instruction) { +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantIntReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFFF(ctxt *obj.Link, ins *instruction) { +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantFloatReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantFloatReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFFFF(ctxt *obj.Link, ins *instruction) { +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantFloatReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantFloatReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantFloatReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantFloatReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFFI(ctxt *obj.Link, ins *instruction) { +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantFloatReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantFloatReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFI(ctxt *obj.Link, ins *instruction) { +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRIF(ctxt *obj.Link, ins *instruction) { +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantIntReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFF(ctxt *obj.Link, ins *instruction) { +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateII(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 12) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 12) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ 
wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateIF(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 12) +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 12) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateSI(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 12) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 12) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateSF(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 12) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantFloatReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 12) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantFloatReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateB(ctxt *obj.Link, ins *instruction) { + // Offsets are multiples of two, so accept 13 bit immediates for the + // 12 bit slot. We implicitly drop the least significant bit in encodeB. +- wantEvenOffset(ctxt, ins.as, ins.imm) +- wantImmI(ctxt, ins.as, ins.imm, 13) +- wantNoneReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantIntReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantEvenOffset(ctxt, ins, ins.imm) ++ wantImmI(ctxt, ins, ins.imm, 13) ++ wantNoneReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateU(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 20) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 20) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateJ(ctxt *obj.Link, ins *instruction) { + // Offsets are multiples of two, so accept 21 bit immediates for the + // 20 bit slot. We implicitly drop the least significant bit in encodeJ. 
+- wantEvenOffset(ctxt, ins.as, ins.imm) +- wantImmI(ctxt, ins.as, ins.imm, 21) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantEvenOffset(ctxt, ins, ins.imm) ++ wantImmI(ctxt, ins, ins.imm, 21) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRaw(ctxt *obj.Link, ins *instruction) { +@@ -1726,14 +1726,26 @@ func encodingForAs(as obj.As) (encoding, error) { + } + + type instruction struct { +- as obj.As // Assembler opcode +- rd uint32 // Destination register +- rs1 uint32 // Source register 1 +- rs2 uint32 // Source register 2 +- rs3 uint32 // Source register 3 +- imm int64 // Immediate +- funct3 uint32 // Function 3 +- funct7 uint32 // Function 7 (or Function 2) ++ p *obj.Prog // Prog that instruction is for ++ as obj.As // Assembler opcode ++ rd uint32 // Destination register ++ rs1 uint32 // Source register 1 ++ rs2 uint32 // Source register 2 ++ rs3 uint32 // Source register 3 ++ imm int64 // Immediate ++ funct3 uint32 // Function 3 ++ funct7 uint32 // Function 7 (or Function 2) ++} ++ ++func (ins *instruction) String() string { ++ if ins.p == nil { ++ return ins.as.String() ++ } ++ var suffix string ++ if ins.p.As != ins.as { ++ suffix = fmt.Sprintf(" (%v)", ins.as) ++ } ++ return fmt.Sprintf("%v%v", ins.p, suffix) + } + + func (ins *instruction) encode() (uint32, error) { +@@ -2199,13 +2211,13 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.imm = p.To.Offset + + case AMOV, AMOVB, AMOVH, AMOVW, AMOVBU, AMOVHU, AMOVWU, AMOVF, AMOVD: +- return instructionsForMOV(p) ++ inss = instructionsForMOV(p) + + case ALW, ALWU, ALH, ALHU, ALB, ALBU, ALD, AFLW, AFLD: +- return instructionsForLoad(p, ins.as, p.From.Reg) ++ inss = instructionsForLoad(p, ins.as, p.From.Reg) + + case ASW, ASH, ASB, ASD, AFSW, AFSD: +- return instructionsForStore(p, ins.as, p.To.Reg) ++ inss = instructionsForStore(p, ins.as, p.To.Reg) + + case ALRW, ALRD: + // Set aq to use acquire access ordering +@@ -2245,7 +2257,7 @@ func instructionsForProg(p *obj.Prog) []*instruction { + case AFNES, AFNED: + // Replace FNE[SD] with FEQ[SD] and NOT. + if p.To.Type != obj.TYPE_REG { +- p.Ctxt.Diag("%v needs an integer register output", ins.as) ++ p.Ctxt.Diag("%v needs an integer register output", p) + return nil + } + if ins.as == AFNES { +@@ -2334,6 +2346,11 @@ func instructionsForProg(p *obj.Prog) []*instruction { + p.Ctxt.Diag("%v: shift amount out of range 0 to 31", p) + } + } ++ ++ for _, ins := range inss { ++ ins.p = p ++ } ++ + return inss + } + +@@ -2345,6 +2362,12 @@ func assemble(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + ctxt.Retpoline = false // don't keep printing + } + ++ // If errors were encountered during preprocess/validation, proceeding ++ // and attempting to encode said instructions will only lead to panics. 
++ if ctxt.Errors > 0 { ++ return ++ } ++ + for p := cursym.Func().Text; p != nil; p = p.Link { + switch p.As { + case AJAL: +-- +2.39.5 + diff --git a/2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch b/2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch new file mode 100644 index 0000000..091f569 --- /dev/null +++ b/2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch @@ -0,0 +1,555 @@ +From 29b9498db7d1bf02c85eb98cba4e2bb63237ba05 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 020/119] all: clean up addition of constants in riscv64 + assembly + +Use ADD with constants, instead of ADDI. Also use SUB with a positive constant +rather than ADD with a negative constant. The resulting assembly is still the +same. + +Change-Id: Ife10bf5ae4122e525f0e7d41b5e463e748236a9c +Reviewed-on: https://go-review.googlesource.com/c/go/+/540136 +TryBot-Result: Gopher Robot +Reviewed-by: M Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Reviewed-by: Heschi Kreinick +Run-TryBot: Joel Sing +--- + src/crypto/internal/bigmod/nat_riscv64.s | 6 +-- + src/internal/bytealg/compare_riscv64.s | 12 ++--- + src/internal/bytealg/equal_riscv64.s | 10 ++-- + src/internal/bytealg/indexbyte_riscv64.s | 4 +- + src/runtime/asm_riscv64.s | 10 ++-- + src/runtime/memclr_riscv64.s | 14 +++--- + src/runtime/memmove_riscv64.s | 64 ++++++++++++------------ + src/runtime/mkpreempt.go | 2 +- + src/runtime/preempt_riscv64.s | 2 +- + src/runtime/sys_linux_riscv64.s | 4 +- + 10 files changed, 64 insertions(+), 64 deletions(-) + +diff --git a/src/crypto/internal/bigmod/nat_riscv64.s b/src/crypto/internal/bigmod/nat_riscv64.s +index 1d8c8c8900..c1d9cc0dd4 100644 +--- a/src/crypto/internal/bigmod/nat_riscv64.s ++++ b/src/crypto/internal/bigmod/nat_riscv64.s +@@ -80,10 +80,10 @@ loop: + MOV X16, 2*8(X5) // z[2] + MOV X19, 3*8(X5) // z[3] + +- ADDI $32, X5 +- ADDI $32, X7 ++ ADD $32, X5 ++ ADD $32, X7 + +- ADDI $-4, X30 ++ SUB $4, X30 + BNEZ X30, loop + + done: +diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s +index a4164a2b81..b1e1f7bcc7 100644 +--- a/src/internal/bytealg/compare_riscv64.s ++++ b/src/internal/bytealg/compare_riscv64.s +@@ -53,7 +53,7 @@ use_a_len: + ADD $8, X7, X7 + SUB X7, X5, X5 + align: +- ADD $-1, X7 ++ SUB $1, X7 + MOVBU 0(X10), X8 + MOVBU 0(X12), X9 + BNE X8, X9, cmp +@@ -79,7 +79,7 @@ compare32: + BNE X17, X18, cmp8b + ADD $32, X10 + ADD $32, X12 +- ADD $-32, X5 ++ SUB $32, X5 + BGE X5, X6, compare32 + BEQZ X5, cmp_len + +@@ -95,7 +95,7 @@ compare16: + BNE X17, X18, cmp8b + ADD $16, X10 + ADD $16, X12 +- ADD $-16, X5 ++ SUB $16, X5 + BEQZ X5, cmp_len + + check8_unaligned: +@@ -128,7 +128,7 @@ compare8_unaligned: + BNE X29, X30, cmp1h + ADD $8, X10 + ADD $8, X12 +- ADD $-8, X5 ++ SUB $8, X5 + BGE X5, X6, compare8_unaligned + BEQZ X5, cmp_len + +@@ -150,7 +150,7 @@ compare4_unaligned: + BNE X19, X20, cmp1d + ADD $4, X10 + ADD $4, X12 +- ADD $-4, X5 ++ SUB $4, X5 + BGE X5, X6, compare4_unaligned + + compare1: +@@ -160,7 +160,7 @@ compare1: + BNE X8, X9, cmp + ADD $1, X10 + ADD $1, X12 +- ADD $-1, X5 ++ SUB $1, X5 + JMP compare1 + + // Compare 8 bytes of memory in X15/X16 that are known to differ. 
+diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s +index 503aac5751..7f470ce0a0 100644 +--- a/src/internal/bytealg/equal_riscv64.s ++++ b/src/internal/bytealg/equal_riscv64.s +@@ -41,7 +41,7 @@ TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0 + ADD $8, X9, X9 + SUB X9, X12, X12 + align: +- ADD $-1, X9 ++ SUB $1, X9 + MOVBU 0(X10), X19 + MOVBU 0(X11), X20 + BNE X19, X20, not_eq +@@ -67,7 +67,7 @@ loop32: + BNE X16, X17, not_eq + ADD $32, X10 + ADD $32, X11 +- ADD $-32, X12 ++ SUB $32, X12 + BGE X12, X9, loop32 + BEQZ X12, eq + +@@ -83,7 +83,7 @@ loop16: + BNE X21, X22, not_eq + ADD $16, X10 + ADD $16, X11 +- ADD $-16, X12 ++ SUB $16, X12 + BGE X12, X23, loop16 + BEQZ X12, eq + +@@ -105,7 +105,7 @@ loop4: + BNE X16, X17, not_eq + ADD $4, X10 + ADD $4, X11 +- ADD $-4, X12 ++ SUB $4, X12 + BGE X12, X23, loop4 + + loop1: +@@ -115,7 +115,7 @@ loop1: + BNE X19, X20, not_eq + ADD $1, X10 + ADD $1, X11 +- ADD $-1, X12 ++ SUB $1, X12 + JMP loop1 + + not_eq: +diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s +index 8be78ed950..de00983c7b 100644 +--- a/src/internal/bytealg/indexbyte_riscv64.s ++++ b/src/internal/bytealg/indexbyte_riscv64.s +@@ -13,7 +13,7 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + AND $0xff, X13 + MOV X10, X12 // store base for later + ADD X10, X11 // end +- ADD $-1, X10 ++ SUB $1, X10 + + loop: + ADD $1, X10 +@@ -35,7 +35,7 @@ TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + AND $0xff, X12 + MOV X10, X13 // store base for later + ADD X10, X11 // end +- ADD $-1, X10 ++ SUB $1, X10 + + loop: + ADD $1, X10 +diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s +index eb53cbbf47..bb0d161ad4 100644 +--- a/src/runtime/asm_riscv64.s ++++ b/src/runtime/asm_riscv64.s +@@ -9,7 +9,7 @@ + // func rt0_go() + TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0 + // X2 = stack; A0 = argc; A1 = argv +- ADD $-24, X2 ++ SUB $24, X2 + MOV A0, 8(X2) // argc + MOV A1, 16(X2) // argv + +@@ -57,7 +57,7 @@ nocgo: + + // create a new goroutine to start program + MOV $runtime·mainPC(SB), T0 // entry +- ADD $-16, X2 ++ SUB $16, X2 + MOV T0, 8(X2) + MOV ZERO, 0(X2) + CALL runtime·newproc(SB) +@@ -200,7 +200,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 + MOV (g_sched+gobuf_sp)(g), X2 + // Create a stack frame on g0 to call newstack. + MOV ZERO, -8(X2) // Zero saved LR in frame +- ADD $-8, X2 ++ SUB $8, X2 + CALL runtime·newstack(SB) + + // Not reached, but make sure the return PC from the call to newstack +@@ -285,7 +285,7 @@ TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8 + MOV 0(CTXT), T1 // code pointer + MOV (g_sched+gobuf_sp)(g), X2 // sp = m->g0->sched.sp + // we don't need special macro for regabi since arg0(X10) = g +- ADD $-16, X2 ++ SUB $16, X2 + MOV X10, 8(X2) // setup g + MOV ZERO, 0(X2) // clear return address + JALR RA, T1 +@@ -338,7 +338,7 @@ TEXT ·asmcgocall(SB),NOSPLIT,$0-20 + // Now on a scheduling stack (a pthread-created stack). + g0: + // Save room for two of our pointers. 
+- ADD $-16, X2 ++ SUB $16, X2 + MOV X9, 0(X2) // save old g on stack + MOV (g_stack+stack_hi)(X9), X9 + SUB X8, X9, X8 +diff --git a/src/runtime/memclr_riscv64.s b/src/runtime/memclr_riscv64.s +index 1c1e6ab54d..16c511c603 100644 +--- a/src/runtime/memclr_riscv64.s ++++ b/src/runtime/memclr_riscv64.s +@@ -23,7 +23,7 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16 + SUB X5, X9, X5 + SUB X5, X11, X11 + align: +- ADD $-1, X5 ++ SUB $1, X5 + MOVB ZERO, 0(X10) + ADD $1, X10 + BNEZ X5, align +@@ -47,7 +47,7 @@ loop64: + MOV ZERO, 48(X10) + MOV ZERO, 56(X10) + ADD $64, X10 +- ADD $-64, X11 ++ SUB $64, X11 + BGE X11, X9, loop64 + BEQZ X11, done + +@@ -60,7 +60,7 @@ zero32: + MOV ZERO, 16(X10) + MOV ZERO, 24(X10) + ADD $32, X10 +- ADD $-32, X11 ++ SUB $32, X11 + BEQZ X11, done + + check16: +@@ -70,7 +70,7 @@ zero16: + MOV ZERO, 0(X10) + MOV ZERO, 8(X10) + ADD $16, X10 +- ADD $-16, X11 ++ SUB $16, X11 + BEQZ X11, done + + check8: +@@ -79,7 +79,7 @@ check8: + zero8: + MOV ZERO, 0(X10) + ADD $8, X10 +- ADD $-8, X11 ++ SUB $8, X11 + BEQZ X11, done + + check4: +@@ -91,13 +91,13 @@ zero4: + MOVB ZERO, 2(X10) + MOVB ZERO, 3(X10) + ADD $4, X10 +- ADD $-4, X11 ++ SUB $4, X11 + + loop1: + BEQZ X11, done + MOVB ZERO, 0(X10) + ADD $1, X10 +- ADD $-1, X11 ++ SUB $1, X11 + JMP loop1 + + done: +diff --git a/src/runtime/memmove_riscv64.s b/src/runtime/memmove_riscv64.s +index f5db86562b..e099a64100 100644 +--- a/src/runtime/memmove_riscv64.s ++++ b/src/runtime/memmove_riscv64.s +@@ -32,7 +32,7 @@ TEXT runtime·memmove(SB),NOSPLIT,$-0-24 + SUB X5, X9, X5 + SUB X5, X12, X12 + f_align: +- ADD $-1, X5 ++ SUB $1, X5 + MOVB 0(X11), X14 + MOVB X14, 0(X10) + ADD $1, X10 +@@ -65,7 +65,7 @@ f_loop64: + MOV X21, 56(X10) + ADD $64, X10 + ADD $64, X11 +- ADD $-64, X12 ++ SUB $64, X12 + BGE X12, X9, f_loop64 + BEQZ X12, done + +@@ -83,7 +83,7 @@ f_loop32: + MOV X17, 24(X10) + ADD $32, X10 + ADD $32, X11 +- ADD $-32, X12 ++ SUB $32, X12 + BGE X12, X9, f_loop32 + BEQZ X12, done + +@@ -97,7 +97,7 @@ f_loop16: + MOV X15, 8(X10) + ADD $16, X10 + ADD $16, X11 +- ADD $-16, X12 ++ SUB $16, X12 + BGE X12, X9, f_loop16 + BEQZ X12, done + +@@ -109,7 +109,7 @@ f_loop8: + MOV X14, 0(X10) + ADD $8, X10 + ADD $8, X11 +- ADD $-8, X12 ++ SUB $8, X12 + BGE X12, X9, f_loop8 + BEQZ X12, done + JMP f_loop4_check +@@ -136,7 +136,7 @@ f_loop8_unaligned: + MOVB X21, 7(X10) + ADD $8, X10 + ADD $8, X11 +- ADD $-8, X12 ++ SUB $8, X12 + BGE X12, X9, f_loop8_unaligned + + f_loop4_check: +@@ -153,7 +153,7 @@ f_loop4: + MOVB X17, 3(X10) + ADD $4, X10 + ADD $4, X11 +- ADD $-4, X12 ++ SUB $4, X12 + BGE X12, X9, f_loop4 + + f_loop1: +@@ -162,7 +162,7 @@ f_loop1: + MOVB X14, 0(X10) + ADD $1, X10 + ADD $1, X11 +- ADD $-1, X12 ++ SUB $1, X12 + JMP f_loop1 + + backward: +@@ -182,9 +182,9 @@ backward: + // Move one byte at a time until we reach 8 byte alignment. 
+ SUB X5, X12, X12 + b_align: +- ADD $-1, X5 +- ADD $-1, X10 +- ADD $-1, X11 ++ SUB $1, X5 ++ SUB $1, X10 ++ SUB $1, X11 + MOVB 0(X11), X14 + MOVB X14, 0(X10) + BNEZ X5, b_align +@@ -197,8 +197,8 @@ b_loop_check: + MOV $64, X9 + BLT X12, X9, b_loop32_check + b_loop64: +- ADD $-64, X10 +- ADD $-64, X11 ++ SUB $64, X10 ++ SUB $64, X11 + MOV 0(X11), X14 + MOV 8(X11), X15 + MOV 16(X11), X16 +@@ -215,7 +215,7 @@ b_loop64: + MOV X19, 40(X10) + MOV X20, 48(X10) + MOV X21, 56(X10) +- ADD $-64, X12 ++ SUB $64, X12 + BGE X12, X9, b_loop64 + BEQZ X12, done + +@@ -223,8 +223,8 @@ b_loop32_check: + MOV $32, X9 + BLT X12, X9, b_loop16_check + b_loop32: +- ADD $-32, X10 +- ADD $-32, X11 ++ SUB $32, X10 ++ SUB $32, X11 + MOV 0(X11), X14 + MOV 8(X11), X15 + MOV 16(X11), X16 +@@ -233,7 +233,7 @@ b_loop32: + MOV X15, 8(X10) + MOV X16, 16(X10) + MOV X17, 24(X10) +- ADD $-32, X12 ++ SUB $32, X12 + BGE X12, X9, b_loop32 + BEQZ X12, done + +@@ -241,13 +241,13 @@ b_loop16_check: + MOV $16, X9 + BLT X12, X9, b_loop8_check + b_loop16: +- ADD $-16, X10 +- ADD $-16, X11 ++ SUB $16, X10 ++ SUB $16, X11 + MOV 0(X11), X14 + MOV 8(X11), X15 + MOV X14, 0(X10) + MOV X15, 8(X10) +- ADD $-16, X12 ++ SUB $16, X12 + BGE X12, X9, b_loop16 + BEQZ X12, done + +@@ -255,11 +255,11 @@ b_loop8_check: + MOV $8, X9 + BLT X12, X9, b_loop4_check + b_loop8: +- ADD $-8, X10 +- ADD $-8, X11 ++ SUB $8, X10 ++ SUB $8, X11 + MOV 0(X11), X14 + MOV X14, 0(X10) +- ADD $-8, X12 ++ SUB $8, X12 + BGE X12, X9, b_loop8 + BEQZ X12, done + JMP b_loop4_check +@@ -268,8 +268,8 @@ b_loop8_unaligned_check: + MOV $8, X9 + BLT X12, X9, b_loop4_check + b_loop8_unaligned: +- ADD $-8, X10 +- ADD $-8, X11 ++ SUB $8, X10 ++ SUB $8, X11 + MOVB 0(X11), X14 + MOVB 1(X11), X15 + MOVB 2(X11), X16 +@@ -286,15 +286,15 @@ b_loop8_unaligned: + MOVB X19, 5(X10) + MOVB X20, 6(X10) + MOVB X21, 7(X10) +- ADD $-8, X12 ++ SUB $8, X12 + BGE X12, X9, b_loop8_unaligned + + b_loop4_check: + MOV $4, X9 + BLT X12, X9, b_loop1 + b_loop4: +- ADD $-4, X10 +- ADD $-4, X11 ++ SUB $4, X10 ++ SUB $4, X11 + MOVB 0(X11), X14 + MOVB 1(X11), X15 + MOVB 2(X11), X16 +@@ -303,16 +303,16 @@ b_loop4: + MOVB X15, 1(X10) + MOVB X16, 2(X10) + MOVB X17, 3(X10) +- ADD $-4, X12 ++ SUB $4, X12 + BGE X12, X9, b_loop4 + + b_loop1: + BEQZ X12, done +- ADD $-1, X10 +- ADD $-1, X11 ++ SUB $1, X10 ++ SUB $1, X11 + MOVB 0(X11), X14 + MOVB X14, 0(X10) +- ADD $-1, X12 ++ SUB $1, X12 + JMP b_loop1 + + done: +diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go +index 0bfbd379e0..a96ae59c15 100644 +--- a/src/runtime/mkpreempt.go ++++ b/src/runtime/mkpreempt.go +@@ -576,7 +576,7 @@ func genRISCV64() { + } + + p("MOV X1, -%d(X2)", l.stack) +- p("ADD $-%d, X2", l.stack) ++ p("SUB $%d, X2", l.stack) + l.save() + p("CALL ·asyncPreempt2(SB)") + l.restore() +diff --git a/src/runtime/preempt_riscv64.s b/src/runtime/preempt_riscv64.s +index 56df6c30e0..bbb6447dc5 100644 +--- a/src/runtime/preempt_riscv64.s ++++ b/src/runtime/preempt_riscv64.s +@@ -5,7 +5,7 @@ + + TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 + MOV X1, -464(X2) +- ADD $-464, X2 ++ SUB $464, X2 + MOV X5, 8(X2) + MOV X6, 16(X2) + MOV X7, 24(X2) +diff --git a/src/runtime/sys_linux_riscv64.s b/src/runtime/sys_linux_riscv64.s +index d1558fd6f7..ffec2b5b75 100644 +--- a/src/runtime/sys_linux_riscv64.s ++++ b/src/runtime/sys_linux_riscv64.s +@@ -256,7 +256,7 @@ TEXT runtime·walltime(SB),NOSPLIT,$40-12 + MOV (g_sched+gobuf_sp)(T1), X2 + + noswitch: +- ADDI $-24, X2 // Space for result ++ SUB $24, X2 // Space for result + ANDI $~7, X2 // Align for C code + MOV 
$8(X2), A1 + +@@ -328,7 +328,7 @@ TEXT runtime·nanotime1(SB),NOSPLIT,$40-8 + MOV (g_sched+gobuf_sp)(T1), X2 + + noswitch: +- ADDI $-24, X2 // Space for result ++ SUB $24, X2 // Space for result + ANDI $~7, X2 // Align for C code + MOV $8(X2), A1 + +-- +2.39.5 + diff --git a/2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch b/2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch new file mode 100644 index 0000000..c262b58 --- /dev/null +++ b/2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch @@ -0,0 +1,152 @@ +From 035407e3926edc25a042c9dd90d489ca2d4f1cfe Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 021/119] cmd/internal/obj/riscv: add support of PCALIGN + directive + +Add support for PCALIGN directive on riscv. +This directive can be used within Go asm to align instruction +by padding NOP directives. + +This patch also adds a test to verify the correctness of the PCALIGN +directive. + +Original credit by Cooper Qu (Alibaba) +https://gitee.com/xuantie_riscv/xuantie-patch + +Change-Id: I8b6524a2bf81a1baf7c9d04b7da2db6c1a7b428f +Reviewed-on: https://go-review.googlesource.com/c/go/+/541740 +Run-TryBot: M Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Wang Yaduo +Reviewed-by: David Chase +Reviewed-by: Mark Ryan +TryBot-Result: Gopher Robot +--- + src/cmd/internal/obj/riscv/asm_test.go | 32 ++++++++++++++++++++++++++ + src/cmd/internal/obj/riscv/obj.go | 32 ++++++++++++++++++++++++++ + 2 files changed, 64 insertions(+) + +diff --git a/src/cmd/internal/obj/riscv/asm_test.go b/src/cmd/internal/obj/riscv/asm_test.go +index afe0525532..96ea230841 100644 +--- a/src/cmd/internal/obj/riscv/asm_test.go ++++ b/src/cmd/internal/obj/riscv/asm_test.go +@@ -9,8 +9,10 @@ import ( + "fmt" + "internal/testenv" + "os" ++ "os/exec" + "path/filepath" + "runtime" ++ "strings" + "testing" + ) + +@@ -277,3 +279,33 @@ func TestBranch(t *testing.T) { + t.Errorf("Branch test failed: %v\n%s", err, out) + } + } ++ ++func TestPCAlign(t *testing.T) { ++ dir := t.TempDir() ++ tmpfile := filepath.Join(dir, "x.s") ++ asm := ` ++TEXT _stub(SB),$0-0 ++ FENCE ++ PCALIGN $8 ++ FENCE ++ RET ++` ++ if err := os.WriteFile(tmpfile, []byte(asm), 0644); err != nil { ++ t.Fatal(err) ++ } ++ cmd := exec.Command(testenv.GoToolPath(t), "tool", "asm", "-o", filepath.Join(dir, "x.o"), "-S", tmpfile) ++ cmd.Env = append(os.Environ(), "GOARCH=riscv64", "GOOS=linux") ++ out, err := cmd.CombinedOutput() ++ if err != nil { ++ t.Errorf("Failed to assemble: %v\n%s", err, out) ++ } ++ // The expected instruction sequence after alignment: ++ // FENCE ++ // NOP ++ // FENCE ++ // RET ++ want := "0f 00 f0 0f 13 00 00 00 0f 00 f0 0f 67 80 00 00" ++ if !strings.Contains(string(out), want) { ++ t.Errorf("PCALIGN test failed - got %s\nwant %s", out, want) ++ } ++} +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 195cd26413..02d08fec76 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -308,6 +308,12 @@ func setPCs(p *obj.Prog, pc int64) int64 { + for _, ins := range instructionsForProg(p) { + pc += int64(ins.length()) + } ++ ++ if p.As == obj.APCALIGN { ++ alignedValue := p.From.Offset ++ v := pcAlignPadLength(pc, alignedValue) ++ pc += int64(v) ++ } + } + return pc + } +@@ -733,6 +739,16 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: high, Sym: cursym} + p.Link.To.Offset = low + } ++ ++ case obj.APCALIGN: ++ 
alignedValue := p.From.Offset ++ if (alignedValue&(alignedValue-1) != 0) || 4 > alignedValue || alignedValue > 2048 { ++ ctxt.Diag("alignment value of an instruction must be a power of two and in the range [4, 2048], got %d\n", alignedValue) ++ } ++ // Update the current text symbol alignment value. ++ if int32(alignedValue) > cursym.Func().Align { ++ cursym.Func().Align = int32(alignedValue) ++ } + } + } + +@@ -744,6 +760,10 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + } + } + ++func pcAlignPadLength(pc int64, alignedValue int64) int { ++ return int(-pc & (alignedValue - 1)) ++} ++ + func stacksplit(ctxt *obj.Link, p *obj.Prog, cursym *obj.LSym, newprog obj.ProgAlloc, framesize int64) *obj.Prog { + // Leaf function with no frame is effectively NOSPLIT. + if framesize == 0 { +@@ -1707,6 +1727,7 @@ var encodings = [ALAST & obj.AMask]encoding{ + obj.ANOP: pseudoOpEncoding, + obj.ADUFFZERO: pseudoOpEncoding, + obj.ADUFFCOPY: pseudoOpEncoding, ++ obj.APCALIGN: pseudoOpEncoding, + } + + // encodingForAs returns the encoding for an obj.As. +@@ -2421,6 +2442,17 @@ func assemble(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + rel.Sym = addr.Sym + rel.Add = addr.Offset + rel.Type = rt ++ ++ case obj.APCALIGN: ++ alignedValue := p.From.Offset ++ v := pcAlignPadLength(p.Pc, alignedValue) ++ offset := p.Pc ++ for ; v >= 4; v -= 4 { ++ // NOP ++ cursym.WriteBytes(ctxt, offset, []byte{0x13, 0, 0, 0}) ++ offset += 4 ++ } ++ continue + } + + offset := p.Pc +-- +2.39.5 + diff --git a/2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch b/2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch new file mode 100644 index 0000000..a7edc82 --- /dev/null +++ b/2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch @@ -0,0 +1,94 @@ +From d8a1c916d517a531b1862fe55a22086b3e5767c7 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 022/119] internal/bytealg: optimize Count with PCALIGN in + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For #63678 + +Benchmark on Milk-V Mars CM eMMC (Starfive/JH7110 SoC) + +goos: linux +goarch: riscv64 +pkg: bytes + │ /root/bytes.old.bench │ /root/bytes.pc16.bench │ + │ sec/op │ sec/op vs base │ +Count/10 223.9n ± 1% 220.8n ± 1% -1.36% (p=0.001 n=10) +Count/32 571.6n ± 0% 571.3n ± 0% ~ (p=0.054 n=10) +Count/4K 38.56µ ± 0% 38.55µ ± 0% -0.01% (p=0.010 n=10) +Count/4M 40.13m ± 0% 39.21m ± 0% -2.28% (p=0.000 n=10) +Count/64M 627.5m ± 0% 627.4m ± 0% -0.01% (p=0.019 n=10) +CountEasy/10 101.3n ± 0% 101.3n ± 0% ~ (p=1.000 n=10) ¹ +CountEasy/32 139.3n ± 0% 139.3n ± 0% ~ (p=1.000 n=10) ¹ +CountEasy/4K 5.565µ ± 0% 5.564µ ± 0% -0.02% (p=0.001 n=10) +CountEasy/4M 5.619m ± 0% 5.619m ± 0% ~ (p=0.190 n=10) +CountEasy/64M 89.94m ± 0% 89.93m ± 0% ~ (p=0.436 n=10) +CountSingle/10 53.80n ± 0% 46.06n ± 0% -14.39% (p=0.000 n=10) +CountSingle/32 104.30n ± 0% 79.64n ± 0% -23.64% (p=0.000 n=10) +CountSingle/4K 10.413µ ± 0% 7.247µ ± 0% -30.40% (p=0.000 n=10) +CountSingle/4M 11.603m ± 0% 8.388m ± 0% -27.71% (p=0.000 n=10) +CountSingle/64M 230.9m ± 0% 172.3m ± 0% -25.40% (p=0.000 n=10) +CountHard1 9.981m ± 0% 9.981m ± 0% ~ (p=0.810 n=10) +CountHard2 9.981m ± 0% 9.981m ± 0% ~ (p=0.315 n=10) +CountHard3 9.981m ± 0% 9.981m ± 0% ~ (p=0.159 n=10) +geomean 144.6µ 133.5µ -7.70% +¹ all samples are equal + + │ /root/bytes.old.bench │ /root/bytes.pc16.bench │ + │ B/s │ B/s vs base │ +Count/10 42.60Mi ± 1% 43.19Mi ± 1% +1.39% (p=0.001 n=10) 
+Count/32 53.38Mi ± 0% 53.42Mi ± 0% +0.06% (p=0.049 n=10) +Count/4K 101.3Mi ± 0% 101.3Mi ± 0% ~ (p=0.077 n=10) +Count/4M 99.68Mi ± 0% 102.01Mi ± 0% +2.34% (p=0.000 n=10) +Count/64M 102.0Mi ± 0% 102.0Mi ± 0% ~ (p=0.076 n=10) +CountEasy/10 94.18Mi ± 0% 94.18Mi ± 0% ~ (p=0.054 n=10) +CountEasy/32 219.1Mi ± 0% 219.1Mi ± 0% +0.01% (p=0.016 n=10) +CountEasy/4K 702.0Mi ± 0% 702.0Mi ± 0% +0.00% (p=0.000 n=10) +CountEasy/4M 711.9Mi ± 0% 711.9Mi ± 0% ~ (p=0.133 n=10) +CountEasy/64M 711.6Mi ± 0% 711.7Mi ± 0% ~ (p=0.447 n=10) +CountSingle/10 177.2Mi ± 0% 207.0Mi ± 0% +16.81% (p=0.000 n=10) +CountSingle/32 292.7Mi ± 0% 383.2Mi ± 0% +30.91% (p=0.000 n=10) +CountSingle/4K 375.1Mi ± 0% 539.0Mi ± 0% +43.70% (p=0.000 n=10) +CountSingle/4M 344.7Mi ± 0% 476.9Mi ± 0% +38.33% (p=0.000 n=10) +CountSingle/64M 277.2Mi ± 0% 371.5Mi ± 0% +34.05% (p=0.000 n=10) +geomean 199.7Mi 219.8Mi +10.10% + +Change-Id: I1abf6b220b9802028f8ad5eebc8d3b7cfa3e89ea +Reviewed-on: https://go-review.googlesource.com/c/go/+/541756 +Reviewed-by: David Chase +Reviewed-by: Cherry Mui +Reviewed-by: Joel Sing +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Wang Yaduo +Reviewed-by: Mark Ryan +--- + src/internal/bytealg/count_riscv64.s | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s +index d123cbd7c6..3f255cd263 100644 +--- a/src/internal/bytealg/count_riscv64.s ++++ b/src/internal/bytealg/count_riscv64.s +@@ -14,6 +14,7 @@ TEXT ·Count(SB),NOSPLIT,$0-40 + MOV ZERO, X14 // count + ADD X10, X11 // end + ++ PCALIGN $16 + loop: + BEQ X10, X11, done + MOVBU (X10), X15 +@@ -34,6 +35,7 @@ TEXT ·CountString(SB),NOSPLIT,$0-32 + MOV ZERO, X14 // count + ADD X10, X11 // end + ++ PCALIGN $16 + loop: + BEQ X10, X11, done + MOVBU (X10), X15 +-- +2.39.5 + diff --git a/2023-cmd-compile-correct-code-generation-for-right-shifts.patch b/2023-cmd-compile-correct-code-generation-for-right-shifts.patch new file mode 100644 index 0000000..9f62ab0 --- /dev/null +++ b/2023-cmd-compile-correct-code-generation-for-right-shifts.patch @@ -0,0 +1,980 @@ +From 5cd157e29e4ea9dec96f4a7e6d35eb80ebdbee98 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 023/119] cmd/compile: correct code generation for right shifts + on riscv64 + +The code generation on riscv64 will currently result in incorrect +assembly when a 32 bit integer is right shifted by an amount that +exceeds the size of the type. In particular, this occurs when an +int32 or uint32 is cast to a 64 bit type and right shifted by a +value larger than 31. + +Fix this by moving the SRAW/SRLW conversion into the right shift +rules and removing the SignExt32to64/ZeroExt32to64. Add additional +rules that rewrite to SRAIW/SRLIW when the shift is less than the +size of the type, or replace/eliminate the shift when it exceeds +the size of the type. + +Add SSA tests that would have caught this issue. Also add additional +codegen tests to ensure that the resulting assembly is what we +expect in these overflow cases. 
+ +Fixes #64285 + +Change-Id: Ie97b05668597cfcb91413afefaab18ee1aa145ec +Reviewed-on: https://go-review.googlesource.com/c/go/+/545035 +Reviewed-by: Russ Cox +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +Reviewed-by: Mark Ryan +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 100 +++-- + .../compile/internal/ssa/rewriteRISCV64.go | 415 ++++++++++-------- + .../internal/test/testdata/arith_test.go | 66 +++ + test/codegen/shift.go | 30 ++ + 4 files changed, 387 insertions(+), 224 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 9afe5995ae..fc206c42d3 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -153,27 +153,27 @@ + // SRL only considers the bottom 6 bits of y, similarly SRLW only considers the + // bottom 5 bits of y. Ensure that the result is always zero if the shift exceeds + // the maximum value. See Lsh above for a detailed description. +-(Rsh8Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt8to64 y)))) +-(Rsh8Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt16to64 y)))) +-(Rsh8Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt32to64 y)))) +-(Rsh8Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] y))) +-(Rsh16Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt8to64 y)))) +-(Rsh16Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt16to64 y)))) +-(Rsh16Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt32to64 y)))) +-(Rsh16Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] y))) +-(Rsh32Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) +-(Rsh32Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) +-(Rsh32Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) +-(Rsh32Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] y))) +-(Rsh64Ux8 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt8to64 y)))) +-(Rsh64Ux16 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt16to64 y)))) +-(Rsh64Ux32 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt32to64 y)))) +-(Rsh64Ux64 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] y))) +- +-(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt8to64 x) y) +-(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt16to64 x) y) +-(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt32to64 x) y) +-(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y) ++(Rsh8Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt8to64 y)))) ++(Rsh8Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt16to64 y)))) ++(Rsh8Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt32to64 y)))) ++(Rsh8Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] y))) ++(Rsh16Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU 
[64] (ZeroExt8to64 y)))) ++(Rsh16Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt16to64 y)))) ++(Rsh16Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt32to64 y)))) ++(Rsh16Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] y))) ++(Rsh32Ux8 x y) && !shiftIsBounded(v) => (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) ++(Rsh32Ux16 x y) && !shiftIsBounded(v) => (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) ++(Rsh32Ux32 x y) && !shiftIsBounded(v) => (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) ++(Rsh32Ux64 x y) && !shiftIsBounded(v) => (AND (SRLW x y) (Neg32 (SLTIU [32] y))) ++(Rsh64Ux8 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt8to64 y)))) ++(Rsh64Ux16 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt16to64 y)))) ++(Rsh64Ux32 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt32to64 y)))) ++(Rsh64Ux64 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] y))) ++ ++(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt8to64 x) y) ++(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt16to64 x) y) ++(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRLW x y) ++(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y) + + // SRA only considers the bottom 6 bits of y, similarly SRAW only considers the + // bottom 5 bits. If y is greater than the maximum value (either 63 or 31 +@@ -188,27 +188,27 @@ + // + // We don't need to sign-extend the OR result, as it will be at minimum 8 bits, + // more than the 5 or 6 bits SRAW and SRA care about. +-(Rsh8x8 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) +-(Rsh8x16 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) +-(Rsh8x32 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +-(Rsh8x64 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) +-(Rsh16x8 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) +-(Rsh16x16 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) +-(Rsh16x32 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +-(Rsh16x64 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) +-(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) +-(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) +-(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) +-(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] y)))) +-(Rsh64x8 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) +-(Rsh64x16 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) +-(Rsh64x32 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +-(Rsh64x64 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] y)))) +- +-(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt8to64 x) y) +-(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => 
(SRA (SignExt16to64 x) y) +-(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt32to64 x) y) +-(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y) ++(Rsh8x8 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) ++(Rsh8x16 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) ++(Rsh8x32 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) ++(Rsh8x64 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) ++(Rsh16x8 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) ++(Rsh16x16 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) ++(Rsh16x32 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) ++(Rsh16x64 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) ++(Rsh32x8 x y) && !shiftIsBounded(v) => (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) ++(Rsh32x16 x y) && !shiftIsBounded(v) => (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) ++(Rsh32x32 x y) && !shiftIsBounded(v) => (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) ++(Rsh32x64 x y) && !shiftIsBounded(v) => (SRAW x (OR y (ADDI [-1] (SLTIU [32] y)))) ++(Rsh64x8 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) ++(Rsh64x16 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) ++(Rsh64x32 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) ++(Rsh64x64 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] y)))) ++ ++(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt8to64 x) y) ++(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt16to64 x) y) ++(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y) ++(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y) + + // Rotates. + (RotateLeft8 x (MOVDconst [c])) => (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) +@@ -710,10 +710,18 @@ + (MOVDnop (MOVDconst [c])) => (MOVDconst [c]) + + // Avoid unnecessary zero and sign extension when right shifting. +-(SRL (MOVWUreg x) y) => (SRLW x y) +-(SRLI [x] (MOVWUreg y)) => (SRLIW [int64(x&31)] y) +-(SRA (MOVWreg x) y) => (SRAW x y) +-(SRAI [x] (MOVWreg y)) => (SRAIW [int64(x&31)] y) ++(SRAI [x] (MOVWreg y)) && x >= 0 && x <= 31 => (SRAIW [int64(x)] y) ++(SRLI [x] (MOVWUreg y)) && x >= 0 && x <= 31 => (SRLIW [int64(x)] y) ++ ++// Replace right shifts that exceed size of signed type. ++(SRAI [x] (MOVBreg y)) && x >= 8 => (SRAI [63] (SLLI [56] y)) ++(SRAI [x] (MOVHreg y)) && x >= 16 => (SRAI [63] (SLLI [48] y)) ++(SRAI [x] (MOVWreg y)) && x >= 32 => (SRAIW [31] y) ++ ++// Eliminate right shifts that exceed size of unsigned type. ++(SRLI [x] (MOVBUreg y)) && x >= 8 => (MOVDconst [0]) ++(SRLI [x] (MOVHUreg y)) && x >= 16 => (MOVDconst [0]) ++(SRLI [x] (MOVWUreg y)) && x >= 32 => (MOVDconst [0]) + + // Fold constant into immediate instructions where possible. 
+ (ADD (MOVDconst [val]) x) && is32Bit(val) && !t.IsPtr() => (ADDI [val] x) +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 572dac249e..41edcdf8b8 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -2,8 +2,10 @@ + + package ssa + +-import "math" +-import "cmd/compile/internal/types" ++import ( ++ "cmd/compile/internal/types" ++ "math" ++) + + func rewriteValueRISCV64(v *Value) bool { + switch v.Op { +@@ -6260,20 +6262,6 @@ func rewriteValueRISCV64_OpRISCV64SNEZ(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRA(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (SRA (MOVWreg x) y) +- // result: (SRAW x y) +- for { +- t := v.Type +- if v_0.Op != OpRISCV64MOVWreg { +- break +- } +- x := v_0.Args[0] +- y := v_1 +- v.reset(OpRISCV64SRAW) +- v.Type = t +- v.AddArg2(x, y) +- return true +- } + // match: (SRA x (MOVDconst [val])) + // result: (SRAI [int64(val&63)] x) + for { +@@ -6291,8 +6279,10 @@ func rewriteValueRISCV64_OpRISCV64SRA(v *Value) bool { + } + func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + v_0 := v.Args[0] ++ b := v.Block + // match: (SRAI [x] (MOVWreg y)) +- // result: (SRAIW [int64(x&31)] y) ++ // cond: x >= 0 && x <= 31 ++ // result: (SRAIW [int64(x)] y) + for { + t := v.Type + x := auxIntToInt64(v.AuxInt) +@@ -6300,9 +6290,71 @@ func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + break + } + y := v_0.Args[0] ++ if !(x >= 0 && x <= 31) { ++ break ++ } + v.reset(OpRISCV64SRAIW) + v.Type = t +- v.AuxInt = int64ToAuxInt(int64(x & 31)) ++ v.AuxInt = int64ToAuxInt(int64(x)) ++ v.AddArg(y) ++ return true ++ } ++ // match: (SRAI [x] (MOVBreg y)) ++ // cond: x >= 8 ++ // result: (SRAI [63] (SLLI [56] y)) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVBreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(x >= 8) { ++ break ++ } ++ v.reset(OpRISCV64SRAI) ++ v.AuxInt = int64ToAuxInt(63) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLLI, t) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(y) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAI [x] (MOVHreg y)) ++ // cond: x >= 16 ++ // result: (SRAI [63] (SLLI [48] y)) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVHreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(x >= 16) { ++ break ++ } ++ v.reset(OpRISCV64SRAI) ++ v.AuxInt = int64ToAuxInt(63) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLLI, t) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(y) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAI [x] (MOVWreg y)) ++ // cond: x >= 32 ++ // result: (SRAIW [31] y) ++ for { ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVWreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(x >= 32) { ++ break ++ } ++ v.reset(OpRISCV64SRAIW) ++ v.AuxInt = int64ToAuxInt(31) + v.AddArg(y) + return true + } +@@ -6341,20 +6393,6 @@ func rewriteValueRISCV64_OpRISCV64SRAW(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (SRL (MOVWUreg x) y) +- // result: (SRLW x y) +- for { +- t := v.Type +- if v_0.Op != OpRISCV64MOVWUreg { +- break +- } +- x := v_0.Args[0] +- y := v_1 +- v.reset(OpRISCV64SRLW) +- v.Type = t +- v.AddArg2(x, y) +- return true +- } + // match: (SRL x (MOVDconst [val])) + // result: (SRLI [int64(val&63)] x) + for { +@@ -6373,7 +6411,8 @@ func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRLI(v *Value) 
bool { + v_0 := v.Args[0] + // match: (SRLI [x] (MOVWUreg y)) +- // result: (SRLIW [x] y) ++ // cond: x >= 0 && x <= 31 ++ // result: (SRLIW [int64(x)] y) + for { + t := v.Type + x := auxIntToInt64(v.AuxInt) +@@ -6381,12 +6420,66 @@ func rewriteValueRISCV64_OpRISCV64SRLI(v *Value) bool { + break + } + y := v_0.Args[0] ++ if !(x >= 0 && x <= 31) { ++ break ++ } + v.reset(OpRISCV64SRLIW) + v.Type = t +- v.AuxInt = int64ToAuxInt(x) ++ v.AuxInt = int64ToAuxInt(int64(x)) + v.AddArg(y) + return true + } ++ // match: (SRLI [x] (MOVBUreg y)) ++ // cond: x >= 8 ++ // result: (MOVDconst [0]) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVBUreg { ++ break ++ } ++ if !(x >= 8) { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } ++ // match: (SRLI [x] (MOVHUreg y)) ++ // cond: x >= 16 ++ // result: (MOVDconst [0]) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVHUreg { ++ break ++ } ++ if !(x >= 16) { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } ++ // match: (SRLI [x] (MOVWUreg y)) ++ // cond: x >= 32 ++ // result: (MOVDconst [0]) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVWUreg { ++ break ++ } ++ if !(x >= 32) { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } + // match: (SRLI [x] (MOVDconst [y])) + // result: (MOVDconst [int64(uint64(y) >> uint32(x))]) + for { +@@ -7035,7 +7128,7 @@ func rewriteValueRISCV64_OpRsh32Ux16(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux16 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) ++ // result: (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) + for { + t := v.Type + x := v_0 +@@ -7044,33 +7137,29 @@ func rewriteValueRISCV64_OpRsh32Ux16(v *Value) bool { + break + } + v.reset(OpRISCV64AND) +- v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpNeg32, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpNeg32, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v.AddArg2(v0, v2) ++ v1.AddArg(v2) ++ v.AddArg2(v0, v1) + return true + } + // match: (Rsh32Ux16 x y) + // cond: shiftIsBounded(v) +- // result: (SRL (ZeroExt32to64 x) y) ++ // result: (SRLW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRL) +- v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRLW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7082,7 +7171,7 @@ func rewriteValueRISCV64_OpRsh32Ux32(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux32 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) ++ // result: (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) + for { + t := v.Type + x := v_0 +@@ -7091,33 +7180,29 @@ func rewriteValueRISCV64_OpRsh32Ux32(v *Value) 
bool { + break + } + v.reset(OpRISCV64AND) +- v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpNeg32, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpNeg32, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v.AddArg2(v0, v2) ++ v1.AddArg(v2) ++ v.AddArg2(v0, v1) + return true + } + // match: (Rsh32Ux32 x y) + // cond: shiftIsBounded(v) +- // result: (SRL (ZeroExt32to64 x) y) ++ // result: (SRLW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRL) +- v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRLW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7126,10 +7211,9 @@ func rewriteValueRISCV64_OpRsh32Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block +- typ := &b.Func.Config.Types + // match: (Rsh32Ux64 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] y))) ++ // result: (AND (SRLW x y) (Neg32 (SLTIU [32] y))) + for { + t := v.Type + x := v_0 +@@ -7138,31 +7222,27 @@ func rewriteValueRISCV64_OpRsh32Ux64(v *Value) bool { + break + } + v.reset(OpRISCV64AND) +- v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpNeg32, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(32) +- v3.AddArg(y) +- v2.AddArg(v3) +- v.AddArg2(v0, v2) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpNeg32, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) ++ v2.AuxInt = int64ToAuxInt(32) ++ v2.AddArg(y) ++ v1.AddArg(v2) ++ v.AddArg2(v0, v1) + return true + } + // match: (Rsh32Ux64 x y) + // cond: shiftIsBounded(v) +- // result: (SRL (ZeroExt32to64 x) y) ++ // result: (SRLW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRL) +- v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRLW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7174,7 +7254,7 @@ func rewriteValueRISCV64_OpRsh32Ux8(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux8 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) ++ // result: (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) + for { + t := v.Type + x := v_0 +@@ -7183,33 +7263,29 @@ func rewriteValueRISCV64_OpRsh32Ux8(v *Value) bool { + break + } + v.reset(OpRISCV64AND) +- v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpNeg32, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpNeg32, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) ++ 
v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v.AddArg2(v0, v2) ++ v1.AddArg(v2) ++ v.AddArg2(v0, v1) + return true + } + // match: (Rsh32Ux8 x y) + // cond: shiftIsBounded(v) +- // result: (SRL (ZeroExt32to64 x) y) ++ // result: (SRLW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRL) +- v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRLW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7221,7 +7297,7 @@ func rewriteValueRISCV64_OpRsh32x16(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x16 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) ++ // result: (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7229,36 +7305,32 @@ func rewriteValueRISCV64_OpRsh32x16(v *Value) bool { + if !(!shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) ++ v.reset(OpRISCV64SRAW) + v.Type = t +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) +- v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) +- v2.AuxInt = int64ToAuxInt(-1) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(-1) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v1.AddArg2(y, v2) +- v.AddArg2(v0, v1) ++ v1.AddArg(v2) ++ v0.AddArg2(y, v1) ++ v.AddArg2(x, v0) + return true + } + // match: (Rsh32x16 x y) + // cond: shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) y) ++ // result: (SRAW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRAW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7270,7 +7342,7 @@ func rewriteValueRISCV64_OpRsh32x32(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x32 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) ++ // result: (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7278,36 +7350,32 @@ func rewriteValueRISCV64_OpRsh32x32(v *Value) bool { + if !(!shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) ++ v.reset(OpRISCV64SRAW) + v.Type = t +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) +- v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) +- v2.AuxInt = int64ToAuxInt(-1) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(-1) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- 
v1.AddArg2(y, v2) +- v.AddArg2(v0, v1) ++ v1.AddArg(v2) ++ v0.AddArg2(y, v1) ++ v.AddArg2(x, v0) + return true + } + // match: (Rsh32x32 x y) + // cond: shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) y) ++ // result: (SRAW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRAW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7316,10 +7384,9 @@ func rewriteValueRISCV64_OpRsh32x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block +- typ := &b.Func.Config.Types + // match: (Rsh32x64 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] y)))) ++ // result: (SRAW x (OR y (ADDI [-1] (SLTIU [32] y)))) + for { + t := v.Type + x := v_0 +@@ -7327,34 +7394,30 @@ func rewriteValueRISCV64_OpRsh32x64(v *Value) bool { + if !(!shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) ++ v.reset(OpRISCV64SRAW) + v.Type = t +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) +- v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) +- v2.AuxInt = int64ToAuxInt(-1) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(32) +- v3.AddArg(y) +- v2.AddArg(v3) +- v1.AddArg2(y, v2) +- v.AddArg2(v0, v1) ++ v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(-1) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) ++ v2.AuxInt = int64ToAuxInt(32) ++ v2.AddArg(y) ++ v1.AddArg(v2) ++ v0.AddArg2(y, v1) ++ v.AddArg2(x, v0) + return true + } + // match: (Rsh32x64 x y) + // cond: shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) y) ++ // result: (SRAW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRAW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7366,7 +7429,7 @@ func rewriteValueRISCV64_OpRsh32x8(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x8 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) ++ // result: (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7374,36 +7437,32 @@ func rewriteValueRISCV64_OpRsh32x8(v *Value) bool { + if !(!shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) ++ v.reset(OpRISCV64SRAW) + v.Type = t +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) +- v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) +- v2.AuxInt = int64ToAuxInt(-1) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(-1) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v1.AddArg2(y, v2) +- v.AddArg2(v0, v1) ++ v1.AddArg(v2) ++ v0.AddArg2(y, v1) ++ v.AddArg2(x, v0) + return true + } + // match: (Rsh32x8 x y) + // cond: shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) 
y) ++ // result: (SRAW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRAW) ++ v.AddArg2(x, y) + return true + } + return false +diff --git a/src/cmd/compile/internal/test/testdata/arith_test.go b/src/cmd/compile/internal/test/testdata/arith_test.go +index 2b8cd9fad3..cd7b5bc2c4 100644 +--- a/src/cmd/compile/internal/test/testdata/arith_test.go ++++ b/src/cmd/compile/internal/test/testdata/arith_test.go +@@ -268,6 +268,70 @@ func testOverflowConstShift(t *testing.T) { + } + } + ++//go:noinline ++func rsh64x64ConstOverflow8(x int8) int64 { ++ return int64(x) >> 9 ++} ++ ++//go:noinline ++func rsh64x64ConstOverflow16(x int16) int64 { ++ return int64(x) >> 17 ++} ++ ++//go:noinline ++func rsh64x64ConstOverflow32(x int32) int64 { ++ return int64(x) >> 33 ++} ++ ++func testArithRightShiftConstOverflow(t *testing.T) { ++ allSet := int64(-1) ++ if got, want := rsh64x64ConstOverflow8(0x7f), int64(0); got != want { ++ t.Errorf("rsh64x64ConstOverflow8 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow16(0x7fff), int64(0); got != want { ++ t.Errorf("rsh64x64ConstOverflow16 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow32(0x7ffffff), int64(0); got != want { ++ t.Errorf("rsh64x64ConstOverflow32 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow8(int8(-1)), allSet; got != want { ++ t.Errorf("rsh64x64ConstOverflow8 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow16(int16(-1)), allSet; got != want { ++ t.Errorf("rsh64x64ConstOverflow16 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow32(int32(-1)), allSet; got != want { ++ t.Errorf("rsh64x64ConstOverflow32 failed: got %v, want %v", got, want) ++ } ++} ++ ++//go:noinline ++func rsh64Ux64ConstOverflow8(x uint8) uint64 { ++ return uint64(x) >> 9 ++} ++ ++//go:noinline ++func rsh64Ux64ConstOverflow16(x uint16) uint64 { ++ return uint64(x) >> 17 ++} ++ ++//go:noinline ++func rsh64Ux64ConstOverflow32(x uint32) uint64 { ++ return uint64(x) >> 33 ++} ++ ++func testRightShiftConstOverflow(t *testing.T) { ++ if got, want := rsh64Ux64ConstOverflow8(0xff), uint64(0); got != want { ++ t.Errorf("rsh64Ux64ConstOverflow8 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64Ux64ConstOverflow16(0xffff), uint64(0); got != want { ++ t.Errorf("rsh64Ux64ConstOverflow16 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64Ux64ConstOverflow32(0xffffffff), uint64(0); got != want { ++ t.Errorf("rsh64Ux64ConstOverflow32 failed: got %v, want %v", got, want) ++ } ++} ++ + // test64BitConstMult tests that rewrite rules don't fold 64 bit constants + // into multiply instructions. 
+ func test64BitConstMult(t *testing.T) { +@@ -918,6 +982,8 @@ func TestArithmetic(t *testing.T) { + testShiftCX(t) + testSubConst(t) + testOverflowConstShift(t) ++ testArithRightShiftConstOverflow(t) ++ testRightShiftConstOverflow(t) + testArithConstShift(t) + testArithRshConst(t) + testLargeConst(t) +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index b9d888ca6c..51b9b2e39c 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -22,12 +22,42 @@ func rshConst64Ux64(v uint64) uint64 { + return v >> uint64(33) + } + ++func rshConst64Ux64Overflow32(v uint32) uint64 { ++ // riscv64:"MOV\t\\$0,",-"SRL" ++ return uint64(v) >> 32 ++} ++ ++func rshConst64Ux64Overflow16(v uint16) uint64 { ++ // riscv64:"MOV\t\\$0,",-"SRL" ++ return uint64(v) >> 16 ++} ++ ++func rshConst64Ux64Overflow8(v uint8) uint64 { ++ // riscv64:"MOV\t\\$0,",-"SRL" ++ return uint64(v) >> 8 ++} ++ + func rshConst64x64(v int64) int64 { + // ppc64x:"SRAD" + // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint64(33) + } + ++func rshConst64x64Overflow32(v int32) int64 { ++ // riscv64:"SRAIW",-"SLLI",-"SRAI\t" ++ return int64(v) >> 32 ++} ++ ++func rshConst64x64Overflow16(v int16) int64 { ++ // riscv64:"SLLI","SRAI",-"SRAIW" ++ return int64(v) >> 16 ++} ++ ++func rshConst64x64Overflow8(v int8) int64 { ++ // riscv64:"SLLI","SRAI",-"SRAIW" ++ return int64(v) >> 8 ++} ++ + func lshConst32x64(v int32) int32 { + // ppc64x:"SLW" + // riscv64:"SLLI",-"AND",-"SLTIU", -"MOVW" +-- +2.39.5 + diff --git a/2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch b/2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch new file mode 100644 index 0000000..5463752 --- /dev/null +++ b/2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch @@ -0,0 +1,380 @@ +From 981c83755a2aba3e61156f28dd483d8a555dcaa7 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 024/119] crypto/sha512: provide optimised assembly for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide an optimised assembly implementation of sha512 for riscv64. +This results in significant performance gains. 
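+
+As a rough illustration (a hypothetical, minimal benchmark sketch, not part of
+this CL), the numbers below come from crypto/sha512 benchmarks of this general
+shape:
+
+	package sha512bench
+
+	import (
+		"crypto/sha512"
+		"testing"
+	)
+
+	var buf = make([]byte, 8192) // 8K input, matching the Hash8K results
+
+	func BenchmarkHash8K(b *testing.B) {
+		b.SetBytes(int64(len(buf)))
+		for i := 0; i < b.N; i++ {
+			sha512.Sum512(buf)
+		}
+	}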
+ +On a StarFive VisionFive 2: + + │ sha512a │ sha512b │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-4 7.998µ ± 0% 6.962µ ± 0% -12.96% (p=0.000 n=10) +Hash8Bytes/Sum384-4 8.113µ ± 0% 6.651µ ± 0% -18.02% (p=0.000 n=10) +Hash8Bytes/Sum512-4 8.269µ ± 0% 6.748µ ± 0% -18.39% (p=0.000 n=10) +Hash1K/New-4 57.38µ ± 0% 36.92µ ± 0% -35.66% (p=0.000 n=10) +Hash1K/Sum384-4 57.47µ ± 0% 36.57µ ± 0% -36.37% (p=0.000 n=10) +Hash1K/Sum512-4 57.61µ ± 0% 36.75µ ± 0% -36.21% (p=0.000 n=10) +Hash8K/New-4 402.5µ ± 0% 245.4µ ± 0% -39.02% (p=0.000 n=10) +Hash8K/Sum384-4 402.5µ ± 0% 245.1µ ± 0% -39.12% (p=0.000 n=10) +Hash8K/Sum512-4 402.7µ ± 0% 245.3µ ± 0% -39.09% (p=0.000 n=10) + + │ sha512a │ sha512b │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 976.6Ki ± 0% 1123.0Ki ± 0% +15.00% (p=0.000 n=10) +Hash8Bytes/Sum384-4 966.8Ki ± 0% 1171.9Ki ± 0% +21.21% (p=0.000 n=10) +Hash8Bytes/Sum512-4 947.3Ki ± 0% 1162.1Ki ± 1% +22.68% (p=0.000 n=10) +Hash1K/New-4 17.01Mi ± 0% 26.45Mi ± 0% +55.47% (p=0.000 n=10) +Hash1K/Sum384-4 16.99Mi ± 0% 26.70Mi ± 0% +57.13% (p=0.000 n=10) +Hash1K/Sum512-4 16.95Mi ± 0% 26.57Mi ± 0% +56.74% (p=0.000 n=10) +Hash8K/New-4 19.41Mi ± 0% 31.83Mi ± 0% +63.99% (p=0.000 n=10) +Hash8K/Sum384-4 19.41Mi ± 0% 31.88Mi ± 0% +64.28% (p=0.000 n=10) +Hash8K/Sum512-4 19.40Mi ± 0% 31.85Mi ± 0% +64.21% (p=0.000 n=10) + +Change-Id: I92629a106b75b0526e9f2a8fe3cc4a6f7fc63c8c +Reviewed-on: https://go-review.googlesource.com/c/go/+/518631 +Auto-Submit: Dmitri Shuralyov +Reviewed-by: Dmitri Shuralyov +Run-TryBot: Joel Sing +Reviewed-by: M Zhuo +Reviewed-by: Wang Yaduo +TryBot-Result: Gopher Robot +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +--- + src/crypto/sha512/sha512block_decl.go | 2 +- + src/crypto/sha512/sha512block_generic.go | 2 +- + src/crypto/sha512/sha512block_riscv64.s | 291 +++++++++++++++++++++++ + 3 files changed, 293 insertions(+), 2 deletions(-) + create mode 100644 src/crypto/sha512/sha512block_riscv64.s + +diff --git a/src/crypto/sha512/sha512block_decl.go b/src/crypto/sha512/sha512block_decl.go +index 4ad4418bc0..d5d03d0f3c 100644 +--- a/src/crypto/sha512/sha512block_decl.go ++++ b/src/crypto/sha512/sha512block_decl.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build s390x || ppc64le || ppc64 ++//go:build ppc64le || ppc64 || riscv64 || s390x + + package sha512 + +diff --git a/src/crypto/sha512/sha512block_generic.go b/src/crypto/sha512/sha512block_generic.go +index 02ecc2c794..f11c0980bd 100644 +--- a/src/crypto/sha512/sha512block_generic.go ++++ b/src/crypto/sha512/sha512block_generic.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64 ++//go:build !amd64 && !arm64 && !ppc64 && !ppc64le && !riscv64 && !s390x + + package sha512 + +diff --git a/src/crypto/sha512/sha512block_riscv64.s b/src/crypto/sha512/sha512block_riscv64.s +new file mode 100644 +index 0000000000..361aafe49d +--- /dev/null ++++ b/src/crypto/sha512/sha512block_riscv64.s +@@ -0,0 +1,291 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "textflag.h" ++ ++// SHA512 block routine. See sha512block.go for Go equivalent. 
++// ++// The algorithm is detailed in FIPS 180-4: ++// ++// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf ++// ++// Wt = Mt; for 0 <= t <= 15 ++// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79 ++// ++// a = H0 ++// b = H1 ++// c = H2 ++// d = H3 ++// e = H4 ++// f = H5 ++// g = H6 ++// h = H7 ++// ++// for t = 0 to 79 { ++// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt ++// T2 = BIGSIGMA0(a) + Maj(a,b,c) ++// h = g ++// g = f ++// f = e ++// e = d + T1 ++// d = c ++// c = b ++// b = a ++// a = T1 + T2 ++// } ++// ++// H0 = a + H0 ++// H1 = b + H1 ++// H2 = c + H2 ++// H3 = d + H3 ++// H4 = e + H4 ++// H5 = f + H5 ++// H6 = g + H6 ++// H7 = h + H7 ++ ++#define ROR(s, r, d, t1, t2) \ ++ SLL $(64-s), r, t1; \ ++ SRL $(s), r, t2; \ ++ OR t1, t2, d ++ ++// Wt = Mt; for 0 <= t <= 15 ++#define MSGSCHEDULE0(index) \ ++ MOVBU ((index*8)+0)(X29), X5; \ ++ MOVBU ((index*8)+1)(X29), X6; \ ++ MOVBU ((index*8)+2)(X29), X7; \ ++ MOVBU ((index*8)+3)(X29), X8; \ ++ SLL $56, X5; \ ++ SLL $48, X6; \ ++ OR X5, X6, X5; \ ++ SLL $40, X7; \ ++ OR X5, X7, X5; \ ++ SLL $32, X8; \ ++ OR X5, X8, X5; \ ++ MOVBU ((index*8)+4)(X29), X9; \ ++ MOVBU ((index*8)+5)(X29), X6; \ ++ MOVBU ((index*8)+6)(X29), X7; \ ++ MOVBU ((index*8)+7)(X29), X8; \ ++ SLL $24, X9; \ ++ OR X5, X9, X5; \ ++ SLL $16, X6; \ ++ OR X5, X6, X5; \ ++ SLL $8, X7; \ ++ OR X5, X7, X5; \ ++ OR X5, X8, X5; \ ++ MOV X5, (index*8)(X19) ++ ++// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79 ++// SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x) ++// SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x) ++#define MSGSCHEDULE1(index) \ ++ MOV (((index-2)&0xf)*8)(X19), X5; \ ++ MOV (((index-15)&0xf)*8)(X19), X6; \ ++ MOV (((index-7)&0xf)*8)(X19), X9; \ ++ MOV (((index-16)&0xf)*8)(X19), X21; \ ++ ROR(19, X5, X7, X23, X24); \ ++ ROR(61, X5, X8, X23, X24); \ ++ SRL $6, X5; \ ++ XOR X7, X5; \ ++ XOR X8, X5; \ ++ ADD X9, X5; \ ++ ROR(1, X6, X7, X23, X24); \ ++ ROR(8, X6, X8, X23, X24); \ ++ SRL $7, X6; \ ++ XOR X7, X6; \ ++ XOR X8, X6; \ ++ ADD X6, X5; \ ++ ADD X21, X5; \ ++ MOV X5, ((index&0xf)*8)(X19) ++ ++// Calculate T1 in X5. ++// h is also used as an accumulator. Wt is passed in X5. ++// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt ++// BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x) ++// Ch(x, y, z) = (x AND y) XOR (NOT x AND z) ++#define SHA512T1(index, e, f, g, h) \ ++ MOV (index*8)(X18), X8; \ ++ ADD X5, h; \ ++ ROR(14, e, X6, X23, X24); \ ++ ADD X8, h; \ ++ ROR(18, e, X7, X23, X24); \ ++ XOR X7, X6; \ ++ ROR(41, e, X8, X23, X24); \ ++ XOR X8, X6; \ ++ ADD X6, h; \ ++ AND e, f, X5; \ ++ NOT e, X7; \ ++ AND g, X7; \ ++ XOR X7, X5; \ ++ ADD h, X5 ++ ++// Calculate T2 in X6. ++// T2 = BIGSIGMA0(a) + Maj(a, b, c) ++// BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x) ++// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) ++#define SHA512T2(a, b, c) \ ++ ROR(28, a, X6, X23, X24); \ ++ ROR(34, a, X7, X23, X24); \ ++ XOR X7, X6; \ ++ ROR(39, a, X8, X23, X24); \ ++ XOR X8, X6; \ ++ AND a, b, X7; \ ++ AND a, c, X8; \ ++ XOR X8, X7; \ ++ AND b, c, X9; \ ++ XOR X9, X7; \ ++ ADD X7, X6 ++ ++// Calculate T1 and T2, then e = d + T1 and a = T1 + T2. ++// The values for e and a are stored in d and h, ready for rotation. 
++#define SHA512ROUND(index, a, b, c, d, e, f, g, h) \ ++ SHA512T1(index, e, f, g, h); \ ++ SHA512T2(a, b, c); \ ++ MOV X6, h; \ ++ ADD X5, d; \ ++ ADD X5, h ++ ++#define SHA512ROUND0(index, a, b, c, d, e, f, g, h) \ ++ MSGSCHEDULE0(index); \ ++ SHA512ROUND(index, a, b, c, d, e, f, g, h) ++ ++#define SHA512ROUND1(index, a, b, c, d, e, f, g, h) \ ++ MSGSCHEDULE1(index); \ ++ SHA512ROUND(index, a, b, c, d, e, f, g, h) ++ ++// func block(dig *digest, p []byte) ++TEXT ·block(SB),0,$128-32 ++ MOV p_base+8(FP), X29 ++ MOV p_len+16(FP), X30 ++ SRL $7, X30 ++ SLL $7, X30 ++ ++ ADD X29, X30, X28 ++ BEQ X28, X29, end ++ ++ MOV ·_K(SB), X18 // const table ++ ADD $8, X2, X19 // message schedule ++ ++ MOV dig+0(FP), X20 ++ MOV (0*8)(X20), X10 // a = H0 ++ MOV (1*8)(X20), X11 // b = H1 ++ MOV (2*8)(X20), X12 // c = H2 ++ MOV (3*8)(X20), X13 // d = H3 ++ MOV (4*8)(X20), X14 // e = H4 ++ MOV (5*8)(X20), X15 // f = H5 ++ MOV (6*8)(X20), X16 // g = H6 ++ MOV (7*8)(X20), X17 // h = H7 ++ ++loop: ++ SHA512ROUND0(0, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND0(1, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND0(2, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND0(3, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND0(4, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND0(5, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND0(6, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND0(7, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND0(8, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND0(9, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND0(10, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND0(11, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND0(12, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND0(13, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND0(14, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND0(15, X11, X12, X13, X14, X15, X16, X17, X10) ++ ++ SHA512ROUND1(16, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(17, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(18, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(19, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(20, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(21, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(22, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(23, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(24, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(25, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(26, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(27, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(28, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(29, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(30, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(31, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(32, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(33, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(34, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(35, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(36, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(37, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(38, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(39, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(40, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(41, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(42, X16, X17, X10, X11, X12, X13, 
X14, X15) ++ SHA512ROUND1(43, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(44, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(45, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(46, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(47, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(48, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(49, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(50, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(51, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(52, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(53, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(54, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(55, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(56, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(57, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(58, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(59, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(60, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(61, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(62, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(63, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(64, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(65, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(66, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(67, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(68, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(69, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(70, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(71, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(72, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(73, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(74, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(75, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(76, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(77, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(78, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(79, X11, X12, X13, X14, X15, X16, X17, X10) ++ ++ MOV (0*8)(X20), X5 ++ MOV (1*8)(X20), X6 ++ MOV (2*8)(X20), X7 ++ MOV (3*8)(X20), X8 ++ ADD X5, X10 // H0 = a + H0 ++ ADD X6, X11 // H1 = b + H1 ++ ADD X7, X12 // H2 = c + H2 ++ ADD X8, X13 // H3 = d + H3 ++ MOV X10, (0*8)(X20) ++ MOV X11, (1*8)(X20) ++ MOV X12, (2*8)(X20) ++ MOV X13, (3*8)(X20) ++ MOV (4*8)(X20), X5 ++ MOV (5*8)(X20), X6 ++ MOV (6*8)(X20), X7 ++ MOV (7*8)(X20), X8 ++ ADD X5, X14 // H4 = e + H4 ++ ADD X6, X15 // H5 = f + H5 ++ ADD X7, X16 // H6 = g + H6 ++ ADD X8, X17 // H7 = h + H7 ++ MOV X14, (4*8)(X20) ++ MOV X15, (5*8)(X20) ++ MOV X16, (6*8)(X20) ++ MOV X17, (7*8)(X20) ++ ++ ADD $128, X29 ++ BNE X28, X29, loop ++ ++end: ++ RET +-- +2.39.5 + diff --git a/2025-cmd-go-add-GORISCV64-environment-variable.patch b/2025-cmd-go-add-GORISCV64-environment-variable.patch new file mode 100644 index 0000000..285fb4b --- /dev/null +++ b/2025-cmd-go-add-GORISCV64-environment-variable.patch @@ -0,0 +1,396 @@ +From 75cafea7e262d3f3d23ea8a5e90172354558a1e3 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:33 +0800 +Subject: [PATCH 025/119] cmd/go: add GORISCV64 environment variable + +The variable represents the RISC-V user-mode application profile for +which to compile. Valid values are rva20u64 (the default) and +rva22u64. 
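+
+As an illustrative sketch (hypothetical file, not part of this CL), Go code can
+select a profile-specific implementation via the build tags described in the
+following paragraphs:
+
+	//go:build riscv64.rva22u64
+
+	package impl
+
+	// This file is only built when GORISCV64=rva22u64 is selected, so it
+	// may assume the extensions mandated by the RVA22U64 profile.
+	const haveRVA22 = true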
+ +Setting GORISCV64=rva20u64 defines the riscv64.rva20u64 build tag, +sets the internal variable buildcfg.GORISCV64 to 20 and defines the +macro GORISCV64_rva20u64 for use in assembly language code. + +Setting GORISCV64=rva22u64 defines the riscv64.rva20u64 and +riscv64.rva22u64 build tags, sets the internal variable +buildcfg.GORISCV64 to 22 and defines the macro GORISCV64_rva22u64 +for use in assembly language code. + +This patch only provides a mechanism for the compiler and hand-coded +assembly language functions to take advantage of the RISC-V +extensions mandated by the application profiles. Further patches +will be required to get the compiler/assembler and assembly language +functions to actually generate and use these extensions. + +Fixes #61476 + +Change-Id: I9195ae6ee71703cd2112160e89157ab63b8391af +Reviewed-on: https://go-review.googlesource.com/c/go/+/541135 +Reviewed-by: M Zhuo +Reviewed-by: Joel Sing +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Wang Yaduo +Reviewed-by: Cherry Mui +Reviewed-by: Bryan Mills +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +--- + src/cmd/dist/build.go | 15 +++++++ + src/cmd/dist/buildruntime.go | 1 + + src/cmd/go/alldocs.go | 9 +++- + src/cmd/go/internal/cfg/cfg.go | 19 +++++---- + src/cmd/go/internal/help/helpdoc.go | 9 +++- + src/cmd/go/internal/work/gc.go | 5 +++ + src/cmd/go/testdata/script/tooltags.txt | 20 +++++++++ + src/cmd/internal/testdir/testdir_test.go | 2 +- + src/internal/buildcfg/cfg.go | 53 +++++++++++++++++------- + src/internal/buildcfg/cfg_test.go | 14 +++++++ + src/internal/cfg/cfg.go | 1 + + 11 files changed, 122 insertions(+), 26 deletions(-) + +diff --git a/src/cmd/dist/build.go b/src/cmd/dist/build.go +index ce573686ec..7d720cc5e1 100644 +--- a/src/cmd/dist/build.go ++++ b/src/cmd/dist/build.go +@@ -39,6 +39,7 @@ var ( + gomips string + gomips64 string + goppc64 string ++ goriscv64 string + goroot string + goroot_final string + goextlinkenabled string +@@ -184,6 +185,12 @@ func xinit() { + } + goppc64 = b + ++ b = os.Getenv("GORISCV64") ++ if b == "" { ++ b = "rva20u64" ++ } ++ goriscv64 = b ++ + if p := pathf("%s/src/all.bash", goroot); !isfile(p) { + fatalf("$GOROOT is not set correctly or not exported\n"+ + "\tGOROOT=%s\n"+ +@@ -244,6 +251,7 @@ func xinit() { + os.Setenv("GOMIPS", gomips) + os.Setenv("GOMIPS64", gomips64) + os.Setenv("GOPPC64", goppc64) ++ os.Setenv("GORISCV64", goriscv64) + os.Setenv("GOROOT", goroot) + os.Setenv("GOROOT_FINAL", goroot_final) + +@@ -899,6 +907,10 @@ func runInstall(pkg string, ch chan struct{}) { + asmArgs = append(asmArgs, "-D", "GOPPC64_power8") + } + } ++ if goarch == "riscv64" { ++ // Define GORISCV64_value from goriscv64 ++ asmArgs = append(asmArgs, "-D", "GORISCV64_"+goriscv64) ++ } + goasmh := pathf("%s/go_asm.h", workdir) + if IsRuntimePackagePath(pkg) { + asmArgs = append(asmArgs, "-compiling-runtime") +@@ -1253,6 +1265,9 @@ func cmdenv() { + if goarch == "ppc64" || goarch == "ppc64le" { + xprintf(format, "GOPPC64", goppc64) + } ++ if goarch == "riscv64" { ++ xprintf(format, "GORISCV64", goriscv64) ++ } + xprintf(format, "GOWORK", "off") + + if *path { +diff --git a/src/cmd/dist/buildruntime.go b/src/cmd/dist/buildruntime.go +index 816b944400..7095f43772 100644 +--- a/src/cmd/dist/buildruntime.go ++++ b/src/cmd/dist/buildruntime.go +@@ -58,6 +58,7 @@ func mkbuildcfg(file string) { + fmt.Fprintf(&buf, "const defaultGOMIPS = `%s`\n", gomips) + fmt.Fprintf(&buf, "const defaultGOMIPS64 = `%s`\n", gomips64) + fmt.Fprintf(&buf, "const defaultGOPPC64 = `%s`\n", goppc64) ++ 
fmt.Fprintf(&buf, "const defaultGORISCV64 = `%s`\n", goriscv64) + fmt.Fprintf(&buf, "const defaultGOEXPERIMENT = `%s`\n", goexperiment) + fmt.Fprintf(&buf, "const defaultGO_EXTLINK_ENABLED = `%s`\n", goextlinkenabled) + fmt.Fprintf(&buf, "const defaultGO_LDSO = `%s`\n", defaultldso) +diff --git a/src/cmd/go/alldocs.go b/src/cmd/go/alldocs.go +index bb28756133..db737b062e 100644 +--- a/src/cmd/go/alldocs.go ++++ b/src/cmd/go/alldocs.go +@@ -1978,10 +1978,13 @@ + // ppc64.power8, ppc64.power9, and ppc64.power10 + // (or ppc64le.power8, ppc64le.power9, and ppc64le.power10) + // feature build tags. ++// - For GOARCH=riscv64, ++// GORISCV64=rva20u64 and rva22u64 correspond to the riscv64.rva20u64 ++// and riscv64.rva22u64 build tags. + // - For GOARCH=wasm, GOWASM=satconv and signext + // correspond to the wasm.satconv and wasm.signext feature build tags. + // +-// For GOARCH=amd64, arm, ppc64, and ppc64le, a particular feature level ++// For GOARCH=amd64, arm, ppc64, ppc64le, and riscv64, a particular feature level + // sets the feature build tags for all previous levels as well. + // For example, GOAMD64=v2 sets the amd64.v1 and amd64.v2 feature flags. + // This ensures that code making use of v2 features continues to compile +@@ -2275,6 +2278,10 @@ + // GOPPC64 + // For GOARCH=ppc64{,le}, the target ISA (Instruction Set Architecture). + // Valid values are power8 (default), power9, power10. ++// GORISCV64 ++// For GOARCH=riscv64, the RISC-V user-mode application profile for which ++// to compile. Valid values are rva20u64 (default), rva22u64. ++// See https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc + // GOWASM + // For GOARCH=wasm, comma-separated list of experimental WebAssembly features to use. + // Valid values are satconv, signext. +diff --git a/src/cmd/go/internal/cfg/cfg.go b/src/cmd/go/internal/cfg/cfg.go +index 3b591a17d0..da7174c153 100644 +--- a/src/cmd/go/internal/cfg/cfg.go ++++ b/src/cmd/go/internal/cfg/cfg.go +@@ -409,14 +409,15 @@ var ( + GOMODCACHE = envOr("GOMODCACHE", gopathDir("pkg/mod")) + + // Used in envcmd.MkEnv and build ID computations. 
+- GOARM = envOr("GOARM", fmt.Sprint(buildcfg.GOARM)) +- GOARM64 = envOr("GOARM64", fmt.Sprint(buildcfg.GOARM64)) +- GO386 = envOr("GO386", buildcfg.GO386) +- GOAMD64 = envOr("GOAMD64", fmt.Sprintf("%s%d", "v", buildcfg.GOAMD64)) +- GOMIPS = envOr("GOMIPS", buildcfg.GOMIPS) +- GOMIPS64 = envOr("GOMIPS64", buildcfg.GOMIPS64) +- GOPPC64 = envOr("GOPPC64", fmt.Sprintf("%s%d", "power", buildcfg.GOPPC64)) +- GOWASM = envOr("GOWASM", fmt.Sprint(buildcfg.GOWASM)) ++ GOARM = envOr("GOARM", fmt.Sprint(buildcfg.GOARM)) ++ GOARM64 = envOr("GOARM64", fmt.Sprint(buildcfg.GOARM64)) ++ GO386 = envOr("GO386", buildcfg.GO386) ++ GOAMD64 = envOr("GOAMD64", fmt.Sprintf("%s%d", "v", buildcfg.GOAMD64)) ++ GOMIPS = envOr("GOMIPS", buildcfg.GOMIPS) ++ GOMIPS64 = envOr("GOMIPS64", buildcfg.GOMIPS64) ++ GOPPC64 = envOr("GOPPC64", fmt.Sprintf("%s%d", "power", buildcfg.GOPPC64)) ++ GORISCV64 = envOr("GORISCV64", fmt.Sprintf("rva%du64", buildcfg.GORISCV64)) ++ GOWASM = envOr("GOWASM", fmt.Sprint(buildcfg.GOWASM)) + + GOPROXY = envOr("GOPROXY", "") + GOSUMDB = envOr("GOSUMDB", "") +@@ -449,6 +450,8 @@ func GetArchEnv() (key, val string) { + return "GOMIPS64", GOMIPS64 + case "ppc64", "ppc64le": + return "GOPPC64", GOPPC64 ++ case "riscv64": ++ return "GORISCV64", GORISCV64 + case "wasm": + return "GOWASM", GOWASM + } +diff --git a/src/cmd/go/internal/help/helpdoc.go b/src/cmd/go/internal/help/helpdoc.go +index 68ac4d229d..55701bac46 100644 +--- a/src/cmd/go/internal/help/helpdoc.go ++++ b/src/cmd/go/internal/help/helpdoc.go +@@ -617,6 +617,10 @@ Architecture-specific environment variables: + GOPPC64 + For GOARCH=ppc64{,le}, the target ISA (Instruction Set Architecture). + Valid values are power8 (default), power9, power10. ++ GORISCV64 ++ For GOARCH=riscv64, the RISC-V user-mode application profile for which ++ to compile. Valid values are rva20u64 (default), rva22u64. ++ See https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc + GOWASM + For GOARCH=wasm, comma-separated list of experimental WebAssembly features to use. + Valid values are satconv, signext. +@@ -905,10 +909,13 @@ The defined architecture feature build tags are: + ppc64.power8, ppc64.power9, and ppc64.power10 + (or ppc64le.power8, ppc64le.power9, and ppc64le.power10) + feature build tags. ++ - For GOARCH=riscv64, ++ GORISCV64=rva20u64 and rva22u64 correspond to the riscv64.rva20u64 ++ and riscv64.rva22u64 build tags. + - For GOARCH=wasm, GOWASM=satconv and signext + correspond to the wasm.satconv and wasm.signext feature build tags. + +-For GOARCH=amd64, arm, ppc64, and ppc64le, a particular feature level ++For GOARCH=amd64, arm, ppc64, ppc64le, and riscv64, a particular feature level + sets the feature build tags for all previous levels as well. + For example, GOAMD64=v2 sets the amd64.v1 and amd64.v2 feature flags. + This ensures that code making use of v2 features continues to compile +diff --git a/src/cmd/go/internal/work/gc.go b/src/cmd/go/internal/work/gc.go +index f682219b3b..a8bd121472 100644 +--- a/src/cmd/go/internal/work/gc.go ++++ b/src/cmd/go/internal/work/gc.go +@@ -409,6 +409,11 @@ func asmArgs(a *Action, p *load.Package) []any { + } + } + ++ if cfg.Goarch == "riscv64" { ++ // Define GORISCV64_value from cfg.GORISCV64. 
++ args = append(args, "-D", "GORISCV64_"+cfg.GORISCV64) ++ } ++ + return args + } + +diff --git a/src/cmd/go/testdata/script/tooltags.txt b/src/cmd/go/testdata/script/tooltags.txt +index 27068eebae..1f6f54563c 100644 +--- a/src/cmd/go/testdata/script/tooltags.txt ++++ b/src/cmd/go/testdata/script/tooltags.txt +@@ -40,6 +40,26 @@ env GOPPC64=power10 + go list -f '{{context.ToolTags}}' + stdout 'ppc64le.power8 ppc64le.power9 ppc64le.power10' + ++env GOARCH=riscv64 ++env GORISCV64=rva20u64 ++go list -f '{{context.ToolTags}}' ++stdout 'riscv64.rva20u64' ++ ++env GOARCH=riscv64 ++env GORISCV64=rva22u64 ++go list -f '{{context.ToolTags}}' ++stdout 'riscv64.rva20u64 riscv64.rva22u64' ++ ++env GOARCH=riscv64 ++env GORISCV64=rva22 ++! go list -f '{{context.ToolTags}}' ++stderr 'go: invalid GORISCV64: must be rva20u64, rva22u64' ++ ++env GOARCH=riscv64 ++env GORISCV64= ++go list -f '{{context.ToolTags}}' ++stdout 'riscv64.rva20u64' ++ + env GOARCH=386 + env GO386=sse2 + go list -f '{{context.ToolTags}}' +diff --git a/src/cmd/internal/testdir/testdir_test.go b/src/cmd/internal/testdir/testdir_test.go +index bd7785900c..1677191d96 100644 +--- a/src/cmd/internal/testdir/testdir_test.go ++++ b/src/cmd/internal/testdir/testdir_test.go +@@ -1464,7 +1464,7 @@ var ( + "ppc64x": {}, // A pseudo-arch representing both ppc64 and ppc64le + "s390x": {}, + "wasm": {}, +- "riscv64": {}, ++ "riscv64": {"GORISCV64", "rva20u64", "rva22u64"}, + } + ) + +diff --git a/src/internal/buildcfg/cfg.go b/src/internal/buildcfg/cfg.go +index dbb1f70ec3..599e782c7a 100644 +--- a/src/internal/buildcfg/cfg.go ++++ b/src/internal/buildcfg/cfg.go +@@ -21,20 +21,21 @@ import ( + ) + + var ( +- GOROOT = runtime.GOROOT() // cached for efficiency +- GOARCH = envOr("GOARCH", defaultGOARCH) +- GOOS = envOr("GOOS", defaultGOOS) +- GO386 = envOr("GO386", defaultGO386) +- GOAMD64 = goamd64() +- GOARM = goarm() +- GOARM64 = goarm64() +- GOMIPS = gomips() +- GOMIPS64 = gomips64() +- GOPPC64 = goppc64() +- GOWASM = gowasm() +- ToolTags = toolTags() +- GO_LDSO = defaultGO_LDSO +- Version = version ++ GOROOT = runtime.GOROOT() // cached for efficiency ++ GOARCH = envOr("GOARCH", defaultGOARCH) ++ GOOS = envOr("GOOS", defaultGOOS) ++ GO386 = envOr("GO386", defaultGO386) ++ GOAMD64 = goamd64() ++ GOARM = goarm() ++ GOARM64 = goarm64() ++ GOMIPS = gomips() ++ GOMIPS64 = gomips64() ++ GOPPC64 = goppc64() ++ GORISCV64 = goriscv64() ++ GOWASM = gowasm() ++ ToolTags = toolTags() ++ GO_LDSO = defaultGO_LDSO ++ Version = version + ) + + // Error is one of the errors found (if any) in the build configuration. 
+@@ -139,7 +140,7 @@ func ParseGoarm64(v string) (g Goarm64Features, e error) { + + switch v { + case "v8.0", "v8.1", "v8.2", "v8.3", "v8.4", "v8.5", "v8.6", "v8.7", "v8.8", "v8.9", +- "v9.0", "v9.1", "v9.2", "v9.4", "v9.5": ++ "v9.0", "v9.1", "v9.2", "v9.4", "v9.5": + g.Version = v + default: + e = fmt.Errorf("invalid GOARM64: must start with v8.{0-9} or v9.{0-5} and may optionally end in %q and/or %q", +@@ -213,6 +214,22 @@ func goppc64() int { + return int(defaultGOPPC64[len("power")] - '0') + } + ++func goriscv64() int { ++ switch v := envOr("GORISCV64", defaultGORISCV64); v { ++ case "rva20u64": ++ return 20 ++ case "rva22u64": ++ return 22 ++ } ++ Error = fmt.Errorf("invalid GORISCV64: must be rva20u64, rva22u64") ++ v := defaultGORISCV64[len("rva"):] ++ i := strings.IndexFunc(v, func(r rune) bool { ++ return r < '0' || r > '9' ++ }) ++ year, _ := strconv.Atoi(v[:i]) ++ return year ++} ++ + type gowasmFeatures struct { + SatConv bool + SignExt bool +@@ -331,6 +348,12 @@ func gogoarchTags() []string { + list = append(list, fmt.Sprintf("%s.power%d", GOARCH, i)) + } + return list ++ case "riscv64": ++ list := []string{GOARCH + "." + "rva20u64"} ++ if GORISCV64 >= 22 { ++ list = append(list, GOARCH+"."+"rva22u64") ++ } ++ return list + case "wasm": + var list []string + if GOWASM.SatConv { +diff --git a/src/internal/buildcfg/cfg_test.go b/src/internal/buildcfg/cfg_test.go +index 0123593317..69eeef2422 100644 +--- a/src/internal/buildcfg/cfg_test.go ++++ b/src/internal/buildcfg/cfg_test.go +@@ -23,4 +23,18 @@ func TestConfigFlags(t *testing.T) { + if goamd64(); Error == nil { + t.Errorf("Wrong parsing of GOAMD64=1") + } ++ ++ os.Setenv("GORISCV64", "rva20u64") ++ if goriscv64() != 20 { ++ t.Errorf("Wrong parsing of RISCV64=rva20u64") ++ } ++ os.Setenv("GORISCV64", "rva22u64") ++ if goriscv64() != 22 { ++ t.Errorf("Wrong parsing of RISCV64=rva22u64") ++ } ++ Error = nil ++ os.Setenv("GORISCV64", "rva22") ++ if _ = goriscv64(); Error == nil { ++ t.Errorf("Wrong parsing of RISCV64=rva22") ++ } + } +diff --git a/src/internal/cfg/cfg.go b/src/internal/cfg/cfg.go +index 7ef5bb7be6..08d210b797 100644 +--- a/src/internal/cfg/cfg.go ++++ b/src/internal/cfg/cfg.go +@@ -58,6 +58,7 @@ const KnownEnv = ` + GOPPC64 + GOPRIVATE + GOPROXY ++ GORISCV64 + GOROOT + GOSUMDB + GOTMPDIR +-- +2.39.5 + diff --git a/2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch b/2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch new file mode 100644 index 0000000..9074965 --- /dev/null +++ b/2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch @@ -0,0 +1,520 @@ +From 0c99ca5b172774e907aa32f188236266d1770712 Mon Sep 17 00:00:00 2001 +From: Keith Randall +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 026/119] cmd/compile: implement float min/max in hardware for + amd64 and arm64 + +Update #59488 + +Change-Id: I89f5ea494cbcc887f6fae8560e57bcbd8749be86 +Reviewed-on: https://go-review.googlesource.com/c/go/+/514596 +Reviewed-by: Keith Randall +TryBot-Result: Gopher Robot +Run-TryBot: Keith Randall +Reviewed-by: Cherry Mui +--- + src/cmd/compile/internal/amd64/ssa.go | 3 +- + src/cmd/compile/internal/arm64/ssa.go | 4 + + src/cmd/compile/internal/ssa/_gen/AMD64.rules | 14 ++ + src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 9 +- + src/cmd/compile/internal/ssa/_gen/ARM64.rules | 3 + + src/cmd/compile/internal/ssa/_gen/ARM64Ops.go | 4 + + .../compile/internal/ssa/_gen/genericOps.go | 6 + + src/cmd/compile/internal/ssa/opGen.go | 133 ++++++++++++++++++ + 
src/cmd/compile/internal/ssa/rewriteAMD64.go | 90 ++++++++++++ + src/cmd/compile/internal/ssa/rewriteARM64.go | 12 ++ + src/cmd/compile/internal/ssagen/ssa.go | 23 ++- + 11 files changed, 298 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go +index 113875861c..174ea1ffc8 100644 +--- a/src/cmd/compile/internal/amd64/ssa.go ++++ b/src/cmd/compile/internal/amd64/ssa.go +@@ -252,7 +252,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB, + ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD, + ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD, +- ssa.OpAMD64PXOR, ++ ssa.OpAMD64MINSS, ssa.OpAMD64MINSD, ++ ssa.OpAMD64POR, ssa.OpAMD64PXOR, + ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ, + ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ, + ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ: +diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go +index a0b432bd97..27b4e881c0 100644 +--- a/src/cmd/compile/internal/arm64/ssa.go ++++ b/src/cmd/compile/internal/arm64/ssa.go +@@ -215,6 +215,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpARM64FNMULD, + ssa.OpARM64FDIVS, + ssa.OpARM64FDIVD, ++ ssa.OpARM64FMINS, ++ ssa.OpARM64FMIND, ++ ssa.OpARM64FMAXS, ++ ssa.OpARM64FMAXD, + ssa.OpARM64ROR, + ssa.OpARM64RORW: + r := v.Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules +index 5f9b85fc41..fbbeff6dc2 100644 +--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules +@@ -172,6 +172,20 @@ + + (Round(32|64)F ...) => (Copy ...) + ++// Floating-point min is tricky, as the hardware op isn't right for various special ++// cases (-0 and NaN). We use two hardware ops organized just right to make the ++// result come out how we want it. See https://github.com/golang/go/issues/59488#issuecomment-1553493207 ++// (although that comment isn't exactly right, as the value overwritten is not simulated correctly). ++// t1 = MINSD x, y => incorrect if x==NaN or x==-0,y==+0 ++// t2 = MINSD t1, x => fixes x==NaN case ++// res = POR t1, t2 => fixes x==-0,y==+0 case ++// Note that this trick depends on the special property that (NaN OR x) produces a NaN (although ++// it might not produce the same NaN as the input). ++(Min(64|32)F x y) => (POR (MINS(D|S) (MINS(D|S) x y) x) (MINS(D|S) x y)) ++// Floating-point max is even trickier. Punt to using min instead. ++// max(x,y) == -min(-x,-y) ++(Max(64|32)F x y) => (Neg(64|32)F (Min(64|32)F (Neg(64|32)F x) (Neg(64|32)F y))) ++ + (CvtBoolToUint8 ...) => (Copy ...) + + // Lowering shifts +diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +index d8d0225fc3..27a6844b77 100644 +--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +@@ -681,6 +681,12 @@ func init() { + // Any use must be preceded by a successful check of runtime.support_fma. + {name: "VFMADD231SD", argLength: 3, reg: fp31, resultInArg0: true, asm: "VFMADD231SD"}, + ++ // Note that these operations don't exactly match the semantics of Go's ++ // builtin min. In particular, these aren't commutative, because on various ++ // special cases the 2nd argument is preferred. 
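++	// (Specifically, if either argument is a NaN, or if both arguments are
++	// zeros of either sign, the x86 MINSD/MINSS instructions return the
++	// second argument, which is what the fix-up rules in AMD64.rules
++	// compensate for.)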
++ {name: "MINSD", argLength: 2, reg: fp21, resultInArg0: true, asm: "MINSD"}, // min(arg0,arg1) ++ {name: "MINSS", argLength: 2, reg: fp21, resultInArg0: true, asm: "MINSS"}, // min(arg0,arg1) ++ + {name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear. + {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear. + // Note: SBBW and SBBB are subsumed by SBBL +@@ -746,7 +752,8 @@ func init() { + {name: "MOVLi2f", argLength: 1, reg: gpfp, typ: "Float32"}, // move 32 bits from int to float reg + {name: "MOVLf2i", argLength: 1, reg: fpgp, typ: "UInt32"}, // move 32 bits from float to int reg, zero extend + +- {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation. ++ {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs (for float negation). ++ {name: "POR", argLength: 2, reg: fp21, asm: "POR", commutative: true, resultInArg0: true}, // inclusive or, applied to X regs (for float min/max). + + {name: "LEAQ", argLength: 1, reg: gp11sb, asm: "LEAQ", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux + {name: "LEAL", argLength: 1, reg: gp11sb, asm: "LEAL", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux +diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules +index 8cf6f6740e..94032d6ca4 100644 +--- a/src/cmd/compile/internal/ssa/_gen/ARM64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules +@@ -61,6 +61,9 @@ + + (Sqrt32 ...) => (FSQRTS ...) + ++(Min(64|32)F ...) => (FMIN(D|S) ...) ++(Max(64|32)F ...) => (FMAX(D|S) ...) ++ + // lowering rotates + // we do rotate detection in generic rules, if the following rules need to be changed, check generic rules first. 
+ (RotateLeft8 x (MOVDconst [c])) => (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) +diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go +index 2853e62540..c0c7cbbe61 100644 +--- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go +@@ -234,6 +234,10 @@ func init() { + {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64 + {name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64 + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0), float32 ++ {name: "FMIND", argLength: 2, reg: fp21, asm: "FMIND"}, // min(arg0, arg1) ++ {name: "FMINS", argLength: 2, reg: fp21, asm: "FMINS"}, // min(arg0, arg1) ++ {name: "FMAXD", argLength: 2, reg: fp21, asm: "FMAXD"}, // max(arg0, arg1) ++ {name: "FMAXS", argLength: 2, reg: fp21, asm: "FMAXS"}, // max(arg0, arg1) + {name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // byte reverse, 64-bit + {name: "REVW", argLength: 1, reg: gp11, asm: "REVW"}, // byte reverse, 32-bit + {name: "REV16", argLength: 1, reg: gp11, asm: "REV16"}, // byte reverse in each 16-bit halfword, 64-bit +diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go +index 53ff57f6b1..fb18319263 100644 +--- a/src/cmd/compile/internal/ssa/_gen/genericOps.go ++++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go +@@ -285,6 +285,12 @@ var genericOps = []opData{ + {name: "Abs", argLength: 1}, // absolute value arg0 + {name: "Copysign", argLength: 2}, // copy sign from arg0 to arg1 + ++ // Float min/max implementation, if hardware is available. ++ {name: "Min64F", argLength: 2}, // min(arg0,arg1) ++ {name: "Min32F", argLength: 2}, // min(arg0,arg1) ++ {name: "Max64F", argLength: 2}, // max(arg0,arg1) ++ {name: "Max32F", argLength: 2}, // max(arg0,arg1) ++ + // 3-input opcode. + // Fused-multiply-add, float64 only. + // When a*b+c is exactly zero (before rounding), then the result is +0 or -0. 
+diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 62b516ce61..c811c4e020 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -912,6 +912,8 @@ const ( + OpAMD64SQRTSS + OpAMD64ROUNDSD + OpAMD64VFMADD231SD ++ OpAMD64MINSD ++ OpAMD64MINSS + OpAMD64SBBQcarrymask + OpAMD64SBBLcarrymask + OpAMD64SETEQ +@@ -964,6 +966,7 @@ const ( + OpAMD64MOVLi2f + OpAMD64MOVLf2i + OpAMD64PXOR ++ OpAMD64POR + OpAMD64LEAQ + OpAMD64LEAL + OpAMD64LEAW +@@ -1441,6 +1444,10 @@ const ( + OpARM64FNEGD + OpARM64FSQRTD + OpARM64FSQRTS ++ OpARM64FMIND ++ OpARM64FMINS ++ OpARM64FMAXD ++ OpARM64FMAXS + OpARM64REV + OpARM64REVW + OpARM64REV16 +@@ -3016,6 +3023,10 @@ const ( + OpRoundToEven + OpAbs + OpCopysign ++ OpMin64F ++ OpMin32F ++ OpMax64F ++ OpMax32F + OpFMA + OpPhi + OpCopy +@@ -11900,6 +11911,36 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "MINSD", ++ argLen: 2, ++ resultInArg0: true, ++ asm: x86.AMINSD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ outputs: []outputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ }, ++ }, ++ { ++ name: "MINSS", ++ argLen: 2, ++ resultInArg0: true, ++ asm: x86.AMINSS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ outputs: []outputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ }, ++ }, + { + name: "SBBQcarrymask", + argLen: 1, +@@ -12520,6 +12561,22 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "POR", ++ argLen: 2, ++ commutative: true, ++ resultInArg0: true, ++ asm: x86.APOR, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ outputs: []outputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ }, ++ }, + { + name: "LEAQ", + auxType: auxSymOff, +@@ -19287,6 +19344,62 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "FMIND", ++ argLen: 2, ++ asm: arm64.AFMIND, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FMINS", ++ argLen: 2, ++ asm: arm64.AFMINS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ 
name: "FMAXD", ++ argLen: 2, ++ asm: arm64.AFMAXD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FMAXS", ++ argLen: 2, ++ asm: arm64.AFMAXS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + { + name: "REV", + argLen: 1, +@@ -39072,6 +39185,26 @@ var opcodeTable = [...]opInfo{ + argLen: 2, + generic: true, + }, ++ { ++ name: "Min64F", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Min32F", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Max64F", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Max32F", ++ argLen: 2, ++ generic: true, ++ }, + { + name: "FMA", + argLen: 3, +diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go +index 88bd48f331..23a1b11ddd 100644 +--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go ++++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go +@@ -871,6 +871,14 @@ func rewriteValueAMD64(v *Value) bool { + return rewriteValueAMD64_OpLsh8x64(v) + case OpLsh8x8: + return rewriteValueAMD64_OpLsh8x8(v) ++ case OpMax32F: ++ return rewriteValueAMD64_OpMax32F(v) ++ case OpMax64F: ++ return rewriteValueAMD64_OpMax64F(v) ++ case OpMin32F: ++ return rewriteValueAMD64_OpMin32F(v) ++ case OpMin64F: ++ return rewriteValueAMD64_OpMin64F(v) + case OpMod16: + return rewriteValueAMD64_OpMod16(v) + case OpMod16u: +@@ -27481,6 +27489,88 @@ func rewriteValueAMD64_OpLsh8x8(v *Value) bool { + } + return false + } ++func rewriteValueAMD64_OpMax32F(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Max32F x y) ++ // result: (Neg32F (Min32F (Neg32F x) (Neg32F y))) ++ for { ++ t := v.Type ++ x := v_0 ++ y := v_1 ++ v.reset(OpNeg32F) ++ v.Type = t ++ v0 := b.NewValue0(v.Pos, OpMin32F, t) ++ v1 := b.NewValue0(v.Pos, OpNeg32F, t) ++ v1.AddArg(x) ++ v2 := b.NewValue0(v.Pos, OpNeg32F, t) ++ v2.AddArg(y) ++ v0.AddArg2(v1, v2) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueAMD64_OpMax64F(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Max64F x y) ++ // result: (Neg64F (Min64F (Neg64F x) (Neg64F y))) ++ for { ++ t := v.Type ++ x := v_0 ++ y := v_1 ++ v.reset(OpNeg64F) ++ v.Type = t ++ v0 := b.NewValue0(v.Pos, OpMin64F, t) ++ v1 := b.NewValue0(v.Pos, OpNeg64F, t) ++ v1.AddArg(x) ++ v2 := b.NewValue0(v.Pos, OpNeg64F, t) ++ v2.AddArg(y) ++ v0.AddArg2(v1, v2) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueAMD64_OpMin32F(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Min32F x y) ++ // result: (POR (MINSS (MINSS x y) x) 
(MINSS x y)) ++ for { ++ t := v.Type ++ x := v_0 ++ y := v_1 ++ v.reset(OpAMD64POR) ++ v0 := b.NewValue0(v.Pos, OpAMD64MINSS, t) ++ v1 := b.NewValue0(v.Pos, OpAMD64MINSS, t) ++ v1.AddArg2(x, y) ++ v0.AddArg2(v1, x) ++ v.AddArg2(v0, v1) ++ return true ++ } ++} ++func rewriteValueAMD64_OpMin64F(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Min64F x y) ++ // result: (POR (MINSD (MINSD x y) x) (MINSD x y)) ++ for { ++ t := v.Type ++ x := v_0 ++ y := v_1 ++ v.reset(OpAMD64POR) ++ v0 := b.NewValue0(v.Pos, OpAMD64MINSD, t) ++ v1 := b.NewValue0(v.Pos, OpAMD64MINSD, t) ++ v1.AddArg2(x, y) ++ v0.AddArg2(v1, x) ++ v.AddArg2(v0, v1) ++ return true ++ } ++} + func rewriteValueAMD64_OpMod16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go +index 3b8fe30371..93a741ad87 100644 +--- a/src/cmd/compile/internal/ssa/rewriteARM64.go ++++ b/src/cmd/compile/internal/ssa/rewriteARM64.go +@@ -820,6 +820,18 @@ func rewriteValueARM64(v *Value) bool { + return rewriteValueARM64_OpLsh8x64(v) + case OpLsh8x8: + return rewriteValueARM64_OpLsh8x8(v) ++ case OpMax32F: ++ v.Op = OpARM64FMAXS ++ return true ++ case OpMax64F: ++ v.Op = OpARM64FMAXD ++ return true ++ case OpMin32F: ++ v.Op = OpARM64FMINS ++ return true ++ case OpMin64F: ++ v.Op = OpARM64FMIND ++ return true + case OpMod16: + return rewriteValueARM64_OpMod16(v) + case OpMod16u: +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index cc70dc8f7d..b668f1847c 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -3567,11 +3567,32 @@ func (s *state) minMax(n *ir.CallExpr) *ssa.Value { + + if typ.IsFloat() || typ.IsString() { + // min/max semantics for floats are tricky because of NaNs and +- // negative zero, so we let the runtime handle this instead. ++ // negative zero. Some architectures have instructions which ++ // we can use to generate the right result. For others we must ++ // call into the runtime instead. + // + // Strings are conceptually simpler, but we currently desugar + // string comparisons during walk, not ssagen. + ++ if typ.IsFloat() { ++ switch Arch.LinkArch.Family { ++ case sys.AMD64, sys.ARM64: ++ var op ssa.Op ++ switch { ++ case typ.Kind() == types.TFLOAT64 && n.Op() == ir.OMIN: ++ op = ssa.OpMin64F ++ case typ.Kind() == types.TFLOAT64 && n.Op() == ir.OMAX: ++ op = ssa.OpMax64F ++ case typ.Kind() == types.TFLOAT32 && n.Op() == ir.OMIN: ++ op = ssa.OpMin32F ++ case typ.Kind() == types.TFLOAT32 && n.Op() == ir.OMAX: ++ op = ssa.OpMax32F ++ } ++ return fold(func(x, a *ssa.Value) *ssa.Value { ++ return s.newValue2(op, typ, x, a) ++ }) ++ } ++ } + var name string + switch typ.Kind() { + case types.TFLOAT32: +-- +2.39.5 + diff --git a/2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch b/2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch new file mode 100644 index 0000000..764882c --- /dev/null +++ b/2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch @@ -0,0 +1,348 @@ +From aed69da05999cc02f56ecf3736ed81ca6bc113e1 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 027/119] cmd/compile: implement float min/max in hardware for + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CL 514596 adds float min/max for amd64, this CL adds it for riscv64. 
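+
+For reference, the Go-level requirement discussed below can be illustrated with
+a small snippet using the min/max builtins (hypothetical, illustrative only):
+
+	package main
+
+	import (
+		"fmt"
+		"math"
+	)
+
+	func main() {
+		// Go requires NaN to propagate: if any argument is a NaN,
+		// the result is a NaN.
+		fmt.Println(math.IsNaN(min(math.NaN(), 1.0))) // true
+		fmt.Println(math.IsNaN(max(1.0, math.NaN()))) // true
+	}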
+ +The behavior of the RISC-V FMIN/FMAX instructions almost match Go's +requirements. + +However according to RISCV spec 8.3 "NaN Generation and Propagation" +>> if at least one input is a signaling NaN, or if both inputs are quiet +>> NaNs, the result is the canonical NaN. If one operand is a quiet NaN +>> and the other is not a NaN, the result is the non-NaN operand. + +Go using quiet NaN as NaN and according to Go spec +>> if any argument is a NaN, the result is a NaN + +This requires the float min/max implementation to check whether one +of operand is qNaN before float mix/max actually execute. + +This CL also fix a typo in minmax test. + +Benchmark on Visionfive2 +goos: linux +goarch: riscv64 +pkg: runtime + │ float_minmax.old.bench │ float_minmax.new.bench │ + │ sec/op │ sec/op vs base │ +MinFloat 158.20n ± 0% 28.13n ± 0% -82.22% (p=0.000 n=10) +MaxFloat 158.10n ± 0% 28.12n ± 0% -82.21% (p=0.000 n=10) +geomean 158.1n 28.12n -82.22% + +Update #59488 + +Change-Id: Iab48be6d32b8882044fb8c821438ca8840e5493d +Reviewed-on: https://go-review.googlesource.com/c/go/+/514775 +Reviewed-by: Mauri de Souza Meneguzzo +Run-TryBot: M Zhuo +Reviewed-by: Joel Sing +Reviewed-by: Cherry Mui +TryBot-Result: Gopher Robot +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/riscv64/ssa.go | 66 ++++++++++++++++++ + .../compile/internal/ssa/_gen/RISCV64.rules | 3 + + .../compile/internal/ssa/_gen/RISCV64Ops.go | 4 ++ + src/cmd/compile/internal/ssa/opGen.go | 68 +++++++++++++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 12 ++++ + src/cmd/compile/internal/ssagen/ssa.go | 2 +- + src/runtime/minmax_test.go | 22 +++++- + 7 files changed, 174 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 22338188e5..caca504d28 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -297,6 +297,72 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r ++ ++ case ssa.OpRISCV64LoweredFMAXD, ssa.OpRISCV64LoweredFMIND, ssa.OpRISCV64LoweredFMAXS, ssa.OpRISCV64LoweredFMINS: ++ // Most of FMIN/FMAX result match Go's required behaviour, unless one of the ++ // inputs is a NaN. As such, we need to explicitly test for NaN ++ // before using FMIN/FMAX. ++ ++ // FADD Rarg0, Rarg1, Rout // FADD is used to propagate a NaN to the result in these cases. 
++ // FEQ Rarg0, Rarg0, Rtmp ++ // BEQZ Rtmp, end ++ // FEQ Rarg1, Rarg1, Rtmp ++ // BEQZ Rtmp, end ++ // F(MIN | MAX) ++ ++ r0 := v.Args[0].Reg() ++ r1 := v.Args[1].Reg() ++ out := v.Reg() ++ add, feq := riscv.AFADDD, riscv.AFEQD ++ if v.Op == ssa.OpRISCV64LoweredFMAXS || v.Op == ssa.OpRISCV64LoweredFMINS { ++ add = riscv.AFADDS ++ feq = riscv.AFEQS ++ } ++ ++ p1 := s.Prog(add) ++ p1.From.Type = obj.TYPE_REG ++ p1.From.Reg = r0 ++ p1.Reg = r1 ++ p1.To.Type = obj.TYPE_REG ++ p1.To.Reg = out ++ ++ p2 := s.Prog(feq) ++ p2.From.Type = obj.TYPE_REG ++ p2.From.Reg = r0 ++ p2.Reg = r0 ++ p2.To.Type = obj.TYPE_REG ++ p2.To.Reg = riscv.REG_TMP ++ ++ p3 := s.Prog(riscv.ABEQ) ++ p3.From.Type = obj.TYPE_REG ++ p3.From.Reg = riscv.REG_ZERO ++ p3.Reg = riscv.REG_TMP ++ p3.To.Type = obj.TYPE_BRANCH ++ ++ p4 := s.Prog(feq) ++ p4.From.Type = obj.TYPE_REG ++ p4.From.Reg = r1 ++ p4.Reg = r1 ++ p4.To.Type = obj.TYPE_REG ++ p4.To.Reg = riscv.REG_TMP ++ ++ p5 := s.Prog(riscv.ABEQ) ++ p5.From.Type = obj.TYPE_REG ++ p5.From.Reg = riscv.REG_ZERO ++ p5.Reg = riscv.REG_TMP ++ p5.To.Type = obj.TYPE_BRANCH ++ ++ p6 := s.Prog(v.Op.Asm()) ++ p6.From.Type = obj.TYPE_REG ++ p6.From.Reg = r1 ++ p6.Reg = r0 ++ p6.To.Type = obj.TYPE_REG ++ p6.To.Reg = out ++ ++ nop := s.Prog(obj.ANOP) ++ p3.To.SetTarget(nop) ++ p5.To.SetTarget(nop) ++ + case ssa.OpRISCV64LoweredMuluhilo: + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index fc206c42d3..4fef20a565 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -72,6 +72,9 @@ + + (FMA ...) => (FMADDD ...) + ++(Min(64|32)F ...) => (LoweredFMIN(D|S) ...) ++(Max(64|32)F ...) => (LoweredFMAX(D|S) ...) ++ + // Sign and zero extension. + + (SignExt8to16 ...) => (MOVBreg ...) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 93f20f8a99..9ce6450166 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -429,6 +429,8 @@ func init() { + {name: "FNES", argLength: 2, reg: fp2gp, asm: "FNES", commutative: true}, // arg0 != arg1 + {name: "FLTS", argLength: 2, reg: fp2gp, asm: "FLTS"}, // arg0 < arg1 + {name: "FLES", argLength: 2, reg: fp2gp, asm: "FLES"}, // arg0 <= arg1 ++ {name: "LoweredFMAXS", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMAXS", commutative: true, typ: "Float32"}, // max(arg0, arg1) ++ {name: "LoweredFMINS", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMINS", commutative: true, typ: "Float32"}, // min(arg0, arg1) + + // D extension. 
+ {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true, typ: "Float64"}, // arg0 + arg1 +@@ -456,6 +458,8 @@ func init() { + {name: "FNED", argLength: 2, reg: fp2gp, asm: "FNED", commutative: true}, // arg0 != arg1 + {name: "FLTD", argLength: 2, reg: fp2gp, asm: "FLTD"}, // arg0 < arg1 + {name: "FLED", argLength: 2, reg: fp2gp, asm: "FLED"}, // arg0 <= arg1 ++ {name: "LoweredFMIND", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMIND", commutative: true, typ: "Float64"}, // min(arg0, arg1) ++ {name: "LoweredFMAXD", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMAXD", commutative: true, typ: "Float64"}, // max(arg0, arg1) + } + + RISCV64blocks := []blockData{ +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index c811c4e020..e10b054214 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2446,6 +2446,8 @@ const ( + OpRISCV64FNES + OpRISCV64FLTS + OpRISCV64FLES ++ OpRISCV64LoweredFMAXS ++ OpRISCV64LoweredFMINS + OpRISCV64FADDD + OpRISCV64FSUBD + OpRISCV64FMULD +@@ -2471,6 +2473,8 @@ const ( + OpRISCV64FNED + OpRISCV64FLTD + OpRISCV64FLED ++ OpRISCV64LoweredFMIND ++ OpRISCV64LoweredFMAXD + + OpS390XFADDS + OpS390XFADD +@@ -32805,6 +32809,38 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "LoweredFMAXS", ++ argLen: 2, ++ commutative: true, ++ resultNotInArgs: true, ++ asm: riscv.AFMAXS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "LoweredFMINS", ++ argLen: 2, ++ commutative: true, ++ resultNotInArgs: true, ++ asm: riscv.AFMINS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + { + name: "FADDD", + argLen: 2, +@@ -33159,6 +33195,38 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "LoweredFMIND", ++ argLen: 2, ++ commutative: true, ++ resultNotInArgs: true, ++ asm: riscv.AFMIND, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "LoweredFMAXD", ++ argLen: 2, ++ commutative: true, ++ resultNotInArgs: true, ++ asm: riscv.AFMAXD, ++ reg: regInfo{ ++ inputs: 
[]inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + + { + name: "FADDS", +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 41edcdf8b8..230033c7af 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -328,6 +328,18 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpLsh8x64(v) + case OpLsh8x8: + return rewriteValueRISCV64_OpLsh8x8(v) ++ case OpMax32F: ++ v.Op = OpRISCV64LoweredFMAXS ++ return true ++ case OpMax64F: ++ v.Op = OpRISCV64LoweredFMAXD ++ return true ++ case OpMin32F: ++ v.Op = OpRISCV64LoweredFMINS ++ return true ++ case OpMin64F: ++ v.Op = OpRISCV64LoweredFMIND ++ return true + case OpMod16: + return rewriteValueRISCV64_OpMod16(v) + case OpMod16u: +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index b668f1847c..ec89a45701 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -3576,7 +3576,7 @@ func (s *state) minMax(n *ir.CallExpr) *ssa.Value { + + if typ.IsFloat() { + switch Arch.LinkArch.Family { +- case sys.AMD64, sys.ARM64: ++ case sys.AMD64, sys.ARM64, sys.RISCV64: + var op ssa.Op + switch { + case typ.Kind() == types.TFLOAT64 && n.Op() == ir.OMIN: +diff --git a/src/runtime/minmax_test.go b/src/runtime/minmax_test.go +index e0bc28fbf6..1f815a84c3 100644 +--- a/src/runtime/minmax_test.go ++++ b/src/runtime/minmax_test.go +@@ -66,10 +66,10 @@ func TestMaxFloat(t *testing.T) { + } + for _, x := range all { + if z := max(nan, x); !math.IsNaN(z) { +- t.Errorf("min(%v, %v) = %v, want %v", nan, x, z, nan) ++ t.Errorf("max(%v, %v) = %v, want %v", nan, x, z, nan) + } + if z := max(x, nan); !math.IsNaN(z) { +- t.Errorf("min(%v, %v) = %v, want %v", nan, x, z, nan) ++ t.Errorf("max(%v, %v) = %v, want %v", nan, x, z, nan) + } + } + } +@@ -127,3 +127,21 @@ func TestMinMaxStringTies(t *testing.T) { + test(2, 0, 1) + test(2, 1, 0) + } ++ ++func BenchmarkMinFloat(b *testing.B) { ++ var m float64 = 0 ++ for i := 0; i < b.N; i++ { ++ for _, f := range all { ++ m = min(m, f) ++ } ++ } ++} ++ ++func BenchmarkMaxFloat(b *testing.B) { ++ var m float64 = 0 ++ for i := 0; i < b.N; i++ { ++ for _, f := range all { ++ m = max(m, f) ++ } ++ } ++} +-- +2.39.5 + diff --git a/2028-cmd-compile-improve-rotations-for-riscv64.patch b/2028-cmd-compile-improve-rotations-for-riscv64.patch new file mode 100644 index 0000000..71c6a7a --- /dev/null +++ b/2028-cmd-compile-improve-rotations-for-riscv64.patch @@ -0,0 +1,596 @@ +From 94e798b40448d2d8a1f21ee7f711d92e546a8bd7 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 028/119] cmd/compile: improve rotations for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Enable canRotate for riscv64, enable rotation intrinsics and provide +better rewrite implementations for rotations. 
By avoiding Lsh*x64 +and Rsh*Ux64 we can produce better code, especially for 32 and 64 +bit rotations. By enabling canRotate we also benefit from the generic +rotation rewrite rules. + +Benchmark on a StarFive VisionFive 2: + + │ rotate.1 │ rotate.2 │ + │ sec/op │ sec/op vs base │ +RotateLeft-4 14.700n ± 0% 8.016n ± 0% -45.47% (p=0.000 n=10) +RotateLeft8-4 14.70n ± 0% 10.69n ± 0% -27.28% (p=0.000 n=10) +RotateLeft16-4 14.70n ± 0% 12.02n ± 0% -18.23% (p=0.000 n=10) +RotateLeft32-4 13.360n ± 0% 8.016n ± 0% -40.00% (p=0.000 n=10) +RotateLeft64-4 13.360n ± 0% 8.016n ± 0% -40.00% (p=0.000 n=10) +geomean 14.15n 9.208n -34.92% + +Change-Id: I1a2036fdc57cf88ebb6617eb8d92e1d187e183b2 +Reviewed-on: https://go-review.googlesource.com/c/go/+/560315 +Reviewed-by: M Zhuo +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +Reviewed-by: David Chase +--- + src/cmd/compile/internal/riscv64/ssa.go | 6 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 9 +- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 22 +-- + src/cmd/compile/internal/ssa/opGen.go | 30 ++++ + src/cmd/compile/internal/ssa/rewrite.go | 2 +- + .../compile/internal/ssa/rewriteRISCV64.go | 142 ++++++++++-------- + src/cmd/compile/internal/ssagen/ssa.go | 8 +- + test/codegen/rotate.go | 22 +++ + 8 files changed, 153 insertions(+), 88 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index caca504d28..17f0d98532 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -278,7 +278,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = rd + case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XOR, ssa.OpRISCV64OR, ssa.OpRISCV64AND, +- ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRAW, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, ++ ssa.OpRISCV64SLL, ssa.OpRISCV64SLLW, ssa.OpRISCV64SRA, ssa.OpRISCV64SRAW, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, + ssa.OpRISCV64SLT, ssa.OpRISCV64SLTU, ssa.OpRISCV64MUL, ssa.OpRISCV64MULW, ssa.OpRISCV64MULH, + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, + ssa.OpRISCV64DIVUW, ssa.OpRISCV64REM, ssa.OpRISCV64REMU, ssa.OpRISCV64REMW, +@@ -422,8 +422,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpRISCV64ADDI, ssa.OpRISCV64ADDIW, ssa.OpRISCV64XORI, ssa.OpRISCV64ORI, ssa.OpRISCV64ANDI, +- ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRAIW, ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, +- ssa.OpRISCV64SLTIU: ++ ssa.OpRISCV64SLLI, ssa.OpRISCV64SLLIW, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRAIW, ++ ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, ssa.OpRISCV64SLTIU: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 4fef20a565..135d70bc47 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -214,10 +214,10 @@ + (Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y) + + // Rotates. 
+-(RotateLeft8 x (MOVDconst [c])) => (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) +-(RotateLeft16 x (MOVDconst [c])) => (Or16 (Lsh16x64 x (MOVDconst [c&15])) (Rsh16Ux64 x (MOVDconst [-c&15]))) +-(RotateLeft32 x (MOVDconst [c])) => (Or32 (Lsh32x64 x (MOVDconst [c&31])) (Rsh32Ux64 x (MOVDconst [-c&31]))) +-(RotateLeft64 x (MOVDconst [c])) => (Or64 (Lsh64x64 x (MOVDconst [c&63])) (Rsh64Ux64 x (MOVDconst [-c&63]))) ++(RotateLeft8 x y) => (OR (SLL x (ANDI [7] y)) (SRL (ZeroExt8to64 x) (ANDI [7] (NEG y)))) ++(RotateLeft16 x y) => (OR (SLL x (ANDI [15] y)) (SRL (ZeroExt16to64 x) (ANDI [15] (NEG y)))) ++(RotateLeft32 x y) => (OR (SLLW x y) (SRLW x (NEG y))) ++(RotateLeft64 x y) => (OR (SLL x y) (SRL x (NEG y))) + + (Less64 ...) => (SLT ...) + (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) +@@ -733,6 +733,7 @@ + (XOR (MOVDconst [val]) x) && is32Bit(val) => (XORI [val] x) + (SLL x (MOVDconst [val])) => (SLLI [int64(val&63)] x) + (SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x) ++(SLLW x (MOVDconst [val])) => (SLLIW [int64(val&31)] x) + (SRLW x (MOVDconst [val])) => (SRLIW [int64(val&31)] x) + (SRA x (MOVDconst [val])) => (SRAI [int64(val&63)] x) + (SRAW x (MOVDconst [val])) => (SRAIW [int64(val&31)] x) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 9ce6450166..e9f1df0d58 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -207,16 +207,18 @@ func init() { + {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}, resultInArg0: true}, // nop, return arg0 in same register + + // Shift ops +- {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63) +- {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed +- {name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >> (aux1 & 31), signed +- {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned +- {name: "SRLW", argLength: 2, reg: gp21, asm: "SRLW"}, // arg0 >> (aux1 & 31), unsigned +- {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63 +- {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63 +- {name: "SRAIW", argLength: 1, reg: gp11, asm: "SRAIW", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-31 +- {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63 +- {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-31 ++ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63), logical left shift ++ {name: "SLLW", argLength: 2, reg: gp21, asm: "SLLW"}, // arg0 << (aux1 & 31), logical left shift of 32 bit value, sign extended to 64 bits ++ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), arithmetic right shift ++ {name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >> (aux1 & 31), arithmetic right shift of 32 bit value, sign extended to 64 bits ++ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), logical right shift ++ {name: "SRLW", argLength: 2, reg: gp21, asm: "SRLW"}, // arg0 >> (aux1 & 31), logical right shift of 32 bit value, sign extended to 64 bits ++ {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, 
shift amount 0-63, logical left shift ++ {name: "SLLIW", argLength: 1, reg: gp11, asm: "SLLIW", aux: "Int64"}, // arg0 << auxint, shift amount 0-31, logical left shift of 32 bit value, sign extended to 64 bits ++ {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, shift amount 0-63, arithmetic right shift ++ {name: "SRAIW", argLength: 1, reg: gp11, asm: "SRAIW", aux: "Int64"}, // arg0 >> auxint, shift amount 0-31, arithmetic right shift of 32 bit value, sign extended to 64 bits ++ {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, shift amount 0-63, logical right shift ++ {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, shift amount 0-31, logical right shift of 32 bit value, sign extended to 64 bits + + // Bitwise ops + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index e10b054214..29ca9f5c0f 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2370,11 +2370,13 @@ const ( + OpRISCV64MOVWUreg + OpRISCV64MOVDnop + OpRISCV64SLL ++ OpRISCV64SLLW + OpRISCV64SRA + OpRISCV64SRAW + OpRISCV64SRL + OpRISCV64SRLW + OpRISCV64SLLI ++ OpRISCV64SLLIW + OpRISCV64SRAI + OpRISCV64SRAIW + OpRISCV64SRLI +@@ -31778,6 +31780,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SLLW", ++ argLen: 2, ++ asm: riscv.ASLLW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SRA", + argLen: 2, +@@ -31848,6 +31864,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SLLIW", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.ASLLIW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SRAI", + auxType: auxInt64, +diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go +index 43843bda55..63d13bf6c4 100644 +--- a/src/cmd/compile/internal/ssa/rewrite.go ++++ b/src/cmd/compile/internal/ssa/rewrite.go +@@ -2066,7 +2066,7 @@ func canRotate(c *Config, bits int64) bool { + return false + } + switch c.arch { +- case "386", "amd64", "arm64": ++ case "386", "amd64", "arm64", "riscv64": + return true + case "arm", "s390x", "ppc64", "ppc64le", "wasm", "loong64": + return bits >= 32 +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 230033c7af..ca0e108915 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -538,6 +538,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64SLL(v) + case OpRISCV64SLLI: + return rewriteValueRISCV64_OpRISCV64SLLI(v) ++ case OpRISCV64SLLW: ++ return rewriteValueRISCV64_OpRISCV64SLLW(v) + case OpRISCV64SLT: + 
return rewriteValueRISCV64_OpRISCV64SLT(v) + case OpRISCV64SLTI: +@@ -6072,6 +6074,24 @@ func rewriteValueRISCV64_OpRISCV64SLLI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64SLLW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SLLW x (MOVDconst [val])) ++ // result: (SLLIW [int64(val&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64SLLIW) ++ v.AuxInt = int64ToAuxInt(int64(val & 31)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64SLT(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6646,112 +6666,102 @@ func rewriteValueRISCV64_OpRotateLeft16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types +- // match: (RotateLeft16 x (MOVDconst [c])) +- // result: (Or16 (Lsh16x64 x (MOVDconst [c&15])) (Rsh16Ux64 x (MOVDconst [-c&15]))) ++ // match: (RotateLeft16 x y) ++ // result: (OR (SLL x (ANDI [15] y)) (SRL (ZeroExt16to64 x) (ANDI [15] (NEG y)))) + for { + t := v.Type + x := v_0 +- if v_1.Op != OpRISCV64MOVDconst { +- break +- } +- c := auxIntToInt64(v_1.AuxInt) +- v.reset(OpOr16) +- v0 := b.NewValue0(v.Pos, OpLsh16x64, t) +- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v1.AuxInt = int64ToAuxInt(c & 15) ++ y := v_1 ++ v.reset(OpRISCV64OR) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ANDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(15) ++ v1.AddArg(y) + v0.AddArg2(x, v1) +- v2 := b.NewValue0(v.Pos, OpRsh16Ux64, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(-c & 15) +- v2.AddArg2(x, v3) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SRL, t) ++ v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v3.AddArg(x) ++ v4 := b.NewValue0(v.Pos, OpRISCV64ANDI, y.Type) ++ v4.AuxInt = int64ToAuxInt(15) ++ v5 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) ++ v5.AddArg(y) ++ v4.AddArg(v5) ++ v2.AddArg2(v3, v4) + v.AddArg2(v0, v2) + return true + } +- return false + } + func rewriteValueRISCV64_OpRotateLeft32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block +- typ := &b.Func.Config.Types +- // match: (RotateLeft32 x (MOVDconst [c])) +- // result: (Or32 (Lsh32x64 x (MOVDconst [c&31])) (Rsh32Ux64 x (MOVDconst [-c&31]))) ++ // match: (RotateLeft32 x y) ++ // result: (OR (SLLW x y) (SRLW x (NEG y))) + for { + t := v.Type + x := v_0 +- if v_1.Op != OpRISCV64MOVDconst { +- break +- } +- c := auxIntToInt64(v_1.AuxInt) +- v.reset(OpOr32) +- v0 := b.NewValue0(v.Pos, OpLsh32x64, t) +- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v1.AuxInt = int64ToAuxInt(c & 31) +- v0.AddArg2(x, v1) +- v2 := b.NewValue0(v.Pos, OpRsh32Ux64, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(-c & 31) +- v2.AddArg2(x, v3) +- v.AddArg2(v0, v2) ++ y := v_1 ++ v.reset(OpRISCV64OR) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) ++ v2.AddArg(y) ++ v1.AddArg2(x, v2) ++ v.AddArg2(v0, v1) + return true + } +- return false + } + func rewriteValueRISCV64_OpRotateLeft64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block +- typ := &b.Func.Config.Types +- // match: (RotateLeft64 x (MOVDconst [c])) +- // result: (Or64 (Lsh64x64 x (MOVDconst [c&63])) (Rsh64Ux64 x (MOVDconst [-c&63]))) ++ // match: (RotateLeft64 x y) ++ // result: (OR (SLL x y) (SRL x 
(NEG y))) + for { + t := v.Type + x := v_0 +- if v_1.Op != OpRISCV64MOVDconst { +- break +- } +- c := auxIntToInt64(v_1.AuxInt) +- v.reset(OpOr64) +- v0 := b.NewValue0(v.Pos, OpLsh64x64, t) +- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v1.AuxInt = int64ToAuxInt(c & 63) +- v0.AddArg2(x, v1) +- v2 := b.NewValue0(v.Pos, OpRsh64Ux64, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(-c & 63) +- v2.AddArg2(x, v3) +- v.AddArg2(v0, v2) ++ y := v_1 ++ v.reset(OpRISCV64OR) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpRISCV64SRL, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) ++ v2.AddArg(y) ++ v1.AddArg2(x, v2) ++ v.AddArg2(v0, v1) + return true + } +- return false + } + func rewriteValueRISCV64_OpRotateLeft8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types +- // match: (RotateLeft8 x (MOVDconst [c])) +- // result: (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) ++ // match: (RotateLeft8 x y) ++ // result: (OR (SLL x (ANDI [7] y)) (SRL (ZeroExt8to64 x) (ANDI [7] (NEG y)))) + for { + t := v.Type + x := v_0 +- if v_1.Op != OpRISCV64MOVDconst { +- break +- } +- c := auxIntToInt64(v_1.AuxInt) +- v.reset(OpOr8) +- v0 := b.NewValue0(v.Pos, OpLsh8x64, t) +- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v1.AuxInt = int64ToAuxInt(c & 7) ++ y := v_1 ++ v.reset(OpRISCV64OR) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ANDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(7) ++ v1.AddArg(y) + v0.AddArg2(x, v1) +- v2 := b.NewValue0(v.Pos, OpRsh8Ux64, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(-c & 7) +- v2.AddArg2(x, v3) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SRL, t) ++ v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v3.AddArg(x) ++ v4 := b.NewValue0(v.Pos, OpRISCV64ANDI, y.Type) ++ v4.AuxInt = int64ToAuxInt(7) ++ v5 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) ++ v5.AddArg(y) ++ v4.AddArg(v5) ++ v2.AddArg2(v3, v4) + v.AddArg2(v0, v2) + return true + } +- return false + } + func rewriteValueRISCV64_OpRsh16Ux16(v *Value) bool { + v_1 := v.Args[1] +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index ec89a45701..178ccfb59b 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -4776,22 +4776,22 @@ func InitTables() { + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1]) + }, +- sys.AMD64) ++ sys.AMD64, sys.RISCV64) + addF("math/bits", "RotateLeft16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1]) + }, +- sys.AMD64) ++ sys.AMD64, sys.RISCV64) + addF("math/bits", "RotateLeft32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1]) + }, +- sys.AMD64, sys.ARM, sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm, sys.Loong64) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) + addF("math/bits", "RotateLeft64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1]) + }, +- sys.AMD64, sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm, 
sys.Loong64) ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) + alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...) + + makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +diff --git a/test/codegen/rotate.go b/test/codegen/rotate.go +index 5495f86b79..109e55763c 100644 +--- a/test/codegen/rotate.go ++++ b/test/codegen/rotate.go +@@ -18,6 +18,7 @@ func rot64(x uint64) uint64 { + // amd64:"ROLQ\t[$]7" + // ppc64x:"ROTL\t[$]7" + // loong64: "ROTRV\t[$]57" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<7 | x>>57 + + // amd64:"ROLQ\t[$]8" +@@ -25,6 +26,7 @@ func rot64(x uint64) uint64 { + // s390x:"RISBGZ\t[$]0, [$]63, [$]8, " + // ppc64x:"ROTL\t[$]8" + // loong64: "ROTRV\t[$]56" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<8 + x>>56 + + // amd64:"ROLQ\t[$]9" +@@ -32,6 +34,7 @@ func rot64(x uint64) uint64 { + // s390x:"RISBGZ\t[$]0, [$]63, [$]9, " + // ppc64x:"ROTL\t[$]9" + // loong64: "ROTRV\t[$]55" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<9 ^ x>>55 + + // amd64:"ROLQ\t[$]10" +@@ -41,6 +44,7 @@ func rot64(x uint64) uint64 { + // arm64:"ROR\t[$]54" + // s390x:"RISBGZ\t[$]0, [$]63, [$]10, " + // loong64: "ROTRV\t[$]54" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += bits.RotateLeft64(x, 10) + + return a +@@ -53,6 +57,7 @@ func rot32(x uint32) uint32 { + // arm:"MOVW\tR\\d+@>25" + // ppc64x:"ROTLW\t[$]7" + // loong64: "ROTR\t[$]25" ++ // riscv64: "OR","SLLIW","SRLIW",-"AND" + a += x<<7 | x>>25 + + // amd64:`ROLL\t[$]8` +@@ -61,6 +66,7 @@ func rot32(x uint32) uint32 { + // s390x:"RLL\t[$]8" + // ppc64x:"ROTLW\t[$]8" + // loong64: "ROTR\t[$]24" ++ // riscv64: "OR","SLLIW","SRLIW",-"AND" + a += x<<8 + x>>24 + + // amd64:"ROLL\t[$]9" +@@ -69,6 +75,7 @@ func rot32(x uint32) uint32 { + // s390x:"RLL\t[$]9" + // ppc64x:"ROTLW\t[$]9" + // loong64: "ROTR\t[$]23" ++ // riscv64: "OR","SLLIW","SRLIW",-"AND" + a += x<<9 ^ x>>23 + + // amd64:"ROLL\t[$]10" +@@ -79,6 +86,7 @@ func rot32(x uint32) uint32 { + // arm64:"RORW\t[$]22" + // s390x:"RLL\t[$]10" + // loong64: "ROTR\t[$]22" ++ // riscv64: "OR","SLLIW","SRLIW",-"AND" + a += bits.RotateLeft32(x, 10) + + return a +@@ -88,12 +96,15 @@ func rot16(x uint16) uint16 { + var a uint16 + + // amd64:"ROLW\t[$]7" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<7 | x>>9 + + // amd64:`ROLW\t[$]8` ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<8 + x>>8 + + // amd64:"ROLW\t[$]9" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<9 ^ x>>7 + + return a +@@ -103,12 +114,15 @@ func rot8(x uint8) uint8 { + var a uint8 + + // amd64:"ROLB\t[$]5" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<5 | x>>3 + + // amd64:`ROLB\t[$]6` ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<6 + x>>2 + + // amd64:"ROLB\t[$]7" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<7 ^ x>>1 + + return a +@@ -127,12 +141,14 @@ func rot64nc(x uint64, z uint) uint64 { + // arm64:"ROR","NEG",-"AND" + // ppc64x:"ROTL",-"NEG",-"AND" + // loong64: "ROTRV", -"AND" ++ // riscv64: "OR","SLL","SRL",-"AND" + a += x<>(64-z) + + // amd64:"RORQ",-"AND" + // arm64:"ROR",-"NEG",-"AND" + // ppc64x:"ROTL","NEG",-"AND" + // loong64: "ROTRV", -"AND" ++ // riscv64: "OR","SLL","SRL",-"AND" + a += x>>z | x<<(64-z) + + return a +@@ -147,12 +163,14 @@ func rot32nc(x uint32, z uint) uint32 { + // arm64:"ROR","NEG",-"AND" + // ppc64x:"ROTLW",-"NEG",-"AND" + // loong64: "ROTR", -"AND" ++ // riscv64: "OR","SLLW","SRLW",-"AND" + a += x<>(32-z) + + // amd64:"RORL",-"AND" + // arm64:"ROR",-"NEG",-"AND" 
+ // ppc64x:"ROTLW","NEG",-"AND" + // loong64: "ROTR", -"AND" ++ // riscv64: "OR","SLLW","SRLW",-"AND" + a += x>>z | x<<(32-z) + + return a +@@ -164,9 +182,11 @@ func rot16nc(x uint16, z uint) uint16 { + z &= 15 + + // amd64:"ROLW",-"ANDQ" ++ // riscv64: "OR","SLL","SRL",-"AND\t" + a += x<>(16-z) + + // amd64:"RORW",-"ANDQ" ++ // riscv64: "OR","SLL","SRL",-"AND\t" + a += x>>z | x<<(16-z) + + return a +@@ -178,9 +198,11 @@ func rot8nc(x uint8, z uint) uint8 { + z &= 7 + + // amd64:"ROLB",-"ANDQ" ++ // riscv64: "OR","SLL","SRL",-"AND\t" + a += x<>(8-z) + + // amd64:"RORB",-"ANDQ" ++ // riscv64: "OR","SLL","SRL",-"AND\t" + a += x>>z | x<<(8-z) + + return a +-- +2.39.5 + diff --git a/2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch b/2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch new file mode 100644 index 0000000..15583f0 --- /dev/null +++ b/2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch @@ -0,0 +1,308 @@ +From 7788f5dcc7c9046892c55c74e751b0409b4631d7 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 029/119] cmd/asm, cmd/internal/obj: enable rounding mode + suffix for riscv64 + +This CL adds rounding modes for riscv64 floating point conversion +instructions by suffix with 5 modes: RNE, RTZ, RDN, RUP and RMM. + +For example, for round to nearest (RNE), we can use `FCVTLD.RNE` +According to RISCV manual 8.7 and 9.5, we changed these +conversion instructions: + +FCVTWS +FCVTLS +FCVTWUS +FCVTLUS +FCVTWD +FCVTLD +FCVTWUD +FCVTLUD + +Note: Round towards zero (RTZ) by default for all these instructions above. + +Change-Id: I491e522e14d721e24aa7f528ee0c4640c54c5808 +Reviewed-on: https://go-review.googlesource.com/c/go/+/504736 +Reviewed-by: Joel Sing +Run-TryBot: M Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +TryBot-Result: Gopher Robot +Reviewed-by: Than McIntosh +--- + src/cmd/asm/internal/asm/asm.go | 7 ++- + src/cmd/asm/internal/asm/parse.go | 4 +- + src/cmd/asm/internal/asm/testdata/riscv64.s | 40 ++++++++++++++++ + src/cmd/internal/obj/link.go | 2 +- + src/cmd/internal/obj/riscv/cpu.go | 51 ++++++++++++++++++++- + src/cmd/internal/obj/riscv/list.go | 16 +++++++ + src/cmd/internal/obj/riscv/obj.go | 17 ++++++- + 7 files changed, 130 insertions(+), 7 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/asm.go b/src/cmd/asm/internal/asm/asm.go +index 563e794706..223c613bd9 100644 +--- a/src/cmd/asm/internal/asm/asm.go ++++ b/src/cmd/asm/internal/asm/asm.go +@@ -16,6 +16,7 @@ import ( + "cmd/asm/internal/lex" + "cmd/internal/obj" + "cmd/internal/obj/ppc64" ++ "cmd/internal/obj/riscv" + "cmd/internal/obj/x86" + "cmd/internal/sys" + ) +@@ -46,7 +47,11 @@ func (p *Parser) append(prog *obj.Prog, cond string, doLabel bool) { + p.errorf("%v", err) + return + } +- ++ case sys.RISCV64: ++ if err := riscv.ParseSuffix(prog, cond); err != nil { ++ p.errorf("unrecognized suffix .%q", cond) ++ return ++ } + default: + p.errorf("unrecognized suffix .%q", cond) + return +diff --git a/src/cmd/asm/internal/asm/parse.go b/src/cmd/asm/internal/asm/parse.go +index 37f8e6c0bc..ecee98593d 100644 +--- a/src/cmd/asm/internal/asm/parse.go ++++ b/src/cmd/asm/internal/asm/parse.go +@@ -209,8 +209,8 @@ next: + for { + tok = p.nextToken() + if len(operands) == 0 && len(items) == 0 { +- if p.arch.InFamily(sys.ARM, sys.ARM64, sys.AMD64, sys.I386) && tok == '.' { +- // Suffixes: ARM conditionals or x86 modifiers. ++ if p.arch.InFamily(sys.ARM, sys.ARM64, sys.AMD64, sys.I386, sys.RISCV64) && tok == '.' 
{ ++ // Suffixes: ARM conditionals, RISCV rounding mode or x86 modifiers. + tok = p.nextToken() + str := p.lex.Text() + if tok != scanner.Ident { +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 11a9e30080..837351508f 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -233,11 +233,31 @@ start: + + // 11.7: Single-Precision Floating-Point Conversion and Move Instructions + FCVTWS F0, X5 // d31200c0 ++ FCVTWS.RNE F0, X5 // d30200c0 ++ FCVTWS.RTZ F0, X5 // d31200c0 ++ FCVTWS.RDN F0, X5 // d32200c0 ++ FCVTWS.RUP F0, X5 // d33200c0 ++ FCVTWS.RMM F0, X5 // d34200c0 + FCVTLS F0, X5 // d31220c0 ++ FCVTLS.RNE F0, X5 // d30220c0 ++ FCVTLS.RTZ F0, X5 // d31220c0 ++ FCVTLS.RDN F0, X5 // d32220c0 ++ FCVTLS.RUP F0, X5 // d33220c0 ++ FCVTLS.RMM F0, X5 // d34220c0 + FCVTSW X5, F0 // 538002d0 + FCVTSL X5, F0 // 538022d0 + FCVTWUS F0, X5 // d31210c0 ++ FCVTWUS.RNE F0, X5 // d30210c0 ++ FCVTWUS.RTZ F0, X5 // d31210c0 ++ FCVTWUS.RDN F0, X5 // d32210c0 ++ FCVTWUS.RUP F0, X5 // d33210c0 ++ FCVTWUS.RMM F0, X5 // d34210c0 + FCVTLUS F0, X5 // d31230c0 ++ FCVTLUS.RNE F0, X5 // d30230c0 ++ FCVTLUS.RTZ F0, X5 // d31230c0 ++ FCVTLUS.RDN F0, X5 // d32230c0 ++ FCVTLUS.RUP F0, X5 // d33230c0 ++ FCVTLUS.RMM F0, X5 // d34230c0 + FCVTSWU X5, F0 // 538012d0 + FCVTSLU X5, F0 // 538032d0 + FSGNJS F1, F0, F2 // 53011020 +@@ -277,11 +297,31 @@ start: + + // 12.5: Double-Precision Floating-Point Conversion and Move Instructions + FCVTWD F0, X5 // d31200c2 ++ FCVTWD.RNE F0, X5 // d30200c2 ++ FCVTWD.RTZ F0, X5 // d31200c2 ++ FCVTWD.RDN F0, X5 // d32200c2 ++ FCVTWD.RUP F0, X5 // d33200c2 ++ FCVTWD.RMM F0, X5 // d34200c2 + FCVTLD F0, X5 // d31220c2 ++ FCVTLD.RNE F0, X5 // d30220c2 ++ FCVTLD.RTZ F0, X5 // d31220c2 ++ FCVTLD.RDN F0, X5 // d32220c2 ++ FCVTLD.RUP F0, X5 // d33220c2 ++ FCVTLD.RMM F0, X5 // d34220c2 + FCVTDW X5, F0 // 538002d2 + FCVTDL X5, F0 // 538022d2 + FCVTWUD F0, X5 // d31210c2 ++ FCVTWUD.RNE F0, X5 // d30210c2 ++ FCVTWUD.RTZ F0, X5 // d31210c2 ++ FCVTWUD.RDN F0, X5 // d32210c2 ++ FCVTWUD.RUP F0, X5 // d33210c2 ++ FCVTWUD.RMM F0, X5 // d34210c2 + FCVTLUD F0, X5 // d31230c2 ++ FCVTLUD.RNE F0, X5 // d30230c2 ++ FCVTLUD.RTZ F0, X5 // d31230c2 ++ FCVTLUD.RDN F0, X5 // d32230c2 ++ FCVTLUD.RUP F0, X5 // d33230c2 ++ FCVTLUD.RMM F0, X5 // d34230c2 + FCVTDWU X5, F0 // 538012d2 + FCVTDLU X5, F0 // 538032d2 + FCVTSD F0, F1 // d3001040 +diff --git a/src/cmd/internal/obj/link.go b/src/cmd/internal/obj/link.go +index f13f9b4c70..b12bf2399a 100644 +--- a/src/cmd/internal/obj/link.go ++++ b/src/cmd/internal/obj/link.go +@@ -314,7 +314,7 @@ type Prog struct { + RegTo2 int16 // 2nd destination operand + Mark uint16 // bitmask of arch-specific items + Optab uint16 // arch-specific opcode index +- Scond uint8 // bits that describe instruction suffixes (e.g. ARM conditions) ++ Scond uint8 // bits that describe instruction suffixes (e.g. 
ARM conditions, RISCV Rounding Mode) + Back uint8 // for x86 back end: backwards branch state + Ft uint8 // for x86 back end: type index of Prog.From + Tt uint8 // for x86 back end: type index of Prog.To +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index bfd5153da4..0f63a616f7 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -28,7 +28,12 @@ + + package riscv + +-import "cmd/internal/obj" ++import ( ++ "errors" ++ "fmt" ++ ++ "cmd/internal/obj" ++) + + //go:generate go run ../stringer.go -i $GOFILE -o anames.go -p riscv + +@@ -602,6 +607,50 @@ const ( + ALAST + ) + ++// opSuffix encoding to uint8 which fit into p.Scond ++var rmSuffixSet = map[string]uint8{ ++ "RNE": RM_RNE, ++ "RTZ": RM_RTZ, ++ "RDN": RM_RDN, ++ "RUP": RM_RUP, ++ "RMM": RM_RMM, ++} ++ ++const rmSuffixBit uint8 = 1 << 7 ++ ++func rmSuffixEncode(s string) (uint8, error) { ++ if s == "" { ++ return 0, errors.New("empty suffix") ++ } ++ enc, ok := rmSuffixSet[s] ++ if !ok { ++ return 0, fmt.Errorf("invalid encoding for unknown suffix:%q", s) ++ } ++ return enc | rmSuffixBit, nil ++} ++ ++func rmSuffixString(u uint8) (string, error) { ++ if u&rmSuffixBit == 0 { ++ return "", fmt.Errorf("invalid suffix, require round mode bit:%x", u) ++ } ++ ++ u &^= rmSuffixBit ++ for k, v := range rmSuffixSet { ++ if v == u { ++ return k, nil ++ } ++ } ++ return "", fmt.Errorf("unknown suffix:%x", u) ++} ++ ++const ( ++ RM_RNE uint8 = iota // Round to Nearest, ties to Even ++ RM_RTZ // Round towards Zero ++ RM_RDN // Round Down ++ RM_RUP // Round Up ++ RM_RMM // Round to Nearest, ties to Max Magnitude ++) ++ + // All unary instructions which write to their arguments (as opposed to reading + // from them) go here. The assembly parser uses this information to populate + // its AST in a semantically reasonable way. +diff --git a/src/cmd/internal/obj/riscv/list.go b/src/cmd/internal/obj/riscv/list.go +index de90961e32..bc87539f27 100644 +--- a/src/cmd/internal/obj/riscv/list.go ++++ b/src/cmd/internal/obj/riscv/list.go +@@ -13,6 +13,7 @@ import ( + func init() { + obj.RegisterRegister(obj.RBaseRISCV, REG_END, RegName) + obj.RegisterOpcode(obj.ABaseRISCV, Anames) ++ obj.RegisterOpSuffix("riscv64", opSuffixString) + } + + func RegName(r int) string { +@@ -31,3 +32,18 @@ func RegName(r int) string { + return fmt.Sprintf("Rgok(%d)", r-obj.RBaseRISCV) + } + } ++ ++func opSuffixString(s uint8) string { ++ if s&rmSuffixBit == 0 { ++ return "" ++ } ++ ++ ss, err := rmSuffixString(s) ++ if err != nil { ++ ss = fmt.Sprintf("", s) ++ } ++ if ss == "" { ++ return ss ++ } ++ return fmt.Sprintf(".%s", ss) ++} +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 02d08fec76..8020624c70 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -28,6 +28,7 @@ import ( + "internal/abi" + "log" + "math/bits" ++ "strings" + ) + + func buildop(ctxt *obj.Link) {} +@@ -2272,8 +2273,12 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.imm = 0x0ff + + case AFCVTWS, AFCVTLS, AFCVTWUS, AFCVTLUS, AFCVTWD, AFCVTLD, AFCVTWUD, AFCVTLUD: +- // Set the rounding mode in funct3 to round to zero. +- ins.funct3 = 1 ++ // Set the default rounding mode in funct3 to round to zero. ++ if p.Scond&rmSuffixBit == 0 { ++ ins.funct3 = uint32(RM_RTZ) ++ } else { ++ ins.funct3 = uint32(p.Scond &^ rmSuffixBit) ++ } + + case AFNES, AFNED: + // Replace FNE[SD] with FEQ[SD] and NOT. 
+@@ -2474,6 +2479,14 @@ func isUnsafePoint(p *obj.Prog) bool { + return p.Mark&USES_REG_TMP == USES_REG_TMP || p.From.Reg == REG_TMP || p.To.Reg == REG_TMP || p.Reg == REG_TMP + } + ++func ParseSuffix(prog *obj.Prog, cond string) (err error) { ++ switch prog.As { ++ case AFCVTWS, AFCVTLS, AFCVTWUS, AFCVTLUS, AFCVTWD, AFCVTLD, AFCVTWUD, AFCVTLUD: ++ prog.Scond, err = rmSuffixEncode(strings.TrimPrefix(cond, ".")) ++ } ++ return ++} ++ + var LinkRISCV64 = obj.LinkArch{ + Arch: sys.ArchRISCV64, + Init: buildop, +-- +2.39.5 + diff --git a/2030-math-add-round-assembly-implementations-on-riscv64.patch b/2030-math-add-round-assembly-implementations-on-riscv64.patch new file mode 100644 index 0000000..49c0dc2 --- /dev/null +++ b/2030-math-add-round-assembly-implementations-on-riscv64.patch @@ -0,0 +1,113 @@ +From b72a91d17ed5ef4149ad33f7c5ad4e80ba7a861e Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 030/119] math: add round assembly implementations on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: riscv64 +pkg: math + │ floor_old.bench │ floor_new.bench │ + │ sec/op │ sec/op vs base │ +Ceil 54.12n ± 0% 22.05n ± 0% -59.26% (p=0.000 n=10) +Floor 40.80n ± 0% 22.05n ± 0% -45.96% (p=0.000 n=10) +Round 20.73n ± 0% 20.74n ± 0% ~ (p=0.441 n=10) +RoundToEven 24.07n ± 0% 24.07n ± 0% ~ (p=1.000 n=10) +Trunc 38.73n ± 0% 22.05n ± 0% -43.07% (p=0.000 n=10) +geomean 33.58n 22.17n -33.98% + +Change-Id: I24fb9e3bbf8146da253b6791b21377bea1afbd16 +Reviewed-on: https://go-review.googlesource.com/c/go/+/504737 +TryBot-Result: Gopher Robot +Reviewed-by: Russ Cox +Reviewed-by: M Zhuo +Reviewed-by: Cherry Mui +Run-TryBot: M Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Joel Sing +--- + src/math/floor_asm.go | 2 +- + src/math/floor_noasm.go | 2 +- + src/math/floor_riscv64.s | 41 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 43 insertions(+), 2 deletions(-) + create mode 100644 src/math/floor_riscv64.s + +diff --git a/src/math/floor_asm.go b/src/math/floor_asm.go +index fb419d6da2..5cb45f5a7e 100644 +--- a/src/math/floor_asm.go ++++ b/src/math/floor_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || s390x || wasm ++//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || riscv64 || s390x || wasm + + package math + +diff --git a/src/math/floor_noasm.go b/src/math/floor_noasm.go +index 5641c7ea0a..6754ca8fc8 100644 +--- a/src/math/floor_noasm.go ++++ b/src/math/floor_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !s390x && !wasm ++//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !riscv64 && !s390x && !wasm + + package math + +diff --git a/src/math/floor_riscv64.s b/src/math/floor_riscv64.s +new file mode 100644 +index 0000000000..62ce963781 +--- /dev/null ++++ b/src/math/floor_riscv64.s +@@ -0,0 +1,41 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "textflag.h" ++ ++#define PosInf 0x7FF0000000000000 ++ ++// The rounding mode of RISC-V is different from Go spec. 
++ ++#define ROUNDFN(NAME, MODE) \ ++TEXT NAME(SB),NOSPLIT,$0; \ ++ MOVD x+0(FP), F0; \ ++ /* whether x is NaN */; \ ++ FEQD F0, F0, X6; \ ++ BNEZ X6, 3(PC); \ ++ /* return NaN if x is NaN */; \ ++ MOVD F0, ret+8(FP); \ ++ RET; \ ++ MOV $PosInf, X6; \ ++ FMVDX X6, F1; \ ++ FABSD F0, F2; \ ++ /* if abs(x) > +Inf, return Inf instead of round(x) */; \ ++ FLTD F1, F2, X6; \ ++ /* Inf should keep same signed with x then return */; \ ++ BEQZ X6, 3(PC); \ ++ FCVTLD.MODE F0, X6; \ ++ FCVTDL X6, F1; \ ++ /* rounding will drop signed bit in RISCV, restore it */; \ ++ FSGNJD F0, F1, F0; \ ++ MOVD F0, ret+8(FP); \ ++ RET ++ ++// func archFloor(x float64) float64 ++ROUNDFN(·archFloor, RDN) ++ ++// func archCeil(x float64) float64 ++ROUNDFN(·archCeil, RUP) ++ ++// func archTrunc(x float64) float64 ++ROUNDFN(·archTrunc, RTZ) +-- +2.39.5 + diff --git a/2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch b/2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch new file mode 100644 index 0000000..32f3aba --- /dev/null +++ b/2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch @@ -0,0 +1,47 @@ +From f5bd8318d01641e10dfcd98d22e07030d2189beb Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 031/119] cmd/link/internal/riscv64: generate local text + symbols for R_RISCV_CALL + +Correctly generate local text symbols needed for R_RISCV_CALL when +external linking. R_RISCV_CALL was added in CL #520095 as a way of +marking AUIPC+JALR pairs, instead of overloading R_RISCV_PCREL_ITYPE. +However, genSymsLate was not updated to generate local text symbols +for the new relocation type, leading to HI20 symbol lookup failures. + +This issue is detected by cmd/internal/obj/riscv.TestLargeCall, +however this is unfortunately skipped in short mode. 
+ +Fixes #65646 + +Change-Id: I8ee0f13791e0628f31657bf7dae2be8482b689b5 +Reviewed-on: https://go-review.googlesource.com/c/go/+/567375 +Reviewed-by: Mauri de Souza Meneguzzo +Reviewed-by: Carlos Amedee +Run-TryBot: Joel Sing +Reviewed-by: Cherry Mui +TryBot-Result: Gopher Robot +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/link/internal/riscv64/asm.go | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/link/internal/riscv64/asm.go b/src/cmd/link/internal/riscv64/asm.go +index f3186398eb..55fb98199e 100644 +--- a/src/cmd/link/internal/riscv64/asm.go ++++ b/src/cmd/link/internal/riscv64/asm.go +@@ -38,8 +38,8 @@ func genSymsLate(ctxt *ld.Link, ldr *loader.Loader) { + relocs := ldr.Relocs(s) + for ri := 0; ri < relocs.Count(); ri++ { + r := relocs.At(ri) +- if r.Type() != objabi.R_RISCV_PCREL_ITYPE && r.Type() != objabi.R_RISCV_PCREL_STYPE && +- r.Type() != objabi.R_RISCV_TLS_IE { ++ if r.Type() != objabi.R_RISCV_CALL && r.Type() != objabi.R_RISCV_PCREL_ITYPE && ++ r.Type() != objabi.R_RISCV_PCREL_STYPE && r.Type() != objabi.R_RISCV_TLS_IE { + continue + } + if r.Off() == 0 && ldr.SymType(s) == sym.STEXT { +-- +2.39.5 + diff --git a/2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch b/2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch new file mode 100644 index 0000000..dc8b2d3 --- /dev/null +++ b/2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch @@ -0,0 +1,864 @@ +From 3d3ebd4e882f711158e197cd7e95033e014b2216 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 032/119] cmd/compile,cmd/internal/obj: provide rotation + pseudo-instructions for riscv64 + +Provide and use rotation pseudo-instructions for riscv64. The RISC-V bitmanip +extension adds support for hardware rotation instructions in the form of ROL, +ROLW, ROR, RORI, RORIW and RORW. These are easily implemented in the assembler +as pseudo-instructions for CPUs that do not support the bitmanip extension. + +This approach provides a number of advantages, including reducing the rewrite +rules needed in the compiler, simplifying codegen tests and most importantly, +allowing these instructions to be used in assembly (for example, riscv64 +optimised versions of SHA-256 and SHA-512). When bitmanip support is added, +these instruction sequences can simply be replaced with a single instruction +if permitted by the GORISCV64 profile. 
+ +Change-Id: Ia23402e1a82f211ac760690deb063386056ae1fa +Reviewed-on: https://go-review.googlesource.com/c/go/+/565015 +TryBot-Result: Gopher Robot +Reviewed-by: Michael Knyszek +Reviewed-by: M Zhuo +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Run-TryBot: Joel Sing +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 18 ++ + src/cmd/compile/internal/riscv64/ssa.go | 4 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 15 +- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 14 +- + src/cmd/compile/internal/ssa/opGen.go | 124 ++++++++++-- + .../compile/internal/ssa/rewriteRISCV64.go | 185 +++++++++++++----- + src/cmd/internal/obj/riscv/anames.go | 6 + + src/cmd/internal/obj/riscv/cpu.go | 6 + + src/cmd/internal/obj/riscv/obj.go | 51 ++++- + src/crypto/sha512/sha512block_riscv64.s | 25 +-- + test/codegen/rotate.go | 24 +-- + 11 files changed, 376 insertions(+), 96 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 837351508f..ed691f4d9e 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -417,6 +417,24 @@ start: + NEGW X5 // bb025040 + NEGW X5, X6 // 3b035040 + ++ // Bitwise rotation pseudo-instructions ++ ROL X5, X6, X7 // b30f5040b35ff301b3135300b3e37f00 ++ ROL X5, X6 // b30f5040b35ff3013313530033e36f00 ++ ROLW X5, X6, X7 // b30f5040bb5ff301bb135300b3e37f00 ++ ROLW X5, X6 // b30f5040bb5ff3013b13530033e36f00 ++ ROR X5, X6, X7 // b30f5040b31ff301b3535300b3e37f00 ++ ROR X5, X6 // b30f5040b31ff3013353530033e36f00 ++ RORW X5, X6, X7 // b30f5040bb1ff301bb535300b3e37f00 ++ RORW X5, X6 // b30f5040bb1ff3013b53530033e36f00 ++ RORI $5, X6, X7 // 935f53009313b303b3e37f00 ++ RORI $5, X6 // 935f53001313b30333e36f00 ++ RORIW $5, X6, X7 // 9b5f53009b13b301b3e37f00 ++ RORIW $5, X6 // 9b5f53001b13b30133e36f00 ++ ROR $5, X6, X7 // 935f53009313b303b3e37f00 ++ ROR $5, X6 // 935f53001313b30333e36f00 ++ RORW $5, X6, X7 // 9b5f53009b13b301b3e37f00 ++ RORW $5, X6 // 9b5f53001b13b30133e36f00 ++ + // This jumps to the second instruction in the function (the + // first instruction is an invisible stack pointer adjustment). 
+ JMP start // JMP 2 +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 17f0d98532..c9e75b2180 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -283,6 +283,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, + ssa.OpRISCV64DIVUW, ssa.OpRISCV64REM, ssa.OpRISCV64REMU, ssa.OpRISCV64REMW, + ssa.OpRISCV64REMUW, ++ ssa.OpRISCV64ROL, ssa.OpRISCV64ROLW, ssa.OpRISCV64ROR, ssa.OpRISCV64RORW, + ssa.OpRISCV64FADDS, ssa.OpRISCV64FSUBS, ssa.OpRISCV64FMULS, ssa.OpRISCV64FDIVS, + ssa.OpRISCV64FEQS, ssa.OpRISCV64FNES, ssa.OpRISCV64FLTS, ssa.OpRISCV64FLES, + ssa.OpRISCV64FADDD, ssa.OpRISCV64FSUBD, ssa.OpRISCV64FMULD, ssa.OpRISCV64FDIVD, +@@ -423,7 +424,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Reg = v.Reg() + case ssa.OpRISCV64ADDI, ssa.OpRISCV64ADDIW, ssa.OpRISCV64XORI, ssa.OpRISCV64ORI, ssa.OpRISCV64ANDI, + ssa.OpRISCV64SLLI, ssa.OpRISCV64SLLIW, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRAIW, +- ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, ssa.OpRISCV64SLTIU: ++ ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, ssa.OpRISCV64SLTIU, ++ ssa.OpRISCV64RORI, ssa.OpRISCV64RORIW: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 135d70bc47..c2df433315 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -216,8 +216,8 @@ + // Rotates. + (RotateLeft8 x y) => (OR (SLL x (ANDI [7] y)) (SRL (ZeroExt8to64 x) (ANDI [7] (NEG y)))) + (RotateLeft16 x y) => (OR (SLL x (ANDI [15] y)) (SRL (ZeroExt16to64 x) (ANDI [15] (NEG y)))) +-(RotateLeft32 x y) => (OR (SLLW x y) (SRLW x (NEG y))) +-(RotateLeft64 x y) => (OR (SLL x y) (SRL x (NEG y))) ++(RotateLeft32 ...) => (ROLW ...) ++(RotateLeft64 ...) => (ROL ...) + + (Less64 ...) => (SLT ...) + (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) +@@ -665,6 +665,9 @@ + (MOVWreg x:(DIVUW _ _)) => (MOVDreg x) + (MOVWreg x:(REMW _ _)) => (MOVDreg x) + (MOVWreg x:(REMUW _ _)) => (MOVDreg x) ++(MOVWreg x:(ROLW _ _)) => (MOVDreg x) ++(MOVWreg x:(RORW _ _)) => (MOVDreg x) ++(MOVWreg x:(RORIW _)) => (MOVDreg x) + + // Fold double extensions. + (MOVBreg x:(MOVBreg _)) => (MOVDreg x) +@@ -731,6 +734,10 @@ + (AND (MOVDconst [val]) x) && is32Bit(val) => (ANDI [val] x) + (OR (MOVDconst [val]) x) && is32Bit(val) => (ORI [val] x) + (XOR (MOVDconst [val]) x) && is32Bit(val) => (XORI [val] x) ++(ROL x (MOVDconst [val])) => (RORI [int64(int8(-val)&63)] x) ++(ROLW x (MOVDconst [val])) => (RORIW [int64(int8(-val)&31)] x) ++(ROR x (MOVDconst [val])) => (RORI [int64(val&63)] x) ++(RORW x (MOVDconst [val])) => (RORIW [int64(val&31)] x) + (SLL x (MOVDconst [val])) => (SLLI [int64(val&63)] x) + (SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x) + (SLLW x (MOVDconst [val])) => (SLLIW [int64(val&31)] x) +@@ -740,6 +747,10 @@ + (SLT x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTI [val] x) + (SLTU x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTIU [val] x) + ++// Replace negated left rotation with right rotation. ++(ROL x (NEG y)) => (ROR x y) ++(ROLW x (NEG y)) => (RORW x y) ++ + // Convert const subtraction into ADDI with negative immediate, where possible. 
+ (SUB x (MOVDconst [val])) && is32Bit(-val) => (ADDI [-val] x) + (SUB (MOVDconst [val]) y) && is32Bit(-val) => (NEG (ADDI [-val] y)) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index e9f1df0d58..13fa91864b 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -221,13 +221,19 @@ func init() { + {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, shift amount 0-31, logical right shift of 32 bit value, sign extended to 64 bits + + // Bitwise ops +- {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 +- {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint +- {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 +- {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint + {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 ++ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 ++ {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint ++ {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63) ++ {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended ++ {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63) ++ {name: "RORI", argLength: 1, reg: gp11, asm: "RORI", aux: "Int64"}, // rotate right arg0 by auxint, shift amount 0-63 ++ {name: "RORIW", argLength: 1, reg: gp11, asm: "RORIW", aux: "Int64"}, // rotate right least significant word of arg0 by auxint, shift amount 0-31, sign extended ++ {name: "RORW", argLength: 2, reg: gp21, asm: "RORW"}, // rotate right least significant word of arg0 by (arg1 & 31), sign extended ++ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 ++ {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint + + // Generate boolean values + {name: "SEQZ", argLength: 1, reg: gp11, asm: "SEQZ"}, // arg0 == 0, result is 0 or 1 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 29ca9f5c0f..dd80a2c52a 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2381,13 +2381,19 @@ const ( + OpRISCV64SRAIW + OpRISCV64SRLI + OpRISCV64SRLIW +- OpRISCV64XOR +- OpRISCV64XORI +- OpRISCV64OR +- OpRISCV64ORI + OpRISCV64AND + OpRISCV64ANDI + OpRISCV64NOT ++ OpRISCV64OR ++ OpRISCV64ORI ++ OpRISCV64ROL ++ OpRISCV64ROLW ++ OpRISCV64ROR ++ OpRISCV64RORI ++ OpRISCV64RORIW ++ OpRISCV64RORW ++ OpRISCV64XOR ++ OpRISCV64XORI + OpRISCV64SEQZ + OpRISCV64SNEZ + OpRISCV64SLT +@@ -31935,10 +31941,10 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "XOR", ++ name: "AND", + argLen: 2, + commutative: true, +- asm: riscv.AXOR, ++ asm: riscv.AAND, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +@@ -31950,10 +31956,23 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "XORI", ++ name: "ANDI", + auxType: auxInt64, + argLen: 1, +- asm: riscv.AXORI, ++ asm: riscv.AANDI, ++ 
reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "NOT", ++ argLen: 1, ++ asm: riscv.ANOT, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +@@ -31993,10 +32012,9 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "AND", +- argLen: 2, +- commutative: true, +- asm: riscv.AAND, ++ name: "ROL", ++ argLen: 2, ++ asm: riscv.AROL, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +@@ -32008,10 +32026,38 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "ANDI", ++ name: "ROLW", ++ argLen: 2, ++ asm: riscv.AROLW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "ROR", ++ argLen: 2, ++ asm: riscv.AROR, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "RORI", + auxType: auxInt64, + argLen: 1, +- asm: riscv.AANDI, ++ asm: riscv.ARORI, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +@@ -32022,9 +32068,53 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "NOT", +- argLen: 1, +- asm: riscv.ANOT, ++ name: "RORIW", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.ARORIW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "RORW", ++ argLen: 2, ++ asm: riscv.ARORW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "XOR", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AXOR, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 
X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "XORI", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.AXORI, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index ca0e108915..28c44da5a8 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -532,6 +532,14 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64OR(v) + case OpRISCV64ORI: + return rewriteValueRISCV64_OpRISCV64ORI(v) ++ case OpRISCV64ROL: ++ return rewriteValueRISCV64_OpRISCV64ROL(v) ++ case OpRISCV64ROLW: ++ return rewriteValueRISCV64_OpRISCV64ROLW(v) ++ case OpRISCV64ROR: ++ return rewriteValueRISCV64_OpRISCV64ROR(v) ++ case OpRISCV64RORW: ++ return rewriteValueRISCV64_OpRISCV64RORW(v) + case OpRISCV64SEQZ: + return rewriteValueRISCV64_OpRISCV64SEQZ(v) + case OpRISCV64SLL: +@@ -571,9 +579,11 @@ func rewriteValueRISCV64(v *Value) bool { + case OpRotateLeft16: + return rewriteValueRISCV64_OpRotateLeft16(v) + case OpRotateLeft32: +- return rewriteValueRISCV64_OpRotateLeft32(v) ++ v.Op = OpRISCV64ROLW ++ return true + case OpRotateLeft64: +- return rewriteValueRISCV64_OpRotateLeft64(v) ++ v.Op = OpRISCV64ROL ++ return true + case OpRotateLeft8: + return rewriteValueRISCV64_OpRotateLeft8(v) + case OpRound32F: +@@ -5626,6 +5636,39 @@ func rewriteValueRISCV64_OpRISCV64MOVWreg(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (MOVWreg x:(ROLW _ _)) ++ // result: (MOVDreg x) ++ for { ++ x := v_0 ++ if x.Op != OpRISCV64ROLW { ++ break ++ } ++ v.reset(OpRISCV64MOVDreg) ++ v.AddArg(x) ++ return true ++ } ++ // match: (MOVWreg x:(RORW _ _)) ++ // result: (MOVDreg x) ++ for { ++ x := v_0 ++ if x.Op != OpRISCV64RORW { ++ break ++ } ++ v.reset(OpRISCV64MOVDreg) ++ v.AddArg(x) ++ return true ++ } ++ // match: (MOVWreg x:(RORIW _)) ++ // result: (MOVDreg x) ++ for { ++ x := v_0 ++ if x.Op != OpRISCV64RORIW { ++ break ++ } ++ v.reset(OpRISCV64MOVDreg) ++ v.AddArg(x) ++ return true ++ } + // match: (MOVWreg x:(MOVBreg _)) + // result: (MOVDreg x) + for { +@@ -5999,6 +6042,102 @@ func rewriteValueRISCV64_OpRISCV64ORI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64ROL(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (ROL x (MOVDconst [val])) ++ // result: (RORI [int64(int8(-val)&63)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64RORI) ++ v.AuxInt = int64ToAuxInt(int64(int8(-val) & 63)) ++ v.AddArg(x) ++ return true ++ } ++ // match: (ROL x (NEG y)) ++ // result: (ROR x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64NEG { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64ROR) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64ROLW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (ROLW x (MOVDconst [val])) ++ // result: (RORIW [int64(int8(-val)&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64RORIW) ++ v.AuxInt = int64ToAuxInt(int64(int8(-val) & 31)) ++ 
v.AddArg(x) ++ return true ++ } ++ // match: (ROLW x (NEG y)) ++ // result: (RORW x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64NEG { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64RORW) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64ROR(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (ROR x (MOVDconst [val])) ++ // result: (RORI [int64(val&63)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64RORI) ++ v.AuxInt = int64ToAuxInt(int64(val & 63)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64RORW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (RORW x (MOVDconst [val])) ++ // result: (RORIW [int64(val&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64RORIW) ++ v.AuxInt = int64ToAuxInt(int64(val & 31)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64SEQZ(v *Value) bool { + v_0 := v.Args[0] + // match: (SEQZ (NEG x)) +@@ -6691,48 +6830,6 @@ func rewriteValueRISCV64_OpRotateLeft16(v *Value) bool { + return true + } + } +-func rewriteValueRISCV64_OpRotateLeft32(v *Value) bool { +- v_1 := v.Args[1] +- v_0 := v.Args[0] +- b := v.Block +- // match: (RotateLeft32 x y) +- // result: (OR (SLLW x y) (SRLW x (NEG y))) +- for { +- t := v.Type +- x := v_0 +- y := v_1 +- v.reset(OpRISCV64OR) +- v0 := b.NewValue0(v.Pos, OpRISCV64SLLW, t) +- v0.AddArg2(x, y) +- v1 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) +- v2 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) +- v2.AddArg(y) +- v1.AddArg2(x, v2) +- v.AddArg2(v0, v1) +- return true +- } +-} +-func rewriteValueRISCV64_OpRotateLeft64(v *Value) bool { +- v_1 := v.Args[1] +- v_0 := v.Args[0] +- b := v.Block +- // match: (RotateLeft64 x y) +- // result: (OR (SLL x y) (SRL x (NEG y))) +- for { +- t := v.Type +- x := v_0 +- y := v_1 +- v.reset(OpRISCV64OR) +- v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t) +- v0.AddArg2(x, y) +- v1 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v2 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) +- v2.AddArg(y) +- v1.AddArg2(x, v2) +- v.AddArg2(v0, v1) +- return true +- } +-} + func rewriteValueRISCV64_OpRotateLeft8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index d2c41971b8..e547c6d5e9 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -246,6 +246,12 @@ var Anames = []string{ + "NEG", + "NEGW", + "NOT", ++ "ROL", ++ "ROLW", ++ "ROR", ++ "RORI", ++ "RORIW", ++ "RORW", + "SEQZ", + "SNEZ", + "LAST", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 0f63a616f7..24026561ee 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -600,6 +600,12 @@ const ( + ANEG + ANEGW + ANOT ++ AROL ++ AROLW ++ AROR ++ ARORI ++ ARORIW ++ ARORW + ASEQZ + ASNEZ + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 8020624c70..9b81768b85 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -59,7 +59,8 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + AADDIW, ASLLIW, ASRLIW, ASRAIW, AADDW, ASUBW, ASLLW, ASRLW, ASRAW, + AADD, AAND, AOR, AXOR, ASLL, ASRL, ASUB, ASRA, + AMUL, AMULH, AMULHU, AMULHSU, AMULW, 
ADIV, ADIVU, ADIVW, ADIVUW, +- AREM, AREMU, AREMW, AREMUW: ++ AREM, AREMU, AREMW, AREMUW, ++ AROL, AROLW, AROR, ARORW, ARORI, ARORIW: + p.Reg = p.To.Reg + } + } +@@ -90,6 +91,10 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.As = ASRAI + case AADDW: + p.As = AADDIW ++ case AROR: ++ p.As = ARORI ++ case ARORW: ++ p.As = ARORIW + case ASUBW: + p.As, p.From.Offset = AADDIW, -p.From.Offset + case ASLLW: +@@ -2192,6 +2197,47 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + return inss + } + ++// instructionsForRotate returns the machine instructions for a bitwise rotation. ++func instructionsForRotate(p *obj.Prog, ins *instruction) []*instruction { ++ switch ins.as { ++ case AROL, AROLW, AROR, ARORW: ++ // ROL -> OR (SLL x y) (SRL x (NEG y)) ++ // ROR -> OR (SRL x y) (SLL x (NEG y)) ++ sllOp, srlOp := ASLL, ASRL ++ if ins.as == AROLW || ins.as == ARORW { ++ sllOp, srlOp = ASLLW, ASRLW ++ } ++ shift1, shift2 := sllOp, srlOp ++ if ins.as == AROR || ins.as == ARORW { ++ shift1, shift2 = shift2, shift1 ++ } ++ return []*instruction{ ++ &instruction{as: ASUB, rs1: REG_ZERO, rs2: ins.rs2, rd: REG_TMP}, ++ &instruction{as: shift2, rs1: ins.rs1, rs2: REG_TMP, rd: REG_TMP}, ++ &instruction{as: shift1, rs1: ins.rs1, rs2: ins.rs2, rd: ins.rd}, ++ &instruction{as: AOR, rs1: REG_TMP, rs2: ins.rd, rd: ins.rd}, ++ } ++ ++ case ARORI, ARORIW: ++ // ROR -> OR (SLLI -x y) (SRLI x y) ++ sllOp, srlOp := ASLLI, ASRLI ++ sllImm := int64(int8(-ins.imm) & 63) ++ if ins.as == ARORIW { ++ sllOp, srlOp = ASLLIW, ASRLIW ++ sllImm = int64(int8(-ins.imm) & 31) ++ } ++ return []*instruction{ ++ &instruction{as: srlOp, rs1: ins.rs1, rd: REG_TMP, imm: ins.imm}, ++ &instruction{as: sllOp, rs1: ins.rs1, rd: ins.rd, imm: sllImm}, ++ &instruction{as: AOR, rs1: REG_TMP, rs2: ins.rd, rd: ins.rd}, ++ } ++ ++ default: ++ p.Ctxt.Diag("%v: unknown rotation", p) ++ return nil ++ } ++} ++ + // instructionsForProg returns the machine instructions for an *obj.Prog. 
+ func instructionsForProg(p *obj.Prog) []*instruction { + ins := instructionForProg(p) +@@ -2362,6 +2408,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AFSGNJND + ins.rs1 = uint32(p.From.Reg) + ++ case AROL, AROLW, AROR, ARORW, ARORI, ARORIW: ++ inss = instructionsForRotate(p, ins) ++ + case ASLLI, ASRLI, ASRAI: + if ins.imm < 0 || ins.imm > 63 { + p.Ctxt.Diag("%v: shift amount out of range 0 to 63", p) +diff --git a/src/crypto/sha512/sha512block_riscv64.s b/src/crypto/sha512/sha512block_riscv64.s +index 361aafe49d..e3a240f70e 100644 +--- a/src/crypto/sha512/sha512block_riscv64.s ++++ b/src/crypto/sha512/sha512block_riscv64.s +@@ -44,11 +44,6 @@ + // H6 = g + H6 + // H7 = h + H7 + +-#define ROR(s, r, d, t1, t2) \ +- SLL $(64-s), r, t1; \ +- SRL $(s), r, t2; \ +- OR t1, t2, d +- + // Wt = Mt; for 0 <= t <= 15 + #define MSGSCHEDULE0(index) \ + MOVBU ((index*8)+0)(X29), X5; \ +@@ -83,14 +78,14 @@ + MOV (((index-15)&0xf)*8)(X19), X6; \ + MOV (((index-7)&0xf)*8)(X19), X9; \ + MOV (((index-16)&0xf)*8)(X19), X21; \ +- ROR(19, X5, X7, X23, X24); \ +- ROR(61, X5, X8, X23, X24); \ ++ ROR $19, X5, X7; \ ++ ROR $61, X5, X8; \ + SRL $6, X5; \ + XOR X7, X5; \ + XOR X8, X5; \ + ADD X9, X5; \ +- ROR(1, X6, X7, X23, X24); \ +- ROR(8, X6, X8, X23, X24); \ ++ ROR $1, X6, X7; \ ++ ROR $8, X6, X8; \ + SRL $7, X6; \ + XOR X7, X6; \ + XOR X8, X6; \ +@@ -106,11 +101,11 @@ + #define SHA512T1(index, e, f, g, h) \ + MOV (index*8)(X18), X8; \ + ADD X5, h; \ +- ROR(14, e, X6, X23, X24); \ ++ ROR $14, e, X6; \ + ADD X8, h; \ +- ROR(18, e, X7, X23, X24); \ ++ ROR $18, e, X7; \ + XOR X7, X6; \ +- ROR(41, e, X8, X23, X24); \ ++ ROR $41, e, X8; \ + XOR X8, X6; \ + ADD X6, h; \ + AND e, f, X5; \ +@@ -124,10 +119,10 @@ + // BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x) + // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) + #define SHA512T2(a, b, c) \ +- ROR(28, a, X6, X23, X24); \ +- ROR(34, a, X7, X23, X24); \ ++ ROR $28, a, X6; \ ++ ROR $34, a, X7; \ + XOR X7, X6; \ +- ROR(39, a, X8, X23, X24); \ ++ ROR $39, a, X8; \ + XOR X8, X6; \ + AND a, b, X7; \ + AND a, c, X8; \ +diff --git a/test/codegen/rotate.go b/test/codegen/rotate.go +index 109e55763c..121ce4cc0a 100644 +--- a/test/codegen/rotate.go ++++ b/test/codegen/rotate.go +@@ -18,7 +18,7 @@ func rot64(x uint64) uint64 { + // amd64:"ROLQ\t[$]7" + // ppc64x:"ROTL\t[$]7" + // loong64: "ROTRV\t[$]57" +- // riscv64: "OR","SLLI","SRLI",-"AND" ++ // riscv64: "RORI\t[$]57" + a += x<<7 | x>>57 + + // amd64:"ROLQ\t[$]8" +@@ -26,7 +26,7 @@ func rot64(x uint64) uint64 { + // s390x:"RISBGZ\t[$]0, [$]63, [$]8, " + // ppc64x:"ROTL\t[$]8" + // loong64: "ROTRV\t[$]56" +- // riscv64: "OR","SLLI","SRLI",-"AND" ++ // riscv64: "RORI\t[$]56" + a += x<<8 + x>>56 + + // amd64:"ROLQ\t[$]9" +@@ -34,7 +34,7 @@ func rot64(x uint64) uint64 { + // s390x:"RISBGZ\t[$]0, [$]63, [$]9, " + // ppc64x:"ROTL\t[$]9" + // loong64: "ROTRV\t[$]55" +- // riscv64: "OR","SLLI","SRLI",-"AND" ++ // riscv64: "RORI\t[$]55" + a += x<<9 ^ x>>55 + + // amd64:"ROLQ\t[$]10" +@@ -44,7 +44,7 @@ func rot64(x uint64) uint64 { + // arm64:"ROR\t[$]54" + // s390x:"RISBGZ\t[$]0, [$]63, [$]10, " + // loong64: "ROTRV\t[$]54" +- // riscv64: "OR","SLLI","SRLI",-"AND" ++ // riscv64: "RORI\t[$]54" + a += bits.RotateLeft64(x, 10) + + return a +@@ -57,7 +57,7 @@ func rot32(x uint32) uint32 { + // arm:"MOVW\tR\\d+@>25" + // ppc64x:"ROTLW\t[$]7" + // loong64: "ROTR\t[$]25" +- // riscv64: "OR","SLLIW","SRLIW",-"AND" ++ // riscv64: "RORIW\t[$]25" + a += x<<7 | x>>25 + + // amd64:`ROLL\t[$]8` +@@ -66,7 
+66,7 @@ func rot32(x uint32) uint32 {
+ // s390x:"RLL\t[$]8"
+ // ppc64x:"ROTLW\t[$]8"
+ // loong64: "ROTR\t[$]24"
+- // riscv64: "OR","SLLIW","SRLIW",-"AND"
++ // riscv64: "RORIW\t[$]24"
+ a += x<<8 + x>>24
+
+ // amd64:"ROLL\t[$]9"
+@@ -75,7 +75,7 @@ func rot32(x uint32) uint32 {
+ // s390x:"RLL\t[$]9"
+ // ppc64x:"ROTLW\t[$]9"
+ // loong64: "ROTR\t[$]23"
+- // riscv64: "OR","SLLIW","SRLIW",-"AND"
++ // riscv64: "RORIW\t[$]23"
+ a += x<<9 ^ x>>23
+
+ // amd64:"ROLL\t[$]10"
+@@ -86,7 +86,7 @@ func rot32(x uint32) uint32 {
+ // arm64:"RORW\t[$]22"
+ // s390x:"RLL\t[$]10"
+ // loong64: "ROTR\t[$]22"
+- // riscv64: "OR","SLLIW","SRLIW",-"AND"
++ // riscv64: "RORIW\t[$]22"
+ a += bits.RotateLeft32(x, 10)
+
+ return a
+@@ -141,14 +141,14 @@ func rot64nc(x uint64, z uint) uint64 {
+ // arm64:"ROR","NEG",-"AND"
+ // ppc64x:"ROTL",-"NEG",-"AND"
+ // loong64: "ROTRV", -"AND"
+- // riscv64: "OR","SLL","SRL",-"AND"
++ // riscv64: "ROL",-"AND"
+ a += x<<z | x>>(64-z)
+
+ // amd64:"RORQ",-"AND"
+ // arm64:"ROR",-"NEG",-"AND"
+ // ppc64x:"ROTL","NEG",-"AND"
+ // loong64: "ROTRV", -"AND"
+- // riscv64: "OR","SLL","SRL",-"AND"
++ // riscv64: "ROR",-"AND"
+ a += x>>z | x<<(64-z)
+
+ return a
+@@ -163,14 +163,14 @@ func rot32nc(x uint32, z uint) uint32 {
+ // arm64:"ROR","NEG",-"AND"
+ // ppc64x:"ROTLW",-"NEG",-"AND"
+ // loong64: "ROTR", -"AND"
+- // riscv64: "OR","SLLW","SRLW",-"AND"
++ // riscv64: "ROLW",-"AND"
+ a += x<<z | x>>(32-z)
+
+ // amd64:"RORL",-"AND"
+ // arm64:"ROR",-"NEG",-"AND"
+ // ppc64x:"ROTLW","NEG",-"AND"
+ // loong64: "ROTR", -"AND"
+- // riscv64: "OR","SLLW","SRLW",-"AND"
++ // riscv64: "RORW",-"AND"
+ a += x>>z | x<<(32-z)
+
+ return a
+--
+2.39.5
+
diff --git a/2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch b/2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch
new file mode 100644
index 0000000..4825528
--- /dev/null
+++ b/2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch
@@ -0,0 +1,617 @@
+From e7609bdfce949e57151a4ea7e11ffcd4fc3cc485 Mon Sep 17 00:00:00 2001
+From: Wang Yaduo
+Date: Fri, 26 Sep 2025 17:38:39 +0800
+Subject: [PATCH 033/119] cmd/internal/obj: support Zba, Zbb, Zbs extensions in
+ riscv64 assembler
+
+Add assembler support for Zba, Zbb, Zbs extensions, which are
+mandatory in the rva22u64 profile. These can be used to accelerate
+address computation and bit manipulation.
+ +Change-Id: Ie90fe6b76b1382cf69984a0e71a72d3cba0e750a +Reviewed-on: https://go-review.googlesource.com/c/go/+/559655 +Reviewed-by: M Zhuo +Run-TryBot: Joel Sing +Reviewed-by: David Chase +Reviewed-by: Joel Sing +Reviewed-by: Keith Randall +TryBot-Result: Gopher Robot +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 96 +++++++++++++++++---- + src/cmd/internal/obj/riscv/anames.go | 46 ++++++++-- + src/cmd/internal/obj/riscv/cpu.go | 58 +++++++++++-- + src/cmd/internal/obj/riscv/inst.go | 84 +++++++++++++++++- + src/cmd/internal/obj/riscv/obj.go | 90 +++++++++++++++++-- + 5 files changed, 336 insertions(+), 38 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index ed691f4d9e..64170340dc 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -339,6 +339,84 @@ start: + // 12.6: Double-Precision Floating-Point Classify Instruction + FCLASSD F0, X5 // d31200e2 + ++ // RISC-V Bit-Manipulation ISA-extensions (1.0) ++ // 1.1: Address Generation Instructions (Zba) ++ ADDUW X10, X11, X12 // 3b86a508 ++ ADDUW X10, X11 // bb85a508 ++ SH1ADD X11, X12, X13 // b326b620 ++ SH1ADD X11, X12 // 3326b620 ++ SH1ADDUW X12, X13, X14 // 3ba7c620 ++ SH1ADDUW X12, X13 // bba6c620 ++ SH2ADD X13, X14, X15 // b347d720 ++ SH2ADD X13, X14 // 3347d720 ++ SH2ADDUW X14, X15, X16 // 3bc8e720 ++ SH2ADDUW X14, X15 // bbc7e720 ++ SH3ADD X15, X16, X17 // b368f820 ++ SH3ADD X15, X16 // 3368f820 ++ SH3ADDUW X16, X17, X18 // 3be90821 ++ SH3ADDUW X16, X17 // bbe80821 ++ SLLIUW $31, X17, X18 // 1b99f809 ++ SLLIUW $63, X17 // 9b98f80b ++ SLLIUW $63, X17, X18 // 1b99f80b ++ SLLIUW $1, X18, X19 // 9b191908 ++ ++ // 1.2: Basic Bit Manipulation (Zbb) ++ ANDN X19, X20, X21 // b37a3a41 ++ ANDN X19, X20 // 337a3a41 ++ CLZ X20, X21 // 931a0a60 ++ CLZW X21, X22 // 1b9b0a60 ++ CPOP X22, X23 // 931b2b60 ++ CPOPW X23, X24 // 1b9c2b60 ++ CTZ X24, X25 // 931c1c60 ++ CTZW X25, X26 // 1b9d1c60 ++ MAX X26, X28, X29 // b36eae0b ++ MAX X26, X28 // 336eae0b ++ MAXU X28, X29, X30 // 33ffce0b ++ MAXU X28, X29 // b3fece0b ++ MIN X29, X30, X5 // b342df0b ++ MIN X29, X30 // 334fdf0b ++ MINU X30, X5, X6 // 33d3e20b ++ MINU X30, X5 // b3d2e20b ++ ORN X6, X7, X8 // 33e46340 ++ ORN X6, X7 // b3e36340 ++ SEXTB X16, X17 // 93184860 ++ SEXTH X17, X18 // 13995860 ++ XNOR X18, X19, X20 // 33ca2941 ++ XNOR X18, X19 // b3c92941 ++ ZEXTH X19, X20 // 3bca0908 ++ ++ // 1.3: Bitwise Rotation (Zbb) ++ ROL X8, X9, X10 // 33958460 or b30f8040b3dff4013395840033e5af00 ++ ROL X8, X9 // b3948460 or b30f8040b3dff401b3948400b3e49f00 ++ ROLW X9, X10, X11 // bb159560 or b30f9040bb5ff501bb159500b3e5bf00 ++ ROLW X9, X10 // 3b159560 or b30f9040bb5ff5013b15950033e5af00 ++ ROR X10, X11, X12 // 33d6a560 or b30fa040b39ff50133d6a50033e6cf00 ++ ROR X10, X11 // b3d5a560 or b30fa040b39ff501b3d5a500b3e5bf00 ++ ROR $63, X11 // 93d5f563 or 93dff50393951500b3e5bf00 ++ RORI $63, X11, X12 // 13d6f563 or 93dff5031396150033e6cf00 ++ RORI $1, X12, X13 // 93561660 or 935f16009316f603b3e6df00 ++ RORIW $31, X13, X14 // 1bd7f661 or 9bdff6011b97160033e7ef00 ++ RORIW $1, X14, X15 // 9b571760 or 9b5f17009b17f701b3e7ff00 ++ RORW X15, X16, X17 // bb58f860 or b30ff040bb1ff801bb58f800b3e81f01 ++ RORW X15, X16 // 3b58f860 or b30ff040bb1ff8013b58f80033e80f01 ++ RORW $31, X13 // 9bd6f661 or 9bdff6019b961600b3e6df00 ++ ORCB X5, X6 // 13d37228 ++ REV8 X7, X8 // 13d4836b ++ ++ // 1.5: Single-bit Instructions (Zbs) ++ BCLR X23, X24, X25 // b31c7c49 ++ BCLR $63, 
X24 // 131cfc4b ++ BCLRI $1, X25, X26 // 139d1c48 ++ BEXT X26, X28, X29 // b35eae49 ++ BEXT $63, X28 // 135efe4b ++ BEXTI $1, X29, X30 // 13df1e48 ++ BINV X30, X5, X6 // 3393e269 ++ BINV $63, X6 // 1313f36b ++ BINVI $1, X7, X8 // 13941368 ++ BSET X8, X9, X10 // 33958428 ++ BSET $63, X9 // 9394f42b ++ BSETI $1, X10, X11 // 93151528 ++ + // Privileged ISA + + // 3.2.1: Environment Call and Breakpoint +@@ -417,24 +495,6 @@ start: + NEGW X5 // bb025040 + NEGW X5, X6 // 3b035040 + +- // Bitwise rotation pseudo-instructions +- ROL X5, X6, X7 // b30f5040b35ff301b3135300b3e37f00 +- ROL X5, X6 // b30f5040b35ff3013313530033e36f00 +- ROLW X5, X6, X7 // b30f5040bb5ff301bb135300b3e37f00 +- ROLW X5, X6 // b30f5040bb5ff3013b13530033e36f00 +- ROR X5, X6, X7 // b30f5040b31ff301b3535300b3e37f00 +- ROR X5, X6 // b30f5040b31ff3013353530033e36f00 +- RORW X5, X6, X7 // b30f5040bb1ff301bb535300b3e37f00 +- RORW X5, X6 // b30f5040bb1ff3013b53530033e36f00 +- RORI $5, X6, X7 // 935f53009313b303b3e37f00 +- RORI $5, X6 // 935f53001313b30333e36f00 +- RORIW $5, X6, X7 // 9b5f53009b13b301b3e37f00 +- RORIW $5, X6 // 9b5f53001b13b30133e36f00 +- ROR $5, X6, X7 // 935f53009313b303b3e37f00 +- ROR $5, X6 // 935f53001313b30333e36f00 +- RORW $5, X6, X7 // 9b5f53009b13b301b3e37f00 +- RORW $5, X6 // 9b5f53001b13b30133e36f00 +- + // This jumps to the second instruction in the function (the + // first instruction is an invisible stack pointer adjustment). + JMP start // JMP 2 +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index e547c6d5e9..60c7b48620 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -217,6 +217,46 @@ var Anames = []string{ + "DRET", + "WFI", + "SFENCEVMA", ++ "ADDUW", ++ "SH1ADD", ++ "SH1ADDUW", ++ "SH2ADD", ++ "SH2ADDUW", ++ "SH3ADD", ++ "SH3ADDUW", ++ "SLLIUW", ++ "ANDN", ++ "ORN", ++ "XNOR", ++ "CLZ", ++ "CLZW", ++ "CTZ", ++ "CTZW", ++ "CPOP", ++ "CPOPW", ++ "MAX", ++ "MAXU", ++ "MIN", ++ "MINU", ++ "SEXTB", ++ "SEXTH", ++ "ZEXTH", ++ "ROL", ++ "ROLW", ++ "ROR", ++ "RORI", ++ "RORIW", ++ "RORW", ++ "ORCB", ++ "REV8", ++ "BCLR", ++ "BCLRI", ++ "BEXT", ++ "BEXTI", ++ "BINV", ++ "BINVI", ++ "BSET", ++ "BSETI", + "WORD", + "BEQZ", + "BGEZ", +@@ -246,12 +286,6 @@ var Anames = []string{ + "NEG", + "NEGW", + "NOT", +- "ROL", +- "ROLW", +- "ROR", +- "RORI", +- "RORIW", +- "RORW", + "SEQZ", + "SNEZ", + "LAST", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 24026561ee..07d5ccff87 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -567,6 +567,58 @@ const ( + // 4.2.1: Supervisor Memory-Management Fence Instruction + ASFENCEVMA + ++ // ++ // RISC-V Bit-Manipulation ISA-extensions (1.0) ++ // ++ ++ // 1.1: Address Generation Instructions (Zba) ++ AADDUW ++ ASH1ADD ++ ASH1ADDUW ++ ASH2ADD ++ ASH2ADDUW ++ ASH3ADD ++ ASH3ADDUW ++ ASLLIUW ++ ++ // 1.2: Basic Bit Manipulation (Zbb) ++ AANDN ++ AORN ++ AXNOR ++ ACLZ ++ ACLZW ++ ACTZ ++ ACTZW ++ ACPOP ++ ACPOPW ++ AMAX ++ AMAXU ++ AMIN ++ AMINU ++ ASEXTB ++ ASEXTH ++ AZEXTH ++ ++ // 1.3: Bitwise Rotation (Zbb) ++ AROL ++ AROLW ++ AROR ++ ARORI ++ ARORIW ++ ARORW ++ AORCB ++ AREV8 ++ ++ // 1.5: Single-bit Instructions (Zbs) ++ ABCLR ++ ABCLRI ++ ABEXT ++ ABEXTI ++ ABINV ++ ABINVI ++ ABSET ++ ABSETI ++ + // The escape hatch. Inserts a single 32-bit word. 
+ AWORD + +@@ -600,12 +652,6 @@ const ( + ANEG + ANEGW + ANOT +- AROL +- AROLW +- AROR +- ARORI +- ARORIW +- ARORW + ASEQZ + ASNEZ + +diff --git a/src/cmd/internal/obj/riscv/inst.go b/src/cmd/internal/obj/riscv/inst.go +index 6cb11cdfb5..223ddd15b2 100644 +--- a/src/cmd/internal/obj/riscv/inst.go ++++ b/src/cmd/internal/obj/riscv/inst.go +@@ -1,4 +1,4 @@ +-// Code generated by parse.py -go rv64_a rv64_d rv64_f rv64_i rv64_m rv64_q rv_a rv_d rv_f rv_i rv_m rv_q rv_s rv_system rv_zicsr; DO NOT EDIT. ++// Code generated by ./parse.py -go rv64_a rv64_d rv64_f rv64_i rv64_m rv64_q rv64_zba rv64_zbb rv64_zbs rv_a rv_d rv_f rv_i rv_m rv_q rv_zba rv_zbb rv_zbs rv_s rv_system rv_zicsr; DO NOT EDIT. + package riscv + + import "cmd/internal/obj" +@@ -15,6 +15,8 @@ func encode(a obj.As) *inst { + switch a { + case AADD: + return &inst{0x33, 0x0, 0x0, 0, 0x0} ++ case AADDUW: ++ return &inst{0x3b, 0x0, 0x0, 128, 0x4} + case AADDI: + return &inst{0x13, 0x0, 0x0, 0, 0x0} + case AADDIW: +@@ -61,20 +63,46 @@ func encode(a obj.As) *inst { + return &inst{0x33, 0x7, 0x0, 0, 0x0} + case AANDI: + return &inst{0x13, 0x7, 0x0, 0, 0x0} ++ case AANDN: ++ return &inst{0x33, 0x7, 0x0, 1024, 0x20} + case AAUIPC: + return &inst{0x17, 0x0, 0x0, 0, 0x0} ++ case ABCLR: ++ return &inst{0x33, 0x1, 0x0, 1152, 0x24} ++ case ABCLRI: ++ return &inst{0x13, 0x1, 0x0, 1152, 0x24} + case ABEQ: + return &inst{0x63, 0x0, 0x0, 0, 0x0} ++ case ABEXT: ++ return &inst{0x33, 0x5, 0x0, 1152, 0x24} ++ case ABEXTI: ++ return &inst{0x13, 0x5, 0x0, 1152, 0x24} + case ABGE: + return &inst{0x63, 0x5, 0x0, 0, 0x0} + case ABGEU: + return &inst{0x63, 0x7, 0x0, 0, 0x0} ++ case ABINV: ++ return &inst{0x33, 0x1, 0x0, 1664, 0x34} ++ case ABINVI: ++ return &inst{0x13, 0x1, 0x0, 1664, 0x34} + case ABLT: + return &inst{0x63, 0x4, 0x0, 0, 0x0} + case ABLTU: + return &inst{0x63, 0x6, 0x0, 0, 0x0} + case ABNE: + return &inst{0x63, 0x1, 0x0, 0, 0x0} ++ case ABSET: ++ return &inst{0x33, 0x1, 0x0, 640, 0x14} ++ case ABSETI: ++ return &inst{0x13, 0x1, 0x0, 640, 0x14} ++ case ACLZ: ++ return &inst{0x13, 0x1, 0x0, 1536, 0x30} ++ case ACLZW: ++ return &inst{0x1b, 0x1, 0x0, 1536, 0x30} ++ case ACPOP: ++ return &inst{0x13, 0x1, 0x2, 1538, 0x30} ++ case ACPOPW: ++ return &inst{0x1b, 0x1, 0x2, 1538, 0x30} + case ACSRRC: + return &inst{0x73, 0x3, 0x0, 0, 0x0} + case ACSRRCI: +@@ -87,6 +115,10 @@ func encode(a obj.As) *inst { + return &inst{0x73, 0x1, 0x0, 0, 0x0} + case ACSRRWI: + return &inst{0x73, 0x5, 0x0, 0, 0x0} ++ case ACTZ: ++ return &inst{0x13, 0x1, 0x1, 1537, 0x30} ++ case ACTZW: ++ return &inst{0x1b, 0x1, 0x1, 1537, 0x30} + case ADIV: + return &inst{0x33, 0x4, 0x0, 32, 0x1} + case ADIVU: +@@ -95,8 +127,6 @@ func encode(a obj.As) *inst { + return &inst{0x3b, 0x5, 0x0, 32, 0x1} + case ADIVW: + return &inst{0x3b, 0x4, 0x0, 32, 0x1} +- case ADRET: +- return &inst{0x73, 0x0, 0x12, 1970, 0x3d} + case AEBREAK: + return &inst{0x73, 0x0, 0x1, 1, 0x0} + case AECALL: +@@ -337,6 +367,14 @@ func encode(a obj.As) *inst { + return &inst{0x3, 0x2, 0x0, 0, 0x0} + case ALWU: + return &inst{0x3, 0x6, 0x0, 0, 0x0} ++ case AMAX: ++ return &inst{0x33, 0x6, 0x0, 160, 0x5} ++ case AMAXU: ++ return &inst{0x33, 0x7, 0x0, 160, 0x5} ++ case AMIN: ++ return &inst{0x33, 0x4, 0x0, 160, 0x5} ++ case AMINU: ++ return &inst{0x33, 0x5, 0x0, 160, 0x5} + case AMRET: + return &inst{0x73, 0x0, 0x2, 770, 0x18} + case AMUL: +@@ -351,8 +389,12 @@ func encode(a obj.As) *inst { + return &inst{0x3b, 0x0, 0x0, 32, 0x1} + case AOR: + return &inst{0x33, 0x6, 0x0, 0, 0x0} ++ case AORCB: ++ return &inst{0x13, 
0x5, 0x7, 647, 0x14} + case AORI: + return &inst{0x13, 0x6, 0x0, 0, 0x0} ++ case AORN: ++ return &inst{0x33, 0x6, 0x0, 1024, 0x20} + case APAUSE: + return &inst{0xf, 0x0, 0x10, 16, 0x0} + case ARDCYCLE: +@@ -375,6 +417,20 @@ func encode(a obj.As) *inst { + return &inst{0x3b, 0x7, 0x0, 32, 0x1} + case AREMW: + return &inst{0x3b, 0x6, 0x0, 32, 0x1} ++ case AREV8: ++ return &inst{0x13, 0x5, 0x18, 1720, 0x35} ++ case AROL: ++ return &inst{0x33, 0x1, 0x0, 1536, 0x30} ++ case AROLW: ++ return &inst{0x3b, 0x1, 0x0, 1536, 0x30} ++ case AROR: ++ return &inst{0x33, 0x5, 0x0, 1536, 0x30} ++ case ARORI: ++ return &inst{0x13, 0x5, 0x0, 1536, 0x30} ++ case ARORIW: ++ return &inst{0x1b, 0x5, 0x0, 1536, 0x30} ++ case ARORW: ++ return &inst{0x3b, 0x5, 0x0, 1536, 0x30} + case ASB: + return &inst{0x23, 0x0, 0x0, 0, 0x0} + case ASBREAK: +@@ -387,14 +443,32 @@ func encode(a obj.As) *inst { + return &inst{0x73, 0x0, 0x0, 0, 0x0} + case ASD: + return &inst{0x23, 0x3, 0x0, 0, 0x0} ++ case ASEXTB: ++ return &inst{0x13, 0x1, 0x4, 1540, 0x30} ++ case ASEXTH: ++ return &inst{0x13, 0x1, 0x5, 1541, 0x30} + case ASFENCEVMA: + return &inst{0x73, 0x0, 0x0, 288, 0x9} + case ASH: + return &inst{0x23, 0x1, 0x0, 0, 0x0} ++ case ASH1ADD: ++ return &inst{0x33, 0x2, 0x0, 512, 0x10} ++ case ASH1ADDUW: ++ return &inst{0x3b, 0x2, 0x0, 512, 0x10} ++ case ASH2ADD: ++ return &inst{0x33, 0x4, 0x0, 512, 0x10} ++ case ASH2ADDUW: ++ return &inst{0x3b, 0x4, 0x0, 512, 0x10} ++ case ASH3ADD: ++ return &inst{0x33, 0x6, 0x0, 512, 0x10} ++ case ASH3ADDUW: ++ return &inst{0x3b, 0x6, 0x0, 512, 0x10} + case ASLL: + return &inst{0x33, 0x1, 0x0, 0, 0x0} + case ASLLI: + return &inst{0x13, 0x1, 0x0, 0, 0x0} ++ case ASLLIUW: ++ return &inst{0x1b, 0x1, 0x0, 128, 0x4} + case ASLLIW: + return &inst{0x1b, 0x1, 0x0, 0, 0x0} + case ASLLW: +@@ -433,10 +507,14 @@ func encode(a obj.As) *inst { + return &inst{0x23, 0x2, 0x0, 0, 0x0} + case AWFI: + return &inst{0x73, 0x0, 0x5, 261, 0x8} ++ case AXNOR: ++ return &inst{0x33, 0x4, 0x0, 1024, 0x20} + case AXOR: + return &inst{0x33, 0x4, 0x0, 0, 0x0} + case AXORI: + return &inst{0x13, 0x4, 0x0, 0, 0x0} ++ case AZEXTH: ++ return &inst{0x3b, 0x4, 0x0, 128, 0x4} + } + return nil + } +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 9b81768b85..8115350a9e 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -60,7 +60,9 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + AADD, AAND, AOR, AXOR, ASLL, ASRL, ASUB, ASRA, + AMUL, AMULH, AMULHU, AMULHSU, AMULW, ADIV, ADIVU, ADIVW, ADIVUW, + AREM, AREMU, AREMW, AREMUW, +- AROL, AROLW, AROR, ARORW, ARORI, ARORIW: ++ AADDUW, ASH1ADD, ASH1ADDUW, ASH2ADD, ASH2ADDUW, ASH3ADD, ASH3ADDUW, ASLLIUW, ++ AANDN, AORN, AXNOR, AMAX, AMAXU, AMIN, AMINU, AROL, AROLW, AROR, ARORW, ARORI, ARORIW, ++ ABCLR, ABCLRI, ABEXT, ABEXTI, ABINV, ABINVI, ABSET, ABSETI: + p.Reg = p.To.Reg + } + } +@@ -91,10 +93,6 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.As = ASRAI + case AADDW: + p.As = AADDIW +- case AROR: +- p.As = ARORI +- case ARORW: +- p.As = ARORIW + case ASUBW: + p.As, p.From.Offset = AADDIW, -p.From.Offset + case ASLLW: +@@ -103,6 +101,18 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.As = ASRLIW + case ASRAW: + p.As = ASRAIW ++ case AROR: ++ p.As = ARORI ++ case ARORW: ++ p.As = ARORIW ++ case ABCLR: ++ p.As = ABCLRI ++ case ABEXT: ++ p.As = ABEXTI ++ case ABINV: ++ p.As = ABINVI ++ case ABSET: ++ p.As = ABSETI + } + } + +@@ -1107,6 
+1117,13 @@ func wantEvenOffset(ctxt *obj.Link, ins *instruction, offset int64) { + } + } + ++func validateRII(ctxt *obj.Link, ins *instruction) { ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRIII(ctxt *obj.Link, ins *instruction) { + wantIntReg(ctxt, ins, "rd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +@@ -1260,6 +1277,10 @@ func encodeR4(as obj.As, rs1, rs2, rs3, rd, funct3, funct2 uint32) uint32 { + return rs3<<27 | funct2<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | funct3<<12 | rd<<7 | enc.opcode + } + ++func encodeRII(ins *instruction) uint32 { ++ return encodeR(ins.as, regI(ins.rs1), 0, regI(ins.rd), ins.funct3, ins.funct7) ++} ++ + func encodeRIII(ins *instruction) uint32 { + return encodeR(ins.as, regI(ins.rs1), regI(ins.rs2), regI(ins.rd), ins.funct3, ins.funct7) + } +@@ -1491,6 +1512,7 @@ var ( + // indicates an S-type instruction with rs2 being a float register. + + rIIIEncoding = encoding{encode: encodeRIII, validate: validateRIII, length: 4} ++ rIIEncoding = encoding{encode: encodeRII, validate: validateRII, length: 4} + rFFFEncoding = encoding{encode: encodeRFFF, validate: validateRFFF, length: 4} + rFFFFEncoding = encoding{encode: encodeRFFFF, validate: validateRFFFF, length: 4} + rFFIEncoding = encoding{encode: encodeRFFI, validate: validateRFFI, length: 4} +@@ -1723,6 +1745,58 @@ var encodings = [ALAST & obj.AMask]encoding{ + AECALL & obj.AMask: iIEncoding, + AEBREAK & obj.AMask: iIEncoding, + ++ // ++ // RISC-V Bit-Manipulation ISA-extensions (1.0) ++ // ++ ++ // 1.1: Address Generation Instructions (Zba) ++ AADDUW & obj.AMask: rIIIEncoding, ++ ASH1ADD & obj.AMask: rIIIEncoding, ++ ASH1ADDUW & obj.AMask: rIIIEncoding, ++ ASH2ADD & obj.AMask: rIIIEncoding, ++ ASH2ADDUW & obj.AMask: rIIIEncoding, ++ ASH3ADD & obj.AMask: rIIIEncoding, ++ ASH3ADDUW & obj.AMask: rIIIEncoding, ++ ASLLIUW & obj.AMask: iIEncoding, ++ ++ // 1.2: Basic Bit Manipulation (Zbb) ++ AANDN & obj.AMask: rIIIEncoding, ++ ACLZ & obj.AMask: rIIEncoding, ++ ACLZW & obj.AMask: rIIEncoding, ++ ACPOP & obj.AMask: rIIEncoding, ++ ACPOPW & obj.AMask: rIIEncoding, ++ ACTZ & obj.AMask: rIIEncoding, ++ ACTZW & obj.AMask: rIIEncoding, ++ AMAX & obj.AMask: rIIIEncoding, ++ AMAXU & obj.AMask: rIIIEncoding, ++ AMIN & obj.AMask: rIIIEncoding, ++ AMINU & obj.AMask: rIIIEncoding, ++ AORN & obj.AMask: rIIIEncoding, ++ ASEXTB & obj.AMask: rIIEncoding, ++ ASEXTH & obj.AMask: rIIEncoding, ++ AXNOR & obj.AMask: rIIIEncoding, ++ AZEXTH & obj.AMask: rIIEncoding, ++ ++ // 1.3: Bitwise Rotation (Zbb) ++ AROL & obj.AMask: rIIIEncoding, ++ AROLW & obj.AMask: rIIIEncoding, ++ AROR & obj.AMask: rIIIEncoding, ++ ARORI & obj.AMask: iIEncoding, ++ ARORIW & obj.AMask: iIEncoding, ++ ARORW & obj.AMask: rIIIEncoding, ++ AORCB & obj.AMask: iIEncoding, ++ AREV8 & obj.AMask: iIEncoding, ++ ++ // 1.5: Single-bit Instructions (Zbs) ++ ABCLR & obj.AMask: rIIIEncoding, ++ ABCLRI & obj.AMask: iIEncoding, ++ ABEXT & obj.AMask: rIIIEncoding, ++ ABEXTI & obj.AMask: iIEncoding, ++ ABINV & obj.AMask: rIIIEncoding, ++ ABINVI & obj.AMask: iIEncoding, ++ ABSET & obj.AMask: rIIIEncoding, ++ ABSETI & obj.AMask: iIEncoding, ++ + // Escape hatch + AWORD & obj.AMask: rawEncoding, + +@@ -2420,6 +2494,12 @@ func instructionsForProg(p *obj.Prog) []*instruction { + if ins.imm < 0 || ins.imm > 31 { + p.Ctxt.Diag("%v: shift amount out of range 0 to 31", p) + } ++ ++ case ACLZ, ACLZW, ACTZ, ACTZW, ACPOP, ACPOPW, ASEXTB, 
ASEXTH, AZEXTH: ++ ins.rs1, ins.rs2 = uint32(p.From.Reg), obj.REG_NONE ++ ++ case AORCB, AREV8: ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch b/2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch new file mode 100644 index 0000000..5b43913 --- /dev/null +++ b/2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch @@ -0,0 +1,118 @@ +From 15597b92557425f079d2a780cb4a592ad4288f0a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 034/119] cmd/internal/obj/riscv: improve register + MOVB/MOVH/MOVBU/MOVHU for rva22u64 + +When GORISCV64 enables rva22u64, use SEXTB for MOVB, SEXTH for MOVH, ZEXTH +for MOVHU and ADDUW for MOVWU. These are single instruction alternatives +to the two instruction shift sequences that are needed otherwise. + +Change-Id: Iea5e394f57e238ae8771400a87287c1ee507d44c +Reviewed-on: https://go-review.googlesource.com/c/go/+/572736 +Reviewed-by: David Chase +Run-TryBot: Joel Sing +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +TryBot-Result: Gopher Robot +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 8 ++-- + src/cmd/internal/obj/riscv/obj.go | 49 ++++++++++++++------- + 2 files changed, 37 insertions(+), 20 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 64170340dc..5296a34d09 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -462,12 +462,12 @@ start: + MOVW X5, (X6) // 23205300 + MOVW X5, 4(X6) // 23225300 + +- MOVB X5, X6 // 1393820313538343 +- MOVH X5, X6 // 1393020313530343 ++ MOVB X5, X6 // 1393820313538343 or 13934260 ++ MOVH X5, X6 // 1393020313530343 or 13935260 + MOVW X5, X6 // 1b830200 + MOVBU X5, X6 // 13f3f20f +- MOVHU X5, X6 // 1393020313530303 +- MOVWU X5, X6 // 1393020213530302 ++ MOVHU X5, X6 // 1393020313530303 or 3bc30208 ++ MOVWU X5, X6 // 1393020213530302 or 3b830208 + + MOVF 4(X5), F0 // 07a04200 + MOVF F0, 4(X5) // 27a20200 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 8115350a9e..f731359f7f 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -26,6 +26,7 @@ import ( + "cmd/internal/sys" + "fmt" + "internal/abi" ++ "internal/buildcfg" + "log" + "math/bits" + "strings" +@@ -2156,25 +2157,41 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + case AMOVD: // MOVD Ra, Rb -> FSGNJD Ra, Ra, Rb + ins.as, ins.rs1 = AFSGNJD, uint32(p.From.Reg) + case AMOVB, AMOVH: +- // Use SLLI/SRAI to extend. +- ins.as, ins.rs1, ins.rs2 = ASLLI, uint32(p.From.Reg), obj.REG_NONE +- if p.As == AMOVB { +- ins.imm = 56 +- } else if p.As == AMOVH { +- ins.imm = 48 ++ if buildcfg.GORISCV64 >= 22 { ++ // Use SEXTB or SEXTH to extend. ++ ins.as, ins.rs1, ins.rs2 = ASEXTB, uint32(p.From.Reg), obj.REG_NONE ++ if p.As == AMOVH { ++ ins.as = ASEXTH ++ } ++ } else { ++ // Use SLLI/SRAI sequence to extend. 
++ ins.as, ins.rs1, ins.rs2 = ASLLI, uint32(p.From.Reg), obj.REG_NONE ++ if p.As == AMOVB { ++ ins.imm = 56 ++ } else if p.As == AMOVH { ++ ins.imm = 48 ++ } ++ ins2 := &instruction{as: ASRAI, rd: ins.rd, rs1: ins.rd, imm: ins.imm} ++ inss = append(inss, ins2) + } +- ins2 := &instruction{as: ASRAI, rd: ins.rd, rs1: ins.rd, imm: ins.imm} +- inss = append(inss, ins2) + case AMOVHU, AMOVWU: +- // Use SLLI/SRLI to extend. +- ins.as, ins.rs1, ins.rs2 = ASLLI, uint32(p.From.Reg), obj.REG_NONE +- if p.As == AMOVHU { +- ins.imm = 48 +- } else if p.As == AMOVWU { +- ins.imm = 32 ++ if buildcfg.GORISCV64 >= 22 { ++ // Use ZEXTH or ADDUW to extend. ++ ins.as, ins.rs1, ins.rs2, ins.imm = AZEXTH, uint32(p.From.Reg), obj.REG_NONE, 0 ++ if p.As == AMOVWU { ++ ins.as, ins.rs2 = AADDUW, REG_ZERO ++ } ++ } else { ++ // Use SLLI/SRLI sequence to extend. ++ ins.as, ins.rs1, ins.rs2 = ASLLI, uint32(p.From.Reg), obj.REG_NONE ++ if p.As == AMOVHU { ++ ins.imm = 48 ++ } else if p.As == AMOVWU { ++ ins.imm = 32 ++ } ++ ins2 := &instruction{as: ASRLI, rd: ins.rd, rs1: ins.rd, imm: ins.imm} ++ inss = append(inss, ins2) + } +- ins2 := &instruction{as: ASRLI, rd: ins.rd, rs1: ins.rd, imm: ins.imm} +- inss = append(inss, ins2) + } + + case p.From.Type == obj.TYPE_MEM && p.To.Type == obj.TYPE_REG: +-- +2.39.5 + diff --git a/2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch b/2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch new file mode 100644 index 0000000..bf23dda --- /dev/null +++ b/2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch @@ -0,0 +1,62 @@ +From b230277ae2437913fb5e906aeeab9748e48a1dfd Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 035/119] cmd/internal/obj/riscv: use native rotation + instructions for rva22u64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When rva22u64 is available, we can now use the native rotation instructions +from the Zbb extension. Use these instead of synthesising rotation +instructions. 
+ +This provides a significant performance gain for SHA-512, the following +benchmarked on a StarFive VisionFive 2: + + │ sha512.rva20u64 │ sha512.rva22u64 │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 859.4Ki ± 0% 1337.9Ki ± 0% +55.68% (p=0.000 n=10) +Hash8Bytes/Sum384-4 888.7Ki ± 1% 1308.6Ki ± 1% +47.25% (p=0.000 n=10) +Hash8Bytes/Sum512-4 869.1Ki ± 0% 1269.5Ki ± 1% +46.07% (p=0.000 n=10) +Hash1K/New-4 19.83Mi ± 0% 29.03Mi ± 0% +46.38% (p=0.000 n=10) +Hash1K/Sum384-4 20.00Mi ± 0% 28.86Mi ± 0% +44.30% (p=0.000 n=10) +Hash1K/Sum512-4 19.93Mi ± 0% 28.72Mi ± 0% +44.11% (p=0.000 n=10) +Hash8K/New-4 23.85Mi ± 0% 34.12Mi ± 0% +43.09% (p=0.000 n=10) +Hash8K/Sum384-4 23.88Mi ± 0% 34.09Mi ± 0% +42.77% (p=0.000 n=10) +Hash8K/Sum512-4 23.87Mi ± 0% 34.07Mi ± 0% +42.71% (p=0.000 n=10) +geomean 7.399Mi 10.78Mi +45.77% + +Change-Id: I9dca8e3f311eea101684c806cb998872dc697288 +Reviewed-on: https://go-review.googlesource.com/c/go/+/572716 +Run-TryBot: Joel Sing +Reviewed-by: David Chase +TryBot-Result: Gopher Robot +Reviewed-by: M Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Matthew Dempsky +Reviewed-by: Mark Ryan +Auto-Submit: Emmanuel Odeke +--- + src/cmd/internal/obj/riscv/obj.go | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index f731359f7f..579ac43810 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2290,6 +2290,11 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + + // instructionsForRotate returns the machine instructions for a bitwise rotation. + func instructionsForRotate(p *obj.Prog, ins *instruction) []*instruction { ++ if buildcfg.GORISCV64 >= 22 { ++ // Rotation instructions are supported natively. ++ return []*instruction{ins} ++ } ++ + switch ins.as { + case AROL, AROLW, AROR, ARORW: + // ROL -> OR (SLL x y) (SRL x (NEG y)) +-- +2.39.5 + diff --git a/2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch b/2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch new file mode 100644 index 0000000..99ae0cb --- /dev/null +++ b/2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch @@ -0,0 +1,102 @@ +From 234c3da75367155836a25d8317d349ec0e2010cc Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 036/119] cmd/internal/obj/riscv: check immediate for rotation + instructions + +Ensure that the immediate for a RORI or RORIW instruction are within range, +adding test coverage. Also use a consistent "immediate out of range" error +for both rotations and shifts. 
+ +Change-Id: Id687d7c6e028786f607e9519bbb64dab62b6cf3d +Reviewed-on: https://go-review.googlesource.com/c/go/+/572735 +Reviewed-by: M Zhuo +Run-TryBot: Joel Sing +Reviewed-by: Dmitri Shuralyov +TryBot-Result: Gopher Robot +Reviewed-by: Than McIntosh +LUCI-TryBot-Result: Go LUCI +--- + .../asm/internal/asm/testdata/riscv64error.s | 28 +++++++++++-------- + src/cmd/internal/obj/riscv/obj.go | 18 ++++++++++-- + 2 files changed, 31 insertions(+), 15 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 2dc9db3fb1..0b0184aaa7 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -26,18 +26,22 @@ TEXT errors(SB),$0 + MOVD F0, F1, F2 // ERROR "illegal MOV instruction" + MOV X10, X11, X12 // ERROR "illegal MOV instruction" + MOVW X10, X11, X12 // ERROR "illegal MOV instruction" +- SLLI $64, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SRLI $64, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SRAI $64, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SLLI $-1, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SRLI $-1, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SRAI $-1, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SLLIW $32, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SRLIW $32, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SRAIW $32, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SLLIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SRLIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SRAIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" ++ RORI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SLLI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SRLI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SRAI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ RORI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SLLI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SRLI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SRAI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" ++ RORIW $32, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SLLIW $32, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SRLIW $32, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SRAIW $32, X5, X6 // ERROR "immediate out of range 0 to 31" ++ RORIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SLLIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SRLIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SRAIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" + SD X5, 4294967296(X6) // ERROR "constant 4294967296 too large" + SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" + SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 579ac43810..d396264a05 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2504,17 +2504,29 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AFSGNJND + ins.rs1 = uint32(p.From.Reg) + +- case AROL, AROLW, AROR, ARORW, ARORI, ARORIW: ++ case AROL, AROLW, AROR, ARORW: ++ inss = instructionsForRotate(p, ins) ++ ++ case ARORI: ++ if ins.imm < 0 || ins.imm > 63 { ++ p.Ctxt.Diag("%v: immediate out of range 0 to 63", 
p) ++ } ++ inss = instructionsForRotate(p, ins) ++ ++ case ARORIW: ++ if ins.imm < 0 || ins.imm > 31 { ++ p.Ctxt.Diag("%v: immediate out of range 0 to 31", p) ++ } + inss = instructionsForRotate(p, ins) + + case ASLLI, ASRLI, ASRAI: + if ins.imm < 0 || ins.imm > 63 { +- p.Ctxt.Diag("%v: shift amount out of range 0 to 63", p) ++ p.Ctxt.Diag("%v: immediate out of range 0 to 63", p) + } + + case ASLLIW, ASRLIW, ASRAIW: + if ins.imm < 0 || ins.imm > 31 { +- p.Ctxt.Diag("%v: shift amount out of range 0 to 31", p) ++ p.Ctxt.Diag("%v: immediate out of range 0 to 31", p) + } + + case ACLZ, ACLZW, ACTZ, ACTZW, ACPOP, ACPOPW, ASEXTB, ASEXTH, AZEXTH: +-- +2.39.5 + diff --git a/2037-test-codegen-add-Mul-test-for-riscv64.patch b/2037-test-codegen-add-Mul-test-for-riscv64.patch new file mode 100644 index 0000000..3797d36 --- /dev/null +++ b/2037-test-codegen-add-Mul-test-for-riscv64.patch @@ -0,0 +1,31 @@ +From 1c7896634d95fb272aaeabc4be0e3f72ef4aeae0 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 037/119] test/codegen: add Mul test for riscv64 + +Change-Id: I51e9832317e5dee1e3fe0772e7592b3dae95a625 +Reviewed-on: https://go-review.googlesource.com/c/go/+/586797 +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +Auto-Submit: Keith Randall +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +--- + test/codegen/mathbits.go | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index 8c971cf760..80fe9d2e0c 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -804,6 +804,7 @@ func Mul(x, y uint) (hi, lo uint) { + // ppc64x:"MULHDU","MULLD" + // s390x:"MLGR" + // mips64: "MULVU" ++ // riscv64:"MULHU","MUL" + return bits.Mul(x, y) + } + +-- +2.39.5 + diff --git a/2038-math-remove-riscv64-assembly-implementations-of-roun.patch b/2038-math-remove-riscv64-assembly-implementations-of-roun.patch new file mode 100644 index 0000000..157b13e --- /dev/null +++ b/2038-math-remove-riscv64-assembly-implementations-of-roun.patch @@ -0,0 +1,127 @@ +From c38f77412975029cb112bf435bcefdf79391bc80 Mon Sep 17 00:00:00 2001 +From: Jorropo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 038/119] math: remove riscv64 assembly implementations of + rounding + +Fixes #68322 + +This reverts commit ad377e906a8ee6f27545d83de280206dacec1e58. + +Change-Id: Ifa4811e2c679d789cc830dbff5e50301410e24d0 +Reviewed-on: https://go-review.googlesource.com/c/go/+/596516 +Reviewed-by: Than McIntosh +Reviewed-by: Keith Randall +Commit-Queue: Cuong Manh Le +Auto-Submit: Cuong Manh Le +Reviewed-by: Keith Randall +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cuong Manh Le +--- + src/math/floor_asm.go | 2 +- + src/math/floor_noasm.go | 2 +- + src/math/floor_riscv64.s | 41 ------------------------------------ + test/fixedbugs/issue68322.go | 17 +++++++++++++++ + 4 files changed, 19 insertions(+), 43 deletions(-) + delete mode 100644 src/math/floor_riscv64.s + create mode 100644 test/fixedbugs/issue68322.go + +diff --git a/src/math/floor_asm.go b/src/math/floor_asm.go +index 5cb45f5a7e..fb419d6da2 100644 +--- a/src/math/floor_asm.go ++++ b/src/math/floor_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
+ +-//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || riscv64 || s390x || wasm ++//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || s390x || wasm + + package math + +diff --git a/src/math/floor_noasm.go b/src/math/floor_noasm.go +index 6754ca8fc8..5641c7ea0a 100644 +--- a/src/math/floor_noasm.go ++++ b/src/math/floor_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !riscv64 && !s390x && !wasm ++//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !s390x && !wasm + + package math + +diff --git a/src/math/floor_riscv64.s b/src/math/floor_riscv64.s +deleted file mode 100644 +index 62ce963781..0000000000 +--- a/src/math/floor_riscv64.s ++++ /dev/null +@@ -1,41 +0,0 @@ +-// Copyright 2023 The Go Authors. All rights reserved. +-// Use of this source code is governed by a BSD-style +-// license that can be found in the LICENSE file. +- +-#include "textflag.h" +- +-#define PosInf 0x7FF0000000000000 +- +-// The rounding mode of RISC-V is different from Go spec. +- +-#define ROUNDFN(NAME, MODE) \ +-TEXT NAME(SB),NOSPLIT,$0; \ +- MOVD x+0(FP), F0; \ +- /* whether x is NaN */; \ +- FEQD F0, F0, X6; \ +- BNEZ X6, 3(PC); \ +- /* return NaN if x is NaN */; \ +- MOVD F0, ret+8(FP); \ +- RET; \ +- MOV $PosInf, X6; \ +- FMVDX X6, F1; \ +- FABSD F0, F2; \ +- /* if abs(x) > +Inf, return Inf instead of round(x) */; \ +- FLTD F1, F2, X6; \ +- /* Inf should keep same signed with x then return */; \ +- BEQZ X6, 3(PC); \ +- FCVTLD.MODE F0, X6; \ +- FCVTDL X6, F1; \ +- /* rounding will drop signed bit in RISCV, restore it */; \ +- FSGNJD F0, F1, F0; \ +- MOVD F0, ret+8(FP); \ +- RET +- +-// func archFloor(x float64) float64 +-ROUNDFN(·archFloor, RDN) +- +-// func archCeil(x float64) float64 +-ROUNDFN(·archCeil, RUP) +- +-// func archTrunc(x float64) float64 +-ROUNDFN(·archTrunc, RTZ) +diff --git a/test/fixedbugs/issue68322.go b/test/fixedbugs/issue68322.go +new file mode 100644 +index 0000000000..9b3e713d59 +--- /dev/null ++++ b/test/fixedbugs/issue68322.go +@@ -0,0 +1,17 @@ ++// run ++ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++package main ++ ++import "math" ++ ++var doNotFold = 18446744073709549568.0 ++ ++func main() { ++ if math.Trunc(doNotFold) != doNotFold { ++ panic("big (over 2**63-1) math.Trunc is incorrect") ++ } ++} +-- +2.39.5 + diff --git a/2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch b/2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch new file mode 100644 index 0000000..fa52e1a --- /dev/null +++ b/2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch @@ -0,0 +1,49 @@ +From 946d929609b1ad9b3a96ed3f02464469c6989084 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 039/119] cmd/compile: drop TODO in NilCheck for riscv64 + +Also add log as arm/amd64 do. 
+ +Change-Id: I3698993e2df0ebf3bfcf8bad5fe389affa0e8eff +Reviewed-on: https://go-review.googlesource.com/c/go/+/595355 +Reviewed-by: Keith Randall +Reviewed-by: Robert Griesemer +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/riscv64/ssa.go | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index c9e75b2180..3c57bc93bc 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -7,6 +7,7 @@ package riscv64 + import ( + "cmd/compile/internal/base" + "cmd/compile/internal/ir" ++ "cmd/compile/internal/logopt" + "cmd/compile/internal/objw" + "cmd/compile/internal/ssa" + "cmd/compile/internal/ssagen" +@@ -720,13 +721,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + + case ssa.OpRISCV64LoweredNilCheck: + // Issue a load which will fault if arg is nil. +- // TODO: optimizations. See arm and amd64 LoweredNilCheck. + p := s.Prog(riscv.AMOVB) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = riscv.REG_ZERO ++ if logopt.Enabled() { ++ logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name) ++ } + if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos == 1 in generated wrappers + base.WarnfAt(v.Pos, "generated nil check") + } +-- +2.39.5 + diff --git a/2040-math-big-implement-addVV-in-riscv64-assembly.patch b/2040-math-big-implement-addVV-in-riscv64-assembly.patch new file mode 100644 index 0000000..ec5aa2a --- /dev/null +++ b/2040-math-big-implement-addVV-in-riscv64-assembly.patch @@ -0,0 +1,148 @@ +From 2ce9d3b1f780c80d0dcbcf2efe3972ba26b5ba7a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 040/119] math/big: implement addVV in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of addVV for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. 
+ +On a StarFive VisionFive 2: + + │ addvv.1 │ addvv.2 │ + │ sec/op │ sec/op vs base │ +AddVV/1-4 73.45n ± 0% 48.08n ± 0% -34.54% (p=0.000 n=10) +AddVV/2-4 88.14n ± 0% 58.76n ± 0% -33.33% (p=0.000 n=10) +AddVV/3-4 102.80n ± 0% 69.44n ± 0% -32.45% (p=0.000 n=10) +AddVV/4-4 117.50n ± 0% 72.18n ± 0% -38.57% (p=0.000 n=10) +AddVV/5-4 132.20n ± 0% 82.79n ± 0% -37.38% (p=0.000 n=10) +AddVV/10-4 216.3n ± 0% 126.8n ± 0% -41.35% (p=0.000 n=10) +AddVV/100-4 1659.0n ± 0% 885.2n ± 0% -46.64% (p=0.000 n=10) +AddVV/1000-4 16.089µ ± 0% 8.400µ ± 0% -47.79% (p=0.000 n=10) +AddVV/10000-4 245.3µ ± 0% 176.9µ ± 0% -27.88% (p=0.000 n=10) +AddVV/100000-4 2.537m ± 0% 1.873m ± 0% -26.17% (p=0.000 n=10) +geomean 1.435µ 904.5n -36.99% + + │ addvv.1 │ addvv.2 │ + │ B/s │ B/s vs base │ +AddVV/1-4 830.9Mi ± 0% 1269.5Mi ± 0% +52.78% (p=0.000 n=10) +AddVV/2-4 1.353Gi ± 0% 2.029Gi ± 0% +50.00% (p=0.000 n=10) +AddVV/3-4 1.739Gi ± 0% 2.575Gi ± 0% +48.09% (p=0.000 n=10) +AddVV/4-4 2.029Gi ± 0% 3.303Gi ± 0% +62.82% (p=0.000 n=10) +AddVV/5-4 2.254Gi ± 0% 3.600Gi ± 0% +59.69% (p=0.000 n=10) +AddVV/10-4 2.755Gi ± 0% 4.699Gi ± 0% +70.54% (p=0.000 n=10) +AddVV/100-4 3.594Gi ± 0% 6.734Gi ± 0% +87.37% (p=0.000 n=10) +AddVV/1000-4 3.705Gi ± 0% 7.096Gi ± 0% +91.54% (p=0.000 n=10) +AddVV/10000-4 2.430Gi ± 0% 3.369Gi ± 0% +38.65% (p=0.000 n=10) +AddVV/100000-4 2.350Gi ± 0% 3.183Gi ± 0% +35.44% (p=0.000 n=10) +geomean 2.119Gi 3.364Gi +58.71% + +Change-Id: I727b3d9f8ab01eada7270046480b1430d56d0a96 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595395 +Reviewed-by: Cherry Mui +Reviewed-by: David Chase +Reviewed-by: M Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Than McIntosh +--- + src/math/big/arith_riscv64.s | 81 +++++++++++++++++++++++++++++++++++- + 1 file changed, 80 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index cb9ac18292..56e7a2bf4f 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -11,7 +11,86 @@ + // arithmetic operations on vectors implemented in arith.go. 
+ + TEXT ·addVV(SB),NOSPLIT,$0 +- JMP ·addVV_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV $0, X29 // c = 0 ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 0(X6), X9 // y[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 8(X6), X12 // y[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 16(X6), X15 // y[2] ++ MOV 24(X5), X17 // x[3] ++ MOV 24(X6), X18 // y[3] ++ ++ ADD X8, X9, X21 // z[0] = x[0] + y[0] ++ SLTU X8, X21, X22 ++ ADD X21, X29, X10 // z[0] = x[0] + y[0] + c ++ SLTU X21, X10, X23 ++ ADD X22, X23, X29 // next c ++ ++ ADD X11, X12, X24 // z[1] = x[1] + y[1] ++ SLTU X11, X24, X25 ++ ADD X24, X29, X13 // z[1] = x[1] + y[1] + c ++ SLTU X24, X13, X26 ++ ADD X25, X26, X29 // next c ++ ++ ADD X14, X15, X21 // z[2] = x[2] + y[2] ++ SLTU X14, X21, X22 ++ ADD X21, X29, X16 // z[2] = x[2] + y[2] + c ++ SLTU X21, X16, X23 ++ ADD X22, X23, X29 // next c ++ ++ ADD X17, X18, X21 // z[3] = x[3] + y[3] ++ SLTU X17, X21, X22 ++ ADD X21, X29, X19 // z[3] = x[3] + y[3] + c ++ SLTU X21, X19, X23 ++ ADD X22, X23, X29 // next c ++ ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X6 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ MOV 0(X6), X11 // y ++ ++ ADD X10, X11, X12 // z = x + y ++ SLTU X10, X12, X14 ++ ADD X12, X29, X13 // z = x + y + c ++ SLTU X12, X13, X15 ++ ADD X14, X15, X29 // next c ++ ++ MOV X13, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X6 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+72(FP) // return c ++ RET + + TEXT ·subVV(SB),NOSPLIT,$0 + JMP ·subVV_g(SB) +-- +2.39.5 + diff --git a/2041-math-big-implement-subVV-in-riscv64-assembly.patch b/2041-math-big-implement-subVV-in-riscv64-assembly.patch new file mode 100644 index 0000000..b7b4d7c --- /dev/null +++ b/2041-math-big-implement-subVV-in-riscv64-assembly.patch @@ -0,0 +1,148 @@ +From e89ee68e880af855a61c4b29f10d6db4aa86598d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 041/119] math/big: implement subVV in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of subVV for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. 
+ +On a StarFive VisionFive 2: + + │ subvv.1 │ subvv.2 │ + │ sec/op │ sec/op vs base │ +SubVV/1-4 73.46n ± 0% 48.08n ± 0% -34.55% (p=0.000 n=10) +SubVV/2-4 88.13n ± 0% 58.76n ± 0% -33.33% (p=0.000 n=10) +SubVV/3-4 102.80n ± 0% 69.45n ± 0% -32.44% (p=0.000 n=10) +SubVV/4-4 117.50n ± 0% 72.11n ± 0% -38.63% (p=0.000 n=10) +SubVV/5-4 132.20n ± 0% 82.80n ± 0% -37.37% (p=0.000 n=10) +SubVV/10-4 216.3n ± 0% 126.9n ± 0% -41.33% (p=0.000 n=10) +SubVV/100-4 1659.0n ± 0% 886.5n ± 0% -46.56% (p=0.000 n=10) +SubVV/1000-4 16.089µ ± 0% 8.401µ ± 0% -47.78% (p=0.000 n=10) +SubVV/10000-4 244.7µ ± 0% 176.8µ ± 0% -27.74% (p=0.000 n=10) +SubVV/100000-4 2.562m ± 0% 1.871m ± 0% -26.96% (p=0.000 n=10) +geomean 1.436µ 904.4n -37.04% + + │ subvv.1 │ subvv.2 │ + │ B/s │ B/s vs base │ +SubVV/1-4 830.9Mi ± 0% 1269.5Mi ± 0% +52.79% (p=0.000 n=10) +SubVV/2-4 1.353Gi ± 0% 2.029Gi ± 0% +49.99% (p=0.000 n=10) +SubVV/3-4 1.739Gi ± 0% 2.575Gi ± 0% +48.06% (p=0.000 n=10) +SubVV/4-4 2.029Gi ± 0% 3.306Gi ± 0% +62.96% (p=0.000 n=10) +SubVV/5-4 2.254Gi ± 0% 3.600Gi ± 0% +59.67% (p=0.000 n=10) +SubVV/10-4 2.755Gi ± 0% 4.699Gi ± 0% +70.53% (p=0.000 n=10) +SubVV/100-4 3.594Gi ± 0% 6.723Gi ± 0% +87.08% (p=0.000 n=10) +SubVV/1000-4 3.705Gi ± 0% 7.095Gi ± 0% +91.52% (p=0.000 n=10) +SubVV/10000-4 2.436Gi ± 0% 3.372Gi ± 0% +38.39% (p=0.000 n=10) +SubVV/100000-4 2.327Gi ± 0% 3.185Gi ± 0% +36.91% (p=0.000 n=10) +geomean 2.118Gi 3.364Gi +58.84% + +Change-Id: I361cb3f4195b27a9f1e9486c9e1fdbeaa94d32b4 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595396 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +Reviewed-by: Carlos Amedee +--- + src/math/big/arith_riscv64.s | 81 +++++++++++++++++++++++++++++++++++- + 1 file changed, 80 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index 56e7a2bf4f..f29933d2a1 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -93,7 +93,86 @@ done: + RET + + TEXT ·subVV(SB),NOSPLIT,$0 +- JMP ·subVV_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV $0, X29 // b = 0 ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 0(X6), X9 // y[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 8(X6), X12 // y[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 16(X6), X15 // y[2] ++ MOV 24(X5), X17 // x[3] ++ MOV 24(X6), X18 // y[3] ++ ++ SUB X9, X8, X21 // z[0] = x[0] - y[0] ++ SLTU X21, X8, X22 ++ SUB X29, X21, X10 // z[0] = x[0] - y[0] - b ++ SLTU X10, X21, X23 ++ ADD X22, X23, X29 // next b ++ ++ SUB X12, X11, X24 // z[1] = x[1] - y[1] ++ SLTU X24, X11, X25 ++ SUB X29, X24, X13 // z[1] = x[1] - y[1] - b ++ SLTU X13, X24, X26 ++ ADD X25, X26, X29 // next b ++ ++ SUB X15, X14, X21 // z[2] = x[2] - y[2] ++ SLTU X21, X14, X22 ++ SUB X29, X21, X16 // z[2] = x[2] - y[2] - b ++ SLTU X16, X21, X23 ++ ADD X22, X23, X29 // next b ++ ++ SUB X18, X17, X21 // z[3] = x[3] - y[3] ++ SLTU X21, X17, X22 ++ SUB X29, X21, X19 // z[3] = x[3] - y[3] - b ++ SLTU X19, X21, X23 ++ ADD X22, X23, X29 // next b ++ ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X6 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ MOV 0(X6), X11 // y ++ ++ SUB X11, X10, X12 // z = x - y ++ SLTU X12, X10, X14 ++ SUB X29, X12, X13 // z = x - y - b ++ SLTU X13, X12, X15 ++ ADD X14, X15, X29 // next b ++ ++ 
MOV X13, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X6 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+72(FP) // return b ++ RET + + TEXT ·addVW(SB),NOSPLIT,$0 + JMP ·addVW_g(SB) +-- +2.39.5 + diff --git a/2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch b/2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch new file mode 100644 index 0000000..f2d97ff --- /dev/null +++ b/2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch @@ -0,0 +1,360 @@ +From 27901e4870c6f988a9a1dbdb0a6aedda301a66bc Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 042/119] cmd/compile: use integer min/max instructions on + riscv64 + +When GORISCV64 enables rva22u64, make use of integer MIN/MINU/MAX/MAXU +instructions in compiler rewrite rules. + +Change-Id: I4e7c514516acad03f2869d4c8936f06582cf7ea9 +Reviewed-on: https://go-review.googlesource.com/c/go/+/559660 +Reviewed-by: David Chase +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +--- + src/cmd/compile/internal/riscv64/ssa.go | 4 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 10 +++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 6 ++ + .../compile/internal/ssa/_gen/genericOps.go | 6 ++ + src/cmd/compile/internal/ssa/opGen.go | 88 +++++++++++++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 81 +++++++++++++++++ + src/cmd/compile/internal/ssagen/ssa.go | 19 ++++ + 7 files changed, 212 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 3c57bc93bc..10fea07e60 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -288,8 +288,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FADDS, ssa.OpRISCV64FSUBS, ssa.OpRISCV64FMULS, ssa.OpRISCV64FDIVS, + ssa.OpRISCV64FEQS, ssa.OpRISCV64FNES, ssa.OpRISCV64FLTS, ssa.OpRISCV64FLES, + ssa.OpRISCV64FADDD, ssa.OpRISCV64FSUBD, ssa.OpRISCV64FMULD, ssa.OpRISCV64FDIVD, +- ssa.OpRISCV64FEQD, ssa.OpRISCV64FNED, ssa.OpRISCV64FLTD, ssa.OpRISCV64FLED, +- ssa.OpRISCV64FSGNJD: ++ ssa.OpRISCV64FEQD, ssa.OpRISCV64FNED, ssa.OpRISCV64FLTD, ssa.OpRISCV64FLED, ssa.OpRISCV64FSGNJD, ++ ssa.OpRISCV64MIN, ssa.OpRISCV64MAX, ssa.OpRISCV64MINU, ssa.OpRISCV64MAXU: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index c2df433315..7d8fb79e17 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -834,3 +834,13 @@ + (F(MADD|NMADD|MSUB|NMSUB)S x y neg:(FNEGS z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)S x y z) + (F(MADD|NMADD|MSUB|NMSUB)D neg:(FNEGD x) y z) && neg.Uses == 1 => (F(NMSUB|MSUB|NMADD|MADD)D x y z) + (F(MADD|NMADD|MSUB|NMSUB)D x y neg:(FNEGD z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)D x y z) ++ ++// ++// Optimisations for rva22u64 and above. ++// ++ ++// Integer minimum and maximum. 
++(Min64 x y) && buildcfg.GORISCV64 >= 22 => (MIN x y) ++(Max64 x y) && buildcfg.GORISCV64 >= 22 => (MAX x y) ++(Min64u x y) && buildcfg.GORISCV64 >= 22 => (MINU x y) ++(Max64u x y) && buildcfg.GORISCV64 >= 22 => (MAXU x y) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 13fa91864b..7323cb119c 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -235,6 +235,12 @@ func init() { + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 + {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint + ++ // Minimum and maximum ++ {name: "MIN", argLength: 2, reg: gp21, asm: "MIN", commutative: true}, // min(arg0,arg1), signed ++ {name: "MAX", argLength: 2, reg: gp21, asm: "MAX", commutative: true}, // max(arg0,arg1), signed ++ {name: "MINU", argLength: 2, reg: gp21, asm: "MINU", commutative: true}, // min(arg0,arg1), unsigned ++ {name: "MAXU", argLength: 2, reg: gp21, asm: "MAXU", commutative: true}, // max(arg0,arg1), unsigned ++ + // Generate boolean values + {name: "SEQZ", argLength: 1, reg: gp11, asm: "SEQZ"}, // arg0 == 0, result is 0 or 1 + {name: "SNEZ", argLength: 1, reg: gp11, asm: "SNEZ"}, // arg0 != 0, result is 0 or 1 +diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go +index fb18319263..95a5a4dda9 100644 +--- a/src/cmd/compile/internal/ssa/_gen/genericOps.go ++++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go +@@ -285,6 +285,12 @@ var genericOps = []opData{ + {name: "Abs", argLength: 1}, // absolute value arg0 + {name: "Copysign", argLength: 2}, // copy sign from arg0 to arg1 + ++ // Integer min/max implementation, if hardware is available. ++ {name: "Min64", argLength: 2}, // min(arg0,arg1), signed ++ {name: "Max64", argLength: 2}, // max(arg0,arg1), signed ++ {name: "Min64u", argLength: 2}, // min(arg0,arg1), unsigned ++ {name: "Max64u", argLength: 2}, // max(arg0,arg1), unsigned ++ + // Float min/max implementation, if hardware is available. 
+ {name: "Min64F", argLength: 2}, // min(arg0,arg1) + {name: "Min32F", argLength: 2}, // min(arg0,arg1) +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index dd80a2c52a..600b8d9e30 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2394,6 +2394,10 @@ const ( + OpRISCV64RORW + OpRISCV64XOR + OpRISCV64XORI ++ OpRISCV64MIN ++ OpRISCV64MAX ++ OpRISCV64MINU ++ OpRISCV64MAXU + OpRISCV64SEQZ + OpRISCV64SNEZ + OpRISCV64SLT +@@ -3035,6 +3039,10 @@ const ( + OpRoundToEven + OpAbs + OpCopysign ++ OpMin64 ++ OpMax64 ++ OpMin64u ++ OpMax64u + OpMin64F + OpMin32F + OpMax64F +@@ -32124,6 +32132,66 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "MIN", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AMIN, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "MAX", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AMAX, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "MINU", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AMINU, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "MAXU", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AMAXU, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SEQZ", + argLen: 1, +@@ -39373,6 +39441,26 @@ var opcodeTable = [...]opInfo{ + argLen: 2, + generic: true, + }, ++ { ++ name: "Min64", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Max64", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Min64u", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Max64u", ++ argLen: 2, ++ generic: true, ++ }, + { + name: "Min64F", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 28c44da5a8..9a13955689 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -4,6 +4,7 @@ package ssa + + import ( + 
"cmd/compile/internal/types" ++ "internal/buildcfg" + "math" + ) + +@@ -331,15 +332,23 @@ func rewriteValueRISCV64(v *Value) bool { + case OpMax32F: + v.Op = OpRISCV64LoweredFMAXS + return true ++ case OpMax64: ++ return rewriteValueRISCV64_OpMax64(v) + case OpMax64F: + v.Op = OpRISCV64LoweredFMAXD + return true ++ case OpMax64u: ++ return rewriteValueRISCV64_OpMax64u(v) + case OpMin32F: + v.Op = OpRISCV64LoweredFMINS + return true ++ case OpMin64: ++ return rewriteValueRISCV64_OpMin64(v) + case OpMin64F: + v.Op = OpRISCV64LoweredFMIND + return true ++ case OpMin64u: ++ return rewriteValueRISCV64_OpMin64u(v) + case OpMod16: + return rewriteValueRISCV64_OpMod16(v) + case OpMod16u: +@@ -2398,6 +2407,78 @@ func rewriteValueRISCV64_OpLsh8x8(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpMax64(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (Max64 x y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (MAX x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ break ++ } ++ v.reset(OpRISCV64MAX) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpMax64u(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (Max64u x y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (MAXU x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ break ++ } ++ v.reset(OpRISCV64MAXU) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpMin64(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (Min64 x y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (MIN x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ break ++ } ++ v.reset(OpRISCV64MIN) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpMin64u(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (Min64u x y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (MINU x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ break ++ } ++ v.reset(OpRISCV64MINU) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpMod16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index 178ccfb59b..a0a3470ea2 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -3624,6 +3624,25 @@ func (s *state) minMax(n *ir.CallExpr) *ssa.Value { + }) + } + ++ if typ.IsInteger() { ++ if Arch.LinkArch.Family == sys.RISCV64 && buildcfg.GORISCV64 >= 22 && typ.Size() == 8 { ++ var op ssa.Op ++ switch { ++ case typ.IsSigned() && n.Op() == ir.OMIN: ++ op = ssa.OpMin64 ++ case typ.IsSigned() && n.Op() == ir.OMAX: ++ op = ssa.OpMax64 ++ case typ.IsUnsigned() && n.Op() == ir.OMIN: ++ op = ssa.OpMin64u ++ case typ.IsUnsigned() && n.Op() == ir.OMAX: ++ op = ssa.OpMax64u ++ } ++ return fold(func(x, a *ssa.Value) *ssa.Value { ++ return s.newValue2(op, typ, x, a) ++ }) ++ } ++ } ++ + lt := s.ssaOp(ir.OLT, typ) + + return fold(func(x, a *ssa.Value) *ssa.Value { +-- +2.39.5 + diff --git a/2043-math-big-implement-addVW-in-riscv64-assembly.patch b/2043-math-big-implement-addVW-in-riscv64-assembly.patch new file mode 100644 index 0000000..6ce9221 --- /dev/null +++ b/2043-math-big-implement-addVW-in-riscv64-assembly.patch @@ -0,0 +1,146 @@ +From bb58f5dd1ad9a3c5f95a372a9e2ee6c97032a442 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: 
Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 043/119] math/big: implement addVW in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of addVW for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. + +On a StarFive VisionFive 2: + + │ addvw.1 │ addvw.2 │ + │ sec/op │ sec/op vs base │ +AddVW/1-4 57.43n ± 0% 41.45n ± 0% -27.83% (p=0.000 n=10) +AddVW/2-4 69.31n ± 0% 48.15n ± 0% -30.53% (p=0.000 n=10) +AddVW/3-4 76.12n ± 0% 54.97n ± 0% -27.79% (p=0.000 n=10) +AddVW/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) +AddVW/5-4 96.16n ± 0% 62.82n ± 0% -34.67% (p=0.000 n=10) +AddVW/10-4 149.60n ± 0% 89.55n ± 0% -40.14% (p=0.000 n=10) +AddVW/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) +AddVW/1000-4 10.732µ ± 0% 5.060µ ± 0% -52.85% (p=0.000 n=10) +AddVW/10000-4 151.7µ ± 0% 103.7µ ± 0% -31.63% (p=0.000 n=10) +AddVW/100000-4 1.523m ± 0% 1.050m ± 0% -31.03% (p=0.000 n=10) +AddVWext/1-4 57.42n ± 0% 41.45n ± 0% -27.81% (p=0.000 n=10) +AddVWext/2-4 69.32n ± 0% 48.15n ± 0% -30.54% (p=0.000 n=10) +AddVWext/3-4 76.12n ± 0% 54.87n ± 0% -27.92% (p=0.000 n=10) +AddVWext/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) +AddVWext/5-4 96.15n ± 0% 62.82n ± 0% -34.66% (p=0.000 n=10) +AddVWext/10-4 149.60n ± 0% 89.55n ± 0% -40.14% (p=0.000 n=10) +AddVWext/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) +AddVWext/1000-4 10.732µ ± 0% 5.060µ ± 0% -52.85% (p=0.000 n=10) +AddVWext/10000-4 150.5µ ± 0% 103.7µ ± 0% -31.10% (p=0.000 n=10) +AddVWext/100000-4 1.530m ± 0% 1.049m ± 0% -31.41% (p=0.000 n=10) +geomean 1.003µ 633.9n -36.79% + + │ addvw.1 │ addvw.2 │ + │ B/s │ B/s vs base │ +AddVW/1-4 132.8Mi ± 0% 184.1Mi ± 0% +38.55% (p=0.000 n=10) +AddVW/2-4 220.1Mi ± 0% 316.9Mi ± 0% +43.96% (p=0.000 n=10) +AddVW/3-4 300.7Mi ± 0% 416.4Mi ± 0% +38.48% (p=0.000 n=10) +AddVW/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.25% (p=0.000 n=10) +AddVW/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.06% (p=0.000 n=10) +AddVW/10-4 510.1Mi ± 0% 852.0Mi ± 0% +67.02% (p=0.000 n=10) +AddVW/100-4 684.1Mi ± 0% 1389.0Mi ± 0% +103.03% (p=0.000 n=10) +AddVW/1000-4 710.9Mi ± 0% 1507.8Mi ± 0% +112.08% (p=0.000 n=10) +AddVW/10000-4 503.1Mi ± 0% 735.8Mi ± 0% +46.26% (p=0.000 n=10) +AddVW/100000-4 501.0Mi ± 0% 726.5Mi ± 0% +45.00% (p=0.000 n=10) +AddVWext/1-4 132.9Mi ± 0% 184.1Mi ± 0% +38.55% (p=0.000 n=10) +AddVWext/2-4 220.1Mi ± 0% 316.9Mi ± 0% +43.98% (p=0.000 n=10) +AddVWext/3-4 300.7Mi ± 0% 417.1Mi ± 0% +38.73% (p=0.000 n=10) +AddVWext/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.25% (p=0.000 n=10) +AddVWext/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.05% (p=0.000 n=10) +AddVWext/10-4 510.1Mi ± 0% 852.0Mi ± 0% +67.02% (p=0.000 n=10) +AddVWext/100-4 684.2Mi ± 0% 1389.0Mi ± 0% +103.02% (p=0.000 n=10) +AddVWext/1000-4 710.9Mi ± 0% 1507.7Mi ± 0% +112.08% (p=0.000 n=10) +AddVWext/10000-4 506.9Mi ± 0% 735.8Mi ± 0% +45.15% (p=0.000 n=10) +AddVWext/100000-4 498.6Mi ± 0% 727.0Mi ± 0% +45.79% (p=0.000 n=10) +geomean 388.3Mi 614.3Mi +58.19% + +Change-Id: Ib14a4b8c1d81e710753bbf6dd5546bbca44fe3f1 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595397 +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +--- + src/math/big/arith_riscv64.s | 59 +++++++++++++++++++++++++++++++++++- + 1 file changed, 58 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index f29933d2a1..eb11de7a2c 100644 +--- 
a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -175,7 +175,64 @@ done: + RET + + TEXT ·addVW(SB),NOSPLIT,$0 +- JMP ·addVW_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV X6, X29 // c = y ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 24(X5), X17 // x[3] ++ ++ ADD X8, X29, X10 // z[0] = x[0] + c ++ SLTU X8, X10, X29 // next c ++ ++ ADD X11, X29, X13 // z[1] = x[1] + c ++ SLTU X11, X13, X29 // next c ++ ++ ADD X14, X29, X16 // z[2] = x[2] + c ++ SLTU X14, X16, X29 // next c ++ ++ ADD X17, X29, X19 // z[3] = x[3] + c ++ SLTU X17, X19, X29 // next c ++ ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ ++ ADD X10, X29, X12 // z = x + c ++ SLTU X10, X12, X29 // next c ++ ++ MOV X12, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+56(FP) // return c ++ RET + + TEXT ·subVW(SB),NOSPLIT,$0 + JMP ·subVW_g(SB) +-- +2.39.5 + diff --git a/2044-math-big-implement-subVW-in-riscv64-assembly.patch b/2044-math-big-implement-subVW-in-riscv64-assembly.patch new file mode 100644 index 0000000..18d93ff --- /dev/null +++ b/2044-math-big-implement-subVW-in-riscv64-assembly.patch @@ -0,0 +1,146 @@ +From 5b10906c01a84e75ff5eb8a95cec158d53bea3ee Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 044/119] math/big: implement subVW in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of subVW for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. 
+ +On a StarFive VisionFive 2: + + │ subvw.1 │ subvw.2 │ + │ sec/op │ sec/op vs base │ +SubVW/1-4 57.43n ± 0% 41.45n ± 0% -27.82% (p=0.000 n=10) +SubVW/2-4 69.31n ± 0% 48.15n ± 0% -30.53% (p=0.000 n=10) +SubVW/3-4 76.12n ± 0% 54.87n ± 0% -27.92% (p=0.000 n=10) +SubVW/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) +SubVW/5-4 96.15n ± 0% 62.83n ± 0% -34.65% (p=0.000 n=10) +SubVW/10-4 149.60n ± 0% 89.55n ± 0% -40.14% (p=0.000 n=10) +SubVW/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) +SubVW/1000-4 10.732µ ± 0% 5.071µ ± 0% -52.75% (p=0.000 n=10) +SubVW/10000-4 153.0µ ± 0% 103.7µ ± 0% -32.21% (p=0.000 n=10) +SubVW/100000-4 1.542m ± 0% 1.046m ± 0% -32.13% (p=0.000 n=10) +SubVWext/1-4 57.42n ± 0% 41.45n ± 0% -27.81% (p=0.000 n=10) +SubVWext/2-4 69.33n ± 0% 48.15n ± 0% -30.55% (p=0.000 n=10) +SubVWext/3-4 76.12n ± 0% 54.93n ± 0% -27.84% (p=0.000 n=10) +SubVWext/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) +SubVWext/5-4 96.15n ± 0% 62.83n ± 0% -34.65% (p=0.000 n=10) +SubVWext/10-4 149.60n ± 0% 89.56n ± 0% -40.14% (p=0.000 n=10) +SubVWext/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) +SubVWext/1000-4 10.732µ ± 0% 5.061µ ± 0% -52.84% (p=0.000 n=10) +SubVWext/10000-4 152.5µ ± 0% 103.7µ ± 0% -32.02% (p=0.000 n=10) +SubVWext/100000-4 1.533m ± 0% 1.046m ± 0% -31.75% (p=0.000 n=10) +geomean 1.005µ 633.7n -36.92% + + │ subvw.1 │ subvw.2 │ + │ B/s │ B/s vs base │ +SubVW/1-4 132.9Mi ± 0% 184.1Mi ± 0% +38.54% (p=0.000 n=10) +SubVW/2-4 220.1Mi ± 0% 316.9Mi ± 0% +43.95% (p=0.000 n=10) +SubVW/3-4 300.7Mi ± 0% 417.1Mi ± 0% +38.72% (p=0.000 n=10) +SubVW/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.24% (p=0.000 n=10) +SubVW/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.03% (p=0.000 n=10) +SubVW/10-4 510.1Mi ± 0% 851.9Mi ± 0% +67.01% (p=0.000 n=10) +SubVW/100-4 684.2Mi ± 0% 1388.9Mi ± 0% +102.99% (p=0.000 n=10) +SubVW/1000-4 710.9Mi ± 0% 1504.5Mi ± 0% +111.63% (p=0.000 n=10) +SubVW/10000-4 498.7Mi ± 0% 735.7Mi ± 0% +47.52% (p=0.000 n=10) +SubVW/100000-4 494.8Mi ± 0% 729.1Mi ± 0% +47.34% (p=0.000 n=10) +SubVWext/1-4 132.9Mi ± 0% 184.1Mi ± 0% +38.53% (p=0.000 n=10) +SubVWext/2-4 220.1Mi ± 0% 316.9Mi ± 0% +44.00% (p=0.000 n=10) +SubVWext/3-4 300.7Mi ± 0% 416.7Mi ± 0% +38.57% (p=0.000 n=10) +SubVWext/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.24% (p=0.000 n=10) +SubVWext/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.04% (p=0.000 n=10) +SubVWext/10-4 510.1Mi ± 0% 851.9Mi ± 0% +67.01% (p=0.000 n=10) +SubVWext/100-4 684.2Mi ± 0% 1388.9Mi ± 0% +102.99% (p=0.000 n=10) +SubVWext/1000-4 710.9Mi ± 0% 1507.6Mi ± 0% +112.07% (p=0.000 n=10) +SubVWext/10000-4 500.1Mi ± 0% 735.7Mi ± 0% +47.10% (p=0.000 n=10) +SubVWext/100000-4 497.8Mi ± 0% 729.4Mi ± 0% +46.52% (p=0.000 n=10) +geomean 387.6Mi 614.5Mi +58.51% + +Change-Id: I9d7fac719e977710ad9db9121fa298db6df605de +Reviewed-on: https://go-review.googlesource.com/c/go/+/595398 +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +--- + src/math/big/arith_riscv64.s | 59 +++++++++++++++++++++++++++++++++++- + 1 file changed, 58 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index eb11de7a2c..6aca1b6d6c 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -235,7 +235,64 @@ done: + RET + + TEXT ·subVW(SB),NOSPLIT,$0 +- JMP ·subVW_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV X6, X29 // b = y ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ 
MOV 8(X5), X11 // x[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 24(X5), X17 // x[3] ++ ++ SUB X29, X8, X10 // z[0] = x[0] - b ++ SLTU X10, X8, X29 // next b ++ ++ SUB X29, X11, X13 // z[1] = x[1] - b ++ SLTU X13, X11, X29 // next b ++ ++ SUB X29, X14, X16 // z[2] = x[2] - b ++ SLTU X16, X14, X29 // next b ++ ++ SUB X29, X17, X19 // z[3] = x[3] - b ++ SLTU X19, X17, X29 // next b ++ ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ ++ SUB X29, X10, X12 // z = x - b ++ SLTU X12, X10, X29 // next b ++ ++ MOV X12, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+56(FP) // return b ++ RET + + TEXT ·shlVU(SB),NOSPLIT,$0 + JMP ·shlVU_g(SB) +-- +2.39.5 + diff --git a/2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch b/2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch new file mode 100644 index 0000000..543143f --- /dev/null +++ b/2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch @@ -0,0 +1,349 @@ +From 6cbde165e70d9890431c1e452bb29a6fc5c963c8 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 045/119] crypto/sha256: provide optimised assembly for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide an optimised assembly implementation of sha256 for riscv64. +This results in considerable performance gains. + +On a StarFive VisionFive 2: + + │ sha256.1 │ sha256.2 │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-4 7.820µ ± 0% 5.193µ ± 0% -33.59% (p=0.000 n=10) +Hash8Bytes/Sum224-4 7.918µ ± 0% 5.294µ ± 0% -33.15% (p=0.000 n=10) +Hash8Bytes/Sum256-4 7.950µ ± 0% 5.320µ ± 0% -33.08% (p=0.000 n=10) +Hash1K/New-4 108.03µ ± 0% 66.12µ ± 0% -38.79% (p=0.000 n=10) +Hash1K/Sum224-4 108.12µ ± 0% 66.22µ ± 0% -38.76% (p=0.000 n=10) +Hash1K/Sum256-4 108.15µ ± 0% 66.24µ ± 0% -38.75% (p=0.000 n=10) +Hash8K/New-4 808.5µ ± 0% 493.0µ ± 0% -39.02% (p=0.000 n=10) +Hash8K/Sum224-4 808.6µ ± 0% 493.1µ ± 0% -39.02% (p=0.000 n=10) +Hash8K/Sum256-4 808.6µ ± 0% 493.1µ ± 0% -39.02% (p=0.000 n=10) +geomean 88.37µ 55.61µ -37.08% + + │ sha256.1 │ sha256.2 │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 996.1Ki ± 0% 1503.9Ki ± 0% +50.98% (p=0.000 n=10) +Hash8Bytes/Sum224-4 986.3Ki ± 0% 1474.6Ki ± 0% +49.50% (p=0.000 n=10) +Hash8Bytes/Sum256-4 986.3Ki ± 0% 1464.8Ki ± 0% +48.51% (p=0.000 n=10) +Hash1K/New-4 9.041Mi ± 0% 14.772Mi ± 0% +63.40% (p=0.000 n=10) +Hash1K/Sum224-4 9.031Mi ± 0% 14.744Mi ± 0% +63.25% (p=0.000 n=10) +Hash1K/Sum256-4 9.031Mi ± 0% 14.744Mi ± 0% +63.25% (p=0.000 n=10) +Hash8K/New-4 9.661Mi ± 0% 15.850Mi ± 0% +64.07% (p=0.000 n=10) +Hash8K/Sum224-4 9.661Mi ± 0% 15.841Mi ± 0% +63.97% (p=0.000 n=10) +Hash8K/Sum256-4 9.661Mi ± 0% 15.841Mi ± 0% +63.97% (p=0.000 n=10) +geomean 4.386Mi 6.966Mi +58.85% + +Change-Id: Ieead7b7c02291d70ddc472a7a8cf3c044c1da4b3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/519695 +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +--- + src/crypto/sha256/sha256block_decl.go | 2 +- + src/crypto/sha256/sha256block_generic.go | 2 +- + src/crypto/sha256/sha256block_riscv64.s | 261 +++++++++++++++++++++++ + 3 files changed, 263 insertions(+), 2 deletions(-) + create mode 100644 src/crypto/sha256/sha256block_riscv64.s + +diff --git 
a/src/crypto/sha256/sha256block_decl.go b/src/crypto/sha256/sha256block_decl.go +index 7d68cd95fe..0646ef3685 100644 +--- a/src/crypto/sha256/sha256block_decl.go ++++ b/src/crypto/sha256/sha256block_decl.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build 386 || amd64 || s390x || ppc64le || ppc64 ++//go:build 386 || amd64 || s390x || ppc64le || ppc64 || riscv64 + + package sha256 + +diff --git a/src/crypto/sha256/sha256block_generic.go b/src/crypto/sha256/sha256block_generic.go +index fd098bec89..125eb8effb 100644 +--- a/src/crypto/sha256/sha256block_generic.go ++++ b/src/crypto/sha256/sha256block_generic.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !386 && !s390x && !ppc64le && !ppc64 && !arm64 ++//go:build !amd64 && !386 && !s390x && !ppc64le && !ppc64 && !arm64 && !riscv64 + + package sha256 + +diff --git a/src/crypto/sha256/sha256block_riscv64.s b/src/crypto/sha256/sha256block_riscv64.s +new file mode 100644 +index 0000000000..fc7bf65e41 +--- /dev/null ++++ b/src/crypto/sha256/sha256block_riscv64.s +@@ -0,0 +1,261 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "textflag.h" ++ ++// SHA256 block routine. See sha256block.go for Go equivalent. ++// ++// The algorithm is detailed in FIPS 180-4: ++// ++// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf ++// ++// Wt = Mt; for 0 <= t <= 15 ++// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 ++// ++// a = H0 ++// b = H1 ++// c = H2 ++// d = H3 ++// e = H4 ++// f = H5 ++// g = H6 ++// h = H7 ++// ++// for t = 0 to 63 { ++// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt ++// T2 = BIGSIGMA0(a) + Maj(a,b,c) ++// h = g ++// g = f ++// f = e ++// e = d + T1 ++// d = c ++// c = b ++// b = a ++// a = T1 + T2 ++// } ++// ++// H0 = a + H0 ++// H1 = b + H1 ++// H2 = c + H2 ++// H3 = d + H3 ++// H4 = e + H4 ++// H5 = f + H5 ++// H6 = g + H6 ++// H7 = h + H7 ++ ++// Wt = Mt; for 0 <= t <= 15 ++#define MSGSCHEDULE0(index) \ ++ MOVBU ((index*4)+0)(X29), X5; \ ++ MOVBU ((index*4)+1)(X29), X6; \ ++ MOVBU ((index*4)+2)(X29), X7; \ ++ MOVBU ((index*4)+3)(X29), X8; \ ++ SLL $24, X5; \ ++ SLL $16, X6; \ ++ OR X5, X6, X5; \ ++ SLL $8, X7; \ ++ OR X5, X7, X5; \ ++ OR X5, X8, X5; \ ++ MOVW X5, (index*4)(X19) ++ ++// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 ++// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x) ++// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x) ++#define MSGSCHEDULE1(index) \ ++ MOVWU (((index-2)&0xf)*4)(X19), X5; \ ++ MOVWU (((index-15)&0xf)*4)(X19), X6; \ ++ MOVWU (((index-7)&0xf)*4)(X19), X9; \ ++ MOVWU (((index-16)&0xf)*4)(X19), X21; \ ++ RORW $17, X5, X7; \ ++ RORW $19, X5, X8; \ ++ SRL $10, X5; \ ++ XOR X7, X5; \ ++ XOR X8, X5; \ ++ ADD X9, X5; \ ++ RORW $7, X6, X7; \ ++ RORW $18, X6, X8; \ ++ SRL $3, X6; \ ++ XOR X7, X6; \ ++ XOR X8, X6; \ ++ ADD X6, X5; \ ++ ADD X21, X5; \ ++ MOVW X5, ((index&0xf)*4)(X19) ++ ++// Calculate T1 in X5. ++// h is also used as an accumulator. Wt is passed in X5. 
++// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt ++// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x) ++// Ch(x, y, z) = (x AND y) XOR (NOT x AND z) ++#define SHA256T1(index, e, f, g, h) \ ++ MOVWU (index*4)(X18), X8; \ ++ ADD X5, h; \ ++ RORW $6, e, X6; \ ++ ADD X8, h; \ ++ RORW $11, e, X7; \ ++ XOR X7, X6; \ ++ RORW $25, e, X8; \ ++ XOR X8, X6; \ ++ ADD X6, h; \ ++ AND e, f, X5; \ ++ NOT e, X7; \ ++ AND g, X7; \ ++ XOR X7, X5; \ ++ ADD h, X5 ++ ++// Calculate T2 in X6. ++// T2 = BIGSIGMA0(a) + Maj(a, b, c) ++// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x) ++// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) ++#define SHA256T2(a, b, c) \ ++ RORW $2, a, X6; \ ++ RORW $13, a, X7; \ ++ XOR X7, X6; \ ++ RORW $22, a, X8; \ ++ XOR X8, X6; \ ++ AND a, b, X7; \ ++ AND a, c, X8; \ ++ XOR X8, X7; \ ++ AND b, c, X9; \ ++ XOR X9, X7; \ ++ ADD X7, X6 ++ ++// Calculate T1 and T2, then e = d + T1 and a = T1 + T2. ++// The values for e and a are stored in d and h, ready for rotation. ++#define SHA256ROUND(index, a, b, c, d, e, f, g, h) \ ++ SHA256T1(index, e, f, g, h); \ ++ SHA256T2(a, b, c); \ ++ MOV X6, h; \ ++ ADD X5, d; \ ++ ADD X5, h ++ ++#define SHA256ROUND0(index, a, b, c, d, e, f, g, h) \ ++ MSGSCHEDULE0(index); \ ++ SHA256ROUND(index, a, b, c, d, e, f, g, h) ++ ++#define SHA256ROUND1(index, a, b, c, d, e, f, g, h) \ ++ MSGSCHEDULE1(index); \ ++ SHA256ROUND(index, a, b, c, d, e, f, g, h) ++ ++// Note that 64 bytes of stack space is used as a circular buffer ++// for the message schedule (4 bytes * 16 entries). ++// ++// func block(dig *digest, p []byte) ++TEXT ·block(SB),0,$64-32 ++ MOV p_base+8(FP), X29 ++ MOV p_len+16(FP), X30 ++ SRL $6, X30 ++ SLL $6, X30 ++ ++ ADD X29, X30, X28 ++ BEQ X28, X29, end ++ ++ MOV ·_K(SB), X18 // const table ++ ADD $8, X2, X19 // message schedule ++ ++ MOV dig+0(FP), X20 ++ MOVWU (0*4)(X20), X10 // a = H0 ++ MOVWU (1*4)(X20), X11 // b = H1 ++ MOVWU (2*4)(X20), X12 // c = H2 ++ MOVWU (3*4)(X20), X13 // d = H3 ++ MOVWU (4*4)(X20), X14 // e = H4 ++ MOVWU (5*4)(X20), X15 // f = H5 ++ MOVWU (6*4)(X20), X16 // g = H6 ++ MOVWU (7*4)(X20), X17 // h = H7 ++ ++loop: ++ SHA256ROUND0(0, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND0(1, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND0(2, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND0(3, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND0(4, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND0(5, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND0(6, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND0(7, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND0(8, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND0(9, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND0(10, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND0(11, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND0(12, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND0(13, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND0(14, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND0(15, X11, X12, X13, X14, X15, X16, X17, X10) ++ ++ SHA256ROUND1(16, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(17, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(18, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(19, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(20, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(21, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(22, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(23, X11, X12, X13, X14, 
X15, X16, X17, X10) ++ SHA256ROUND1(24, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(25, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(26, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(27, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(28, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(29, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(30, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(31, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND1(32, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(33, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(34, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(35, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(36, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(37, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(38, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(39, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND1(40, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(41, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(42, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(43, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(44, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(45, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(46, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(47, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND1(48, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(49, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(50, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(51, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(52, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(53, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(54, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(55, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND1(56, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(57, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(58, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(59, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(60, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(61, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(62, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(63, X11, X12, X13, X14, X15, X16, X17, X10) ++ ++ MOVWU (0*4)(X20), X5 ++ MOVWU (1*4)(X20), X6 ++ MOVWU (2*4)(X20), X7 ++ MOVWU (3*4)(X20), X8 ++ ADD X5, X10 // H0 = a + H0 ++ ADD X6, X11 // H1 = b + H1 ++ ADD X7, X12 // H2 = c + H2 ++ ADD X8, X13 // H3 = d + H3 ++ MOVW X10, (0*4)(X20) ++ MOVW X11, (1*4)(X20) ++ MOVW X12, (2*4)(X20) ++ MOVW X13, (3*4)(X20) ++ MOVWU (4*4)(X20), X5 ++ MOVWU (5*4)(X20), X6 ++ MOVWU (6*4)(X20), X7 ++ MOVWU (7*4)(X20), X8 ++ ADD X5, X14 // H4 = e + H4 ++ ADD X6, X15 // H5 = f + H5 ++ ADD X7, X16 // H6 = g + H6 ++ ADD X8, X17 // H7 = h + H7 ++ MOVW X14, (4*4)(X20) ++ MOVW X15, (5*4)(X20) ++ MOVW X16, (6*4)(X20) ++ MOVW X17, (7*4)(X20) ++ ++ ADD $64, X29 ++ BNE X28, X29, loop ++ ++end: ++ RET +-- +2.39.5 + diff --git a/2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch b/2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch new file mode 100644 index 0000000..c0151ac --- /dev/null +++ b/2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch @@ -0,0 +1,141 @@ +From e24c5f2b3d687d5f7870107fa2c4ec28833b142a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 046/119] 
math/big: implement mulAddVWW in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of mulAddVWW for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. + +On a StarFive VisionFive 2: + + │ muladdvww.1 │ muladdvww.2 │ + │ sec/op │ sec/op vs base │ +MulAddVWW/1-4 68.18n ± 0% 65.49n ± 0% -3.95% (p=0.000 n=10) +MulAddVWW/2-4 82.81n ± 0% 78.85n ± 0% -4.78% (p=0.000 n=10) +MulAddVWW/3-4 97.49n ± 0% 72.18n ± 0% -25.96% (p=0.000 n=10) +MulAddVWW/4-4 112.20n ± 0% 85.54n ± 0% -23.76% (p=0.000 n=10) +MulAddVWW/5-4 126.90n ± 0% 98.90n ± 0% -22.06% (p=0.000 n=10) +MulAddVWW/10-4 200.3n ± 0% 144.3n ± 0% -27.96% (p=0.000 n=10) +MulAddVWW/100-4 1532.0n ± 0% 860.0n ± 0% -43.86% (p=0.000 n=10) +MulAddVWW/1000-4 14.757µ ± 0% 8.076µ ± 0% -45.27% (p=0.000 n=10) +MulAddVWW/10000-4 204.0µ ± 0% 137.1µ ± 0% -32.77% (p=0.000 n=10) +MulAddVWW/100000-4 2.066m ± 0% 1.382m ± 0% -33.12% (p=0.000 n=10) +geomean 1.311µ 950.0n -27.51% + + │ muladdvww.1 │ muladdvww.2 │ + │ B/s │ B/s vs base │ +MulAddVWW/1-4 895.1Mi ± 0% 932.0Mi ± 0% +4.11% (p=0.000 n=10) +MulAddVWW/2-4 1.440Gi ± 0% 1.512Gi ± 0% +5.02% (p=0.000 n=10) +MulAddVWW/3-4 1.834Gi ± 0% 2.477Gi ± 0% +35.07% (p=0.000 n=10) +MulAddVWW/4-4 2.125Gi ± 0% 2.787Gi ± 0% +31.15% (p=0.000 n=10) +MulAddVWW/5-4 2.349Gi ± 0% 3.013Gi ± 0% +28.28% (p=0.000 n=10) +MulAddVWW/10-4 2.975Gi ± 0% 4.130Gi ± 0% +38.79% (p=0.000 n=10) +MulAddVWW/100-4 3.891Gi ± 0% 6.930Gi ± 0% +78.11% (p=0.000 n=10) +MulAddVWW/1000-4 4.039Gi ± 0% 7.380Gi ± 0% +82.72% (p=0.000 n=10) +MulAddVWW/10000-4 2.922Gi ± 0% 4.346Gi ± 0% +48.74% (p=0.000 n=10) +MulAddVWW/100000-4 2.884Gi ± 0% 4.313Gi ± 0% +49.52% (p=0.000 n=10) +geomean 2.321Gi 3.202Gi +37.95% + +Change-Id: If08191607913ce5c7641f34bae8fa5c9dfb44777 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595399 +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +--- + src/math/big/arith_riscv64.s | 74 +++++++++++++++++++++++++++++++++++- + 1 file changed, 73 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index 6aca1b6d6c..c6b32ae1cc 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -301,7 +301,79 @@ TEXT ·shrVU(SB),NOSPLIT,$0 + JMP ·shrVU_g(SB) + + TEXT ·mulAddVWW(SB),NOSPLIT,$0 +- JMP ·mulAddVWW_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ MOV r+56(FP), X29 ++ ++ MOV $4, X28 ++ ++ BEQ ZERO, X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 24(X5), X17 // x[3] ++ ++ MULHU X8, X6, X9 // z_hi[0] = x[0] * y ++ MUL X8, X6, X8 // z_lo[0] = x[0] * y ++ ADD X8, X29, X10 // z[0] = z_lo[0] + c ++ SLTU X8, X10, X23 ++ ADD X23, X9, X29 // next c ++ ++ MULHU X11, X6, X12 // z_hi[1] = x[1] * y ++ MUL X11, X6, X11 // z_lo[1] = x[1] * y ++ ADD X11, X29, X13 // z[1] = z_lo[1] + c ++ SLTU X11, X13, X23 ++ ADD X23, X12, X29 // next c ++ ++ MULHU X14, X6, X15 // z_hi[2] = x[2] * y ++ MUL X14, X6, X14 // z_lo[2] = x[2] * y ++ ADD X14, X29, X16 // z[2] = z_lo[2] + c ++ SLTU X14, X16, X23 ++ ADD X23, X15, X29 // next c ++ ++ MULHU X17, X6, X18 // z_hi[3] = x[3] * y ++ MUL X17, X6, X17 // z_lo[3] = x[3] * y ++ ADD X17, X29, X19 // z[3] = z_lo[3] + c ++ SLTU X17, X19, X23 ++ ADD X23, X18, X29 // next c ++ ++ MOV X10, 0(X7) // z[0] ++ MOV 
X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ ++ MULHU X10, X6, X12 // z_hi = x * y ++ MUL X10, X6, X10 // z_lo = x * y ++ ADD X10, X29, X13 // z_lo + c ++ SLTU X10, X13, X15 ++ ADD X12, X15, X29 // next c ++ ++ MOV X13, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+64(FP) // return c ++ RET + + TEXT ·addMulVVW(SB),NOSPLIT,$0 + JMP ·addMulVVW_g(SB) +-- +2.39.5 + diff --git a/2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch b/2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch new file mode 100644 index 0000000..41ae569 --- /dev/null +++ b/2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch @@ -0,0 +1,158 @@ +From e959b79c7113f225df7e89d50c965fae3d25ad1e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 047/119] math/big: implement addMulVVW in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of addMulVVW for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. + +On a StarFive VisionFive 2: + + │ addmulvvw.1 │ addmulvvw.2 │ + │ sec/op │ sec/op vs base │ +AddMulVVW/1-4 65.49n ± 0% 50.79n ± 0% -22.44% (p=0.000 n=10) +AddMulVVW/2-4 82.81n ± 0% 66.83n ± 0% -19.29% (p=0.000 n=10) +AddMulVVW/3-4 100.20n ± 0% 82.87n ± 0% -17.30% (p=0.000 n=10) +AddMulVVW/4-4 117.50n ± 0% 84.20n ± 0% -28.34% (p=0.000 n=10) +AddMulVVW/5-4 134.9n ± 0% 100.3n ± 0% -25.69% (p=0.000 n=10) +AddMulVVW/10-4 221.7n ± 0% 164.4n ± 0% -25.85% (p=0.000 n=10) +AddMulVVW/100-4 1.794µ ± 0% 1.250µ ± 0% -30.32% (p=0.000 n=10) +AddMulVVW/1000-4 17.42µ ± 0% 12.08µ ± 0% -30.68% (p=0.000 n=10) +AddMulVVW/10000-4 254.9µ ± 0% 214.8µ ± 0% -15.75% (p=0.000 n=10) +AddMulVVW/100000-4 2.569m ± 0% 2.178m ± 0% -15.20% (p=0.000 n=10) +geomean 1.443µ 1.107µ -23.29% + + │ addmulvvw.1 │ addmulvvw.2 │ + │ B/s │ B/s vs base │ +AddMulVVW/1-4 932.0Mi ± 0% 1201.6Mi ± 0% +28.93% (p=0.000 n=10) +AddMulVVW/2-4 1.440Gi ± 0% 1.784Gi ± 0% +23.90% (p=0.000 n=10) +AddMulVVW/3-4 1.785Gi ± 0% 2.158Gi ± 0% +20.87% (p=0.000 n=10) +AddMulVVW/4-4 2.029Gi ± 0% 2.832Gi ± 0% +39.59% (p=0.000 n=10) +AddMulVVW/5-4 2.209Gi ± 0% 2.973Gi ± 0% +34.55% (p=0.000 n=10) +AddMulVVW/10-4 2.689Gi ± 0% 3.626Gi ± 0% +34.86% (p=0.000 n=10) +AddMulVVW/100-4 3.323Gi ± 0% 4.770Gi ± 0% +43.54% (p=0.000 n=10) +AddMulVVW/1000-4 3.421Gi ± 0% 4.936Gi ± 0% +44.27% (p=0.000 n=10) +AddMulVVW/10000-4 2.338Gi ± 0% 2.776Gi ± 0% +18.69% (p=0.000 n=10) +AddMulVVW/100000-4 2.320Gi ± 0% 2.736Gi ± 0% +17.93% (p=0.000 n=10) +geomean 2.109Gi 2.749Gi +30.36% + +Change-Id: I6c7ee48233c53ff9b6a5a9002675886cd9bff5af +Reviewed-on: https://go-review.googlesource.com/c/go/+/595400 +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +--- + src/math/big/arith_riscv64.s | 93 +++++++++++++++++++++++++++++++++++- + 1 file changed, 92 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index c6b32ae1cc..cc96d3145c 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -376,5 +376,96 @@ done: + RET + + TEXT ·addMulVVW(SB),NOSPLIT,$0 +- JMP ·addMulVVW_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV 
z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV $0, X29 // c = 0 ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 0(X7), X10 // z[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 8(X7), X13 // z[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 16(X7), X16 // z[2] ++ MOV 24(X5), X17 // x[3] ++ MOV 24(X7), X19 // z[3] ++ ++ MULHU X8, X6, X9 // z_hi[0] = x[0] * y ++ MUL X8, X6, X8 // z_lo[0] = x[0] * y ++ ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0] ++ SLTU X8, X21, X22 ++ ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0] ++ ADD X21, X29, X10 // z[0] = x[0] * y + z[0] + c ++ SLTU X21, X10, X22 ++ ADD X9, X22, X29 // next c ++ ++ MULHU X11, X6, X12 // z_hi[1] = x[1] * y ++ MUL X11, X6, X11 // z_lo[1] = x[1] * y ++ ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1] ++ SLTU X11, X21, X22 ++ ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1] ++ ADD X21, X29, X13 // z[1] = x[1] * y + z[1] + c ++ SLTU X21, X13, X22 ++ ADD X12, X22, X29 // next c ++ ++ MULHU X14, X6, X15 // z_hi[2] = x[2] * y ++ MUL X14, X6, X14 // z_lo[2] = x[2] * y ++ ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2] ++ SLTU X14, X21, X22 ++ ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2] ++ ADD X21, X29, X16 // z[2] = x[2] * y + z[2] + c ++ SLTU X21, X16, X22 ++ ADD X15, X22, X29 // next c ++ ++ MULHU X17, X6, X18 // z_hi[3] = x[3] * y ++ MUL X17, X6, X17 // z_lo[3] = x[3] * y ++ ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3] ++ SLTU X17, X21, X22 ++ ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3] ++ ADD X21, X29, X19 // z[3] = x[3] * y + z[3] + c ++ SLTU X21, X19, X22 ++ ADD X18, X22, X29 // next c + ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ MOV 0(X7), X11 // z ++ ++ MULHU X10, X6, X12 // z_hi = x * y ++ MUL X10, X6, X10 // z_lo = x * y ++ ADD X10, X11, X13 // z_lo = x * y + z ++ SLTU X10, X13, X15 ++ ADD X12, X15, X12 // z_hi = x * y + z ++ ADD X13, X29, X10 // z = x * y + z + c ++ SLTU X13, X10, X15 ++ ADD X12, X15, X29 // next c ++ ++ MOV X10, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+56(FP) // return c ++ RET +-- +2.39.5 + diff --git a/2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch b/2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch new file mode 100644 index 0000000..6500317 --- /dev/null +++ b/2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch @@ -0,0 +1,63 @@ +From 0626074d153ea7ade203d2946d97919e7c682700 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 048/119] test/codegen: add initial codegen tests for integer + min/max + +Change-Id: I006370053748edbec930c7279ee88a805009aa0d +Reviewed-on: https://go-review.googlesource.com/c/go/+/606976 +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +Reviewed-by: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +--- + test/codegen/arithmetic.go | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index f381b34ade..5f4ce9c76f 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -588,3 +588,39 @@ func constantFold3(i, j int) int { + r := (5 * i) * (6 * j) + return r + } ++ ++// ----------------- // ++// Integer Min/Max // ++// ----------------- // ++ ++func Int64Min(a, b int64) int64 { ++ // amd64: "CMPQ","CMOVQLT" ++ // arm64: 
"CMP","CSEL" ++ // riscv64/rva20u64:"BLT\t" ++ // riscv64/rva22u64:"MIN\t" ++ return min(a, b) ++} ++ ++func Int64Max(a, b int64) int64 { ++ // amd64: "CMPQ","CMOVQGT" ++ // arm64: "CMP","CSEL" ++ // riscv64/rva20u64:"BLT\t" ++ // riscv64/rva22u64:"MAX\t" ++ return max(a, b) ++} ++ ++func Uint64Min(a, b uint64) uint64 { ++ // amd64: "CMPQ","CMOVQCS" ++ // arm64: "CMP","CSEL" ++ // riscv64/rva20u64:"BLTU" ++ // riscv64/rva22u64:"MINU" ++ return min(a, b) ++} ++ ++func Uint64Max(a, b uint64) uint64 { ++ // amd64: "CMPQ","CMOVQHI" ++ // arm64: "CMP","CSEL" ++ // riscv64/rva20u64:"BLTU" ++ // riscv64/rva22u64:"MAXU" ++ return max(a, b) ++} +-- +2.39.5 + diff --git a/2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch b/2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch new file mode 100644 index 0000000..5fce80d --- /dev/null +++ b/2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch @@ -0,0 +1,232 @@ +From 8afd6098ad3cbdbe64f7108459f3954791e1391f Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 049/119] cmd/compile/internal/ssa: combine shift and addition + for riscv64 rva22u64 + +When GORISCV64 enables rva22u64, combined shift and addition using the +SH1ADD, SH2ADD and SH3ADD instructions that are available via the Zba +extension. This results in more than 2000 instructions being removed +from the Go binary on riscv64. + +Change-Id: Ia62ae7dda3d8083cff315113421bee73f518eea8 +Reviewed-on: https://go-review.googlesource.com/c/go/+/606636 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Michael Pratt +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +--- + src/cmd/compile/internal/riscv64/ssa.go | 3 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 5 ++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 5 ++ + src/cmd/compile/internal/ssa/opGen.go | 45 +++++++++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 57 +++++++++++++++++++ + test/codegen/shift.go | 17 ++++++ + 6 files changed, 131 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 10fea07e60..e3a2889697 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -289,7 +289,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FEQS, ssa.OpRISCV64FNES, ssa.OpRISCV64FLTS, ssa.OpRISCV64FLES, + ssa.OpRISCV64FADDD, ssa.OpRISCV64FSUBD, ssa.OpRISCV64FMULD, ssa.OpRISCV64FDIVD, + ssa.OpRISCV64FEQD, ssa.OpRISCV64FNED, ssa.OpRISCV64FLTD, ssa.OpRISCV64FLED, ssa.OpRISCV64FSGNJD, +- ssa.OpRISCV64MIN, ssa.OpRISCV64MAX, ssa.OpRISCV64MINU, ssa.OpRISCV64MAXU: ++ ssa.OpRISCV64MIN, ssa.OpRISCV64MAX, ssa.OpRISCV64MINU, ssa.OpRISCV64MAXU, ++ ssa.OpRISCV64SH1ADD, ssa.OpRISCV64SH2ADD, ssa.OpRISCV64SH3ADD: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 7d8fb79e17..f0afd6b345 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -839,6 +839,11 @@ + // Optimisations for rva22u64 and above. + // + ++// Combine left shift and addition. ++(ADD (SLLI [1] x) y) && buildcfg.GORISCV64 >= 22 => (SH1ADD x y) ++(ADD (SLLI [2] x) y) && buildcfg.GORISCV64 >= 22 => (SH2ADD x y) ++(ADD (SLLI [3] x) y) && buildcfg.GORISCV64 >= 22 => (SH3ADD x y) ++ + // Integer minimum and maximum. 
+ (Min64 x y) && buildcfg.GORISCV64 >= 22 => (MIN x y) + (Max64 x y) && buildcfg.GORISCV64 >= 22 => (MAX x y) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 7323cb119c..8badefa9ac 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -220,6 +220,11 @@ func init() { + {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, shift amount 0-63, logical right shift + {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, shift amount 0-31, logical right shift of 32 bit value, sign extended to 64 bits + ++ // Shift and add ++ {name: "SH1ADD", argLength: 2, reg: gp21, asm: "SH1ADD"}, // arg0 << 1 + arg1 ++ {name: "SH2ADD", argLength: 2, reg: gp21, asm: "SH2ADD"}, // arg0 << 2 + arg1 ++ {name: "SH3ADD", argLength: 2, reg: gp21, asm: "SH3ADD"}, // arg0 << 3 + arg1 ++ + // Bitwise ops + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 600b8d9e30..f651adf63e 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2381,6 +2381,9 @@ const ( + OpRISCV64SRAIW + OpRISCV64SRLI + OpRISCV64SRLIW ++ OpRISCV64SH1ADD ++ OpRISCV64SH2ADD ++ OpRISCV64SH3ADD + OpRISCV64AND + OpRISCV64ANDI + OpRISCV64NOT +@@ -31948,6 +31951,48 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SH1ADD", ++ argLen: 2, ++ asm: riscv.ASH1ADD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "SH2ADD", ++ argLen: 2, ++ asm: riscv.ASH2ADD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "SH3ADD", ++ argLen: 2, ++ asm: riscv.ASH3ADD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "AND", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 9a13955689..5e6ccab467 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -3317,6 +3317,63 @@ func rewriteValueRISCV64_OpRISCV64ADD(v *Value) bool { + } + break + } ++ // match: 
(ADD (SLLI [1] x) y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (SH1ADD x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ if v_0.Op != OpRISCV64SLLI || auxIntToInt64(v_0.AuxInt) != 1 { ++ continue ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ continue ++ } ++ v.reset(OpRISCV64SH1ADD) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } ++ // match: (ADD (SLLI [2] x) y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (SH2ADD x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ if v_0.Op != OpRISCV64SLLI || auxIntToInt64(v_0.AuxInt) != 2 { ++ continue ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ continue ++ } ++ v.reset(OpRISCV64SH2ADD) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } ++ // match: (ADD (SLLI [3] x) y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (SH3ADD x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ if v_0.Op != OpRISCV64SLLI || auxIntToInt64(v_0.AuxInt) != 3 { ++ continue ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ continue ++ } ++ v.reset(OpRISCV64SH3ADD) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64ADDI(v *Value) bool { +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 51b9b2e39c..4b3b79f142 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -474,3 +474,20 @@ func checkShiftToMask(u []uint64, s []int64) { + // amd64:-"SHR",-"SHL","ANDQ" + u[1] = u[1] << 5 >> 5 + } ++ ++// ++// Left shift with addition. ++// ++ ++func checkLeftShiftWithAddition(a int64, b int64) int64 { ++ // riscv64/rva20u64: "SLLI","ADD" ++ // riscv64/rva22u64: "SH1ADD" ++ a = a + b<<1 ++ // riscv64/rva20u64: "SLLI","ADD" ++ // riscv64/rva22u64: "SH2ADD" ++ a = a + b<<2 ++ // riscv64/rva20u64: "SLLI","ADD" ++ // riscv64/rva22u64: "SH3ADD" ++ a = a + b<<3 ++ return a ++} +-- +2.39.5 + diff --git a/2050-math-add-round-assembly-implementations-on-riscv64.patch b/2050-math-add-round-assembly-implementations-on-riscv64.patch new file mode 100644 index 0000000..00cef9d --- /dev/null +++ b/2050-math-add-round-assembly-implementations-on-riscv64.patch @@ -0,0 +1,125 @@ +From 38a2f9c476c9b5fb3688605454eff7a198368211 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 050/119] math: add round assembly implementations on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This CL reapplies CL 504737 and adds integer precision +limitation check, since CL 504737 only checks whether +floating point number is +-Inf or NaN. + +This CL is also ~7% faster than CL 504737. 
+ +Updates #68322 + +goos: linux +goarch: riscv64 +pkg: math + │ math.old.bench │ math.new.bench │ + │ sec/op │ sec/op vs base │ +Ceil 54.09n ± 0% 18.72n ± 0% -65.39% (p=0.000 n=10) +Floor 40.72n ± 0% 18.72n ± 0% -54.03% (p=0.000 n=10) +Round 20.73n ± 0% 20.73n ± 0% ~ (p=1.000 n=10) +RoundToEven 24.07n ± 0% 24.07n ± 0% ~ (p=1.000 n=10) +Trunc 38.72n ± 0% 18.72n ± 0% -51.65% (p=0.000 n=10) +geomean 33.56n 20.09n -40.13% + +Change-Id: I06cfe2cb9e2535cd705d40b6650a7e71fedd906c +Reviewed-on: https://go-review.googlesource.com/c/go/+/600075 +Reviewed-by: Keith Randall +Reviewed-by: Joel Sing +Reviewed-by: Keith Randall +Reviewed-by: Michael Knyszek +LUCI-TryBot-Result: Go LUCI +--- + src/math/floor_asm.go | 2 +- + src/math/floor_noasm.go | 2 +- + src/math/floor_riscv64.s | 48 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 50 insertions(+), 2 deletions(-) + create mode 100644 src/math/floor_riscv64.s + +diff --git a/src/math/floor_asm.go b/src/math/floor_asm.go +index fb419d6da2..5cb45f5a7e 100644 +--- a/src/math/floor_asm.go ++++ b/src/math/floor_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || s390x || wasm ++//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || riscv64 || s390x || wasm + + package math + +diff --git a/src/math/floor_noasm.go b/src/math/floor_noasm.go +index 5641c7ea0a..6754ca8fc8 100644 +--- a/src/math/floor_noasm.go ++++ b/src/math/floor_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !s390x && !wasm ++//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !riscv64 && !s390x && !wasm + + package math + +diff --git a/src/math/floor_riscv64.s b/src/math/floor_riscv64.s +new file mode 100644 +index 0000000000..d9fe0ed8e2 +--- /dev/null ++++ b/src/math/floor_riscv64.s +@@ -0,0 +1,48 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "textflag.h" ++ ++// RISC-V offered floating-point (FP) rounding by FP conversion instructions (FCVT) ++// with rounding mode field. ++// As Go spec expects FP rounding result in FP, we have to use FCVT integer ++// back to FP (fp -> int -> fp). ++// RISC-V only set Inexact flag during invalid FP-integer conversion without changing any data, ++// on the other hand, RISC-V sets out of integer represent range yet valid FP into NaN. ++// When it comes to integer-FP conversion, invalid FP like NaN, +-Inf will be ++// converted into the closest valid FP, for example: ++// ++// `Floor(-Inf) -> int64(0x7fffffffffffffff) -> float64(9.22e+18)` ++// `Floor(18446744073709549568.0) -> int64(0x7fffffffffffffff) -> float64(9.22e+18)` ++// ++// This ISA conversion limitation requires we skip all invalid or out of range FP ++// before any normal rounding operations. 
++ ++#define ROUNDFN(NAME, MODE) \ ++TEXT NAME(SB),NOSPLIT,$0; \ ++ MOVD x+0(FP), F10; \ ++ FMVXD F10, X10; \ ++ /* Drop all fraction bits */;\ ++ SRL $52, X10, X12; \ ++ /* Remove sign bit */; \ ++ AND $0x7FF, X12, X12;\ ++ /* Return either input is +-Inf, NaN(0x7FF) or out of precision limitation */;\ ++ /* 1023: bias of exponent, [-2^53, 2^53]: exactly integer represent range */;\ ++ MOV $1023+53, X11; \ ++ BLTU X11, X12, 4(PC);\ ++ FCVTLD.MODE F10, X11; \ ++ FCVTDL X11, F11; \ ++ /* RISC-V rounds negative values to +0, restore original sign */;\ ++ FSGNJD F10, F11, F10; \ ++ MOVD F10, ret+8(FP); \ ++ RET ++ ++// func archFloor(x float64) float64 ++ROUNDFN(·archFloor, RDN) ++ ++// func archCeil(x float64) float64 ++ROUNDFN(·archCeil, RUP) ++ ++// func archTrunc(x float64) float64 ++ROUNDFN(·archTrunc, RTZ) +-- +2.39.5 + diff --git a/2051-test-codegen-add-Rotate-test-for-riscv64.patch b/2051-test-codegen-add-Rotate-test-for-riscv64.patch new file mode 100644 index 0000000..020d744 --- /dev/null +++ b/2051-test-codegen-add-Rotate-test-for-riscv64.patch @@ -0,0 +1,62 @@ +From 359d01862161f76d5493d310de9b9a9ae46d75d5 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 051/119] test/codegen: add Rotate test for riscv64 + +Change-Id: I7d996b8d46fbeef933943f806052a30f1f8d50c3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/588836 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: Tim King +Reviewed-by: Dmitri Shuralyov +--- + test/codegen/mathbits.go | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index 80fe9d2e0c..caeecdf078 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -231,6 +231,7 @@ func RotateLeft64(n uint64) uint64 { + // amd64:"ROLQ" + // arm64:"ROR" + // ppc64x:"ROTL" ++ // riscv64:"RORI" + // s390x:"RISBGZ\t[$]0, [$]63, [$]37, " + // wasm:"I64Rotl" + return bits.RotateLeft64(n, 37) +@@ -241,6 +242,7 @@ func RotateLeft32(n uint32) uint32 { + // arm:`MOVW\tR[0-9]+@>23` + // arm64:"RORW" + // ppc64x:"ROTLW" ++ // riscv64:"RORIW" + // s390x:"RLL" + // wasm:"I32Rotl" + return bits.RotateLeft32(n, 9) +@@ -262,6 +264,7 @@ func RotateLeftVariable(n uint, m int) uint { + // amd64:"ROLQ" + // arm64:"ROR" + // ppc64x:"ROTL" ++ // riscv64:"ROL" + // s390x:"RLLG" + // wasm:"I64Rotl" + return bits.RotateLeft(n, m) +@@ -271,6 +274,7 @@ func RotateLeftVariable64(n uint64, m int) uint64 { + // amd64:"ROLQ" + // arm64:"ROR" + // ppc64x:"ROTL" ++ // riscv64:"ROL" + // s390x:"RLLG" + // wasm:"I64Rotl" + return bits.RotateLeft64(n, m) +@@ -281,6 +285,7 @@ func RotateLeftVariable32(n uint32, m int) uint32 { + // amd64:"ROLL" + // arm64:"RORW" + // ppc64x:"ROTLW" ++ // riscv64:"ROLW" + // s390x:"RLL" + // wasm:"I32Rotl" + return bits.RotateLeft32(n, m) +-- +2.39.5 + diff --git a/2052-runtime-add-asm_riscv64.h.patch b/2052-runtime-add-asm_riscv64.h.patch new file mode 100644 index 0000000..84cfce3 --- /dev/null +++ b/2052-runtime-add-asm_riscv64.h.patch @@ -0,0 +1,67 @@ +From d1f0994bb9c5f428f58638aa4ac69e78d4122a25 Mon Sep 17 00:00:00 2001 +From: Mark D Ryan +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 052/119] runtime: add asm_riscv64.h + +asm_riscv64.h will be used to define macros for each riscv64 +extension that is not part of the rva20u64 base profile but that the +_riscv64.s assembly files are allowed to use because the user has +specified a more capable profile in the GORISCV64 variable. 
This will +allow us, for example, to test for the hasZba macro in those assembly +files instead of the GORISCV64_rva22u64 macro before using a Zba +instruction. This is important as it means that in the future when +we add support for new profiles that support Zba, e.g., rva23u64, +we only need to update asm_riscv64.h to indicate rva23u64 supports +Zba. We will not need to update every assembly language file that +already uses Zba instructions. + +Updates #61476 + +Change-Id: I83abfeb20d08a87ac8ea88f4d8a93437f0631353 +Reviewed-on: https://go-review.googlesource.com/c/go/+/608255 +Auto-Submit: Tim King +Reviewed-by: Tim King +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +Reviewed-by: Joel Sing +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/dist/build.go | 2 ++ + src/runtime/asm_riscv64.h | 12 ++++++++++++ + 2 files changed, 14 insertions(+) + create mode 100644 src/runtime/asm_riscv64.h + +diff --git a/src/cmd/dist/build.go b/src/cmd/dist/build.go +index 7d720cc5e1..873d031fac 100644 +--- a/src/cmd/dist/build.go ++++ b/src/cmd/dist/build.go +@@ -821,6 +821,8 @@ func runInstall(pkg string, ch chan struct{}) { + pathf("%s/src/runtime/asm_ppc64x.h", goroot), 0) + copyfile(pathf("%s/pkg/include/asm_amd64.h", goroot), + pathf("%s/src/runtime/asm_amd64.h", goroot), 0) ++ copyfile(pathf("%s/pkg/include/asm_riscv64.h", goroot), ++ pathf("%s/src/runtime/asm_riscv64.h", goroot), 0) + } + + // Generate any missing files; regenerate existing ones. +diff --git a/src/runtime/asm_riscv64.h b/src/runtime/asm_riscv64.h +new file mode 100644 +index 0000000000..d4deb093a6 +--- /dev/null ++++ b/src/runtime/asm_riscv64.h +@@ -0,0 +1,12 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++// Define features that are guaranteed to be supported by setting the GORISCV64 variable. ++// If a feature is supported, there's no need to check it at runtime every time. ++ ++#ifdef GORISCV64_rva22u64 ++#define hasZba ++#define hasZbb ++#define hasZbs ++#endif +-- +2.39.5 + diff --git a/2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch b/2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch new file mode 100644 index 0000000..77c492f --- /dev/null +++ b/2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch @@ -0,0 +1,387 @@ +From 92dfc0ca8a0e09f34d4ab39b964cb532aa6ac73d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 053/119] cmd/compile,cmd/internal/obj/riscv: always provide + ANDN, ORN and XNOR for riscv64 + +The ANDN, ORN and XNOR RISC-V Zbb extension instructions are easily +synthesised. Make them always available by adding support to the +riscv64 assembler so that we either emit two instruction sequences, +or a single instruction, when permitted by the GORISCV64 profile. +This means that these instructions can be used unconditionally, +simplifying compiler rewrite rules, codegen tests and manually +written assembly. + +Around 180 instructions are removed from the Go binary on riscv64 +when built with rva22u64. 
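+
+For reference, the semantics involved are (a Go illustration only, not part of
+this CL; the two-instruction fallbacks are the ones synthesised by the
+assembler change below):
+
+    package sketch
+
+    // andn: one Zbb instruction, or XORI $-1 on one operand followed by AND.
+    func andn(x, y uint64) uint64 { return x &^ y } // x & ^y
+
+    // orn: one Zbb instruction, or XORI $-1 on one operand followed by OR.
+    func orn(x, y uint64) uint64 { return x | ^y }
+
+    // xnor: one Zbb instruction, or XOR followed by XORI $-1 on the result.
+    func xnor(x, y uint64) uint64 { return ^(x ^ y) }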
+ +Change-Id: Ib2d90f2593a306530dc0ed08a981acde4d01be20 +Reviewed-on: https://go-review.googlesource.com/c/go/+/611895 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Tim King +Reviewed-by: Dmitri Shuralyov +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 12 +-- + src/cmd/compile/internal/riscv64/ssa.go | 3 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 1 - + .../compile/internal/ssa/_gen/RISCV64Ops.go | 29 ++++--- + .../internal/ssa/_gen/RISCV64latelower.rules | 6 ++ + src/cmd/compile/internal/ssa/opGen.go | 46 ++++++++++ + .../internal/ssa/rewriteRISCV64latelower.go | 84 +++++++++++++++++++ + src/cmd/internal/obj/riscv/obj.go | 28 +++++++ + 8 files changed, 188 insertions(+), 21 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 5296a34d09..53b7b92faa 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -361,8 +361,8 @@ start: + SLLIUW $1, X18, X19 // 9b191908 + + // 1.2: Basic Bit Manipulation (Zbb) +- ANDN X19, X20, X21 // b37a3a41 +- ANDN X19, X20 // 337a3a41 ++ ANDN X19, X20, X21 // b37a3a41 or 93caf9ffb37a5a01 ++ ANDN X19, X20 // 337a3a41 or 93cff9ff337afa01 + CLZ X20, X21 // 931a0a60 + CLZW X21, X22 // 1b9b0a60 + CPOP X22, X23 // 931b2b60 +@@ -377,12 +377,12 @@ start: + MIN X29, X30 // 334fdf0b + MINU X30, X5, X6 // 33d3e20b + MINU X30, X5 // b3d2e20b +- ORN X6, X7, X8 // 33e46340 +- ORN X6, X7 // b3e36340 ++ ORN X6, X7, X8 // 33e46340 or 1344f3ff33e48300 ++ ORN X6, X7 // b3e36340 or 934ff3ffb3e3f301 + SEXTB X16, X17 // 93184860 + SEXTH X17, X18 // 13995860 +- XNOR X18, X19, X20 // 33ca2941 +- XNOR X18, X19 // b3c92941 ++ XNOR X18, X19, X20 // 33ca2941 or 33ca2901134afaff ++ XNOR X18, X19 // b3c92941 or b3c9290193c9f9ff + ZEXTH X19, X20 // 3bca0908 + + // 1.3: Bitwise Rotation (Zbb) +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index e3a2889697..759d8d7cf4 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -278,7 +278,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.From.Reg = rs + p.To.Type = obj.TYPE_REG + p.To.Reg = rd +- case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XOR, ssa.OpRISCV64OR, ssa.OpRISCV64AND, ++ case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XNOR, ssa.OpRISCV64XOR, ++ ssa.OpRISCV64OR, ssa.OpRISCV64ORN, ssa.OpRISCV64AND, ssa.OpRISCV64ANDN, + ssa.OpRISCV64SLL, ssa.OpRISCV64SLLW, ssa.OpRISCV64SRA, ssa.OpRISCV64SRAW, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, + ssa.OpRISCV64SLT, ssa.OpRISCV64SLTU, ssa.OpRISCV64MUL, ssa.OpRISCV64MULW, ssa.OpRISCV64MULH, + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index f0afd6b345..9ae9604381 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -62,7 +62,6 @@ + + (Com(64|32|16|8) ...) => (NOT ...) + +- + (Sqrt ...) => (FSQRTD ...) + (Sqrt32 ...) => (FSQRTS ...) 
+ +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 8badefa9ac..7f3c4a2bf4 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -226,19 +226,22 @@ func init() { + {name: "SH3ADD", argLength: 2, reg: gp21, asm: "SH3ADD"}, // arg0 << 3 + arg1 + + // Bitwise ops +- {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 +- {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint +- {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 +- {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 +- {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint +- {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63) +- {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended +- {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63) +- {name: "RORI", argLength: 1, reg: gp11, asm: "RORI", aux: "Int64"}, // rotate right arg0 by auxint, shift amount 0-63 +- {name: "RORIW", argLength: 1, reg: gp11, asm: "RORIW", aux: "Int64"}, // rotate right least significant word of arg0 by auxint, shift amount 0-31, sign extended +- {name: "RORW", argLength: 2, reg: gp21, asm: "RORW"}, // rotate right least significant word of arg0 by (arg1 & 31), sign extended +- {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 +- {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint ++ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 ++ {name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // ^arg0 & arg1 ++ {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint ++ {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 ++ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 ++ {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // ^arg0 | arg1 ++ {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint ++ {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63) ++ {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended ++ {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63) ++ {name: "RORI", argLength: 1, reg: gp11, asm: "RORI", aux: "Int64"}, // rotate right arg0 by auxint, shift amount 0-63 ++ {name: "RORIW", argLength: 1, reg: gp11, asm: "RORIW", aux: "Int64"}, // rotate right least significant word of arg0 by auxint, shift amount 0-31, sign extended ++ {name: "RORW", argLength: 2, reg: gp21, asm: "RORW"}, // rotate right least significant word of arg0 by (arg1 & 31), sign extended ++ {name: "XNOR", argLength: 2, reg: gp21, asm: "XNOR", commutative: true}, // ^(arg0 ^ arg1) ++ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 ++ {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint + + // Minimum and maximum + {name: "MIN", argLength: 2, reg: gp21, asm: "MIN", commutative: true}, // min(arg0,arg1), signed +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules +index 
cd55331dfd..7acaa2f3fe 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules +@@ -2,6 +2,12 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + ++// Combine bitwise operation and bitwise inversion. ++(AND x (NOT y)) => (ANDN x y) ++(OR x (NOT y)) => (ORN x y) ++(XOR x (NOT y)) => (XNOR x y) ++(NOT (XOR x y)) => (XNOR x y) ++ + // Fold constant shift with extension. + (SRAI [c] (MOVBreg x)) && c < 8 => (SRAI [56+c] (SLLI [56] x)) + (SRAI [c] (MOVHreg x)) && c < 16 => (SRAI [48+c] (SLLI [48] x)) +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index f651adf63e..a02afc2da0 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2385,9 +2385,11 @@ const ( + OpRISCV64SH2ADD + OpRISCV64SH3ADD + OpRISCV64AND ++ OpRISCV64ANDN + OpRISCV64ANDI + OpRISCV64NOT + OpRISCV64OR ++ OpRISCV64ORN + OpRISCV64ORI + OpRISCV64ROL + OpRISCV64ROLW +@@ -2395,6 +2397,7 @@ const ( + OpRISCV64RORI + OpRISCV64RORIW + OpRISCV64RORW ++ OpRISCV64XNOR + OpRISCV64XOR + OpRISCV64XORI + OpRISCV64MIN +@@ -32008,6 +32011,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "ANDN", ++ argLen: 2, ++ asm: riscv.AANDN, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "ANDI", + auxType: auxInt64, +@@ -32050,6 +32067,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "ORN", ++ argLen: 2, ++ asm: riscv.AORN, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "ORI", + auxType: auxInt64, +@@ -32148,6 +32179,21 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "XNOR", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AXNOR, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "XOR", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64latelower.go b/src/cmd/compile/internal/ssa/rewriteRISCV64latelower.go +index 6dd97d65bd..d2c3a8f73d 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64latelower.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64latelower.go +@@ -4,12 +4,76 @@ package ssa + + func rewriteValueRISCV64latelower(v *Value) bool { + switch v.Op { ++ case OpRISCV64AND: ++ return rewriteValueRISCV64latelower_OpRISCV64AND(v) ++ case 
OpRISCV64NOT: ++ return rewriteValueRISCV64latelower_OpRISCV64NOT(v) ++ case OpRISCV64OR: ++ return rewriteValueRISCV64latelower_OpRISCV64OR(v) + case OpRISCV64SLLI: + return rewriteValueRISCV64latelower_OpRISCV64SLLI(v) + case OpRISCV64SRAI: + return rewriteValueRISCV64latelower_OpRISCV64SRAI(v) + case OpRISCV64SRLI: + return rewriteValueRISCV64latelower_OpRISCV64SRLI(v) ++ case OpRISCV64XOR: ++ return rewriteValueRISCV64latelower_OpRISCV64XOR(v) ++ } ++ return false ++} ++func rewriteValueRISCV64latelower_OpRISCV64AND(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (AND x (NOT y)) ++ // result: (ANDN x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ if v_1.Op != OpRISCV64NOT { ++ continue ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64ANDN) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } ++ return false ++} ++func rewriteValueRISCV64latelower_OpRISCV64NOT(v *Value) bool { ++ v_0 := v.Args[0] ++ // match: (NOT (XOR x y)) ++ // result: (XNOR x y) ++ for { ++ if v_0.Op != OpRISCV64XOR { ++ break ++ } ++ y := v_0.Args[1] ++ x := v_0.Args[0] ++ v.reset(OpRISCV64XNOR) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64latelower_OpRISCV64OR(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (OR x (NOT y)) ++ // result: (ORN x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ if v_1.Op != OpRISCV64NOT { ++ continue ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64ORN) ++ v.AddArg2(x, y) ++ return true ++ } ++ break + } + return false + } +@@ -241,6 +305,26 @@ func rewriteValueRISCV64latelower_OpRISCV64SRLI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64latelower_OpRISCV64XOR(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (XOR x (NOT y)) ++ // result: (XNOR x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ if v_1.Op != OpRISCV64NOT { ++ continue ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64XNOR) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } ++ return false ++} + func rewriteBlockRISCV64latelower(b *Block) bool { + return false + } +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index d396264a05..088463aef8 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2534,6 +2534,34 @@ func instructionsForProg(p *obj.Prog) []*instruction { + + case AORCB, AREV8: + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AANDN, AORN: ++ if buildcfg.GORISCV64 >= 22 { ++ // ANDN and ORN instructions are supported natively. ++ break ++ } ++ // ANDN -> (AND (NOT x) y) ++ // ORN -> (OR (NOT x) y) ++ bitwiseOp, notReg := AAND, ins.rd ++ if ins.as == AORN { ++ bitwiseOp = AOR ++ } ++ if ins.rs1 == notReg { ++ notReg = REG_TMP ++ } ++ inss = []*instruction{ ++ &instruction{as: AXORI, rs1: ins.rs2, rs2: obj.REG_NONE, rd: notReg, imm: -1}, ++ &instruction{as: bitwiseOp, rs1: ins.rs1, rs2: notReg, rd: ins.rd}, ++ } ++ ++ case AXNOR: ++ if buildcfg.GORISCV64 >= 22 { ++ // XNOR instruction is supported natively. 
++ break ++ } ++ // XNOR -> (NOT (XOR x y)) ++ ins.as = AXOR ++ inss = append(inss, &instruction{as: AXORI, rs1: ins.rd, rs2: obj.REG_NONE, rd: ins.rd, imm: -1}) + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch b/2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch new file mode 100644 index 0000000..d636541 --- /dev/null +++ b/2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch @@ -0,0 +1,385 @@ +From d9a225d97913ddb16defe054eb661769b99df43d Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 054/119] crypto/md5: provide optimised assembly for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide an optimised assembly implementation of MD5 for RISC-V. +There are significant performance improvements. The assembler takes +advantage of Zbb instructions when they are available. + +Results for the VisionFive 2 running Ubuntu 24.04 with +GORISCV64=rva20u64. + +goos: linux +goarch: riscv64 +pkg: crypto/md5 + │ md5_go.txt │ md5_ass.txt │ + │ sec/op │ sec/op vs base │ +Hash8Bytes 1.202µ ± 0% 1.220µ ± 0% +1.50% (p=0.000 n=10) +Hash64 1.665µ ± 0% 1.518µ ± 0% -8.83% (p=0.000 n=10) +Hash128 2.165µ ± 0% 1.885µ ± 0% -12.94% (p=0.000 n=10) +Hash256 3.162µ ± 0% 2.613µ ± 0% -17.38% (p=0.000 n=10) +Hash512 5.146µ ± 0% 4.063µ ± 0% -21.05% (p=0.000 n=10) +Hash1K 9.115µ ± 0% 6.959µ ± 0% -23.65% (p=0.000 n=10) +Hash8K 64.68µ ± 0% 47.52µ ± 0% -26.54% (p=0.000 n=10) +Hash1M 8.131m ± 0% 5.936m ± 0% -27.00% (p=0.000 n=10) +Hash8M 65.06m ± 0% 47.50m ± 0% -26.99% (p=0.000 n=10) +Hash8BytesUnaligned 1.210µ ± 0% 1.199µ ± 0% -0.91% (p=0.000 n=10) +Hash1KUnaligned 9.114µ ± 0% 8.266µ ± 0% -9.30% (p=0.000 n=10) +Hash8KUnaligned 64.68µ ± 0% 57.97µ ± 0% -10.38% (p=0.000 n=10) +geomean 22.37µ 18.83µ -15.82% + +Results for the VisionFive 2 running Ubuntu 24.04 with +GORISCV64=rva22u64. 
+ +goos: linux +goarch: riscv64 +pkg: crypto/md5 + │ md5_g22.txt │ md5_a22.txt │ + │ sec/op │ sec/op vs base │ +Hash8Bytes 1.175µ ± 0% 1.002µ ± 0% -14.72% (p=0.000 n=10) +Hash64 1.575µ ± 0% 1.274µ ± 0% -19.11% (p=0.000 n=10) +Hash128 2.033µ ± 0% 1.587µ ± 0% -21.92% (p=0.000 n=10) +Hash256 2.943µ ± 0% 2.209µ ± 0% -24.93% (p=0.000 n=10) +Hash512 4.755µ ± 0% 3.443µ ± 0% -27.58% (p=0.000 n=10) +Hash1K 8.378µ ± 0% 5.910µ ± 0% -29.46% (p=0.000 n=10) +Hash8K 59.12µ ± 0% 40.45µ ± 0% -31.58% (p=0.000 n=10) +Hash1M 7.426m ± 0% 5.056m ± 0% -31.92% (p=0.000 n=10) +Hash8M 59.41m ± 0% 40.45m ± 0% -31.91% (p=0.000 n=10) +Hash8BytesUnaligned 1.169µ ± 0% 1.012µ ± 0% -13.43% (p=0.000 n=10) +Hash1KUnaligned 8.379µ ± 0% 7.213µ ± 0% -13.91% (p=0.000 n=10) +Hash8KUnaligned 59.12µ ± 0% 50.90µ ± 0% -13.91% (p=0.000 n=10) +geomean 20.83µ 15.99µ -23.21% + +Change-Id: I61e3fa802c2cc50e0b5f71f151b4741691ccb481 +Reviewed-on: https://go-review.googlesource.com/c/go/+/527936 +Reviewed-by: Joel Sing +Auto-Submit: Tim King +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Tim King +--- + src/crypto/md5/md5block_decl.go | 2 +- + src/crypto/md5/md5block_generic.go | 2 +- + src/crypto/md5/md5block_riscv64.s | 279 +++++++++++++++++++++++++++++ + 3 files changed, 281 insertions(+), 2 deletions(-) + create mode 100644 src/crypto/md5/md5block_riscv64.s + +diff --git a/src/crypto/md5/md5block_decl.go b/src/crypto/md5/md5block_decl.go +index f1fb34c3d7..9c8e7271df 100644 +--- a/src/crypto/md5/md5block_decl.go ++++ b/src/crypto/md5/md5block_decl.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build amd64 || 386 || arm || ppc64le || ppc64 || s390x || arm64 ++//go:build amd64 || 386 || arm || ppc64le || ppc64 || s390x || arm64 || riscv64 + + package md5 + +diff --git a/src/crypto/md5/md5block_generic.go b/src/crypto/md5/md5block_generic.go +index c929c2b84a..de607e01a6 100644 +--- a/src/crypto/md5/md5block_generic.go ++++ b/src/crypto/md5/md5block_generic.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !386 && !arm && !ppc64le && !ppc64 && !s390x && !arm64 ++//go:build !amd64 && !386 && !arm && !ppc64le && !ppc64 && !s390x && !arm64 && !riscv64 + + package md5 + +diff --git a/src/crypto/md5/md5block_riscv64.s b/src/crypto/md5/md5block_riscv64.s +new file mode 100644 +index 0000000000..017c70b936 +--- /dev/null ++++ b/src/crypto/md5/md5block_riscv64.s +@@ -0,0 +1,279 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++// ++// RISCV64 version of md5block.go ++// derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go ++ ++//go:build !purego ++ ++#include "textflag.h" ++ ++#define LOAD32U(base, offset, tmp, dest) \ ++ MOVBU (offset+0*1)(base), dest; \ ++ MOVBU (offset+1*1)(base), tmp; \ ++ SLL $8, tmp; \ ++ OR tmp, dest; \ ++ MOVBU (offset+2*1)(base), tmp; \ ++ SLL $16, tmp; \ ++ OR tmp, dest; \ ++ MOVBU (offset+3*1)(base), tmp; \ ++ SLL $24, tmp; \ ++ OR tmp, dest ++ ++#define LOAD64U(base, offset, tmp1, tmp2, dst) \ ++ LOAD32U(base, offset, tmp1, dst); \ ++ LOAD32U(base, offset+4, tmp1, tmp2); \ ++ SLL $32, tmp2; \ ++ OR tmp2, dst ++ ++#define ROUND1EVN(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW x, a; \ ++ ADDW X23, a; \ ++ XOR c, d, X23; \ ++ AND b, X23; \ ++ XOR d, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND1ODD(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW X23, a; \ ++ SRL $32, x, X23; \ ++ ADDW X23, a; \ ++ XOR c, d, X23; \ ++ AND b, X23; \ ++ XOR d, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND2EVN(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW x, a; \ ++ ADDW X23, a; \ ++ XOR b, c, X23; \ ++ AND d, X23; \ ++ XOR c, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND2ODD(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW X23, a; \ ++ SRL $32, x, X23; \ ++ ADDW X23, a; \ ++ XOR b, c, X23; \ ++ AND d, X23; \ ++ XOR c, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND3EVN(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW x, a; \ ++ ADDW X23, a; \ ++ XOR c, d, X23; \ ++ XOR b, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND3ODD(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW X23, a; \ ++ SRL $32, x, X23; \ ++ ADDW X23, a; \ ++ XOR c, d, X23; \ ++ XOR b, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND4EVN(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW x, a; \ ++ ADDW X23, a; \ ++ ORN d, b, X23; \ ++ XOR c, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND4ODD(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW X23, a; \ ++ SRL $32, x, X23; \ ++ ADDW X23, a; \ ++ ORN d, b, X23; \ ++ XOR c, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++// Register use for the block function ++// ++// X5 - X12 : contain the 16 32 bit data items in the block we're ++// processing. Odd numbered values, e.g., x1, x3 are stored in ++// the upper 32 bits of the register. ++// X13 - X16 : a, b, c, d ++// X17 - X20 : used to store the old values of a, b, c, d, i.e., aa, bb, cc, ++// dd. X17 and X18 are also used as temporary registers when ++// loading unaligned data. ++// X22 : pointer to dig.s ++// X23 : temporary register ++// X28 : pointer to the first byte beyond the end of p ++// X29 : pointer to current 64 byte block of data, initially set to ++// &p[0] ++// X30 : temporary register ++ ++TEXT ·block(SB),NOSPLIT,$0-32 ++ MOV p+8(FP), X29 ++ MOV p_len+16(FP), X30 ++ SRL $6, X30 ++ SLL $6, X30 ++ BEQZ X30, zero ++ ++ ADD X29, X30, X28 ++ ++ MOV dig+0(FP), X22 ++ MOVWU (0*4)(X22), X13 // a = s[0] ++ MOVWU (1*4)(X22), X14 // b = s[1] ++ MOVWU (2*4)(X22), X15 // c = s[2] ++ MOVWU (3*4)(X22), X16 // d = s[3] ++ ++loop: ++ ++ // Load the 64 bytes of data in x0-15 into 8 64 bit registers, X5-X12. 
++ // Different paths are taken to load the values depending on whether the ++ // buffer is 8 byte aligned or not. We load all the values up front ++ // here at the start of the loop to avoid multiple alignment checks and ++ // to reduce code size. It takes 10 instructions to load an unaligned ++ // 32 bit value and this value will be used 4 times in the main body ++ // of the loop below. ++ ++ AND $7, X29, X30 ++ BEQZ X30, aligned ++ ++ LOAD64U(X29,0, X17, X18, X5) ++ LOAD64U(X29,8, X17, X18, X6) ++ LOAD64U(X29,16, X17, X18, X7) ++ LOAD64U(X29,24, X17, X18, X8) ++ LOAD64U(X29,32, X17, X18, X9) ++ LOAD64U(X29,40, X17, X18, X10) ++ LOAD64U(X29,48, X17, X18, X11) ++ LOAD64U(X29,56, X17, X18, X12) ++ JMP block_loaded ++ ++aligned: ++ MOV (0*8)(X29), X5 ++ MOV (1*8)(X29), X6 ++ MOV (2*8)(X29), X7 ++ MOV (3*8)(X29), X8 ++ MOV (4*8)(X29), X9 ++ MOV (5*8)(X29), X10 ++ MOV (6*8)(X29), X11 ++ MOV (7*8)(X29), X12 ++ ++block_loaded: ++ MOV X13, X17 ++ MOV X14, X18 ++ MOV X15, X19 ++ MOV X16, X20 ++ ++ // Some of the hex constants below are too large to fit into a ++ // signed 32 bit value. The assembler will handle these ++ // constants in a special way to ensure that they are ++ // zero extended. Our algorithm is only interested in the ++ // bottom 32 bits and doesn't care whether constants are ++ // sign or zero extended when moved into 64 bit registers. ++ // So we use signed constants instead of hex when bit 31 is ++ // set so all constants can be loaded by lui+addi. ++ ++ ROUND1EVN(X13,X14,X15,X16,X5, -680876936, 7); // 0xd76aa478 ++ ROUND1ODD(X16,X13,X14,X15,X5, -389564586,12); // 0xe8c7b756 ++ ROUND1EVN(X15,X16,X13,X14,X6, 0x242070db,17); // 0x242070db ++ ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee ++ ROUND1EVN(X13,X14,X15,X16,X7, -176418897, 7); // 0xf57c0faf ++ ROUND1ODD(X16,X13,X14,X15,X7, 0x4787c62a,12); // 0x4787c62a ++ ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613 ++ ROUND1ODD(X14,X15,X16,X13,X8, -45705983,22); // 0xfd469501 ++ ROUND1EVN(X13,X14,X15,X16,X9, 0x698098d8, 7); // 0x698098d8 ++ ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af ++ ROUND1EVN(X15,X16,X13,X14,X10, -42063,17); // 0xffff5bb1 ++ ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be ++ ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122 ++ ROUND1ODD(X16,X13,X14,X15,X11, -40341101,12); // 0xfd987193 ++ ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e ++ ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821 ++ ++ ROUND2ODD(X13,X14,X15,X16,X5, -165796510, 5); // f61e2562 ++ ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // c040b340 ++ ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 265e5a51 ++ ROUND2EVN(X14,X15,X16,X13,X5, -373897302,20); // e9b6c7aa ++ ROUND2ODD(X13,X14,X15,X16,X7, -701558691, 5); // d62f105d ++ ROUND2EVN(X16,X13,X14,X15,X10, 0x2441453, 9); // 2441453 ++ ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // d8a1e681 ++ ROUND2EVN(X14,X15,X16,X13,X7, -405537848,20); // e7d3fbc8 ++ ROUND2ODD(X13,X14,X15,X16,X9, 0x21e1cde6, 5); // 21e1cde6 ++ ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // c33707d6 ++ ROUND2ODD(X15,X16,X13,X14,X6, -187363961,14); // f4d50d87 ++ ROUND2EVN(X14,X15,X16,X13,X9, 0x455a14ed,20); // 455a14ed ++ ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // a9e3e905 ++ ROUND2EVN(X16,X13,X14,X15,X6, -51403784, 9); // fcefa3f8 ++ ROUND2ODD(X15,X16,X13,X14,X8, 0x676f02d9,14); // 676f02d9 ++ ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 8d2a4c8a ++ ++ ROUND3ODD(X13,X14,X15,X16,X7, -378558, 4); // fffa3942 ++ 
ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 8771f681 ++ ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 6d9d6122 ++ ROUND3EVN(X14,X15,X16,X13,X12, -35309556,23); // fde5380c ++ ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // a4beea44 ++ ROUND3EVN(X16,X13,X14,X15,X7, 0x4bdecfa9,11); // 4bdecfa9 ++ ROUND3ODD(X15,X16,X13,X14,X8, -155497632,16); // f6bb4b60 ++ ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // bebfbc70 ++ ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 289b7ec6 ++ ROUND3EVN(X16,X13,X14,X15,X5, -358537222,11); // eaa127fa ++ ROUND3ODD(X15,X16,X13,X14,X6, -722521979,16); // d4ef3085 ++ ROUND3EVN(X14,X15,X16,X13,X8, 0x4881d05,23); // 4881d05 ++ ROUND3ODD(X13,X14,X15,X16,X9, -640364487, 4); // d9d4d039 ++ ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // e6db99e5 ++ ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 1fa27cf8 ++ ROUND3EVN(X14,X15,X16,X13,X6, -995338651,23); // c4ac5665 ++ ++ ROUND4EVN(X13,X14,X15,X16,X5, -198630844, 6); // f4292244 ++ ROUND4ODD(X16,X13,X14,X15,X8, 0x432aff97,10); // 432aff97 ++ ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // ab9423a7 ++ ROUND4ODD(X14,X15,X16,X13,X7, -57434055,21); // fc93a039 ++ ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 655b59c3 ++ ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 8f0ccc92 ++ ROUND4EVN(X15,X16,X13,X14,X10 ,-1051523,15); // ffeff47d ++ ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 85845dd1 ++ ROUND4EVN(X13,X14,X15,X16,X9, 0x6fa87e4f, 6); // 6fa87e4f ++ ROUND4ODD(X16,X13,X14,X15,X12, -30611744,10); // fe2ce6e0 ++ ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // a3014314 ++ ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 4e0811a1 ++ ROUND4EVN(X13,X14,X15,X16,X7, -145523070, 6); // f7537e82 ++ ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // bd3af235 ++ ROUND4EVN(X15,X16,X13,X14,X6, 0x2ad7d2bb,15); // 2ad7d2bb ++ ROUND4ODD(X14,X15,X16,X13,X9, -343485551,21); // eb86d391 ++ ++ ADDW X17, X13 ++ ADDW X18, X14 ++ ADDW X19, X15 ++ ADDW X20, X16 ++ ++ ADD $64, X29 ++ BNE X28, X29, loop ++ ++ MOVW X13, (0*4)(X22) ++ MOVW X14, (1*4)(X22) ++ MOVW X15, (2*4)(X22) ++ MOVW X16, (3*4)(X22) ++ ++zero: ++ RET +-- +2.39.5 + diff --git a/2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch b/2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch new file mode 100644 index 0000000..b138006 --- /dev/null +++ b/2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch @@ -0,0 +1,200 @@ +From 01f4244453d5872e9c0dbb5057eace1b18fa65b6 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 055/119] cmd/internal/obj/riscv: rename the iIEncoding + +We rename it to iIIEncoding to reflect the fact that instructions +that use this encoding take two integer registers. This change +will allow us to add a new encoding for I-type instructions that +take a single integer register. This new encoding will be used for +instructions that modify CSRs. 
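+
+For context, the I-type layout that these encodings share packs a 12-bit
+immediate together with two integer registers, rd and rs1, which is what the
+new name reflects; the planned CSR encoding will name only a single integer
+register. A sketch of the layout (illustration only), mirroring encodeI in
+this file:
+
+    package sketch
+
+    // I-type: imm[11:0] | rs1 | funct3 | rd | opcode
+    func encodeITypeSketch(opcode, funct3, rd, rs1, imm uint32) uint32 {
+        return imm<<20 | rs1<<15 | funct3<<12 | rd<<7 | opcode
+    }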
+ +Change-Id: Ic507d0020e18f6aa72353f4d3ffcd0e868261e7a +Reviewed-on: https://go-review.googlesource.com/c/go/+/614355 +Reviewed-by: Carlos Amedee +Reviewed-by: Joel Sing +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +--- + src/cmd/internal/obj/riscv/obj.go | 80 +++++++++++++++---------------- + 1 file changed, 40 insertions(+), 40 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 088463aef8..6e9691bb4f 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1174,7 +1174,7 @@ func validateRFF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + +-func validateII(ctxt *obj.Link, ins *instruction) { ++func validateIII(ctxt *obj.Link, ins *instruction) { + wantImmI(ctxt, ins, ins.imm, 12) + wantIntReg(ctxt, ins, "rd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +@@ -1320,7 +1320,7 @@ func encodeI(as obj.As, rs1, rd, imm uint32) uint32 { + return imm<<20 | rs1<<15 | enc.funct3<<12 | rd<<7 | enc.opcode + } + +-func encodeII(ins *instruction) uint32 { ++func encodeIII(ins *instruction) uint32 { + return encodeI(ins.as, regI(ins.rs1), regI(ins.rd), uint32(ins.imm)) + } + +@@ -1521,8 +1521,8 @@ var ( + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} + rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} + +- iIEncoding = encoding{encode: encodeII, validate: validateII, length: 4} +- iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} ++ iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4} ++ iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} + + sIEncoding = encoding{encode: encodeSI, validate: validateSI, length: 4} + sFEncoding = encoding{encode: encodeSF, validate: validateSF, length: 4} +@@ -1549,15 +1549,15 @@ var encodings = [ALAST & obj.AMask]encoding{ + // Unprivileged ISA + + // 2.4: Integer Computational Instructions +- AADDI & obj.AMask: iIEncoding, +- ASLTI & obj.AMask: iIEncoding, +- ASLTIU & obj.AMask: iIEncoding, +- AANDI & obj.AMask: iIEncoding, +- AORI & obj.AMask: iIEncoding, +- AXORI & obj.AMask: iIEncoding, +- ASLLI & obj.AMask: iIEncoding, +- ASRLI & obj.AMask: iIEncoding, +- ASRAI & obj.AMask: iIEncoding, ++ AADDI & obj.AMask: iIIEncoding, ++ ASLTI & obj.AMask: iIIEncoding, ++ ASLTIU & obj.AMask: iIIEncoding, ++ AANDI & obj.AMask: iIIEncoding, ++ AORI & obj.AMask: iIIEncoding, ++ AXORI & obj.AMask: iIIEncoding, ++ ASLLI & obj.AMask: iIIEncoding, ++ ASRLI & obj.AMask: iIIEncoding, ++ ASRAI & obj.AMask: iIIEncoding, + ALUI & obj.AMask: uEncoding, + AAUIPC & obj.AMask: uEncoding, + AADD & obj.AMask: rIIIEncoding, +@@ -1573,7 +1573,7 @@ var encodings = [ALAST & obj.AMask]encoding{ + + // 2.5: Control Transfer Instructions + AJAL & obj.AMask: jEncoding, +- AJALR & obj.AMask: iIEncoding, ++ AJALR & obj.AMask: iIIEncoding, + ABEQ & obj.AMask: bEncoding, + ABNE & obj.AMask: bEncoding, + ABLT & obj.AMask: bEncoding, +@@ -1582,24 +1582,24 @@ var encodings = [ALAST & obj.AMask]encoding{ + ABGEU & obj.AMask: bEncoding, + + // 2.6: Load and Store Instructions +- ALW & obj.AMask: iIEncoding, +- ALWU & obj.AMask: iIEncoding, +- ALH & obj.AMask: iIEncoding, +- ALHU & obj.AMask: iIEncoding, +- ALB & obj.AMask: iIEncoding, +- ALBU & obj.AMask: iIEncoding, ++ ALW & obj.AMask: iIIEncoding, ++ ALWU & obj.AMask: iIIEncoding, ++ ALH & obj.AMask: iIIEncoding, ++ ALHU & obj.AMask: iIIEncoding, ++ ALB & obj.AMask: iIIEncoding, 
++ ALBU & obj.AMask: iIIEncoding, + ASW & obj.AMask: sIEncoding, + ASH & obj.AMask: sIEncoding, + ASB & obj.AMask: sIEncoding, + + // 2.7: Memory Ordering +- AFENCE & obj.AMask: iIEncoding, ++ AFENCE & obj.AMask: iIIEncoding, + + // 5.2: Integer Computational Instructions (RV64I) +- AADDIW & obj.AMask: iIEncoding, +- ASLLIW & obj.AMask: iIEncoding, +- ASRLIW & obj.AMask: iIEncoding, +- ASRAIW & obj.AMask: iIEncoding, ++ AADDIW & obj.AMask: iIIEncoding, ++ ASLLIW & obj.AMask: iIIEncoding, ++ ASRLIW & obj.AMask: iIIEncoding, ++ ASRAIW & obj.AMask: iIIEncoding, + AADDW & obj.AMask: rIIIEncoding, + ASLLW & obj.AMask: rIIIEncoding, + ASRLW & obj.AMask: rIIIEncoding, +@@ -1607,7 +1607,7 @@ var encodings = [ALAST & obj.AMask]encoding{ + ASRAW & obj.AMask: rIIIEncoding, + + // 5.3: Load and Store Instructions (RV64I) +- ALD & obj.AMask: iIEncoding, ++ ALD & obj.AMask: iIIEncoding, + ASD & obj.AMask: sIEncoding, + + // 7.1: Multiplication Operations +@@ -1652,9 +1652,9 @@ var encodings = [ALAST & obj.AMask]encoding{ + AAMOMINUD & obj.AMask: rIIIEncoding, + + // 10.1: Base Counters and Timers +- ARDCYCLE & obj.AMask: iIEncoding, +- ARDTIME & obj.AMask: iIEncoding, +- ARDINSTRET & obj.AMask: iIEncoding, ++ ARDCYCLE & obj.AMask: iIIEncoding, ++ ARDTIME & obj.AMask: iIIEncoding, ++ ARDINSTRET & obj.AMask: iIIEncoding, + + // 11.5: Single-Precision Load and Store Instructions + AFLW & obj.AMask: iFEncoding, +@@ -1743,8 +1743,8 @@ var encodings = [ALAST & obj.AMask]encoding{ + // Privileged ISA + + // 3.2.1: Environment Call and Breakpoint +- AECALL & obj.AMask: iIEncoding, +- AEBREAK & obj.AMask: iIEncoding, ++ AECALL & obj.AMask: iIIEncoding, ++ AEBREAK & obj.AMask: iIIEncoding, + + // + // RISC-V Bit-Manipulation ISA-extensions (1.0) +@@ -1758,7 +1758,7 @@ var encodings = [ALAST & obj.AMask]encoding{ + ASH2ADDUW & obj.AMask: rIIIEncoding, + ASH3ADD & obj.AMask: rIIIEncoding, + ASH3ADDUW & obj.AMask: rIIIEncoding, +- ASLLIUW & obj.AMask: iIEncoding, ++ ASLLIUW & obj.AMask: iIIEncoding, + + // 1.2: Basic Bit Manipulation (Zbb) + AANDN & obj.AMask: rIIIEncoding, +@@ -1782,21 +1782,21 @@ var encodings = [ALAST & obj.AMask]encoding{ + AROL & obj.AMask: rIIIEncoding, + AROLW & obj.AMask: rIIIEncoding, + AROR & obj.AMask: rIIIEncoding, +- ARORI & obj.AMask: iIEncoding, +- ARORIW & obj.AMask: iIEncoding, ++ ARORI & obj.AMask: iIIEncoding, ++ ARORIW & obj.AMask: iIIEncoding, + ARORW & obj.AMask: rIIIEncoding, +- AORCB & obj.AMask: iIEncoding, +- AREV8 & obj.AMask: iIEncoding, ++ AORCB & obj.AMask: iIIEncoding, ++ AREV8 & obj.AMask: iIIEncoding, + + // 1.5: Single-bit Instructions (Zbs) + ABCLR & obj.AMask: rIIIEncoding, +- ABCLRI & obj.AMask: iIEncoding, ++ ABCLRI & obj.AMask: iIIEncoding, + ABEXT & obj.AMask: rIIIEncoding, +- ABEXTI & obj.AMask: iIEncoding, ++ ABEXTI & obj.AMask: iIIEncoding, + ABINV & obj.AMask: rIIIEncoding, +- ABINVI & obj.AMask: iIEncoding, ++ ABINVI & obj.AMask: iIIEncoding, + ABSET & obj.AMask: rIIIEncoding, +- ABSETI & obj.AMask: iIEncoding, ++ ABSETI & obj.AMask: iIIEncoding, + + // Escape hatch + AWORD & obj.AMask: rawEncoding, +-- +2.39.5 + diff --git a/2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch b/2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch new file mode 100644 index 0000000..c63fb18 --- /dev/null +++ b/2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch @@ -0,0 +1,2444 @@ +From b8fe27a30aaf17fbb06a4ca6f43a2ac74086327e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 056/119] 
cmd/internal/obj/riscv: add vector instruction + encodings +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Regenerate the riscv instruction encoding table with the V extension +enabled. Add constants and names for the resulting 375 instructions. + +Change-Id: Icce688493aeb1e9880fb76a0618643f57e481273 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595403 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: 鹏程汪 +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +Reviewed-by: Michael Pratt +--- + src/cmd/internal/obj/riscv/anames.go | 375 ++++++++ + src/cmd/internal/obj/riscv/cpu.go | 491 ++++++++++ + src/cmd/internal/obj/riscv/inst.go | 1255 ++++++++++++++++++++------ + 3 files changed, 1869 insertions(+), 252 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index 60c7b48620..53cf1c95dc 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -257,6 +257,381 @@ var Anames = []string{ + "BINVI", + "BSET", + "BSETI", ++ "VSETVLI", ++ "VSETIVLI", ++ "VSETVL", ++ "VLE8V", ++ "VLE16V", ++ "VLE32V", ++ "VLE64V", ++ "VSE8V", ++ "VSE16V", ++ "VSE32V", ++ "VSE64V", ++ "VLMV", ++ "VSMV", ++ "VLSE8V", ++ "VLSE16V", ++ "VLSE32V", ++ "VLSE64V", ++ "VSSE8V", ++ "VSSE16V", ++ "VSSE32V", ++ "VSSE64V", ++ "VLUXEI8V", ++ "VLUXEI16V", ++ "VLUXEI32V", ++ "VLUXEI64V", ++ "VLOXEI8V", ++ "VLOXEI16V", ++ "VLOXEI32V", ++ "VLOXEI64V", ++ "VSUXEI8V", ++ "VSUXEI16V", ++ "VSUXEI32V", ++ "VSUXEI64V", ++ "VSOXEI8V", ++ "VSOXEI16V", ++ "VSOXEI32V", ++ "VSOXEI64V", ++ "VLE8FFV", ++ "VLE16FFV", ++ "VLE32FFV", ++ "VLE64FFV", ++ "VL1RE8V", ++ "VL1RE16V", ++ "VL1RE32V", ++ "VL1RE64V", ++ "VL2RE8V", ++ "VL2RE16V", ++ "VL2RE32V", ++ "VL2RE64V", ++ "VL4RE8V", ++ "VL4RE16V", ++ "VL4RE32V", ++ "VL4RE64V", ++ "VL8RE8V", ++ "VL8RE16V", ++ "VL8RE32V", ++ "VL8RE64V", ++ "VS1RV", ++ "VS2RV", ++ "VS4RV", ++ "VS8RV", ++ "VADDVV", ++ "VADDVX", ++ "VADDVI", ++ "VSUBVV", ++ "VSUBVX", ++ "VRSUBVX", ++ "VRSUBVI", ++ "VWADDUVV", ++ "VWADDUVX", ++ "VWSUBUVV", ++ "VWSUBUVX", ++ "VWADDVV", ++ "VWADDVX", ++ "VWSUBVV", ++ "VWSUBVX", ++ "VWADDUWV", ++ "VWADDUWX", ++ "VWSUBUWV", ++ "VWSUBUWX", ++ "VWADDWV", ++ "VWADDWX", ++ "VWSUBWV", ++ "VWSUBWX", ++ "VZEXTVF2", ++ "VSEXTVF2", ++ "VZEXTVF4", ++ "VSEXTVF4", ++ "VZEXTVF8", ++ "VSEXTVF8", ++ "VADCVVM", ++ "VADCVXM", ++ "VADCVIM", ++ "VMADCVVM", ++ "VMADCVXM", ++ "VMADCVIM", ++ "VMADCVV", ++ "VMADCVX", ++ "VMADCVI", ++ "VSBCVVM", ++ "VSBCVXM", ++ "VMSBCVVM", ++ "VMSBCVXM", ++ "VMSBCVV", ++ "VMSBCVX", ++ "VANDVV", ++ "VANDVX", ++ "VANDVI", ++ "VORVV", ++ "VORVX", ++ "VORVI", ++ "VXORVV", ++ "VXORVX", ++ "VXORVI", ++ "VSLLVV", ++ "VSLLVX", ++ "VSLLVI", ++ "VSRLVV", ++ "VSRLVX", ++ "VSRLVI", ++ "VSRAVV", ++ "VSRAVX", ++ "VSRAVI", ++ "VNSRLWV", ++ "VNSRLWX", ++ "VNSRLWI", ++ "VNSRAWV", ++ "VNSRAWX", ++ "VNSRAWI", ++ "VMSEQVV", ++ "VMSEQVX", ++ "VMSEQVI", ++ "VMSNEVV", ++ "VMSNEVX", ++ "VMSNEVI", ++ "VMSLTUVV", ++ "VMSLTUVX", ++ "VMSLTVV", ++ "VMSLTVX", ++ "VMSLEUVV", ++ "VMSLEUVX", ++ "VMSLEUVI", ++ "VMSLEVV", ++ "VMSLEVX", ++ "VMSLEVI", ++ "VMSGTUVX", ++ "VMSGTUVI", ++ "VMSGTVX", ++ "VMSGTVI", ++ "VMINUVV", ++ "VMINUVX", ++ "VMINVV", ++ "VMINVX", ++ "VMAXUVV", ++ "VMAXUVX", ++ "VMAXVV", ++ "VMAXVX", ++ "VMULVV", ++ "VMULVX", ++ "VMULHVV", ++ "VMULHVX", ++ "VMULHUVV", ++ "VMULHUVX", ++ "VMULHSUVV", ++ "VMULHSUVX", ++ "VDIVUVV", ++ "VDIVUVX", ++ "VDIVVV", ++ "VDIVVX", ++ "VREMUVV", ++ "VREMUVX", ++ "VREMVV", ++ "VREMVX", ++ "VWMULVV", ++ 
"VWMULVX", ++ "VWMULUVV", ++ "VWMULUVX", ++ "VWMULSUVV", ++ "VWMULSUVX", ++ "VMACCVV", ++ "VMACCVX", ++ "VNMSACVV", ++ "VNMSACVX", ++ "VMADDVV", ++ "VMADDVX", ++ "VNMSUBVV", ++ "VNMSUBVX", ++ "VWMACCUVV", ++ "VWMACCUVX", ++ "VWMACCVV", ++ "VWMACCVX", ++ "VWMACCSUVV", ++ "VWMACCSUVX", ++ "VWMACCUSVX", ++ "VMERGEVVM", ++ "VMERGEVXM", ++ "VMERGEVIM", ++ "VMVVV", ++ "VMVVX", ++ "VMVVI", ++ "VSADDUVV", ++ "VSADDUVX", ++ "VSADDUVI", ++ "VSADDVV", ++ "VSADDVX", ++ "VSADDVI", ++ "VSSUBUVV", ++ "VSSUBUVX", ++ "VSSUBVV", ++ "VSSUBVX", ++ "VAADDUVV", ++ "VAADDUVX", ++ "VAADDVV", ++ "VAADDVX", ++ "VASUBUVV", ++ "VASUBUVX", ++ "VASUBVV", ++ "VASUBVX", ++ "VSMULVV", ++ "VSMULVX", ++ "VSSRLVV", ++ "VSSRLVX", ++ "VSSRLVI", ++ "VSSRAVV", ++ "VSSRAVX", ++ "VSSRAVI", ++ "VNCLIPUWV", ++ "VNCLIPUWX", ++ "VNCLIPUWI", ++ "VNCLIPWV", ++ "VNCLIPWX", ++ "VNCLIPWI", ++ "VFADDVV", ++ "VFADDVF", ++ "VFSUBVV", ++ "VFSUBVF", ++ "VFRSUBVF", ++ "VFWADDVV", ++ "VFWADDVF", ++ "VFWSUBVV", ++ "VFWSUBVF", ++ "VFWADDWV", ++ "VFWADDWF", ++ "VFWSUBWV", ++ "VFWSUBWF", ++ "VFMULVV", ++ "VFMULVF", ++ "VFDIVVV", ++ "VFDIVVF", ++ "VFRDIVVF", ++ "VFWMULVV", ++ "VFWMULVF", ++ "VFMACCVV", ++ "VFMACCVF", ++ "VFNMACCVV", ++ "VFNMACCVF", ++ "VFMSACVV", ++ "VFMSACVF", ++ "VFNMSACVV", ++ "VFNMSACVF", ++ "VFMADDVV", ++ "VFMADDVF", ++ "VFNMADDVV", ++ "VFNMADDVF", ++ "VFMSUBVV", ++ "VFMSUBVF", ++ "VFNMSUBVV", ++ "VFNMSUBVF", ++ "VFWMACCVV", ++ "VFWMACCVF", ++ "VFWNMACCVV", ++ "VFWNMACCVF", ++ "VFWMSACVV", ++ "VFWMSACVF", ++ "VFWNMSACVV", ++ "VFWNMSACVF", ++ "VFSQRTV", ++ "VFRSQRT7V", ++ "VFREC7V", ++ "VFMINVV", ++ "VFMINVF", ++ "VFMAXVV", ++ "VFMAXVF", ++ "VFSGNJVV", ++ "VFSGNJVF", ++ "VFSGNJNVV", ++ "VFSGNJNVF", ++ "VFSGNJXVV", ++ "VFSGNJXVF", ++ "VMFEQVV", ++ "VMFEQVF", ++ "VMFNEVV", ++ "VMFNEVF", ++ "VMFLTVV", ++ "VMFLTVF", ++ "VMFLEVV", ++ "VMFLEVF", ++ "VMFGTVF", ++ "VMFGEVF", ++ "VFCLASSV", ++ "VFMERGEVFM", ++ "VFMVVF", ++ "VFCVTXUFV", ++ "VFCVTXFV", ++ "VFCVTRTZXUFV", ++ "VFCVTRTZXFV", ++ "VFCVTFXUV", ++ "VFCVTFXV", ++ "VFWCVTXUFV", ++ "VFWCVTXFV", ++ "VFWCVTRTZXUFV", ++ "VFWCVTRTZXFV", ++ "VFWCVTFXUV", ++ "VFWCVTFXV", ++ "VFWCVTFFV", ++ "VFNCVTXUFW", ++ "VFNCVTXFW", ++ "VFNCVTRTZXUFW", ++ "VFNCVTRTZXFW", ++ "VFNCVTFXUW", ++ "VFNCVTFXW", ++ "VFNCVTFFW", ++ "VFNCVTRODFFW", ++ "VREDSUMVS", ++ "VREDMAXUVS", ++ "VREDMAXVS", ++ "VREDMINUVS", ++ "VREDMINVS", ++ "VREDANDVS", ++ "VREDORVS", ++ "VREDXORVS", ++ "VWREDSUMUVS", ++ "VWREDSUMVS", ++ "VFREDOSUMVS", ++ "VFREDUSUMVS", ++ "VFREDMAXVS", ++ "VFREDMINVS", ++ "VFWREDOSUMVS", ++ "VFWREDUSUMVS", ++ "VMANDMM", ++ "VMNANDMM", ++ "VMANDNMM", ++ "VMXORMM", ++ "VMORMM", ++ "VMNORMM", ++ "VMORNMM", ++ "VMXNORMM", ++ "VCPOPM", ++ "VFIRSTM", ++ "VMSBFM", ++ "VMSIFM", ++ "VMSOFM", ++ "VIOTAM", ++ "VIDV", ++ "VMVXS", ++ "VMVSX", ++ "VFMVFS", ++ "VFMVSF", ++ "VSLIDEUPVX", ++ "VSLIDEUPVI", ++ "VSLIDEDOWNVX", ++ "VSLIDEDOWNVI", ++ "VSLIDE1UPVX", ++ "VFSLIDE1UPVF", ++ "VSLIDE1DOWNVX", ++ "VFSLIDE1DOWNVF", ++ "VRGATHERVV", ++ "VRGATHEREI16VV", ++ "VRGATHERVX", ++ "VRGATHERVI", ++ "VCOMPRESSVM", ++ "VMV1RV", ++ "VMV2RV", ++ "VMV4RV", ++ "VMV8RV", + "WORD", + "BEQZ", + "BGEZ", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 07d5ccff87..8b620b8646 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -619,6 +619,497 @@ const ( + ABSET + ABSETI + ++ // ++ // RISC-V Vector ISA-extension (1.0) (Unprivileged 20240411) ++ // ++ ++ // 31.6. Configuration-Setting Instructions ++ AVSETVLI ++ AVSETIVLI ++ AVSETVL ++ ++ // 31.7.4. 
Vector Unit-Stride Instructions ++ AVLE8V ++ AVLE16V ++ AVLE32V ++ AVLE64V ++ AVSE8V ++ AVSE16V ++ AVSE32V ++ AVSE64V ++ AVLMV ++ AVSMV ++ ++ // 31.7.5. Vector Strided Instructions ++ AVLSE8V ++ AVLSE16V ++ AVLSE32V ++ AVLSE64V ++ AVSSE8V ++ AVSSE16V ++ AVSSE32V ++ AVSSE64V ++ ++ // 31.7.6. Vector Indexed Instructions ++ AVLUXEI8V ++ AVLUXEI16V ++ AVLUXEI32V ++ AVLUXEI64V ++ AVLOXEI8V ++ AVLOXEI16V ++ AVLOXEI32V ++ AVLOXEI64V ++ AVSUXEI8V ++ AVSUXEI16V ++ AVSUXEI32V ++ AVSUXEI64V ++ AVSOXEI8V ++ AVSOXEI16V ++ AVSOXEI32V ++ AVSOXEI64V ++ ++ // 31.7.7. Unit-stride Fault-Only-First Loads ++ AVLE8FFV ++ AVLE16FFV ++ AVLE32FFV ++ AVLE64FFV ++ ++ // 31.7.9. Vector Load/Store Whole Register Instructions ++ AVL1RE8V ++ AVL1RE16V ++ AVL1RE32V ++ AVL1RE64V ++ AVL2RE8V ++ AVL2RE16V ++ AVL2RE32V ++ AVL2RE64V ++ AVL4RE8V ++ AVL4RE16V ++ AVL4RE32V ++ AVL4RE64V ++ AVL8RE8V ++ AVL8RE16V ++ AVL8RE32V ++ AVL8RE64V ++ AVS1RV ++ AVS2RV ++ AVS4RV ++ AVS8RV ++ ++ // 31.11.1. Vector Single-Width Integer Add and Subtract ++ AVADDVV ++ AVADDVX ++ AVADDVI ++ AVSUBVV ++ AVSUBVX ++ AVRSUBVX ++ AVRSUBVI ++ ++ // 31.11.2. Vector Widening Integer Add/Subtract ++ AVWADDUVV ++ AVWADDUVX ++ AVWSUBUVV ++ AVWSUBUVX ++ AVWADDVV ++ AVWADDVX ++ AVWSUBVV ++ AVWSUBVX ++ AVWADDUWV ++ AVWADDUWX ++ AVWSUBUWV ++ AVWSUBUWX ++ AVWADDWV ++ AVWADDWX ++ AVWSUBWV ++ AVWSUBWX ++ ++ // 31.11.3. Vector Integer Extension ++ AVZEXTVF2 ++ AVSEXTVF2 ++ AVZEXTVF4 ++ AVSEXTVF4 ++ AVZEXTVF8 ++ AVSEXTVF8 ++ ++ // 31.11.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions ++ AVADCVVM ++ AVADCVXM ++ AVADCVIM ++ AVMADCVVM ++ AVMADCVXM ++ AVMADCVIM ++ AVMADCVV ++ AVMADCVX ++ AVMADCVI ++ AVSBCVVM ++ AVSBCVXM ++ AVMSBCVVM ++ AVMSBCVXM ++ AVMSBCVV ++ AVMSBCVX ++ ++ // 31.11.5. Vector Bitwise Logical Instructions ++ AVANDVV ++ AVANDVX ++ AVANDVI ++ AVORVV ++ AVORVX ++ AVORVI ++ AVXORVV ++ AVXORVX ++ AVXORVI ++ ++ // 31.11.6. Vector Single-Width Shift Instructions ++ AVSLLVV ++ AVSLLVX ++ AVSLLVI ++ AVSRLVV ++ AVSRLVX ++ AVSRLVI ++ AVSRAVV ++ AVSRAVX ++ AVSRAVI ++ ++ // 31.11.7. Vector Narrowing Integer Right Shift Instructions ++ AVNSRLWV ++ AVNSRLWX ++ AVNSRLWI ++ AVNSRAWV ++ AVNSRAWX ++ AVNSRAWI ++ ++ // 31.11.8. Vector Integer Compare Instructions ++ AVMSEQVV ++ AVMSEQVX ++ AVMSEQVI ++ AVMSNEVV ++ AVMSNEVX ++ AVMSNEVI ++ AVMSLTUVV ++ AVMSLTUVX ++ AVMSLTVV ++ AVMSLTVX ++ AVMSLEUVV ++ AVMSLEUVX ++ AVMSLEUVI ++ AVMSLEVV ++ AVMSLEVX ++ AVMSLEVI ++ AVMSGTUVX ++ AVMSGTUVI ++ AVMSGTVX ++ AVMSGTVI ++ ++ // 31.11.9. Vector Integer Min/Max Instructions ++ AVMINUVV ++ AVMINUVX ++ AVMINVV ++ AVMINVX ++ AVMAXUVV ++ AVMAXUVX ++ AVMAXVV ++ AVMAXVX ++ ++ // 31.11.10. Vector Single-Width Integer Multiply Instructions ++ AVMULVV ++ AVMULVX ++ AVMULHVV ++ AVMULHVX ++ AVMULHUVV ++ AVMULHUVX ++ AVMULHSUVV ++ AVMULHSUVX ++ ++ // 31.11.11. Vector Integer Divide Instructions ++ AVDIVUVV ++ AVDIVUVX ++ AVDIVVV ++ AVDIVVX ++ AVREMUVV ++ AVREMUVX ++ AVREMVV ++ AVREMVX ++ ++ // 31.11.12. Vector Widening Integer Multiply Instructions ++ AVWMULVV ++ AVWMULVX ++ AVWMULUVV ++ AVWMULUVX ++ AVWMULSUVV ++ AVWMULSUVX ++ ++ // 31.11.13. Vector Single-Width Integer Multiply-Add Instructions ++ AVMACCVV ++ AVMACCVX ++ AVNMSACVV ++ AVNMSACVX ++ AVMADDVV ++ AVMADDVX ++ AVNMSUBVV ++ AVNMSUBVX ++ ++ // 31.11.14. Vector Widening Integer Multiply-Add Instructions ++ AVWMACCUVV ++ AVWMACCUVX ++ AVWMACCVV ++ AVWMACCVX ++ AVWMACCSUVV ++ AVWMACCSUVX ++ AVWMACCUSVX ++ ++ // 31.11.15. Vector Integer Merge Instructions ++ AVMERGEVVM ++ AVMERGEVXM ++ AVMERGEVIM ++ ++ // 31.11.16. 
Vector Integer Move Instructions ++ AVMVVV ++ AVMVVX ++ AVMVVI ++ ++ // 31.12.1. Vector Single-Width Saturating Add and Subtract ++ AVSADDUVV ++ AVSADDUVX ++ AVSADDUVI ++ AVSADDVV ++ AVSADDVX ++ AVSADDVI ++ AVSSUBUVV ++ AVSSUBUVX ++ AVSSUBVV ++ AVSSUBVX ++ ++ // 31.12.2. Vector Single-Width Averaging Add and Subtract ++ AVAADDUVV ++ AVAADDUVX ++ AVAADDVV ++ AVAADDVX ++ AVASUBUVV ++ AVASUBUVX ++ AVASUBVV ++ AVASUBVX ++ ++ // 31.12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation ++ AVSMULVV ++ AVSMULVX ++ ++ // 31.12.4. Vector Single-Width Scaling Shift Instructions ++ AVSSRLVV ++ AVSSRLVX ++ AVSSRLVI ++ AVSSRAVV ++ AVSSRAVX ++ AVSSRAVI ++ ++ // 31.12.5. Vector Narrowing Fixed-Point Clip Instructions ++ AVNCLIPUWV ++ AVNCLIPUWX ++ AVNCLIPUWI ++ AVNCLIPWV ++ AVNCLIPWX ++ AVNCLIPWI ++ ++ // 31.13.2. Vector Single-Width Floating-Point Add/Subtract Instructions ++ AVFADDVV ++ AVFADDVF ++ AVFSUBVV ++ AVFSUBVF ++ AVFRSUBVF ++ ++ // 31.13.3. Vector Widening Floating-Point Add/Subtract Instructions ++ AVFWADDVV ++ AVFWADDVF ++ AVFWSUBVV ++ AVFWSUBVF ++ AVFWADDWV ++ AVFWADDWF ++ AVFWSUBWV ++ AVFWSUBWF ++ ++ // 31.13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions ++ AVFMULVV ++ AVFMULVF ++ AVFDIVVV ++ AVFDIVVF ++ AVFRDIVVF ++ ++ // 31.13.5. Vector Widening Floating-Point Multiply ++ AVFWMULVV ++ AVFWMULVF ++ ++ // 31.13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions ++ AVFMACCVV ++ AVFMACCVF ++ AVFNMACCVV ++ AVFNMACCVF ++ AVFMSACVV ++ AVFMSACVF ++ AVFNMSACVV ++ AVFNMSACVF ++ AVFMADDVV ++ AVFMADDVF ++ AVFNMADDVV ++ AVFNMADDVF ++ AVFMSUBVV ++ AVFMSUBVF ++ AVFNMSUBVV ++ AVFNMSUBVF ++ ++ // 31.13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions ++ AVFWMACCVV ++ AVFWMACCVF ++ AVFWNMACCVV ++ AVFWNMACCVF ++ AVFWMSACVV ++ AVFWMSACVF ++ AVFWNMSACVV ++ AVFWNMSACVF ++ ++ // 31.13.8. Vector Floating-Point Square-Root Instruction ++ AVFSQRTV ++ ++ // 31.13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction ++ AVFRSQRT7V ++ ++ // 31.13.10. Vector Floating-Point Reciprocal Estimate Instruction ++ AVFREC7V ++ ++ // 31.13.11. Vector Floating-Point MIN/MAX Instructions ++ AVFMINVV ++ AVFMINVF ++ AVFMAXVV ++ AVFMAXVF ++ ++ // 31.13.12. Vector Floating-Point Sign-Injection Instructions ++ AVFSGNJVV ++ AVFSGNJVF ++ AVFSGNJNVV ++ AVFSGNJNVF ++ AVFSGNJXVV ++ AVFSGNJXVF ++ ++ // 31.13.13. Vector Floating-Point Compare Instructions ++ AVMFEQVV ++ AVMFEQVF ++ AVMFNEVV ++ AVMFNEVF ++ AVMFLTVV ++ AVMFLTVF ++ AVMFLEVV ++ AVMFLEVF ++ AVMFGTVF ++ AVMFGEVF ++ ++ // 31.13.14. Vector Floating-Point Classify Instruction ++ AVFCLASSV ++ ++ // 31.13.15. Vector Floating-Point Merge Instruction ++ AVFMERGEVFM ++ ++ // 31.13.16. Vector Floating-Point Move Instruction ++ AVFMVVF ++ ++ // 31.13.17. Single-Width Floating-Point/Integer Type-Convert Instructions ++ AVFCVTXUFV ++ AVFCVTXFV ++ AVFCVTRTZXUFV ++ AVFCVTRTZXFV ++ AVFCVTFXUV ++ AVFCVTFXV ++ ++ // 31.13.18. Widening Floating-Point/Integer Type-Convert Instructions ++ AVFWCVTXUFV ++ AVFWCVTXFV ++ AVFWCVTRTZXUFV ++ AVFWCVTRTZXFV ++ AVFWCVTFXUV ++ AVFWCVTFXV ++ AVFWCVTFFV ++ ++ // 31.13.19. Narrowing Floating-Point/Integer Type-Convert Instructions ++ AVFNCVTXUFW ++ AVFNCVTXFW ++ AVFNCVTRTZXUFW ++ AVFNCVTRTZXFW ++ AVFNCVTFXUW ++ AVFNCVTFXW ++ AVFNCVTFFW ++ AVFNCVTRODFFW ++ ++ // 31.14.1. Vector Single-Width Integer Reduction Instructions ++ AVREDSUMVS ++ AVREDMAXUVS ++ AVREDMAXVS ++ AVREDMINUVS ++ AVREDMINVS ++ AVREDANDVS ++ AVREDORVS ++ AVREDXORVS ++ ++ // 31.14.2. 
Vector Widening Integer Reduction Instructions ++ AVWREDSUMUVS ++ AVWREDSUMVS ++ ++ // 31.14.3. Vector Single-Width Floating-Point Reduction Instructions ++ AVFREDOSUMVS ++ AVFREDUSUMVS ++ AVFREDMAXVS ++ AVFREDMINVS ++ ++ // 31.14.4. Vector Widening Floating-Point Reduction Instructions ++ AVFWREDOSUMVS ++ AVFWREDUSUMVS ++ ++ // 31.15. Vector Mask Instructions ++ AVMANDMM ++ AVMNANDMM ++ AVMANDNMM ++ AVMXORMM ++ AVMORMM ++ AVMNORMM ++ AVMORNMM ++ AVMXNORMM ++ AVCPOPM ++ AVFIRSTM ++ AVMSBFM ++ AVMSIFM ++ AVMSOFM ++ AVIOTAM ++ AVIDV ++ ++ // 31.16.1. Integer Scalar Move Instructions ++ AVMVXS ++ AVMVSX ++ ++ // 31.16.2. Floating-Point Scalar Move Instructions ++ AVFMVFS ++ AVFMVSF ++ ++ // 31.16.3. Vector Slide Instructions ++ AVSLIDEUPVX ++ AVSLIDEUPVI ++ AVSLIDEDOWNVX ++ AVSLIDEDOWNVI ++ AVSLIDE1UPVX ++ AVFSLIDE1UPVF ++ AVSLIDE1DOWNVX ++ AVFSLIDE1DOWNVF ++ ++ // 31.16.4. Vector Register Gather Instructions ++ AVRGATHERVV ++ AVRGATHEREI16VV ++ AVRGATHERVX ++ AVRGATHERVI ++ ++ // 31.16.5. Vector Compress Instruction ++ AVCOMPRESSVM ++ ++ // 31.16.6. Whole Vector Register Move ++ AVMV1RV ++ AVMV2RV ++ AVMV4RV ++ AVMV8RV ++ + // The escape hatch. Inserts a single 32-bit word. + AWORD + +diff --git a/src/cmd/internal/obj/riscv/inst.go b/src/cmd/internal/obj/riscv/inst.go +index 223ddd15b2..c264f6ae15 100644 +--- a/src/cmd/internal/obj/riscv/inst.go ++++ b/src/cmd/internal/obj/riscv/inst.go +@@ -1,4 +1,4 @@ +-// Code generated by ./parse.py -go rv64_a rv64_d rv64_f rv64_i rv64_m rv64_q rv64_zba rv64_zbb rv64_zbs rv_a rv_d rv_f rv_i rv_m rv_q rv_zba rv_zbb rv_zbs rv_s rv_system rv_zicsr; DO NOT EDIT. ++// Code generated by ./parse.py -go rv64_a rv64_d rv64_f rv64_i rv64_m rv64_q rv64_zba rv64_zbb rv64_zbs rv_a rv_d rv_f rv_i rv_m rv_q rv_s rv_system rv_v rv_zba rv_zbb rv_zbs rv_zicsr; DO NOT EDIT. 
+ package riscv + + import "cmd/internal/obj" +@@ -6,6 +6,7 @@ import "cmd/internal/obj" + type inst struct { + opcode uint32 + funct3 uint32 ++ rs1 uint32 + rs2 uint32 + csr int64 + funct7 uint32 +@@ -14,507 +15,1257 @@ type inst struct { + func encode(a obj.As) *inst { + switch a { + case AADD: +- return &inst{0x33, 0x0, 0x0, 0, 0x0} ++ return &inst{0x33, 0x0, 0x0, 0x0, 0, 0x0} + case AADDUW: +- return &inst{0x3b, 0x0, 0x0, 128, 0x4} ++ return &inst{0x3b, 0x0, 0x0, 0x0, 128, 0x4} + case AADDI: +- return &inst{0x13, 0x0, 0x0, 0, 0x0} ++ return &inst{0x13, 0x0, 0x0, 0x0, 0, 0x0} + case AADDIW: +- return &inst{0x1b, 0x0, 0x0, 0, 0x0} ++ return &inst{0x1b, 0x0, 0x0, 0x0, 0, 0x0} + case AADDW: +- return &inst{0x3b, 0x0, 0x0, 0, 0x0} ++ return &inst{0x3b, 0x0, 0x0, 0x0, 0, 0x0} + case AAMOADDD: +- return &inst{0x2f, 0x3, 0x0, 0, 0x0} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 0, 0x0} + case AAMOADDW: +- return &inst{0x2f, 0x2, 0x0, 0, 0x0} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 0, 0x0} + case AAMOANDD: +- return &inst{0x2f, 0x3, 0x0, 1536, 0x30} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 1536, 0x30} + case AAMOANDW: +- return &inst{0x2f, 0x2, 0x0, 1536, 0x30} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 1536, 0x30} + case AAMOMAXD: +- return &inst{0x2f, 0x3, 0x0, -1536, 0x50} ++ return &inst{0x2f, 0x3, 0x0, 0x0, -1536, 0x50} + case AAMOMAXW: +- return &inst{0x2f, 0x2, 0x0, -1536, 0x50} ++ return &inst{0x2f, 0x2, 0x0, 0x0, -1536, 0x50} + case AAMOMAXUD: +- return &inst{0x2f, 0x3, 0x0, -512, 0x70} ++ return &inst{0x2f, 0x3, 0x0, 0x0, -512, 0x70} + case AAMOMAXUW: +- return &inst{0x2f, 0x2, 0x0, -512, 0x70} ++ return &inst{0x2f, 0x2, 0x0, 0x0, -512, 0x70} + case AAMOMIND: +- return &inst{0x2f, 0x3, 0x0, -2048, 0x40} ++ return &inst{0x2f, 0x3, 0x0, 0x0, -2048, 0x40} + case AAMOMINW: +- return &inst{0x2f, 0x2, 0x0, -2048, 0x40} ++ return &inst{0x2f, 0x2, 0x0, 0x0, -2048, 0x40} + case AAMOMINUD: +- return &inst{0x2f, 0x3, 0x0, -1024, 0x60} ++ return &inst{0x2f, 0x3, 0x0, 0x0, -1024, 0x60} + case AAMOMINUW: +- return &inst{0x2f, 0x2, 0x0, -1024, 0x60} ++ return &inst{0x2f, 0x2, 0x0, 0x0, -1024, 0x60} + case AAMOORD: +- return &inst{0x2f, 0x3, 0x0, 1024, 0x20} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 1024, 0x20} + case AAMOORW: +- return &inst{0x2f, 0x2, 0x0, 1024, 0x20} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 1024, 0x20} + case AAMOSWAPD: +- return &inst{0x2f, 0x3, 0x0, 128, 0x4} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 128, 0x4} + case AAMOSWAPW: +- return &inst{0x2f, 0x2, 0x0, 128, 0x4} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 128, 0x4} + case AAMOXORD: +- return &inst{0x2f, 0x3, 0x0, 512, 0x10} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 512, 0x10} + case AAMOXORW: +- return &inst{0x2f, 0x2, 0x0, 512, 0x10} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 512, 0x10} + case AAND: +- return &inst{0x33, 0x7, 0x0, 0, 0x0} ++ return &inst{0x33, 0x7, 0x0, 0x0, 0, 0x0} + case AANDI: +- return &inst{0x13, 0x7, 0x0, 0, 0x0} ++ return &inst{0x13, 0x7, 0x0, 0x0, 0, 0x0} + case AANDN: +- return &inst{0x33, 0x7, 0x0, 1024, 0x20} ++ return &inst{0x33, 0x7, 0x0, 0x0, 1024, 0x20} + case AAUIPC: +- return &inst{0x17, 0x0, 0x0, 0, 0x0} ++ return &inst{0x17, 0x0, 0x0, 0x0, 0, 0x0} + case ABCLR: +- return &inst{0x33, 0x1, 0x0, 1152, 0x24} ++ return &inst{0x33, 0x1, 0x0, 0x0, 1152, 0x24} + case ABCLRI: +- return &inst{0x13, 0x1, 0x0, 1152, 0x24} ++ return &inst{0x13, 0x1, 0x0, 0x0, 1152, 0x24} + case ABEQ: +- return &inst{0x63, 0x0, 0x0, 0, 0x0} ++ return &inst{0x63, 0x0, 0x0, 0x0, 0, 0x0} + case ABEXT: +- return &inst{0x33, 0x5, 0x0, 1152, 0x24} ++ return &inst{0x33, 0x5, 
0x0, 0x0, 1152, 0x24} + case ABEXTI: +- return &inst{0x13, 0x5, 0x0, 1152, 0x24} ++ return &inst{0x13, 0x5, 0x0, 0x0, 1152, 0x24} + case ABGE: +- return &inst{0x63, 0x5, 0x0, 0, 0x0} ++ return &inst{0x63, 0x5, 0x0, 0x0, 0, 0x0} + case ABGEU: +- return &inst{0x63, 0x7, 0x0, 0, 0x0} ++ return &inst{0x63, 0x7, 0x0, 0x0, 0, 0x0} + case ABINV: +- return &inst{0x33, 0x1, 0x0, 1664, 0x34} ++ return &inst{0x33, 0x1, 0x0, 0x0, 1664, 0x34} + case ABINVI: +- return &inst{0x13, 0x1, 0x0, 1664, 0x34} ++ return &inst{0x13, 0x1, 0x0, 0x0, 1664, 0x34} + case ABLT: +- return &inst{0x63, 0x4, 0x0, 0, 0x0} ++ return &inst{0x63, 0x4, 0x0, 0x0, 0, 0x0} + case ABLTU: +- return &inst{0x63, 0x6, 0x0, 0, 0x0} ++ return &inst{0x63, 0x6, 0x0, 0x0, 0, 0x0} + case ABNE: +- return &inst{0x63, 0x1, 0x0, 0, 0x0} ++ return &inst{0x63, 0x1, 0x0, 0x0, 0, 0x0} + case ABSET: +- return &inst{0x33, 0x1, 0x0, 640, 0x14} ++ return &inst{0x33, 0x1, 0x0, 0x0, 640, 0x14} + case ABSETI: +- return &inst{0x13, 0x1, 0x0, 640, 0x14} ++ return &inst{0x13, 0x1, 0x0, 0x0, 640, 0x14} + case ACLZ: +- return &inst{0x13, 0x1, 0x0, 1536, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x0, 1536, 0x30} + case ACLZW: +- return &inst{0x1b, 0x1, 0x0, 1536, 0x30} ++ return &inst{0x1b, 0x1, 0x0, 0x0, 1536, 0x30} + case ACPOP: +- return &inst{0x13, 0x1, 0x2, 1538, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x2, 1538, 0x30} + case ACPOPW: +- return &inst{0x1b, 0x1, 0x2, 1538, 0x30} ++ return &inst{0x1b, 0x1, 0x0, 0x2, 1538, 0x30} + case ACSRRC: +- return &inst{0x73, 0x3, 0x0, 0, 0x0} ++ return &inst{0x73, 0x3, 0x0, 0x0, 0, 0x0} + case ACSRRCI: +- return &inst{0x73, 0x7, 0x0, 0, 0x0} ++ return &inst{0x73, 0x7, 0x0, 0x0, 0, 0x0} + case ACSRRS: +- return &inst{0x73, 0x2, 0x0, 0, 0x0} ++ return &inst{0x73, 0x2, 0x0, 0x0, 0, 0x0} + case ACSRRSI: +- return &inst{0x73, 0x6, 0x0, 0, 0x0} ++ return &inst{0x73, 0x6, 0x0, 0x0, 0, 0x0} + case ACSRRW: +- return &inst{0x73, 0x1, 0x0, 0, 0x0} ++ return &inst{0x73, 0x1, 0x0, 0x0, 0, 0x0} + case ACSRRWI: +- return &inst{0x73, 0x5, 0x0, 0, 0x0} ++ return &inst{0x73, 0x5, 0x0, 0x0, 0, 0x0} + case ACTZ: +- return &inst{0x13, 0x1, 0x1, 1537, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x1, 1537, 0x30} + case ACTZW: +- return &inst{0x1b, 0x1, 0x1, 1537, 0x30} ++ return &inst{0x1b, 0x1, 0x0, 0x1, 1537, 0x30} + case ADIV: +- return &inst{0x33, 0x4, 0x0, 32, 0x1} ++ return &inst{0x33, 0x4, 0x0, 0x0, 32, 0x1} + case ADIVU: +- return &inst{0x33, 0x5, 0x0, 32, 0x1} ++ return &inst{0x33, 0x5, 0x0, 0x0, 32, 0x1} + case ADIVUW: +- return &inst{0x3b, 0x5, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x5, 0x0, 0x0, 32, 0x1} + case ADIVW: +- return &inst{0x3b, 0x4, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x4, 0x0, 0x0, 32, 0x1} + case AEBREAK: +- return &inst{0x73, 0x0, 0x1, 1, 0x0} ++ return &inst{0x73, 0x0, 0x0, 0x1, 1, 0x0} + case AECALL: +- return &inst{0x73, 0x0, 0x0, 0, 0x0} ++ return &inst{0x73, 0x0, 0x0, 0x0, 0, 0x0} + case AFADDD: +- return &inst{0x53, 0x0, 0x0, 32, 0x1} ++ return &inst{0x53, 0x0, 0x0, 0x0, 32, 0x1} + case AFADDQ: +- return &inst{0x53, 0x0, 0x0, 96, 0x3} ++ return &inst{0x53, 0x0, 0x0, 0x0, 96, 0x3} + case AFADDS: +- return &inst{0x53, 0x0, 0x0, 0, 0x0} ++ return &inst{0x53, 0x0, 0x0, 0x0, 0, 0x0} + case AFCLASSD: +- return &inst{0x53, 0x1, 0x0, -480, 0x71} ++ return &inst{0x53, 0x1, 0x0, 0x0, -480, 0x71} + case AFCLASSQ: +- return &inst{0x53, 0x1, 0x0, -416, 0x73} ++ return &inst{0x53, 0x1, 0x0, 0x0, -416, 0x73} + case AFCLASSS: +- return &inst{0x53, 0x1, 0x0, -512, 0x70} ++ return &inst{0x53, 0x1, 0x0, 0x0, -512, 0x70} + case AFCVTDL: +- 
return &inst{0x53, 0x0, 0x2, -734, 0x69} ++ return &inst{0x53, 0x0, 0x0, 0x2, -734, 0x69} + case AFCVTDLU: +- return &inst{0x53, 0x0, 0x3, -733, 0x69} ++ return &inst{0x53, 0x0, 0x0, 0x3, -733, 0x69} + case AFCVTDQ: +- return &inst{0x53, 0x0, 0x3, 1059, 0x21} ++ return &inst{0x53, 0x0, 0x0, 0x3, 1059, 0x21} + case AFCVTDS: +- return &inst{0x53, 0x0, 0x0, 1056, 0x21} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1056, 0x21} + case AFCVTDW: +- return &inst{0x53, 0x0, 0x0, -736, 0x69} ++ return &inst{0x53, 0x0, 0x0, 0x0, -736, 0x69} + case AFCVTDWU: +- return &inst{0x53, 0x0, 0x1, -735, 0x69} ++ return &inst{0x53, 0x0, 0x0, 0x1, -735, 0x69} + case AFCVTLD: +- return &inst{0x53, 0x0, 0x2, -990, 0x61} ++ return &inst{0x53, 0x0, 0x0, 0x2, -990, 0x61} + case AFCVTLQ: +- return &inst{0x53, 0x0, 0x2, -926, 0x63} ++ return &inst{0x53, 0x0, 0x0, 0x2, -926, 0x63} + case AFCVTLS: +- return &inst{0x53, 0x0, 0x2, -1022, 0x60} ++ return &inst{0x53, 0x0, 0x0, 0x2, -1022, 0x60} + case AFCVTLUD: +- return &inst{0x53, 0x0, 0x3, -989, 0x61} ++ return &inst{0x53, 0x0, 0x0, 0x3, -989, 0x61} + case AFCVTLUQ: +- return &inst{0x53, 0x0, 0x3, -925, 0x63} ++ return &inst{0x53, 0x0, 0x0, 0x3, -925, 0x63} + case AFCVTLUS: +- return &inst{0x53, 0x0, 0x3, -1021, 0x60} ++ return &inst{0x53, 0x0, 0x0, 0x3, -1021, 0x60} + case AFCVTQD: +- return &inst{0x53, 0x0, 0x1, 1121, 0x23} ++ return &inst{0x53, 0x0, 0x0, 0x1, 1121, 0x23} + case AFCVTQL: +- return &inst{0x53, 0x0, 0x2, -670, 0x6b} ++ return &inst{0x53, 0x0, 0x0, 0x2, -670, 0x6b} + case AFCVTQLU: +- return &inst{0x53, 0x0, 0x3, -669, 0x6b} ++ return &inst{0x53, 0x0, 0x0, 0x3, -669, 0x6b} + case AFCVTQS: +- return &inst{0x53, 0x0, 0x0, 1120, 0x23} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1120, 0x23} + case AFCVTQW: +- return &inst{0x53, 0x0, 0x0, -672, 0x6b} ++ return &inst{0x53, 0x0, 0x0, 0x0, -672, 0x6b} + case AFCVTQWU: +- return &inst{0x53, 0x0, 0x1, -671, 0x6b} ++ return &inst{0x53, 0x0, 0x0, 0x1, -671, 0x6b} + case AFCVTSD: +- return &inst{0x53, 0x0, 0x1, 1025, 0x20} ++ return &inst{0x53, 0x0, 0x0, 0x1, 1025, 0x20} + case AFCVTSL: +- return &inst{0x53, 0x0, 0x2, -766, 0x68} ++ return &inst{0x53, 0x0, 0x0, 0x2, -766, 0x68} + case AFCVTSLU: +- return &inst{0x53, 0x0, 0x3, -765, 0x68} ++ return &inst{0x53, 0x0, 0x0, 0x3, -765, 0x68} + case AFCVTSQ: +- return &inst{0x53, 0x0, 0x3, 1027, 0x20} ++ return &inst{0x53, 0x0, 0x0, 0x3, 1027, 0x20} + case AFCVTSW: +- return &inst{0x53, 0x0, 0x0, -768, 0x68} ++ return &inst{0x53, 0x0, 0x0, 0x0, -768, 0x68} + case AFCVTSWU: +- return &inst{0x53, 0x0, 0x1, -767, 0x68} ++ return &inst{0x53, 0x0, 0x0, 0x1, -767, 0x68} + case AFCVTWD: +- return &inst{0x53, 0x0, 0x0, -992, 0x61} ++ return &inst{0x53, 0x0, 0x0, 0x0, -992, 0x61} + case AFCVTWQ: +- return &inst{0x53, 0x0, 0x0, -928, 0x63} ++ return &inst{0x53, 0x0, 0x0, 0x0, -928, 0x63} + case AFCVTWS: +- return &inst{0x53, 0x0, 0x0, -1024, 0x60} ++ return &inst{0x53, 0x0, 0x0, 0x0, -1024, 0x60} + case AFCVTWUD: +- return &inst{0x53, 0x0, 0x1, -991, 0x61} ++ return &inst{0x53, 0x0, 0x0, 0x1, -991, 0x61} + case AFCVTWUQ: +- return &inst{0x53, 0x0, 0x1, -927, 0x63} ++ return &inst{0x53, 0x0, 0x0, 0x1, -927, 0x63} + case AFCVTWUS: +- return &inst{0x53, 0x0, 0x1, -1023, 0x60} ++ return &inst{0x53, 0x0, 0x0, 0x1, -1023, 0x60} + case AFDIVD: +- return &inst{0x53, 0x0, 0x0, 416, 0xd} ++ return &inst{0x53, 0x0, 0x0, 0x0, 416, 0xd} + case AFDIVQ: +- return &inst{0x53, 0x0, 0x0, 480, 0xf} ++ return &inst{0x53, 0x0, 0x0, 0x0, 480, 0xf} + case AFDIVS: +- return &inst{0x53, 0x0, 0x0, 384, 0xc} ++ return 
&inst{0x53, 0x0, 0x0, 0x0, 384, 0xc} + case AFENCE: +- return &inst{0xf, 0x0, 0x0, 0, 0x0} ++ return &inst{0xf, 0x0, 0x0, 0x0, 0, 0x0} + case AFENCETSO: +- return &inst{0xf, 0x0, 0x13, -1997, 0x41} ++ return &inst{0xf, 0x0, 0x0, 0x13, -1997, 0x41} + case AFEQD: +- return &inst{0x53, 0x2, 0x0, -1504, 0x51} ++ return &inst{0x53, 0x2, 0x0, 0x0, -1504, 0x51} + case AFEQQ: +- return &inst{0x53, 0x2, 0x0, -1440, 0x53} ++ return &inst{0x53, 0x2, 0x0, 0x0, -1440, 0x53} + case AFEQS: +- return &inst{0x53, 0x2, 0x0, -1536, 0x50} ++ return &inst{0x53, 0x2, 0x0, 0x0, -1536, 0x50} + case AFLD: +- return &inst{0x7, 0x3, 0x0, 0, 0x0} ++ return &inst{0x7, 0x3, 0x0, 0x0, 0, 0x0} + case AFLED: +- return &inst{0x53, 0x0, 0x0, -1504, 0x51} ++ return &inst{0x53, 0x0, 0x0, 0x0, -1504, 0x51} + case AFLEQ: +- return &inst{0x53, 0x0, 0x0, -1440, 0x53} ++ return &inst{0x53, 0x0, 0x0, 0x0, -1440, 0x53} + case AFLES: +- return &inst{0x53, 0x0, 0x0, -1536, 0x50} ++ return &inst{0x53, 0x0, 0x0, 0x0, -1536, 0x50} + case AFLQ: +- return &inst{0x7, 0x4, 0x0, 0, 0x0} ++ return &inst{0x7, 0x4, 0x0, 0x0, 0, 0x0} + case AFLTD: +- return &inst{0x53, 0x1, 0x0, -1504, 0x51} ++ return &inst{0x53, 0x1, 0x0, 0x0, -1504, 0x51} + case AFLTQ: +- return &inst{0x53, 0x1, 0x0, -1440, 0x53} ++ return &inst{0x53, 0x1, 0x0, 0x0, -1440, 0x53} + case AFLTS: +- return &inst{0x53, 0x1, 0x0, -1536, 0x50} ++ return &inst{0x53, 0x1, 0x0, 0x0, -1536, 0x50} + case AFLW: +- return &inst{0x7, 0x2, 0x0, 0, 0x0} ++ return &inst{0x7, 0x2, 0x0, 0x0, 0, 0x0} + case AFMADDD: +- return &inst{0x43, 0x0, 0x0, 32, 0x1} ++ return &inst{0x43, 0x0, 0x0, 0x0, 32, 0x1} + case AFMADDQ: +- return &inst{0x43, 0x0, 0x0, 96, 0x3} ++ return &inst{0x43, 0x0, 0x0, 0x0, 96, 0x3} + case AFMADDS: +- return &inst{0x43, 0x0, 0x0, 0, 0x0} ++ return &inst{0x43, 0x0, 0x0, 0x0, 0, 0x0} + case AFMAXD: +- return &inst{0x53, 0x1, 0x0, 672, 0x15} ++ return &inst{0x53, 0x1, 0x0, 0x0, 672, 0x15} + case AFMAXQ: +- return &inst{0x53, 0x1, 0x0, 736, 0x17} ++ return &inst{0x53, 0x1, 0x0, 0x0, 736, 0x17} + case AFMAXS: +- return &inst{0x53, 0x1, 0x0, 640, 0x14} ++ return &inst{0x53, 0x1, 0x0, 0x0, 640, 0x14} + case AFMIND: +- return &inst{0x53, 0x0, 0x0, 672, 0x15} ++ return &inst{0x53, 0x0, 0x0, 0x0, 672, 0x15} + case AFMINQ: +- return &inst{0x53, 0x0, 0x0, 736, 0x17} ++ return &inst{0x53, 0x0, 0x0, 0x0, 736, 0x17} + case AFMINS: +- return &inst{0x53, 0x0, 0x0, 640, 0x14} ++ return &inst{0x53, 0x0, 0x0, 0x0, 640, 0x14} + case AFMSUBD: +- return &inst{0x47, 0x0, 0x0, 32, 0x1} ++ return &inst{0x47, 0x0, 0x0, 0x0, 32, 0x1} + case AFMSUBQ: +- return &inst{0x47, 0x0, 0x0, 96, 0x3} ++ return &inst{0x47, 0x0, 0x0, 0x0, 96, 0x3} + case AFMSUBS: +- return &inst{0x47, 0x0, 0x0, 0, 0x0} ++ return &inst{0x47, 0x0, 0x0, 0x0, 0, 0x0} + case AFMULD: +- return &inst{0x53, 0x0, 0x0, 288, 0x9} ++ return &inst{0x53, 0x0, 0x0, 0x0, 288, 0x9} + case AFMULQ: +- return &inst{0x53, 0x0, 0x0, 352, 0xb} ++ return &inst{0x53, 0x0, 0x0, 0x0, 352, 0xb} + case AFMULS: +- return &inst{0x53, 0x0, 0x0, 256, 0x8} ++ return &inst{0x53, 0x0, 0x0, 0x0, 256, 0x8} + case AFMVDX: +- return &inst{0x53, 0x0, 0x0, -224, 0x79} ++ return &inst{0x53, 0x0, 0x0, 0x0, -224, 0x79} + case AFMVSX: +- return &inst{0x53, 0x0, 0x0, -256, 0x78} ++ return &inst{0x53, 0x0, 0x0, 0x0, -256, 0x78} + case AFMVWX: +- return &inst{0x53, 0x0, 0x0, -256, 0x78} ++ return &inst{0x53, 0x0, 0x0, 0x0, -256, 0x78} + case AFMVXD: +- return &inst{0x53, 0x0, 0x0, -480, 0x71} ++ return &inst{0x53, 0x0, 0x0, 0x0, -480, 0x71} + case AFMVXS: +- return &inst{0x53, 0x0, 
0x0, -512, 0x70} ++ return &inst{0x53, 0x0, 0x0, 0x0, -512, 0x70} + case AFMVXW: +- return &inst{0x53, 0x0, 0x0, -512, 0x70} ++ return &inst{0x53, 0x0, 0x0, 0x0, -512, 0x70} + case AFNMADDD: +- return &inst{0x4f, 0x0, 0x0, 32, 0x1} ++ return &inst{0x4f, 0x0, 0x0, 0x0, 32, 0x1} + case AFNMADDQ: +- return &inst{0x4f, 0x0, 0x0, 96, 0x3} ++ return &inst{0x4f, 0x0, 0x0, 0x0, 96, 0x3} + case AFNMADDS: +- return &inst{0x4f, 0x0, 0x0, 0, 0x0} ++ return &inst{0x4f, 0x0, 0x0, 0x0, 0, 0x0} + case AFNMSUBD: +- return &inst{0x4b, 0x0, 0x0, 32, 0x1} ++ return &inst{0x4b, 0x0, 0x0, 0x0, 32, 0x1} + case AFNMSUBQ: +- return &inst{0x4b, 0x0, 0x0, 96, 0x3} ++ return &inst{0x4b, 0x0, 0x0, 0x0, 96, 0x3} + case AFNMSUBS: +- return &inst{0x4b, 0x0, 0x0, 0, 0x0} ++ return &inst{0x4b, 0x0, 0x0, 0x0, 0, 0x0} + case AFRCSR: +- return &inst{0x73, 0x2, 0x3, 3, 0x0} ++ return &inst{0x73, 0x2, 0x0, 0x3, 3, 0x0} + case AFRFLAGS: +- return &inst{0x73, 0x2, 0x1, 1, 0x0} ++ return &inst{0x73, 0x2, 0x0, 0x1, 1, 0x0} + case AFRRM: +- return &inst{0x73, 0x2, 0x2, 2, 0x0} ++ return &inst{0x73, 0x2, 0x0, 0x2, 2, 0x0} + case AFSCSR: +- return &inst{0x73, 0x1, 0x3, 3, 0x0} ++ return &inst{0x73, 0x1, 0x0, 0x3, 3, 0x0} + case AFSD: +- return &inst{0x27, 0x3, 0x0, 0, 0x0} ++ return &inst{0x27, 0x3, 0x0, 0x0, 0, 0x0} + case AFSFLAGS: +- return &inst{0x73, 0x1, 0x1, 1, 0x0} ++ return &inst{0x73, 0x1, 0x0, 0x1, 1, 0x0} + case AFSFLAGSI: +- return &inst{0x73, 0x5, 0x1, 1, 0x0} ++ return &inst{0x73, 0x5, 0x0, 0x1, 1, 0x0} + case AFSGNJD: +- return &inst{0x53, 0x0, 0x0, 544, 0x11} ++ return &inst{0x53, 0x0, 0x0, 0x0, 544, 0x11} + case AFSGNJQ: +- return &inst{0x53, 0x0, 0x0, 608, 0x13} ++ return &inst{0x53, 0x0, 0x0, 0x0, 608, 0x13} + case AFSGNJS: +- return &inst{0x53, 0x0, 0x0, 512, 0x10} ++ return &inst{0x53, 0x0, 0x0, 0x0, 512, 0x10} + case AFSGNJND: +- return &inst{0x53, 0x1, 0x0, 544, 0x11} ++ return &inst{0x53, 0x1, 0x0, 0x0, 544, 0x11} + case AFSGNJNQ: +- return &inst{0x53, 0x1, 0x0, 608, 0x13} ++ return &inst{0x53, 0x1, 0x0, 0x0, 608, 0x13} + case AFSGNJNS: +- return &inst{0x53, 0x1, 0x0, 512, 0x10} ++ return &inst{0x53, 0x1, 0x0, 0x0, 512, 0x10} + case AFSGNJXD: +- return &inst{0x53, 0x2, 0x0, 544, 0x11} ++ return &inst{0x53, 0x2, 0x0, 0x0, 544, 0x11} + case AFSGNJXQ: +- return &inst{0x53, 0x2, 0x0, 608, 0x13} ++ return &inst{0x53, 0x2, 0x0, 0x0, 608, 0x13} + case AFSGNJXS: +- return &inst{0x53, 0x2, 0x0, 512, 0x10} ++ return &inst{0x53, 0x2, 0x0, 0x0, 512, 0x10} + case AFSQ: +- return &inst{0x27, 0x4, 0x0, 0, 0x0} ++ return &inst{0x27, 0x4, 0x0, 0x0, 0, 0x0} + case AFSQRTD: +- return &inst{0x53, 0x0, 0x0, 1440, 0x2d} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1440, 0x2d} + case AFSQRTQ: +- return &inst{0x53, 0x0, 0x0, 1504, 0x2f} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1504, 0x2f} + case AFSQRTS: +- return &inst{0x53, 0x0, 0x0, 1408, 0x2c} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1408, 0x2c} + case AFSRM: +- return &inst{0x73, 0x1, 0x2, 2, 0x0} ++ return &inst{0x73, 0x1, 0x0, 0x2, 2, 0x0} + case AFSRMI: +- return &inst{0x73, 0x5, 0x2, 2, 0x0} ++ return &inst{0x73, 0x5, 0x0, 0x2, 2, 0x0} + case AFSUBD: +- return &inst{0x53, 0x0, 0x0, 160, 0x5} ++ return &inst{0x53, 0x0, 0x0, 0x0, 160, 0x5} + case AFSUBQ: +- return &inst{0x53, 0x0, 0x0, 224, 0x7} ++ return &inst{0x53, 0x0, 0x0, 0x0, 224, 0x7} + case AFSUBS: +- return &inst{0x53, 0x0, 0x0, 128, 0x4} ++ return &inst{0x53, 0x0, 0x0, 0x0, 128, 0x4} + case AFSW: +- return &inst{0x27, 0x2, 0x0, 0, 0x0} ++ return &inst{0x27, 0x2, 0x0, 0x0, 0, 0x0} + case AJAL: +- return &inst{0x6f, 0x0, 0x0, 0, 0x0} 
++ return &inst{0x6f, 0x0, 0x0, 0x0, 0, 0x0} + case AJALR: +- return &inst{0x67, 0x0, 0x0, 0, 0x0} ++ return &inst{0x67, 0x0, 0x0, 0x0, 0, 0x0} + case ALB: +- return &inst{0x3, 0x0, 0x0, 0, 0x0} ++ return &inst{0x3, 0x0, 0x0, 0x0, 0, 0x0} + case ALBU: +- return &inst{0x3, 0x4, 0x0, 0, 0x0} ++ return &inst{0x3, 0x4, 0x0, 0x0, 0, 0x0} + case ALD: +- return &inst{0x3, 0x3, 0x0, 0, 0x0} ++ return &inst{0x3, 0x3, 0x0, 0x0, 0, 0x0} + case ALH: +- return &inst{0x3, 0x1, 0x0, 0, 0x0} ++ return &inst{0x3, 0x1, 0x0, 0x0, 0, 0x0} + case ALHU: +- return &inst{0x3, 0x5, 0x0, 0, 0x0} ++ return &inst{0x3, 0x5, 0x0, 0x0, 0, 0x0} + case ALRD: +- return &inst{0x2f, 0x3, 0x0, 256, 0x8} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 256, 0x8} + case ALRW: +- return &inst{0x2f, 0x2, 0x0, 256, 0x8} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 256, 0x8} + case ALUI: +- return &inst{0x37, 0x0, 0x0, 0, 0x0} ++ return &inst{0x37, 0x0, 0x0, 0x0, 0, 0x0} + case ALW: +- return &inst{0x3, 0x2, 0x0, 0, 0x0} ++ return &inst{0x3, 0x2, 0x0, 0x0, 0, 0x0} + case ALWU: +- return &inst{0x3, 0x6, 0x0, 0, 0x0} ++ return &inst{0x3, 0x6, 0x0, 0x0, 0, 0x0} + case AMAX: +- return &inst{0x33, 0x6, 0x0, 160, 0x5} ++ return &inst{0x33, 0x6, 0x0, 0x0, 160, 0x5} + case AMAXU: +- return &inst{0x33, 0x7, 0x0, 160, 0x5} ++ return &inst{0x33, 0x7, 0x0, 0x0, 160, 0x5} + case AMIN: +- return &inst{0x33, 0x4, 0x0, 160, 0x5} ++ return &inst{0x33, 0x4, 0x0, 0x0, 160, 0x5} + case AMINU: +- return &inst{0x33, 0x5, 0x0, 160, 0x5} ++ return &inst{0x33, 0x5, 0x0, 0x0, 160, 0x5} + case AMRET: +- return &inst{0x73, 0x0, 0x2, 770, 0x18} ++ return &inst{0x73, 0x0, 0x0, 0x2, 770, 0x18} + case AMUL: +- return &inst{0x33, 0x0, 0x0, 32, 0x1} ++ return &inst{0x33, 0x0, 0x0, 0x0, 32, 0x1} + case AMULH: +- return &inst{0x33, 0x1, 0x0, 32, 0x1} ++ return &inst{0x33, 0x1, 0x0, 0x0, 32, 0x1} + case AMULHSU: +- return &inst{0x33, 0x2, 0x0, 32, 0x1} ++ return &inst{0x33, 0x2, 0x0, 0x0, 32, 0x1} + case AMULHU: +- return &inst{0x33, 0x3, 0x0, 32, 0x1} ++ return &inst{0x33, 0x3, 0x0, 0x0, 32, 0x1} + case AMULW: +- return &inst{0x3b, 0x0, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x0, 0x0, 0x0, 32, 0x1} + case AOR: +- return &inst{0x33, 0x6, 0x0, 0, 0x0} ++ return &inst{0x33, 0x6, 0x0, 0x0, 0, 0x0} + case AORCB: +- return &inst{0x13, 0x5, 0x7, 647, 0x14} ++ return &inst{0x13, 0x5, 0x0, 0x7, 647, 0x14} + case AORI: +- return &inst{0x13, 0x6, 0x0, 0, 0x0} ++ return &inst{0x13, 0x6, 0x0, 0x0, 0, 0x0} + case AORN: +- return &inst{0x33, 0x6, 0x0, 1024, 0x20} ++ return &inst{0x33, 0x6, 0x0, 0x0, 1024, 0x20} + case APAUSE: +- return &inst{0xf, 0x0, 0x10, 16, 0x0} ++ return &inst{0xf, 0x0, 0x0, 0x10, 16, 0x0} + case ARDCYCLE: +- return &inst{0x73, 0x2, 0x0, -1024, 0x60} ++ return &inst{0x73, 0x2, 0x0, 0x0, -1024, 0x60} + case ARDCYCLEH: +- return &inst{0x73, 0x2, 0x0, -896, 0x64} ++ return &inst{0x73, 0x2, 0x0, 0x0, -896, 0x64} + case ARDINSTRET: +- return &inst{0x73, 0x2, 0x2, -1022, 0x60} ++ return &inst{0x73, 0x2, 0x0, 0x2, -1022, 0x60} + case ARDINSTRETH: +- return &inst{0x73, 0x2, 0x2, -894, 0x64} ++ return &inst{0x73, 0x2, 0x0, 0x2, -894, 0x64} + case ARDTIME: +- return &inst{0x73, 0x2, 0x1, -1023, 0x60} ++ return &inst{0x73, 0x2, 0x0, 0x1, -1023, 0x60} + case ARDTIMEH: +- return &inst{0x73, 0x2, 0x1, -895, 0x64} ++ return &inst{0x73, 0x2, 0x0, 0x1, -895, 0x64} + case AREM: +- return &inst{0x33, 0x6, 0x0, 32, 0x1} ++ return &inst{0x33, 0x6, 0x0, 0x0, 32, 0x1} + case AREMU: +- return &inst{0x33, 0x7, 0x0, 32, 0x1} ++ return &inst{0x33, 0x7, 0x0, 0x0, 32, 0x1} + case AREMUW: +- return &inst{0x3b, 
0x7, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x7, 0x0, 0x0, 32, 0x1} + case AREMW: +- return &inst{0x3b, 0x6, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x6, 0x0, 0x0, 32, 0x1} + case AREV8: +- return &inst{0x13, 0x5, 0x18, 1720, 0x35} ++ return &inst{0x13, 0x5, 0x0, 0x18, 1720, 0x35} + case AROL: +- return &inst{0x33, 0x1, 0x0, 1536, 0x30} ++ return &inst{0x33, 0x1, 0x0, 0x0, 1536, 0x30} + case AROLW: +- return &inst{0x3b, 0x1, 0x0, 1536, 0x30} ++ return &inst{0x3b, 0x1, 0x0, 0x0, 1536, 0x30} + case AROR: +- return &inst{0x33, 0x5, 0x0, 1536, 0x30} ++ return &inst{0x33, 0x5, 0x0, 0x0, 1536, 0x30} + case ARORI: +- return &inst{0x13, 0x5, 0x0, 1536, 0x30} ++ return &inst{0x13, 0x5, 0x0, 0x0, 1536, 0x30} + case ARORIW: +- return &inst{0x1b, 0x5, 0x0, 1536, 0x30} ++ return &inst{0x1b, 0x5, 0x0, 0x0, 1536, 0x30} + case ARORW: +- return &inst{0x3b, 0x5, 0x0, 1536, 0x30} ++ return &inst{0x3b, 0x5, 0x0, 0x0, 1536, 0x30} + case ASB: +- return &inst{0x23, 0x0, 0x0, 0, 0x0} ++ return &inst{0x23, 0x0, 0x0, 0x0, 0, 0x0} + case ASBREAK: +- return &inst{0x73, 0x0, 0x1, 1, 0x0} ++ return &inst{0x73, 0x0, 0x0, 0x1, 1, 0x0} + case ASCD: +- return &inst{0x2f, 0x3, 0x0, 384, 0xc} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 384, 0xc} + case ASCW: +- return &inst{0x2f, 0x2, 0x0, 384, 0xc} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 384, 0xc} + case ASCALL: +- return &inst{0x73, 0x0, 0x0, 0, 0x0} ++ return &inst{0x73, 0x0, 0x0, 0x0, 0, 0x0} + case ASD: +- return &inst{0x23, 0x3, 0x0, 0, 0x0} ++ return &inst{0x23, 0x3, 0x0, 0x0, 0, 0x0} + case ASEXTB: +- return &inst{0x13, 0x1, 0x4, 1540, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x4, 1540, 0x30} + case ASEXTH: +- return &inst{0x13, 0x1, 0x5, 1541, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x5, 1541, 0x30} + case ASFENCEVMA: +- return &inst{0x73, 0x0, 0x0, 288, 0x9} ++ return &inst{0x73, 0x0, 0x0, 0x0, 288, 0x9} + case ASH: +- return &inst{0x23, 0x1, 0x0, 0, 0x0} ++ return &inst{0x23, 0x1, 0x0, 0x0, 0, 0x0} + case ASH1ADD: +- return &inst{0x33, 0x2, 0x0, 512, 0x10} ++ return &inst{0x33, 0x2, 0x0, 0x0, 512, 0x10} + case ASH1ADDUW: +- return &inst{0x3b, 0x2, 0x0, 512, 0x10} ++ return &inst{0x3b, 0x2, 0x0, 0x0, 512, 0x10} + case ASH2ADD: +- return &inst{0x33, 0x4, 0x0, 512, 0x10} ++ return &inst{0x33, 0x4, 0x0, 0x0, 512, 0x10} + case ASH2ADDUW: +- return &inst{0x3b, 0x4, 0x0, 512, 0x10} ++ return &inst{0x3b, 0x4, 0x0, 0x0, 512, 0x10} + case ASH3ADD: +- return &inst{0x33, 0x6, 0x0, 512, 0x10} ++ return &inst{0x33, 0x6, 0x0, 0x0, 512, 0x10} + case ASH3ADDUW: +- return &inst{0x3b, 0x6, 0x0, 512, 0x10} ++ return &inst{0x3b, 0x6, 0x0, 0x0, 512, 0x10} + case ASLL: +- return &inst{0x33, 0x1, 0x0, 0, 0x0} ++ return &inst{0x33, 0x1, 0x0, 0x0, 0, 0x0} + case ASLLI: +- return &inst{0x13, 0x1, 0x0, 0, 0x0} ++ return &inst{0x13, 0x1, 0x0, 0x0, 0, 0x0} + case ASLLIUW: +- return &inst{0x1b, 0x1, 0x0, 128, 0x4} ++ return &inst{0x1b, 0x1, 0x0, 0x0, 128, 0x4} + case ASLLIW: +- return &inst{0x1b, 0x1, 0x0, 0, 0x0} ++ return &inst{0x1b, 0x1, 0x0, 0x0, 0, 0x0} + case ASLLW: +- return &inst{0x3b, 0x1, 0x0, 0, 0x0} ++ return &inst{0x3b, 0x1, 0x0, 0x0, 0, 0x0} + case ASLT: +- return &inst{0x33, 0x2, 0x0, 0, 0x0} ++ return &inst{0x33, 0x2, 0x0, 0x0, 0, 0x0} + case ASLTI: +- return &inst{0x13, 0x2, 0x0, 0, 0x0} ++ return &inst{0x13, 0x2, 0x0, 0x0, 0, 0x0} + case ASLTIU: +- return &inst{0x13, 0x3, 0x0, 0, 0x0} ++ return &inst{0x13, 0x3, 0x0, 0x0, 0, 0x0} + case ASLTU: +- return &inst{0x33, 0x3, 0x0, 0, 0x0} ++ return &inst{0x33, 0x3, 0x0, 0x0, 0, 0x0} + case ASRA: +- return &inst{0x33, 0x5, 0x0, 1024, 0x20} ++ return 
&inst{0x33, 0x5, 0x0, 0x0, 1024, 0x20} + case ASRAI: +- return &inst{0x13, 0x5, 0x0, 1024, 0x20} ++ return &inst{0x13, 0x5, 0x0, 0x0, 1024, 0x20} + case ASRAIW: +- return &inst{0x1b, 0x5, 0x0, 1024, 0x20} ++ return &inst{0x1b, 0x5, 0x0, 0x0, 1024, 0x20} + case ASRAW: +- return &inst{0x3b, 0x5, 0x0, 1024, 0x20} ++ return &inst{0x3b, 0x5, 0x0, 0x0, 1024, 0x20} + case ASRET: +- return &inst{0x73, 0x0, 0x2, 258, 0x8} ++ return &inst{0x73, 0x0, 0x0, 0x2, 258, 0x8} + case ASRL: +- return &inst{0x33, 0x5, 0x0, 0, 0x0} ++ return &inst{0x33, 0x5, 0x0, 0x0, 0, 0x0} + case ASRLI: +- return &inst{0x13, 0x5, 0x0, 0, 0x0} ++ return &inst{0x13, 0x5, 0x0, 0x0, 0, 0x0} + case ASRLIW: +- return &inst{0x1b, 0x5, 0x0, 0, 0x0} ++ return &inst{0x1b, 0x5, 0x0, 0x0, 0, 0x0} + case ASRLW: +- return &inst{0x3b, 0x5, 0x0, 0, 0x0} ++ return &inst{0x3b, 0x5, 0x0, 0x0, 0, 0x0} + case ASUB: +- return &inst{0x33, 0x0, 0x0, 1024, 0x20} ++ return &inst{0x33, 0x0, 0x0, 0x0, 1024, 0x20} + case ASUBW: +- return &inst{0x3b, 0x0, 0x0, 1024, 0x20} ++ return &inst{0x3b, 0x0, 0x0, 0x0, 1024, 0x20} + case ASW: +- return &inst{0x23, 0x2, 0x0, 0, 0x0} ++ return &inst{0x23, 0x2, 0x0, 0x0, 0, 0x0} ++ case AVAADDVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, 576, 0x12} ++ case AVAADDVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 576, 0x12} ++ case AVAADDUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, 512, 0x10} ++ case AVAADDUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 512, 0x10} ++ case AVADCVIM: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1024, 0x20} ++ case AVADCVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1024, 0x20} ++ case AVADCVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1024, 0x20} ++ case AVADDVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 0, 0x0} ++ case AVADDVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 0, 0x0} ++ case AVADDVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 0, 0x0} ++ case AVANDVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 576, 0x12} ++ case AVANDVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 576, 0x12} ++ case AVANDVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 576, 0x12} ++ case AVASUBVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, 704, 0x16} ++ case AVASUBVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 704, 0x16} ++ case AVASUBUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, 640, 0x14} ++ case AVASUBUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 640, 0x14} ++ case AVCOMPRESSVM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1504, 0x2f} ++ case AVCPOPM: ++ return &inst{0x57, 0x2, 0x10, 0x0, 1024, 0x20} ++ case AVDIVVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1984, 0x42} ++ case AVDIVVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1984, 0x42} ++ case AVDIVUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -2048, 0x40} ++ case AVDIVUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -2048, 0x40} ++ case AVFADDVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 0, 0x0} ++ case AVFADDVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 0, 0x0} ++ case AVFCLASSV: ++ return &inst{0x57, 0x1, 0x10, 0x0, 1216, 0x26} ++ case AVFCVTFXV: ++ return &inst{0x57, 0x1, 0x3, 0x0, 1152, 0x24} ++ case AVFCVTFXUV: ++ return &inst{0x57, 0x1, 0x2, 0x0, 1152, 0x24} ++ case AVFCVTRTZXFV: ++ return &inst{0x57, 0x1, 0x7, 0x0, 1152, 0x24} ++ case AVFCVTRTZXUFV: ++ return &inst{0x57, 0x1, 0x6, 0x0, 1152, 0x24} ++ case AVFCVTXFV: ++ return &inst{0x57, 0x1, 0x1, 0x0, 1152, 0x24} ++ case AVFCVTXUFV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1152, 0x24} ++ case AVFDIVVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -2048, 0x40} ++ case AVFDIVVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -2048, 0x40} ++ case AVFIRSTM: ++ return &inst{0x57, 0x2, 0x11, 0x0, 1024, 0x20} ++ case AVFMACCVF: ++ return &inst{0x57, 0x5, 
0x0, 0x0, -1280, 0x58} ++ case AVFMACCVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1280, 0x58} ++ case AVFMADDVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1536, 0x50} ++ case AVFMADDVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1536, 0x50} ++ case AVFMAXVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 384, 0xc} ++ case AVFMAXVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 384, 0xc} ++ case AVFMERGEVFM: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1472, 0x2e} ++ case AVFMINVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 256, 0x8} ++ case AVFMINVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 256, 0x8} ++ case AVFMSACVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1152, 0x5c} ++ case AVFMSACVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1152, 0x5c} ++ case AVFMSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1408, 0x54} ++ case AVFMSUBVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1408, 0x54} ++ case AVFMULVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1792, 0x48} ++ case AVFMULVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1792, 0x48} ++ case AVFMVFS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1056, 0x21} ++ case AVFMVSF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1056, 0x21} ++ case AVFMVVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1504, 0x2f} ++ case AVFNCVTFFW: ++ return &inst{0x57, 0x1, 0x14, 0x0, 1152, 0x24} ++ case AVFNCVTFXW: ++ return &inst{0x57, 0x1, 0x13, 0x0, 1152, 0x24} ++ case AVFNCVTFXUW: ++ return &inst{0x57, 0x1, 0x12, 0x0, 1152, 0x24} ++ case AVFNCVTRODFFW: ++ return &inst{0x57, 0x1, 0x15, 0x0, 1152, 0x24} ++ case AVFNCVTRTZXFW: ++ return &inst{0x57, 0x1, 0x17, 0x0, 1152, 0x24} ++ case AVFNCVTRTZXUFW: ++ return &inst{0x57, 0x1, 0x16, 0x0, 1152, 0x24} ++ case AVFNCVTXFW: ++ return &inst{0x57, 0x1, 0x11, 0x0, 1152, 0x24} ++ case AVFNCVTXUFW: ++ return &inst{0x57, 0x1, 0x10, 0x0, 1152, 0x24} ++ case AVFNMACCVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1216, 0x5a} ++ case AVFNMACCVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1216, 0x5a} ++ case AVFNMADDVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1472, 0x52} ++ case AVFNMADDVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1472, 0x52} ++ case AVFNMSACVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1088, 0x5e} ++ case AVFNMSACVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1088, 0x5e} ++ case AVFNMSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1344, 0x56} ++ case AVFNMSUBVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1344, 0x56} ++ case AVFRDIVVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1984, 0x42} ++ case AVFREC7V: ++ return &inst{0x57, 0x1, 0x5, 0x0, 1216, 0x26} ++ case AVFREDMAXVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 448, 0xe} ++ case AVFREDMINVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 320, 0xa} ++ case AVFREDOSUMVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 192, 0x6} ++ case AVFREDUSUMVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 64, 0x2} ++ case AVFRSQRT7V: ++ return &inst{0x57, 0x1, 0x4, 0x0, 1216, 0x26} ++ case AVFRSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1600, 0x4e} ++ case AVFSGNJVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 512, 0x10} ++ case AVFSGNJVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 512, 0x10} ++ case AVFSGNJNVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 576, 0x12} ++ case AVFSGNJNVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 576, 0x12} ++ case AVFSGNJXVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 640, 0x14} ++ case AVFSGNJXVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 640, 0x14} ++ case AVFSLIDE1DOWNVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 960, 0x1e} ++ case AVFSLIDE1UPVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 896, 0x1c} ++ case AVFSQRTV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1216, 0x26} ++ case AVFSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 128, 0x4} ++ case AVFSUBVV: ++ 
return &inst{0x57, 0x1, 0x0, 0x0, 128, 0x4} ++ case AVFWADDVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1024, 0x60} ++ case AVFWADDVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1024, 0x60} ++ case AVFWADDWF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -768, 0x68} ++ case AVFWADDWV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -768, 0x68} ++ case AVFWCVTFFV: ++ return &inst{0x57, 0x1, 0xc, 0x0, 1152, 0x24} ++ case AVFWCVTFXV: ++ return &inst{0x57, 0x1, 0xb, 0x0, 1152, 0x24} ++ case AVFWCVTFXUV: ++ return &inst{0x57, 0x1, 0xa, 0x0, 1152, 0x24} ++ case AVFWCVTRTZXFV: ++ return &inst{0x57, 0x1, 0xf, 0x0, 1152, 0x24} ++ case AVFWCVTRTZXUFV: ++ return &inst{0x57, 0x1, 0xe, 0x0, 1152, 0x24} ++ case AVFWCVTXFV: ++ return &inst{0x57, 0x1, 0x9, 0x0, 1152, 0x24} ++ case AVFWCVTXUFV: ++ return &inst{0x57, 0x1, 0x8, 0x0, 1152, 0x24} ++ case AVFWMACCVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -256, 0x78} ++ case AVFWMACCVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -256, 0x78} ++ case AVFWMSACVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -128, 0x7c} ++ case AVFWMSACVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -128, 0x7c} ++ case AVFWMULVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -512, 0x70} ++ case AVFWMULVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -512, 0x70} ++ case AVFWNMACCVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -192, 0x7a} ++ case AVFWNMACCVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -192, 0x7a} ++ case AVFWNMSACVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -64, 0x7e} ++ case AVFWNMSACVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -64, 0x7e} ++ case AVFWREDOSUMVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, -832, 0x66} ++ case AVFWREDUSUMVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, -960, 0x62} ++ case AVFWSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -896, 0x64} ++ case AVFWSUBVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -896, 0x64} ++ case AVFWSUBWF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -640, 0x6c} ++ case AVFWSUBWV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -640, 0x6c} ++ case AVIDV: ++ return &inst{0x57, 0x2, 0x11, 0x0, 1280, 0x28} ++ case AVIOTAM: ++ return &inst{0x57, 0x2, 0x10, 0x0, 1280, 0x28} ++ case AVL1RE16V: ++ return &inst{0x7, 0x5, 0x0, 0x8, 40, 0x1} ++ case AVL1RE32V: ++ return &inst{0x7, 0x6, 0x0, 0x8, 40, 0x1} ++ case AVL1RE64V: ++ return &inst{0x7, 0x7, 0x0, 0x8, 40, 0x1} ++ case AVL1RE8V: ++ return &inst{0x7, 0x0, 0x0, 0x8, 40, 0x1} ++ case AVL2RE16V: ++ return &inst{0x7, 0x5, 0x0, 0x8, 552, 0x11} ++ case AVL2RE32V: ++ return &inst{0x7, 0x6, 0x0, 0x8, 552, 0x11} ++ case AVL2RE64V: ++ return &inst{0x7, 0x7, 0x0, 0x8, 552, 0x11} ++ case AVL2RE8V: ++ return &inst{0x7, 0x0, 0x0, 0x8, 552, 0x11} ++ case AVL4RE16V: ++ return &inst{0x7, 0x5, 0x0, 0x8, 1576, 0x31} ++ case AVL4RE32V: ++ return &inst{0x7, 0x6, 0x0, 0x8, 1576, 0x31} ++ case AVL4RE64V: ++ return &inst{0x7, 0x7, 0x0, 0x8, 1576, 0x31} ++ case AVL4RE8V: ++ return &inst{0x7, 0x0, 0x0, 0x8, 1576, 0x31} ++ case AVL8RE16V: ++ return &inst{0x7, 0x5, 0x0, 0x8, -472, 0x71} ++ case AVL8RE32V: ++ return &inst{0x7, 0x6, 0x0, 0x8, -472, 0x71} ++ case AVL8RE64V: ++ return &inst{0x7, 0x7, 0x0, 0x8, -472, 0x71} ++ case AVL8RE8V: ++ return &inst{0x7, 0x0, 0x0, 0x8, -472, 0x71} ++ case AVLE16V: ++ return &inst{0x7, 0x5, 0x0, 0x0, 0, 0x0} ++ case AVLE16FFV: ++ return &inst{0x7, 0x5, 0x0, 0x10, 16, 0x0} ++ case AVLE32V: ++ return &inst{0x7, 0x6, 0x0, 0x0, 0, 0x0} ++ case AVLE32FFV: ++ return &inst{0x7, 0x6, 0x0, 0x10, 16, 0x0} ++ case AVLE64V: ++ return &inst{0x7, 0x7, 0x0, 0x0, 0, 0x0} ++ case AVLE64FFV: ++ return &inst{0x7, 0x7, 0x0, 0x10, 16, 0x0} ++ case AVLE8V: ++ return &inst{0x7, 0x0, 0x0, 0x0, 0, 0x0} ++ case 
AVLE8FFV: ++ return &inst{0x7, 0x0, 0x0, 0x10, 16, 0x0} ++ case AVLMV: ++ return &inst{0x7, 0x0, 0x0, 0xb, 43, 0x1} ++ case AVLOXEI16V: ++ return &inst{0x7, 0x5, 0x0, 0x0, 192, 0x6} ++ case AVLOXEI32V: ++ return &inst{0x7, 0x6, 0x0, 0x0, 192, 0x6} ++ case AVLOXEI64V: ++ return &inst{0x7, 0x7, 0x0, 0x0, 192, 0x6} ++ case AVLOXEI8V: ++ return &inst{0x7, 0x0, 0x0, 0x0, 192, 0x6} ++ case AVLSE16V: ++ return &inst{0x7, 0x5, 0x0, 0x0, 128, 0x4} ++ case AVLSE32V: ++ return &inst{0x7, 0x6, 0x0, 0x0, 128, 0x4} ++ case AVLSE64V: ++ return &inst{0x7, 0x7, 0x0, 0x0, 128, 0x4} ++ case AVLSE8V: ++ return &inst{0x7, 0x0, 0x0, 0x0, 128, 0x4} ++ case AVLUXEI16V: ++ return &inst{0x7, 0x5, 0x0, 0x0, 64, 0x2} ++ case AVLUXEI32V: ++ return &inst{0x7, 0x6, 0x0, 0x0, 64, 0x2} ++ case AVLUXEI64V: ++ return &inst{0x7, 0x7, 0x0, 0x0, 64, 0x2} ++ case AVLUXEI8V: ++ return &inst{0x7, 0x0, 0x0, 0x0, 64, 0x2} ++ case AVMACCVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1216, 0x5a} ++ case AVMACCVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1216, 0x5a} ++ case AVMADCVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1120, 0x23} ++ case AVMADCVIM: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1088, 0x22} ++ case AVMADCVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1120, 0x23} ++ case AVMADCVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1088, 0x22} ++ case AVMADCVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1120, 0x23} ++ case AVMADCVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1088, 0x22} ++ case AVMADDVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1472, 0x52} ++ case AVMADDVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1472, 0x52} ++ case AVMANDMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1632, 0x33} ++ case AVMANDNMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1568, 0x31} ++ case AVMAXVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 448, 0xe} ++ case AVMAXVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 448, 0xe} ++ case AVMAXUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 384, 0xc} ++ case AVMAXUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 384, 0xc} ++ case AVMERGEVIM: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1472, 0x2e} ++ case AVMERGEVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1472, 0x2e} ++ case AVMERGEVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1472, 0x2e} ++ case AVMFEQVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1536, 0x30} ++ case AVMFEQVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1536, 0x30} ++ case AVMFGEVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1984, 0x3e} ++ case AVMFGTVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1856, 0x3a} ++ case AVMFLEVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1600, 0x32} ++ case AVMFLEVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1600, 0x32} ++ case AVMFLTVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1728, 0x36} ++ case AVMFLTVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1728, 0x36} ++ case AVMFNEVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1792, 0x38} ++ case AVMFNEVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1792, 0x38} ++ case AVMINVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 320, 0xa} ++ case AVMINVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 320, 0xa} ++ case AVMINUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 256, 0x8} ++ case AVMINUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 256, 0x8} ++ case AVMNANDMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1888, 0x3b} ++ case AVMNORMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1952, 0x3d} ++ case AVMORMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1696, 0x35} ++ case AVMORNMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1824, 0x39} ++ case AVMSBCVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1248, 0x27} ++ case AVMSBCVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1216, 0x26} ++ case AVMSBCVX: ++ return &inst{0x57, 0x4, 0x0, 
0x0, 1248, 0x27} ++ case AVMSBCVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1216, 0x26} ++ case AVMSBFM: ++ return &inst{0x57, 0x2, 0x1, 0x0, 1280, 0x28} ++ case AVMSEQVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1536, 0x30} ++ case AVMSEQVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1536, 0x30} ++ case AVMSEQVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1536, 0x30} ++ case AVMSGTVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1984, 0x3e} ++ case AVMSGTVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1984, 0x3e} ++ case AVMSGTUVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1920, 0x3c} ++ case AVMSGTUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1920, 0x3c} ++ case AVMSIFM: ++ return &inst{0x57, 0x2, 0x3, 0x0, 1280, 0x28} ++ case AVMSLEVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1856, 0x3a} ++ case AVMSLEVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1856, 0x3a} ++ case AVMSLEVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1856, 0x3a} ++ case AVMSLEUVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1792, 0x38} ++ case AVMSLEUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1792, 0x38} ++ case AVMSLEUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1792, 0x38} ++ case AVMSLTVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1728, 0x36} ++ case AVMSLTVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1728, 0x36} ++ case AVMSLTUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1664, 0x34} ++ case AVMSLTUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1664, 0x34} ++ case AVMSNEVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1600, 0x32} ++ case AVMSNEVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1600, 0x32} ++ case AVMSNEVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1600, 0x32} ++ case AVMSOFM: ++ return &inst{0x57, 0x2, 0x2, 0x0, 1280, 0x28} ++ case AVMULVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1728, 0x4a} ++ case AVMULVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1728, 0x4a} ++ case AVMULHVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1600, 0x4e} ++ case AVMULHVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1600, 0x4e} ++ case AVMULHSUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1664, 0x4c} ++ case AVMULHSUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1664, 0x4c} ++ case AVMULHUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1792, 0x48} ++ case AVMULHUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1792, 0x48} ++ case AVMV1RV: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1568, 0x4f} ++ case AVMV2RV: ++ return &inst{0x57, 0x3, 0x1, 0x0, -1568, 0x4f} ++ case AVMV4RV: ++ return &inst{0x57, 0x3, 0x3, 0x0, -1568, 0x4f} ++ case AVMV8RV: ++ return &inst{0x57, 0x3, 0x7, 0x0, -1568, 0x4f} ++ case AVMVSX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 1056, 0x21} ++ case AVMVVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1504, 0x2f} ++ case AVMVVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1504, 0x2f} ++ case AVMVVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1504, 0x2f} ++ case AVMVXS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1056, 0x21} ++ case AVMXNORMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 2016, 0x3f} ++ case AVMXORMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1760, 0x37} ++ case AVNCLIPWI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1088, 0x5e} ++ case AVNCLIPWV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1088, 0x5e} ++ case AVNCLIPWX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1088, 0x5e} ++ case AVNCLIPUWI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1152, 0x5c} ++ case AVNCLIPUWV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1152, 0x5c} ++ case AVNCLIPUWX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1152, 0x5c} ++ case AVNMSACVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1088, 0x5e} ++ case AVNMSACVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1088, 0x5e} ++ case AVNMSUBVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1344, 0x56} ++ case AVNMSUBVX: ++ 
return &inst{0x57, 0x6, 0x0, 0x0, -1344, 0x56} ++ case AVNSRAWI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1216, 0x5a} ++ case AVNSRAWV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1216, 0x5a} ++ case AVNSRAWX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1216, 0x5a} ++ case AVNSRLWI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1280, 0x58} ++ case AVNSRLWV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1280, 0x58} ++ case AVNSRLWX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1280, 0x58} ++ case AVORVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 640, 0x14} ++ case AVORVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 640, 0x14} ++ case AVORVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 640, 0x14} ++ case AVREDANDVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 64, 0x2} ++ case AVREDMAXVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 448, 0xe} ++ case AVREDMAXUVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 384, 0xc} ++ case AVREDMINVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 320, 0xa} ++ case AVREDMINUVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 256, 0x8} ++ case AVREDORVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 128, 0x4} ++ case AVREDSUMVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 0, 0x0} ++ case AVREDXORVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 192, 0x6} ++ case AVREMVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1856, 0x46} ++ case AVREMVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1856, 0x46} ++ case AVREMUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1920, 0x44} ++ case AVREMUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1920, 0x44} ++ case AVRGATHERVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 768, 0x18} ++ case AVRGATHERVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 768, 0x18} ++ case AVRGATHERVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 768, 0x18} ++ case AVRGATHEREI16VV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 896, 0x1c} ++ case AVRSUBVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 192, 0x6} ++ case AVRSUBVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 192, 0x6} ++ case AVS1RV: ++ return &inst{0x27, 0x0, 0x0, 0x8, 40, 0x1} ++ case AVS2RV: ++ return &inst{0x27, 0x0, 0x0, 0x8, 552, 0x11} ++ case AVS4RV: ++ return &inst{0x27, 0x0, 0x0, 0x8, 1576, 0x31} ++ case AVS8RV: ++ return &inst{0x27, 0x0, 0x0, 0x8, -472, 0x71} ++ case AVSADDVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1984, 0x42} ++ case AVSADDVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1984, 0x42} ++ case AVSADDVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1984, 0x42} ++ case AVSADDUVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -2048, 0x40} ++ case AVSADDUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -2048, 0x40} ++ case AVSADDUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -2048, 0x40} ++ case AVSBCVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1152, 0x24} ++ case AVSBCVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1152, 0x24} ++ case AVSE16V: ++ return &inst{0x27, 0x5, 0x0, 0x0, 0, 0x0} ++ case AVSE32V: ++ return &inst{0x27, 0x6, 0x0, 0x0, 0, 0x0} ++ case AVSE64V: ++ return &inst{0x27, 0x7, 0x0, 0x0, 0, 0x0} ++ case AVSE8V: ++ return &inst{0x27, 0x0, 0x0, 0x0, 0, 0x0} ++ case AVSETIVLI: ++ return &inst{0x57, 0x7, 0x0, 0x0, -1024, 0x60} ++ case AVSETVL: ++ return &inst{0x57, 0x7, 0x0, 0x0, -2048, 0x40} ++ case AVSETVLI: ++ return &inst{0x57, 0x7, 0x0, 0x0, 0, 0x0} ++ case AVSEXTVF2: ++ return &inst{0x57, 0x2, 0x7, 0x0, 1152, 0x24} ++ case AVSEXTVF4: ++ return &inst{0x57, 0x2, 0x5, 0x0, 1152, 0x24} ++ case AVSEXTVF8: ++ return &inst{0x57, 0x2, 0x3, 0x0, 1152, 0x24} ++ case AVSLIDE1DOWNVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 960, 0x1e} ++ case AVSLIDE1UPVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 896, 0x1c} ++ case AVSLIDEDOWNVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 960, 0x1e} ++ case AVSLIDEDOWNVX: ++ 
return &inst{0x57, 0x4, 0x0, 0x0, 960, 0x1e} ++ case AVSLIDEUPVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 896, 0x1c} ++ case AVSLIDEUPVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 896, 0x1c} ++ case AVSLLVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1728, 0x4a} ++ case AVSLLVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1728, 0x4a} ++ case AVSLLVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1728, 0x4a} ++ case AVSMV: ++ return &inst{0x27, 0x0, 0x0, 0xb, 43, 0x1} ++ case AVSMULVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1600, 0x4e} ++ case AVSMULVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1600, 0x4e} ++ case AVSOXEI16V: ++ return &inst{0x27, 0x5, 0x0, 0x0, 192, 0x6} ++ case AVSOXEI32V: ++ return &inst{0x27, 0x6, 0x0, 0x0, 192, 0x6} ++ case AVSOXEI64V: ++ return &inst{0x27, 0x7, 0x0, 0x0, 192, 0x6} ++ case AVSOXEI8V: ++ return &inst{0x27, 0x0, 0x0, 0x0, 192, 0x6} ++ case AVSRAVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1472, 0x52} ++ case AVSRAVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1472, 0x52} ++ case AVSRAVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1472, 0x52} ++ case AVSRLVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1536, 0x50} ++ case AVSRLVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1536, 0x50} ++ case AVSRLVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1536, 0x50} ++ case AVSSE16V: ++ return &inst{0x27, 0x5, 0x0, 0x0, 128, 0x4} ++ case AVSSE32V: ++ return &inst{0x27, 0x6, 0x0, 0x0, 128, 0x4} ++ case AVSSE64V: ++ return &inst{0x27, 0x7, 0x0, 0x0, 128, 0x4} ++ case AVSSE8V: ++ return &inst{0x27, 0x0, 0x0, 0x0, 128, 0x4} ++ case AVSSRAVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1344, 0x56} ++ case AVSSRAVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1344, 0x56} ++ case AVSSRAVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1344, 0x56} ++ case AVSSRLVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1408, 0x54} ++ case AVSSRLVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1408, 0x54} ++ case AVSSRLVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1408, 0x54} ++ case AVSSUBVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1856, 0x46} ++ case AVSSUBVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1856, 0x46} ++ case AVSSUBUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1920, 0x44} ++ case AVSSUBUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1920, 0x44} ++ case AVSUBVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 128, 0x4} ++ case AVSUBVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 128, 0x4} ++ case AVSUXEI16V: ++ return &inst{0x27, 0x5, 0x0, 0x0, 64, 0x2} ++ case AVSUXEI32V: ++ return &inst{0x27, 0x6, 0x0, 0x0, 64, 0x2} ++ case AVSUXEI64V: ++ return &inst{0x27, 0x7, 0x0, 0x0, 64, 0x2} ++ case AVSUXEI8V: ++ return &inst{0x27, 0x0, 0x0, 0x0, 64, 0x2} ++ case AVWADDVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -960, 0x62} ++ case AVWADDVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -960, 0x62} ++ case AVWADDWV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -704, 0x6a} ++ case AVWADDWX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -704, 0x6a} ++ case AVWADDUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1024, 0x60} ++ case AVWADDUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1024, 0x60} ++ case AVWADDUWV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -768, 0x68} ++ case AVWADDUWX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -768, 0x68} ++ case AVWMACCVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -192, 0x7a} ++ case AVWMACCVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -192, 0x7a} ++ case AVWMACCSUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -64, 0x7e} ++ case AVWMACCSUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -64, 0x7e} ++ case AVWMACCUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -256, 0x78} ++ case AVWMACCUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -256, 0x78} ++ case 
AVWMACCUSVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -128, 0x7c} ++ case AVWMULVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -320, 0x76} ++ case AVWMULVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -320, 0x76} ++ case AVWMULSUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -384, 0x74} ++ case AVWMULSUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -384, 0x74} ++ case AVWMULUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -512, 0x70} ++ case AVWMULUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -512, 0x70} ++ case AVWREDSUMVS: ++ return &inst{0x57, 0x0, 0x0, 0x0, -960, 0x62} ++ case AVWREDSUMUVS: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1024, 0x60} ++ case AVWSUBVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -832, 0x66} ++ case AVWSUBVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -832, 0x66} ++ case AVWSUBWV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -576, 0x6e} ++ case AVWSUBWX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -576, 0x6e} ++ case AVWSUBUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -896, 0x64} ++ case AVWSUBUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -896, 0x64} ++ case AVWSUBUWV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -640, 0x6c} ++ case AVWSUBUWX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -640, 0x6c} ++ case AVXORVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 704, 0x16} ++ case AVXORVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 704, 0x16} ++ case AVXORVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 704, 0x16} ++ case AVZEXTVF2: ++ return &inst{0x57, 0x2, 0x6, 0x0, 1152, 0x24} ++ case AVZEXTVF4: ++ return &inst{0x57, 0x2, 0x4, 0x0, 1152, 0x24} ++ case AVZEXTVF8: ++ return &inst{0x57, 0x2, 0x2, 0x0, 1152, 0x24} + case AWFI: +- return &inst{0x73, 0x0, 0x5, 261, 0x8} ++ return &inst{0x73, 0x0, 0x0, 0x5, 261, 0x8} + case AXNOR: +- return &inst{0x33, 0x4, 0x0, 1024, 0x20} ++ return &inst{0x33, 0x4, 0x0, 0x0, 1024, 0x20} + case AXOR: +- return &inst{0x33, 0x4, 0x0, 0, 0x0} ++ return &inst{0x33, 0x4, 0x0, 0x0, 0, 0x0} + case AXORI: +- return &inst{0x13, 0x4, 0x0, 0, 0x0} ++ return &inst{0x13, 0x4, 0x0, 0x0, 0, 0x0} + case AZEXTH: +- return &inst{0x3b, 0x4, 0x0, 128, 0x4} ++ return &inst{0x3b, 0x4, 0x0, 0x0, 128, 0x4} + } + return nil + } +-- +2.39.5 + diff --git a/2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch b/2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch new file mode 100644 index 0000000..f767f21 --- /dev/null +++ b/2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch @@ -0,0 +1,139 @@ +From 39d4bbc0b357cd81948a05ed60f513665d0bc1d0 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 057/119] cmd/internal/obj,cmd/asm: add vector registers to + riscv64 assembler +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This adds V0 through V31 as vector registers, which are available on CPUs +that support the V extension. 
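
An illustrative aside, not part of either upstream patch: each inst row in the regenerated table above stores only an instruction's fixed fields, and the new rs1 column is used by rows that place a function code in the rs1/vs1 bit positions rather than a register operand (for example AVFCVTXFV carries 0x1 there and AVZEXTVF2 carries 0x6), which is presumably why the generated struct gained that field. Register operands are OR-ed in separately when the instruction word is built. The sketch below shows that packing for a plain R-type instruction; the helper name packRType and the standalone program are assumptions made for this example, not the assembler's actual code path.

package main

import "fmt"

// inst mirrors the generated table above: only the fixed fields of an
// instruction are stored; register operands are merged in separately.
type inst struct {
	opcode uint32
	funct3 uint32
	rs1    uint32
	rs2    uint32
	csr    int64
	funct7 uint32
}

// packRType merges register operands into the fixed fields of an R-type
// instruction word: funct7 | rs2 | rs1 | funct3 | rd | opcode.
func packRType(i *inst, rd, rs1, rs2 uint32) uint32 {
	return i.funct7<<25 | rs2<<20 | rs1<<15 | i.funct3<<12 | rd<<7 | i.opcode
}

func main() {
	// The AADD row from the table above: {0x33, 0x0, 0x0, 0x0, 0, 0x0}.
	add := &inst{opcode: 0x33}
	// Go asm ADD X5, X6, X7 (X7 = X6 + X5): rd=7, rs1=6, rs2=5.
	fmt.Printf("%08x\n", packRType(add, 7, 6, 5)) // 005303b3
}
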
+
+Change-Id: Ibffee3f9a2cf1d062638715b3744431d72d451ce
+Reviewed-on: https://go-review.googlesource.com/c/go/+/595404
+LUCI-TryBot-Result: Go LUCI
+Reviewed-by: Cherry Mui
+Reviewed-by: Mark Ryan
+Reviewed-by: Michael Pratt
+Reviewed-by: 鹏程汪
+---
+ src/cmd/asm/internal/arch/arch.go  |  4 ++++
+ src/cmd/internal/obj/riscv/cpu.go  | 36 +++++++++++++++++++++++++++++-
+ src/cmd/internal/obj/riscv/list.go |  2 ++
+ src/cmd/internal/obj/riscv/obj.go  | 10 +++++++++
+ 4 files changed, 51 insertions(+), 1 deletion(-)
+
+diff --git a/src/cmd/asm/internal/arch/arch.go b/src/cmd/asm/internal/arch/arch.go
+index 11bb7af899..429dff7be5 100644
+--- a/src/cmd/asm/internal/arch/arch.go
++++ b/src/cmd/asm/internal/arch/arch.go
+@@ -586,6 +586,10 @@ func archRISCV64(shared bool) *Arch {
+ 		name := fmt.Sprintf("F%d", i-riscv.REG_F0)
+ 		register[name] = int16(i)
+ 	}
++	for i := riscv.REG_V0; i <= riscv.REG_V31; i++ {
++		name := fmt.Sprintf("V%d", i-riscv.REG_V0)
++		register[name] = int16(i)
++	}
+ 
+ 	// General registers with ABI names.
+ 	register["ZERO"] = riscv.REG_ZERO
+diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go
+index 8b620b8646..d2154593de 100644
+--- a/src/cmd/internal/obj/riscv/cpu.go
++++ b/src/cmd/internal/obj/riscv/cpu.go
+@@ -72,7 +72,7 @@ const (
+ 	REG_X30
+ 	REG_X31
+ 
+-	// FP register numberings.
++	// Floating Point register numberings.
+ 	REG_F0
+ 	REG_F1
+ 	REG_F2
+@@ -106,6 +106,40 @@ const (
+ 	REG_F30
+ 	REG_F31
+ 
++	// Vector register numberings.
++	REG_V0
++	REG_V1
++	REG_V2
++	REG_V3
++	REG_V4
++	REG_V5
++	REG_V6
++	REG_V7
++	REG_V8
++	REG_V9
++	REG_V10
++	REG_V11
++	REG_V12
++	REG_V13
++	REG_V14
++	REG_V15
++	REG_V16
++	REG_V17
++	REG_V18
++	REG_V19
++	REG_V20
++	REG_V21
++	REG_V22
++	REG_V23
++	REG_V24
++	REG_V25
++	REG_V26
++	REG_V27
++	REG_V28
++	REG_V29
++	REG_V30
++	REG_V31
++
+ 	// This marks the end of the register numbering.
+ 	REG_END
+ 
+diff --git a/src/cmd/internal/obj/riscv/list.go b/src/cmd/internal/obj/riscv/list.go
+index bc87539f27..c5b7e80719 100644
+--- a/src/cmd/internal/obj/riscv/list.go
++++ b/src/cmd/internal/obj/riscv/list.go
+@@ -28,6 +28,8 @@ func RegName(r int) string {
+ 		return fmt.Sprintf("X%d", r-REG_X0)
+ 	case REG_F0 <= r && r <= REG_F31:
+ 		return fmt.Sprintf("F%d", r-REG_F0)
++	case REG_V0 <= r && r <= REG_V31:
++		return fmt.Sprintf("V%d", r-REG_V0)
+ 	default:
+ 		return fmt.Sprintf("Rgok(%d)", r-obj.RBaseRISCV)
+ 	}
+diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go
+index 6e9691bb4f..6f74f38543 100644
+--- a/src/cmd/internal/obj/riscv/obj.go
++++ b/src/cmd/internal/obj/riscv/obj.go
+@@ -1029,6 +1029,11 @@ func regF(r uint32) uint32 {
+ 	return regVal(r, REG_F0, REG_F31)
+ }
+ 
++// regV returns a vector register.
++func regV(r uint32) uint32 {
++	return regVal(r, REG_V0, REG_V31)
++}
++
+ // regAddr extracts a register from an Addr.
+ func regAddr(a obj.Addr, min, max uint32) uint32 {
+ 	if a.Type != obj.TYPE_REG {
+@@ -1111,6 +1116,11 @@ func wantFloatReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
+ 	wantReg(ctxt, ins, pos, "float", r, REG_F0, REG_F31)
+ }
+ 
++// wantVectorReg checks that r is a vector register.
++func wantVectorReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
++	wantReg(ctxt, ins, pos, "vector", r, REG_V0, REG_V31)
++}
++
+ // wantEvenOffset checks that the offset is a multiple of two.
+ func wantEvenOffset(ctxt *obj.Link, ins *instruction, offset int64) { + if err := immEven(offset); err != nil { +-- +2.39.5 + diff --git a/2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch b/2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch new file mode 100644 index 0000000..8babafc --- /dev/null +++ b/2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch @@ -0,0 +1,578 @@ +From 14b23a9bebcd9c6d482db37e76bff8dcde86c7ee Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 058/119] cmd/internal/obj/riscv: update references to RISC-V + specification + +Update references to version 20240411 of the RISC-V specifications. +Reorder and regroup instructions to maintain ordering. + +Change-Id: Iea2a5d22ad677e04948e9a9325986ad301c03f35 +Reviewed-on: https://go-review.googlesource.com/c/go/+/616115 +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 55 ++++---- + src/cmd/internal/obj/riscv/anames.go | 42 +++--- + src/cmd/internal/obj/riscv/cpu.go | 134 ++++++++++---------- + 3 files changed, 119 insertions(+), 112 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 53b7b92faa..517930aa60 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -6,7 +6,9 @@ + + TEXT asmtest(SB),DUPOK|NOSPLIT,$0 + start: ++ // + // Unprivileged ISA ++ // + + // 2.4: Integer Computational Instructions + +@@ -139,7 +141,7 @@ start: + // 2.7: Memory Ordering Instructions + FENCE // 0f00f00f + +- // 5.2: Integer Computational Instructions (RV64I) ++ // 4.2: Integer Computational Instructions (RV64I) + ADDIW $1, X5, X6 // 1b831200 + SLLIW $1, X5, X6 // 1b931200 + SRLIW $1, X5, X6 // 1bd31200 +@@ -164,18 +166,25 @@ start: + SUBW $1, X6 // 1b03f3ff + SRAW $1, X6 // 1b531340 + +- // 5.3: Load and Store Instructions (RV64I) ++ // 4.3: Load and Store Instructions (RV64I) + LD (X5), X6 // 03b30200 + LD 4(X5), X6 // 03b34200 + SD X5, (X6) // 23305300 + SD X5, 4(X6) // 23325300 + +- // 7.1: Multiplication Operations ++ // 8.1: Base Counters and Timers (Zicntr) ++ RDCYCLE X5 // f32200c0 ++ RDTIME X5 // f32210c0 ++ RDINSTRET X5 // f32220c0 ++ ++ // 13.1: Multiplication Operations + MUL X5, X6, X7 // b3035302 + MULH X5, X6, X7 // b3135302 + MULHU X5, X6, X7 // b3335302 + MULHSU X5, X6, X7 // b3235302 + MULW X5, X6, X7 // bb035302 ++ ++ // 13.2: Division Operations + DIV X5, X6, X7 // b3435302 + DIVU X5, X6, X7 // b3535302 + REM X5, X6, X7 // b3635302 +@@ -185,13 +194,13 @@ start: + REMW X5, X6, X7 // bb635302 + REMUW X5, X6, X7 // bb735302 + +- // 8.2: Load-Reserved/Store-Conditional ++ // 14.2: Load-Reserved/Store-Conditional (Zalrsc) + LRW (X5), X6 // 2fa30214 + LRD (X5), X6 // 2fb30214 + SCW X5, (X6), X7 // af23531a + SCD X5, (X6), X7 // af33531a + +- // 8.3: Atomic Memory Operations ++ // 14.4: Atomic Memory Operations (Zaamo) + AMOSWAPW X5, (X6), X7 // af23530e + AMOSWAPD X5, (X6), X7 // af33530e + AMOADDW X5, (X6), X7 // af235306 +@@ -211,18 +220,13 @@ start: + AMOMINUW X5, (X6), X7 // af2353c6 + AMOMINUD X5, (X6), X7 // af3353c6 + +- // 10.1: Base Counters and Timers +- RDCYCLE X5 // f32200c0 +- RDTIME X5 // f32210c0 +- RDINSTRET X5 // f32220c0 +- +- // 11.5: Single-Precision Load and Store Instructions ++ // 20.5: Single-Precision Load and Store Instructions + FLW (X5), F0 // 07a00200 + 
FLW 4(X5), F0 // 07a04200 + FSW F0, (X5) // 27a00200 + FSW F0, 4(X5) // 27a20200 + +- // 11.6: Single-Precision Floating-Point Computational Instructions ++ // 20.6: Single-Precision Floating-Point Computational Instructions + FADDS F1, F0, F2 // 53011000 + FSUBS F1, F0, F2 // 53011008 + FMULS F1, F0, F2 // 53011010 +@@ -231,7 +235,7 @@ start: + FMAXS F1, F0, F2 // 53111028 + FSQRTS F0, F1 // d3000058 + +- // 11.7: Single-Precision Floating-Point Conversion and Move Instructions ++ // 20.7: Single-Precision Floating-Point Conversion and Move Instructions + FCVTWS F0, X5 // d31200c0 + FCVTWS.RNE F0, X5 // d30200c0 + FCVTWS.RTZ F0, X5 // d31200c0 +@@ -272,21 +276,21 @@ start: + FNMSUBS F1, F2, F3, F4 // 4b822018 + FNMADDS F1, F2, F3, F4 // 4f822018 + +- // 11.8: Single-Precision Floating-Point Compare Instructions ++ // 20.8: Single-Precision Floating-Point Compare Instructions + FEQS F0, F1, X7 // d3a300a0 + FLTS F0, F1, X7 // d39300a0 + FLES F0, F1, X7 // d38300a0 + +- // 11.9: Single-Precision Floating-Point Classify Instruction ++ // 20.9: Single-Precision Floating-Point Classify Instruction + FCLASSS F0, X5 // d31200e0 + +- // 12.3: Double-Precision Load and Store Instructions ++ // 21.3: Double-Precision Load and Store Instructions + FLD (X5), F0 // 07b00200 + FLD 4(X5), F0 // 07b04200 + FSD F0, (X5) // 27b00200 + FSD F0, 4(X5) // 27b20200 + +- // 12.4: Double-Precision Floating-Point Computational Instructions ++ // 21.4: Double-Precision Floating-Point Computational Instructions + FADDD F1, F0, F2 // 53011002 + FSUBD F1, F0, F2 // 5301100a + FMULD F1, F0, F2 // 53011012 +@@ -295,7 +299,7 @@ start: + FMAXD F1, F0, F2 // 5311102a + FSQRTD F0, F1 // d300005a + +- // 12.5: Double-Precision Floating-Point Conversion and Move Instructions ++ // 21.5: Double-Precision Floating-Point Conversion and Move Instructions + FCVTWD F0, X5 // d31200c2 + FCVTWD.RNE F0, X5 // d30200c2 + FCVTWD.RTZ F0, X5 // d31200c2 +@@ -336,11 +340,10 @@ start: + FNMSUBD F1, F2, F3, F4 // 4b82201a + FNMADDD F1, F2, F3, F4 // 4f82201a + +- // 12.6: Double-Precision Floating-Point Classify Instruction ++ // 21.7: Double-Precision Floating-Point Classify Instruction + FCLASSD F0, X5 // d31200e2 + +- // RISC-V Bit-Manipulation ISA-extensions (1.0) +- // 1.1: Address Generation Instructions (Zba) ++ // 28.4.1: Address Generation Instructions (Zba) + ADDUW X10, X11, X12 // 3b86a508 + ADDUW X10, X11 // bb85a508 + SH1ADD X11, X12, X13 // b326b620 +@@ -360,7 +363,7 @@ start: + SLLIUW $63, X17, X18 // 1b99f80b + SLLIUW $1, X18, X19 // 9b191908 + +- // 1.2: Basic Bit Manipulation (Zbb) ++ // 28.4.2: Basic Bit Manipulation (Zbb) + ANDN X19, X20, X21 // b37a3a41 or 93caf9ffb37a5a01 + ANDN X19, X20 // 337a3a41 or 93cff9ff337afa01 + CLZ X20, X21 // 931a0a60 +@@ -385,7 +388,7 @@ start: + XNOR X18, X19 // b3c92941 or b3c9290193c9f9ff + ZEXTH X19, X20 // 3bca0908 + +- // 1.3: Bitwise Rotation (Zbb) ++ // 28.4.2: Bitwise Rotation (Zbb) + ROL X8, X9, X10 // 33958460 or b30f8040b3dff4013395840033e5af00 + ROL X8, X9 // b3948460 or b30f8040b3dff401b3948400b3e49f00 + ROLW X9, X10, X11 // bb159560 or b30f9040bb5ff501bb159500b3e5bf00 +@@ -403,7 +406,7 @@ start: + ORCB X5, X6 // 13d37228 + REV8 X7, X8 // 13d4836b + +- // 1.5: Single-bit Instructions (Zbs) ++ // 28.4.4: Single-bit Instructions (Zbs) + BCLR X23, X24, X25 // b31c7c49 + BCLR $63, X24 // 131cfc4b + BCLRI $1, X25, X26 // 139d1c48 +@@ -417,9 +420,11 @@ start: + BSET $63, X9 // 9394f42b + BSETI $1, X10, X11 // 93151528 + ++ // + // Privileged ISA ++ // + +- // 3.2.1: Environment Call 
and Breakpoint ++ // 3.3.1: Environment Call and Breakpoint + ECALL // 73000000 + SCALL // 73000000 + EBREAK // 73001000 +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index 53cf1c95dc..dbdce22687 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -57,6 +57,18 @@ var Anames = []string{ + "SRAW", + "LD", + "SD", ++ "CSRRW", ++ "CSRRS", ++ "CSRRC", ++ "CSRRWI", ++ "CSRRSI", ++ "CSRRCI", ++ "RDCYCLE", ++ "RDCYCLEH", ++ "RDTIME", ++ "RDTIMEH", ++ "RDINSTRET", ++ "RDINSTRETH", + "MUL", + "MULH", + "MULHU", +@@ -92,12 +104,6 @@ var Anames = []string{ + "AMOMAXUW", + "AMOMINW", + "AMOMINUW", +- "RDCYCLE", +- "RDCYCLEH", +- "RDTIME", +- "RDTIMEH", +- "RDINSTRET", +- "RDINSTRETH", + "FRCSR", + "FSCSR", + "FRRM", +@@ -202,21 +208,6 @@ var Anames = []string{ + "FLEQ", + "FLTQ", + "FCLASSQ", +- "CSRRW", +- "CSRRS", +- "CSRRC", +- "CSRRWI", +- "CSRRSI", +- "CSRRCI", +- "ECALL", +- "SCALL", +- "EBREAK", +- "SBREAK", +- "MRET", +- "SRET", +- "DRET", +- "WFI", +- "SFENCEVMA", + "ADDUW", + "SH1ADD", + "SH1ADDUW", +@@ -632,6 +623,15 @@ var Anames = []string{ + "VMV2RV", + "VMV4RV", + "VMV8RV", ++ "ECALL", ++ "SCALL", ++ "EBREAK", ++ "SBREAK", ++ "MRET", ++ "SRET", ++ "DRET", ++ "WFI", ++ "SFENCEVMA", + "WORD", + "BEQZ", + "BGEZ", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index d2154593de..1c3a13f6c3 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -319,11 +319,13 @@ const ( + // + // As well as some pseudo-mnemonics (e.g. MOV) used only in the assembler. + // +-// See also "The RISC-V Instruction Set Manual" at https://riscv.org/specifications/. ++// See also "The RISC-V Instruction Set Manual" at https://riscv.org/technical/specifications/. + // + // If you modify this table, you MUST run 'go generate' to regenerate anames.go! 
+ const ( +- // Unprivileged ISA (Document Version 20190608-Base-Ratified) ++ // ++ // Unprivileged ISA (version 20240411) ++ // + + // 2.4: Integer Computational Instructions + AADDI = obj.ABaseRISCV + obj.A_ARCHSPECIFIC + iota +@@ -374,7 +376,7 @@ const ( + AFENCETSO + APAUSE + +- // 5.2: Integer Computational Instructions (RV64I) ++ // 4.2: Integer Computational Instructions (RV64I) + AADDIW + ASLLIW + ASRLIW +@@ -385,16 +387,34 @@ const ( + ASUBW + ASRAW + +- // 5.3: Load and Store Instructions (RV64I) ++ // 4.3: Load and Store Instructions (RV64I) + ALD + ASD + +- // 7.1: Multiplication Operations ++ // 7.1: CSR Instructions (Zicsr) ++ ACSRRW ++ ACSRRS ++ ACSRRC ++ ACSRRWI ++ ACSRRSI ++ ACSRRCI ++ ++ // 8.1: Base Counters and Timers (Zicntr) ++ ARDCYCLE ++ ARDCYCLEH ++ ARDTIME ++ ARDTIMEH ++ ARDINSTRET ++ ARDINSTRETH ++ ++ // 13.1: Multiplication Operations + AMUL + AMULH + AMULHU + AMULHSU + AMULW ++ ++ // 13.2: Division Operations + ADIV + ADIVU + AREM +@@ -404,13 +424,13 @@ const ( + AREMW + AREMUW + +- // 8.2: Load-Reserved/Store-Conditional Instructions ++ // 14.2: Load-Reserved/Store-Conditional Instructions (Zalrsc) + ALRD + ASCD + ALRW + ASCW + +- // 8.3: Atomic Memory Operations ++ // 14.4: Atomic Memory Operations (Zaamo) + AAMOSWAPD + AAMOADDD + AAMOANDD +@@ -430,15 +450,7 @@ const ( + AAMOMINW + AAMOMINUW + +- // 10.1: Base Counters and Timers +- ARDCYCLE +- ARDCYCLEH +- ARDTIME +- ARDTIMEH +- ARDINSTRET +- ARDINSTRETH +- +- // 11.2: Floating-Point Control and Status Register ++ // 20.2: Floating-Point Control and Status Register + AFRCSR + AFSCSR + AFRRM +@@ -448,11 +460,11 @@ const ( + AFSRMI + AFSFLAGSI + +- // 11.5: Single-Precision Load and Store Instructions ++ // 20.5: Single-Precision Load and Store Instructions + AFLW + AFSW + +- // 11.6: Single-Precision Floating-Point Computational Instructions ++ // 20.6: Single-Precision Floating-Point Computational Instructions + AFADDS + AFSUBS + AFMULS +@@ -465,7 +477,7 @@ const ( + AFNMADDS + AFNMSUBS + +- // 11.7: Single-Precision Floating-Point Conversion and Move Instructions ++ // 20.7: Single-Precision Floating-Point Conversion and Move Instructions + AFCVTWS + AFCVTLS + AFCVTSW +@@ -482,19 +494,19 @@ const ( + AFMVXW + AFMVWX + +- // 11.8: Single-Precision Floating-Point Compare Instructions ++ // 20.8: Single-Precision Floating-Point Compare Instructions + AFEQS + AFLTS + AFLES + +- // 11.9: Single-Precision Floating-Point Classify Instruction ++ // 20.9: Single-Precision Floating-Point Classify Instruction + AFCLASSS + +- // 12.3: Double-Precision Load and Store Instructions ++ // 21.3: Double-Precision Load and Store Instructions + AFLD + AFSD + +- // 12.4: Double-Precision Floating-Point Computational Instructions ++ // 21.4: Double-Precision Floating-Point Computational Instructions + AFADDD + AFSUBD + AFMULD +@@ -507,7 +519,7 @@ const ( + AFNMADDD + AFNMSUBD + +- // 12.5: Double-Precision Floating-Point Conversion and Move Instructions ++ // 21.5: Double-Precision Floating-Point Conversion and Move Instructions + AFCVTWD + AFCVTLD + AFCVTDW +@@ -524,19 +536,19 @@ const ( + AFMVXD + AFMVDX + +- // 12.6: Double-Precision Floating-Point Compare Instructions ++ // 21.6: Double-Precision Floating-Point Compare Instructions + AFEQD + AFLTD + AFLED + +- // 12.7: Double-Precision Floating-Point Classify Instruction ++ // 21.7: Double-Precision Floating-Point Classify Instruction + AFCLASSD + +- // 13.1 Quad-Precision Load and Store Instructions ++ // 22.1 Quad-Precision Load and Store Instructions + AFLQ + AFSQ + +- // 
13.2: Quad-Precision Computational Instructions ++ // 22.2: Quad-Precision Computational Instructions + AFADDQ + AFSUBQ + AFMULQ +@@ -549,7 +561,7 @@ const ( + AFNMADDQ + AFNMSUBQ + +- // 13.3 Quad-Precision Convert and Move Instructions ++ // 22.3 Quad-Precision Convert and Move Instructions + AFCVTWQ + AFCVTLQ + AFCVTSQ +@@ -566,46 +578,15 @@ const ( + AFSGNJNQ + AFSGNJXQ + +- // 13.4 Quad-Precision Floating-Point Compare Instructions ++ // 22.4 Quad-Precision Floating-Point Compare Instructions + AFEQQ + AFLEQ + AFLTQ + +- // 13.5 Quad-Precision Floating-Point Classify Instruction ++ // 22.5 Quad-Precision Floating-Point Classify Instruction + AFCLASSQ + +- // Privileged ISA (Version 20190608-Priv-MSU-Ratified) +- +- // 3.1.9: Instructions to Access CSRs +- ACSRRW +- ACSRRS +- ACSRRC +- ACSRRWI +- ACSRRSI +- ACSRRCI +- +- // 3.2.1: Environment Call and Breakpoint +- AECALL +- ASCALL +- AEBREAK +- ASBREAK +- +- // 3.2.2: Trap-Return Instructions +- AMRET +- ASRET +- ADRET +- +- // 3.2.3: Wait for Interrupt +- AWFI +- +- // 4.2.1: Supervisor Memory-Management Fence Instruction +- ASFENCEVMA +- +- // +- // RISC-V Bit-Manipulation ISA-extensions (1.0) +- // +- +- // 1.1: Address Generation Instructions (Zba) ++ // 28.4.1: Address Generation Instructions (Zba) + AADDUW + ASH1ADD + ASH1ADDUW +@@ -615,7 +596,7 @@ const ( + ASH3ADDUW + ASLLIUW + +- // 1.2: Basic Bit Manipulation (Zbb) ++ // 28.4.2: Basic Bit Manipulation (Zbb) + AANDN + AORN + AXNOR +@@ -633,7 +614,7 @@ const ( + ASEXTH + AZEXTH + +- // 1.3: Bitwise Rotation (Zbb) ++ // 28.4.3: Bitwise Rotation (Zbb) + AROL + AROLW + AROR +@@ -643,7 +624,7 @@ const ( + AORCB + AREV8 + +- // 1.5: Single-bit Instructions (Zbs) ++ // 28.4.4: Single-bit Instructions (Zbs) + ABCLR + ABCLRI + ABEXT +@@ -1144,6 +1125,27 @@ const ( + AVMV4RV + AVMV8RV + ++ // ++ // Privileged ISA (version 20240411) ++ // ++ ++ // 3.3.1: Environment Call and Breakpoint ++ AECALL ++ ASCALL ++ AEBREAK ++ ASBREAK ++ ++ // 3.3.2: Trap-Return Instructions ++ AMRET ++ ASRET ++ ADRET ++ ++ // 3.3.3: Wait for Interrupt ++ AWFI ++ ++ // 10.2: Supervisor Memory-Management Fence Instruction ++ ASFENCEVMA ++ + // The escape hatch. Inserts a single 32-bit word. + AWORD + +-- +2.39.5 + diff --git a/2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch b/2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch new file mode 100644 index 0000000..1fe9120 --- /dev/null +++ b/2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch @@ -0,0 +1,58 @@ +From 9f63043c4866fc204823e3fd62bacb1cfe0bbeb1 Mon Sep 17 00:00:00 2001 +From: Lin Runze +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 059/119] cmd/internal/obj: add prologue_end DWARF stmt for + riscv64 + +This patch adds prologue_end statement to the DWARF info for riscv64, +which delve debugger uses for skip stacksplit prologue. 
+ +Change-Id: I4e5d9c26202385f65b3118b16f53f66de9d327f0 +Reviewed-on: https://go-review.googlesource.com/c/go/+/620295 +Reviewed-by: Hyang-Ah Hana Kim +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/obj.go | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 6f74f38543..76aec7df8f 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -23,6 +23,7 @@ package riscv + import ( + "cmd/internal/obj" + "cmd/internal/objabi" ++ "cmd/internal/src" + "cmd/internal/sys" + "fmt" + "internal/abi" +@@ -427,18 +428,23 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + prologue = stacksplit(ctxt, prologue, cursym, newprog, stacksize) // emit split check + } + ++ q := prologue ++ + if stacksize != 0 { + prologue = ctxt.StartUnsafePoint(prologue, newprog) + + // Actually save LR. + prologue = obj.Appendp(prologue, newprog) + prologue.As = AMOV ++ prologue.Pos = q.Pos + prologue.From = obj.Addr{Type: obj.TYPE_REG, Reg: REG_LR} + prologue.To = obj.Addr{Type: obj.TYPE_MEM, Reg: REG_SP, Offset: -stacksize} + + // Insert stack adjustment. + prologue = obj.Appendp(prologue, newprog) + prologue.As = AADDI ++ prologue.Pos = q.Pos ++ prologue.Pos = prologue.Pos.WithXlogue(src.PosPrologueEnd) + prologue.From = obj.Addr{Type: obj.TYPE_CONST, Offset: -stacksize} + prologue.Reg = REG_SP + prologue.To = obj.Addr{Type: obj.TYPE_REG, Reg: REG_SP} +-- +2.39.5 + diff --git a/2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch b/2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch new file mode 100644 index 0000000..bb06b51 --- /dev/null +++ b/2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch @@ -0,0 +1,371 @@ +From b5e96f2d3a7ea8eb3f2114eac8f564197f64af29 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 060/119] cmd/internal/obj/riscv: update RISC-V instruction + table + +Regenerate RISC-V instruction table from the riscv-opcodes repository, +due to various changes and shuffling upstream. + +This has been changed to remove pseudo-instructions, since Go only +needs the instruction encodings and including the pseudo-instructions +is creating unnecessary complications (for example, the inclusion +of ANOP and ARET, as well as strangely named aliases such as +AJALPSEUDO/AJALRPSEUDO). Remove pseudo-instructions that are not +currently supported by the assembler and add specific handling for +RDCYCLE, RDTIME and RDINSTRET, which were previously implemented +via the instruction encodings. 
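For reference, RDCYCLE, RDTIME and RDINSTRET are defined in the unprivileged spec as CSRRS reads (with rs1 = x0) of the cycle, time and instret counters at CSR addresses 0xC00, 0xC01 and 0xC02. Stored in a sign-extended 12-bit immediate field, those addresses become -1024, -1023 and -1022, which is where the imm values in the new obj.go handling below come from. A small Go check of that arithmetic (illustrative only, not part of the change):

package main

import "fmt"

// signExtend12 interprets the low 12 bits of a CSR address as the signed
// immediate that an I-type instruction encoding carries.
func signExtend12(csr int64) int64 {
        return csr << 52 >> 52
}

func main() {
        for _, c := range []struct {
                name string
                csr  int64
        }{
                {"cycle", 0xC00},
                {"time", 0xC01},
                {"instret", 0xC02},
        } {
                fmt.Printf("%-8s %#x -> %d\n", c.name, c.csr, signExtend12(c.csr))
        }
        // Prints:
        // cycle    0xc00 -> -1024
        // time     0xc01 -> -1023
        // instret  0xc02 -> -1022
}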
+ +Change-Id: I78be4506ba6b627eba1f321406081a63bab5b2e6 +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Reviewed-on: https://go-review.googlesource.com/c/go/+/616116 +Reviewed-by: Michael Pratt +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Carlos Amedee +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/anames.go | 23 ++++------------ + src/cmd/internal/obj/riscv/cpu.go | 36 ++++++------------------- + src/cmd/internal/obj/riscv/inst.go | 40 ---------------------------- + src/cmd/internal/obj/riscv/obj.go | 35 ++++++++++++++++++------ + 4 files changed, 40 insertions(+), 94 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index dbdce22687..c49569c943 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -44,8 +44,6 @@ var Anames = []string{ + "SH", + "SB", + "FENCE", +- "FENCETSO", +- "PAUSE", + "ADDIW", + "SLLIW", + "SRLIW", +@@ -63,12 +61,6 @@ var Anames = []string{ + "CSRRWI", + "CSRRSI", + "CSRRCI", +- "RDCYCLE", +- "RDCYCLEH", +- "RDTIME", +- "RDTIMEH", +- "RDINSTRET", +- "RDINSTRETH", + "MUL", + "MULH", + "MULHU", +@@ -104,14 +96,6 @@ var Anames = []string{ + "AMOMAXUW", + "AMOMINW", + "AMOMINUW", +- "FRCSR", +- "FSCSR", +- "FRRM", +- "FSRM", +- "FRFLAGS", +- "FSFLAGS", +- "FSRMI", +- "FSFLAGSI", + "FLW", + "FSW", + "FADDS", +@@ -645,15 +629,15 @@ var Anames = []string{ + "BNEZ", + "FABSD", + "FABSS", ++ "FNED", + "FNEGD", + "FNEGS", +- "FNED", + "FNES", + "MOV", + "MOVB", + "MOVBU", +- "MOVF", + "MOVD", ++ "MOVF", + "MOVH", + "MOVHU", + "MOVW", +@@ -661,6 +645,9 @@ var Anames = []string{ + "NEG", + "NEGW", + "NOT", ++ "RDCYCLE", ++ "RDINSTRET", ++ "RDTIME", + "SEQZ", + "SNEZ", + "LAST", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 1c3a13f6c3..a36b95e6d2 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -373,8 +373,6 @@ const ( + + // 2.7: Memory Ordering Instructions + AFENCE +- AFENCETSO +- APAUSE + + // 4.2: Integer Computational Instructions (RV64I) + AADDIW +@@ -399,14 +397,6 @@ const ( + ACSRRSI + ACSRRCI + +- // 8.1: Base Counters and Timers (Zicntr) +- ARDCYCLE +- ARDCYCLEH +- ARDTIME +- ARDTIMEH +- ARDINSTRET +- ARDINSTRETH +- + // 13.1: Multiplication Operations + AMUL + AMULH +@@ -450,16 +440,6 @@ const ( + AAMOMINW + AAMOMINUW + +- // 20.2: Floating-Point Control and Status Register +- AFRCSR +- AFSCSR +- AFRRM +- AFSRM +- AFRFLAGS +- AFSFLAGS +- AFSRMI +- AFSFLAGSI +- + // 20.5: Single-Precision Load and Store Instructions + AFLW + AFSW +@@ -1163,15 +1143,15 @@ const ( + ABNEZ + AFABSD + AFABSS ++ AFNED + AFNEGD + AFNEGS +- AFNED + AFNES + AMOV + AMOVB + AMOVBU +- AMOVF + AMOVD ++ AMOVF + AMOVH + AMOVHU + AMOVW +@@ -1179,6 +1159,9 @@ const ( + ANEG + ANEGW + ANOT ++ ARDCYCLE ++ ARDINSTRET ++ ARDTIME + ASEQZ + ASNEZ + +@@ -1237,12 +1220,9 @@ const ( + // Any instructions not listed here are assumed to either be non-unary or to read + // from its argument. + var unaryDst = map[obj.As]bool{ +- ARDCYCLE: true, +- ARDCYCLEH: true, +- ARDTIME: true, +- ARDTIMEH: true, +- ARDINSTRET: true, +- ARDINSTRETH: true, ++ ARDCYCLE: true, ++ ARDTIME: true, ++ ARDINSTRET: true, + } + + // Instruction encoding masks. 
+diff --git a/src/cmd/internal/obj/riscv/inst.go b/src/cmd/internal/obj/riscv/inst.go +index c264f6ae15..2d9132e532 100644 +--- a/src/cmd/internal/obj/riscv/inst.go ++++ b/src/cmd/internal/obj/riscv/inst.go +@@ -212,8 +212,6 @@ func encode(a obj.As) *inst { + return &inst{0x53, 0x0, 0x0, 0x0, 384, 0xc} + case AFENCE: + return &inst{0xf, 0x0, 0x0, 0x0, 0, 0x0} +- case AFENCETSO: +- return &inst{0xf, 0x0, 0x0, 0x13, -1997, 0x41} + case AFEQD: + return &inst{0x53, 0x2, 0x0, 0x0, -1504, 0x51} + case AFEQQ: +@@ -270,14 +268,10 @@ func encode(a obj.As) *inst { + return &inst{0x53, 0x0, 0x0, 0x0, 256, 0x8} + case AFMVDX: + return &inst{0x53, 0x0, 0x0, 0x0, -224, 0x79} +- case AFMVSX: +- return &inst{0x53, 0x0, 0x0, 0x0, -256, 0x78} + case AFMVWX: + return &inst{0x53, 0x0, 0x0, 0x0, -256, 0x78} + case AFMVXD: + return &inst{0x53, 0x0, 0x0, 0x0, -480, 0x71} +- case AFMVXS: +- return &inst{0x53, 0x0, 0x0, 0x0, -512, 0x70} + case AFMVXW: + return &inst{0x53, 0x0, 0x0, 0x0, -512, 0x70} + case AFNMADDD: +@@ -292,20 +286,8 @@ func encode(a obj.As) *inst { + return &inst{0x4b, 0x0, 0x0, 0x0, 96, 0x3} + case AFNMSUBS: + return &inst{0x4b, 0x0, 0x0, 0x0, 0, 0x0} +- case AFRCSR: +- return &inst{0x73, 0x2, 0x0, 0x3, 3, 0x0} +- case AFRFLAGS: +- return &inst{0x73, 0x2, 0x0, 0x1, 1, 0x0} +- case AFRRM: +- return &inst{0x73, 0x2, 0x0, 0x2, 2, 0x0} +- case AFSCSR: +- return &inst{0x73, 0x1, 0x0, 0x3, 3, 0x0} + case AFSD: + return &inst{0x27, 0x3, 0x0, 0x0, 0, 0x0} +- case AFSFLAGS: +- return &inst{0x73, 0x1, 0x0, 0x1, 1, 0x0} +- case AFSFLAGSI: +- return &inst{0x73, 0x5, 0x0, 0x1, 1, 0x0} + case AFSGNJD: + return &inst{0x53, 0x0, 0x0, 0x0, 544, 0x11} + case AFSGNJQ: +@@ -332,10 +314,6 @@ func encode(a obj.As) *inst { + return &inst{0x53, 0x0, 0x0, 0x0, 1504, 0x2f} + case AFSQRTS: + return &inst{0x53, 0x0, 0x0, 0x0, 1408, 0x2c} +- case AFSRM: +- return &inst{0x73, 0x1, 0x0, 0x2, 2, 0x0} +- case AFSRMI: +- return &inst{0x73, 0x5, 0x0, 0x2, 2, 0x0} + case AFSUBD: + return &inst{0x53, 0x0, 0x0, 0x0, 160, 0x5} + case AFSUBQ: +@@ -396,20 +374,6 @@ func encode(a obj.As) *inst { + return &inst{0x13, 0x6, 0x0, 0x0, 0, 0x0} + case AORN: + return &inst{0x33, 0x6, 0x0, 0x0, 1024, 0x20} +- case APAUSE: +- return &inst{0xf, 0x0, 0x0, 0x10, 16, 0x0} +- case ARDCYCLE: +- return &inst{0x73, 0x2, 0x0, 0x0, -1024, 0x60} +- case ARDCYCLEH: +- return &inst{0x73, 0x2, 0x0, 0x0, -896, 0x64} +- case ARDINSTRET: +- return &inst{0x73, 0x2, 0x0, 0x2, -1022, 0x60} +- case ARDINSTRETH: +- return &inst{0x73, 0x2, 0x0, 0x2, -894, 0x64} +- case ARDTIME: +- return &inst{0x73, 0x2, 0x0, 0x1, -1023, 0x60} +- case ARDTIMEH: +- return &inst{0x73, 0x2, 0x0, 0x1, -895, 0x64} + case AREM: + return &inst{0x33, 0x6, 0x0, 0x0, 32, 0x1} + case AREMU: +@@ -434,14 +398,10 @@ func encode(a obj.As) *inst { + return &inst{0x3b, 0x5, 0x0, 0x0, 1536, 0x30} + case ASB: + return &inst{0x23, 0x0, 0x0, 0x0, 0, 0x0} +- case ASBREAK: +- return &inst{0x73, 0x0, 0x0, 0x1, 1, 0x0} + case ASCD: + return &inst{0x2f, 0x3, 0x0, 0x0, 384, 0xc} + case ASCW: + return &inst{0x2f, 0x2, 0x0, 0x0, 384, 0xc} +- case ASCALL: +- return &inst{0x73, 0x0, 0x0, 0x0, 0, 0x0} + case ASD: + return &inst{0x23, 0x3, 0x0, 0x0, 0, 0x0} + case ASEXTB: +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 76aec7df8f..6b490a8967 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -155,6 +155,14 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + case obj.AUNDEF: + p.As = AEBREAK + ++ case AFMVXS: ++ 
// FMVXS is the old name for FMVXW. ++ p.As = AFMVXW ++ ++ case AFMVSX: ++ // FMVSX is the old name for FMVWX. ++ p.As = AFMVWX ++ + case ASCALL: + // SCALL is the old name for ECALL. + p.As = AECALL +@@ -1626,6 +1634,9 @@ var encodings = [ALAST & obj.AMask]encoding{ + ALD & obj.AMask: iIIEncoding, + ASD & obj.AMask: sIEncoding, + ++ // 7.1: CSR Instructions ++ ACSRRS & obj.AMask: iIIEncoding, ++ + // 7.1: Multiplication Operations + AMUL & obj.AMask: rIIIEncoding, + AMULH & obj.AMask: rIIIEncoding, +@@ -1667,11 +1678,6 @@ var encodings = [ALAST & obj.AMask]encoding{ + AAMOMINUW & obj.AMask: rIIIEncoding, + AAMOMINUD & obj.AMask: rIIIEncoding, + +- // 10.1: Base Counters and Timers +- ARDCYCLE & obj.AMask: iIIEncoding, +- ARDTIME & obj.AMask: iIIEncoding, +- ARDINSTRET & obj.AMask: iIIEncoding, +- + // 11.5: Single-Precision Load and Store Instructions + AFLW & obj.AMask: iFEncoding, + AFSW & obj.AMask: sFEncoding, +@@ -1701,8 +1707,6 @@ var encodings = [ALAST & obj.AMask]encoding{ + AFSGNJS & obj.AMask: rFFFEncoding, + AFSGNJNS & obj.AMask: rFFFEncoding, + AFSGNJXS & obj.AMask: rFFFEncoding, +- AFMVXS & obj.AMask: rFIEncoding, +- AFMVSX & obj.AMask: rIFEncoding, + AFMVXW & obj.AMask: rFIEncoding, + AFMVWX & obj.AMask: rIFEncoding, + +@@ -2418,7 +2422,7 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.funct7 = 3 + ins.rd, ins.rs1, ins.rs2 = uint32(p.RegTo2), uint32(p.To.Reg), uint32(p.From.Reg) + +- case AECALL, AEBREAK, ARDCYCLE, ARDTIME, ARDINSTRET: ++ case AECALL, AEBREAK: + insEnc := encode(p.As) + if p.To.Type == obj.TYPE_NONE { + ins.rd = REG_ZERO +@@ -2426,6 +2430,21 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.rs1 = REG_ZERO + ins.imm = insEnc.csr + ++ case ARDCYCLE, ARDTIME, ARDINSTRET: ++ ins.as = ACSRRS ++ if p.To.Type == obj.TYPE_NONE { ++ ins.rd = REG_ZERO ++ } ++ ins.rs1 = REG_ZERO ++ switch p.As { ++ case ARDCYCLE: ++ ins.imm = -1024 ++ case ARDTIME: ++ ins.imm = -1023 ++ case ARDINSTRET: ++ ins.imm = -1022 ++ } ++ + case AFENCE: + ins.rd, ins.rs1, ins.rs2 = REG_ZERO, REG_ZERO, obj.REG_NONE + ins.imm = 0x0ff +-- +2.39.5 + diff --git a/2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch b/2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch new file mode 100644 index 0000000..3c92e2f --- /dev/null +++ b/2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch @@ -0,0 +1,110 @@ +From 07fe1dae08b8a60a417bc3c9b65e01ea19e3e3cd Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 061/119] crypto/sha512: improve performance of riscv64 + assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Implement optimised versions of Maj and Ch, which reduce the number of +instructions required per round. Reorder instructions for better +interleaving. 
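Both rewrites are standard Boolean identities, also spelled out in the updated comments in the diff below: Ch(x, y, z) = (x AND y) XOR (NOT x AND z) = ((y XOR z) AND x) XOR z, and Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) = ((y XOR z) AND x) XOR (y AND z). Each rewritten form saves one bitwise operation per evaluation, and the new Ch avoids the NOT entirely. A self-contained Go check of the identities (illustrative only, not part of the patch):

package main

import (
        "fmt"
        "math/rand"
)

func main() {
        ok := true
        for i := 0; i < 1000; i++ {
                x, y, z := rand.Uint64(), rand.Uint64(), rand.Uint64()
                // Ch: (x AND y) XOR (NOT x AND z) == ((y XOR z) AND x) XOR z
                ok = ok && (x&y)^(^x&z) == ((y^z)&x)^z
                // Maj: (x AND y) XOR (x AND z) XOR (y AND z) == ((y XOR z) AND x) XOR (y AND z)
                ok = ok && (x&y)^(x&z)^(y&z) == ((y^z)&x)^(y&z)
        }
        fmt.Println("identities hold:", ok) // identities hold: true
}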
+ +This gives around a 10% gain on a StarFive VisionFive 2: + + │ sha512.1 │ sha512.2 │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-4 9.310µ ± 0% 8.564µ ± 0% -8.01% (p=0.000 n=10) +Hash8Bytes/Sum384-4 8.833µ ± 0% 7.980µ ± 0% -9.66% (p=0.000 n=10) +Hash8Bytes/Sum512-4 9.293µ ± 0% 8.162µ ± 0% -12.17% (p=0.000 n=10) +Hash1K/New-4 49.60µ ± 0% 44.33µ ± 0% -10.63% (p=0.000 n=10) +Hash1K/Sum384-4 48.93µ ± 0% 43.78µ ± 0% -10.53% (p=0.000 n=10) +Hash1K/Sum512-4 49.48µ ± 0% 43.96µ ± 0% -11.15% (p=0.000 n=10) +Hash8K/New-4 327.9µ ± 0% 292.6µ ± 0% -10.78% (p=0.000 n=10) +Hash8K/Sum384-4 327.3µ ± 0% 292.0µ ± 0% -10.77% (p=0.000 n=10) +Hash8K/Sum512-4 327.8µ ± 0% 292.2µ ± 0% -10.85% (p=0.000 n=10) +geomean 52.87µ 47.31µ -10.51% + + │ sha512.1 │ sha512.2 │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 839.8Ki ± 0% 908.2Ki ± 0% +8.14% (p=0.000 n=10) +Hash8Bytes/Sum384-4 888.7Ki ± 1% 976.6Ki ± 0% +9.89% (p=0.000 n=10) +Hash8Bytes/Sum512-4 839.8Ki ± 0% 957.0Ki ± 0% +13.95% (p=0.000 n=10) +Hash1K/New-4 19.69Mi ± 0% 22.03Mi ± 0% +11.86% (p=0.000 n=10) +Hash1K/Sum384-4 19.96Mi ± 0% 22.31Mi ± 0% +11.75% (p=0.000 n=10) +Hash1K/Sum512-4 19.74Mi ± 0% 22.21Mi ± 0% +12.51% (p=0.000 n=10) +Hash8K/New-4 23.82Mi ± 0% 26.70Mi ± 0% +12.09% (p=0.000 n=10) +Hash8K/Sum384-4 23.87Mi ± 0% 26.75Mi ± 0% +12.07% (p=0.000 n=10) +Hash8K/Sum512-4 23.83Mi ± 0% 26.73Mi ± 0% +12.16% (p=0.000 n=10) +geomean 7.334Mi 8.184Mi +11.59% + +Change-Id: I66e359e96b25b38efbc4d840e6b2d6a1e5d417ec +Reviewed-on: https://go-review.googlesource.com/c/go/+/605495 +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +--- + src/crypto/sha512/sha512block_riscv64.s | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/src/crypto/sha512/sha512block_riscv64.s b/src/crypto/sha512/sha512block_riscv64.s +index e3a240f70e..0281464e4d 100644 +--- a/src/crypto/sha512/sha512block_riscv64.s ++++ b/src/crypto/sha512/sha512block_riscv64.s +@@ -98,38 +98,38 @@ + // T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt + // BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x) + // Ch(x, y, z) = (x AND y) XOR (NOT x AND z) ++// = ((y XOR z) AND x) XOR z + #define SHA512T1(index, e, f, g, h) \ + MOV (index*8)(X18), X8; \ + ADD X5, h; \ + ROR $14, e, X6; \ + ADD X8, h; \ + ROR $18, e, X7; \ +- XOR X7, X6; \ + ROR $41, e, X8; \ ++ XOR X7, X6; \ ++ XOR f, g, X5; \ + XOR X8, X6; \ ++ AND e, X5; \ + ADD X6, h; \ +- AND e, f, X5; \ +- NOT e, X7; \ +- AND g, X7; \ +- XOR X7, X5; \ ++ XOR g, X5; \ + ADD h, X5 + + // Calculate T2 in X6. + // T2 = BIGSIGMA0(a) + Maj(a, b, c) + // BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x) + // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) ++// = ((y XOR z) AND x) XOR (y AND z) + #define SHA512T2(a, b, c) \ + ROR $28, a, X6; \ + ROR $34, a, X7; \ +- XOR X7, X6; \ + ROR $39, a, X8; \ ++ XOR X7, X6; \ ++ XOR b, c, X9; \ ++ AND b, c, X7; \ ++ AND a, X9; \ + XOR X8, X6; \ +- AND a, b, X7; \ +- AND a, c, X8; \ +- XOR X8, X7; \ +- AND b, c, X9; \ +- XOR X9, X7; \ +- ADD X7, X6 ++ XOR X7, X9; \ ++ ADD X9, X6 + + // Calculate T1 and T2, then e = d + T1 and a = T1 + T2. + // The values for e and a are stored in d and h, ready for rotation. 
+-- +2.39.5 + diff --git a/2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch b/2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch new file mode 100644 index 0000000..0236441 --- /dev/null +++ b/2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch @@ -0,0 +1,466 @@ +From b8ad4498aa879366c7eaa60a3999a48c67195249 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 062/119] internal/bytealg: optimize IndexByte for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The existing implementations of IndexByte and IndexByteString for +riscv64 are very simplistic. They load and compare a single byte at +a time in a tight loop. It's possible to improve performance in the +general case by loading and checking 8 bytes at a time. This is +achieved using the 'Determine if a word has a byte equal to n' bit +hack from https://graphics.stanford.edu/~seander/bithacks.html. + +We broadcast the byte we're looking for across a 64 bit register, +let v be the result of xoring that register with 8 bytes loaded +from the buffer and then use the formula, + +(((v) - 0x0101010101010101UL) & ~(v) & 0x8080808080808080UL) + +which evaluates to true if any one of the bytes in v is 0, i.e, +matches the byte we're looking for. We then just need to figure +out which byte out of the 8 it is to return the correct index. + +This change generally improves performance when the byte we're +looking for is not in the first 24 bytes of the buffer and degrades +performance slightly when it is. + +Some example benchmarks results from the bytes and strings package +are presented below. These were generated on a VisionFive2 running +Ubuntu 24.04. + +Subset of bytes Index benchmarks + +IndexByte/10 46.49n ± 0% 44.08n ± 0% -5.19% (p=0.000 n=10) +IndexByte/32 75.98n ± 0% 67.90n ± 0% -10.63% (p=0.000 n=10) +IndexByte/4K 5.512µ ± 0% 2.113µ ± 0% -61.67% (p=0.000 n=10) +IndexByte/4M 7.354m ± 0% 3.218m ± 0% -56.24% (p=0.000 n=10) +IndexByte/64M 90.15m ± 0% 33.86m ± 0% -62.44% (p=0.000 n=10) +IndexBytePortable/10 50.41n ± 0% 54.92n ± 1% +8.94% (p=0.000 n=10) +IndexBytePortable/32 111.9n ± 0% 115.5n ± 0% +3.22% (p=0.000 n=10) +IndexBytePortable/4K 10.99µ ± 0% 10.99µ ± 0% +0.04% (p=0.000 n=10) +IndexBytePortable/4M 11.24m ± 0% 11.24m ± 0% ~ (p=0.218 n=10) +IndexBytePortable/64M 179.8m ± 0% 179.8m ± 0% +0.01% (p=0.001 n=10) +IndexRune/10 104.2n ± 0% 104.4n ± 0% +0.19% (p=0.000 n=10) +IndexRune/32 133.7n ± 0% 139.3n ± 0% +4.23% (p=0.000 n=10) +IndexRune/4K 5.573µ ± 0% 2.184µ ± 0% -60.81% (p=0.000 n=10) +IndexRune/4M 5.634m ± 0% 2.112m ± 0% -62.51% (p=0.000 n=10) +IndexRune/64M 90.19m ± 0% 33.87m ± 0% -62.45% (p=0.000 n=10) +IndexRuneASCII/10 50.42n ± 2% 47.14n ± 0% -6.52% (p=0.000 n=10) +IndexRuneASCII/32 79.64n ± 1% 70.39n ± 0% -11.61% (p=0.000 n=10) +IndexRuneASCII/4K 5.516µ ± 0% 2.115µ ± 0% -61.66% (p=0.000 n=10) +IndexRuneASCII/4M 5.634m ± 0% 2.112m ± 0% -62.51% (p=0.000 n=10) +IndexRuneASCII/64M 90.16m ± 0% 33.86m ± 0% -62.44% (p=0.000 n=10) +IndexRuneUnicode/Latin/10 82.14n ± 0% 82.07n ± 0% -0.09% (p=0.000 n=10) +IndexRuneUnicode/Latin/32 111.6n ± 0% 117.1n ± 0% +4.93% (p=0.000 n=10) +IndexRuneUnicode/Latin/4K 6.222µ ± 0% 3.429µ ± 0% -44.89% (p=0.000 n=10) +IndexRuneUnicode/Latin/4M 8.189m ± 0% 4.706m ± 0% -42.53% (p=0.000 n=10) +IndexRuneUnicode/Latin/64M 171.8m ± 2% 105.8m ± 0% -38.44% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/10 89.69n ± 0% 89.67n ± 0% -0.02% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/32 119.1n ± 0% 124.1n 
± 0% +4.20% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/4K 8.002µ ± 0% 6.232µ ± 0% -22.12% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/4M 9.501m ± 0% 7.510m ± 0% -20.95% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/64M 186.5m ± 0% 150.3m ± 0% -19.41% (p=0.000 n=10) +IndexRuneUnicode/Han/10 117.8n ± 0% 118.1n ± 0% +0.25% (p=0.000 n=10) +IndexRuneUnicode/Han/32 151.5n ± 0% 154.0n ± 0% +1.65% (p=0.000 n=10) +IndexRuneUnicode/Han/4K 6.664µ ± 0% 4.125µ ± 0% -38.11% (p=0.000 n=10) +IndexRuneUnicode/Han/4M 8.526m ± 0% 5.502m ± 0% -35.46% (p=0.000 n=10) +IndexRuneUnicode/Han/64M 171.8m ± 1% 112.2m ± 0% -34.68% (p=0.000 n=10) +Index/10 199.3n ± 1% 199.4n ± 0% ~ (p=1.000 n=10) +Index/32 547.7n ± 0% 547.3n ± 0% -0.08% (p=0.001 n=10) +Index/4K 38.62µ ± 0% 38.62µ ± 0% -0.01% (p=0.023 n=10) +Index/4M 40.46m ± 0% 40.45m ± 0% ~ (p=0.105 n=10) +Index/64M 648.5m ± 0% 648.4m ± 0% ~ (p=1.000 n=10) +IndexEasy/10 70.25n ± 0% 70.92n ± 0% +0.95% (p=0.000 n=10) +IndexEasy/32 104.60n ± 0% 95.67n ± 0% -8.54% (p=0.000 n=10) +IndexEasy/4K 5.544µ ± 0% 2.142µ ± 0% -61.36% (p=0.000 n=10) +IndexEasy/4M 7.354m ± 0% 3.213m ± 0% -56.32% (p=0.000 n=10) +IndexEasy/64M 114.93m ± 2% 52.61m ± 0% -54.22% (p=0.000 n=10) +IndexHard1 10.09m ± 0% 10.09m ± 0% ~ (p=0.393 n=10) +IndexHard2 10.09m ± 0% 10.09m ± 0% ~ (p=0.481 n=10) +IndexHard3 10.09m ± 0% 10.09m ± 0% ~ (p=1.000 n=10) +IndexHard4 10.09m ± 0% 10.09m ± 0% ~ (p=0.739 n=10) +LastIndexHard1 10.71m ± 0% 10.71m ± 0% ~ (p=0.052 n=10) +LastIndexHard2 10.71m ± 0% 10.71m ± 0% ~ (p=0.218 n=10) +LastIndexHard3 10.71m ± 0% 10.71m ± 0% ~ (p=0.739 n=10) +IndexAnyASCII/1:1 30.13n ± 0% 30.79n ± 0% +2.19% (p=0.000 n=10) +IndexAnyASCII/1:2 31.49n ± 0% 32.16n ± 0% +2.13% (p=0.000 n=10) +IndexAnyASCII/1:4 34.16n ± 0% 34.82n ± 0% +1.93% (p=0.000 n=10) +IndexAnyASCII/1:8 39.50n ± 0% 40.16n ± 0% +1.67% (p=0.000 n=10) +IndexAnyASCII/1:16 50.20n ± 0% 50.87n ± 0% +1.33% (p=0.000 n=10) +IndexAnyASCII/1:32 81.04n ± 0% 50.29n ± 0% -37.94% (p=0.000 n=10) +IndexAnyASCII/1:64 119.80n ± 0% 66.94n ± 0% -44.13% (p=0.000 n=10) +IndexAnyASCII/16:1 54.86n ± 0% 55.53n ± 0% +1.22% (p=0.000 n=10) +IndexAnyASCII/16:2 268.2n ± 0% 268.2n ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/16:4 288.1n ± 0% 288.1n ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyASCII/16:8 328.3n ± 0% 328.2n ± 0% ~ (p=0.370 n=10) +IndexAnyASCII/16:16 413.4n ± 0% 413.4n ± 0% ~ (p=0.628 n=10) +IndexAnyASCII/16:32 574.0n ± 0% 573.9n ± 0% ~ (p=0.141 n=10) +IndexAnyASCII/16:64 895.1n ± 0% 895.1n ± 0% ~ (p=0.548 n=10) +IndexAnyASCII/256:1 381.4n ± 0% 175.4n ± 0% -53.99% (p=0.000 n=10) +IndexAnyASCII/256:2 2.998µ ± 0% 2.998µ ± 0% ~ (p=0.365 n=10) +IndexAnyASCII/256:4 3.018µ ± 0% 3.018µ ± 0% ~ (p=0.650 n=10) +IndexAnyASCII/256:8 3.058µ ± 0% 3.064µ ± 0% +0.20% (p=0.011 n=10) +IndexAnyASCII/256:16 3.143µ ± 0% 3.150µ ± 0% +0.21% (p=0.000 n=10) +IndexAnyASCII/256:32 3.303µ ± 0% 3.307µ ± 0% +0.12% (p=0.000 n=10) +IndexAnyASCII/256:64 3.625µ ± 0% 3.638µ ± 0% +0.36% (p=0.000 n=10) +IndexAnyUTF8/1:1 30.13n ± 0% 30.94n ± 0% +2.69% (p=0.000 n=10) +IndexAnyUTF8/1:2 31.49n ± 0% 32.30n ± 0% +2.59% (p=0.000 n=10) +IndexAnyUTF8/1:4 34.16n ± 0% 35.03n ± 0% +2.55% (p=0.000 n=10) +IndexAnyUTF8/1:8 39.50n ± 0% 40.16n ± 0% +1.67% (p=0.000 n=10) +IndexAnyUTF8/1:16 50.20n ± 0% 50.84n ± 0% +1.27% (p=0.000 n=10) +IndexAnyUTF8/1:32 81.02n ± 0% 61.55n ± 0% -24.03% (p=0.000 n=10) +IndexAnyUTF8/1:64 119.80n ± 0% 80.04n ± 0% -33.19% (p=0.000 n=10) +IndexAnyUTF8/16:1 489.0n ± 0% 489.0n ± 0% ~ (p=1.000 n=10) +IndexAnyUTF8/16:2 361.9n ± 0% 372.6n ± 0% +2.96% (p=0.000 n=10) +IndexAnyUTF8/16:4 404.7n ± 0% 415.4n ± 
0% +2.64% (p=0.000 n=10) +IndexAnyUTF8/16:8 489.9n ± 0% 500.7n ± 0% +2.20% (p=0.000 n=10) +IndexAnyUTF8/16:16 661.2n ± 0% 671.9n ± 0% +1.62% (p=0.000 n=10) +IndexAnyUTF8/16:32 1004.0n ± 0% 881.6n ± 0% -12.19% (p=0.000 n=10) +IndexAnyUTF8/16:64 1.767µ ± 0% 1.129µ ± 0% -36.11% (p=0.000 n=10) +IndexAnyUTF8/256:1 7.072µ ± 0% 7.072µ ± 0% ~ (p=0.387 n=10) +IndexAnyUTF8/256:2 4.700µ ± 0% 4.872µ ± 0% +3.66% (p=0.000 n=10) +IndexAnyUTF8/256:4 5.386µ ± 0% 5.557µ ± 0% +3.18% (p=0.000 n=10) +IndexAnyUTF8/256:8 6.752µ ± 0% 6.923µ ± 0% +2.53% (p=0.000 n=10) +IndexAnyUTF8/256:16 9.493µ ± 0% 9.664µ ± 0% +1.80% (p=0.000 n=10) +IndexAnyUTF8/256:32 14.97µ ± 0% 12.93µ ± 0% -13.64% (p=0.000 n=10) +IndexAnyUTF8/256:64 27.15µ ± 0% 16.89µ ± 0% -37.80% (p=0.000 n=10) +LastIndexAnyASCII/1:1 30.78n ± 0% 31.45n ± 0% +2.18% (p=0.000 n=10) +LastIndexAnyASCII/1:2 32.13n ± 0% 32.80n ± 0% +2.07% (p=0.000 n=10) +LastIndexAnyASCII/1:4 34.81n ± 0% 35.48n ± 0% +1.92% (p=0.000 n=10) +LastIndexAnyASCII/1:8 40.14n ± 0% 40.81n ± 0% +1.67% (p=0.000 n=10) +LastIndexAnyASCII/1:16 50.85n ± 0% 51.51n ± 0% +1.30% (p=0.000 n=10) +LastIndexAnyASCII/1:32 84.03n ± 0% 50.85n ± 0% -39.49% (p=0.000 n=10) +LastIndexAnyASCII/1:64 121.50n ± 0% 68.16n ± 0% -43.90% (p=0.000 n=10) +LastIndexAnyASCII/16:1 249.7n ± 0% 249.7n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:2 255.2n ± 0% 255.2n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:4 274.0n ± 0% 274.0n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:8 314.1n ± 0% 314.1n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:16 403.8n ± 0% 403.8n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:32 564.4n ± 0% 564.4n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:64 885.5n ± 0% 885.5n ± 0% ~ (p=0.474 n=10) +LastIndexAnyASCII/256:1 2.819µ ± 0% 2.819µ ± 0% ~ (p=0.211 n=10) +LastIndexAnyASCII/256:2 2.824µ ± 0% 2.824µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:4 2.843µ ± 0% 2.843µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:8 2.883µ ± 0% 2.883µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:16 2.973µ ± 0% 2.973µ ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/256:32 3.133µ ± 0% 3.133µ ± 0% ~ (p=0.628 n=10) +LastIndexAnyASCII/256:64 3.454µ ± 0% 3.454µ ± 0% ~ (p=1.000 n=10) +LastIndexAnyUTF8/1:1 30.78n ± 0% 31.45n ± 0% +2.18% (p=0.000 n=10) +LastIndexAnyUTF8/1:2 32.13n ± 0% 32.80n ± 0% +2.07% (p=0.000 n=10) +LastIndexAnyUTF8/1:4 34.81n ± 0% 35.48n ± 0% +1.92% (p=0.000 n=10) +LastIndexAnyUTF8/1:8 40.14n ± 0% 40.81n ± 0% +1.67% (p=0.000 n=10) +LastIndexAnyUTF8/1:16 50.84n ± 0% 51.52n ± 0% +1.33% (p=0.000 n=10) +LastIndexAnyUTF8/1:32 83.87n ± 0% 62.90n ± 0% -25.00% (p=0.000 n=10) +LastIndexAnyUTF8/1:64 121.50n ± 0% 81.67n ± 0% -32.78% (p=0.000 n=10) +LastIndexAnyUTF8/16:1 330.0n ± 0% 330.0n ± 0% ~ (p=1.000 n=10) +LastIndexAnyUTF8/16:2 365.4n ± 1% 376.1n ± 0% +2.93% (p=0.000 n=10) +LastIndexAnyUTF8/16:4 399.9n ± 0% 410.6n ± 0% +2.68% (p=0.000 n=10) +LastIndexAnyUTF8/16:8 485.5n ± 0% 496.2n ± 0% +2.20% (p=0.000 n=10) +LastIndexAnyUTF8/16:16 656.8n ± 0% 667.5n ± 0% +1.63% (p=0.000 n=10) +LastIndexAnyUTF8/16:32 999.3n ± 0% 882.6n ± 0% -11.68% (p=0.000 n=10) +LastIndexAnyUTF8/16:64 1.744µ ± 0% 1.129µ ± 0% -35.26% (p=0.000 n=10) +LastIndexAnyUTF8/256:1 4.023µ ± 0% 4.023µ ± 0% 0.00% (p=0.033 n=10) +LastIndexAnyUTF8/256:2 4.645µ ± 0% 4.816µ ± 0% +3.68% (p=0.000 n=10) +LastIndexAnyUTF8/256:4 5.217µ ± 0% 5.388µ ± 0% +3.28% (p=0.000 n=10) +LastIndexAnyUTF8/256:8 6.587µ ± 0% 6.758µ ± 0% +2.60% (p=0.000 n=10) +LastIndexAnyUTF8/256:16 9.327µ ± 0% 9.498µ ± 0% +1.83% (p=0.000 n=10) +LastIndexAnyUTF8/256:32 14.81µ ± 0% 12.92µ ± 0% -12.73% 
(p=0.000 n=10) +LastIndexAnyUTF8/256:64 26.69µ ± 0% 16.84µ ± 0% -36.92% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic2 625.6µ ± 0% 625.6µ ± 0% ~ (p=0.529 n=10) +IndexPeriodic/IndexPeriodic4 625.5µ ± 0% 625.6µ ± 0% +0.01% (p=0.002 n=10) +IndexPeriodic/IndexPeriodic8 625.4µ ± 0% 625.4µ ± 0% +0.01% (p=0.001 n=10) +IndexPeriodic/IndexPeriodic16 236.5µ ± 0% 225.4µ ± 0% -4.69% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic32 171.1µ ± 3% 133.4µ ± 0% -22.05% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic64 139.10µ ± 3% 89.28µ ± 0% -35.82% (p=0.000 n=10) +geomean 4.222µ 3.628µ -14.0 + +Subset of strings Index benchmarks + +IndexRune 110.7n ± 0% 117.7n ± 0% +6.32% (p=0.000 n=10) +IndexRuneLongString 246.6n ± 0% 187.4n ± 3% -24.01% (p=0.000 n=10) +IndexRuneFastPath 46.82n ± 0% 46.06n ± 0% -1.62% (p=0.000 n=10) +Index 48.28n ± 0% 47.61n ± 0% -1.39% (p=0.000 n=10) +LastIndex 34.50n ± 0% 34.50n ± 0% ~ (p=1.000 n=10) ¹ +IndexByte 41.72n ± 0% 40.83n ± 0% -2.13% (p=0.000 n=10) +IndexHard1 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10) +IndexHard2 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10) +IndexHard3 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10) +IndexHard4 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10) +LastIndexHard1 10.71m ± 0% 10.71m ± 0% +0.03% (p=0.000 n=10) +LastIndexHard2 10.71m ± 0% 10.71m ± 0% +0.03% (p=0.000 n=10) +LastIndexHard3 10.71m ± 0% 10.71m ± 0% +0.03% (p=0.000 n=10) +IndexTorture 71.33µ ± 0% 71.37µ ± 0% +0.05% (p=0.000 n=10) +IndexAnyASCII/1:1 34.40n ± 0% 35.07n ± 0% +1.95% (p=0.000 n=10) +IndexAnyASCII/1:2 46.87n ± 0% 47.54n ± 0% +1.43% (p=0.000 n=10) +IndexAnyASCII/1:4 49.53n ± 0% 50.20n ± 0% +1.35% (p=0.000 n=10) +IndexAnyASCII/1:8 54.86n ± 0% 55.53n ± 0% +1.22% (p=0.000 n=10) +IndexAnyASCII/1:16 65.56n ± 0% 66.24n ± 0% +1.04% (p=0.000 n=10) +IndexAnyASCII/1:32 86.97n ± 0% 77.82n ± 0% -10.52% (p=0.000 n=10) +IndexAnyASCII/1:64 134.50n ± 0% 98.57n ± 0% -26.71% (p=0.000 n=10) +IndexAnyASCII/16:1 54.19n ± 0% 54.86n ± 0% +1.24% (p=0.000 n=10) +IndexAnyASCII/16:2 257.4n ± 0% 256.7n ± 0% -0.27% (p=0.000 n=10) +IndexAnyASCII/16:4 275.3n ± 0% 275.3n ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/16:8 315.4n ± 0% 315.5n ± 0% +0.03% (p=0.001 n=10) +IndexAnyASCII/16:16 405.4n ± 0% 405.4n ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/16:32 566.0n ± 0% 566.0n ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/16:64 887.0n ± 0% 887.1n ± 0% ~ (p=0.181 n=10) +IndexAnyASCII/256:1 380.0n ± 0% 174.7n ± 0% -54.03% (p=0.000 n=10) +IndexAnyASCII/256:2 2.826µ ± 0% 2.826µ ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyASCII/256:4 2.844µ ± 0% 2.844µ ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyASCII/256:8 2.884µ ± 0% 2.884µ ± 0% ~ (p=0.087 n=10) +IndexAnyASCII/256:16 2.974µ ± 0% 2.974µ ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/256:32 3.135µ ± 0% 3.135µ ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/256:64 3.456µ ± 0% 3.456µ ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyUTF8/1:1 38.13n ± 0% 38.13n ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyUTF8/1:2 46.87n ± 0% 47.54n ± 0% +1.43% (p=0.000 n=10) +IndexAnyUTF8/1:4 49.53n ± 0% 50.19n ± 0% +1.33% (p=0.000 n=10) +IndexAnyUTF8/1:8 54.86n ± 0% 55.52n ± 0% +1.20% (p=0.000 n=10) +IndexAnyUTF8/1:16 65.56n ± 0% 66.23n ± 0% +1.02% (p=0.000 n=10) +IndexAnyUTF8/1:32 86.97n ± 0% 82.25n ± 0% -5.42% (p=0.000 n=10) +IndexAnyUTF8/1:64 134.50n ± 0% 99.96n ± 0% -25.68% (p=0.000 n=10) +IndexAnyUTF8/16:1 98.34n ± 0% 98.34n ± 0% ~ (p=1.000 n=10) +IndexAnyUTF8/16:2 462.7n ± 0% 473.7n ± 0% +2.38% (p=0.000 n=10) +IndexAnyUTF8/16:4 504.6n ± 0% 515.3n ± 0% +2.11% (p=0.000 n=10) +IndexAnyUTF8/16:8 589.1n ± 0% 599.7n ± 0% +1.80% (p=0.000 n=10) +IndexAnyUTF8/16:16 760.4n ± 0% 770.9n ± 0% 
+1.38% (p=0.000 n=10) +IndexAnyUTF8/16:32 1.103µ ± 0% 1.023µ ± 0% -7.25% (p=0.000 n=10) +IndexAnyUTF8/16:64 1.857µ ± 0% 1.294µ ± 0% -30.32% (p=0.000 n=10) +IndexAnyUTF8/256:1 1.066µ ± 0% 1.066µ ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyUTF8/256:2 6.106µ ± 0% 6.277µ ± 0% +2.81% (p=0.000 n=10) +IndexAnyUTF8/256:4 6.787µ ± 0% 6.958µ ± 0% +2.52% (p=0.000 n=10) +IndexAnyUTF8/256:8 8.136µ ± 0% 8.308µ ± 0% +2.11% (p=0.000 n=10) +IndexAnyUTF8/256:16 10.88µ ± 0% 11.05µ ± 0% +1.57% (p=0.000 n=10) +IndexAnyUTF8/256:32 16.36µ ± 0% 14.90µ ± 0% -8.93% (p=0.000 n=10) +IndexAnyUTF8/256:64 28.51µ ± 0% 19.41µ ± 0% -31.92% (p=0.000 n=10) +LastIndexAnyASCII/1:1 35.79n ± 0% 38.52n ± 0% +7.63% (p=0.000 n=10) +LastIndexAnyASCII/1:2 37.12n ± 0% 39.85n ± 0% +7.35% (p=0.000 n=10) +LastIndexAnyASCII/1:4 39.76n ± 0% 42.08n ± 0% +5.84% (p=0.000 n=10) +LastIndexAnyASCII/1:8 44.82n ± 0% 47.22n ± 0% +5.34% (p=0.000 n=10) +LastIndexAnyASCII/1:16 55.53n ± 0% 57.92n ± 3% +4.30% (p=0.000 n=10) +LastIndexAnyASCII/1:32 76.94n ± 0% 70.16n ± 0% -8.81% (p=0.000 n=10) +LastIndexAnyASCII/1:64 124.40n ± 0% 89.67n ± 0% -27.92% (p=0.000 n=10) +LastIndexAnyASCII/16:1 245.9n ± 0% 245.9n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:2 255.2n ± 0% 255.2n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:4 275.1n ± 0% 275.1n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:8 315.2n ± 0% 315.2n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:16 400.4n ± 0% 400.4n ± 0% ~ (p=0.087 n=10) +LastIndexAnyASCII/16:32 560.9n ± 0% 560.9n ± 0% ~ (p=0.124 n=10) +LastIndexAnyASCII/16:64 882.1n ± 0% 882.0n ± 0% -0.01% (p=0.003 n=10) +LastIndexAnyASCII/256:1 2.815µ ± 0% 2.815µ ± 0% ~ (p=0.211 n=10) +LastIndexAnyASCII/256:2 2.824µ ± 0% 2.824µ ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/256:4 2.844µ ± 0% 2.844µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:8 2.884µ ± 0% 2.884µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:16 2.969µ ± 0% 2.969µ ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/256:32 3.130µ ± 0% 3.130µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:64 3.451µ ± 0% 3.451µ ± 0% ~ (p=0.474 n=10) +LastIndexAnyUTF8/1:1 35.79n ± 0% 36.13n ± 0% +0.95% (p=0.000 n=10) +LastIndexAnyUTF8/1:2 37.11n ± 0% 37.47n ± 0% +0.97% (p=0.000 n=10) +LastIndexAnyUTF8/1:4 39.75n ± 0% 40.14n ± 0% +0.97% (p=0.000 n=10) +LastIndexAnyUTF8/1:8 44.82n ± 0% 45.49n ± 0% +1.49% (p=0.000 n=10) +LastIndexAnyUTF8/1:16 55.52n ± 0% 56.20n ± 0% +1.22% (p=0.000 n=10) +LastIndexAnyUTF8/1:32 76.93n ± 0% 74.25n ± 0% -3.48% (p=0.000 n=10) +LastIndexAnyUTF8/1:64 124.40n ± 0% 91.15n ± 0% -26.73% (p=0.000 n=10) +LastIndexAnyUTF8/16:1 322.5n ± 0% 322.5n ± 0% ~ (p=0.087 n=10) +LastIndexAnyUTF8/16:2 634.2n ± 0% 616.4n ± 0% -2.81% (p=0.000 n=10) +LastIndexAnyUTF8/16:4 674.5n ± 0% 657.9n ± 0% -2.46% (p=0.000 n=10) +LastIndexAnyUTF8/16:8 758.3n ± 0% 741.0n ± 0% -2.28% (p=0.000 n=10) +LastIndexAnyUTF8/16:16 929.6n ± 0% 912.3n ± 0% -1.86% (p=0.000 n=10) +LastIndexAnyUTF8/16:32 1.272µ ± 0% 1.176µ ± 0% -7.55% (p=0.000 n=10) +LastIndexAnyUTF8/16:64 2.018µ ± 0% 1.453µ ± 0% -28.00% (p=0.000 n=10) +LastIndexAnyUTF8/256:1 4.015µ ± 0% 4.016µ ± 0% +0.02% (p=0.000 n=10) +LastIndexAnyUTF8/256:2 8.896µ ± 0% 8.537µ ± 0% -4.04% (p=0.000 n=10) +LastIndexAnyUTF8/256:4 9.553µ ± 0% 9.217µ ± 0% -3.52% (p=0.000 n=10) +LastIndexAnyUTF8/256:8 10.90µ ± 0% 10.54µ ± 0% -3.29% (p=0.000 n=10) +LastIndexAnyUTF8/256:16 13.64µ ± 0% 13.28µ ± 0% -2.63% (p=0.000 n=10) +LastIndexAnyUTF8/256:32 19.12µ ± 0% 17.16µ ± 1% -10.23% (p=0.000 n=10) +LastIndexAnyUTF8/256:64 31.11µ ± 0% 21.98µ ± 0% -29.36% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic2 625.5µ ± 0% 625.5µ ± 
0% ~ (p=0.955 n=10) +IndexPeriodic/IndexPeriodic4 625.4µ ± 0% 625.4µ ± 0% ~ (p=0.838 n=10) +IndexPeriodic/IndexPeriodic8 625.3µ ± 0% 625.3µ ± 0% +0.01% (p=0.009 n=10) +IndexPeriodic/IndexPeriodic16 229.8µ ± 0% 227.0µ ± 0% -1.22% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic32 168.9µ ± 3% 131.8µ ± 0% -22.00% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic64 126.36µ ± 0% 86.66µ ± 0% -31.42% (p=0.000 n=10) +geomean 1.361µ 1.302µ -4.31% + +As these functions are so heavily used this change impacts other +benchmarks. I include the improvements in geomean for the all the +benchmarks in the strings and bytes packages, along with some +selected benchmarks to illustrate the impact of the change. + +geomean for bytes 13.81µ 12.92µ -6.44% +geomean for string 9.385µ 9.224µ -1.72% + +Note that when building for rva22u64 a single Zbb instruction is used +in the main loop. This also helps to improve performance slightly. +The geomean for all the bytes benchmarks when building with +GORISCV64=rva22u64 with and without the patch is shown below. + +geomean for bytes (rva22u64) 13.46µ 12.49µ -7.21% + +Examples of non-Index benchmarks affected by this commit. + +ReadString uses IndexByte to search for a byte stored at the end of +32KB buffer, so we see a speed up. SplitSingleByteSeparator searches +large buffers, but the byte being sought occurs within the first 15 +bytes of the buffer, 76% of the time, hence the slowdown. In +SplitMultiByteSeparator the first byte of the separator only occurs +in the first 15 bytes 33% of the time so we see a speed up. + +ReadString 05.13µ ± 2% 74.67µ ± 0% -28.97% (p=0.000 n=10) +SplitSingleByteSeparator 11.31m ± 2% 12.43m ± 1% +9.83% (p=0.000 n=10) +SplitMultiByteSeparator 8.070m ± 1% 7.707m ± 1% -4.49% (p=0.000 n=10) + +Change-Id: I6210ea2f3decdc6d2e0609df72b1b66e6d6f5395 +Reviewed-on: https://go-review.googlesource.com/c/go/+/561275 +Reviewed-by: Joel Sing +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +--- + src/internal/bytealg/indexbyte_riscv64.s | 100 +++++++++++++++++++++-- + 1 file changed, 94 insertions(+), 6 deletions(-) + +diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s +index de00983c7b..fde00da0ea 100644 +--- a/src/internal/bytealg/indexbyte_riscv64.s ++++ b/src/internal/bytealg/indexbyte_riscv64.s +@@ -10,31 +10,118 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + // X11 = b_len + // X12 = b_cap (unused) + // X13 = byte to find +- AND $0xff, X13 +- MOV X10, X12 // store base for later ++ AND $0xff, X13, X12 // x12 byte to look for ++ MOV X10, X13 // store base for later ++ ++ SLTI $24, X11, X14 + ADD X10, X11 // end +- SUB $1, X10 ++ BEQZ X14, bigBody + ++ SUB $1, X10 + loop: + ADD $1, X10 + BEQ X10, X11, notfound + MOVBU (X10), X14 +- BNE X13, X14, loop ++ BNE X12, X14, loop + +- SUB X12, X10 // remove base ++ SUB X13, X10 // remove base + RET + + notfound: + MOV $-1, X10 + RET + ++bigBody: ++ JMP indexByteBig<>(SB) ++ + TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + // X10 = b_base + // X11 = b_len + // X12 = byte to find +- AND $0xff, X12 ++ ++ AND $0xff, X12 // x12 byte to look for + MOV X10, X13 // store base for later ++ ++ SLTI $24, X11, X14 + ADD X10, X11 // end ++ BEQZ X14, bigBody ++ ++ SUB $1, X10 ++loop: ++ ADD $1, X10 ++ BEQ X10, X11, notfound ++ MOVBU (X10), X14 ++ BNE X12, X14, loop ++ ++ SUB X13, X10 // remove base ++ RET ++ ++notfound: ++ MOV $-1, X10 ++ RET ++ ++bigBody: ++ JMP indexByteBig<>(SB) ++ ++TEXT indexByteBig<>(SB),NOSPLIT|NOFRAME,$0 ++ // On 
entry ++ // X10 = b_base ++ // X11 = end ++ // X12 = byte to find ++ // X13 = b_base ++ // X11 is at least 16 bytes > X10 ++ ++ // On exit ++ // X10 = index of first instance of sought byte, if found, or -1 otherwise ++ ++ // Process the first few bytes until we get to an 8 byte boundary ++ // No need to check for end here as we have at least 16 bytes in ++ // the buffer. ++ ++unalignedloop: ++ AND $7, X10, X14 ++ BEQZ X14, aligned ++ MOVBU (X10), X14 ++ BEQ X12, X14, found ++ ADD $1, X10 ++ JMP unalignedloop ++ ++aligned: ++ AND $~7, X11, X15 // X15 = end of aligned data ++ ++ // We have at least 9 bytes left ++ ++ // Use 'Determine if a word has a byte equal to n' bit hack from ++ // https://graphics.stanford.edu/~seander/bithacks.html to determine ++ // whether the byte is present somewhere in the next 8 bytes of the ++ // array. ++ ++ MOV $0x0101010101010101, X16 ++ SLLI $7, X16, X17 // X17 = 0x8080808080808080 ++ ++ MUL X12, X16, X18 // broadcast X12 to every byte in X18 ++ ++alignedloop: ++ MOV (X10), X14 ++ XOR X14, X18, X19 ++ ++ // If the LSB in X12 is present somewhere in the 8 bytes we've just ++ // loaded into X14 then at least one of the bytes in X19 will be 0 ++ // after the XOR. If any of the bytes in X19 are zero then ++ // ++ // ((X19 - X16) & (~X19) & X17) ++ // ++ // will be non-zero. The expression will evaluate to zero if none of ++ // the bytes in X19 are zero, i.e., X12 is not present in X14. ++ ++ SUB X16, X19, X20 ++ ANDN X19, X17, X21 ++ AND X20, X21 ++ BNEZ X21, tailloop // If X21 != 0 X12 is present in X14 ++ ADD $8, X10 ++ BNE X10, X15, alignedloop ++ ++tailloop: + SUB $1, X10 + + loop: +@@ -43,6 +130,7 @@ loop: + MOVBU (X10), X14 + BNE X12, X14, loop + ++found: + SUB X13, X10 // remove base + RET + +-- +2.39.5 + diff --git a/2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch b/2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch new file mode 100644 index 0000000..f630789 --- /dev/null +++ b/2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch @@ -0,0 +1,624 @@ +From 28c6d3f567efbad8c616f1964fbb4f88848991ba Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 063/119] cmd/internal/obj/riscv: rework instruction encoding + information + +Currently, instruction encoding is a slice of encoding types, which +is indexed by a masked version of the riscv64 opcode. Additional +information about some instructions (for example, if an instruction +has a ternary form and if there is an immediate form for an instruction) +is manually specified in other parts of the assembler code. + +Rework the instruction encoding information so that we use a table +driven form, providing additional data for each instruction where +relevant. This means that we can simplify other parts of the code +by simply looking up the instruction data and reusing minimal logic. 
+ +Change-Id: I7b3b6c61a4868647edf28bd7dbae2150e043ae00 +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Reviewed-on: https://go-review.googlesource.com/c/go/+/622535 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +--- + src/cmd/internal/obj/riscv/obj.go | 491 ++++++++++++++---------------- + 1 file changed, 234 insertions(+), 257 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 6b490a8967..5e7092ab36 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -53,18 +53,14 @@ func jalToSym(ctxt *obj.Link, p *obj.Prog, lr int16) { + // progedit is called individually for each *obj.Prog. It normalizes instruction + // formats and eliminates as many pseudo-instructions as possible. + func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { ++ insData, err := instructionDataForAs(p.As) ++ if err != nil { ++ panic(fmt.Sprintf("failed to lookup instruction data for %v: %v", p.As, err)) ++ } + + // Expand binary instructions to ternary ones. + if p.Reg == obj.REG_NONE { +- switch p.As { +- case AADDI, ASLTI, ASLTIU, AANDI, AORI, AXORI, ASLLI, ASRLI, ASRAI, +- AADDIW, ASLLIW, ASRLIW, ASRAIW, AADDW, ASUBW, ASLLW, ASRLW, ASRAW, +- AADD, AAND, AOR, AXOR, ASLL, ASRL, ASUB, ASRA, +- AMUL, AMULH, AMULHU, AMULHSU, AMULW, ADIV, ADIVU, ADIVW, ADIVUW, +- AREM, AREMU, AREMW, AREMUW, +- AADDUW, ASH1ADD, ASH1ADDUW, ASH2ADD, ASH2ADDUW, ASH3ADD, ASH3ADDUW, ASLLIUW, +- AANDN, AORN, AXNOR, AMAX, AMAXU, AMIN, AMINU, AROL, AROLW, AROR, ARORW, ARORI, ARORIW, +- ABCLR, ABCLRI, ABEXT, ABEXTI, ABINV, ABINVI, ABSET, ABSETI: ++ if insData.ternary { + p.Reg = p.To.Reg + } + } +@@ -73,48 +69,14 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + // form of the instruction. + if p.From.Type == obj.TYPE_CONST { + switch p.As { +- case AADD: +- p.As = AADDI + case ASUB: + p.As, p.From.Offset = AADDI, -p.From.Offset +- case ASLT: +- p.As = ASLTI +- case ASLTU: +- p.As = ASLTIU +- case AAND: +- p.As = AANDI +- case AOR: +- p.As = AORI +- case AXOR: +- p.As = AXORI +- case ASLL: +- p.As = ASLLI +- case ASRL: +- p.As = ASRLI +- case ASRA: +- p.As = ASRAI +- case AADDW: +- p.As = AADDIW + case ASUBW: + p.As, p.From.Offset = AADDIW, -p.From.Offset +- case ASLLW: +- p.As = ASLLIW +- case ASRLW: +- p.As = ASRLIW +- case ASRAW: +- p.As = ASRAIW +- case AROR: +- p.As = ARORI +- case ARORW: +- p.As = ARORIW +- case ABCLR: +- p.As = ABCLRI +- case ABEXT: +- p.As = ABEXTI +- case ABINV: +- p.As = ABINVI +- case ABSET: +- p.As = ABSETI ++ default: ++ if insData.immForm != obj.AXXX { ++ p.As = insData.immForm ++ } + } + } + +@@ -1566,285 +1528,300 @@ var ( + badEncoding = encoding{encode: func(*instruction) uint32 { return 0 }, validate: func(*obj.Link, *instruction) {}, length: 0} + ) + +-// encodings contains the encodings for RISC-V instructions. +-// Instructions are masked with obj.AMask to keep indices small. +-var encodings = [ALAST & obj.AMask]encoding{ ++// instructionData specifies details relating to a RISC-V instruction. ++type instructionData struct { ++ enc encoding ++ immForm obj.As // immediate form of this instruction ++ ternary bool ++} + ++// instructions contains details of RISC-V instructions, including ++// their encoding type. Entries are masked with obj.AMask to keep ++// indices small. 
++var instructions = [ALAST & obj.AMask]instructionData{ + // Unprivileged ISA + + // 2.4: Integer Computational Instructions +- AADDI & obj.AMask: iIIEncoding, +- ASLTI & obj.AMask: iIIEncoding, +- ASLTIU & obj.AMask: iIIEncoding, +- AANDI & obj.AMask: iIIEncoding, +- AORI & obj.AMask: iIIEncoding, +- AXORI & obj.AMask: iIIEncoding, +- ASLLI & obj.AMask: iIIEncoding, +- ASRLI & obj.AMask: iIIEncoding, +- ASRAI & obj.AMask: iIIEncoding, +- ALUI & obj.AMask: uEncoding, +- AAUIPC & obj.AMask: uEncoding, +- AADD & obj.AMask: rIIIEncoding, +- ASLT & obj.AMask: rIIIEncoding, +- ASLTU & obj.AMask: rIIIEncoding, +- AAND & obj.AMask: rIIIEncoding, +- AOR & obj.AMask: rIIIEncoding, +- AXOR & obj.AMask: rIIIEncoding, +- ASLL & obj.AMask: rIIIEncoding, +- ASRL & obj.AMask: rIIIEncoding, +- ASUB & obj.AMask: rIIIEncoding, +- ASRA & obj.AMask: rIIIEncoding, ++ AADDI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASLTI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASLTIU & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ AANDI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ AORI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ AXORI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASLLI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASRLI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASRAI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ALUI & obj.AMask: {enc: uEncoding}, ++ AAUIPC & obj.AMask: {enc: uEncoding}, ++ AADD & obj.AMask: {enc: rIIIEncoding, immForm: AADDI, ternary: true}, ++ ASLT & obj.AMask: {enc: rIIIEncoding, immForm: ASLTI, ternary: true}, ++ ASLTU & obj.AMask: {enc: rIIIEncoding, immForm: ASLTIU, ternary: true}, ++ AAND & obj.AMask: {enc: rIIIEncoding, immForm: AANDI, ternary: true}, ++ AOR & obj.AMask: {enc: rIIIEncoding, immForm: AORI, ternary: true}, ++ AXOR & obj.AMask: {enc: rIIIEncoding, immForm: AXORI, ternary: true}, ++ ASLL & obj.AMask: {enc: rIIIEncoding, immForm: ASLLI, ternary: true}, ++ ASRL & obj.AMask: {enc: rIIIEncoding, immForm: ASRLI, ternary: true}, ++ ASUB & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASRA & obj.AMask: {enc: rIIIEncoding, immForm: ASRAI, ternary: true}, + + // 2.5: Control Transfer Instructions +- AJAL & obj.AMask: jEncoding, +- AJALR & obj.AMask: iIIEncoding, +- ABEQ & obj.AMask: bEncoding, +- ABNE & obj.AMask: bEncoding, +- ABLT & obj.AMask: bEncoding, +- ABLTU & obj.AMask: bEncoding, +- ABGE & obj.AMask: bEncoding, +- ABGEU & obj.AMask: bEncoding, ++ AJAL & obj.AMask: {enc: jEncoding}, ++ AJALR & obj.AMask: {enc: iIIEncoding}, ++ ABEQ & obj.AMask: {enc: bEncoding}, ++ ABNE & obj.AMask: {enc: bEncoding}, ++ ABLT & obj.AMask: {enc: bEncoding}, ++ ABLTU & obj.AMask: {enc: bEncoding}, ++ ABGE & obj.AMask: {enc: bEncoding}, ++ ABGEU & obj.AMask: {enc: bEncoding}, + + // 2.6: Load and Store Instructions +- ALW & obj.AMask: iIIEncoding, +- ALWU & obj.AMask: iIIEncoding, +- ALH & obj.AMask: iIIEncoding, +- ALHU & obj.AMask: iIIEncoding, +- ALB & obj.AMask: iIIEncoding, +- ALBU & obj.AMask: iIIEncoding, +- ASW & obj.AMask: sIEncoding, +- ASH & obj.AMask: sIEncoding, +- ASB & obj.AMask: sIEncoding, ++ ALW & obj.AMask: {enc: iIIEncoding}, ++ ALWU & obj.AMask: {enc: iIIEncoding}, ++ ALH & obj.AMask: {enc: iIIEncoding}, ++ ALHU & obj.AMask: {enc: iIIEncoding}, ++ ALB & obj.AMask: {enc: iIIEncoding}, ++ ALBU & obj.AMask: {enc: iIIEncoding}, ++ ASW & obj.AMask: {enc: sIEncoding}, ++ ASH & obj.AMask: {enc: sIEncoding}, ++ ASB & obj.AMask: {enc: sIEncoding}, + + // 2.7: Memory Ordering +- AFENCE & obj.AMask: iIIEncoding, ++ AFENCE 
& obj.AMask: {enc: iIIEncoding}, + + // 5.2: Integer Computational Instructions (RV64I) +- AADDIW & obj.AMask: iIIEncoding, +- ASLLIW & obj.AMask: iIIEncoding, +- ASRLIW & obj.AMask: iIIEncoding, +- ASRAIW & obj.AMask: iIIEncoding, +- AADDW & obj.AMask: rIIIEncoding, +- ASLLW & obj.AMask: rIIIEncoding, +- ASRLW & obj.AMask: rIIIEncoding, +- ASUBW & obj.AMask: rIIIEncoding, +- ASRAW & obj.AMask: rIIIEncoding, ++ AADDIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASLLIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASRLIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASRAIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ AADDW & obj.AMask: {enc: rIIIEncoding, immForm: AADDIW, ternary: true}, ++ ASLLW & obj.AMask: {enc: rIIIEncoding, immForm: ASLLIW, ternary: true}, ++ ASRLW & obj.AMask: {enc: rIIIEncoding, immForm: ASRLIW, ternary: true}, ++ ASUBW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASRAW & obj.AMask: {enc: rIIIEncoding, immForm: ASRAIW, ternary: true}, + + // 5.3: Load and Store Instructions (RV64I) +- ALD & obj.AMask: iIIEncoding, +- ASD & obj.AMask: sIEncoding, ++ ALD & obj.AMask: {enc: iIIEncoding}, ++ ASD & obj.AMask: {enc: sIEncoding}, + + // 7.1: CSR Instructions +- ACSRRS & obj.AMask: iIIEncoding, ++ ACSRRS & obj.AMask: {enc: iIIEncoding}, + + // 7.1: Multiplication Operations +- AMUL & obj.AMask: rIIIEncoding, +- AMULH & obj.AMask: rIIIEncoding, +- AMULHU & obj.AMask: rIIIEncoding, +- AMULHSU & obj.AMask: rIIIEncoding, +- AMULW & obj.AMask: rIIIEncoding, +- ADIV & obj.AMask: rIIIEncoding, +- ADIVU & obj.AMask: rIIIEncoding, +- AREM & obj.AMask: rIIIEncoding, +- AREMU & obj.AMask: rIIIEncoding, +- ADIVW & obj.AMask: rIIIEncoding, +- ADIVUW & obj.AMask: rIIIEncoding, +- AREMW & obj.AMask: rIIIEncoding, +- AREMUW & obj.AMask: rIIIEncoding, ++ AMUL & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMULH & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMULHU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMULHSU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMULW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ADIV & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ADIVU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AREM & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AREMU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ADIVW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ADIVUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AREMW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AREMUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + + // 8.2: Load-Reserved/Store-Conditional +- ALRW & obj.AMask: rIIIEncoding, +- ALRD & obj.AMask: rIIIEncoding, +- ASCW & obj.AMask: rIIIEncoding, +- ASCD & obj.AMask: rIIIEncoding, ++ ALRW & obj.AMask: {enc: rIIIEncoding}, ++ ALRD & obj.AMask: {enc: rIIIEncoding}, ++ ASCW & obj.AMask: {enc: rIIIEncoding}, ++ ASCD & obj.AMask: {enc: rIIIEncoding}, + + // 8.3: Atomic Memory Operations +- AAMOSWAPW & obj.AMask: rIIIEncoding, +- AAMOSWAPD & obj.AMask: rIIIEncoding, +- AAMOADDW & obj.AMask: rIIIEncoding, +- AAMOADDD & obj.AMask: rIIIEncoding, +- AAMOANDW & obj.AMask: rIIIEncoding, +- AAMOANDD & obj.AMask: rIIIEncoding, +- AAMOORW & obj.AMask: rIIIEncoding, +- AAMOORD & obj.AMask: rIIIEncoding, +- AAMOXORW & obj.AMask: rIIIEncoding, +- AAMOXORD & obj.AMask: rIIIEncoding, +- AAMOMAXW & obj.AMask: rIIIEncoding, +- AAMOMAXD & obj.AMask: rIIIEncoding, +- AAMOMAXUW & obj.AMask: rIIIEncoding, +- AAMOMAXUD & obj.AMask: rIIIEncoding, +- AAMOMINW & obj.AMask: rIIIEncoding, +- AAMOMIND 
& obj.AMask: rIIIEncoding, +- AAMOMINUW & obj.AMask: rIIIEncoding, +- AAMOMINUD & obj.AMask: rIIIEncoding, ++ AAMOSWAPW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOSWAPD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOADDW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOADDD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOANDW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOANDD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOORW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOORD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOXORW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOXORD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMAXW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMAXD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMAXUW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMAXUD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMINW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMIND & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMINUW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMINUD & obj.AMask: {enc: rIIIEncoding}, + + // 11.5: Single-Precision Load and Store Instructions +- AFLW & obj.AMask: iFEncoding, +- AFSW & obj.AMask: sFEncoding, ++ AFLW & obj.AMask: {enc: iFEncoding}, ++ AFSW & obj.AMask: {enc: sFEncoding}, + + // 11.6: Single-Precision Floating-Point Computational Instructions +- AFADDS & obj.AMask: rFFFEncoding, +- AFSUBS & obj.AMask: rFFFEncoding, +- AFMULS & obj.AMask: rFFFEncoding, +- AFDIVS & obj.AMask: rFFFEncoding, +- AFMINS & obj.AMask: rFFFEncoding, +- AFMAXS & obj.AMask: rFFFEncoding, +- AFSQRTS & obj.AMask: rFFFEncoding, +- AFMADDS & obj.AMask: rFFFFEncoding, +- AFMSUBS & obj.AMask: rFFFFEncoding, +- AFNMSUBS & obj.AMask: rFFFFEncoding, +- AFNMADDS & obj.AMask: rFFFFEncoding, ++ AFADDS & obj.AMask: {enc: rFFFEncoding}, ++ AFSUBS & obj.AMask: {enc: rFFFEncoding}, ++ AFMULS & obj.AMask: {enc: rFFFEncoding}, ++ AFDIVS & obj.AMask: {enc: rFFFEncoding}, ++ AFMINS & obj.AMask: {enc: rFFFEncoding}, ++ AFMAXS & obj.AMask: {enc: rFFFEncoding}, ++ AFSQRTS & obj.AMask: {enc: rFFFEncoding}, ++ AFMADDS & obj.AMask: {enc: rFFFFEncoding}, ++ AFMSUBS & obj.AMask: {enc: rFFFFEncoding}, ++ AFNMSUBS & obj.AMask: {enc: rFFFFEncoding}, ++ AFNMADDS & obj.AMask: {enc: rFFFFEncoding}, + + // 11.7: Single-Precision Floating-Point Conversion and Move Instructions +- AFCVTWS & obj.AMask: rFIEncoding, +- AFCVTLS & obj.AMask: rFIEncoding, +- AFCVTSW & obj.AMask: rIFEncoding, +- AFCVTSL & obj.AMask: rIFEncoding, +- AFCVTWUS & obj.AMask: rFIEncoding, +- AFCVTLUS & obj.AMask: rFIEncoding, +- AFCVTSWU & obj.AMask: rIFEncoding, +- AFCVTSLU & obj.AMask: rIFEncoding, +- AFSGNJS & obj.AMask: rFFFEncoding, +- AFSGNJNS & obj.AMask: rFFFEncoding, +- AFSGNJXS & obj.AMask: rFFFEncoding, +- AFMVXW & obj.AMask: rFIEncoding, +- AFMVWX & obj.AMask: rIFEncoding, ++ AFCVTWS & obj.AMask: {enc: rFIEncoding}, ++ AFCVTLS & obj.AMask: {enc: rFIEncoding}, ++ AFCVTSW & obj.AMask: {enc: rIFEncoding}, ++ AFCVTSL & obj.AMask: {enc: rIFEncoding}, ++ AFCVTWUS & obj.AMask: {enc: rFIEncoding}, ++ AFCVTLUS & obj.AMask: {enc: rFIEncoding}, ++ AFCVTSWU & obj.AMask: {enc: rIFEncoding}, ++ AFCVTSLU & obj.AMask: {enc: rIFEncoding}, ++ AFSGNJS & obj.AMask: {enc: rFFFEncoding}, ++ AFSGNJNS & obj.AMask: {enc: rFFFEncoding}, ++ AFSGNJXS & obj.AMask: {enc: rFFFEncoding}, ++ AFMVXW & obj.AMask: {enc: rFIEncoding}, ++ AFMVWX & obj.AMask: {enc: rIFEncoding}, + + // 11.8: Single-Precision Floating-Point Compare Instructions +- AFEQS & obj.AMask: rFFIEncoding, +- AFLTS & obj.AMask: rFFIEncoding, +- AFLES & obj.AMask: rFFIEncoding, ++ AFEQS & obj.AMask: {enc: rFFIEncoding}, ++ AFLTS & obj.AMask: {enc: rFFIEncoding}, ++ AFLES & obj.AMask: 
{enc: rFFIEncoding}, + + // 11.9: Single-Precision Floating-Point Classify Instruction +- AFCLASSS & obj.AMask: rFIEncoding, ++ AFCLASSS & obj.AMask: {enc: rFIEncoding}, + + // 12.3: Double-Precision Load and Store Instructions +- AFLD & obj.AMask: iFEncoding, +- AFSD & obj.AMask: sFEncoding, ++ AFLD & obj.AMask: {enc: iFEncoding}, ++ AFSD & obj.AMask: {enc: sFEncoding}, + + // 12.4: Double-Precision Floating-Point Computational Instructions +- AFADDD & obj.AMask: rFFFEncoding, +- AFSUBD & obj.AMask: rFFFEncoding, +- AFMULD & obj.AMask: rFFFEncoding, +- AFDIVD & obj.AMask: rFFFEncoding, +- AFMIND & obj.AMask: rFFFEncoding, +- AFMAXD & obj.AMask: rFFFEncoding, +- AFSQRTD & obj.AMask: rFFFEncoding, +- AFMADDD & obj.AMask: rFFFFEncoding, +- AFMSUBD & obj.AMask: rFFFFEncoding, +- AFNMSUBD & obj.AMask: rFFFFEncoding, +- AFNMADDD & obj.AMask: rFFFFEncoding, ++ AFADDD & obj.AMask: {enc: rFFFEncoding}, ++ AFSUBD & obj.AMask: {enc: rFFFEncoding}, ++ AFMULD & obj.AMask: {enc: rFFFEncoding}, ++ AFDIVD & obj.AMask: {enc: rFFFEncoding}, ++ AFMIND & obj.AMask: {enc: rFFFEncoding}, ++ AFMAXD & obj.AMask: {enc: rFFFEncoding}, ++ AFSQRTD & obj.AMask: {enc: rFFFEncoding}, ++ AFMADDD & obj.AMask: {enc: rFFFFEncoding}, ++ AFMSUBD & obj.AMask: {enc: rFFFFEncoding}, ++ AFNMSUBD & obj.AMask: {enc: rFFFFEncoding}, ++ AFNMADDD & obj.AMask: {enc: rFFFFEncoding}, + + // 12.5: Double-Precision Floating-Point Conversion and Move Instructions +- AFCVTWD & obj.AMask: rFIEncoding, +- AFCVTLD & obj.AMask: rFIEncoding, +- AFCVTDW & obj.AMask: rIFEncoding, +- AFCVTDL & obj.AMask: rIFEncoding, +- AFCVTWUD & obj.AMask: rFIEncoding, +- AFCVTLUD & obj.AMask: rFIEncoding, +- AFCVTDWU & obj.AMask: rIFEncoding, +- AFCVTDLU & obj.AMask: rIFEncoding, +- AFCVTSD & obj.AMask: rFFEncoding, +- AFCVTDS & obj.AMask: rFFEncoding, +- AFSGNJD & obj.AMask: rFFFEncoding, +- AFSGNJND & obj.AMask: rFFFEncoding, +- AFSGNJXD & obj.AMask: rFFFEncoding, +- AFMVXD & obj.AMask: rFIEncoding, +- AFMVDX & obj.AMask: rIFEncoding, ++ AFCVTWD & obj.AMask: {enc: rFIEncoding}, ++ AFCVTLD & obj.AMask: {enc: rFIEncoding}, ++ AFCVTDW & obj.AMask: {enc: rIFEncoding}, ++ AFCVTDL & obj.AMask: {enc: rIFEncoding}, ++ AFCVTWUD & obj.AMask: {enc: rFIEncoding}, ++ AFCVTLUD & obj.AMask: {enc: rFIEncoding}, ++ AFCVTDWU & obj.AMask: {enc: rIFEncoding}, ++ AFCVTDLU & obj.AMask: {enc: rIFEncoding}, ++ AFCVTSD & obj.AMask: {enc: rFFEncoding}, ++ AFCVTDS & obj.AMask: {enc: rFFEncoding}, ++ AFSGNJD & obj.AMask: {enc: rFFFEncoding}, ++ AFSGNJND & obj.AMask: {enc: rFFFEncoding}, ++ AFSGNJXD & obj.AMask: {enc: rFFFEncoding}, ++ AFMVXD & obj.AMask: {enc: rFIEncoding}, ++ AFMVDX & obj.AMask: {enc: rIFEncoding}, + + // 12.6: Double-Precision Floating-Point Compare Instructions +- AFEQD & obj.AMask: rFFIEncoding, +- AFLTD & obj.AMask: rFFIEncoding, +- AFLED & obj.AMask: rFFIEncoding, ++ AFEQD & obj.AMask: {enc: rFFIEncoding}, ++ AFLTD & obj.AMask: {enc: rFFIEncoding}, ++ AFLED & obj.AMask: {enc: rFFIEncoding}, + + // 12.7: Double-Precision Floating-Point Classify Instruction +- AFCLASSD & obj.AMask: rFIEncoding, ++ AFCLASSD & obj.AMask: {enc: rFIEncoding}, + + // Privileged ISA + + // 3.2.1: Environment Call and Breakpoint +- AECALL & obj.AMask: iIIEncoding, +- AEBREAK & obj.AMask: iIIEncoding, ++ AECALL & obj.AMask: {enc: iIIEncoding}, ++ AEBREAK & obj.AMask: {enc: iIIEncoding}, + + // + // RISC-V Bit-Manipulation ISA-extensions (1.0) + // + + // 1.1: Address Generation Instructions (Zba) +- AADDUW & obj.AMask: rIIIEncoding, +- ASH1ADD & obj.AMask: rIIIEncoding, +- ASH1ADDUW & 
obj.AMask: rIIIEncoding, +- ASH2ADD & obj.AMask: rIIIEncoding, +- ASH2ADDUW & obj.AMask: rIIIEncoding, +- ASH3ADD & obj.AMask: rIIIEncoding, +- ASH3ADDUW & obj.AMask: rIIIEncoding, +- ASLLIUW & obj.AMask: iIIEncoding, ++ AADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH1ADD & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH1ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH2ADD & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH2ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH3ADD & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH3ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASLLIUW & obj.AMask: {enc: iIIEncoding, ternary: true}, + + // 1.2: Basic Bit Manipulation (Zbb) +- AANDN & obj.AMask: rIIIEncoding, +- ACLZ & obj.AMask: rIIEncoding, +- ACLZW & obj.AMask: rIIEncoding, +- ACPOP & obj.AMask: rIIEncoding, +- ACPOPW & obj.AMask: rIIEncoding, +- ACTZ & obj.AMask: rIIEncoding, +- ACTZW & obj.AMask: rIIEncoding, +- AMAX & obj.AMask: rIIIEncoding, +- AMAXU & obj.AMask: rIIIEncoding, +- AMIN & obj.AMask: rIIIEncoding, +- AMINU & obj.AMask: rIIIEncoding, +- AORN & obj.AMask: rIIIEncoding, +- ASEXTB & obj.AMask: rIIEncoding, +- ASEXTH & obj.AMask: rIIEncoding, +- AXNOR & obj.AMask: rIIIEncoding, +- AZEXTH & obj.AMask: rIIEncoding, ++ AANDN & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ACLZ & obj.AMask: {enc: rIIEncoding}, ++ ACLZW & obj.AMask: {enc: rIIEncoding}, ++ ACPOP & obj.AMask: {enc: rIIEncoding}, ++ ACPOPW & obj.AMask: {enc: rIIEncoding}, ++ ACTZ & obj.AMask: {enc: rIIEncoding}, ++ ACTZW & obj.AMask: {enc: rIIEncoding}, ++ AMAX & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMAXU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMIN & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMINU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AORN & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASEXTB & obj.AMask: {enc: rIIEncoding}, ++ ASEXTH & obj.AMask: {enc: rIIEncoding}, ++ AXNOR & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AZEXTH & obj.AMask: {enc: rIIEncoding}, + + // 1.3: Bitwise Rotation (Zbb) +- AROL & obj.AMask: rIIIEncoding, +- AROLW & obj.AMask: rIIIEncoding, +- AROR & obj.AMask: rIIIEncoding, +- ARORI & obj.AMask: iIIEncoding, +- ARORIW & obj.AMask: iIIEncoding, +- ARORW & obj.AMask: rIIIEncoding, +- AORCB & obj.AMask: iIIEncoding, +- AREV8 & obj.AMask: iIIEncoding, ++ AROL & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AROLW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AROR & obj.AMask: {enc: rIIIEncoding, immForm: ARORI, ternary: true}, ++ ARORI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ARORIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ARORW & obj.AMask: {enc: rIIIEncoding, immForm: ARORIW, ternary: true}, ++ AORCB & obj.AMask: {enc: iIIEncoding}, ++ AREV8 & obj.AMask: {enc: iIIEncoding}, + + // 1.5: Single-bit Instructions (Zbs) +- ABCLR & obj.AMask: rIIIEncoding, +- ABCLRI & obj.AMask: iIIEncoding, +- ABEXT & obj.AMask: rIIIEncoding, +- ABEXTI & obj.AMask: iIIEncoding, +- ABINV & obj.AMask: rIIIEncoding, +- ABINVI & obj.AMask: iIIEncoding, +- ABSET & obj.AMask: rIIIEncoding, +- ABSETI & obj.AMask: iIIEncoding, ++ ABCLR & obj.AMask: {enc: rIIIEncoding, immForm: ABCLRI, ternary: true}, ++ ABCLRI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ABEXT & obj.AMask: {enc: rIIIEncoding, immForm: ABEXTI, ternary: true}, ++ ABEXTI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ABINV & obj.AMask: {enc: rIIIEncoding, immForm: ABINVI, ternary: true}, ++ ABINVI & 
obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ABSET & obj.AMask: {enc: rIIIEncoding, immForm: ABSETI, ternary: true}, ++ ABSETI & obj.AMask: {enc: iIIEncoding, ternary: true}, + + // Escape hatch +- AWORD & obj.AMask: rawEncoding, ++ AWORD & obj.AMask: {enc: rawEncoding}, + + // Pseudo-operations +- obj.AFUNCDATA: pseudoOpEncoding, +- obj.APCDATA: pseudoOpEncoding, +- obj.ATEXT: pseudoOpEncoding, +- obj.ANOP: pseudoOpEncoding, +- obj.ADUFFZERO: pseudoOpEncoding, +- obj.ADUFFCOPY: pseudoOpEncoding, +- obj.APCALIGN: pseudoOpEncoding, ++ obj.AFUNCDATA: {enc: pseudoOpEncoding}, ++ obj.APCDATA: {enc: pseudoOpEncoding}, ++ obj.ATEXT: {enc: pseudoOpEncoding}, ++ obj.ANOP: {enc: pseudoOpEncoding}, ++ obj.ADUFFZERO: {enc: pseudoOpEncoding}, ++ obj.ADUFFCOPY: {enc: pseudoOpEncoding}, ++ obj.APCALIGN: {enc: pseudoOpEncoding}, + } + +-// encodingForAs returns the encoding for an obj.As. +-func encodingForAs(as obj.As) (encoding, error) { ++// instructionDataForAs returns the instruction data for an obj.As. ++func instructionDataForAs(as obj.As) (*instructionData, error) { + if base := as &^ obj.AMask; base != obj.ABaseRISCV && base != 0 { +- return badEncoding, fmt.Errorf("encodingForAs: not a RISC-V instruction %s", as) ++ return nil, fmt.Errorf("%v is not a RISC-V instruction", as) + } + asi := as & obj.AMask +- if int(asi) >= len(encodings) { +- return badEncoding, fmt.Errorf("encodingForAs: bad RISC-V instruction %s", as) ++ if int(asi) >= len(instructions) { ++ return nil, fmt.Errorf("bad RISC-V instruction %v", as) ++ } ++ return &instructions[asi], nil ++} ++ ++// encodingForAs returns the encoding for an obj.As. ++func encodingForAs(as obj.As) (*encoding, error) { ++ insData, err := instructionDataForAs(as) ++ if err != nil { ++ return &badEncoding, err + } +- enc := encodings[asi] +- if enc.validate == nil { +- return badEncoding, fmt.Errorf("encodingForAs: no encoding for instruction %s", as) ++ if insData.enc.validate == nil { ++ return &badEncoding, fmt.Errorf("no encoding for instruction %s", as) + } +- return enc, nil ++ return &insData.enc, nil + } + + type instruction struct { +-- +2.39.5 + diff --git a/2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch b/2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch new file mode 100644 index 0000000..e144b15 --- /dev/null +++ b/2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch @@ -0,0 +1,255 @@ +From 0f385c824d3218473ca71a98c71050671078f6ed Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 064/119] cpu/internal: provide runtime detection of RISC-V + extensions on Linux + +Add a RISCV64 variable to cpu/internal that indicates both the presence +of RISC-V extensions and performance information about the underlying +RISC-V cores. The variable is only populated with non false values on +Linux. The detection code relies on the riscv_hwprobe syscall +introduced in Linux 6.4. The patch can detect RVV 1.0 and whether +the CPU supports fast misaligned accesses. It can only detect RVV 1.0 +on a 6.5 kernel or later (without backports). 
+ +Updates #61416 + +Change-Id: I2d8289345c885b699afff441d417cae38f6bdc54 +Reviewed-on: https://go-review.googlesource.com/c/go/+/522995 +Reviewed-by: Joel Sing +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Michael Knyszek +Reviewed-by: David Chase +--- + src/go/build/deps_test.go | 4 +- + src/internal/cpu/cpu.go | 11 ++++ + src/internal/cpu/cpu_riscv64.go | 11 ++++ + src/internal/cpu/cpu_riscv64_linux.go | 91 +++++++++++++++++++++++++++ + src/internal/cpu/cpu_riscv64_other.go | 11 ++++ + src/runtime/os_linux_riscv64.go | 30 +++++++++ + 6 files changed, 156 insertions(+), 2 deletions(-) + create mode 100644 src/internal/cpu/cpu_riscv64_linux.go + create mode 100644 src/internal/cpu/cpu_riscv64_other.go + +diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go +index 592f2fd72a..babce57e42 100644 +--- a/src/go/build/deps_test.go ++++ b/src/go/build/deps_test.go +@@ -39,6 +39,7 @@ import ( + var depsRules = ` + # No dependencies allowed for any of these packages. + NONE ++ < unsafe + < cmp, container/list, container/ring, + internal/cfg, internal/coverage, internal/coverage/rtcov, + internal/coverage/uleb128, internal/coverage/calloc, +@@ -46,8 +47,7 @@ var depsRules = ` + internal/goexperiment, internal/goos, + internal/goversion, internal/nettrace, internal/platform, + log/internal, +- unicode/utf8, unicode/utf16, unicode, +- unsafe; ++ unicode/utf8, unicode/utf16, unicode; + + # These packages depend only on internal/goarch and unsafe. + internal/goarch, unsafe +diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go +index 1352810f42..ddad7198a8 100644 +--- a/src/internal/cpu/cpu.go ++++ b/src/internal/cpu/cpu.go +@@ -117,6 +117,17 @@ var S390X struct { + _ CacheLinePad + } + ++// RISCV64 contains the supported CPU features and performance characteristics for riscv64 ++// platforms. The booleans in RISCV64, with the exception of HasFastMisaligned, indicate ++// the presence of RISC-V extensions. ++// The struct is padded to avoid false sharing. ++var RISCV64 struct { ++ _ CacheLinePad ++ HasFastMisaligned bool // Fast misaligned accesses ++ HasV bool // Vector extension compatible with RVV 1.0 ++ _ CacheLinePad ++} ++ + // Initialize examines the processor and sets the relevant variables above. + // This is called by the runtime package early in program initialization, + // before normal init functions are run. env is set by runtime if the OS supports +diff --git a/src/internal/cpu/cpu_riscv64.go b/src/internal/cpu/cpu_riscv64.go +index 2173fe8886..e6e532c7e7 100644 +--- a/src/internal/cpu/cpu_riscv64.go ++++ b/src/internal/cpu/cpu_riscv64.go +@@ -6,5 +6,16 @@ package cpu + + const CacheLinePadSize = 64 + ++// RISC-V doesn't have a 'cpuid' equivalent. On Linux we rely on the riscv_hwprobe syscall. ++ + func doinit() { ++ options = []option{ ++ {Name: "fastmisaligned", Feature: &RISCV64.HasFastMisaligned}, ++ {Name: "v", Feature: &RISCV64.HasV}, ++ } ++ osInit() ++} ++ ++func isSet(hwc uint, value uint) bool { ++ return hwc&value != 0 + } +diff --git a/src/internal/cpu/cpu_riscv64_linux.go b/src/internal/cpu/cpu_riscv64_linux.go +new file mode 100644 +index 0000000000..a076d3e33c +--- /dev/null ++++ b/src/internal/cpu/cpu_riscv64_linux.go +@@ -0,0 +1,91 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build riscv64 && linux ++ ++package cpu ++ ++import _ "unsafe" ++ ++// RISC-V extension discovery code for Linux. 
++// ++// A note on detection of the Vector extension using HWCAP. ++// ++// Support for the Vector extension version 1.0 was added to the Linux kernel in release 6.5. ++// Support for the riscv_hwprobe syscall was added in 6.4. It follows that if the riscv_hwprobe ++// syscall is not available then neither is the Vector extension (which needs kernel support). ++// The riscv_hwprobe syscall should then be all we need to detect the Vector extension. ++// However, some RISC-V board manufacturers ship boards with an older kernel on top of which ++// they have back-ported various versions of the Vector extension patches but not the riscv_hwprobe ++// patches. These kernels advertise support for the Vector extension using HWCAP. Falling ++// back to HWCAP to detect the Vector extension, if riscv_hwprobe is not available, or simply not ++// bothering with riscv_hwprobe at all and just using HWCAP may then seem like an attractive option. ++// ++// Unfortunately, simply checking the 'V' bit in AT_HWCAP will not work as this bit is used by ++// RISC-V board and cloud instance providers to mean different things. The Lichee Pi 4A board ++// and the Scaleway RV1 cloud instances use the 'V' bit to advertise their support for the unratified ++// 0.7.1 version of the Vector Specification. The Banana Pi BPI-F3 and the CanMV-K230 board use ++// it to advertise support for 1.0 of the Vector extension. Versions 0.7.1 and 1.0 of the Vector ++// extension are binary incompatible. HWCAP can then not be used in isolation to populate the ++// HasV field as this field indicates that the underlying CPU is compatible with RVV 1.0. ++// Go will only support the ratified versions >= 1.0 and so any vector code it might generate ++// would crash on a Scaleway RV1 instance or a Lichee Pi 4a, if allowed to run. ++// ++// There is a way at runtime to distinguish between versions 0.7.1 and 1.0 of the Vector ++// specification by issuing a RVV 1.0 vsetvli instruction and checking the vill bit of the vtype ++// register. This check would allow us to safely detect version 1.0 of the Vector extension ++// with HWCAP, if riscv_hwprobe were not available. However, the check cannot ++// be added until the assembler supports the Vector instructions. ++// ++// Note the riscv_hwprobe syscall does not suffer from these ambiguities by design as all of the ++// extensions it advertises support for are explicitly versioned. It's also worth noting that ++// the riscv_hwprobe syscall is the only way to detect multi-letter RISC-V extensions, e.g., Zvbb. ++// These cannot be detected using HWCAP and so riscv_hwprobe must be used to detect the majority ++// of RISC-V extensions. ++// ++// Please see https://docs.kernel.org/arch/riscv/hwprobe.html for more information. ++ ++const ( ++ // Copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go. ++ riscv_HWPROBE_KEY_IMA_EXT_0 = 0x4 ++ riscv_HWPROBE_IMA_V = 0x4 ++ riscv_HWPROBE_KEY_CPUPERF_0 = 0x5 ++ riscv_HWPROBE_MISALIGNED_FAST = 0x3 ++ riscv_HWPROBE_MISALIGNED_MASK = 0x7 ++) ++ ++// riscvHWProbePairs is copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go. ++type riscvHWProbePairs struct { ++ key int64 ++ value uint64 ++} ++ ++//go:linkname riscvHWProbe ++func riscvHWProbe(pairs []riscvHWProbePairs, flags uint) bool ++ ++func osInit() { ++ // A slice of key/value pair structures is passed to the RISCVHWProbe syscall. The key ++ // field should be initialised with one of the key constants defined above, e.g., ++ // RISCV_HWPROBE_KEY_IMA_EXT_0. 
The syscall will set the value field to the appropriate value. ++ // If the kernel does not recognise a key it will set the key field to -1 and the value field to 0. ++ ++ pairs := []riscvHWProbePairs{ ++ {riscv_HWPROBE_KEY_IMA_EXT_0, 0}, ++ {riscv_HWPROBE_KEY_CPUPERF_0, 0}, ++ } ++ ++ // This call only indicates that extensions are supported if they are implemented on all cores. ++ if !riscvHWProbe(pairs, 0) { ++ return ++ } ++ ++ if pairs[0].key != -1 { ++ v := uint(pairs[0].value) ++ RISCV64.HasV = isSet(v, riscv_HWPROBE_IMA_V) ++ } ++ if pairs[1].key != -1 { ++ v := pairs[1].value & riscv_HWPROBE_MISALIGNED_MASK ++ RISCV64.HasFastMisaligned = v == riscv_HWPROBE_MISALIGNED_FAST ++ } ++} +diff --git a/src/internal/cpu/cpu_riscv64_other.go b/src/internal/cpu/cpu_riscv64_other.go +new file mode 100644 +index 0000000000..1307d822b3 +--- /dev/null ++++ b/src/internal/cpu/cpu_riscv64_other.go +@@ -0,0 +1,11 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build riscv64 && !linux ++ ++package cpu ++ ++func osInit() { ++ // Other operating systems do not support the riscv_hwprobe syscall. ++} +diff --git a/src/runtime/os_linux_riscv64.go b/src/runtime/os_linux_riscv64.go +index 9be88a5ad2..bd275707eb 100644 +--- a/src/runtime/os_linux_riscv64.go ++++ b/src/runtime/os_linux_riscv64.go +@@ -4,4 +4,34 @@ + + package runtime + ++import ( ++ "runtime/internal/syscall" ++ "unsafe" ++) ++ + func osArchInit() {} ++ ++type riscvHWProbePairs = struct { ++ key int64 ++ value uint64 ++} ++ ++// TODO: Consider whether to use the VDSO entry for riscv_hwprobe. ++// There is a VDSO entry for riscv_hwprobe that should allow us to avoid the syscall ++// entirely as it can handle the case where the caller only requests extensions that are ++// supported on all cores, which is what we're doing here. However, as we're only calling ++// this syscall once, it may not be worth the added effort to implement the VDSO call. ++ ++//go:linkname internal_cpu_riscvHWProbe internal/cpu.riscvHWProbe ++func internal_cpu_riscvHWProbe(pairs []riscvHWProbePairs, flags uint) bool { ++ // sys_RISCV_HWPROBE is copied from golang.org/x/sys/unix/zsysnum_linux_riscv64.go. ++ const sys_RISCV_HWPROBE uintptr = 258 ++ ++ if len(pairs) == 0 { ++ return false ++ } ++ // Passing in a cpuCount of 0 and a cpu of nil ensures that only extensions supported by all the ++ // cores are returned, which is the behaviour we want in internal/cpu. ++ _, _, e1 := syscall.Syscall6(sys_RISCV_HWPROBE, uintptr(unsafe.Pointer(&pairs[0])), uintptr(len(pairs)), uintptr(0), uintptr(unsafe.Pointer(nil)), uintptr(flags), 0) ++ return e1 == 0 ++} +-- +2.39.5 + diff --git a/2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch b/2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch new file mode 100644 index 0000000..7edb409 --- /dev/null +++ b/2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch @@ -0,0 +1,190 @@ +From 344295f05b52aa3fa812ad039625033b76cc3fcd Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 065/119] cmd/go: add rva23u64 as a valid value for GORISCV64 + +The RVA23 profile was ratified on the 21st of October 2024. + +https://riscv.org/announcements/2024/10/risc-v-announces-ratification-of-the-rva23-profile-standard/ + +Now that it's ratified we can add rva23u64 as a valid value for the +GORISCV64 environment variable. 
This will allow the compiler and +assembler to generate instructions made mandatory by the new profile +without a runtime check. Examples of such instructions include those +introduced by the Vector and Zicond extensions. + +Setting GORISCV64=rva23u64 defines the riscv64.rva20u64, +riscv64.rva22u64 and riscv64.rva23u64 build tags, sets the internal +variable buildcfg.GORISCV64 to 23 and defines the macros +GORISCV64_rva23u64, hasV, hasZba, hasZbb, hasZbs, hasZfa, and +hasZicond for use in assembly language code. + +Updates #61476 + +Change-Id: I7641c23084fa52891c9a18df58f4013cb6597d88 +Reviewed-on: https://go-review.googlesource.com/c/go/+/633417 +Reviewed-by: Carlos Amedee +Reviewed-by: Jorropo +Reviewed-by: Joel Sing +Reviewed-by: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +--- + src/cmd/go/alldocs.go | 9 +++++---- + src/cmd/go/internal/help/helpdoc.go | 9 +++++---- + src/cmd/go/testdata/script/tooltags.txt | 7 ++++++- + src/cmd/internal/testdir/testdir_test.go | 2 +- + src/internal/buildcfg/cfg.go | 7 ++++++- + src/internal/buildcfg/cfg_test.go | 4 ++++ + src/runtime/asm_riscv64.h | 9 +++++++++ + 7 files changed, 36 insertions(+), 11 deletions(-) + +diff --git a/src/cmd/go/alldocs.go b/src/cmd/go/alldocs.go +index db737b062e..32e2ba15e9 100644 +--- a/src/cmd/go/alldocs.go ++++ b/src/cmd/go/alldocs.go +@@ -1979,8 +1979,8 @@ + // (or ppc64le.power8, ppc64le.power9, and ppc64le.power10) + // feature build tags. + // - For GOARCH=riscv64, +-// GORISCV64=rva20u64 and rva22u64 correspond to the riscv64.rva20u64 +-// and riscv64.rva22u64 build tags. ++// GORISCV64=rva20u64, rva22u64 and rva23u64 correspond to the riscv64.rva20u64, ++// riscv64.rva22u64 and riscv64.rva23u64 build tags. + // - For GOARCH=wasm, GOWASM=satconv and signext + // correspond to the wasm.satconv and wasm.signext feature build tags. + // +@@ -2280,8 +2280,9 @@ + // Valid values are power8 (default), power9, power10. + // GORISCV64 + // For GOARCH=riscv64, the RISC-V user-mode application profile for which +-// to compile. Valid values are rva20u64 (default), rva22u64. +-// See https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc ++// to compile. Valid values are rva20u64 (default), rva22u64, rva23u64. ++// See https://github.com/riscv/riscv-profiles/blob/main/src/profiles.adoc ++// and https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc + // GOWASM + // For GOARCH=wasm, comma-separated list of experimental WebAssembly features to use. + // Valid values are satconv, signext. +diff --git a/src/cmd/go/internal/help/helpdoc.go b/src/cmd/go/internal/help/helpdoc.go +index 55701bac46..12b667e9be 100644 +--- a/src/cmd/go/internal/help/helpdoc.go ++++ b/src/cmd/go/internal/help/helpdoc.go +@@ -619,8 +619,9 @@ Architecture-specific environment variables: + Valid values are power8 (default), power9, power10. + GORISCV64 + For GOARCH=riscv64, the RISC-V user-mode application profile for which +- to compile. Valid values are rva20u64 (default), rva22u64. +- See https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc ++ to compile. Valid values are rva20u64 (default), rva22u64, rva23u64. ++ See https://github.com/riscv/riscv-profiles/blob/main/src/profiles.adoc ++ and https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc + GOWASM + For GOARCH=wasm, comma-separated list of experimental WebAssembly features to use. + Valid values are satconv, signext. 
+@@ -910,8 +911,8 @@ The defined architecture feature build tags are: + (or ppc64le.power8, ppc64le.power9, and ppc64le.power10) + feature build tags. + - For GOARCH=riscv64, +- GORISCV64=rva20u64 and rva22u64 correspond to the riscv64.rva20u64 +- and riscv64.rva22u64 build tags. ++ GORISCV64=rva20u64, rva22u64 and rva23u64 correspond to the riscv64.rva20u64, ++ riscv64.rva22u64 and riscv64.rva23u64 build tags. + - For GOARCH=wasm, GOWASM=satconv and signext + correspond to the wasm.satconv and wasm.signext feature build tags. + +diff --git a/src/cmd/go/testdata/script/tooltags.txt b/src/cmd/go/testdata/script/tooltags.txt +index 1f6f54563c..a69b7a5c37 100644 +--- a/src/cmd/go/testdata/script/tooltags.txt ++++ b/src/cmd/go/testdata/script/tooltags.txt +@@ -50,10 +50,15 @@ env GORISCV64=rva22u64 + go list -f '{{context.ToolTags}}' + stdout 'riscv64.rva20u64 riscv64.rva22u64' + ++env GOARCH=riscv64 ++env GORISCV64=rva23u64 ++go list -f '{{context.ToolTags}}' ++stdout 'riscv64.rva20u64 riscv64.rva22u64 riscv64.rva23u64' ++ + env GOARCH=riscv64 + env GORISCV64=rva22 + ! go list -f '{{context.ToolTags}}' +-stderr 'go: invalid GORISCV64: must be rva20u64, rva22u64' ++stderr 'go: invalid GORISCV64: must be rva20u64, rva22u64, rva23u64' + + env GOARCH=riscv64 + env GORISCV64= +diff --git a/src/cmd/internal/testdir/testdir_test.go b/src/cmd/internal/testdir/testdir_test.go +index 1677191d96..90d967f47d 100644 +--- a/src/cmd/internal/testdir/testdir_test.go ++++ b/src/cmd/internal/testdir/testdir_test.go +@@ -1464,7 +1464,7 @@ var ( + "ppc64x": {}, // A pseudo-arch representing both ppc64 and ppc64le + "s390x": {}, + "wasm": {}, +- "riscv64": {"GORISCV64", "rva20u64", "rva22u64"}, ++ "riscv64": {"GORISCV64", "rva20u64", "rva22u64", "rva23u64"}, + } + ) + +diff --git a/src/internal/buildcfg/cfg.go b/src/internal/buildcfg/cfg.go +index 599e782c7a..f6fb2d232f 100644 +--- a/src/internal/buildcfg/cfg.go ++++ b/src/internal/buildcfg/cfg.go +@@ -220,8 +220,10 @@ func goriscv64() int { + return 20 + case "rva22u64": + return 22 ++ case "rva23u64": ++ return 23 + } +- Error = fmt.Errorf("invalid GORISCV64: must be rva20u64, rva22u64") ++ Error = fmt.Errorf("invalid GORISCV64: must be rva20u64, rva22u64, rva23u64") + v := defaultGORISCV64[len("rva"):] + i := strings.IndexFunc(v, func(r rune) bool { + return r < '0' || r > '9' +@@ -353,6 +355,9 @@ func gogoarchTags() []string { + if GORISCV64 >= 22 { + list = append(list, GOARCH+"."+"rva22u64") + } ++ if GORISCV64 >= 23 { ++ list = append(list, GOARCH+"."+"rva23u64") ++ } + return list + case "wasm": + var list []string +diff --git a/src/internal/buildcfg/cfg_test.go b/src/internal/buildcfg/cfg_test.go +index 69eeef2422..1513cdc9b0 100644 +--- a/src/internal/buildcfg/cfg_test.go ++++ b/src/internal/buildcfg/cfg_test.go +@@ -32,6 +32,10 @@ func TestConfigFlags(t *testing.T) { + if goriscv64() != 22 { + t.Errorf("Wrong parsing of RISCV64=rva22u64") + } ++ os.Setenv("GORISCV64", "rva23u64") ++ if goriscv64() != 23 { ++ t.Errorf("Wrong parsing of RISCV64=rva23u64") ++ } + Error = nil + os.Setenv("GORISCV64", "rva22") + if _ = goriscv64(); Error == nil { +diff --git a/src/runtime/asm_riscv64.h b/src/runtime/asm_riscv64.h +index d4deb093a6..2414b9f067 100644 +--- a/src/runtime/asm_riscv64.h ++++ b/src/runtime/asm_riscv64.h +@@ -10,3 +10,12 @@ + #define hasZbb + #define hasZbs + #endif ++ ++#ifdef GORISCV64_rva23u64 ++#define hasV ++#define hasZba ++#define hasZbb ++#define hasZbs ++#define hasZfa ++#define hasZicond ++#endif +-- +2.39.5 + diff --git 
a/2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch b/2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch new file mode 100644 index 0000000..32935e5 --- /dev/null +++ b/2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch @@ -0,0 +1,671 @@ +From 39ff3dc09208547d679805802d845e64792e686c Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 066/119] cmd/internal/obj/riscv: update references to RISC-V + specification + +Update references to version 20240411 of the RISC-V specifications. +Reorder and regroup instructions to maintain ordering. Also be +consistent with formatting. + +The instruction encodings table was seemingly missed in CL 616115. + +Change-Id: I47b7c8538383ff3b0503ba59db570c3d4f0d5653 +Reviewed-on: https://go-review.googlesource.com/c/go/+/631935 +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Ian Lance Taylor +Reviewed-by: Meng Zhuo +Reviewed-by: Pengcheng Wang +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 4 + + src/cmd/internal/obj/riscv/cpu.go | 118 ++++++++++---------- + src/cmd/internal/obj/riscv/obj.go | 54 ++++----- + 3 files changed, 94 insertions(+), 82 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 517930aa60..ad468574a9 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -363,6 +363,10 @@ start: + SLLIUW $63, X17, X18 // 1b99f80b + SLLIUW $1, X18, X19 // 9b191908 + ++ // ++ // "B" Extension for Bit Manipulation, Version 1.0.0 ++ // ++ + // 28.4.2: Basic Bit Manipulation (Zbb) + ANDN X19, X20, X21 // b37a3a41 or 93caf9ffb37a5a01 + ANDN X19, X20 // 337a3a41 or 93cff9ff337afa01 +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index a36b95e6d2..29f7e913ed 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -566,6 +566,10 @@ const ( + // 22.5 Quad-Precision Floating-Point Classify Instruction + AFCLASSQ + ++ // ++ // "B" Extension for Bit Manipulation, Version 1.0.0 ++ // ++ + // 28.4.1: Address Generation Instructions (Zba) + AADDUW + ASH1ADD +@@ -615,15 +619,15 @@ const ( + ABSETI + + // +- // RISC-V Vector ISA-extension (1.0) (Unprivileged 20240411) ++ // "V" Standard Extension for Vector Operations, Version 1.0 + // + +- // 31.6. Configuration-Setting Instructions ++ // 31.6: Configuration-Setting Instructions + AVSETVLI + AVSETIVLI + AVSETVL + +- // 31.7.4. Vector Unit-Stride Instructions ++ // 31.7.4: Vector Unit-Stride Instructions + AVLE8V + AVLE16V + AVLE32V +@@ -635,7 +639,7 @@ const ( + AVLMV + AVSMV + +- // 31.7.5. Vector Strided Instructions ++ // 31.7.5: Vector Strided Instructions + AVLSE8V + AVLSE16V + AVLSE32V +@@ -645,7 +649,7 @@ const ( + AVSSE32V + AVSSE64V + +- // 31.7.6. Vector Indexed Instructions ++ // 31.7.6: Vector Indexed Instructions + AVLUXEI8V + AVLUXEI16V + AVLUXEI32V +@@ -663,13 +667,13 @@ const ( + AVSOXEI32V + AVSOXEI64V + +- // 31.7.7. Unit-stride Fault-Only-First Loads ++ // 31.7.7: Unit-stride Fault-Only-First Loads + AVLE8FFV + AVLE16FFV + AVLE32FFV + AVLE64FFV + +- // 31.7.9. Vector Load/Store Whole Register Instructions ++ // 31.7.9: Vector Load/Store Whole Register Instructions + AVL1RE8V + AVL1RE16V + AVL1RE32V +@@ -691,7 +695,7 @@ const ( + AVS4RV + AVS8RV + +- // 31.11.1. 
Vector Single-Width Integer Add and Subtract ++ // 31.11.1: Vector Single-Width Integer Add and Subtract + AVADDVV + AVADDVX + AVADDVI +@@ -700,7 +704,7 @@ const ( + AVRSUBVX + AVRSUBVI + +- // 31.11.2. Vector Widening Integer Add/Subtract ++ // 31.11.2: Vector Widening Integer Add/Subtract + AVWADDUVV + AVWADDUVX + AVWSUBUVV +@@ -718,7 +722,7 @@ const ( + AVWSUBWV + AVWSUBWX + +- // 31.11.3. Vector Integer Extension ++ // 31.11.3: Vector Integer Extension + AVZEXTVF2 + AVSEXTVF2 + AVZEXTVF4 +@@ -726,7 +730,7 @@ const ( + AVZEXTVF8 + AVSEXTVF8 + +- // 31.11.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions ++ // 31.11.4: Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions + AVADCVVM + AVADCVXM + AVADCVIM +@@ -743,7 +747,7 @@ const ( + AVMSBCVV + AVMSBCVX + +- // 31.11.5. Vector Bitwise Logical Instructions ++ // 31.11.5: Vector Bitwise Logical Instructions + AVANDVV + AVANDVX + AVANDVI +@@ -754,7 +758,7 @@ const ( + AVXORVX + AVXORVI + +- // 31.11.6. Vector Single-Width Shift Instructions ++ // 31.11.6: Vector Single-Width Shift Instructions + AVSLLVV + AVSLLVX + AVSLLVI +@@ -765,7 +769,7 @@ const ( + AVSRAVX + AVSRAVI + +- // 31.11.7. Vector Narrowing Integer Right Shift Instructions ++ // 31.11.7: Vector Narrowing Integer Right Shift Instructions + AVNSRLWV + AVNSRLWX + AVNSRLWI +@@ -773,7 +777,7 @@ const ( + AVNSRAWX + AVNSRAWI + +- // 31.11.8. Vector Integer Compare Instructions ++ // 31.11.8: Vector Integer Compare Instructions + AVMSEQVV + AVMSEQVX + AVMSEQVI +@@ -795,7 +799,7 @@ const ( + AVMSGTVX + AVMSGTVI + +- // 31.11.9. Vector Integer Min/Max Instructions ++ // 31.11.9: Vector Integer Min/Max Instructions + AVMINUVV + AVMINUVX + AVMINVV +@@ -805,7 +809,7 @@ const ( + AVMAXVV + AVMAXVX + +- // 31.11.10. Vector Single-Width Integer Multiply Instructions ++ // 31.11.10: Vector Single-Width Integer Multiply Instructions + AVMULVV + AVMULVX + AVMULHVV +@@ -815,7 +819,7 @@ const ( + AVMULHSUVV + AVMULHSUVX + +- // 31.11.11. Vector Integer Divide Instructions ++ // 31.11.11: Vector Integer Divide Instructions + AVDIVUVV + AVDIVUVX + AVDIVVV +@@ -825,7 +829,7 @@ const ( + AVREMVV + AVREMVX + +- // 31.11.12. Vector Widening Integer Multiply Instructions ++ // 31.11.12: Vector Widening Integer Multiply Instructions + AVWMULVV + AVWMULVX + AVWMULUVV +@@ -833,7 +837,7 @@ const ( + AVWMULSUVV + AVWMULSUVX + +- // 31.11.13. Vector Single-Width Integer Multiply-Add Instructions ++ // 31.11.13: Vector Single-Width Integer Multiply-Add Instructions + AVMACCVV + AVMACCVX + AVNMSACVV +@@ -843,7 +847,7 @@ const ( + AVNMSUBVV + AVNMSUBVX + +- // 31.11.14. Vector Widening Integer Multiply-Add Instructions ++ // 31.11.14: Vector Widening Integer Multiply-Add Instructions + AVWMACCUVV + AVWMACCUVX + AVWMACCVV +@@ -852,17 +856,17 @@ const ( + AVWMACCSUVX + AVWMACCUSVX + +- // 31.11.15. Vector Integer Merge Instructions ++ // 31.11.15: Vector Integer Merge Instructions + AVMERGEVVM + AVMERGEVXM + AVMERGEVIM + +- // 31.11.16. Vector Integer Move Instructions ++ // 31.11.16: Vector Integer Move Instructions + AVMVVV + AVMVVX + AVMVVI + +- // 31.12.1. Vector Single-Width Saturating Add and Subtract ++ // 31.12.1: Vector Single-Width Saturating Add and Subtract + AVSADDUVV + AVSADDUVX + AVSADDUVI +@@ -874,7 +878,7 @@ const ( + AVSSUBVV + AVSSUBVX + +- // 31.12.2. Vector Single-Width Averaging Add and Subtract ++ // 31.12.2: Vector Single-Width Averaging Add and Subtract + AVAADDUVV + AVAADDUVX + AVAADDVV +@@ -884,11 +888,11 @@ const ( + AVASUBVV + AVASUBVX + +- // 31.12.3. 
Vector Single-Width Fractional Multiply with Rounding and Saturation ++ // 31.12.3: Vector Single-Width Fractional Multiply with Rounding and Saturation + AVSMULVV + AVSMULVX + +- // 31.12.4. Vector Single-Width Scaling Shift Instructions ++ // 31.12.4: Vector Single-Width Scaling Shift Instructions + AVSSRLVV + AVSSRLVX + AVSSRLVI +@@ -896,7 +900,7 @@ const ( + AVSSRAVX + AVSSRAVI + +- // 31.12.5. Vector Narrowing Fixed-Point Clip Instructions ++ // 31.12.5: Vector Narrowing Fixed-Point Clip Instructions + AVNCLIPUWV + AVNCLIPUWX + AVNCLIPUWI +@@ -904,14 +908,14 @@ const ( + AVNCLIPWX + AVNCLIPWI + +- // 31.13.2. Vector Single-Width Floating-Point Add/Subtract Instructions ++ // 31.13.2: Vector Single-Width Floating-Point Add/Subtract Instructions + AVFADDVV + AVFADDVF + AVFSUBVV + AVFSUBVF + AVFRSUBVF + +- // 31.13.3. Vector Widening Floating-Point Add/Subtract Instructions ++ // 31.13.3: Vector Widening Floating-Point Add/Subtract Instructions + AVFWADDVV + AVFWADDVF + AVFWSUBVV +@@ -921,18 +925,18 @@ const ( + AVFWSUBWV + AVFWSUBWF + +- // 31.13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions ++ // 31.13.4: Vector Single-Width Floating-Point Multiply/Divide Instructions + AVFMULVV + AVFMULVF + AVFDIVVV + AVFDIVVF + AVFRDIVVF + +- // 31.13.5. Vector Widening Floating-Point Multiply ++ // 31.13.5: Vector Widening Floating-Point Multiply + AVFWMULVV + AVFWMULVF + +- // 31.13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions ++ // 31.13.6: Vector Single-Width Floating-Point Fused Multiply-Add Instructions + AVFMACCVV + AVFMACCVF + AVFNMACCVV +@@ -950,7 +954,7 @@ const ( + AVFNMSUBVV + AVFNMSUBVF + +- // 31.13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions ++ // 31.13.7: Vector Widening Floating-Point Fused Multiply-Add Instructions + AVFWMACCVV + AVFWMACCVF + AVFWNMACCVV +@@ -960,22 +964,22 @@ const ( + AVFWNMSACVV + AVFWNMSACVF + +- // 31.13.8. Vector Floating-Point Square-Root Instruction ++ // 31.13.8: Vector Floating-Point Square-Root Instruction + AVFSQRTV + +- // 31.13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction ++ // 31.13.9: Vector Floating-Point Reciprocal Square-Root Estimate Instruction + AVFRSQRT7V + +- // 31.13.10. Vector Floating-Point Reciprocal Estimate Instruction ++ // 31.13.10: Vector Floating-Point Reciprocal Estimate Instruction + AVFREC7V + +- // 31.13.11. Vector Floating-Point MIN/MAX Instructions ++ // 31.13.11: Vector Floating-Point MIN/MAX Instructions + AVFMINVV + AVFMINVF + AVFMAXVV + AVFMAXVF + +- // 31.13.12. Vector Floating-Point Sign-Injection Instructions ++ // 31.13.12: Vector Floating-Point Sign-Injection Instructions + AVFSGNJVV + AVFSGNJVF + AVFSGNJNVV +@@ -983,7 +987,7 @@ const ( + AVFSGNJXVV + AVFSGNJXVF + +- // 31.13.13. Vector Floating-Point Compare Instructions ++ // 31.13.13: Vector Floating-Point Compare Instructions + AVMFEQVV + AVMFEQVF + AVMFNEVV +@@ -995,16 +999,16 @@ const ( + AVMFGTVF + AVMFGEVF + +- // 31.13.14. Vector Floating-Point Classify Instruction ++ // 31.13.14: Vector Floating-Point Classify Instruction + AVFCLASSV + +- // 31.13.15. Vector Floating-Point Merge Instruction ++ // 31.13.15: Vector Floating-Point Merge Instruction + AVFMERGEVFM + +- // 31.13.16. Vector Floating-Point Move Instruction ++ // 31.13.16: Vector Floating-Point Move Instruction + AVFMVVF + +- // 31.13.17. 
Single-Width Floating-Point/Integer Type-Convert Instructions ++ // 31.13.17: Single-Width Floating-Point/Integer Type-Convert Instructions + AVFCVTXUFV + AVFCVTXFV + AVFCVTRTZXUFV +@@ -1012,7 +1016,7 @@ const ( + AVFCVTFXUV + AVFCVTFXV + +- // 31.13.18. Widening Floating-Point/Integer Type-Convert Instructions ++ // 31.13.18: Widening Floating-Point/Integer Type-Convert Instructions + AVFWCVTXUFV + AVFWCVTXFV + AVFWCVTRTZXUFV +@@ -1021,7 +1025,7 @@ const ( + AVFWCVTFXV + AVFWCVTFFV + +- // 31.13.19. Narrowing Floating-Point/Integer Type-Convert Instructions ++ // 31.13.19: Narrowing Floating-Point/Integer Type-Convert Instructions + AVFNCVTXUFW + AVFNCVTXFW + AVFNCVTRTZXUFW +@@ -1031,7 +1035,7 @@ const ( + AVFNCVTFFW + AVFNCVTRODFFW + +- // 31.14.1. Vector Single-Width Integer Reduction Instructions ++ // 31.14.1: Vector Single-Width Integer Reduction Instructions + AVREDSUMVS + AVREDMAXUVS + AVREDMAXVS +@@ -1041,21 +1045,21 @@ const ( + AVREDORVS + AVREDXORVS + +- // 31.14.2. Vector Widening Integer Reduction Instructions ++ // 31.14.2: Vector Widening Integer Reduction Instructions + AVWREDSUMUVS + AVWREDSUMVS + +- // 31.14.3. Vector Single-Width Floating-Point Reduction Instructions ++ // 31.14.3: Vector Single-Width Floating-Point Reduction Instructions + AVFREDOSUMVS + AVFREDUSUMVS + AVFREDMAXVS + AVFREDMINVS + +- // 31.14.4. Vector Widening Floating-Point Reduction Instructions ++ // 31.14.4: Vector Widening Floating-Point Reduction Instructions + AVFWREDOSUMVS + AVFWREDUSUMVS + +- // 31.15. Vector Mask Instructions ++ // 31.15: Vector Mask Instructions + AVMANDMM + AVMNANDMM + AVMANDNMM +@@ -1072,15 +1076,15 @@ const ( + AVIOTAM + AVIDV + +- // 31.16.1. Integer Scalar Move Instructions ++ // 31.16.1: Integer Scalar Move Instructions + AVMVXS + AVMVSX + +- // 31.16.2. Floating-Point Scalar Move Instructions ++ // 31.16.2: Floating-Point Scalar Move Instructions + AVFMVFS + AVFMVSF + +- // 31.16.3. Vector Slide Instructions ++ // 31.16.3: Vector Slide Instructions + AVSLIDEUPVX + AVSLIDEUPVI + AVSLIDEDOWNVX +@@ -1090,16 +1094,16 @@ const ( + AVSLIDE1DOWNVX + AVFSLIDE1DOWNVF + +- // 31.16.4. Vector Register Gather Instructions ++ // 31.16.4: Vector Register Gather Instructions + AVRGATHERVV + AVRGATHEREI16VV + AVRGATHERVX + AVRGATHERVI + +- // 31.16.5. Vector Compress Instruction ++ // 31.16.5: Vector Compress Instruction + AVCOMPRESSVM + +- // 31.16.6. Whole Vector Register Move ++ // 31.16.6: Whole Vector Register Move + AVMV1RV + AVMV2RV + AVMV4RV +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 5e7092ab36..6fac9159e5 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1539,7 +1539,9 @@ type instructionData struct { + // their encoding type. Entries are masked with obj.AMask to keep + // indices small. 
+ var instructions = [ALAST & obj.AMask]instructionData{ ++ // + // Unprivileged ISA ++ // + + // 2.4: Integer Computational Instructions + AADDI & obj.AMask: {enc: iIIEncoding, ternary: true}, +@@ -1588,7 +1590,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + // 2.7: Memory Ordering + AFENCE & obj.AMask: {enc: iIIEncoding}, + +- // 5.2: Integer Computational Instructions (RV64I) ++ // 4.2: Integer Computational Instructions (RV64I) + AADDIW & obj.AMask: {enc: iIIEncoding, ternary: true}, + ASLLIW & obj.AMask: {enc: iIIEncoding, ternary: true}, + ASRLIW & obj.AMask: {enc: iIIEncoding, ternary: true}, +@@ -1599,14 +1601,14 @@ var instructions = [ALAST & obj.AMask]instructionData{ + ASUBW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ASRAW & obj.AMask: {enc: rIIIEncoding, immForm: ASRAIW, ternary: true}, + +- // 5.3: Load and Store Instructions (RV64I) ++ // 4.3: Load and Store Instructions (RV64I) + ALD & obj.AMask: {enc: iIIEncoding}, + ASD & obj.AMask: {enc: sIEncoding}, + + // 7.1: CSR Instructions + ACSRRS & obj.AMask: {enc: iIIEncoding}, + +- // 7.1: Multiplication Operations ++ // 13.1: Multiplication Operations + AMUL & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AMULH & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AMULHU & obj.AMask: {enc: rIIIEncoding, ternary: true}, +@@ -1621,13 +1623,13 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AREMW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AREMUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + +- // 8.2: Load-Reserved/Store-Conditional ++ // 14.2: Load-Reserved/Store-Conditional Instructions (Zalrsc) + ALRW & obj.AMask: {enc: rIIIEncoding}, + ALRD & obj.AMask: {enc: rIIIEncoding}, + ASCW & obj.AMask: {enc: rIIIEncoding}, + ASCD & obj.AMask: {enc: rIIIEncoding}, + +- // 8.3: Atomic Memory Operations ++ // 14.4: Atomic Memory Operations (Zaamo) + AAMOSWAPW & obj.AMask: {enc: rIIIEncoding}, + AAMOSWAPD & obj.AMask: {enc: rIIIEncoding}, + AAMOADDW & obj.AMask: {enc: rIIIEncoding}, +@@ -1647,11 +1649,11 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AAMOMINUW & obj.AMask: {enc: rIIIEncoding}, + AAMOMINUD & obj.AMask: {enc: rIIIEncoding}, + +- // 11.5: Single-Precision Load and Store Instructions ++ // 20.5: Single-Precision Load and Store Instructions + AFLW & obj.AMask: {enc: iFEncoding}, + AFSW & obj.AMask: {enc: sFEncoding}, + +- // 11.6: Single-Precision Floating-Point Computational Instructions ++ // 20.6: Single-Precision Floating-Point Computational Instructions + AFADDS & obj.AMask: {enc: rFFFEncoding}, + AFSUBS & obj.AMask: {enc: rFFFEncoding}, + AFMULS & obj.AMask: {enc: rFFFEncoding}, +@@ -1664,7 +1666,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AFNMSUBS & obj.AMask: {enc: rFFFFEncoding}, + AFNMADDS & obj.AMask: {enc: rFFFFEncoding}, + +- // 11.7: Single-Precision Floating-Point Conversion and Move Instructions ++ // 20.7: Single-Precision Floating-Point Conversion and Move Instructions + AFCVTWS & obj.AMask: {enc: rFIEncoding}, + AFCVTLS & obj.AMask: {enc: rFIEncoding}, + AFCVTSW & obj.AMask: {enc: rIFEncoding}, +@@ -1679,19 +1681,19 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AFMVXW & obj.AMask: {enc: rFIEncoding}, + AFMVWX & obj.AMask: {enc: rIFEncoding}, + +- // 11.8: Single-Precision Floating-Point Compare Instructions ++ // 20.8: Single-Precision Floating-Point Compare Instructions + AFEQS & obj.AMask: {enc: rFFIEncoding}, + AFLTS & obj.AMask: {enc: rFFIEncoding}, + AFLES & obj.AMask: {enc: rFFIEncoding}, + +- // 11.9: 
Single-Precision Floating-Point Classify Instruction ++ // 20.9: Single-Precision Floating-Point Classify Instruction + AFCLASSS & obj.AMask: {enc: rFIEncoding}, + + // 12.3: Double-Precision Load and Store Instructions + AFLD & obj.AMask: {enc: iFEncoding}, + AFSD & obj.AMask: {enc: sFEncoding}, + +- // 12.4: Double-Precision Floating-Point Computational Instructions ++ // 21.4: Double-Precision Floating-Point Computational Instructions + AFADDD & obj.AMask: {enc: rFFFEncoding}, + AFSUBD & obj.AMask: {enc: rFFFEncoding}, + AFMULD & obj.AMask: {enc: rFFFEncoding}, +@@ -1704,7 +1706,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AFNMSUBD & obj.AMask: {enc: rFFFFEncoding}, + AFNMADDD & obj.AMask: {enc: rFFFFEncoding}, + +- // 12.5: Double-Precision Floating-Point Conversion and Move Instructions ++ // 21.5: Double-Precision Floating-Point Conversion and Move Instructions + AFCVTWD & obj.AMask: {enc: rFIEncoding}, + AFCVTLD & obj.AMask: {enc: rFIEncoding}, + AFCVTDW & obj.AMask: {enc: rIFEncoding}, +@@ -1721,25 +1723,19 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AFMVXD & obj.AMask: {enc: rFIEncoding}, + AFMVDX & obj.AMask: {enc: rIFEncoding}, + +- // 12.6: Double-Precision Floating-Point Compare Instructions ++ // 21.6: Double-Precision Floating-Point Compare Instructions + AFEQD & obj.AMask: {enc: rFFIEncoding}, + AFLTD & obj.AMask: {enc: rFFIEncoding}, + AFLED & obj.AMask: {enc: rFFIEncoding}, + +- // 12.7: Double-Precision Floating-Point Classify Instruction ++ // 21.7: Double-Precision Floating-Point Classify Instruction + AFCLASSD & obj.AMask: {enc: rFIEncoding}, + +- // Privileged ISA +- +- // 3.2.1: Environment Call and Breakpoint +- AECALL & obj.AMask: {enc: iIIEncoding}, +- AEBREAK & obj.AMask: {enc: iIIEncoding}, +- + // +- // RISC-V Bit-Manipulation ISA-extensions (1.0) ++ // "B" Extension for Bit Manipulation, Version 1.0.0 + // + +- // 1.1: Address Generation Instructions (Zba) ++ // 28.4.1: Address Generation Instructions (Zba) + AADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ASH1ADD & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ASH1ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, +@@ -1749,7 +1745,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + ASH3ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ASLLIUW & obj.AMask: {enc: iIIEncoding, ternary: true}, + +- // 1.2: Basic Bit Manipulation (Zbb) ++ // 28.4.2: Basic Bit Manipulation (Zbb) + AANDN & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ACLZ & obj.AMask: {enc: rIIEncoding}, + ACLZW & obj.AMask: {enc: rIIEncoding}, +@@ -1767,7 +1763,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AXNOR & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AZEXTH & obj.AMask: {enc: rIIEncoding}, + +- // 1.3: Bitwise Rotation (Zbb) ++ // 28.4.3: Bitwise Rotation (Zbb) + AROL & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AROLW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AROR & obj.AMask: {enc: rIIIEncoding, immForm: ARORI, ternary: true}, +@@ -1777,7 +1773,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AORCB & obj.AMask: {enc: iIIEncoding}, + AREV8 & obj.AMask: {enc: iIIEncoding}, + +- // 1.5: Single-bit Instructions (Zbs) ++ // 28.4.4: Single-bit Instructions (Zbs) + ABCLR & obj.AMask: {enc: rIIIEncoding, immForm: ABCLRI, ternary: true}, + ABCLRI & obj.AMask: {enc: iIIEncoding, ternary: true}, + ABEXT & obj.AMask: {enc: rIIIEncoding, immForm: ABEXTI, ternary: true}, +@@ -1787,6 +1783,14 @@ var instructions = [ALAST & 
obj.AMask]instructionData{ + ABSET & obj.AMask: {enc: rIIIEncoding, immForm: ABSETI, ternary: true}, + ABSETI & obj.AMask: {enc: iIIEncoding, ternary: true}, + ++ // ++ // Privileged ISA ++ // ++ ++ // 3.3.1: Environment Call and Breakpoint ++ AECALL & obj.AMask: {enc: iIIEncoding}, ++ AEBREAK & obj.AMask: {enc: iIIEncoding}, ++ + // Escape hatch + AWORD & obj.AMask: {enc: rawEncoding}, + +-- +2.39.5 + diff --git a/2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch b/2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch new file mode 100644 index 0000000..afd6b8d --- /dev/null +++ b/2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch @@ -0,0 +1,589 @@ +From 20ee5b16747e9841cb1923ad3590806047e4b235 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 067/119] cmd/compile: don't merge symbols on riscv64 when + dynamic linking + +Each plugin is compiled as a separate shared object, +with its own symbol table. When dynamic linking plugin symbols +are resolved within the plugin's scope, not globally merged to +avoid conflicts. + +Change-Id: I9e6986085855c17fbd6c39b937cb6129d216f5e9 +Reviewed-on: https://go-review.googlesource.com/c/go/+/435015 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: Michael Pratt +Reviewed-by: Cherry Mui +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 82 +++------- + .../compile/internal/ssa/rewriteRISCV64.go | 154 +++++++++++------- + 2 files changed, 115 insertions(+), 121 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 9ae9604381..a69df619a5 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -270,65 +270,29 @@ + + // We need to fold MOVaddr into the LD/MOVDstore ops so that the live variable analysis + // knows what variables are being read/written by the ops. 
+-(MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVBload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVHload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem) +- +-(MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +-(MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +-(MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +-(MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +-(MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => +- (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +-(MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => +- (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +-(MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => +- (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +-(MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => +- (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +- +-(MOVBUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVBUload [off1+int32(off2)] {sym} base mem) +-(MOVBload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVBload [off1+int32(off2)] {sym} base mem) +-(MOVHUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVHUload [off1+int32(off2)] {sym} base mem) +-(MOVHload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVHload [off1+int32(off2)] {sym} base mem) +-(MOVWUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVWUload [off1+int32(off2)] {sym} base mem) +-(MOVWload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) 
=> +- (MOVWload [off1+int32(off2)] {sym} base mem) +-(MOVDload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVDload [off1+int32(off2)] {sym} base mem) +- +-(MOVBstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => +- (MOVBstore [off1+int32(off2)] {sym} base val mem) +-(MOVHstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => +- (MOVHstore [off1+int32(off2)] {sym} base val mem) +-(MOVWstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => +- (MOVWstore [off1+int32(off2)] {sym} base val mem) +-(MOVDstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => +- (MOVDstore [off1+int32(off2)] {sym} base val mem) +-(MOVBstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem) +-(MOVHstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem) +-(MOVWstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem) +-(MOVDstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDstorezero [off1+int32(off2)] {sym} ptr mem) ++(MOV(B|BU|H|HU|W|WU|D)load [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && ++ is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ++ (base.Op != OpSB || !config.ctxt.Flag_dynlink) => ++ (MOV(B|BU|H|HU|W|WU|D)load [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ ++(MOV(B|H|W|D)store [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && ++ is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ++ (base.Op != OpSB || !config.ctxt.Flag_dynlink) => ++ (MOV(B|H|W|D)store [off1+off2] {mergeSym(sym1,sym2)} base val mem) ++ ++(MOV(B|H|W|D)storezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && ++ canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && ++ (base.Op != OpSB || !config.ctxt.Flag_dynlink) => ++ (MOV(B|H|W|D)storezero [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ ++(MOV(B|BU|H|HU|W|WU|D)load [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => ++ (MOV(B|BU|H|HU|W|WU|D)load [off1+int32(off2)] {sym} base mem) ++ ++(MOV(B|H|W|D)store [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => ++ (MOV(B|H|W|D)store [off1+int32(off2)] {sym} base val mem) ++ ++(MOV(B|H|W|D)storezero [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => ++ (MOV(B|H|W|D)storezero [off1+int32(off2)] {sym} base mem) + + // Similarly, fold ADDI into MOVaddr to avoid confusing live variable analysis + // with OffPtr -> ADDI. 
+diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 5e6ccab467..1c226a1660 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -4010,8 +4010,10 @@ func rewriteValueRISCV64_OpRISCV64FSUBS(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVBUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4023,7 +4025,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBUload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVBUload) +@@ -4317,8 +4319,10 @@ func rewriteValueRISCV64_OpRISCV64MOVBUreg(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVBload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVBload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4330,7 +4334,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVBload) +@@ -4443,8 +4447,10 @@ func rewriteValueRISCV64_OpRISCV64MOVBstore(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4457,7 +4463,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBstore(v *Value) bool { + base := v_0.Args[0] + val := v_1 + mem := v_2 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVBstore) +@@ -4611,9 +4617,11 @@ func rewriteValueRISCV64_OpRISCV64MOVBstore(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) +- // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) +- // result: (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr 
mem) ++ b := v.Block ++ config := b.Func.Config ++ // match: (MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) +@@ -4622,20 +4630,20 @@ func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + } + off2 := auxIntToInt32(v_0.AuxInt) + sym2 := auxToSym(v_0.Aux) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 +- if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) { ++ if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVBstorezero) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } +- // match: (MOVBstorezero [off1] {sym} (ADDI [off2] ptr) mem) ++ // match: (MOVBstorezero [off1] {sym} (ADDI [off2] base) mem) + // cond: is32Bit(int64(off1)+off2) +- // result: (MOVBstorezero [off1+int32(off2)] {sym} ptr mem) ++ // result: (MOVBstorezero [off1+int32(off2)] {sym} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) +@@ -4643,7 +4651,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + break + } + off2 := auxIntToInt64(v_0.AuxInt) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 + if !(is32Bit(int64(off1) + off2)) { + break +@@ -4651,7 +4659,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + v.reset(OpRISCV64MOVBstorezero) + v.AuxInt = int32ToAuxInt(off1 + int32(off2)) + v.Aux = symToAux(sym) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } + return false +@@ -4659,8 +4667,10 @@ func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVDload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4672,7 +4682,7 @@ func rewriteValueRISCV64_OpRISCV64MOVDload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVDload) +@@ -4739,8 +4749,10 @@ func rewriteValueRISCV64_OpRISCV64MOVDstore(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4753,7 +4765,7 @@ func rewriteValueRISCV64_OpRISCV64MOVDstore(v *Value) bool { + base := v_0.Args[0] + val := v_1 + mem := v_2 +- if 
!(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVDstore) +@@ -4805,9 +4817,11 @@ func rewriteValueRISCV64_OpRISCV64MOVDstore(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) +- // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) +- // result: (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) ++ b := v.Block ++ config := b.Func.Config ++ // match: (MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) +@@ -4816,20 +4830,20 @@ func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + } + off2 := auxIntToInt32(v_0.AuxInt) + sym2 := auxToSym(v_0.Aux) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 +- if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) { ++ if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVDstorezero) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } +- // match: (MOVDstorezero [off1] {sym} (ADDI [off2] ptr) mem) ++ // match: (MOVDstorezero [off1] {sym} (ADDI [off2] base) mem) + // cond: is32Bit(int64(off1)+off2) +- // result: (MOVDstorezero [off1+int32(off2)] {sym} ptr mem) ++ // result: (MOVDstorezero [off1+int32(off2)] {sym} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) +@@ -4837,7 +4851,7 @@ func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + break + } + off2 := auxIntToInt64(v_0.AuxInt) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 + if !(is32Bit(int64(off1) + off2)) { + break +@@ -4845,7 +4859,7 @@ func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + v.reset(OpRISCV64MOVDstorezero) + v.AuxInt = int32ToAuxInt(off1 + int32(off2)) + v.Aux = symToAux(sym) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } + return false +@@ -4853,8 +4867,10 @@ func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVHUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4866,7 +4882,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHUload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVHUload) +@@ -5017,8 +5033,10 @@ func rewriteValueRISCV64_OpRISCV64MOVHUreg(v 
*Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVHload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVHload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5030,7 +5048,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVHload) +@@ -5187,8 +5205,10 @@ func rewriteValueRISCV64_OpRISCV64MOVHstore(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5201,7 +5221,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHstore(v *Value) bool { + base := v_0.Args[0] + val := v_1 + mem := v_2 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVHstore) +@@ -5321,9 +5341,11 @@ func rewriteValueRISCV64_OpRISCV64MOVHstore(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) +- // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) +- // result: (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) ++ b := v.Block ++ config := b.Func.Config ++ // match: (MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) +@@ -5332,20 +5354,20 @@ func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + } + off2 := auxIntToInt32(v_0.AuxInt) + sym2 := auxToSym(v_0.Aux) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 +- if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) { ++ if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVHstorezero) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } +- // match: (MOVHstorezero [off1] {sym} (ADDI [off2] ptr) mem) ++ // match: (MOVHstorezero [off1] {sym} (ADDI [off2] base) mem) + // cond: is32Bit(int64(off1)+off2) +- // result: (MOVHstorezero [off1+int32(off2)] {sym} ptr mem) ++ // result: (MOVHstorezero [off1+int32(off2)] {sym} base mem) + for { + off1 := 
auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) +@@ -5353,7 +5375,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + break + } + off2 := auxIntToInt64(v_0.AuxInt) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 + if !(is32Bit(int64(off1) + off2)) { + break +@@ -5361,7 +5383,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + v.reset(OpRISCV64MOVHstorezero) + v.AuxInt = int32ToAuxInt(off1 + int32(off2)) + v.Aux = symToAux(sym) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } + return false +@@ -5369,8 +5391,10 @@ func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVWUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5382,7 +5406,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWUload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVWUload) +@@ -5557,8 +5581,10 @@ func rewriteValueRISCV64_OpRISCV64MOVWUreg(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVWload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5570,7 +5596,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVWload) +@@ -5881,8 +5907,10 @@ func rewriteValueRISCV64_OpRISCV64MOVWstore(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5895,7 +5923,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWstore(v *Value) bool { + base := v_0.Args[0] + val := v_1 + mem := v_2 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVWstore) +@@ -5981,9 +6009,11 @@ func rewriteValueRISCV64_OpRISCV64MOVWstore(v *Value) bool { + 
func rewriteValueRISCV64_OpRISCV64MOVWstorezero(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) +- // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) +- // result: (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) ++ b := v.Block ++ config := b.Func.Config ++ // match: (MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) +@@ -5992,20 +6022,20 @@ func rewriteValueRISCV64_OpRISCV64MOVWstorezero(v *Value) bool { + } + off2 := auxIntToInt32(v_0.AuxInt) + sym2 := auxToSym(v_0.Aux) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 +- if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) { ++ if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVWstorezero) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } +- // match: (MOVWstorezero [off1] {sym} (ADDI [off2] ptr) mem) ++ // match: (MOVWstorezero [off1] {sym} (ADDI [off2] base) mem) + // cond: is32Bit(int64(off1)+off2) +- // result: (MOVWstorezero [off1+int32(off2)] {sym} ptr mem) ++ // result: (MOVWstorezero [off1+int32(off2)] {sym} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) +@@ -6013,7 +6043,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWstorezero(v *Value) bool { + break + } + off2 := auxIntToInt64(v_0.AuxInt) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 + if !(is32Bit(int64(off1) + off2)) { + break +@@ -6021,7 +6051,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWstorezero(v *Value) bool { + v.reset(OpRISCV64MOVWstorezero) + v.AuxInt = int32ToAuxInt(off1 + int32(off2)) + v.Aux = symToAux(sym) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } + return false +-- +2.39.5 + diff --git a/2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch b/2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch new file mode 100644 index 0000000..14ffd0b --- /dev/null +++ b/2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch @@ -0,0 +1,83 @@ +From 9a3920342f79a02921089bbafe030e5a74e67530 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 068/119] cmd/internal/obj/riscv: support MOVD with floating + point constants + +Currently, we only support loading of values from memory (or other +registers). Add floating point constant support to MOVD. This is +implemented by storing the floating point constant to a symbol, +which is then loaded into the floating point register. 
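+
+For illustration, using the test case added below:
+
+	MOVD $(709.78271289338397), F3
+
+is rewritten into a load of a float64 symbol which, in the notation of
+the code comment below, expands to an AUIPC into the temporary register
+followed by the load, with $off_hi/$off_lo assigned during relocation:
+
+	AUIPC $off_hi, TMP
+	FLD $off_lo, TMP, F3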
+ +Change-Id: I6db242d27f606f0d5d084a3ab93538698d3a4f8c +Reviewed-on: https://go-review.googlesource.com/c/go/+/631876 +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 3 +++ + src/cmd/internal/obj/riscv/obj.go | 22 ++++++++++++++++++--- + 2 files changed, 22 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index ad468574a9..588ad0f067 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -486,6 +486,9 @@ start: + MOVD F0, 4(X5) // 27b20200 + MOVD F0, F1 // d3000022 + ++ // Convert to load of symbol (AUIPC + FLD) ++ MOVD $(709.78271289338397), F3 // 970f000087b10f00 ++ + // TLS load with local-exec (LUI + ADDIW + ADD of TP + load) + MOV tls(SB), X5 // b70f00009b8f0f00b38f4f0083b20f00 + MOVB tls(SB), X5 // b70f00009b8f0f00b38f4f0083820f00 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 6fac9159e5..2e582eb9cb 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -147,6 +147,15 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.From.Name = obj.NAME_EXTERN + p.From.Offset = 0 + } ++ ++ case AMOVD: ++ if p.From.Type == obj.TYPE_FCONST && p.From.Name == obj.NAME_NONE && p.From.Reg == obj.REG_NONE { ++ f64 := p.From.Val.(float64) ++ p.From.Type = obj.TYPE_MEM ++ p.From.Sym = ctxt.Float64Sym(f64) ++ p.From.Name = obj.NAME_EXTERN ++ p.From.Offset = 0 ++ } + } + } + +@@ -2208,12 +2217,19 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + } + + // Note that the values for $off_hi and $off_lo are currently +- // zero and will be assigned during relocation. ++ // zero and will be assigned during relocation. If the destination ++ // is an integer register then we can use the same register for the ++ // address computation, otherwise we need to use the temporary register. + // + // AUIPC $off_hi, Rd + // L $off_lo, Rd, Rd +- insAUIPC := &instruction{as: AAUIPC, rd: ins.rd} +- ins.as, ins.rs1, ins.rs2, ins.imm = movToLoad(p.As), ins.rd, obj.REG_NONE, 0 ++ // ++ addrReg := ins.rd ++ if addrReg < REG_X0 || addrReg > REG_X31 { ++ addrReg = REG_TMP ++ } ++ insAUIPC := &instruction{as: AAUIPC, rd: addrReg} ++ ins.as, ins.rs1, ins.rs2, ins.imm = movToLoad(p.As), addrReg, obj.REG_NONE, 0 + inss = []*instruction{insAUIPC, ins} + + default: +-- +2.39.5 + diff --git a/2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch b/2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch new file mode 100644 index 0000000..77f8540 --- /dev/null +++ b/2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch @@ -0,0 +1,618 @@ +From c6be78d841d1c3fcf2b798598655bb9c8e4c1663 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 069/119] cmd/asm,cmd/internal/obj/riscv: implement vector + configuration setting instructions + +Implement vector configuration setting instructions (VSETVLI, +VSETIVLI, VSETL). These allow the vector length (vl) and vector +type (vtype) CSRs to be configured via a single instruction. +Unfortunately each instruction has its own dedicated encoding. 
+ +In the case of VSETVLI/VSETIVLI, the vector type is specified via +a series of special operands, which specify the selected element +width (E8, E16, E32, E64), the vector register group multiplier +(M1, M2, M4, M8, MF2, MF4, MF8), the vector tail policy (TU, TA) +and vector mask policy (MU, MA). Note that the order of these +special operands matches non-Go assemblers. + +Partially based on work by Pengcheng Wang . + +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Change-Id: I431f59c1e048a3e84754f0643a963da473a741fe +Reviewed-on: https://go-review.googlesource.com/c/go/+/631936 +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +--- + src/cmd/asm/internal/arch/arm64.go | 6 +- + src/cmd/asm/internal/arch/riscv64.go | 35 +++- + src/cmd/asm/internal/asm/asm.go | 21 +++ + src/cmd/asm/internal/asm/parse.go | 16 +- + src/cmd/asm/internal/asm/testdata/riscv64.s | 24 +++ + .../asm/internal/asm/testdata/riscv64error.s | 4 + + src/cmd/internal/obj/arm64/a.out.go | 4 +- + src/cmd/internal/obj/link.go | 3 +- + src/cmd/internal/obj/riscv/cpu.go | 71 +++++++++ + src/cmd/internal/obj/riscv/list.go | 9 ++ + src/cmd/internal/obj/riscv/obj.go | 149 ++++++++++++++++-- + src/cmd/internal/obj/util.go | 7 + + 12 files changed, 325 insertions(+), 24 deletions(-) + +diff --git a/src/cmd/asm/internal/arch/arm64.go b/src/cmd/asm/internal/arch/arm64.go +index e63601de64..87ccb8c040 100644 +--- a/src/cmd/asm/internal/arch/arm64.go ++++ b/src/cmd/asm/internal/arch/arm64.go +@@ -59,10 +59,10 @@ func jumpArm64(word string) bool { + + var arm64SpecialOperand map[string]arm64.SpecialOperand + +-// GetARM64SpecialOperand returns the internal representation of a special operand. +-func GetARM64SpecialOperand(name string) arm64.SpecialOperand { ++// ARM64SpecialOperand returns the internal representation of a special operand. ++func ARM64SpecialOperand(name string) arm64.SpecialOperand { + if arm64SpecialOperand == nil { +- // Generate the mapping automatically when the first time the function is called. ++ // Generate mapping when function is first called. + arm64SpecialOperand = map[string]arm64.SpecialOperand{} + for opd := arm64.SPOP_BEGIN; opd < arm64.SPOP_END; opd++ { + arm64SpecialOperand[opd.String()] = opd +diff --git a/src/cmd/asm/internal/arch/riscv64.go b/src/cmd/asm/internal/arch/riscv64.go +index 27a66c5e63..69e060a865 100644 +--- a/src/cmd/asm/internal/arch/riscv64.go ++++ b/src/cmd/asm/internal/arch/riscv64.go +@@ -13,9 +13,8 @@ import ( + "cmd/internal/obj/riscv" + ) + +-// IsRISCV64AMO reports whether the op (as defined by a riscv.A* +-// constant) is one of the AMO instructions that requires special +-// handling. ++// IsRISCV64AMO reports whether op is an AMO instruction that requires ++// special handling. + func IsRISCV64AMO(op obj.As) bool { + switch op { + case riscv.ASCW, riscv.ASCD, riscv.AAMOSWAPW, riscv.AAMOSWAPD, riscv.AAMOADDW, riscv.AAMOADDD, +@@ -26,3 +25,33 @@ func IsRISCV64AMO(op obj.As) bool { + } + return false + } ++ ++// IsRISCV64VTypeI reports whether op is a vtype immediate instruction that ++// requires special handling. ++func IsRISCV64VTypeI(op obj.As) bool { ++ return op == riscv.AVSETVLI || op == riscv.AVSETIVLI ++} ++ ++var riscv64SpecialOperand map[string]riscv.SpecialOperand ++ ++// RISCV64SpecialOperand returns the internal representation of a special operand. 
++func RISCV64SpecialOperand(name string) riscv.SpecialOperand { ++ if riscv64SpecialOperand == nil { ++ // Generate mapping when function is first called. ++ riscv64SpecialOperand = map[string]riscv.SpecialOperand{} ++ for opd := riscv.SPOP_BEGIN; opd < riscv.SPOP_END; opd++ { ++ riscv64SpecialOperand[opd.String()] = opd ++ } ++ } ++ if opd, ok := riscv64SpecialOperand[name]; ok { ++ return opd ++ } ++ return riscv.SPOP_END ++} ++ ++// RISCV64ValidateVectorType reports whether the given configuration is a ++// valid vector type. ++func RISCV64ValidateVectorType(vsew, vlmul, vtail, vmask int64) error { ++ _, err := riscv.EncodeVectorType(vsew, vlmul, vtail, vmask) ++ return err ++} +diff --git a/src/cmd/asm/internal/asm/asm.go b/src/cmd/asm/internal/asm/asm.go +index 223c613bd9..6a87813549 100644 +--- a/src/cmd/asm/internal/asm/asm.go ++++ b/src/cmd/asm/internal/asm/asm.go +@@ -905,6 +905,19 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) { + prog.To = a[5] + break + } ++ if p.arch.Family == sys.RISCV64 && arch.IsRISCV64VTypeI(op) { ++ prog.From = a[0] ++ vsew := p.getSpecial(prog, op, &a[1]) ++ vlmul := p.getSpecial(prog, op, &a[2]) ++ vtail := p.getSpecial(prog, op, &a[3]) ++ vmask := p.getSpecial(prog, op, &a[4]) ++ if err := arch.RISCV64ValidateVectorType(vsew, vlmul, vtail, vmask); err != nil { ++ p.errorf("invalid vtype: %v", err) ++ } ++ prog.AddRestSourceArgs([]obj.Addr{a[1], a[2], a[3], a[4]}) ++ prog.To = a[5] ++ break ++ } + fallthrough + default: + p.errorf("can't handle %s instruction with %d operands", op, len(a)) +@@ -955,3 +968,11 @@ func (p *Parser) getRegister(prog *obj.Prog, op obj.As, addr *obj.Addr) int16 { + } + return addr.Reg + } ++ ++// getSpecial checks that addr represents a special operand and returns its value. ++func (p *Parser) getSpecial(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 { ++ if addr.Type != obj.TYPE_SPECIAL || addr.Name != 0 || addr.Reg != 0 || addr.Index != 0 { ++ p.errorf("%s: expected special operand; found %s", op, obj.Dconv(prog, addr)) ++ } ++ return addr.Offset ++} +diff --git a/src/cmd/asm/internal/asm/parse.go b/src/cmd/asm/internal/asm/parse.go +index ecee98593d..0d78a242c7 100644 +--- a/src/cmd/asm/internal/asm/parse.go ++++ b/src/cmd/asm/internal/asm/parse.go +@@ -20,6 +20,7 @@ import ( + "cmd/asm/internal/lex" + "cmd/internal/obj" + "cmd/internal/obj/arm64" ++ "cmd/internal/obj/riscv" + "cmd/internal/obj/x86" + "cmd/internal/src" + "cmd/internal/sys" +@@ -390,16 +391,21 @@ func (p *Parser) operand(a *obj.Addr) { + tok := p.next() + name := tok.String() + if tok.ScanToken == scanner.Ident && !p.atStartOfRegister(name) { ++ // See if this is an architecture specific special operand. + switch p.arch.Family { + case sys.ARM64: +- // arm64 special operands. +- if opd := arch.GetARM64SpecialOperand(name); opd != arm64.SPOP_END { ++ if opd := arch.ARM64SpecialOperand(name); opd != arm64.SPOP_END { + a.Type = obj.TYPE_SPECIAL + a.Offset = int64(opd) +- break + } +- fallthrough +- default: ++ case sys.RISCV64: ++ if opd := arch.RISCV64SpecialOperand(name); opd != riscv.SPOP_END { ++ a.Type = obj.TYPE_SPECIAL ++ a.Offset = int64(opd) ++ } ++ } ++ ++ if a.Type != obj.TYPE_SPECIAL { + // We have a symbol. 
Parse $sym±offset(symkind) + p.symbolReference(a, name, prefix) + } +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 588ad0f067..aba7a80007 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -424,6 +424,30 @@ start: + BSET $63, X9 // 9394f42b + BSETI $1, X10, X11 // 93151528 + ++ // ++ // "V" Standard Extension for Vector Operations, Version 1.0 ++ // ++ ++ // 31.6: Configuration Setting Instructions ++ VSETVLI X10, E8, M1, TU, MU, X12 // 57760500 ++ VSETVLI X10, E16, M1, TU, MU, X12 // 57768500 ++ VSETVLI X10, E32, M1, TU, MU, X12 // 57760501 ++ VSETVLI X10, E64, M1, TU, MU, X12 // 57768501 ++ VSETVLI X10, E32, M1, TU, MA, X12 // 57760509 ++ VSETVLI X10, E32, M1, TA, MA, X12 // 5776050d ++ VSETVLI X10, E32, M2, TA, MA, X12 // 5776150d ++ VSETVLI X10, E32, M4, TA, MA, X12 // 5776250d ++ VSETVLI X10, E32, M8, TA, MA, X12 // 5776350d ++ VSETVLI X10, E32, MF2, TA, MA, X12 // 5776550d ++ VSETVLI X10, E32, MF4, TA, MA, X12 // 5776650d ++ VSETVLI X10, E32, MF8, TA, MA, X12 // 5776750d ++ VSETVLI X10, E32, M1, TA, MA, X12 // 5776050d ++ VSETVLI $15, E32, M1, TA, MA, X12 // 57f607cd ++ VSETIVLI $0, E32, M1, TA, MA, X12 // 577600cd ++ VSETIVLI $15, E32, M1, TA, MA, X12 // 57f607cd ++ VSETIVLI $31, E32, M1, TA, MA, X12 // 57f60fcd ++ VSETVL X10, X11, X12 // 57f6a580 ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 0b0184aaa7..a90f22af9f 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -46,4 +46,8 @@ TEXT errors(SB),$0 + SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" + SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" + FNES F1, (X5) // ERROR "needs an integer register output" ++ VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" ++ VSETVL X10, X11 // ERROR "expected integer register in rs1 position" + RET +diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go +index fc170e737d..3a3d976639 100644 +--- a/src/cmd/internal/obj/arm64/a.out.go ++++ b/src/cmd/internal/obj/arm64/a.out.go +@@ -1052,8 +1052,8 @@ type SpecialOperand int + + const ( + // PRFM +- SPOP_PLDL1KEEP SpecialOperand = iota // must be the first one +- SPOP_BEGIN SpecialOperand = iota - 1 // set as the lower bound ++ SPOP_PLDL1KEEP SpecialOperand = obj.SpecialOperandARM64Base + iota // must be the first one ++ SPOP_BEGIN SpecialOperand = obj.SpecialOperandARM64Base + iota - 1 // set as the lower bound + SPOP_PLDL1STRM + SPOP_PLDL2KEEP + SPOP_PLDL2STRM +diff --git a/src/cmd/internal/obj/link.go b/src/cmd/internal/obj/link.go +index b12bf2399a..2b35554cdc 100644 +--- a/src/cmd/internal/obj/link.go ++++ b/src/cmd/internal/obj/link.go +@@ -97,7 +97,8 @@ import ( + // val = string + // + // +-// Special symbolic constants for ARM64, such as conditional flags, tlbi_op and so on. ++// Special symbolic constants for ARM64 (such as conditional flags, tlbi_op and so on) ++// and RISCV64 (such as names for vector configuration instruction arguments). 
+ // Encoding: + // type = TYPE_SPECIAL + // offset = The constant value corresponding to this symbol +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 29f7e913ed..9b88ff2ccd 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1217,6 +1217,77 @@ const ( + RM_RMM // Round to Nearest, ties to Max Magnitude + ) + ++type SpecialOperand int ++ ++const ( ++ SPOP_BEGIN SpecialOperand = obj.SpecialOperandRISCVBase ++ ++ // Vector mask policy. ++ SPOP_MA SpecialOperand = obj.SpecialOperandRISCVBase + iota - 1 ++ SPOP_MU ++ ++ // Vector tail policy. ++ SPOP_TA ++ SPOP_TU ++ ++ // Vector register group multiplier (VLMUL). ++ SPOP_M1 ++ SPOP_M2 ++ SPOP_M4 ++ SPOP_M8 ++ SPOP_MF2 ++ SPOP_MF4 ++ SPOP_MF8 ++ ++ // Vector selected element width (VSEW). ++ SPOP_E8 ++ SPOP_E16 ++ SPOP_E32 ++ SPOP_E64 ++ ++ SPOP_END ++) ++ ++var specialOperands = map[SpecialOperand]struct { ++ encoding uint32 ++ name string ++}{ ++ SPOP_MA: {encoding: 1, name: "MA"}, ++ SPOP_MU: {encoding: 0, name: "MU"}, ++ ++ SPOP_TA: {encoding: 1, name: "TA"}, ++ SPOP_TU: {encoding: 0, name: "TU"}, ++ ++ SPOP_M1: {encoding: 0, name: "M1"}, ++ SPOP_M2: {encoding: 1, name: "M2"}, ++ SPOP_M4: {encoding: 2, name: "M4"}, ++ SPOP_M8: {encoding: 3, name: "M8"}, ++ SPOP_MF2: {encoding: 5, name: "MF2"}, ++ SPOP_MF4: {encoding: 6, name: "MF4"}, ++ SPOP_MF8: {encoding: 7, name: "MF8"}, ++ ++ SPOP_E8: {encoding: 0, name: "E8"}, ++ SPOP_E16: {encoding: 1, name: "E16"}, ++ SPOP_E32: {encoding: 2, name: "E32"}, ++ SPOP_E64: {encoding: 3, name: "E64"}, ++} ++ ++func (so SpecialOperand) encode() uint32 { ++ op, ok := specialOperands[so] ++ if ok { ++ return op.encoding ++ } ++ return 0 ++} ++ ++func (so SpecialOperand) String() string { ++ op, ok := specialOperands[so] ++ if ok { ++ return op.name ++ } ++ return "" ++} ++ + // All unary instructions which write to their arguments (as opposed to reading + // from them) go here. The assembly parser uses this information to populate + // its AST in a semantically reasonable way. +diff --git a/src/cmd/internal/obj/riscv/list.go b/src/cmd/internal/obj/riscv/list.go +index c5b7e80719..8eb97a476d 100644 +--- a/src/cmd/internal/obj/riscv/list.go ++++ b/src/cmd/internal/obj/riscv/list.go +@@ -14,6 +14,7 @@ func init() { + obj.RegisterRegister(obj.RBaseRISCV, REG_END, RegName) + obj.RegisterOpcode(obj.ABaseRISCV, Anames) + obj.RegisterOpSuffix("riscv64", opSuffixString) ++ obj.RegisterSpecialOperands(int64(SPOP_BEGIN), int64(SPOP_END), specialOperandConv) + } + + func RegName(r int) string { +@@ -49,3 +50,11 @@ func opSuffixString(s uint8) string { + } + return fmt.Sprintf(".%s", ss) + } ++ ++func specialOperandConv(a int64) string { ++ spc := SpecialOperand(a) ++ if spc >= SPOP_BEGIN && spc < SPOP_END { ++ return spc.String() ++ } ++ return "SPC_??" ++} +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 2e582eb9cb..3d1c120baa 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1046,27 +1046,35 @@ func immEven(x int64) error { + return nil + } + +-// immIFits checks whether the immediate value x fits in nbits bits +-// as a signed integer. If it does not, an error is returned. 
+-func immIFits(x int64, nbits uint) error { +- nbits-- +- min := int64(-1) << nbits +- max := int64(1)< max { + if nbits <= 16 { +- return fmt.Errorf("signed immediate %d must be in range [%d, %d] (%d bits)", x, min, max, nbits) ++ return fmt.Errorf("%s immediate %d must be in range [%d, %d] (%d bits)", label, x, min, max, nbits) + } +- return fmt.Errorf("signed immediate %#x must be in range [%#x, %#x] (%d bits)", x, min, max, nbits) ++ return fmt.Errorf("%s immediate %#x must be in range [%#x, %#x] (%d bits)", label, x, min, max, nbits) + } + return nil + } + ++// immIFits checks whether the immediate value x fits in nbits bits ++// as a signed integer. If it does not, an error is returned. ++func immIFits(x int64, nbits uint) error { ++ return immFits(x, nbits, true) ++} ++ + // immI extracts the signed integer of the specified size from an immediate. + func immI(as obj.As, imm int64, nbits uint) uint32 { + if err := immIFits(imm, nbits); err != nil { + panic(fmt.Sprintf("%v: %v", as, err)) + } +- return uint32(imm) ++ return uint32(imm) & ((1 << nbits) - 1) + } + + func wantImmI(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) { +@@ -1075,6 +1083,26 @@ func wantImmI(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) { + } + } + ++// immUFits checks whether the immediate value x fits in nbits bits ++// as an unsigned integer. If it does not, an error is returned. ++func immUFits(x int64, nbits uint) error { ++ return immFits(x, nbits, false) ++} ++ ++// immU extracts the unsigned integer of the specified size from an immediate. ++func immU(as obj.As, imm int64, nbits uint) uint32 { ++ if err := immUFits(imm, nbits); err != nil { ++ panic(fmt.Sprintf("%v: %v", as, err)) ++ } ++ return uint32(imm) & ((1 << nbits) - 1) ++} ++ ++func wantImmU(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) { ++ if err := immUFits(imm, nbits); err != nil { ++ ctxt.Diag("%v: %v", ins, err) ++ } ++} ++ + func wantReg(ctxt *obj.Link, ins *instruction, pos string, descr string, r, min, max uint32) { + if r < min || r > max { + var suffix string +@@ -1231,6 +1259,29 @@ func validateJ(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateVsetvli(ctxt *obj.Link, ins *instruction) { ++ wantImmU(ctxt, ins, ins.imm, 11) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateVsetivli(ctxt *obj.Link, ins *instruction) { ++ wantImmU(ctxt, ins, ins.imm, 10) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantImmU(ctxt, ins, int64(ins.rs1), 5) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateVsetvl(ctxt *obj.Link, ins *instruction) { ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRaw(ctxt *obj.Link, ins *instruction) { + // Treat the raw value specially as a 32-bit unsigned integer. + // Nobody wants to enter negative machine code. 
+@@ -1419,6 +1470,29 @@ func encodeCJImmediate(imm uint32) uint32 { + return bits << 2 + } + ++func encodeVset(as obj.As, rs1, rs2, rd uint32) uint32 { ++ enc := encode(as) ++ if enc == nil { ++ panic("encodeVset: could not encode instruction") ++ } ++ return enc.funct7<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | rd<<7 | enc.opcode ++} ++ ++func encodeVsetvli(ins *instruction) uint32 { ++ vtype := immU(ins.as, ins.imm, 11) ++ return encodeVset(ins.as, regI(ins.rs1), vtype, regI(ins.rd)) ++} ++ ++func encodeVsetivli(ins *instruction) uint32 { ++ vtype := immU(ins.as, ins.imm, 10) ++ avl := immU(ins.as, int64(ins.rs1), 5) ++ return encodeVset(ins.as, avl, vtype, regI(ins.rd)) ++} ++ ++func encodeVsetvl(ins *instruction) uint32 { ++ return encodeVset(ins.as, regI(ins.rs1), regI(ins.rs2), regI(ins.rd)) ++} ++ + func encodeRawIns(ins *instruction) uint32 { + // Treat the raw value specially as a 32-bit unsigned integer. + // Nobody wants to enter negative machine code. +@@ -1489,6 +1563,27 @@ func EncodeUImmediate(imm int64) (int64, error) { + return imm << 12, nil + } + ++func EncodeVectorType(vsew, vlmul, vtail, vmask int64) (int64, error) { ++ vsewSO := SpecialOperand(vsew) ++ if vsewSO < SPOP_E8 || vsewSO > SPOP_E64 { ++ return -1, fmt.Errorf("invalid vector selected element width %q", vsewSO) ++ } ++ vlmulSO := SpecialOperand(vlmul) ++ if vlmulSO < SPOP_M1 || vlmulSO > SPOP_MF8 { ++ return -1, fmt.Errorf("invalid vector register group multiplier %q", vlmulSO) ++ } ++ vtailSO := SpecialOperand(vtail) ++ if vtailSO != SPOP_TA && vtailSO != SPOP_TU { ++ return -1, fmt.Errorf("invalid vector tail policy %q", vtailSO) ++ } ++ vmaskSO := SpecialOperand(vmask) ++ if vmaskSO != SPOP_MA && vmaskSO != SPOP_MU { ++ return -1, fmt.Errorf("invalid vector mask policy %q", vmaskSO) ++ } ++ vtype := vmaskSO.encode()<<7 | vtailSO.encode()<<6 | vsewSO.encode()<<3 | vlmulSO.encode() ++ return int64(vtype), nil ++} ++ + type encoding struct { + encode func(*instruction) uint32 // encode returns the machine code for an instruction + validate func(*obj.Link, *instruction) // validate validates an instruction +@@ -1526,6 +1621,11 @@ var ( + uEncoding = encoding{encode: encodeU, validate: validateU, length: 4} + jEncoding = encoding{encode: encodeJ, validate: validateJ, length: 4} + ++ // Encodings for vector configuration setting instruction. ++ vsetvliEncoding = encoding{encode: encodeVsetvli, validate: validateVsetvli, length: 4} ++ vsetivliEncoding = encoding{encode: encodeVsetivli, validate: validateVsetivli, length: 4} ++ vsetvlEncoding = encoding{encode: encodeVsetvl, validate: validateVsetvl, length: 4} ++ + // rawEncoding encodes a raw instruction byte sequence. + rawEncoding = encoding{encode: encodeRawIns, validate: validateRaw, length: 4} + +@@ -1792,6 +1892,15 @@ var instructions = [ALAST & obj.AMask]instructionData{ + ABSET & obj.AMask: {enc: rIIIEncoding, immForm: ABSETI, ternary: true}, + ABSETI & obj.AMask: {enc: iIIEncoding, ternary: true}, + ++ // ++ // "V" Standard Extension for Vector Operations, Version 1.0 ++ // ++ ++ // 31.6. 
Vector Configuration-Setting Instructions ++ AVSETVLI & obj.AMask: {enc: vsetvliEncoding, immForm: AVSETIVLI}, ++ AVSETIVLI & obj.AMask: {enc: vsetivliEncoding}, ++ AVSETVL & obj.AMask: {enc: vsetvlEncoding}, ++ + // + // Privileged ISA + // +@@ -2356,7 +2465,12 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins := instructionForProg(p) + inss := []*instruction{ins} + +- if len(p.RestArgs) > 1 { ++ if ins.as == AVSETVLI || ins.as == AVSETIVLI { ++ if len(p.RestArgs) != 4 { ++ p.Ctxt.Diag("incorrect number of arguments for instruction") ++ return nil ++ } ++ } else if len(p.RestArgs) > 1 { + p.Ctxt.Diag("too many source registers") + return nil + } +@@ -2594,6 +2708,21 @@ func instructionsForProg(p *obj.Prog) []*instruction { + // XNOR -> (NOT (XOR x y)) + ins.as = AXOR + inss = append(inss, &instruction{as: AXORI, rs1: ins.rd, rs2: obj.REG_NONE, rd: ins.rd, imm: -1}) ++ ++ case AVSETVLI, AVSETIVLI: ++ ins.rs1, ins.rs2 = ins.rs2, obj.REG_NONE ++ vtype, err := EncodeVectorType(p.RestArgs[0].Offset, p.RestArgs[1].Offset, p.RestArgs[2].Offset, p.RestArgs[3].Offset) ++ if err != nil { ++ p.Ctxt.Diag("%v: %v", p, err) ++ } ++ ins.imm = int64(vtype) ++ if ins.as == AVSETIVLI { ++ if p.From.Type != obj.TYPE_CONST { ++ p.Ctxt.Diag("%v: expected immediate value", p) ++ } ++ ins.rs1 = uint32(p.From.Offset) ++ } ++ + } + + for _, ins := range inss { +diff --git a/src/cmd/internal/obj/util.go b/src/cmd/internal/obj/util.go +index 3a071c21d4..367535f863 100644 +--- a/src/cmd/internal/obj/util.go ++++ b/src/cmd/internal/obj/util.go +@@ -584,6 +584,13 @@ type spcSet struct { + + var spcSpace []spcSet + ++// Each architecture is allotted a distinct subspace: [Lo, Hi) for declaring its ++// arch-specific special operands. ++const ( ++ SpecialOperandARM64Base = 0 << 16 ++ SpecialOperandRISCVBase = 1 << 16 ++) ++ + // RegisterSpecialOperands binds a pretty-printer (SPCconv) for special + // operand numbers to a given special operand number range. Lo is inclusive, + // hi is exclusive (valid special operands are lo through hi-1). +-- +2.39.5 + diff --git a/2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch b/2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch new file mode 100644 index 0000000..1456ee5 --- /dev/null +++ b/2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch @@ -0,0 +1,160 @@ +From e4c46374f5cbd543dccfa0fb346503a7a46a34ef Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 070/119] internal/bytealg: clean up and simplify the riscv64 + equal implementation + +Now that riscv64 is only regabi, remove the entrypoint separation and +have runtime.memequal_varlen call runtime.memequal. Add a zero byte +length check and replace the equal and not equal exit paths with a +single exit path that conditions on length reaching zero. 
+ +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Change-Id: Ida4e54378daa7fd423f759753eba04ce513a27cb +Reviewed-on: https://go-review.googlesource.com/c/go/+/648855 +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Michael Knyszek +Reviewed-by: Cherry Mui +--- + src/internal/bytealg/equal_riscv64.s | 62 +++++++++++++--------------- + 1 file changed, 29 insertions(+), 33 deletions(-) + +diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s +index 7f470ce0a0..87b2d79302 100644 +--- a/src/internal/bytealg/equal_riscv64.s ++++ b/src/internal/bytealg/equal_riscv64.s +@@ -7,25 +7,23 @@ + + #define CTXT S10 + +-// func memequal(a, b unsafe.Pointer, size uintptr) bool +-TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 +- // X10 = a_base +- // X11 = b_base +- // X12 = size +- JMP memequal<>(SB) +- + // func memequal_varlen(a, b unsafe.Pointer) bool + TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17 ++ // X10 = a_base ++ // X11 = b_base + MOV 8(CTXT), X12 // compiler stores size at offset 8 in the closure ++ JMP runtime·memequal(SB) ++ ++// func memequal(a, b unsafe.Pointer, size uintptr) bool ++TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 + // X10 = a_base + // X11 = b_base +- JMP memequal<>(SB) ++ // X12 = size ++ BNE X10, X11, length_check ++ MOV $0, X12 + +-// On entry X10 and X11 contain pointers, X12 contains length. +-// For non-regabi X13 contains address for return value. +-// For regabi return value in X10. +-TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0 +- BEQ X10, X11, eq ++length_check: ++ BEQZ X12, done + + MOV $32, X23 + BLT X12, X23, loop4_check +@@ -44,7 +42,7 @@ align: + SUB $1, X9 + MOVBU 0(X10), X19 + MOVBU 0(X11), X20 +- BNE X19, X20, not_eq ++ BNE X19, X20, done + ADD $1, X10 + ADD $1, X11 + BNEZ X9, align +@@ -57,19 +55,19 @@ loop32: + MOV 0(X11), X20 + MOV 8(X10), X21 + MOV 8(X11), X22 +- BNE X19, X20, not_eq +- BNE X21, X22, not_eq ++ BNE X19, X20, done ++ BNE X21, X22, done + MOV 16(X10), X14 + MOV 16(X11), X15 + MOV 24(X10), X16 + MOV 24(X11), X17 +- BNE X14, X15, not_eq +- BNE X16, X17, not_eq ++ BNE X14, X15, done ++ BNE X16, X17, done + ADD $32, X10 + ADD $32, X11 + SUB $32, X12 + BGE X12, X9, loop32 +- BEQZ X12, eq ++ BEQZ X12, done + + loop16_check: + MOV $16, X23 +@@ -79,13 +77,13 @@ loop16: + MOV 0(X11), X20 + MOV 8(X10), X21 + MOV 8(X11), X22 +- BNE X19, X20, not_eq +- BNE X21, X22, not_eq ++ BNE X19, X20, done ++ BNE X21, X22, done + ADD $16, X10 + ADD $16, X11 + SUB $16, X12 + BGE X12, X23, loop16 +- BEQZ X12, eq ++ BEQZ X12, done + + loop4_check: + MOV $4, X23 +@@ -95,32 +93,30 @@ loop4: + MOVBU 0(X11), X20 + MOVBU 1(X10), X21 + MOVBU 1(X11), X22 +- BNE X19, X20, not_eq +- BNE X21, X22, not_eq ++ BNE X19, X20, done ++ BNE X21, X22, done + MOVBU 2(X10), X14 + MOVBU 2(X11), X15 + MOVBU 3(X10), X16 + MOVBU 3(X11), X17 +- BNE X14, X15, not_eq +- BNE X16, X17, not_eq ++ BNE X14, X15, done ++ BNE X16, X17, done + ADD $4, X10 + ADD $4, X11 + SUB $4, X12 + BGE X12, X23, loop4 + + loop1: +- BEQZ X12, eq ++ BEQZ X12, done + MOVBU 0(X10), X19 + MOVBU 0(X11), X20 +- BNE X19, X20, not_eq ++ BNE X19, X20, done + ADD $1, X10 + ADD $1, X11 + SUB $1, X12 + JMP loop1 + +-not_eq: +- MOVB ZERO, X10 +- RET +-eq: +- MOV $1, X10 ++done: ++ // If X12 is zero then memory is equivalent. 
++ SEQZ X12, X10 + RET +-- +2.39.5 + diff --git a/2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch b/2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch new file mode 100644 index 0000000..31903aa --- /dev/null +++ b/2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch @@ -0,0 +1,126 @@ +From fd8c0c4bd2cbc86ae57f517398792416f7a497c3 Mon Sep 17 00:00:00 2001 +From: Jes Cok +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 071/119] =?UTF-8?q?bytes,internal/bytealg:=20eliminate=20H?= + =?UTF-8?q?ashStrBytes,HashStrRevBytes=20using=20=E2=80=A6?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +…generics + +The logic of HashStrBytes, HashStrRevBytes and HashStr, HashStrRev, +are exactly the same, except that the types are different. + +Since the bootstrap toolchain is bumped to 1.20, we can eliminate them +by using generics. + +Change-Id: I4336b1cab494ba963f09646c169b45f6b1ee62e3 +GitHub-Last-Rev: b11a2bf9476d54bed4bd18a3f9269b5c95a66d67 +GitHub-Pull-Request: golang/go#63766 +Reviewed-on: https://go-review.googlesource.com/c/go/+/538175 +Reviewed-by: Keith Randall +Reviewed-by: David Chase +Reviewed-by: Keith Randall +Auto-Submit: Keith Randall +LUCI-TryBot-Result: Go LUCI +--- + src/bytes/bytes.go | 2 +- + src/internal/bytealg/bytealg.go | 46 +++++---------------------------- + 2 files changed, 7 insertions(+), 41 deletions(-) + +diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go +index c54e52e4fc..c662b1cae6 100644 +--- a/src/bytes/bytes.go ++++ b/src/bytes/bytes.go +@@ -122,7 +122,7 @@ func LastIndex(s, sep []byte) int { + return -1 + } + // Rabin-Karp search from the end of the string +- hashss, pow := bytealg.HashStrRevBytes(sep) ++ hashss, pow := bytealg.HashStrRev(sep) + last := len(s) - n + var h uint32 + for i := len(s) - 1; i >= last; i-- { +diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go +index 28f2742c0e..ae4b8b48d2 100644 +--- a/src/internal/bytealg/bytealg.go ++++ b/src/internal/bytealg/bytealg.go +@@ -24,33 +24,16 @@ const ( + // If MaxLen is not 0, make sure MaxLen >= 4. + var MaxLen int + +-// FIXME: the logic of HashStrBytes, HashStrRevBytes, IndexRabinKarpBytes and HashStr, HashStrRev, +-// IndexRabinKarp are exactly the same, except that the types are different. Can we eliminate +-// three of them without causing allocation? ++// FIXME: the logic of IndexRabinKarpBytes and IndexRabinKarp are exactly the same, ++// except that the types are different. ++// Can we eliminate one of them without causing allocation? + + // PrimeRK is the prime base used in Rabin-Karp algorithm. + const PrimeRK = 16777619 + +-// HashStrBytes returns the hash and the appropriate multiplicative +-// factor for use in Rabin-Karp algorithm. +-func HashStrBytes(sep []byte) (uint32, uint32) { +- hash := uint32(0) +- for i := 0; i < len(sep); i++ { +- hash = hash*PrimeRK + uint32(sep[i]) +- } +- var pow, sq uint32 = 1, PrimeRK +- for i := len(sep); i > 0; i >>= 1 { +- if i&1 != 0 { +- pow *= sq +- } +- sq *= sq +- } +- return hash, pow +-} +- + // HashStr returns the hash and the appropriate multiplicative + // factor for use in Rabin-Karp algorithm. 
+-func HashStr(sep string) (uint32, uint32) { ++func HashStr[T string | []byte](sep T) (uint32, uint32) { + hash := uint32(0) + for i := 0; i < len(sep); i++ { + hash = hash*PrimeRK + uint32(sep[i]) +@@ -65,26 +48,9 @@ func HashStr(sep string) (uint32, uint32) { + return hash, pow + } + +-// HashStrRevBytes returns the hash of the reverse of sep and the +-// appropriate multiplicative factor for use in Rabin-Karp algorithm. +-func HashStrRevBytes(sep []byte) (uint32, uint32) { +- hash := uint32(0) +- for i := len(sep) - 1; i >= 0; i-- { +- hash = hash*PrimeRK + uint32(sep[i]) +- } +- var pow, sq uint32 = 1, PrimeRK +- for i := len(sep); i > 0; i >>= 1 { +- if i&1 != 0 { +- pow *= sq +- } +- sq *= sq +- } +- return hash, pow +-} +- + // HashStrRev returns the hash of the reverse of sep and the + // appropriate multiplicative factor for use in Rabin-Karp algorithm. +-func HashStrRev(sep string) (uint32, uint32) { ++func HashStrRev[T string | []byte](sep T) (uint32, uint32) { + hash := uint32(0) + for i := len(sep) - 1; i >= 0; i-- { + hash = hash*PrimeRK + uint32(sep[i]) +@@ -103,7 +69,7 @@ func HashStrRev(sep string) (uint32, uint32) { + // first occurrence of substr in s, or -1 if not present. + func IndexRabinKarpBytes(s, sep []byte) int { + // Rabin-Karp search +- hashsep, pow := HashStrBytes(sep) ++ hashsep, pow := HashStr(sep) + n := len(sep) + var h uint32 + for i := 0; i < n; i++ { +-- +2.39.5 + diff --git a/2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch b/2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch new file mode 100644 index 0000000..6b0afd7 --- /dev/null +++ b/2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch @@ -0,0 +1,539 @@ +From d2e44d94537e9ee96d9b5909f575ae9eb15422d7 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 072/119] cmd/internal/obj/riscv: implement vector load/store + instructions + +Implement vector unit stride, vector strided, vector indexed and +vector whole register load and store instructions. + +The vector unit stride instructions take an optional vector mask +register, which if specified must be register V0. If only two +operands are given, the instruction is encoded as unmasked. + +The vector strided and vector indexed instructions also take an +optional vector mask register, which if specified must be register +V0. If only three operands are given, the instruction is encoded as +unmasked. 
+ +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Change-Id: I35e43bb8f1cf6ae8826fbeec384b95ac945da50f +Reviewed-on: https://go-review.googlesource.com/c/go/+/631937 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Michael Knyszek +Reviewed-by: Meng Zhuo +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Pengcheng Wang +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 98 ++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 39 +++ + src/cmd/internal/obj/riscv/anames.go | 4 + + src/cmd/internal/obj/riscv/cpu.go | 4 + + src/cmd/internal/obj/riscv/obj.go | 226 ++++++++++++++++-- + 5 files changed, 356 insertions(+), 15 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index aba7a80007..49f3ac00f3 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -448,6 +448,104 @@ start: + VSETIVLI $31, E32, M1, TA, MA, X12 // 57f60fcd + VSETVL X10, X11, X12 // 57f6a580 + ++ // 31.7.4: Vector Unit-Stride Instructions ++ VLE8V (X10), V3 // 87010502 ++ VLE8V (X10), V0, V3 // 87010500 ++ VLE16V (X10), V3 // 87510502 ++ VLE16V (X10), V0, V3 // 87510500 ++ VLE32V (X10), V3 // 87610502 ++ VLE32V (X10), V0, V3 // 87610500 ++ VLE64V (X10), V3 // 87710502 ++ VLE64V (X10), V0, V3 // 87710500 ++ VSE8V V3, (X10) // a7010502 ++ VSE8V V3, V0, (X10) // a7010500 ++ VSE16V V3, (X10) // a7510502 ++ VSE16V V3, V0, (X10) // a7510500 ++ VSE32V V3, (X10) // a7610502 ++ VSE32V V3, V0, (X10) // a7610500 ++ VSE64V V3, (X10) // a7710502 ++ VSE64V V3, V0, (X10) // a7710500 ++ VLMV (X10), V3 // 8701b502 ++ VSMV V3, (X10) // a701b502 ++ ++ // 31.7.5: Vector Strided Instructions ++ VLSE8V (X10), X11, V3 // 8701b50a ++ VLSE8V (X10), X11, V0, V3 // 8701b508 ++ VLSE16V (X10), X11, V3 // 8751b50a ++ VLSE16V (X10), X11, V0, V3 // 8751b508 ++ VLSE32V (X10), X11, V3 // 8761b50a ++ VLSE32V (X10), X11, V0, V3 // 8761b508 ++ VLSE64V (X10), X11, V3 // 8771b50a ++ VLSE64V (X10), X11, V0, V3 // 8771b508 ++ VSSE8V V3, X11, (X10) // a701b50a ++ VSSE8V V3, X11, V0, (X10) // a701b508 ++ VSSE16V V3, X11, (X10) // a751b50a ++ VSSE16V V3, X11, V0, (X10) // a751b508 ++ VSSE32V V3, X11, (X10) // a761b50a ++ VSSE32V V3, X11, V0, (X10) // a761b508 ++ VSSE64V V3, X11, (X10) // a771b50a ++ VSSE64V V3, X11, V0, (X10) // a771b508 ++ ++ // 31.7.6: Vector Indexed Instructions ++ VLUXEI8V (X10), V2, V3 // 87012506 ++ VLUXEI8V (X10), V2, V0, V3 // 87012504 ++ VLUXEI16V (X10), V2, V3 // 87512506 ++ VLUXEI16V (X10), V2, V0, V3 // 87512504 ++ VLUXEI32V (X10), V2, V3 // 87612506 ++ VLUXEI32V (X10), V2, V0, V3 // 87612504 ++ VLUXEI64V (X10), V2, V3 // 87712506 ++ VLUXEI64V (X10), V2, V0, V3 // 87712504 ++ VLOXEI8V (X10), V2, V3 // 8701250e ++ VLOXEI8V (X10), V2, V0, V3 // 8701250c ++ VLOXEI16V (X10), V2, V3 // 8751250e ++ VLOXEI16V (X10), V2, V0, V3 // 8751250c ++ VLOXEI32V (X10), V2, V3 // 8761250e ++ VLOXEI32V (X10), V2, V0, V3 // 8761250c ++ VLOXEI64V (X10), V2, V3 // 8771250e ++ VLOXEI64V (X10), V2, V0, V3 // 8771250c ++ VSUXEI8V V3, V2, (X10) // a7012506 ++ VSUXEI8V V3, V2, V0, (X10) // a7012504 ++ VSUXEI16V V3, V2, (X10) // a7512506 ++ VSUXEI16V V3, V2, V0, (X10) // a7512504 ++ VSUXEI32V V3, V2, (X10) // a7612506 ++ VSUXEI32V V3, V2, V0, (X10) // a7612504 ++ VSUXEI64V V3, V2, (X10) // a7712506 ++ VSUXEI64V V3, V2, V0, (X10) // a7712504 ++ VSOXEI8V V3, V2, (X10) // a701250e ++ VSOXEI8V V3, V2, V0, (X10) // a701250c ++ VSOXEI16V V3, V2, (X10) // a751250e ++ VSOXEI16V V3, V2, V0, (X10) // a751250c ++ VSOXEI32V 
V3, V2, (X10) // a761250e ++ VSOXEI32V V3, V2, V0, (X10) // a761250c ++ VSOXEI64V V3, V2, (X10) // a771250e ++ VSOXEI64V V3, V2, V0, (X10) // a771250c ++ ++ // 31.7.9: Vector Load/Store Whole Register Instructions ++ VL1RV (X10), V3 // 87018502 ++ VL1RE8V (X10), V3 // 87018502 ++ VL1RE16V (X10), V3 // 87518502 ++ VL1RE32V (X10), V3 // 87618502 ++ VL1RE64V (X10), V3 // 87718502 ++ VL2RV (X10), V2 // 07018522 ++ VL2RE8V (X10), V2 // 07018522 ++ VL2RE16V (X10), V2 // 07518522 ++ VL2RE32V (X10), V2 // 07618522 ++ VL2RE64V (X10), V2 // 07718522 ++ VL4RV (X10), V4 // 07028562 ++ VL4RE8V (X10), V4 // 07028562 ++ VL4RE16V (X10), V4 // 07528562 ++ VL4RE32V (X10), V4 // 07628562 ++ VL4RE64V (X10), V4 // 07728562 ++ VL8RV (X10), V8 // 070485e2 ++ VL8RE8V (X10), V8 // 070485e2 ++ VL8RE16V (X10), V8 // 075485e2 ++ VL8RE32V (X10), V8 // 076485e2 ++ VL8RE64V (X10), V8 // 077485e2 ++ VS1RV V3, (X11) // a7818502 ++ VS2RV V2, (X11) // 27818522 ++ VS4RV V4, (X11) // 27828562 ++ VS8RV V8, (X11) // 278485e2 ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index a90f22af9f..82a2348894 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -50,4 +50,43 @@ TEXT errors(SB),$0 + VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" + VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" + VSETVL X10, X11 // ERROR "expected integer register in rs1 position" ++ VLE8V (X10), X10 // ERROR "expected vector register in rd position" ++ VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" ++ VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" ++ VSE8V X10, (X10) // ERROR "expected vector register in rs1 position" ++ VSE8V V3, (V1) // ERROR "expected integer register in rd position" ++ VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" ++ VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" ++ VLSE8V (X10), X10, X11 // ERROR "expected vector register in rd position" ++ VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" ++ VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" ++ VLSE8V (X10), X10, V1, V3 // ERROR "invalid vector mask register" ++ VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" ++ VSSE8V X10, X11, (X10) // ERROR "expected vector register in rd position" ++ VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" ++ VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" ++ VSSE8V V3, X11, V1, (X10) // ERROR "invalid vector mask register" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" ++ VLUXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" ++ VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" ++ VSUXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" ++ VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLOXEI8V (V1), V2, V3 // ERROR "expected integer 
register in rs1 position" ++ VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" ++ VLOXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" ++ VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" ++ VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" ++ VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" ++ VL1RV (X10), X10 // ERROR "expected vector register in rd position" ++ VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" ++ VS1RV V3, V0, (X11) // ERROR "too many operands for instruction" ++ VS1RV X11, (X11) // ERROR "expected vector register in rs1 position" ++ VS1RV V3, (V1) // ERROR "expected integer register in rd position" + RET +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index c49569c943..6df5f0a173 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -650,5 +650,9 @@ var Anames = []string{ + "RDTIME", + "SEQZ", + "SNEZ", ++ "VL1RV", ++ "VL2RV", ++ "VL4RV", ++ "VL8RV", + "LAST", + } +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 9b88ff2ccd..8999ef149b 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1168,6 +1168,10 @@ const ( + ARDTIME + ASEQZ + ASNEZ ++ AVL1RV ++ AVL2RV ++ AVL4RV ++ AVL8RV + + // End marker + ALAST +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 3d1c120baa..a558dc3596 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1213,6 +1213,27 @@ func validateIF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateIIIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateIVIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateSI(ctxt *obj.Link, ins *instruction) { + wantImmI(ctxt, ins, ins.imm, 12) + wantIntReg(ctxt, ins, "rd", ins.rd) +@@ -1229,6 +1250,27 @@ func validateSF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateSV(ctxt *obj.Link, ins *instruction) { ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateSVII(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateSVIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func 
validateB(ctxt *obj.Link, ins *instruction) { + // Offsets are multiples of two, so accept 13 bit immediates for the + // 12 bit slot. We implicitly drop the least significant bit in encodeB. +@@ -1305,7 +1347,10 @@ func encodeR(as obj.As, rs1, rs2, rd, funct3, funct7 uint32) uint32 { + if enc.rs2 != 0 && rs2 != 0 { + panic("encodeR: instruction uses rs2, but rs2 was nonzero") + } +- return funct7<<25 | enc.funct7<<25 | enc.rs2<<20 | rs2<<20 | rs1<<15 | enc.funct3<<12 | funct3<<12 | rd<<7 | enc.opcode ++ funct3 |= enc.funct3 ++ funct7 |= enc.funct7 ++ rs2 |= enc.rs2 ++ return funct7<<25 | rs2<<20 | rs1<<15 | funct3<<12 | rd<<7 | enc.opcode + } + + // encodeR4 encodes an R4-type RISC-V instruction. +@@ -1357,38 +1402,67 @@ func encodeRFF(ins *instruction) uint32 { + } + + // encodeI encodes an I-type RISC-V instruction. +-func encodeI(as obj.As, rs1, rd, imm uint32) uint32 { ++func encodeI(as obj.As, rs1, rd, imm, funct7 uint32) uint32 { + enc := encode(as) + if enc == nil { + panic("encodeI: could not encode instruction") + } + imm |= uint32(enc.csr) +- return imm<<20 | rs1<<15 | enc.funct3<<12 | rd<<7 | enc.opcode ++ return funct7<<25 | imm<<20 | rs1<<15 | enc.funct3<<12 | rd<<7 | enc.opcode + } + + func encodeIII(ins *instruction) uint32 { +- return encodeI(ins.as, regI(ins.rs1), regI(ins.rd), uint32(ins.imm)) ++ return encodeI(ins.as, regI(ins.rs1), regI(ins.rd), uint32(ins.imm), 0) + } + + func encodeIF(ins *instruction) uint32 { +- return encodeI(ins.as, regI(ins.rs1), regF(ins.rd), uint32(ins.imm)) ++ return encodeI(ins.as, regI(ins.rs1), regF(ins.rd), uint32(ins.imm), 0) ++} ++ ++func encodeIV(ins *instruction) uint32 { ++ return encodeI(ins.as, regI(ins.rs1), regV(ins.rd), uint32(ins.imm), ins.funct7) ++} ++ ++func encodeIIIV(ins *instruction) uint32 { ++ return encodeI(ins.as, regI(ins.rs1), regV(ins.rd), regI(ins.rs2), ins.funct7) ++} ++ ++func encodeIVIV(ins *instruction) uint32 { ++ return encodeI(ins.as, regI(ins.rs1), regV(ins.rd), regV(ins.rs2), ins.funct7) + } + + // encodeS encodes an S-type RISC-V instruction. +-func encodeS(as obj.As, rs1, rs2, imm uint32) uint32 { ++func encodeS(as obj.As, rs1, rs2, imm, funct7 uint32) uint32 { + enc := encode(as) + if enc == nil { + panic("encodeS: could not encode instruction") + } +- return (imm>>5)<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | (imm&0x1f)<<7 | enc.opcode ++ if enc.rs2 != 0 && rs2 != 0 { ++ panic("encodeS: instruction uses rs2, but rs2 was nonzero") ++ } ++ rs2 |= enc.rs2 ++ imm |= uint32(enc.csr) &^ 0x1f ++ return funct7<<25 | (imm>>5)<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | (imm&0x1f)<<7 | enc.opcode + } + + func encodeSI(ins *instruction) uint32 { +- return encodeS(ins.as, regI(ins.rd), regI(ins.rs1), uint32(ins.imm)) ++ return encodeS(ins.as, regI(ins.rd), regI(ins.rs1), uint32(ins.imm), 0) + } + + func encodeSF(ins *instruction) uint32 { +- return encodeS(ins.as, regI(ins.rd), regF(ins.rs1), uint32(ins.imm)) ++ return encodeS(ins.as, regI(ins.rd), regF(ins.rs1), uint32(ins.imm), 0) ++} ++ ++func encodeSV(ins *instruction) uint32 { ++ return encodeS(ins.as, regI(ins.rd), 0, regV(ins.rs1), ins.funct7) ++} ++ ++func encodeSVII(ins *instruction) uint32 { ++ return encodeS(ins.as, regI(ins.rs1), regI(ins.rs2), regV(ins.rd), ins.funct7) ++} ++ ++func encodeSVIV(ins *instruction) uint32 { ++ return encodeS(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct7) + } + + // encodeBImmediate encodes an immediate for a B-type RISC-V instruction. +@@ -1595,7 +1669,7 @@ var ( + // + // 1. 
the instruction encoding (R/I/S/B/U/J), in lowercase + // 2. zero or more register operand identifiers (I = integer +- // register, F = float register), in uppercase ++ // register, F = float register, V = vector register), in uppercase + // 3. the word "Encoding" + // + // For example, rIIIEncoding indicates an R-type instruction with two +@@ -1611,11 +1685,17 @@ var ( + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} + rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} + +- iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4} +- iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} ++ iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4} ++ iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} ++ iVEncoding = encoding{encode: encodeIV, validate: validateIV, length: 4} ++ iIIVEncoding = encoding{encode: encodeIIIV, validate: validateIIIV, length: 4} ++ iVIVEncoding = encoding{encode: encodeIVIV, validate: validateIVIV, length: 4} + +- sIEncoding = encoding{encode: encodeSI, validate: validateSI, length: 4} +- sFEncoding = encoding{encode: encodeSF, validate: validateSF, length: 4} ++ sIEncoding = encoding{encode: encodeSI, validate: validateSI, length: 4} ++ sFEncoding = encoding{encode: encodeSF, validate: validateSF, length: 4} ++ sVEncoding = encoding{encode: encodeSV, validate: validateSV, length: 4} ++ sVIIEncoding = encoding{encode: encodeSVII, validate: validateSVII, length: 4} ++ sVIVEncoding = encoding{encode: encodeSVIV, validate: validateSVIV, length: 4} + + bEncoding = encoding{encode: encodeB, validate: validateB, length: 4} + uEncoding = encoding{encode: encodeU, validate: validateU, length: 4} +@@ -1896,11 +1976,73 @@ var instructions = [ALAST & obj.AMask]instructionData{ + // "V" Standard Extension for Vector Operations, Version 1.0 + // + +- // 31.6. 
Vector Configuration-Setting Instructions ++ // 31.6: Vector Configuration-Setting Instructions + AVSETVLI & obj.AMask: {enc: vsetvliEncoding, immForm: AVSETIVLI}, + AVSETIVLI & obj.AMask: {enc: vsetivliEncoding}, + AVSETVL & obj.AMask: {enc: vsetvlEncoding}, + ++ // 31.7.4: Vector Unit-Stride Instructions ++ AVLE8V & obj.AMask: {enc: iVEncoding}, ++ AVLE16V & obj.AMask: {enc: iVEncoding}, ++ AVLE32V & obj.AMask: {enc: iVEncoding}, ++ AVLE64V & obj.AMask: {enc: iVEncoding}, ++ AVSE8V & obj.AMask: {enc: sVEncoding}, ++ AVSE16V & obj.AMask: {enc: sVEncoding}, ++ AVSE32V & obj.AMask: {enc: sVEncoding}, ++ AVSE64V & obj.AMask: {enc: sVEncoding}, ++ AVLMV & obj.AMask: {enc: iVEncoding}, ++ AVSMV & obj.AMask: {enc: sVEncoding}, ++ ++ // 31.7.5: Vector Strided Instructions ++ AVLSE8V & obj.AMask: {enc: iIIVEncoding}, ++ AVLSE16V & obj.AMask: {enc: iIIVEncoding}, ++ AVLSE32V & obj.AMask: {enc: iIIVEncoding}, ++ AVLSE64V & obj.AMask: {enc: iIIVEncoding}, ++ AVSSE8V & obj.AMask: {enc: sVIIEncoding}, ++ AVSSE16V & obj.AMask: {enc: sVIIEncoding}, ++ AVSSE32V & obj.AMask: {enc: sVIIEncoding}, ++ AVSSE64V & obj.AMask: {enc: sVIIEncoding}, ++ ++ // 31.7.6: Vector Indexed Instructions ++ AVLUXEI8V & obj.AMask: {enc: iVIVEncoding}, ++ AVLUXEI16V & obj.AMask: {enc: iVIVEncoding}, ++ AVLUXEI32V & obj.AMask: {enc: iVIVEncoding}, ++ AVLUXEI64V & obj.AMask: {enc: iVIVEncoding}, ++ AVLOXEI8V & obj.AMask: {enc: iVIVEncoding}, ++ AVLOXEI16V & obj.AMask: {enc: iVIVEncoding}, ++ AVLOXEI32V & obj.AMask: {enc: iVIVEncoding}, ++ AVLOXEI64V & obj.AMask: {enc: iVIVEncoding}, ++ AVSUXEI8V & obj.AMask: {enc: sVIVEncoding}, ++ AVSUXEI16V & obj.AMask: {enc: sVIVEncoding}, ++ AVSUXEI32V & obj.AMask: {enc: sVIVEncoding}, ++ AVSUXEI64V & obj.AMask: {enc: sVIVEncoding}, ++ AVSOXEI8V & obj.AMask: {enc: sVIVEncoding}, ++ AVSOXEI16V & obj.AMask: {enc: sVIVEncoding}, ++ AVSOXEI32V & obj.AMask: {enc: sVIVEncoding}, ++ AVSOXEI64V & obj.AMask: {enc: sVIVEncoding}, ++ ++ // 31.7.9. 
Vector Load/Store Whole Register Instructions ++ AVL1RE8V & obj.AMask: {enc: iVEncoding}, ++ AVL1RE16V & obj.AMask: {enc: iVEncoding}, ++ AVL1RE32V & obj.AMask: {enc: iVEncoding}, ++ AVL1RE64V & obj.AMask: {enc: iVEncoding}, ++ AVL2RE8V & obj.AMask: {enc: iVEncoding}, ++ AVL2RE16V & obj.AMask: {enc: iVEncoding}, ++ AVL2RE32V & obj.AMask: {enc: iVEncoding}, ++ AVL2RE64V & obj.AMask: {enc: iVEncoding}, ++ AVL4RE8V & obj.AMask: {enc: iVEncoding}, ++ AVL4RE16V & obj.AMask: {enc: iVEncoding}, ++ AVL4RE32V & obj.AMask: {enc: iVEncoding}, ++ AVL4RE64V & obj.AMask: {enc: iVEncoding}, ++ AVL8RE8V & obj.AMask: {enc: iVEncoding}, ++ AVL8RE16V & obj.AMask: {enc: iVEncoding}, ++ AVL8RE32V & obj.AMask: {enc: iVEncoding}, ++ AVL8RE64V & obj.AMask: {enc: iVEncoding}, ++ AVS1RV & obj.AMask: {enc: sVEncoding}, ++ AVS2RV & obj.AMask: {enc: sVEncoding}, ++ AVS4RV & obj.AMask: {enc: sVEncoding}, ++ AVS8RV & obj.AMask: {enc: sVEncoding}, ++ + // + // Privileged ISA + // +@@ -2723,6 +2865,60 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.rs1 = uint32(p.From.Offset) + } + ++ case AVLE8V, AVLE16V, AVLE32V, AVLE64V, AVSE8V, AVSE16V, AVSE32V, AVSE64V, AVLMV, AVSMV: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AVLSE8V, AVLSE16V, AVLSE32V, AVLSE64V, ++ AVLUXEI8V, AVLUXEI16V, AVLUXEI32V, AVLUXEI64V, AVLOXEI8V, AVLOXEI16V, AVLOXEI32V, AVLOXEI64V: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rs1, ins.rs2, ins.rs3 = ins.rs2, ins.rs1, obj.REG_NONE ++ ++ case AVSSE8V, AVSSE16V, AVSSE32V, AVSSE64V, ++ AVSUXEI8V, AVSUXEI16V, AVSUXEI32V, AVSUXEI64V, AVSOXEI8V, AVSOXEI16V, AVSOXEI32V, AVSOXEI64V: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = ins.rs2, ins.rd, ins.rs1, obj.REG_NONE ++ ++ case AVL1RV, AVL1RE8V, AVL1RE16V, AVL1RE32V, AVL1RE64V, AVL2RV, AVL2RE8V, AVL2RE16V, AVL2RE32V, AVL2RE64V, ++ AVL4RV, AVL4RE8V, AVL4RE16V, AVL4RE32V, AVL4RE64V, AVL8RV, AVL8RE8V, AVL8RE16V, AVL8RE32V, AVL8RE64V: ++ switch ins.as { ++ case AVL1RV: ++ ins.as = AVL1RE8V ++ case AVL2RV: ++ ins.as = AVL2RE8V ++ case AVL4RV: ++ ins.as = AVL4RE8V ++ case AVL8RV: ++ ins.as = AVL8RE8V ++ } ++ if ins.rs1 != obj.REG_NONE { ++ p.Ctxt.Diag("%v: too many operands for instruction", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AVS1RV, AVS2RV, AVS4RV, AVS8RV: ++ if ins.rs1 != obj.REG_NONE { ++ p.Ctxt.Diag("%v: too many operands for instruction", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch b/2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch new file mode 100644 index 0000000..c854f00 --- /dev/null +++ b/2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch @@ -0,0 +1,363 @@ +From 5fa2cbd247ff2acebe9a8655ad19814b2c40af4d Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 073/119] cmd/internal/obj/riscv: add riscv64 CSR map + +The map is automatically 
generated by running the latest version of +parse.py from github.com/riscv/riscv-opcodes. + +Change-Id: I05e00ab27ec583750752c25e1835c2578b339fbf +Reviewed-on: https://go-review.googlesource.com/c/go/+/630518 +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Pengcheng Wang +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: Michael Pratt +--- + src/cmd/internal/obj/riscv/inst.go | 332 +++++++++++++++++++++++++++++ + 1 file changed, 332 insertions(+) + +diff --git a/src/cmd/internal/obj/riscv/inst.go b/src/cmd/internal/obj/riscv/inst.go +index 2d9132e532..5ee5bda361 100644 +--- a/src/cmd/internal/obj/riscv/inst.go ++++ b/src/cmd/internal/obj/riscv/inst.go +@@ -1229,3 +1229,335 @@ func encode(a obj.As) *inst { + } + return nil + } ++ ++var csrs = map[uint16]string{ ++ 0x1: "FFLAGS", ++ 0x2: "FRM", ++ 0x3: "FCSR", ++ 0x7: "UTVT", ++ 0x8: "VSTART", ++ 0x9: "VXSAT", ++ 0xa: "VXRM", ++ 0xf: "VCSR", ++ 0x11: "SSP", ++ 0x15: "SEED", ++ 0x17: "JVT", ++ 0x45: "UNXTI", ++ 0x46: "UINTSTATUS", ++ 0x48: "USCRATCHCSW", ++ 0x49: "USCRATCHCSWL", ++ 0x100: "SSTATUS", ++ 0x102: "SEDELEG", ++ 0x103: "SIDELEG", ++ 0x104: "SIE", ++ 0x105: "STVEC", ++ 0x106: "SCOUNTEREN", ++ 0x107: "STVT", ++ 0x10a: "SENVCFG", ++ 0x10c: "SSTATEEN0", ++ 0x10d: "SSTATEEN1", ++ 0x10e: "SSTATEEN2", ++ 0x10f: "SSTATEEN3", ++ 0x120: "SCOUNTINHIBIT", ++ 0x140: "SSCRATCH", ++ 0x141: "SEPC", ++ 0x142: "SCAUSE", ++ 0x143: "STVAL", ++ 0x144: "SIP", ++ 0x145: "SNXTI", ++ 0x146: "SINTSTATUS", ++ 0x148: "SSCRATCHCSW", ++ 0x149: "SSCRATCHCSWL", ++ 0x14d: "STIMECMP", ++ 0x14e: "SCTRCTL", ++ 0x14f: "SCTRSTATUS", ++ 0x150: "SISELECT", ++ 0x151: "SIREG", ++ 0x152: "SIREG2", ++ 0x153: "SIREG3", ++ 0x155: "SIREG4", ++ 0x156: "SIREG5", ++ 0x157: "SIREG6", ++ 0x15c: "STOPEI", ++ 0x15f: "SCTRDEPTH", ++ 0x180: "SATP", ++ 0x181: "SRMCFG", ++ 0x200: "VSSTATUS", ++ 0x204: "VSIE", ++ 0x205: "VSTVEC", ++ 0x240: "VSSCRATCH", ++ 0x241: "VSEPC", ++ 0x242: "VSCAUSE", ++ 0x243: "VSTVAL", ++ 0x244: "VSIP", ++ 0x24d: "VSTIMECMP", ++ 0x24e: "VSCTRCTL", ++ 0x250: "VSISELECT", ++ 0x251: "VSIREG", ++ 0x252: "VSIREG2", ++ 0x253: "VSIREG3", ++ 0x255: "VSIREG4", ++ 0x256: "VSIREG5", ++ 0x257: "VSIREG6", ++ 0x25c: "VSTOPEI", ++ 0x280: "VSATP", ++ 0x300: "MSTATUS", ++ 0x301: "MISA", ++ 0x302: "MEDELEG", ++ 0x303: "MIDELEG", ++ 0x304: "MIE", ++ 0x305: "MTVEC", ++ 0x306: "MCOUNTEREN", ++ 0x307: "MTVT", ++ 0x308: "MVIEN", ++ 0x309: "MVIP", ++ 0x30a: "MENVCFG", ++ 0x30c: "MSTATEEN0", ++ 0x30d: "MSTATEEN1", ++ 0x30e: "MSTATEEN2", ++ 0x30f: "MSTATEEN3", ++ 0x320: "MCOUNTINHIBIT", ++ 0x321: "MCYCLECFG", ++ 0x322: "MINSTRETCFG", ++ 0x323: "MHPMEVENT3", ++ 0x324: "MHPMEVENT4", ++ 0x325: "MHPMEVENT5", ++ 0x326: "MHPMEVENT6", ++ 0x327: "MHPMEVENT7", ++ 0x328: "MHPMEVENT8", ++ 0x329: "MHPMEVENT9", ++ 0x32a: "MHPMEVENT10", ++ 0x32b: "MHPMEVENT11", ++ 0x32c: "MHPMEVENT12", ++ 0x32d: "MHPMEVENT13", ++ 0x32e: "MHPMEVENT14", ++ 0x32f: "MHPMEVENT15", ++ 0x330: "MHPMEVENT16", ++ 0x331: "MHPMEVENT17", ++ 0x332: "MHPMEVENT18", ++ 0x333: "MHPMEVENT19", ++ 0x334: "MHPMEVENT20", ++ 0x335: "MHPMEVENT21", ++ 0x336: "MHPMEVENT22", ++ 0x337: "MHPMEVENT23", ++ 0x338: "MHPMEVENT24", ++ 0x339: "MHPMEVENT25", ++ 0x33a: "MHPMEVENT26", ++ 0x33b: "MHPMEVENT27", ++ 0x33c: "MHPMEVENT28", ++ 0x33d: "MHPMEVENT29", ++ 0x33e: "MHPMEVENT30", ++ 0x33f: "MHPMEVENT31", ++ 0x340: "MSCRATCH", ++ 0x341: "MEPC", ++ 0x342: "MCAUSE", ++ 0x343: "MTVAL", ++ 0x344: "MIP", ++ 0x345: "MNXTI", ++ 0x346: "MINTSTATUS", ++ 0x348: "MSCRATCHCSW", ++ 0x349: "MSCRATCHCSWL", ++ 0x34a: "MTINST", ++ 0x34b: 
"MTVAL2", ++ 0x34e: "MCTRCTL", ++ 0x350: "MISELECT", ++ 0x351: "MIREG", ++ 0x352: "MIREG2", ++ 0x353: "MIREG3", ++ 0x355: "MIREG4", ++ 0x356: "MIREG5", ++ 0x357: "MIREG6", ++ 0x35c: "MTOPEI", ++ 0x3a0: "PMPCFG0", ++ 0x3a1: "PMPCFG1", ++ 0x3a2: "PMPCFG2", ++ 0x3a3: "PMPCFG3", ++ 0x3a4: "PMPCFG4", ++ 0x3a5: "PMPCFG5", ++ 0x3a6: "PMPCFG6", ++ 0x3a7: "PMPCFG7", ++ 0x3a8: "PMPCFG8", ++ 0x3a9: "PMPCFG9", ++ 0x3aa: "PMPCFG10", ++ 0x3ab: "PMPCFG11", ++ 0x3ac: "PMPCFG12", ++ 0x3ad: "PMPCFG13", ++ 0x3ae: "PMPCFG14", ++ 0x3af: "PMPCFG15", ++ 0x3b0: "PMPADDR0", ++ 0x3b1: "PMPADDR1", ++ 0x3b2: "PMPADDR2", ++ 0x3b3: "PMPADDR3", ++ 0x3b4: "PMPADDR4", ++ 0x3b5: "PMPADDR5", ++ 0x3b6: "PMPADDR6", ++ 0x3b7: "PMPADDR7", ++ 0x3b8: "PMPADDR8", ++ 0x3b9: "PMPADDR9", ++ 0x3ba: "PMPADDR10", ++ 0x3bb: "PMPADDR11", ++ 0x3bc: "PMPADDR12", ++ 0x3bd: "PMPADDR13", ++ 0x3be: "PMPADDR14", ++ 0x3bf: "PMPADDR15", ++ 0x3c0: "PMPADDR16", ++ 0x3c1: "PMPADDR17", ++ 0x3c2: "PMPADDR18", ++ 0x3c3: "PMPADDR19", ++ 0x3c4: "PMPADDR20", ++ 0x3c5: "PMPADDR21", ++ 0x3c6: "PMPADDR22", ++ 0x3c7: "PMPADDR23", ++ 0x3c8: "PMPADDR24", ++ 0x3c9: "PMPADDR25", ++ 0x3ca: "PMPADDR26", ++ 0x3cb: "PMPADDR27", ++ 0x3cc: "PMPADDR28", ++ 0x3cd: "PMPADDR29", ++ 0x3ce: "PMPADDR30", ++ 0x3cf: "PMPADDR31", ++ 0x3d0: "PMPADDR32", ++ 0x3d1: "PMPADDR33", ++ 0x3d2: "PMPADDR34", ++ 0x3d3: "PMPADDR35", ++ 0x3d4: "PMPADDR36", ++ 0x3d5: "PMPADDR37", ++ 0x3d6: "PMPADDR38", ++ 0x3d7: "PMPADDR39", ++ 0x3d8: "PMPADDR40", ++ 0x3d9: "PMPADDR41", ++ 0x3da: "PMPADDR42", ++ 0x3db: "PMPADDR43", ++ 0x3dc: "PMPADDR44", ++ 0x3dd: "PMPADDR45", ++ 0x3de: "PMPADDR46", ++ 0x3df: "PMPADDR47", ++ 0x3e0: "PMPADDR48", ++ 0x3e1: "PMPADDR49", ++ 0x3e2: "PMPADDR50", ++ 0x3e3: "PMPADDR51", ++ 0x3e4: "PMPADDR52", ++ 0x3e5: "PMPADDR53", ++ 0x3e6: "PMPADDR54", ++ 0x3e7: "PMPADDR55", ++ 0x3e8: "PMPADDR56", ++ 0x3e9: "PMPADDR57", ++ 0x3ea: "PMPADDR58", ++ 0x3eb: "PMPADDR59", ++ 0x3ec: "PMPADDR60", ++ 0x3ed: "PMPADDR61", ++ 0x3ee: "PMPADDR62", ++ 0x3ef: "PMPADDR63", ++ 0x5a8: "SCONTEXT", ++ 0x600: "HSTATUS", ++ 0x602: "HEDELEG", ++ 0x603: "HIDELEG", ++ 0x604: "HIE", ++ 0x605: "HTIMEDELTA", ++ 0x606: "HCOUNTEREN", ++ 0x607: "HGEIE", ++ 0x608: "HVIEN", ++ 0x609: "HVICTL", ++ 0x60a: "HENVCFG", ++ 0x60c: "HSTATEEN0", ++ 0x60d: "HSTATEEN1", ++ 0x60e: "HSTATEEN2", ++ 0x60f: "HSTATEEN3", ++ 0x643: "HTVAL", ++ 0x644: "HIP", ++ 0x645: "HVIP", ++ 0x646: "HVIPRIO1", ++ 0x647: "HVIPRIO2", ++ 0x64a: "HTINST", ++ 0x680: "HGATP", ++ 0x6a8: "HCONTEXT", ++ 0x747: "MSECCFG", ++ 0x7a0: "TSELECT", ++ 0x7a1: "TDATA1", ++ 0x7a2: "TDATA2", ++ 0x7a3: "TDATA3", ++ 0x7a4: "TINFO", ++ 0x7a5: "TCONTROL", ++ 0x7a8: "MCONTEXT", ++ 0x7aa: "MSCONTEXT", ++ 0x7b0: "DCSR", ++ 0x7b1: "DPC", ++ 0x7b2: "DSCRATCH0", ++ 0x7b3: "DSCRATCH1", ++ 0xb00: "MCYCLE", ++ 0xb02: "MINSTRET", ++ 0xb03: "MHPMCOUNTER3", ++ 0xb04: "MHPMCOUNTER4", ++ 0xb05: "MHPMCOUNTER5", ++ 0xb06: "MHPMCOUNTER6", ++ 0xb07: "MHPMCOUNTER7", ++ 0xb08: "MHPMCOUNTER8", ++ 0xb09: "MHPMCOUNTER9", ++ 0xb0a: "MHPMCOUNTER10", ++ 0xb0b: "MHPMCOUNTER11", ++ 0xb0c: "MHPMCOUNTER12", ++ 0xb0d: "MHPMCOUNTER13", ++ 0xb0e: "MHPMCOUNTER14", ++ 0xb0f: "MHPMCOUNTER15", ++ 0xb10: "MHPMCOUNTER16", ++ 0xb11: "MHPMCOUNTER17", ++ 0xb12: "MHPMCOUNTER18", ++ 0xb13: "MHPMCOUNTER19", ++ 0xb14: "MHPMCOUNTER20", ++ 0xb15: "MHPMCOUNTER21", ++ 0xb16: "MHPMCOUNTER22", ++ 0xb17: "MHPMCOUNTER23", ++ 0xb18: "MHPMCOUNTER24", ++ 0xb19: "MHPMCOUNTER25", ++ 0xb1a: "MHPMCOUNTER26", ++ 0xb1b: "MHPMCOUNTER27", ++ 0xb1c: "MHPMCOUNTER28", ++ 0xb1d: "MHPMCOUNTER29", ++ 0xb1e: "MHPMCOUNTER30", ++ 0xb1f: 
"MHPMCOUNTER31", ++ 0xc00: "CYCLE", ++ 0xc01: "TIME", ++ 0xc02: "INSTRET", ++ 0xc03: "HPMCOUNTER3", ++ 0xc04: "HPMCOUNTER4", ++ 0xc05: "HPMCOUNTER5", ++ 0xc06: "HPMCOUNTER6", ++ 0xc07: "HPMCOUNTER7", ++ 0xc08: "HPMCOUNTER8", ++ 0xc09: "HPMCOUNTER9", ++ 0xc0a: "HPMCOUNTER10", ++ 0xc0b: "HPMCOUNTER11", ++ 0xc0c: "HPMCOUNTER12", ++ 0xc0d: "HPMCOUNTER13", ++ 0xc0e: "HPMCOUNTER14", ++ 0xc0f: "HPMCOUNTER15", ++ 0xc10: "HPMCOUNTER16", ++ 0xc11: "HPMCOUNTER17", ++ 0xc12: "HPMCOUNTER18", ++ 0xc13: "HPMCOUNTER19", ++ 0xc14: "HPMCOUNTER20", ++ 0xc15: "HPMCOUNTER21", ++ 0xc16: "HPMCOUNTER22", ++ 0xc17: "HPMCOUNTER23", ++ 0xc18: "HPMCOUNTER24", ++ 0xc19: "HPMCOUNTER25", ++ 0xc1a: "HPMCOUNTER26", ++ 0xc1b: "HPMCOUNTER27", ++ 0xc1c: "HPMCOUNTER28", ++ 0xc1d: "HPMCOUNTER29", ++ 0xc1e: "HPMCOUNTER30", ++ 0xc1f: "HPMCOUNTER31", ++ 0xc20: "VL", ++ 0xc21: "VTYPE", ++ 0xc22: "VLENB", ++ 0xda0: "SCOUNTOVF", ++ 0xdb0: "STOPI", ++ 0xe12: "HGEIP", ++ 0xeb0: "VSTOPI", ++ 0xf11: "MVENDORID", ++ 0xf12: "MARCHID", ++ 0xf13: "MIMPID", ++ 0xf14: "MHARTID", ++ 0xf15: "MCONFIGPTR", ++ 0xfb0: "MTOPI", ++} +-- +2.39.5 + diff --git a/2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch b/2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch new file mode 100644 index 0000000..a57ede6 --- /dev/null +++ b/2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch @@ -0,0 +1,36 @@ +From 46c2b95b27862604e6ffe206ac68e92b1983fd29 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 074/119] test/codegen: tighten the TrailingZeros64 test on 386 + +Make the TrailingZeros64 code generation check more specific for 386. +Just checking for BSFL will match both the generic 64 bit decomposition +and the custom 386 lowering. + +Change-Id: I62076f1889af0ef1f29704cba01ab419cae0c6e3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/656996 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Keith Randall +Auto-Submit: Keith Randall +Reviewed-by: Keith Randall +--- + test/codegen/mathbits.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index caeecdf078..bf2e8130c4 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -311,7 +311,7 @@ func TrailingZeros(n uint) int { + func TrailingZeros64(n uint64) int { + // amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ" + // amd64/v3:"TZCNTQ" +- // 386:"BSFL" ++ // 386:"BSFL","JNE" + // arm64:"RBIT","CLZ" + // s390x:"FLOGR" + // ppc64x/power8:"ANDN","POPCNTD" +-- +2.39.5 + diff --git a/2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch b/2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch new file mode 100644 index 0000000..13dffe9 --- /dev/null +++ b/2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch @@ -0,0 +1,102 @@ +From 60dd9fcdc906997df39a06b95100f0bf28fd0312 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 075/119] test/codegen: add riscv64 codegen for arithmetic + tests + +Codify the current riscv64 code generation for various subtract from +constant and addition/subtraction tests. 
+ +Change-Id: I54ad923280a0578a338bc4431fa5bdc0644c4729 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652316 +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +--- + test/codegen/arithmetic.go | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index 5f4ce9c76f..ec5cb491fa 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -44,36 +44,42 @@ func SubMem(arr []int, b, c, d int) int { + + func SubFromConst(a int) int { + // ppc64x: `SUBC\tR[0-9]+,\s[$]40,\sR` ++ // riscv64: "ADDI\t\\$-40","NEG" + b := 40 - a + return b + } + + func SubFromConstNeg(a int) int { + // ppc64x: `ADD\t[$]40,\sR[0-9]+,\sR` ++ // riscv64: "NEG","ADDI\t\\$-40","NEG" + c := 40 - (-a) + return c + } + + func SubSubFromConst(a int) int { + // ppc64x: `ADD\t[$]20,\sR[0-9]+,\sR` ++ // riscv64: "ADDI\t\\$20",-"NEG" + c := 40 - (20 - a) + return c + } + + func AddSubFromConst(a int) int { + // ppc64x: `SUBC\tR[0-9]+,\s[$]60,\sR` ++ // riscv64: "ADDI\t\\$-60","NEG" + c := 40 + (20 - a) + return c + } + + func NegSubFromConst(a int) int { + // ppc64x: `ADD\t[$]-20,\sR[0-9]+,\sR` ++ // riscv64: "ADDI\t\\$-20" + c := -(20 - a) + return c + } + + func NegAddFromConstNeg(a int) int { + // ppc64x: `SUBC\tR[0-9]+,\s[$]40,\sR` ++ // riscv64: "ADDI\t\\$-40","NEG" + c := -(-40 + a) + return c + } +@@ -81,6 +87,7 @@ func NegAddFromConstNeg(a int) int { + func SubSubNegSimplify(a, b int) int { + // amd64:"NEGQ" + // ppc64x:"NEG" ++ // riscv64:"NEG",-"SUB" + r := (a - b) - a + return r + } +@@ -88,6 +95,7 @@ func SubSubNegSimplify(a, b int) int { + func SubAddSimplify(a, b int) int { + // amd64:-"SUBQ",-"ADDQ" + // ppc64x:-"SUB",-"ADD" ++ // riscv64:-"SUB",-"ADD" + r := a + (b - a) + return r + } +@@ -111,6 +119,7 @@ func SubAddSimplify2(a, b, c int) (int, int, int, int, int, int) { + func SubAddNegSimplify(a, b int) int { + // amd64:"NEGQ",-"ADDQ",-"SUBQ" + // ppc64x:"NEG",-"ADD",-"SUB" ++ // riscv64:"NEG",-"ADD",-"SUB" + r := a - (b + a) + return r + } +@@ -118,6 +127,7 @@ func SubAddNegSimplify(a, b int) int { + func AddAddSubSimplify(a, b, c int) int { + // amd64:-"SUBQ" + // ppc64x:-"SUB" ++ // riscv64:"ADD","ADD",-"SUB" + r := a + (b + (c - a)) + return r + } +-- +2.39.5 + diff --git a/2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch b/2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch new file mode 100644 index 0000000..96396b5 --- /dev/null +++ b/2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch @@ -0,0 +1,84 @@ +From 7623a2c8fe4db9e157cd551ca549dd548d072a9f Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 076/119] test/codegen: add riscv64/rva23u64 specifiers to + existing tests + +Tests that exist for riscv64/rva22u64 should also be applied to +riscv64/rva23u64. 
+ +Change-Id: Ia529fdf0ac55b8bcb3dcd24fa80efef2351f3842 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652315 +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +--- + test/codegen/arithmetic.go | 8 ++++---- + test/codegen/shift.go | 6 +++--- + 2 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index ec5cb491fa..1a27227aef 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -607,7 +607,7 @@ func Int64Min(a, b int64) int64 { + // amd64: "CMPQ","CMOVQLT" + // arm64: "CMP","CSEL" + // riscv64/rva20u64:"BLT\t" +- // riscv64/rva22u64:"MIN\t" ++ // riscv64/rva22u64,riscv64/rva23u64:"MIN\t" + return min(a, b) + } + +@@ -615,7 +615,7 @@ func Int64Max(a, b int64) int64 { + // amd64: "CMPQ","CMOVQGT" + // arm64: "CMP","CSEL" + // riscv64/rva20u64:"BLT\t" +- // riscv64/rva22u64:"MAX\t" ++ // riscv64/rva22u64,riscv64/rva23u64:"MAX\t" + return max(a, b) + } + +@@ -623,7 +623,7 @@ func Uint64Min(a, b uint64) uint64 { + // amd64: "CMPQ","CMOVQCS" + // arm64: "CMP","CSEL" + // riscv64/rva20u64:"BLTU" +- // riscv64/rva22u64:"MINU" ++ // riscv64/rva22u64,riscv64/rva23u64:"MINU" + return min(a, b) + } + +@@ -631,6 +631,6 @@ func Uint64Max(a, b uint64) uint64 { + // amd64: "CMPQ","CMOVQHI" + // arm64: "CMP","CSEL" + // riscv64/rva20u64:"BLTU" +- // riscv64/rva22u64:"MAXU" ++ // riscv64/rva22u64,riscv64/rva23u64:"MAXU" + return max(a, b) + } +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 4b3b79f142..6b1157d3fd 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -481,13 +481,13 @@ func checkShiftToMask(u []uint64, s []int64) { + + func checkLeftShiftWithAddition(a int64, b int64) int64 { + // riscv64/rva20u64: "SLLI","ADD" +- // riscv64/rva22u64: "SH1ADD" ++ // riscv64/rva22u64,riscv64/rva23u64: "SH1ADD" + a = a + b<<1 + // riscv64/rva20u64: "SLLI","ADD" +- // riscv64/rva22u64: "SH2ADD" ++ // riscv64/rva22u64,riscv64/rva23u64: "SH2ADD" + a = a + b<<2 + // riscv64/rva20u64: "SLLI","ADD" +- // riscv64/rva22u64: "SH3ADD" ++ // riscv64/rva22u64,riscv64/rva23u64: "SH3ADD" + a = a + b<<3 + return a + } +-- +2.39.5 + diff --git a/2077-test-codegen-add-a-test-for-negation-and-conversion-.patch b/2077-test-codegen-add-a-test-for-negation-and-conversion-.patch new file mode 100644 index 0000000..b216ea7 --- /dev/null +++ b/2077-test-codegen-add-a-test-for-negation-and-conversion-.patch @@ -0,0 +1,39 @@ +From f57c6ba1f6683a19d0c39ae08deb75dc9bd4ecc5 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 077/119] test/codegen: add a test for negation and conversion + to int32 + +Codify the current code generation used on riscv64 in this case. 
+ +Change-Id: If4152e3652fc19d0aa28b79dba08abee2486d5ae +Reviewed-on: https://go-review.googlesource.com/c/go/+/652317 +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +--- + test/codegen/arithmetic.go | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index 1a27227aef..0d303b9f24 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -132,6 +132,12 @@ func AddAddSubSimplify(a, b, c int) int { + return r + } + ++func NegToInt32(a int) int { ++ // riscv64: "NEG","MOVW" ++ r := int(int32(-a)) ++ return r ++} ++ + // -------------------- // + // Multiplication // + // -------------------- // +-- +2.39.5 + diff --git a/2078-cmd-compile-combine-negation-and-word-sign-extension.patch b/2078-cmd-compile-combine-negation-and-word-sign-extension.patch new file mode 100644 index 0000000..19c49ab --- /dev/null +++ b/2078-cmd-compile-combine-negation-and-word-sign-extension.patch @@ -0,0 +1,80 @@ +From 1a58541e145e0e54539b5a71ca08d00bfe1a4bf6 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 078/119] cmd/compile: combine negation and word sign extension + on riscv64 + +Use NEGW to produce a negated and sign extended word, rather than doing +the same via two instructions: + + neg t0, t0 + sext.w a0, t0 + +Becomes: + + negw t0, t0 + +Change-Id: I824ab25001bd3304bdbd435e7b244fcc036ef212 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652319 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +Reviewed-by: Cherry Mui +--- + src/cmd/compile/internal/ssa/_gen/RISCV64.rules | 3 +++ + src/cmd/compile/internal/ssa/rewriteRISCV64.go | 11 +++++++++++ + test/codegen/arithmetic.go | 2 +- + 3 files changed, 15 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index a69df619a5..bc5a49be0b 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -589,6 +589,9 @@ + (MOVHUreg (ANDI [c] x)) && c < 0 => (ANDI [int64(uint16(c))] x) + (MOVWUreg (ANDI [c] x)) && c < 0 => (AND (MOVDconst [int64(uint32(c))]) x) + ++// Combine negation and sign extension. ++(MOVWreg (NEG x)) => (NEGW x) ++ + // Avoid sign/zero extension for consts. 
+ (MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))]) + (MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))]) +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 1c226a1660..1675d61fe5 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -5646,6 +5646,17 @@ func rewriteValueRISCV64_OpRISCV64MOVWreg(v *Value) bool { + v.copyOf(x) + return true + } ++ // match: (MOVWreg (NEG x)) ++ // result: (NEGW x) ++ for { ++ if v_0.Op != OpRISCV64NEG { ++ break ++ } ++ x := v_0.Args[0] ++ v.reset(OpRISCV64NEGW) ++ v.AddArg(x) ++ return true ++ } + // match: (MOVWreg (MOVDconst [c])) + // result: (MOVDconst [int64(int32(c))]) + for { +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index 0d303b9f24..e4e3a90cd1 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -133,7 +133,7 @@ func AddAddSubSimplify(a, b, c int) int { + } + + func NegToInt32(a int) int { +- // riscv64: "NEG","MOVW" ++ // riscv64: "NEGW",-"MOVW" + r := int(int32(-a)) + return r + } +-- +2.39.5 + diff --git a/2079-cmd-compile-internal-ssa-remove-double-negation-with.patch b/2079-cmd-compile-internal-ssa-remove-double-negation-with.patch new file mode 100644 index 0000000..3e2689d --- /dev/null +++ b/2079-cmd-compile-internal-ssa-remove-double-negation-with.patch @@ -0,0 +1,97 @@ +From 752985cd45306ed0c62eae5090507c47de9834d5 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 079/119] cmd/compile/internal/ssa: remove double negation with + addition on riscv64 + +On riscv64, subtraction from a constant is typically implemented as an +ADDI with the negative constant, followed by a negation. However this can +lead to multiple NEG/ADDI/NEG sequences that can be optimised out. + +For example, runtime.(*_panic).nextDefer currently contains: + + lbu t0, 0(t0) + addi t0, t0, -8 + neg t0, t0 + addi t0, t0, -7 + neg t0, t0 + +Which is now optimised to: + + lbu t0, 0(t0) + addi t0, t0, -1 + +Change-Id: Idf5815e6db2e3705cc4a4811ca9130a064ae3d80 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652318 +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 1 + + .../compile/internal/ssa/rewriteRISCV64.go | 22 +++++++++++++++++++ + test/codegen/arithmetic.go | 2 +- + 3 files changed, 24 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index bc5a49be0b..58cadc8944 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -735,6 +735,7 @@ + + // Double negation. + (NEG (NEG x)) => x ++(NEG s:(ADDI [val] (NEG x))) && s.Uses == 1 && is32Bit(-val) => (ADDI [-val] x) + + // Addition of zero or two constants. 
+ (ADDI [0] x) => x +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 1675d61fe5..8f8c902df8 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -6118,6 +6118,28 @@ func rewriteValueRISCV64_OpRISCV64NEG(v *Value) bool { + v.copyOf(x) + return true + } ++ // match: (NEG s:(ADDI [val] (NEG x))) ++ // cond: s.Uses == 1 && is32Bit(-val) ++ // result: (ADDI [-val] x) ++ for { ++ s := v_0 ++ if s.Op != OpRISCV64ADDI { ++ break ++ } ++ val := auxIntToInt64(s.AuxInt) ++ s_0 := s.Args[0] ++ if s_0.Op != OpRISCV64NEG { ++ break ++ } ++ x := s_0.Args[0] ++ if !(s.Uses == 1 && is32Bit(-val)) { ++ break ++ } ++ v.reset(OpRISCV64ADDI) ++ v.AuxInt = int64ToAuxInt(-val) ++ v.AddArg(x) ++ return true ++ } + // match: (NEG (MOVDconst [x])) + // result: (MOVDconst [-x]) + for { +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index e4e3a90cd1..976158326c 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -51,7 +51,7 @@ func SubFromConst(a int) int { + + func SubFromConstNeg(a int) int { + // ppc64x: `ADD\t[$]40,\sR[0-9]+,\sR` +- // riscv64: "NEG","ADDI\t\\$-40","NEG" ++ // riscv64: "ADDI\t\\$40",-"NEG" + c := 40 - (-a) + return c + } +-- +2.39.5 + diff --git a/2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch b/2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch new file mode 100644 index 0000000..07439e3 --- /dev/null +++ b/2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch @@ -0,0 +1,189 @@ +From 933daf2afa0cc95422a22b88603a8df7969e4c03 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 080/119] cmd/internal/obj/riscv: prevent duplicate error + reports + +The riscv64 Go assembler can output certain errors, ones produced by +instructionsForProg, multiple times. These errors are guaranteed to +be output at least twice and can appear three or more times if a +rescan is needed to recompute branch addresses. For example, the +syntactically incorrect instruction + +MOV (X10), $1 + +will generate at least two identical errors + +asm: 86076 (asm.s:21524) MOV (X10), $1: unsupported MOV +asm: 86076 (asm.s:21524) MOV (X10), $1: unsupported MOV +asm: assembly failed + +In addition to confusing the user, these duplicate errors make it +difficult to write negative tests for certain types of instructions, +e.g., branches, whose duplicate errors are not always identical, +and so not ignored by endtoend_test.go. + +We fix the issue by returning from preprocess if any errors have been +generated by the time we reach the end of the rescan loop. One +implication of this change is that validation errors will no longer +be reported if an error is generated earlier in the preprocess stage. +Negative test cases for validation errors are therefore moved to +their own file as the existing riscv64error.s file contains errors +generated by instructionsForProg that will now suppress the +validation errors. 
+ +Change-Id: Iffacdbefce28f44970dd5dda44990b822b8a23d4 +Reviewed-on: https://go-review.googlesource.com/c/go/+/637315 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: David Chase +Reviewed-by: Cherry Mui +--- + src/cmd/asm/internal/asm/endtoend_test.go | 4 ++ + .../asm/internal/asm/testdata/riscv64error.s | 34 -------------- + .../internal/asm/testdata/riscv64validation.s | 46 +++++++++++++++++++ + src/cmd/internal/obj/riscv/obj.go | 5 ++ + 4 files changed, 55 insertions(+), 34 deletions(-) + create mode 100644 src/cmd/asm/internal/asm/testdata/riscv64validation.s + +diff --git a/src/cmd/asm/internal/asm/endtoend_test.go b/src/cmd/asm/internal/asm/endtoend_test.go +index 02bc6b7923..0b9b0cbe83 100644 +--- a/src/cmd/asm/internal/asm/endtoend_test.go ++++ b/src/cmd/asm/internal/asm/endtoend_test.go +@@ -480,6 +480,10 @@ func TestRISCVErrors(t *testing.T) { + testErrors(t, "riscv64", "riscv64error") + } + ++func TestRISCVValidation(t *testing.T) { ++ testErrors(t, "riscv64", "riscv64validation") ++} ++ + func TestS390XEndToEnd(t *testing.T) { + testEndToEnd(t, "s390x", "s390x") + } +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 82a2348894..e8855f6cd5 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -43,50 +43,16 @@ TEXT errors(SB),$0 + SRLIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" + SRAIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" + SD X5, 4294967296(X6) // ERROR "constant 4294967296 too large" +- SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" +- SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" + FNES F1, (X5) // ERROR "needs an integer register output" +- VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" +- VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" + VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" +- VSETVL X10, X11 // ERROR "expected integer register in rs1 position" +- VLE8V (X10), X10 // ERROR "expected vector register in rd position" +- VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" + VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" +- VSE8V X10, (X10) // ERROR "expected vector register in rs1 position" +- VSE8V V3, (V1) // ERROR "expected integer register in rd position" + VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" +- VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" +- VLSE8V (X10), X10, X11 // ERROR "expected vector register in rd position" +- VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" +- VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" + VLSE8V (X10), X10, V1, V3 // ERROR "invalid vector mask register" +- VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" +- VSSE8V X10, X11, (X10) // ERROR "expected vector register in rd position" +- VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" +- VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" + VSSE8V V3, X11, V1, (X10) // ERROR "invalid vector mask register" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" +- VLUXEI8V (V1), V2, V3 // ERROR "expected integer 
register in rs1 position" +- VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" + VLUXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" +- VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" +- VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" + VSUXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" +- VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" +- VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" + VLOXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" +- VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" +- VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" + VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" + VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" +- VL1RV (X10), X10 // ERROR "expected vector register in rd position" +- VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" + VS1RV V3, V0, (X11) // ERROR "too many operands for instruction" +- VS1RV X11, (X11) // ERROR "expected vector register in rs1 position" +- VS1RV V3, (V1) // ERROR "expected integer register in rd position" + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +new file mode 100644 +index 0000000000..773f275dd3 +--- /dev/null ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -0,0 +1,46 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++// This file is for validation errors only, i.e., errors reported by the validate function. ++// Negative test cases for errors generated earlier in the assembler's preprocess stage ++// should be added to riscv64error.s. If they are added to this file, they will prevent ++// the validate function from being run and TestRISCVValidation will report missing ++// errors. 
++ ++TEXT validation(SB),$0 ++ SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" ++ SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" ++ VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVL X10, X11 // ERROR "expected integer register in rs1 position" ++ VLE8V (X10), X10 // ERROR "expected vector register in rd position" ++ VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" ++ VSE8V X10, (X10) // ERROR "expected vector register in rs1 position" ++ VSE8V V3, (V1) // ERROR "expected integer register in rd position" ++ VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" ++ VLSE8V (X10), X10, X11 // ERROR "expected vector register in rd position" ++ VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" ++ VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" ++ VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" ++ VSSE8V X10, X11, (X10) // ERROR "expected vector register in rd position" ++ VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" ++ VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" ++ VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" ++ VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" ++ VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" ++ VL1RV (X10), X10 // ERROR "expected vector register in rd position" ++ VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" ++ VS1RV X11, (X11) // ERROR "expected vector register in rs1 position" ++ VS1RV V3, (V1) // ERROR "expected integer register in rd position" ++ RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index a558dc3596..d61cef9695 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -708,6 +708,11 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + } + } + ++ // Return if errors have been detected up to this point. Continuing ++ // may lead to duplicate errors being output. 
++ if ctxt.Errors > 0 { ++ return ++ } + if !rescan { + break + } +-- +2.39.5 + diff --git a/2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch b/2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch new file mode 100644 index 0000000..051136f --- /dev/null +++ b/2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch @@ -0,0 +1,74 @@ +From c61add823865d15d032f87fbc1bc1983e53f4437 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 081/119] cmd/internal/obj/riscv: prevent panics on bad + branches + +Syntactically incorrect branches, such as + +BEQ X5, X6, $1 +BEQ X5, X6, 31(X10) + +cause the assembler to panic, which they shouldn't really do. It's +better for the user to see a normal error, as reported for other +syntax errors in riscv64 assembly. The panics also prevent us +from writing negative tests for these sorts of errors. + +Here we fix the issue by ensuring we generate a normal error instead +of panicking when the user provides an invalid branch target. We +also add a couple of negative tests. + +Change-Id: I1da568999a75097484b61a01d418f5d4be3e04fa +Reviewed-on: https://go-review.googlesource.com/c/go/+/637316 +Reviewed-by: Cherry Mui +Reviewed-by: Joel Sing +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Meng Zhuo +--- + src/cmd/asm/internal/asm/testdata/riscv64error.s | 2 ++ + src/cmd/internal/obj/riscv/obj.go | 8 ++++++-- + 2 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index e8855f6cd5..005b794612 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -30,6 +30,8 @@ TEXT errors(SB),$0 + SLLI $64, X5, X6 // ERROR "immediate out of range 0 to 63" + SRLI $64, X5, X6 // ERROR "immediate out of range 0 to 63" + SRAI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ BEQ X5, X6, $1 // ERROR "instruction with branch-like opcode lacks destination" ++ BEQ X5, X6, 31(X10) // ERROR "instruction with branch-like opcode lacks destination" + RORI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" + SLLI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" + SRLI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index d61cef9695..0a754231cc 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -644,7 +644,8 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + switch p.As { + case ABEQ, ABEQZ, ABGE, ABGEU, ABGEZ, ABGT, ABGTU, ABGTZ, ABLE, ABLEU, ABLEZ, ABLT, ABLTU, ABLTZ, ABNE, ABNEZ: + if p.To.Type != obj.TYPE_BRANCH { +- panic("assemble: instruction with branch-like opcode lacks destination") ++ ctxt.Diag("%v: instruction with branch-like opcode lacks destination", p) ++ break + } + offset := p.To.Target().Pc - p.Pc + if offset < -4096 || 4096 <= offset { +@@ -728,7 +729,10 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + case obj.TYPE_BRANCH: + p.To.Type, p.To.Offset = obj.TYPE_CONST, p.To.Target().Pc-p.Pc + case obj.TYPE_MEM: +- panic("unhandled type") ++ if ctxt.Errors == 0 { ++ // An error should have already been reported for this instruction ++ panic("unhandled type") ++ } + } + + case AJAL: +-- +2.39.5 + diff --git a/2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch 
b/2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch new file mode 100644 index 0000000..34b871c --- /dev/null +++ b/2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch @@ -0,0 +1,41 @@ +From b72bd33745886f2064a1fd5c3e938f431f913a02 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 082/119] cmd/internal/obj/riscv: fix the encoding for REV8 and + ORCB + +The instructions are currently encoded and validated using an +iIIEncoding which is incorrect as these instructions do not +take an immediate operand. Encode them instead using an +rIIEncoding as is done for the other two register argument bitmanip +instructions. + +Change-Id: Ia4d9c6f6ebd2dfc381935ebc11afa8fc3664232b +Reviewed-on: https://go-review.googlesource.com/c/go/+/637317 +Reviewed-by: David Chase +Reviewed-by: Joel Sing +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/obj.go | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 0a754231cc..7d7a123bcf 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1968,8 +1968,8 @@ var instructions = [ALAST & obj.AMask]instructionData{ + ARORI & obj.AMask: {enc: iIIEncoding, ternary: true}, + ARORIW & obj.AMask: {enc: iIIEncoding, ternary: true}, + ARORW & obj.AMask: {enc: rIIIEncoding, immForm: ARORIW, ternary: true}, +- AORCB & obj.AMask: {enc: iIIEncoding}, +- AREV8 & obj.AMask: {enc: iIIEncoding}, ++ AORCB & obj.AMask: {enc: rIIEncoding}, ++ AREV8 & obj.AMask: {enc: rIIEncoding}, + + // 28.4.4: Single-bit Instructions (Zbs) + ABCLR & obj.AMask: {enc: rIIIEncoding, immForm: ABCLRI, ternary: true}, +-- +2.39.5 + diff --git a/2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch b/2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch new file mode 100644 index 0000000..88f4deb --- /dev/null +++ b/2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch @@ -0,0 +1,151 @@ +From 011266a777692d5761f994845823efe9f1b4d539 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 083/119] cmd/internal/obj/riscv: factor out shift constant + code + +Move the code that tests to see if a constant can be represented by a +32 bit signed integer and a logical left shift. This reduces duplication +and increases readability. Also add test coverage now that this is an +independent function. 
+ +Change-Id: Id25395b1380b00cf5b69ca201b7715ef84f7ade6 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652777 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/obj.go | 30 ++++++++---- + src/cmd/internal/obj/riscv/obj_test.go | 64 ++++++++++++++++++++++++++ + 2 files changed, 86 insertions(+), 8 deletions(-) + create mode 100644 src/cmd/internal/obj/riscv/obj_test.go + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 7d7a123bcf..795452bbcb 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -135,10 +135,7 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + + case AMOV: + if p.From.Type == obj.TYPE_CONST && p.From.Name == obj.NAME_NONE && p.From.Reg == obj.REG_NONE && int64(int32(p.From.Offset)) != p.From.Offset { +- ctz := bits.TrailingZeros64(uint64(p.From.Offset)) +- val := p.From.Offset >> ctz +- if int64(int32(val)) == val { +- // It's ok. We can handle constants with many trailing zeros. ++ if isShiftConst(p.From.Offset) { + break + } + // Put >32-bit constants in memory and load them. +@@ -2097,6 +2094,24 @@ func encodingForAs(as obj.As) (*encoding, error) { + return &insData.enc, nil + } + ++// splitShiftConst attempts to split a constant into a signed 32 bit integer ++// and a corresponding left shift. ++func splitShiftConst(v int64) (imm int64, lsh int, ok bool) { ++ lsh = bits.TrailingZeros64(uint64(v)) ++ c := v >> lsh ++ if int64(int32(c)) != c { ++ return 0, 0, false ++ } ++ return c, lsh, true ++} ++ ++// isShiftConst indicates whether a constant can be represented as a signed ++// 32 bit integer that is left shifted. ++func isShiftConst(v int64) bool { ++ _, lsh, ok := splitShiftConst(v) ++ return ok && lsh > 0 ++} ++ + type instruction struct { + p *obj.Prog // Prog that instruction is for + as obj.As // Assembler opcode +@@ -2378,10 +2393,9 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + // SLLI $63, X10, X10 + var insSLLI *instruction + if err := immIFits(ins.imm, 32); err != nil { +- ctz := bits.TrailingZeros64(uint64(ins.imm)) +- if err := immIFits(ins.imm>>ctz, 32); err == nil { +- ins.imm = ins.imm >> ctz +- insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(ctz)} ++ if c, lsh, ok := splitShiftConst(ins.imm); ok { ++ ins.imm = c ++ insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(lsh)} + } + } + +diff --git a/src/cmd/internal/obj/riscv/obj_test.go b/src/cmd/internal/obj/riscv/obj_test.go +new file mode 100644 +index 0000000000..688f262d8f +--- /dev/null ++++ b/src/cmd/internal/obj/riscv/obj_test.go +@@ -0,0 +1,64 @@ ++// Copyright 2025 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++package riscv ++ ++import ( ++ "fmt" ++ "testing" ++) ++ ++func TestSplitShiftConst(t *testing.T) { ++ tests := []struct { ++ v int64 ++ wantImm int64 ++ wantLsh int ++ wantOk bool ++ }{ ++ {0x100000000, 1, 32, true}, ++ {0xfffff001, 0, 0, false}, ++ {0xfffff801, 0, 0, false}, ++ {0xfffffff1, 0, 0, false}, ++ {0xffffffff, 0, 0, false}, ++ {0xfffffffe, 0x7fffffff, 1, true}, ++ {0xfffffffffffda, 0, 0, false}, ++ {0xfffffffffffde, 0, 0, false}, ++ {0x000003ffffffffff, 0, 0, false}, ++ {0x0007ffffffffffff, 0, 0, false}, ++ {0x7fffffff00000000, 0x7fffffff, 32, true}, ++ {0x7fffffffffffffff, 0, 0, false}, ++ {0x7f7f7f7f7f7f7f7f, 0, 0, false}, ++ {0x0080000010000000, 0x8000001, 28, true}, ++ {0x0abcdabcd0000000, 0, 0, false}, ++ {-4503599610593281, 0, 0, false}, // 0x8abcdabcd0000000 ++ {-7543254330000000, 0, 0, false}, // 0xfff0000000ffffff ++ } ++ for _, test := range tests { ++ t.Run(fmt.Sprintf("0x%x", test.v), func(t *testing.T) { ++ c, l, ok := splitShiftConst(test.v) ++ ++ if got, want := c, test.wantImm; got != want { ++ t.Errorf("Got immediate %d, want %d", got, want) ++ } ++ if got, want := l, test.wantLsh; got != want { ++ t.Errorf("Got left shift %d, want %d", got, want) ++ } ++ switch { ++ case !ok && test.wantOk: ++ t.Error("Failed to split shift constant, want success") ++ case ok && !test.wantOk: ++ t.Error("Successfully split shift constant, want failure") ++ } ++ if !ok || ok != test.wantOk { ++ return ++ } ++ ++ // Reconstruct as a 32 bit signed constant. ++ v := int64(uint64(int32(test.wantImm)) << l) ++ if v != test.v { ++ t.Errorf("Got v = %d (%x), want v = %d (%x)", v, v, test.v, test.v) ++ } ++ }) ++ } ++} +-- +2.39.5 + diff --git a/2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch b/2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch new file mode 100644 index 0000000..35a3b04 --- /dev/null +++ b/2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch @@ -0,0 +1,75 @@ +From 19169a649428edec75a7f94aac545acb6f34fcca Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 084/119] cmd/asm/internal/asm: add additional tests for + constant loads on riscv64 + +This improves test coverage around the various constant load edge cases. 
+ +Change-Id: Ibafeec78e76d95e9f56b48fa6bd012772bf505c5 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652776 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +Reviewed-by: Mark Ryan +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 40 ++++++++++++++------- + 1 file changed, 28 insertions(+), 12 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 49f3ac00f3..86b9eb1fe6 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -561,20 +561,36 @@ start: + WORD $0x9abcdef0 // WORD $2596069104 // f0debc9a + + // MOV pseudo-instructions +- MOV X5, X6 // 13830200 +- MOV $2047, X5 // 9302f07f +- MOV $-2048, X5 // 93020080 +- MOV $2048, X5 // b71200009b820280 +- MOV $-2049, X5 // b7f2ffff9b82f27f +- MOV $4096, X5 // b7120000 +- MOV $2147479552, X5 // b7f2ff7f +- MOV $2147483647, X5 // b70200809b82f2ff +- MOV $-2147483647, X5 // b70200809b821200 ++ MOV X5, X6 // 13830200 ++ MOV $2047, X5 // 9302f07f ++ MOV $-2048, X5 // 93020080 ++ MOV $2048, X5 // b71200009b820280 ++ MOV $-2049, X5 // b7f2ffff9b82f27f ++ MOV $4096, X5 // b7120000 ++ MOV $0x7ffff000, X5 // MOV $2147479552, X5 // b7f2ff7f ++ MOV $-0x7ffff000, X5 // MOV $-2147479552, X5 // b7120080 ++ MOV $0x7fffffff, X5 // MOV $2147483647, X5 // b70200809b82f2ff ++ MOV $-0x7fffffff, X5 // MOV $-2147483647, X5 // b70200809b821200 ++ ++ // Converted to load and shift (MOV + SLLI) ++ MOV $0x100000000, X5 // MOV $4294967296, X5 // 9302100093920202 ++ MOV $0x7fffffff00000000, X5 // MOV $9223372032559808512, X5 // b70200809b82f2ff93920202 ++ MOV $0x8000000100000000, X5 // MOV $-9223372032559808512, X5 // b70200809b82120093920202 ++ MOV $0xffffffff00000000, X5 // MOV $-4294967296, X5 // 9302f0ff93920202 + + // Converted to load of symbol (AUIPC + LD) +- MOV $4294967295, X5 // 9702000083b20200 +- // Converted to MOV $1, X5 + SLLI $32, X5 +- MOV $4294967296, X5 // 9302100093920202 ++ MOV $0x80000001, X5 // MOV $2147483649, X5 // 9702000083b20200 ++ MOV $0xffffffff, X5 // MOV $4294967295, X5 // 9702000083b20200 ++ MOV $0x100000001, X5 // MOV $4294967297, X5 // 9702000083b20200 ++ MOV $0xfffffffffffda, X5 // MOV $4503599627370458, X5 // 9702000083b20200 ++ MOV $0xffffffffffffe, X5 // MOV $4503599627370494, X5 // 9702000083b20200 ++ MOV $0x0800000010000000, X5 // MOV $576460752571858944, X5 // 9702000083b20200 ++ MOV $0x8000000010000000, X5 // MOV $-9223372036586340352, X5 // 9702000083b20200 ++ MOV $0x0abcdabcd0000000, X5 // MOV $773733740479250432, X5 // 9702000083b20200 ++ MOV $0x8abcdabcd0000000, X5 // MOV $-8449638296375525376, X5 // 9702000083b20200 ++ MOV $0x1ffffffff0000000, X5 // MOV $2305843008945258496, X5 // 9702000083b20200 ++ MOV $0x7fffffffffffffff, X5 // MOV $9223372036854775807, X5 // 9702000083b20200 ++ MOV $0xfff0000000ffffff, X5 // MOV $-4503599610593281, X5 // 9702000083b20200 + + MOV (X5), X6 // 03b30200 + MOV 4(X5), X6 // 03b34200 +-- +2.39.5 + diff --git a/2085-test-codegen-add-combined-conversion-and-shift-tests.patch b/2085-test-codegen-add-combined-conversion-and-shift-tests.patch new file mode 100644 index 0000000..e453dca --- /dev/null +++ b/2085-test-codegen-add-combined-conversion-and-shift-tests.patch @@ -0,0 +1,95 @@ +From a84d551feabbf5aa6f3e47cfcda19b5ba62af37d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 085/119] test/codegen: add combined conversion and shift tests + +This adds 
tests for type conversion and shifts, detailing various +poor bad code generation that currently exists for riscv64. This +will be addressed in future CLs. + +Change-Id: Ie1d366dfe878832df691600f8500ef383da92848 +Reviewed-on: https://go-review.googlesource.com/c/go/+/615678 +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Carlos Amedee +--- + test/codegen/shift.go | 64 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 64 insertions(+) + +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 6b1157d3fd..3836311d5d 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -491,3 +491,67 @@ func checkLeftShiftWithAddition(a int64, b int64) int64 { + a = a + b<<3 + return a + } ++ ++// ++// Convert and shift. ++// ++ ++func rsh64Uto32U(v uint64) uint32 { ++ x := uint32(v) ++ // riscv64:"MOVWU" ++ if x > 8 { ++ // riscv64:"SRLIW",-"MOVWU",-"SLLI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64Uto16U(v uint64) uint16 { ++ x := uint16(v) ++ // riscv64:"MOVHU" ++ if x > 8 { ++ // riscv64:"SLLI","SRLI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64Uto8U(v uint64) uint8 { ++ x := uint8(v) ++ // riscv64:"MOVBU" ++ if x > 8 { ++ // riscv64:"SLLI","SRLI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64to32(v int64) int32 { ++ x := int32(v) ++ // riscv64:"MOVW" ++ if x > 8 { ++ // riscv64:"SRAIW",-"MOVW",-"SLLI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64to16(v int64) int16 { ++ x := int16(v) ++ // riscv64:"MOVH" ++ if x > 8 { ++ // riscv64:"SLLI","SRAI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64to8(v int64) int8 { ++ x := int8(v) ++ // riscv64:"MOVB" ++ if x > 8 { ++ // riscv64:"SLLI","SRAI" ++ x >>= 2 ++ } ++ return x ++} +-- +2.39.5 + diff --git a/2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch b/2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch new file mode 100644 index 0000000..94e47f8 --- /dev/null +++ b/2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch @@ -0,0 +1,453 @@ +From 7957258a309c668e7d59db35f443723f23d5e210 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 086/119] cmd/internal/obj/riscv,internal/bytealg: synthesize + MIN/MAX/MINU/MAXU instructions + +Provide a synthesized version of the MIN/MAX/MINU/MAXU instructions +if they're not natively available. This allows these instructions to +be used in assembly unconditionally. + +Use MIN in internal/bytealg.compare. 
+ +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Change-Id: I8a5a3a59f0a9205e136fc3d673b23eaf3ca469f8 +Reviewed-on: https://go-review.googlesource.com/c/go/+/653295 +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 16 +- + src/cmd/internal/obj/riscv/asm_test.go | 14 ++ + src/cmd/internal/obj/riscv/obj.go | 44 ++++++ + .../riscv/testdata/testminmax/minmax_test.go | 140 ++++++++++++++++++ + .../riscv/testdata/testminmax/minmax_test.s | 131 ++++++++++++++++ + src/internal/bytealg/compare_riscv64.s | 8 +- + 6 files changed, 339 insertions(+), 14 deletions(-) + create mode 100644 src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.go + create mode 100644 src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.s + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 86b9eb1fe6..eeaadf6298 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -376,14 +376,14 @@ start: + CPOPW X23, X24 // 1b9c2b60 + CTZ X24, X25 // 931c1c60 + CTZW X25, X26 // 1b9d1c60 +- MAX X26, X28, X29 // b36eae0b +- MAX X26, X28 // 336eae0b +- MAXU X28, X29, X30 // 33ffce0b +- MAXU X28, X29 // b3fece0b +- MIN X29, X30, X5 // b342df0b +- MIN X29, X30 // 334fdf0b +- MINU X30, X5, X6 // 33d3e20b +- MINU X30, X5 // b3d2e20b ++ MAX X26, X28, X29 // b36eae0b or b32fae01b30ff041b34eae01b3fedf01b34ede01 ++ MAX X26, X28 // 336eae0b or b32fcd01b30ff041334ecd0133fecf01334ecd01 ++ MAXU X28, X29, X30 // 33ffce0b or b3bfce01b30ff04133cfce0133ffef0133cfee01 ++ MAXU X28, X29 // b3fece0b or b33fde01b30ff041b34ede01b3fedf01b34ede01 ++ MIN X29, X30, X5 // b342df0b or b3afee01b30ff041b342df01b3f25f00b3425f00 ++ MIN X29, X30 // 334fdf0b or b32fdf01b30ff04133cfee0133ffef0133cfee01 ++ MINU X30, X5, X6 // 33d3e20b or b33f5f00b30ff04133c3e20133f36f0033c36200 ++ MINU X30, X5 // b3d2e20b or b3bfe201b30ff041b3425f00b3f25f00b3425f00 + ORN X6, X7, X8 // 33e46340 or 1344f3ff33e48300 + ORN X6, X7 // b3e36340 or 934ff3ffb3e3f301 + SEXTB X16, X17 // 93184860 +diff --git a/src/cmd/internal/obj/riscv/asm_test.go b/src/cmd/internal/obj/riscv/asm_test.go +index 96ea230841..35854516b9 100644 +--- a/src/cmd/internal/obj/riscv/asm_test.go ++++ b/src/cmd/internal/obj/riscv/asm_test.go +@@ -280,6 +280,20 @@ func TestBranch(t *testing.T) { + } + } + ++func TestMinMax(t *testing.T) { ++ if runtime.GOARCH != "riscv64" { ++ t.Skip("Requires riscv64 to run") ++ } ++ ++ testenv.MustHaveGoBuild(t) ++ ++ cmd := testenv.Command(t, testenv.GoToolPath(t), "test") ++ cmd.Dir = "testdata/testminmax" ++ if out, err := testenv.CleanCmdEnv(cmd).CombinedOutput(); err != nil { ++ t.Errorf("Min max test failed: %v\n%s", err, out) ++ } ++} ++ + func TestPCAlign(t *testing.T) { + dir := t.TempDir() + tmpfile := filepath.Join(dir, "x.s") +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 795452bbcb..83d06f09f1 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2625,6 +2625,47 @@ func instructionsForRotate(p *obj.Prog, ins *instruction) []*instruction { + } + } + ++// instructionsForMinMax returns the machine instructions for an integer minimum or maximum. ++func instructionsForMinMax(p *obj.Prog, ins *instruction) []*instruction { ++ if buildcfg.GORISCV64 >= 22 { ++ // Minimum and maximum instructions are supported natively. 
++ return []*instruction{ins} ++ } ++ ++ // Generate a move for identical inputs. ++ if ins.rs1 == ins.rs2 { ++ ins.as, ins.rs2, ins.imm = AADDI, obj.REG_NONE, 0 ++ return []*instruction{ins} ++ } ++ ++ // Ensure that if one of the source registers is the same as the destination, ++ // it is processed first. ++ if ins.rs1 == ins.rd { ++ ins.rs1, ins.rs2 = ins.rs2, ins.rs1 ++ } ++ sltReg1, sltReg2 := ins.rs2, ins.rs1 ++ ++ // MIN -> SLT/SUB/XOR/AND/XOR ++ // MAX -> SLT/SUB/XOR/AND/XOR with swapped inputs to SLT ++ switch ins.as { ++ case AMIN: ++ ins.as = ASLT ++ case AMAX: ++ ins.as, sltReg1, sltReg2 = ASLT, sltReg2, sltReg1 ++ case AMINU: ++ ins.as = ASLTU ++ case AMAXU: ++ ins.as, sltReg1, sltReg2 = ASLTU, sltReg2, sltReg1 ++ } ++ return []*instruction{ ++ &instruction{as: ins.as, rs1: sltReg1, rs2: sltReg2, rd: REG_TMP}, ++ &instruction{as: ASUB, rs1: REG_ZERO, rs2: REG_TMP, rd: REG_TMP}, ++ &instruction{as: AXOR, rs1: ins.rs1, rs2: ins.rs2, rd: ins.rd}, ++ &instruction{as: AAND, rs1: REG_TMP, rs2: ins.rd, rd: ins.rd}, ++ &instruction{as: AXOR, rs1: ins.rs1, rs2: ins.rd, rd: ins.rd}, ++ } ++} ++ + // instructionsForProg returns the machine instructions for an *obj.Prog. + func instructionsForProg(p *obj.Prog) []*instruction { + ins := instructionForProg(p) +@@ -2874,6 +2915,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AXOR + inss = append(inss, &instruction{as: AXORI, rs1: ins.rd, rs2: obj.REG_NONE, rd: ins.rd, imm: -1}) + ++ case AMIN, AMAX, AMINU, AMAXU: ++ inss = instructionsForMinMax(p, ins) ++ + case AVSETVLI, AVSETIVLI: + ins.rs1, ins.rs2 = ins.rs2, obj.REG_NONE + vtype, err := EncodeVectorType(p.RestArgs[0].Offset, p.RestArgs[1].Offset, p.RestArgs[2].Offset, p.RestArgs[3].Offset) +diff --git a/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.go b/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.go +new file mode 100644 +index 0000000000..46d321147b +--- /dev/null ++++ b/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.go +@@ -0,0 +1,140 @@ ++// Copyright 2025 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++//go:build riscv64 ++ ++package testminmax ++ ++import ( ++ "testing" ++) ++ ++func testMIN1(a int64) (r int64) ++func testMIN2(a, b int64) (r int64) ++func testMIN3(a, b int64) (r int64) ++func testMIN4(a, b int64) (r int64) ++func testMAX1(a int64) (r int64) ++func testMAX2(a, b int64) (r int64) ++func testMAX3(a, b int64) (r int64) ++func testMAX4(a, b int64) (r int64) ++func testMINU1(a int64) (r int64) ++func testMINU2(a, b int64) (r int64) ++func testMINU3(a, b int64) (r int64) ++func testMINU4(a, b int64) (r int64) ++func testMAXU1(a int64) (r int64) ++func testMAXU2(a, b int64) (r int64) ++func testMAXU3(a, b int64) (r int64) ++func testMAXU4(a, b int64) (r int64) ++ ++func TestMin(t *testing.T) { ++ tests := []struct { ++ a int64 ++ b int64 ++ want int64 ++ }{ ++ {1, 2, 1}, ++ {2, 1, 1}, ++ {2, 2, 2}, ++ {1, -1, -1}, ++ {-1, 1, -1}, ++ } ++ for _, test := range tests { ++ if got := testMIN1(test.a); got != test.a { ++ t.Errorf("Assembly testMIN1 %v = %v, want %v", test.a, got, test.a) ++ } ++ if got := testMIN2(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMIN2 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMIN3(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMIN3 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMIN4(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMIN4 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ } ++} ++ ++func TestMax(t *testing.T) { ++ tests := []struct { ++ a int64 ++ b int64 ++ want int64 ++ }{ ++ {1, 2, 2}, ++ {2, 1, 2}, ++ {2, 2, 2}, ++ {1, -1, 1}, ++ {-1, 1, 1}, ++ } ++ for _, test := range tests { ++ if got := testMAX1(test.a); got != test.a { ++ t.Errorf("Assembly testMAX1 %v = %v, want %v", test.a, got, test.a) ++ } ++ if got := testMAX2(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAX2 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMAX3(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAX3 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMAX4(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAX4 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ } ++} ++ ++func TestMinU(t *testing.T) { ++ tests := []struct { ++ a int64 ++ b int64 ++ want int64 ++ }{ ++ {1, 2, 1}, ++ {2, 1, 1}, ++ {2, 2, 2}, ++ {1, -1, 1}, ++ {-1, 1, 1}, ++ } ++ for _, test := range tests { ++ if got := testMINU1(test.a); got != test.a { ++ t.Errorf("Assembly testMINU1 %v = %v, want %v", test.a, got, test.a) ++ } ++ if got := testMINU2(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMINU2 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMINU3(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMINU3 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMINU4(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMINU4 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ } ++} ++ ++func TestMaxU(t *testing.T) { ++ tests := []struct { ++ a int64 ++ b int64 ++ want int64 ++ }{ ++ {1, 2, 2}, ++ {2, 1, 2}, ++ {2, 2, 2}, ++ {1, -1, -1}, ++ {-1, 1, -1}, ++ } ++ for _, test := range tests { ++ if got := testMAXU1(test.a); got != test.a { ++ t.Errorf("Assembly testMAXU1 %v = %v, want %v", test.a, got, test.a) ++ } ++ if got := testMAXU2(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAXU2 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got 
:= testMAXU3(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAXU3 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMAXU4(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAXU4 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ } ++} +diff --git a/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.s b/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.s +new file mode 100644 +index 0000000000..9d295791a5 +--- /dev/null ++++ b/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.s +@@ -0,0 +1,131 @@ ++// Copyright 2025 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build riscv64 ++ ++#include "textflag.h" ++ ++// func testMIN1(a int64) (r int64) ++TEXT ·testMIN1(SB),NOSPLIT,$0-16 ++ MOV a+0(FP), X5 ++ MIN X5, X5, X6 ++ MOV X6, r+8(FP) ++ RET ++ ++// func testMIN2(a, b int64) (r int64) ++TEXT ·testMIN2(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MIN X5, X6, X6 ++ MOV X6, r+16(FP) ++ RET ++ ++// func testMIN3(a, b int64) (r int64) ++TEXT ·testMIN3(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MIN X6, X5, X5 ++ MOV X5, r+16(FP) ++ RET ++ ++// func testMIN4(a, b int64) (r int64) ++TEXT ·testMIN4(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MIN X5, X6, X7 ++ MOV X7, r+16(FP) ++ RET ++ ++// func testMAX1(a int64) (r int64) ++TEXT ·testMAX1(SB),NOSPLIT,$0-16 ++ MOV a+0(FP), X5 ++ MAX X5, X5, X6 ++ MOV X6, r+8(FP) ++ RET ++ ++// func testMAX2(a, b int64) (r int64) ++TEXT ·testMAX2(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAX X5, X6, X6 ++ MOV X6, r+16(FP) ++ RET ++ ++// func testMAX3(a, b int64) (r int64) ++TEXT ·testMAX3(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAX X6, X5, X5 ++ MOV X5, r+16(FP) ++ RET ++ ++// func testMAX4(a, b int64) (r int64) ++TEXT ·testMAX4(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAX X5, X6, X7 ++ MOV X7, r+16(FP) ++ RET ++ ++// func testMINU1(a int64) (r int64) ++TEXT ·testMINU1(SB),NOSPLIT,$0-16 ++ MOV a+0(FP), X5 ++ MINU X5, X5, X6 ++ MOV X6, r+8(FP) ++ RET ++ ++// func testMINU2(a, b int64) (r int64) ++TEXT ·testMINU2(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MINU X5, X6, X6 ++ MOV X6, r+16(FP) ++ RET ++ ++// func testMINU3(a, b int64) (r int64) ++TEXT ·testMINU3(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MINU X6, X5, X5 ++ MOV X5, r+16(FP) ++ RET ++ ++// func testMINU4(a, b int64) (r int64) ++TEXT ·testMINU4(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MINU X5, X6, X7 ++ MOV X7, r+16(FP) ++ RET ++ ++// func testMAXU1(a int64) (r int64) ++TEXT ·testMAXU1(SB),NOSPLIT,$0-16 ++ MOV a+0(FP), X5 ++ MAXU X5, X5, X6 ++ MOV X6, r+8(FP) ++ RET ++ ++// func testMAXU2(a, b int64) (r int64) ++TEXT ·testMAXU2(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAXU X5, X6, X6 ++ MOV X6, r+16(FP) ++ RET ++ ++// func testMAXU3(a, b int64) (r int64) ++TEXT ·testMAXU3(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAXU X6, X5, X5 ++ MOV X5, r+16(FP) ++ RET ++ ++// func testMAXU4(a, b int64) (r int64) ++TEXT ·testMAXU4(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAXU X5, X6, X7 ++ MOV X7, r+16(FP) ++ RET +diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s +index b1e1f7bcc7..6388fcd209 100644 +--- a/src/internal/bytealg/compare_riscv64.s ++++ 
b/src/internal/bytealg/compare_riscv64.s +@@ -28,15 +28,11 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 + // X11 length of a + // X12 points to start of b + // X13 length of b +-// for non-regabi X14 points to the address to store the return value (-1/0/1) +-// for regabi the return value in X10 ++// return value in X10 (-1/0/1) + TEXT compare<>(SB),NOSPLIT|NOFRAME,$0 + BEQ X10, X12, cmp_len + +- MOV X11, X5 +- BGE X13, X5, use_a_len // X5 = min(len(a), len(b)) +- MOV X13, X5 +-use_a_len: ++ MIN X11, X13, X5 + BEQZ X5, cmp_len + + MOV $32, X6 +-- +2.39.5 + diff --git a/2087-cmd-internal-obj-riscv-improve-constant-construction.patch b/2087-cmd-internal-obj-riscv-improve-constant-construction.patch new file mode 100644 index 0000000..59bccad --- /dev/null +++ b/2087-cmd-internal-obj-riscv-improve-constant-construction.patch @@ -0,0 +1,239 @@ +From 88bf26fdb0c84eedfcc6e370840bccf34b9c9d46 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 087/119] cmd/internal/obj/riscv: improve constant construction + +Attempt to construct large constants that have a consecutive sequence +of ones from a small negative constant, with a logical right and/or +left shift. This allows for a large range of mask like constants to be +constructed with only two or three instructions, avoiding the need to +load from memory. + +Change-Id: I35a77fecdd2df0ed3f33b772d518f85119d4ff66 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652778 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 12 ++-- + src/cmd/internal/obj/riscv/obj.go | 62 ++++++++++++++++----- + src/cmd/internal/obj/riscv/obj_test.go | 48 +++++++++------- + 3 files changed, 82 insertions(+), 40 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index eeaadf6298..0b34bba032 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -572,24 +572,24 @@ start: + MOV $0x7fffffff, X5 // MOV $2147483647, X5 // b70200809b82f2ff + MOV $-0x7fffffff, X5 // MOV $-2147483647, X5 // b70200809b821200 + +- // Converted to load and shift (MOV + SLLI) ++ // Converted to load and shift(s) ++ MOV $0xffffffff, X5 // MOV $4294967295, X5 // 9302f0ff93d20202 + MOV $0x100000000, X5 // MOV $4294967296, X5 // 9302100093920202 ++ MOV $0xfffffffffffda, X5 // MOV $4503599627370458, X5 // 9302d0fe9392d20093d2c200 ++ MOV $0xffffffffffffe, X5 // MOV $4503599627370494, X5 // 9302f0ff9392d20093d2c200 + MOV $0x7fffffff00000000, X5 // MOV $9223372032559808512, X5 // b70200809b82f2ff93920202 + MOV $0x8000000100000000, X5 // MOV $-9223372032559808512, X5 // b70200809b82120093920202 + MOV $0xffffffff00000000, X5 // MOV $-4294967296, X5 // 9302f0ff93920202 ++ MOV $0x1ffffffff0000000, X5 // MOV $2305843008945258496, X5 // 9302f0ff9392f20193d23200 ++ MOV $0x7fffffffffffffff, X5 // MOV $9223372036854775807, X5 // 9302f0ff93d21200 + + // Converted to load of symbol (AUIPC + LD) + MOV $0x80000001, X5 // MOV $2147483649, X5 // 9702000083b20200 +- MOV $0xffffffff, X5 // MOV $4294967295, X5 // 9702000083b20200 + MOV $0x100000001, X5 // MOV $4294967297, X5 // 9702000083b20200 +- MOV $0xfffffffffffda, X5 // MOV $4503599627370458, X5 // 9702000083b20200 +- MOV $0xffffffffffffe, X5 // MOV $4503599627370494, X5 // 9702000083b20200 + MOV $0x0800000010000000, X5 // MOV $576460752571858944, X5 // 
9702000083b20200 + MOV $0x8000000010000000, X5 // MOV $-9223372036586340352, X5 // 9702000083b20200 + MOV $0x0abcdabcd0000000, X5 // MOV $773733740479250432, X5 // 9702000083b20200 + MOV $0x8abcdabcd0000000, X5 // MOV $-8449638296375525376, X5 // 9702000083b20200 +- MOV $0x1ffffffff0000000, X5 // MOV $2305843008945258496, X5 // 9702000083b20200 +- MOV $0x7fffffffffffffff, X5 // MOV $9223372036854775807, X5 // 9702000083b20200 + MOV $0xfff0000000ffffff, X5 // MOV $-4503599610593281, X5 // 9702000083b20200 + + MOV (X5), X6 // 03b30200 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 83d06f09f1..b7989ddbd7 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2094,22 +2094,35 @@ func encodingForAs(as obj.As) (*encoding, error) { + return &insData.enc, nil + } + +-// splitShiftConst attempts to split a constant into a signed 32 bit integer +-// and a corresponding left shift. +-func splitShiftConst(v int64) (imm int64, lsh int, ok bool) { ++// splitShiftConst attempts to split a constant into a signed 12 bit or ++// 32 bit integer, with corresponding logical right shift and/or left shift. ++func splitShiftConst(v int64) (imm int64, lsh int, rsh int, ok bool) { ++ // See if we can reconstruct this value from a signed 32 bit integer. + lsh = bits.TrailingZeros64(uint64(v)) + c := v >> lsh +- if int64(int32(c)) != c { +- return 0, 0, false ++ if int64(int32(c)) == c { ++ return c, lsh, 0, true + } +- return c, lsh, true ++ ++ // See if we can reconstruct this value from a small negative constant. ++ rsh = bits.LeadingZeros64(uint64(v)) ++ ones := bits.OnesCount64((uint64(v) >> lsh) >> 11) ++ c = signExtend(1<<11|((v>>lsh)&0x7ff), 12) ++ if rsh+ones+lsh+11 == 64 { ++ if lsh > 0 || c != -1 { ++ lsh += rsh ++ } ++ return c, lsh, rsh, true ++ } ++ ++ return 0, 0, 0, false + } + + // isShiftConst indicates whether a constant can be represented as a signed + // 32 bit integer that is left shifted. + func isShiftConst(v int64) bool { +- _, lsh, ok := splitShiftConst(v) +- return ok && lsh > 0 ++ _, lsh, rsh, ok := splitShiftConst(v) ++ return ok && (lsh > 0 || rsh > 0) + } + + type instruction struct { +@@ -2386,16 +2399,34 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + // For constants larger than 32 bits in size that have trailing zeros, + // use the value with the trailing zeros removed and then use a SLLI + // instruction to restore the original constant. ++ // + // For example: +- // MOV $0x8000000000000000, X10 ++ // MOV $0x8000000000000000, X10 + // becomes +- // MOV $1, X10 +- // SLLI $63, X10, X10 +- var insSLLI *instruction ++ // MOV $1, X10 ++ // SLLI $63, X10, X10 ++ // ++ // Similarly, we can construct large constants that have a consecutive ++ // sequence of ones from a small negative constant, with a right and/or ++ // left shift. 
++ // ++ // For example: ++ // MOV $0x000fffffffffffda, X10 ++ // becomes ++ // MOV $-19, X10 ++ // SLLI $13, X10 ++ // SRLI $12, X10 ++ // ++ var insSLLI, insSRLI *instruction + if err := immIFits(ins.imm, 32); err != nil { +- if c, lsh, ok := splitShiftConst(ins.imm); ok { ++ if c, lsh, rsh, ok := splitShiftConst(ins.imm); ok { + ins.imm = c +- insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(lsh)} ++ if lsh > 0 { ++ insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(lsh)} ++ } ++ if rsh > 0 { ++ insSRLI = &instruction{as: ASRLI, rd: ins.rd, rs1: ins.rd, imm: int64(rsh)} ++ } + } + } + +@@ -2422,6 +2453,9 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + if insSLLI != nil { + inss = append(inss, insSLLI) + } ++ if insSRLI != nil { ++ inss = append(inss, insSRLI) ++ } + + case p.From.Type == obj.TYPE_CONST && p.To.Type != obj.TYPE_REG: + p.Ctxt.Diag("%v: constant load must target register", p) +diff --git a/src/cmd/internal/obj/riscv/obj_test.go b/src/cmd/internal/obj/riscv/obj_test.go +index 688f262d8f..87b31e5a89 100644 +--- a/src/cmd/internal/obj/riscv/obj_test.go ++++ b/src/cmd/internal/obj/riscv/obj_test.go +@@ -14,29 +14,30 @@ func TestSplitShiftConst(t *testing.T) { + v int64 + wantImm int64 + wantLsh int ++ wantRsh int + wantOk bool + }{ +- {0x100000000, 1, 32, true}, +- {0xfffff001, 0, 0, false}, +- {0xfffff801, 0, 0, false}, +- {0xfffffff1, 0, 0, false}, +- {0xffffffff, 0, 0, false}, +- {0xfffffffe, 0x7fffffff, 1, true}, +- {0xfffffffffffda, 0, 0, false}, +- {0xfffffffffffde, 0, 0, false}, +- {0x000003ffffffffff, 0, 0, false}, +- {0x0007ffffffffffff, 0, 0, false}, +- {0x7fffffff00000000, 0x7fffffff, 32, true}, +- {0x7fffffffffffffff, 0, 0, false}, +- {0x7f7f7f7f7f7f7f7f, 0, 0, false}, +- {0x0080000010000000, 0x8000001, 28, true}, +- {0x0abcdabcd0000000, 0, 0, false}, +- {-4503599610593281, 0, 0, false}, // 0x8abcdabcd0000000 +- {-7543254330000000, 0, 0, false}, // 0xfff0000000ffffff ++ {0x100000000, 1, 32, 0, true}, ++ {0xfffff001, 0, 0, 0, false}, ++ {0xfffff801, -2047, 32, 32, true}, ++ {0xfffffff1, -15, 32, 32, true}, ++ {0xffffffff, -1, 0, 32, true}, ++ {0xfffffffe, 0x7fffffff, 1, 0, true}, ++ {0xfffffffffffda, -19, 13, 12, true}, ++ {0xfffffffffffde, -17, 13, 12, true}, ++ {0x000003ffffffffff, -1, 0, 22, true}, ++ {0x0007ffffffffffff, -1, 0, 13, true}, ++ {0x7fffffff00000000, 0x7fffffff, 32, 0, true}, ++ {0x7fffffffffffffff, -1, 0, 1, true}, ++ {0x7f7f7f7f7f7f7f7f, 0, 0, 0, false}, ++ {0x0080000010000000, 0x8000001, 28, 0, true}, ++ {0x0abcdabcd0000000, 0, 0, 0, false}, ++ {-4503599610593281, 0, 0, 0, false}, // 0x8abcdabcd0000000 ++ {-7543254330000000, 0, 0, 0, false}, // 0xfff0000000ffffff + } + for _, test := range tests { + t.Run(fmt.Sprintf("0x%x", test.v), func(t *testing.T) { +- c, l, ok := splitShiftConst(test.v) ++ c, l, r, ok := splitShiftConst(test.v) + + if got, want := c, test.wantImm; got != want { + t.Errorf("Got immediate %d, want %d", got, want) +@@ -44,6 +45,9 @@ func TestSplitShiftConst(t *testing.T) { + if got, want := l, test.wantLsh; got != want { + t.Errorf("Got left shift %d, want %d", got, want) + } ++ if got, want := r, test.wantRsh; got != want { ++ t.Errorf("Got right shift %d, want %d", got, want) ++ } + switch { + case !ok && test.wantOk: + t.Error("Failed to split shift constant, want success") +@@ -54,8 +58,12 @@ func TestSplitShiftConst(t *testing.T) { + return + } + +- // Reconstruct as a 32 bit signed constant. 
+- v := int64(uint64(int32(test.wantImm)) << l) ++ // Reconstruct as either a 12 bit or 32 bit signed constant. ++ s := 64 - 12 ++ v := int64((uint64(((c << s) >> s)) << l) >> r) ++ if test.wantImm != ((test.wantImm << s) >> s) { ++ v = int64((uint64(int32(test.wantImm)) << l) >> r) ++ } + if v != test.v { + t.Errorf("Got v = %d (%x), want v = %d (%x)", v, v, test.v, test.v) + } +-- +2.39.5 + diff --git a/2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch b/2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch new file mode 100644 index 0000000..614e235 --- /dev/null +++ b/2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch @@ -0,0 +1,125 @@ +From 753bc12d386d9481c83086003a7dd85fcab1d9ec Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 088/119] cmd/compile/internal/ssa: optimise more branches with + zero on riscv64 + +Optimise more branches with zero on riscv64. In particular, BLTU with +zero occurs with IsInBounds checks for index zero. This currently results +in two instructions and requires an additional register: + + li t2, 0 + bltu t2, t1, 0x174b4 + +This is equivalent to checking if the bounds is not equal to zero. With +this change: + + bnez t1, 0x174c0 + +This removes more than 500 instructions from the Go binary on riscv64. + +Change-Id: I6cd861d853e3ef270bd46dacecdfaa205b1c4644 +Reviewed-on: https://go-review.googlesource.com/c/go/+/606715 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 18 +++++++------- + .../compile/internal/ssa/rewriteRISCV64.go | 24 +++++++++++++++++++ + test/codegen/compare_and_branch.go | 10 ++++++++ + 3 files changed, 44 insertions(+), 8 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 58cadc8944..93f4e6a948 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -545,14 +545,16 @@ + (BNEZ (SLTIU [x] y) yes no) => (BLTU y (MOVDconst [x]) yes no) + + // Convert branch with zero to more optimal branch zero. +-(BEQ (MOVDconst [0]) cond yes no) => (BEQZ cond yes no) +-(BEQ cond (MOVDconst [0]) yes no) => (BEQZ cond yes no) +-(BNE (MOVDconst [0]) cond yes no) => (BNEZ cond yes no) +-(BNE cond (MOVDconst [0]) yes no) => (BNEZ cond yes no) +-(BLT (MOVDconst [0]) cond yes no) => (BGTZ cond yes no) +-(BLT cond (MOVDconst [0]) yes no) => (BLTZ cond yes no) +-(BGE (MOVDconst [0]) cond yes no) => (BLEZ cond yes no) +-(BGE cond (MOVDconst [0]) yes no) => (BGEZ cond yes no) ++(BEQ (MOVDconst [0]) cond yes no) => (BEQZ cond yes no) ++(BEQ cond (MOVDconst [0]) yes no) => (BEQZ cond yes no) ++(BNE (MOVDconst [0]) cond yes no) => (BNEZ cond yes no) ++(BNE cond (MOVDconst [0]) yes no) => (BNEZ cond yes no) ++(BLT (MOVDconst [0]) cond yes no) => (BGTZ cond yes no) ++(BLT cond (MOVDconst [0]) yes no) => (BLTZ cond yes no) ++(BLTU (MOVDconst [0]) cond yes no) => (BNEZ cond yes no) ++(BGE (MOVDconst [0]) cond yes no) => (BLEZ cond yes no) ++(BGE cond (MOVDconst [0]) yes no) => (BGEZ cond yes no) ++(BGEU (MOVDconst [0]) cond yes no) => (BEQZ cond yes no) + + // Remove redundant NEG from SEQZ/SNEZ. 
+ (SEQZ (NEG x)) => (SEQZ x) +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 8f8c902df8..c3018f270c 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -9277,6 +9277,18 @@ func rewriteBlockRISCV64(b *Block) bool { + b.resetWithControl(BlockRISCV64BGEZ, cond) + return true + } ++ case BlockRISCV64BGEU: ++ // match: (BGEU (MOVDconst [0]) cond yes no) ++ // result: (BEQZ cond yes no) ++ for b.Controls[0].Op == OpRISCV64MOVDconst { ++ v_0 := b.Controls[0] ++ if auxIntToInt64(v_0.AuxInt) != 0 { ++ break ++ } ++ cond := b.Controls[1] ++ b.resetWithControl(BlockRISCV64BEQZ, cond) ++ return true ++ } + case BlockRISCV64BLT: + // match: (BLT (MOVDconst [0]) cond yes no) + // result: (BGTZ cond yes no) +@@ -9300,6 +9312,18 @@ func rewriteBlockRISCV64(b *Block) bool { + b.resetWithControl(BlockRISCV64BLTZ, cond) + return true + } ++ case BlockRISCV64BLTU: ++ // match: (BLTU (MOVDconst [0]) cond yes no) ++ // result: (BNEZ cond yes no) ++ for b.Controls[0].Op == OpRISCV64MOVDconst { ++ v_0 := b.Controls[0] ++ if auxIntToInt64(v_0.AuxInt) != 0 { ++ break ++ } ++ cond := b.Controls[1] ++ b.resetWithControl(BlockRISCV64BNEZ, cond) ++ return true ++ } + case BlockRISCV64BNE: + // match: (BNE (MOVDconst [0]) cond yes no) + // result: (BNEZ cond yes no) +diff --git a/test/codegen/compare_and_branch.go b/test/codegen/compare_and_branch.go +index b3feef0eb7..fe25ebb3d3 100644 +--- a/test/codegen/compare_and_branch.go ++++ b/test/codegen/compare_and_branch.go +@@ -239,4 +239,14 @@ func ui64x0(x chan uint64) { + for <-x < 1 { + dummy() + } ++ ++ // riscv64:"BNEZ" ++ for 0 < <-x { ++ dummy() ++ } ++ ++ // riscv64:"BEQZ" ++ for 0 >= <-x { ++ dummy() ++ } + } +-- +2.39.5 + diff --git a/2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch b/2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch new file mode 100644 index 0000000..9e7e372 --- /dev/null +++ b/2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch @@ -0,0 +1,1327 @@ +From 7b35c25b80e46a548fb4f606f2be44f245062bf5 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 089/119] cmd/internal/obj/riscv: add support for vector + integer arithmetic instructions + +Add support for vector integer arithmetic instructions to the RISC-V +assembler. This includes vector addition, subtraction, integer +extension, add-with-carry, subtract-with-borrow, bitwise logical +operations, comparison, min/max, integer division and multiplication +instructions. 
+ +Change-Id: I8c191ef8e31291e13743732903e4f12356133a46 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646775 +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 317 ++++++++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 165 ++++++++ + .../internal/asm/testdata/riscv64validation.s | 225 +++++++++- + src/cmd/internal/obj/riscv/anames.go | 13 + + src/cmd/internal/obj/riscv/cpu.go | 13 + + src/cmd/internal/obj/riscv/obj.go | 393 +++++++++++++++++- + 6 files changed, 1101 insertions(+), 25 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 0b34bba032..852104375b 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -546,6 +546,323 @@ start: + VS4RV V4, (X11) // 27828562 + VS8RV V8, (X11) // 278485e2 + ++ // 31.11.1: Vector Single-Width Integer Add and Subtract ++ VADDVV V1, V2, V3 // d7812002 ++ VADDVV V1, V2, V0, V3 // d7812000 ++ VADDVX X10, V2, V3 // d7412502 ++ VADDVX X10, V2, V0, V3 // d7412500 ++ VADDVI $15, V2, V3 // d7b12702 ++ VADDVI $15, V2, V0, V3 // d7b12700 ++ VADDVI $-16, V2, V3 // d7312802 ++ VADDVI $-16, V2, V0, V3 // d7312800 ++ VSUBVV V1, V2, V3 // d781200a ++ VSUBVV V1, V2, V0, V3 // d7812008 ++ VSUBVX X10, V2, V3 // d741250a ++ VSUBVX X10, V2, V0, V3 // d7412508 ++ VRSUBVX X10, V2, V3 // d741250e ++ VRSUBVX X10, V2, V0, V3 // d741250c ++ VRSUBVI $15, V2, V0, V3 // d7b1270c ++ VRSUBVI $-16, V2, V0, V3 // d731280c ++ VNEGV V2, V3 // d741200e ++ VNEGV V2, V0, V3 // d741200c ++ ++ // 31.11.2: Vector Widening Integer Add/Subtract ++ VWADDUVV V1, V2, V3 // d7a120c2 ++ VWADDUVV V1, V2, V0, V3 // d7a120c0 ++ VWADDUVX X10, V2, V3 // d76125c2 ++ VWADDUVX X10, V2, V0, V3 // d76125c0 ++ VWSUBUVV V1, V2, V3 // d7a120ca ++ VWSUBUVV V1, V2, V0, V3 // d7a120c8 ++ VWSUBUVX X10, V2, V3 // d76125ca ++ VWSUBUVX X10, V2, V0, V3 // d76125c8 ++ VWADDVV V1, V2, V3 // d7a120c6 ++ VWADDVV V1, V2, V0, V3 // d7a120c4 ++ VWADDVX X10, V2, V3 // d76125c6 ++ VWADDVX X10, V2, V0, V3 // d76125c4 ++ VWSUBVV V1, V2, V3 // d7a120ce ++ VWSUBVV V1, V2, V0, V3 // d7a120cc ++ VWSUBVX X10, V2, V3 // d76125ce ++ VWSUBVX X10, V2, V0, V3 // d76125cc ++ VWADDUWV V1, V2, V3 // d7a120d2 ++ VWADDUWV V1, V2, V0, V3 // d7a120d0 ++ VWADDUWX X10, V2, V3 // d76125d2 ++ VWADDUWX X10, V2, V0, V3 // d76125d0 ++ VWSUBUWV V1, V2, V3 // d7a120da ++ VWSUBUWV V1, V2, V0, V3 // d7a120d8 ++ VWSUBUWX X10, V2, V3 // d76125da ++ VWSUBUWX X10, V2, V0, V3 // d76125d8 ++ VWADDWV V1, V2, V3 // d7a120d6 ++ VWADDWV V1, V2, V0, V3 // d7a120d4 ++ VWADDWX X10, V2, V3 // d76125d6 ++ VWADDWX X10, V2, V0, V3 // d76125d4 ++ VWSUBWV V1, V2, V3 // d7a120de ++ VWSUBWV V1, V2, V0, V3 // d7a120dc ++ VWSUBWX X10, V2, V3 // d76125de ++ VWSUBWX X10, V2, V0, V3 // d76125dc ++ VWCVTXXV V2, V3 // d76120c6 ++ VWCVTXXV V2, V0, V3 // d76120c4 ++ VWCVTUXXV V2, V3 // d76120c2 ++ VWCVTUXXV V2, V0, V3 // d76120c0 ++ ++ // 31.11.3: Vector Integer Extension ++ VZEXTVF2 V2, V3 // d721234a ++ VZEXTVF2 V2, V0, V3 // d7212348 ++ VSEXTVF2 V2, V3 // d7a1234a ++ VSEXTVF2 V2, V0, V3 // d7a12348 ++ VZEXTVF4 V2, V3 // d721224a ++ VZEXTVF4 V2, V0, V3 // d7212248 ++ VSEXTVF4 V2, V3 // d7a1224a ++ VSEXTVF4 V2, V0, V3 // d7a12248 ++ VZEXTVF8 V2, V3 // d721214a ++ VZEXTVF8 V2, V0, V3 // d7212148 ++ VSEXTVF8 V2, V3 // d7a1214a ++ VSEXTVF8 V2, V0, V3 // d7a12148 ++ ++ // 31.11.4: Vector Integer 
Add-with-Carry / Subtract-with-Borrow Instructions ++ VADCVVM V1, V2, V0, V3 // d7812040 ++ VADCVXM X11, V2, V0, V3 // d7c12540 ++ VADCVIM $15, V2, V0, V3 // d7b12740 ++ VMADCVVM V1, V2, V0, V3 // d7812044 ++ VMADCVXM X11, V2, V0, V3 // d7c12544 ++ VMADCVIM $15, V2, V0, V3 // d7b12744 ++ VMADCVV V1, V2, V3 // d7812046 ++ VMADCVX X11, V2, V3 // d7c12546 ++ VMADCVI $15, V2, V3 // d7b12746 ++ VSBCVVM V1, V2, V0, V3 // d7812048 ++ VSBCVXM X11, V2, V0, V3 // d7c12548 ++ VMSBCVVM V1, V2, V0, V3 // d781204c ++ VMSBCVXM X11, V2, V0, V3 // d7c1254c ++ VMSBCVV V1, V2, V3 // d781204e ++ VMSBCVX X11, V2, V3 // d7c1254e ++ ++ // 31.11.5: Vector Bitwise Logical Instructions ++ VANDVV V1, V2, V3 // d7812026 ++ VANDVV V1, V2, V0, V3 // d7812024 ++ VANDVX X11, V2, V3 // d7c12526 ++ VANDVX X11, V2, V0, V3 // d7c12524 ++ VANDVI $15, V2, V3 // d7b12726 ++ VANDVI $15, V2, V0, V3 // d7b12724 ++ VORVV V1, V2, V3 // d781202a ++ VORVV V1, V2, V0, V3 // d7812028 ++ VORVX X11, V2, V3 // d7c1252a ++ VORVX X11, V2, V0, V3 // d7c12528 ++ VORVI $15, V2, V3 // d7b1272a ++ VORVI $15, V2, V0, V3 // d7b12728 ++ VXORVV V1, V2, V3 // d781202e ++ VXORVV V1, V2, V0, V3 // d781202c ++ VXORVX X11, V2, V3 // d7c1252e ++ VXORVX X11, V2, V0, V3 // d7c1252c ++ VXORVI $15, V2, V3 // d7b1272e ++ VXORVI $15, V2, V0, V3 // d7b1272c ++ VNOTV V2, V3 // d7b12f2e ++ VNOTV V2, V0, V3 // d7b12f2c ++ ++ // 31.11.6: Vector Single-Width Shift Instructions ++ VSLLVV V1, V2, V3 // d7812096 ++ VSLLVV V1, V2, V0, V3 // d7812094 ++ VSLLVX X11, V2, V3 // d7c12596 ++ VSLLVX X11, V2, V0, V3 // d7c12594 ++ VSLLVI $15, V2, V3 // d7b12796 ++ VSLLVI $15, V2, V0, V3 // d7b12794 ++ VSRLVV V1, V2, V3 // d78120a2 ++ VSRLVV V1, V2, V0, V3 // d78120a0 ++ VSRLVX X11, V2, V3 // d7c125a2 ++ VSRLVX X11, V2, V0, V3 // d7c125a0 ++ VSRLVI $15, V2, V3 // d7b127a2 ++ VSRLVI $15, V2, V0, V3 // d7b127a0 ++ VSRAVV V1, V2, V3 // d78120a6 ++ VSRAVV V1, V2, V0, V3 // d78120a4 ++ VSRAVX X11, V2, V3 // d7c125a6 ++ VSRAVX X11, V2, V0, V3 // d7c125a4 ++ VSRAVI $15, V2, V3 // d7b127a6 ++ VSRAVI $15, V2, V0, V3 // d7b127a4 ++ ++ // 31.11.7: Vector Narrowing Integer Right Shift Instructions ++ VNSRLWV V1, V2, V3 // d78120b2 ++ VNSRLWV V1, V2, V0, V3 // d78120b0 ++ VNSRLWX X10, V2, V3 // d74125b2 ++ VNSRLWX X10, V2, V0, V3 // d74125b0 ++ VNSRLWI $31, V2, V3 // d7b12fb2 ++ VNSRLWI $31, V2, V0, V3 // d7b12fb0 ++ VNSRAWV V1, V2, V3 // d78120b6 ++ VNSRAWV V1, V2, V0, V3 // d78120b4 ++ VNSRAWX X10, V2, V3 // d74125b6 ++ VNSRAWX X10, V2, V0, V3 // d74125b4 ++ VNSRAWI $31, V2, V3 // d7b12fb6 ++ VNSRAWI $31, V2, V0, V3 // d7b12fb4 ++ VNCVTXXW V2, V3 // d74120b2 ++ VNCVTXXW V2, V0, V3 // d74120b0 ++ ++ // 31.11.8: Vector Integer Compare Instructions ++ VMSEQVV V1, V2, V3 // d7812062 ++ VMSEQVV V1, V2, V0, V3 // d7812060 ++ VMSEQVX X10, V2, V3 // d7412562 ++ VMSEQVX X10, V2, V0, V3 // d7412560 ++ VMSEQVI $15, V2, V3 // d7b12762 ++ VMSEQVI $15, V2, V0, V3 // d7b12760 ++ VMSNEVV V1, V2, V3 // d7812066 ++ VMSNEVV V1, V2, V0, V3 // d7812064 ++ VMSNEVX X10, V2, V3 // d7412566 ++ VMSNEVX X10, V2, V0, V3 // d7412564 ++ VMSNEVI $15, V2, V3 // d7b12766 ++ VMSNEVI $15, V2, V0, V3 // d7b12764 ++ VMSLTUVV V1, V2, V3 // d781206a ++ VMSLTUVV V1, V2, V0, V3 // d7812068 ++ VMSLTUVX X10, V2, V3 // d741256a ++ VMSLTUVX X10, V2, V0, V3 // d7412568 ++ VMSLTVV V1, V2, V3 // d781206e ++ VMSLTVV V1, V2, V0, V3 // d781206c ++ VMSLTVX X10, V2, V3 // d741256e ++ VMSLTVX X10, V2, V0, V3 // d741256c ++ VMSLEUVV V1, V2, V3 // d7812072 ++ VMSLEUVV V1, V2, V0, V3 // d7812070 ++ VMSLEUVX X10, V2, V3 // d7412572 ++ VMSLEUVX X10, 
V2, V0, V3 // d7412570 ++ VMSLEUVI $15, V2, V3 // d7b12772 ++ VMSLEUVI $15, V2, V0, V3 // d7b12770 ++ VMSLEVV V1, V2, V3 // d7812076 ++ VMSLEVV V1, V2, V0, V3 // d7812074 ++ VMSLEVX X10, V2, V3 // d7412576 ++ VMSLEVX X10, V2, V0, V3 // d7412574 ++ VMSLEVI $15, V2, V3 // d7b12776 ++ VMSLEVI $15, V2, V0, V3 // d7b12774 ++ VMSGTUVX X10, V2, V3 // d741257a ++ VMSGTUVX X10, V2, V0, V3 // d7412578 ++ VMSGTUVI $15, V2, V3 // d7b1277a ++ VMSGTUVI $15, V2, V0, V3 // d7b12778 ++ VMSGTVX X10, V2, V3 // d741257e ++ VMSGTVX X10, V2, V0, V3 // d741257c ++ VMSGTVI $15, V2, V3 // d7b1277e ++ VMSGTVI $15, V2, V0, V3 // d7b1277c ++ VMSGTVV V1, V2, V3 // d701116e ++ VMSGTVV V1, V2, V0, V3 // d701116c ++ VMSGTUVV V1, V2, V3 // d701116a ++ VMSGTUVV V1, V2, V0, V3 // d7011168 ++ VMSGEVV V1, V2, V3 // d7011176 ++ VMSGEVV V1, V2, V0, V3 // d7011174 ++ VMSGEUVV V1, V2, V3 // d7011172 ++ VMSGEUVV V1, V2, V0, V3 // d7011170 ++ VMSLTVI $15, V2, V3 // d7312776 ++ VMSLTVI $15, V2, V0, V3 // d7312774 ++ VMSLTUVI $15, V2, V3 // d7312772 ++ VMSLTUVI $15, V2, V0, V3 // d7312770 ++ VMSGEVI $15, V2, V3 // d731277e ++ VMSGEVI $15, V2, V0, V3 // d731277c ++ VMSGEUVI $15, V2, V3 // d731277a ++ VMSGEUVI $15, V2, V0, V3 // d7312778 ++ ++ // 31.11.9: Vector Integer Min/Max Instructions ++ VMINUVV V1, V2, V3 // d7812012 ++ VMINUVV V1, V2, V0, V3 // d7812010 ++ VMINUVX X10, V2, V3 // d7412512 ++ VMINUVX X10, V2, V0, V3 // d7412510 ++ VMINVV V1, V2, V3 // d7812016 ++ VMINVV V1, V2, V0, V3 // d7812014 ++ VMINVX X10, V2, V3 // d7412516 ++ VMINVX X10, V2, V0, V3 // d7412514 ++ VMAXUVV V1, V2, V3 // d781201a ++ VMAXUVV V1, V2, V0, V3 // d7812018 ++ VMAXUVX X10, V2, V3 // d741251a ++ VMAXUVX X10, V2, V0, V3 // d7412518 ++ VMAXVV V1, V2, V3 // d781201e ++ VMAXVV V1, V2, V0, V3 // d781201c ++ VMAXVX X10, V2, V3 // d741251e ++ VMAXVX X10, V2, V0, V3 // d741251c ++ ++ // 31.11.10: Vector Single-Width Integer Multiply Instructions ++ VMULVV V1, V2, V3 // d7a12096 ++ VMULVV V1, V2, V0, V3 // d7a12094 ++ VMULVX X10, V2, V3 // d7612596 ++ VMULVX X10, V2, V0, V3 // d7612594 ++ VMULHVV V1, V2, V3 // d7a1209e ++ VMULHVV V1, V2, V0, V3 // d7a1209c ++ VMULHVX X10, V2, V3 // d761259e ++ VMULHVX X10, V2, V0, V3 // d761259c ++ VMULHUVV V1, V2, V3 // d7a12092 ++ VMULHUVV V1, V2, V0, V3 // d7a12090 ++ VMULHUVX X10, V2, V3 // d7612592 ++ VMULHUVX X10, V2, V0, V3 // d7612590 ++ VMULHSUVV V1, V2, V3 // d7a1209a ++ VMULHSUVV V1, V2, V0, V3 // d7a12098 ++ VMULHSUVX X10, V2, V3 // d761259a ++ VMULHSUVX X10, V2, V0, V3 // d7612598 ++ ++ // 31.11.11: Vector Integer Divide Instructions ++ VDIVUVV V1, V2, V3 // d7a12082 ++ VDIVUVV V1, V2, V0, V3 // d7a12080 ++ VDIVUVX X10, V2, V3 // d7612582 ++ VDIVUVX X10, V2, V0, V3 // d7612580 ++ VDIVVV V1, V2, V3 // d7a12086 ++ VDIVVV V1, V2, V0, V3 // d7a12084 ++ VDIVVX X10, V2, V3 // d7612586 ++ VDIVVX X10, V2, V0, V3 // d7612584 ++ VREMUVV V1, V2, V3 // d7a1208a ++ VREMUVV V1, V2, V0, V3 // d7a12088 ++ VREMUVX X10, V2, V3 // d761258a ++ VREMUVX X10, V2, V0, V3 // d7612588 ++ VREMVV V1, V2, V3 // d7a1208e ++ VREMVV V1, V2, V0, V3 // d7a1208c ++ VREMVX X10, V2, V3 // d761258e ++ VREMVX X10, V2, V0, V3 // d761258c ++ ++ // 31.11.12: Vector Widening Integer Multiply Instructions ++ VWMULVV V1, V2, V3 // d7a120ee ++ VWMULVV V1, V2, V0, V3 // d7a120ec ++ VWMULVX X10, V2, V3 // d76125ee ++ VWMULVX X10, V2, V0, V3 // d76125ec ++ VWMULUVV V1, V2, V3 // d7a120e2 ++ VWMULUVV V1, V2, V0, V3 // d7a120e0 ++ VWMULUVX X10, V2, V3 // d76125e2 ++ VWMULUVX X10, V2, V0, V3 // d76125e0 ++ VWMULSUVV V1, V2, V3 // d7a120ea ++ VWMULSUVV V1, V2, V0, 
V3 // d7a120e8 ++ VWMULSUVX X10, V2, V3 // d76125ea ++ VWMULSUVX X10, V2, V0, V3 // d76125e8 ++ ++ // 31.11.13: Vector Single-Width Integer Multiply-Add Instructions ++ VMACCVV V1, V2, V3 // d7a120b6 ++ VMACCVV V1, V2, V0, V3 // d7a120b4 ++ VMACCVX X10, V2, V3 // d76125b6 ++ VMACCVX X10, V2, V0, V3 // d76125b4 ++ VNMSACVV V1, V2, V3 // d7a120be ++ VNMSACVV V1, V2, V0, V3 // d7a120bc ++ VNMSACVX X10, V2, V3 // d76125be ++ VNMSACVX X10, V2, V0, V3 // d76125bc ++ VMADDVV V1, V2, V3 // d7a120a6 ++ VMADDVV V1, V2, V0, V3 // d7a120a4 ++ VMADDVX X10, V2, V3 // d76125a6 ++ VMADDVX X10, V2, V0, V3 // d76125a4 ++ VNMSUBVV V1, V2, V3 // d7a120ae ++ VNMSUBVV V1, V2, V0, V3 // d7a120ac ++ VNMSUBVX X10, V2, V3 // d76125ae ++ VNMSUBVX X10, V2, V0, V3 // d76125ac ++ ++ // 31.11.14: Vector Widening Integer Multiply-Add Instructions ++ VWMACCUVV V1, V2, V3 // d7a120f2 ++ VWMACCUVV V1, V2, V0, V3 // d7a120f0 ++ VWMACCUVX X10, V2, V3 // d76125f2 ++ VWMACCUVX X10, V2, V0, V3 // d76125f0 ++ VWMACCVV V1, V2, V3 // d7a120f6 ++ VWMACCVV V1, V2, V0, V3 // d7a120f4 ++ VWMACCVX X10, V2, V3 // d76125f6 ++ VWMACCVX X10, V2, V0, V3 // d76125f4 ++ VWMACCSUVV V1, V2, V3 // d7a120fe ++ VWMACCSUVV V1, V2, V0, V3 // d7a120fc ++ VWMACCSUVX X10, V2, V3 // d76125fe ++ VWMACCSUVX X10, V2, V0, V3 // d76125fc ++ VWMACCUSVX X10, V2, V3 // d76125fa ++ VWMACCUSVX X10, V2, V0, V3 // d76125f8 ++ ++ // 31.11.15: Vector Integer Merge Instructions ++ VMERGEVVM V1, V2, V0, V3 // d781205c ++ VMERGEVXM X10, V2, V0, V3 // d741255c ++ VMERGEVIM $15, V2, V0, V3 // d7b1275c ++ ++ // 31.11.16: Vector Integer Move Instructions ++ VMVVV V2, V3 // d701015e ++ VMVVX X10, V3 // d741055e ++ VMVVI $15, V3 // d7b1075e ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 005b794612..025d63a15c 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -46,6 +46,10 @@ TEXT errors(SB),$0 + SRAIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" + SD X5, 4294967296(X6) // ERROR "constant 4294967296 too large" + FNES F1, (X5) // ERROR "needs an integer register output" ++ ++ // ++ // "V" Standard Extension for Vector Operations, Version 1.0 ++ // + VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" + VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" + VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" +@@ -57,4 +61,165 @@ TEXT errors(SB),$0 + VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" + VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" + VS1RV V3, V0, (X11) // ERROR "too many operands for instruction" ++ VADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADDVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VADDVI $15, V4, V1, V2 // ERROR "invalid vector mask register" ++ VSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VRSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VRSUBVI $15, V4, V1, V2 // ERROR "invalid vector mask register" ++ VNEGV V2, V3, V4 // ERROR "invalid vector mask register" ++ VWADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDVV V1, V2, V4, V3 // ERROR 
"invalid vector mask register" ++ VWADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWCVTXXV V2, V1, V3 // ERROR "invalid vector mask register" ++ VWCVTUXXV V2, V1, V3 // ERROR "invalid vector mask register" ++ VZEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" ++ VZEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" ++ VZEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" ++ VADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" ++ VMADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VANDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VANDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VANDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNOTV V1, V2, V3 // ERROR "invalid vector mask register" ++ VSLLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ 
VSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWI $31, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWI $31, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCVTXXW V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHSUVV V1, V2, V4, V3 
// ERROR "invalid vector mask register" ++ VMULHSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSACVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSACVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUSVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVIM $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMVVV V1, V2, V3 // ERROR "too many operands for instruction" ++ VMVVX X10, V2, V3 // ERROR "too many operands for instruction" ++ VMVVI $15, V2, V3 // ERROR "too many operands for instruction" ++ + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 773f275dd3..602cab2c2e 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -11,36 +11,231 @@ + TEXT validation(SB),$0 + SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" + SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" ++ ++ // ++ // "V" Standard Extension for Vector Operations, Version 1.0 ++ // + VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" + VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" + VSETVL X10, X11 // ERROR "expected integer register in rs1 position" +- VLE8V (X10), X10 // ERROR "expected 
vector register in rd position" ++ VLE8V (X10), X10 // ERROR "expected vector register in vd position" + VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" +- VSE8V X10, (X10) // ERROR "expected vector register in rs1 position" ++ VSE8V X10, (X10) // ERROR "expected vector register in vs1 position" + VSE8V V3, (V1) // ERROR "expected integer register in rd position" + VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" +- VLSE8V (X10), X10, X11 // ERROR "expected vector register in rd position" ++ VLSE8V (X10), X10, X11 // ERROR "expected vector register in vd position" + VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" + VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" + VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" +- VSSE8V X10, X11, (X10) // ERROR "expected vector register in rd position" ++ VSSE8V X10, X11, (X10) // ERROR "expected vector register in vd position" + VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" + VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" + VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" +- VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" ++ VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" + VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" +- VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" ++ VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" + VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" +- VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" ++ VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" + VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" +- VL1RV (X10), X10 // ERROR "expected vector register in rd position" ++ VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" ++ VL1RV (X10), X10 // ERROR "expected vector register in vd position" + VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" +- VS1RV X11, (X11) // ERROR "expected vector register in rs1 position" ++ VS1RV X11, (X11) // ERROR "expected vector register in vs1 position" + VS1RV V3, (V1) // ERROR "expected integer register in rd position" ++ VADDVV V1, X10, V3 // ERROR "expected vector register in vs2 position" ++ VADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VADDVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] 
(5 bits)" ++ VADDVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" ++ VSUBVV V1, X10, V3 // ERROR "expected vector register in vs2 position" ++ VSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRSUBVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" ++ VRSUBVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" ++ VNEGV X10, V3 // ERROR "expected vector register in vs2 position" ++ VNEGV V2 // ERROR "expected vector register in vd position" ++ VWADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWCVTXXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VWCVTUXXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" ++ VADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVV X10, V2, V3 // ERROR "expected vector register in vs1 
position" ++ VMADCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMADCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMADCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMADCVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMADCVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVI $15, V2, V0, V3 // ERROR "expected no register in rs3" ++ VSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSBCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMSBCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" ++ VANDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VANDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VANDVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VANDVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VXORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VXORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VXORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VXORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VNOTV V3 // ERROR "expected vector register in vd position" ++ VNOTV X10, V3 // ERROR "expected vector register in vs2 position" ++ VSLLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSLLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSLLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSLLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSRLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSRLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSRAVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSRAVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNSRLWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNSRLWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNSRLWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VNSRLWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNSRAWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNSRAWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNSRAWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in 
range [0, 31]" ++ VNSRAWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNCVTXXW X10, V3 // ERROR "expected vector register in vs2 position" ++ VMSEQVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSEQVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSEQVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSEQVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSNEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSNEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSNEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSNEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLTUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLTVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLEUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLEUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGTUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSGTUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGTUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGTVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSGTVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGTVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGEUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSLTVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLTVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLTUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLTUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGEVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGEUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMINUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMINUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMINVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMINVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMAXUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ 
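The immediate-range messages matched throughout this validation file reduce to two 5-bit checks in the new validators (wantImmI and wantImmU called with a width of 5): signed immediates must lie in [-16, 15] and unsigned shift amounts in [0, 31]. A reference-only restatement of those bounds, not taken from the patch itself:

    package main

    import "fmt"

    // fitsSigned and fitsUnsigned restate the 5-bit ranges behind the
    // "[-16, 15]" and "[0, 31]" errors in this file; the assembler's own
    // checks are wantImmI and wantImmU.
    func fitsSigned(x int64, bits uint) bool   { return x >= -1<<(bits-1) && x <= 1<<(bits-1)-1 }
    func fitsUnsigned(x int64, bits uint) bool { return x >= 0 && x <= 1<<bits-1 }

    func main() {
        fmt.Println(fitsSigned(15, 5), fitsSigned(16, 5))     // true false
        fmt.Println(fitsUnsigned(31, 5), fitsUnsigned(32, 5)) // true false
    }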
VMAXUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMAXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMAXVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VDIVUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VDIVUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VDIVVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VDIVVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VREMUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREMUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VREMVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREMVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNMSACVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSACVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNMSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCUSVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMERGEVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMERGEVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMERGEVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMERGEVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMVVV X10, V3 // ERROR "expected vector register in vs1 position" ++ VMVVX V1, V2 // ERROR "expected integer register in rs1 position" ++ VMVVI $16, V2 // ERROR "signed immediate 16 must be in 
range [-16, 15]" ++ VMVVI $-17, V2 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ + RET +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index 6df5f0a173..a65dfceea9 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -654,5 +654,18 @@ var Anames = []string{ + "VL2RV", + "VL4RV", + "VL8RV", ++ "VMSGEUVI", ++ "VMSGEUVV", ++ "VMSGEVI", ++ "VMSGEVV", ++ "VMSGTUVV", ++ "VMSGTVV", ++ "VMSLTUVI", ++ "VMSLTVI", ++ "VNCVTXXW", ++ "VNEGV", ++ "VNOTV", ++ "VWCVTUXXV", ++ "VWCVTXXV", + "LAST", + } +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 8999ef149b..577b06f0ec 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1172,6 +1172,19 @@ const ( + AVL2RV + AVL4RV + AVL8RV ++ AVMSGEUVI ++ AVMSGEUVV ++ AVMSGEVI ++ AVMSGEVV ++ AVMSGTUVV ++ AVMSGTVV ++ AVMSLTUVI ++ AVMSLTVI ++ AVNCVTXXW ++ AVNEGV ++ AVNOTV ++ AVWCVTUXXV ++ AVWCVTXXV + + // End marker + ALAST +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index b7989ddbd7..d85bdd302c 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1203,6 +1203,43 @@ func validateRFF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateRVIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVVi(ctxt *obj.Link, ins *instruction) { ++ wantImmI(ctxt, ins, ins.imm, 5) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVVu(ctxt *obj.Link, ins *instruction) { ++ wantImmU(ctxt, ins, ins.imm, 5) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVVV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantVectorReg(ctxt, ins, "vs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateIII(ctxt *obj.Link, ins *instruction) { + wantImmI(ctxt, ins, ins.imm, 12) + wantIntReg(ctxt, ins, "rd", ins.rd) +@@ -1220,23 +1257,23 @@ func validateIF(ctxt *obj.Link, ins *instruction) { + } + + func validateIV(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) + wantNoneReg(ctxt, ins, "rs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateIIIV(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) + wantIntReg(ctxt, ins, "rs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateIVIV(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +- wantVectorReg(ctxt, 
ins, "rs2", ins.rs2) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + +@@ -1258,22 +1295,22 @@ func validateSF(ctxt *obj.Link, ins *instruction) { + + func validateSV(ctxt *obj.Link, ins *instruction) { + wantIntReg(ctxt, ins, "rd", ins.rd) +- wantVectorReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs1", ins.rs1) + wantNoneReg(ctxt, ins, "rs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateSVII(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) + wantIntReg(ctxt, ins, "rs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateSVIV(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +- wantVectorReg(ctxt, ins, "rs2", ins.rs2) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + +@@ -1350,11 +1387,15 @@ func encodeR(as obj.As, rs1, rs2, rd, funct3, funct7 uint32) uint32 { + if enc == nil { + panic("encodeR: could not encode instruction") + } ++ if enc.rs1 != 0 && rs1 != 0 { ++ panic("encodeR: instruction uses rs1, but rs1 is nonzero") ++ } + if enc.rs2 != 0 && rs2 != 0 { +- panic("encodeR: instruction uses rs2, but rs2 was nonzero") ++ panic("encodeR: instruction uses rs2, but rs2 is nonzero") + } + funct3 |= enc.funct3 + funct7 |= enc.funct7 ++ rs1 |= enc.rs1 + rs2 |= enc.rs2 + return funct7<<25 | rs2<<20 | rs1<<15 | funct3<<12 | rd<<7 | enc.opcode + } +@@ -1407,6 +1448,26 @@ func encodeRFF(ins *instruction) uint32 { + return encodeR(ins.as, regF(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7) + } + ++func encodeRVV(ins *instruction) uint32 { ++ return encodeR(ins.as, 0, regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVVi(ins *instruction) uint32 { ++ return encodeR(ins.as, immI(ins.as, ins.imm, 5), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVVu(ins *instruction) uint32 { ++ return encodeR(ins.as, immU(ins.as, ins.imm, 5), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVIV(ins *instruction) uint32 { ++ return encodeR(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVVV(ins *instruction) uint32 { ++ return encodeR(ins.as, regV(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ + // encodeI encodes an I-type RISC-V instruction. 
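As a cross-check of the new vector encoders against the hex strings in riscv64.s, the following standalone sketch (not part of the patch) rebuilds the word that the rVVV path effectively produces for the unmasked "VMINUVV V1, V2, V3 // d7812012" case above, assuming the standard RVV layout funct6 | vm | vs2 | vs1 | funct3 | vd | opcode; the testdata comment is simply the little-endian byte form of that word:

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func main() {
        const (
            opcodeOPV = 0x57 // OP-V major opcode
            funct3IVV = 0x0  // OPIVV: vector-vector integer operands
            funct6    = 0x04 // VMINU
        )
        var vd, vs1, vs2, vm uint32 = 3, 1, 2, 1 // V3, V1, V2, unmasked (vm=1)

        ins := funct6<<26 | vm<<25 | vs2<<20 | vs1<<15 | funct3IVV<<12 | vd<<7 | opcodeOPV

        var b [4]byte
        binary.LittleEndian.PutUint32(b[:], ins)
        fmt.Printf("%08x -> % x\n", ins, b) // 122081d7 -> d7 81 20 12
    }

Clearing vm (the masked "VMINUVV V1, V2, V0, V3" form) flips only bit 25, giving 102081d7, i.e. the d7812010 listed alongside it. This is the same vm bit that instructionsForProg sets via funct7 |= 1 when no mask operand is supplied, and any mask register other than V0 in that slot is rejected with "invalid vector mask register", which is what the riscv64error.s additions above exercise.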
+ func encodeI(as obj.As, rs1, rd, imm, funct7 uint32) uint32 { + enc := encode(as) +@@ -1690,6 +1751,11 @@ var ( + rFIEncoding = encoding{encode: encodeRFI, validate: validateRFI, length: 4} + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} + rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} ++ rVVEncoding = encoding{encode: encodeRVV, validate: validateRVV, length: 4} ++ rVViEncoding = encoding{encode: encodeRVVi, validate: validateRVVi, length: 4} ++ rVVuEncoding = encoding{encode: encodeRVVu, validate: validateRVVu, length: 4} ++ rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4} ++ rVVVEncoding = encoding{encode: encodeRVVV, validate: validateRVVV, length: 4} + + iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4} + iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} +@@ -2027,7 +2093,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVSOXEI32V & obj.AMask: {enc: sVIVEncoding}, + AVSOXEI64V & obj.AMask: {enc: sVIVEncoding}, + +- // 31.7.9. Vector Load/Store Whole Register Instructions ++ // 31.7.9: Vector Load/Store Whole Register Instructions + AVL1RE8V & obj.AMask: {enc: iVEncoding}, + AVL1RE16V & obj.AMask: {enc: iVEncoding}, + AVL1RE32V & obj.AMask: {enc: iVEncoding}, +@@ -2049,6 +2115,177 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVS4RV & obj.AMask: {enc: sVEncoding}, + AVS8RV & obj.AMask: {enc: sVEncoding}, + ++ // 31.11.1: Vector Single-Width Integer Add and Subtract ++ AVADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVADDVI & obj.AMask: {enc: rVViEncoding}, ++ AVSUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ AVRSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ AVRSUBVI & obj.AMask: {enc: rVViEncoding}, ++ ++ // 31.11.2: Vector Widening Integer Add/Subtract ++ AVWADDUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWADDUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWSUBUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWSUBUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWSUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWADDUWV & obj.AMask: {enc: rVVVEncoding}, ++ AVWADDUWX & obj.AMask: {enc: rVIVEncoding}, ++ AVWSUBUWV & obj.AMask: {enc: rVVVEncoding}, ++ AVWSUBUWX & obj.AMask: {enc: rVIVEncoding}, ++ AVWADDWV & obj.AMask: {enc: rVVVEncoding}, ++ AVWADDWX & obj.AMask: {enc: rVIVEncoding}, ++ AVWSUBWV & obj.AMask: {enc: rVVVEncoding}, ++ AVWSUBWX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.3: Vector Integer Extension ++ AVZEXTVF2 & obj.AMask: {enc: rVVEncoding}, ++ AVSEXTVF2 & obj.AMask: {enc: rVVEncoding}, ++ AVZEXTVF4 & obj.AMask: {enc: rVVEncoding}, ++ AVSEXTVF4 & obj.AMask: {enc: rVVEncoding}, ++ AVZEXTVF8 & obj.AMask: {enc: rVVEncoding}, ++ AVSEXTVF8 & obj.AMask: {enc: rVVEncoding}, ++ ++ // 31.11.4: Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions ++ AVADCVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVADCVXM & obj.AMask: {enc: rVIVEncoding}, ++ AVADCVIM & obj.AMask: {enc: rVViEncoding}, ++ AVMADCVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVMADCVXM & obj.AMask: {enc: rVIVEncoding}, ++ AVMADCVIM & obj.AMask: {enc: rVViEncoding}, ++ AVMADCVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMADCVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMADCVI & obj.AMask: {enc: rVViEncoding}, ++ AVSBCVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVSBCVXM & 
obj.AMask: {enc: rVIVEncoding}, ++ AVMSBCVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVMSBCVXM & obj.AMask: {enc: rVIVEncoding}, ++ AVMSBCVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSBCVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.5: Vector Bitwise Logical Instructions ++ AVANDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVANDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVANDVI & obj.AMask: {enc: rVViEncoding}, ++ AVORVV & obj.AMask: {enc: rVVVEncoding}, ++ AVORVX & obj.AMask: {enc: rVIVEncoding}, ++ AVORVI & obj.AMask: {enc: rVViEncoding}, ++ AVXORVV & obj.AMask: {enc: rVVVEncoding}, ++ AVXORVX & obj.AMask: {enc: rVIVEncoding}, ++ AVXORVI & obj.AMask: {enc: rVViEncoding}, ++ ++ // 31.11.6: Vector Single-Width Shift Instructions ++ AVSLLVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSLLVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSLLVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSRLVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSRLVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSRLVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSRAVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSRAVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSRAVI & obj.AMask: {enc: rVVuEncoding}, ++ ++ // 31.11.7: Vector Narrowing Integer Right Shift Instructions ++ AVNSRLWV & obj.AMask: {enc: rVVVEncoding}, ++ AVNSRLWX & obj.AMask: {enc: rVIVEncoding}, ++ AVNSRLWI & obj.AMask: {enc: rVVuEncoding}, ++ AVNSRAWV & obj.AMask: {enc: rVVVEncoding}, ++ AVNSRAWX & obj.AMask: {enc: rVIVEncoding}, ++ AVNSRAWI & obj.AMask: {enc: rVVuEncoding}, ++ ++ // 31.11.8: Vector Integer Compare Instructions ++ AVMSEQVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSEQVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSEQVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSNEVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSNEVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSNEVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSLTUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSLTUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSLTVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSLTVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSLEUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSLEUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSLEUVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSLEVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSLEVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSLEVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSGTUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSGTUVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSGTVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSGTVI & obj.AMask: {enc: rVViEncoding}, ++ ++ // 31.11.9: Vector Integer Min/Max Instructions ++ AVMINUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMINUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMINVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMINVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMAXUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMAXUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMAXVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMAXVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.10: Vector Single-Width Integer Multiply Instructions ++ AVMULVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMULVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMULHVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMULHVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMULHUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMULHUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMULHSUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMULHSUVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.11: Vector Integer Divide Instructions ++ AVDIVUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVDIVUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVDIVVV & obj.AMask: {enc: rVVVEncoding}, ++ AVDIVVX & obj.AMask: {enc: 
rVIVEncoding}, ++ AVREMUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVREMUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVREMVV & obj.AMask: {enc: rVVVEncoding}, ++ AVREMVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.12: Vector Widening Integer Multiply Instructions ++ AVWMULVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMULVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMULUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMULUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMULSUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMULSUVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.13: Vector Single-Width Integer Multiply-Add Instructions ++ AVMACCVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMACCVX & obj.AMask: {enc: rVIVEncoding}, ++ AVNMSACVV & obj.AMask: {enc: rVVVEncoding}, ++ AVNMSACVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVNMSUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVNMSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.14: Vector Widening Integer Multiply-Add Instructions ++ AVWMACCUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMACCUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMACCVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMACCVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMACCSUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMACCSUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMACCUSVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.15: Vector Integer Merge Instructions ++ AVMERGEVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVMERGEVXM & obj.AMask: {enc: rVIVEncoding}, ++ AVMERGEVIM & obj.AMask: {enc: rVViEncoding}, ++ ++ // 31.11.16: Vector Integer Move Instructions ++ AVMVVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMVVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMVVI & obj.AMask: {enc: rVViEncoding}, ++ + // + // Privileged ISA + // +@@ -3020,6 +3257,142 @@ func instructionsForProg(p *obj.Prog) []*instruction { + p.Ctxt.Diag("%v: too many operands for instruction", p) + } + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AVADDVV, AVADDVX, AVSUBVV, AVSUBVX, AVRSUBVX, AVWADDUVV, AVWADDUVX, AVWSUBUVV, AVWSUBUVX, ++ AVWADDVV, AVWADDVX, AVWSUBVV, AVWSUBVX, AVWADDUWV, AVWADDUWX, AVWSUBUWV, AVWSUBUWX, ++ AVWADDWV, AVWADDWX, AVWSUBWV, AVWSUBWX, AVANDVV, AVANDVX, AVORVV, AVORVX, AVXORVV, AVXORVX, ++ AVSLLVV, AVSLLVX, AVSRLVV, AVSRLVX, AVSRAVV, AVSRAVX, ++ AVMSEQVV, AVMSEQVX, AVMSNEVV, AVMSNEVX, AVMSLTUVV, AVMSLTUVX, AVMSLTVV, AVMSLTVX, ++ AVMSLEUVV, AVMSLEUVX, AVMSLEVV, AVMSLEVX, AVMSGTUVX, AVMSGTVX, ++ AVMINUVV, AVMINUVX, AVMINVV, AVMINVX, AVMAXUVV, AVMAXUVX, AVMAXVV, AVMAXVX, ++ AVMULVV, AVMULVX, AVMULHVV, AVMULHVX, AVMULHUVV, AVMULHUVX, AVMULHSUVV, AVMULHSUVX, ++ AVDIVUVV, AVDIVUVX, AVDIVVV, AVDIVVX, AVREMUVV, AVREMUVX, AVREMVV, AVREMVX, ++ AVWMULVV, AVWMULVX, AVWMULUVV, AVWMULUVX, AVWMULSUVV, AVWMULSUVX, ++ AVNSRLWV, AVNSRLWX, AVNSRAWV, AVNSRAWX, ++ AVMACCVV, AVMACCVX, AVNMSACVV, AVNMSACVX, AVMADDVV, AVMADDVX, AVNMSUBVV, AVNMSUBVX, ++ AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg), obj.REG_NONE ++ ++ case AVADDVI, AVRSUBVI, AVANDVI, AVORVI, AVXORVI, AVMSEQVI, AVMSNEVI, AVMSLEUVI, AVMSLEVI, AVMSGTUVI, AVMSGTVI, ++ AVSLLVI, AVSRLVI, AVSRAVI, AVNSRLWI, AVNSRAWI: ++ // Set mask bit ++ switch { ++ case ins.rs3 
== obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), obj.REG_NONE, uint32(p.Reg), obj.REG_NONE ++ ++ case AVZEXTVF2, AVSEXTVF2, AVZEXTVF4, AVSEXTVF4, AVZEXTVF8, AVSEXTVF8: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rs1 = obj.REG_NONE ++ ++ case AVMVVV, AVMVVX: ++ if ins.rs1 != obj.REG_NONE { ++ p.Ctxt.Diag("%v: too many operands for instruction", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), REG_V0 ++ ++ case AVMVVI: ++ if ins.rs1 != obj.REG_NONE { ++ p.Ctxt.Diag("%v: too many operands for instruction", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), obj.REG_NONE, REG_V0 ++ ++ case AVADCVVM, AVADCVXM, AVMADCVVM, AVMADCVXM, AVSBCVVM, AVSBCVXM, AVMSBCVVM, AVMSBCVXM, AVADCVIM, AVMADCVIM, ++ AVMERGEVVM, AVMERGEVXM, AVMERGEVIM: ++ if ins.rs3 != REG_V0 { ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg), obj.REG_NONE ++ ++ case AVMADCVV, AVMADCVX, AVMSBCVV, AVMSBCVX, AVMADCVI: ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg) ++ ++ case AVNEGV, AVWCVTXXV, AVWCVTUXXV, AVNCVTXXW: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ switch ins.as { ++ case AVNEGV: ++ ins.as = AVRSUBVX ++ case AVWCVTXXV: ++ ins.as = AVWADDVX ++ case AVWCVTUXXV: ++ ins.as = AVWADDUVX ++ case AVNCVTXXW: ++ ins.as = AVNSRLWX ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), REG_X0, uint32(p.From.Reg) ++ ++ case AVNOTV: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.as = AVXORVI ++ ins.rd, ins.rs1, ins.rs2, ins.imm = uint32(p.To.Reg), obj.REG_NONE, uint32(p.From.Reg), -1 ++ ++ case AVMSGTVV, AVMSGTUVV, AVMSGEVV, AVMSGEUVV: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ switch ins.as { ++ case AVMSGTVV: ++ ins.as = AVMSLTVV ++ case AVMSGTUVV: ++ ins.as = AVMSLTUVV ++ case AVMSGEVV: ++ ins.as = AVMSLEVV ++ case AVMSGEUVV: ++ ins.as = AVMSLEUVV ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AVMSLTVI, AVMSLTUVI, AVMSGEVI, AVMSGEUVI: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ switch ins.as { ++ case AVMSLTVI: ++ ins.as = AVMSLEVI ++ case AVMSLTUVI: ++ ins.as = AVMSLEUVI ++ case AVMSGEVI: ++ ins.as = AVMSGTVI ++ case AVMSGEUVI: ++ ins.as = AVMSGTUVI ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3, ins.imm = uint32(p.To.Reg), obj.REG_NONE, uint32(p.Reg), obj.REG_NONE, ins.imm-1 + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch b/2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch new file mode 100644 index 0000000..1ffcdeb --- /dev/null +++ 
b/2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch @@ -0,0 +1,266 @@ +From 7be6b0fd3f1ef4891d22549b127a7a087fbbd45c Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 090/119] cmd/internal/obj/riscv: add support for vector + fixed-point arithmetic instructions + +Add support for vector fixed-point arithmetic instructions to the +RISC-V assembler. This includes single width saturating addition +and subtraction, averaging addition and subtraction and scaling +shift instructions. + +Change-Id: I9aa27e9565ad016ba5bb2b479e1ba70db24e4ff5 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646776 +Reviewed-by: Mark Ryan +Reviewed-by: Carlos Amedee +Reviewed-by: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 74 +++++++++++++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 32 ++++++++ + .../internal/asm/testdata/riscv64validation.s | 32 ++++++++ + src/cmd/internal/obj/riscv/obj.go | 51 ++++++++++++- + 4 files changed, 186 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 852104375b..506fe2a442 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -863,6 +863,80 @@ start: + VMVVX X10, V3 // d741055e + VMVVI $15, V3 // d7b1075e + ++ // 31.12.1: Vector Single-Width Saturating Add and Subtract ++ VSADDUVV V1, V2, V3 // d7812082 ++ VSADDUVV V1, V2, V0, V3 // d7812080 ++ VSADDUVX X10, V2, V3 // d7412582 ++ VSADDUVX X10, V2, V0, V3 // d7412580 ++ VSADDUVI $15, V2, V3 // d7b12782 ++ VSADDUVI $15, V2, V0, V3 // d7b12780 ++ VSADDVV V1, V2, V3 // d7812086 ++ VSADDVV V1, V2, V0, V3 // d7812084 ++ VSADDVX X10, V2, V3 // d7412586 ++ VSADDVX X10, V2, V0, V3 // d7412584 ++ VSADDVI $15, V2, V3 // d7b12786 ++ VSADDVI $15, V2, V0, V3 // d7b12784 ++ VSSUBUVV V1, V2, V3 // d781208a ++ VSSUBUVV V1, V2, V0, V3 // d7812088 ++ VSSUBUVX X10, V2, V3 // d741258a ++ VSSUBUVX X10, V2, V0, V3 // d7412588 ++ VSSUBVV V1, V2, V3 // d781208e ++ VSSUBVV V1, V2, V0, V3 // d781208c ++ VSSUBVX X10, V2, V3 // d741258e ++ VSSUBVX X10, V2, V0, V3 // d741258c ++ ++ // 31.12.2: Vector Single-Width Averaging Add and Subtract ++ VAADDUVV V1, V2, V3 // d7a12022 ++ VAADDUVV V1, V2, V0, V3 // d7a12020 ++ VAADDUVX X10, V2, V3 // d7612522 ++ VAADDUVX X10, V2, V0, V3 // d7612520 ++ VAADDVV V1, V2, V3 // d7a12026 ++ VAADDVV V1, V2, V0, V3 // d7a12024 ++ VAADDVX X10, V2, V3 // d7612526 ++ VAADDVX X10, V2, V0, V3 // d7612524 ++ VASUBUVV V1, V2, V3 // d7a1202a ++ VASUBUVV V1, V2, V0, V3 // d7a12028 ++ VASUBUVX X10, V2, V3 // d761252a ++ VASUBUVX X10, V2, V0, V3 // d7612528 ++ VASUBVV V1, V2, V3 // d7a1202e ++ VASUBVV V1, V2, V0, V3 // d7a1202c ++ VASUBVX X10, V2, V3 // d761252e ++ VASUBVX X10, V2, V0, V3 // d761252c ++ ++ // 31.12.3: Vector Single-Width Fractional Multiply with Rounding and Saturation ++ VSMULVV V1, V2, V3 // d781209e ++ VSMULVV V1, V2, V0, V3 // d781209c ++ VSMULVX X10, V2, V3 // d741259e ++ VSMULVX X10, V2, V0, V3 // d741259c ++ ++ // 31.12.4: Vector Single-Width Scaling Shift Instructions ++ VSSRLVV V1, V2, V3 // d78120aa ++ VSSRLVV V1, V2, V0, V3 // d78120a8 ++ VSSRLVX X10, V2, V3 // d74125aa ++ VSSRLVX X10, V2, V0, V3 // d74125a8 ++ VSSRLVI $15, V2, V3 // d7b127aa ++ VSSRLVI $15, V2, V0, V3 // d7b127a8 ++ VSSRAVV V1, V2, V3 // d78120ae ++ VSSRAVV V1, V2, V0, V3 // d78120ac ++ VSSRAVX X10, V2, V3 // d74125ae ++ VSSRAVX X10, V2, V0, V3 // d74125ac ++ VSSRAVI 
$16, V2, V3 // d73128ae ++ VSSRAVI $16, V2, V0, V3 // d73128ac ++ ++ // 31.12.5: Vector Narrowing Fixed-Point Clip Instructions ++ VNCLIPUWV V1, V2, V3 // d78120ba ++ VNCLIPUWV V1, V2, V0, V3 // d78120b8 ++ VNCLIPUWX X10, V2, V3 // d74125ba ++ VNCLIPUWX X10, V2, V0, V3 // d74125b8 ++ VNCLIPUWI $16, V2, V3 // d73128ba ++ VNCLIPUWI $16, V2, V0, V3 // d73128b8 ++ VNCLIPWV V1, V2, V3 // d78120be ++ VNCLIPWV V1, V2, V0, V3 // d78120bc ++ VNCLIPWX X10, V2, V3 // d74125be ++ VNCLIPWX X10, V2, V0, V3 // d74125bc ++ VNCLIPWI $16, V2, V3 // d73128be ++ VNCLIPWI $16, V2, V0, V3 // d73128bc ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 025d63a15c..6a7c9b9444 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -221,5 +221,37 @@ TEXT errors(SB),$0 + VMVVV V1, V2, V3 // ERROR "too many operands for instruction" + VMVVX X10, V2, V3 // ERROR "too many operands for instruction" + VMVVI $15, V2, V3 // ERROR "too many operands for instruction" ++ VSADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWI $16, V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 602cab2c2e..c6f71e64fb 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ 
b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -237,5 +237,37 @@ TEXT validation(SB),$0 + VMVVX V1, V2 // ERROR "expected integer register in rs1 position" + VMVVI $16, V2 // ERROR "signed immediate 16 must be in range [-16, 15]" + VMVVI $-17, V2 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VSADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSADDUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VSADDUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VSSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VAADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VAADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VAADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VAADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VASUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VASUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VASUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VASUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRLVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VSSRLVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VSSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRAVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VSSRAVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VNCLIPUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNCLIPUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNCLIPUWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VNCLIPUWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VNCLIPWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNCLIPWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNCLIPWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VNCLIPWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" + + RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index d85bdd302c..e7870000cf 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2286,6 +2286,48 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVMVVX & obj.AMask: {enc: rVIVEncoding}, + AVMVVI & obj.AMask: {enc: rVViEncoding}, + ++ // 31.12.1: Vector Single-Width Saturating Add and Subtract ++ AVSADDUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSADDUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSADDUVI & obj.AMask: {enc: rVViEncoding}, ++ AVSADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSADDVI & obj.AMask: {enc: rVViEncoding}, ++ AVSSUBUVV & obj.AMask: {enc: rVVVEncoding}, 
++ AVSSUBUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSSUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.12.2: Vector Single-Width Averaging Add and Subtract ++ AVAADDUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVAADDUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVAADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVAADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVASUBUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVASUBUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVASUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVASUBVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.12.3: Vector Single-Width Fractional Multiply with Rounding and Saturation ++ AVSMULVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSMULVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.12.4: Vector Single-Width Scaling Shift Instructions ++ AVSSRLVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSSRLVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSSRLVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSSRAVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSSRAVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSSRAVI & obj.AMask: {enc: rVVuEncoding}, ++ ++ // 31.12.5: Vector Narrowing Fixed-Point Clip Instructions ++ AVNCLIPUWV & obj.AMask: {enc: rVVVEncoding}, ++ AVNCLIPUWX & obj.AMask: {enc: rVIVEncoding}, ++ AVNCLIPUWI & obj.AMask: {enc: rVVuEncoding}, ++ AVNCLIPWV & obj.AMask: {enc: rVVVEncoding}, ++ AVNCLIPWX & obj.AMask: {enc: rVIVEncoding}, ++ AVNCLIPWI & obj.AMask: {enc: rVVuEncoding}, ++ + // + // Privileged ISA + // +@@ -3267,10 +3309,13 @@ func instructionsForProg(p *obj.Prog) []*instruction { + AVMINUVV, AVMINUVX, AVMINVV, AVMINVX, AVMAXUVV, AVMAXUVX, AVMAXVV, AVMAXVX, + AVMULVV, AVMULVX, AVMULHVV, AVMULHVX, AVMULHUVV, AVMULHUVX, AVMULHSUVV, AVMULHSUVX, + AVDIVUVV, AVDIVUVX, AVDIVVV, AVDIVVX, AVREMUVV, AVREMUVX, AVREMVV, AVREMVX, +- AVWMULVV, AVWMULVX, AVWMULUVV, AVWMULUVX, AVWMULSUVV, AVWMULSUVX, +- AVNSRLWV, AVNSRLWX, AVNSRAWV, AVNSRAWX, ++ AVWMULVV, AVWMULVX, AVWMULUVV, AVWMULUVX, AVWMULSUVV, AVWMULSUVX, AVNSRLWV, AVNSRLWX, AVNSRAWV, AVNSRAWX, + AVMACCVV, AVMACCVX, AVNMSACVV, AVNMSACVX, AVMADDVV, AVMADDVX, AVNMSUBVV, AVNMSUBVX, +- AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX: ++ AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX, ++ AVSADDUVV, AVSADDUVX, AVSADDUVI, AVSADDVV, AVSADDVX, AVSADDVI, AVSSUBUVV, AVSSUBUVX, AVSSUBVV, AVSSUBVX, ++ AVAADDUVV, AVAADDUVX, AVAADDVV, AVAADDVX, AVASUBUVV, AVASUBUVX, AVASUBVV, AVASUBVX, ++ AVSMULVV, AVSMULVX, AVSSRLVV, AVSSRLVX, AVSSRLVI, AVSSRAVV, AVSSRAVX, AVSSRAVI, ++ AVNCLIPUWV, AVNCLIPUWX, AVNCLIPUWI, AVNCLIPWV, AVNCLIPWX, AVNCLIPWI: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +-- +2.39.5 + diff --git a/2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch b/2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch new file mode 100644 index 0000000..0d5cc8e --- /dev/null +++ b/2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch @@ -0,0 +1,66 @@ +From cdba1be01401e1cbb07ea3f4a94b860c0917b7d0 Mon Sep 17 00:00:00 2001 +From: Julian Zhu +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 091/119] crypto/sha512: remove unnecessary move op, replace + with direct add +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: riscv64 +pkg: crypto/sha512 + │ o │ n │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-4 3.499µ ± 0% 3.444µ ± 0% -1.56% (p=0.000 n=10) +Hash8Bytes/Sum384-4 4.012µ ± 0% 3.957µ ± 0% -1.37% (p=0.000 n=10) 
+Hash8Bytes/Sum512-4 4.218µ ± 0% 4.162µ ± 0% -1.32% (p=0.000 n=10) +Hash1K/New-4 17.07µ ± 0% 16.57µ ± 0% -2.97% (p=0.000 n=10) +Hash1K/Sum384-4 17.59µ ± 0% 17.11µ ± 0% -2.76% (p=0.000 n=10) +Hash1K/Sum512-4 17.78µ ± 0% 17.30µ ± 0% -2.72% (p=0.000 n=10) +Hash8K/New-4 112.2µ ± 0% 108.7µ ± 0% -3.08% (p=0.000 n=10) +Hash8K/Sum384-4 112.7µ ± 0% 109.2µ ± 0% -3.09% (p=0.000 n=10) +Hash8K/Sum512-4 112.9µ ± 0% 109.4µ ± 0% -3.07% (p=0.000 n=10) +geomean 19.72µ 19.24µ -2.44% + + │ o │ n │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 2.184Mi ± 0% 2.213Mi ± 0% +1.31% (p=0.000 n=10) +Hash8Bytes/Sum384-4 1.898Mi ± 1% 1.926Mi ± 0% +1.51% (p=0.000 n=10) +Hash8Bytes/Sum512-4 1.812Mi ± 1% 1.831Mi ± 0% +1.05% (p=0.000 n=10) +Hash1K/New-4 57.20Mi ± 0% 58.95Mi ± 0% +3.06% (p=0.000 n=10) +Hash1K/Sum384-4 55.51Mi ± 0% 57.09Mi ± 0% +2.84% (p=0.000 n=10) +Hash1K/Sum512-4 54.91Mi ± 0% 56.44Mi ± 0% +2.79% (p=0.000 n=10) +Hash8K/New-4 69.63Mi ± 0% 71.84Mi ± 0% +3.17% (p=0.000 n=10) +Hash8K/Sum384-4 69.30Mi ± 0% 71.52Mi ± 0% +3.20% (p=0.000 n=10) +Hash8K/Sum512-4 69.19Mi ± 0% 71.39Mi ± 0% +3.18% (p=0.000 n=10) +geomean 19.65Mi 20.13Mi +2.45% + +Change-Id: Ib68b934276ec08246d4ae60ef9870c233f0eac69 +Reviewed-on: https://go-review.googlesource.com/c/go/+/665595 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Joel Sing +Reviewed-by: Roland Shoemaker +--- + src/crypto/sha512/sha512block_riscv64.s | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/src/crypto/sha512/sha512block_riscv64.s b/src/crypto/sha512/sha512block_riscv64.s +index 0281464e4d..2bc5d889ea 100644 +--- a/src/crypto/sha512/sha512block_riscv64.s ++++ b/src/crypto/sha512/sha512block_riscv64.s +@@ -136,9 +136,8 @@ + #define SHA512ROUND(index, a, b, c, d, e, f, g, h) \ + SHA512T1(index, e, f, g, h); \ + SHA512T2(a, b, c); \ +- MOV X6, h; \ + ADD X5, d; \ +- ADD X5, h ++ ADD X6, X5, h + + #define SHA512ROUND0(index, a, b, c, d, e, f, g, h) \ + MSGSCHEDULE0(index); \ +-- +2.39.5 + diff --git a/2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch b/2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch new file mode 100644 index 0000000..2d269eb --- /dev/null +++ b/2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch @@ -0,0 +1,120 @@ +From a70b3bc13e145ac78e32c1104cce9a511200f286 Mon Sep 17 00:00:00 2001 +From: Julian Zhu +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 092/119] crypto/sha256: improve performance of riscv64 + assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Simplified the implementation of Ch and Maj by reducing instructions, based on CL 605495 which made the same change for SHA-512. 
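
The rewrite rests on two bit-level identities: Ch(x, y, z) = (x AND y) XOR (NOT x AND z) = ((y XOR z) AND x) XOR z, and Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) = ((y XOR z) AND x) XOR (y AND z); each form drops the NOT and one AND per round. The short Go sketch below is not part of the patch and assumes nothing beyond the standard library; it only sanity-checks those identities against the reference definitions used in the assembly comments.

// sketch: verify the Ch/Maj identities relied on by the riscv64 SHA-256/SHA-512 assembly.
package main

import (
	"fmt"
	"math/rand"
)

func main() {
	for i := 0; i < 1000000; i++ {
		x, y, z := rand.Uint32(), rand.Uint32(), rand.Uint32()
		// Reference definitions (FIPS 180-4).
		ch := (x & y) ^ (^x & z)
		maj := (x & y) ^ (x & z) ^ (y & z)
		// Rewritten forms: shared (y XOR z) term, no NOT.
		ch2 := ((y ^ z) & x) ^ z
		maj2 := ((y ^ z) & x) ^ (y & z)
		if ch != ch2 || maj != maj2 {
			panic(fmt.Sprintf("identity mismatch for x=%#x y=%#x z=%#x", x, y, z))
		}
	}
	fmt.Println("Ch/Maj identities hold")
}

Independently of the identities, the diff further down (as in the preceding SHA-512 patch) also removes the MOV in the round macro by forming T1+T2 directly into h with a three-operand ADD.
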
+ +goos: linux +goarch: riscv64 +pkg: crypto/sha256 +cpu: Spacemit(R) X60 + │ oldsha256 │ newsha256 │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-8 2.303µ ± 0% 2.098µ ± 0% -8.90% (p=0.000 n=10) +Hash8Bytes/Sum224-8 2.535µ ± 0% 2.329µ ± 0% -8.13% (p=0.000 n=10) +Hash8Bytes/Sum256-8 2.558µ ± 0% 2.352µ ± 0% -8.04% (p=0.000 n=10) +Hash1K/New-8 28.67µ ± 0% 25.21µ ± 0% -12.06% (p=0.000 n=10) +Hash1K/Sum224-8 28.89µ ± 0% 25.43µ ± 0% -11.99% (p=0.000 n=10) +Hash1K/Sum256-8 28.91µ ± 0% 25.43µ ± 0% -12.04% (p=0.000 n=10) +Hash8K/New-8 218.0µ ± 1% 192.7µ ± 2% -11.58% (p=0.000 n=10) +Hash8K/Sum224-8 218.0µ ± 1% 193.6µ ± 1% -11.20% (p=0.000 n=10) +Hash8K/Sum256-8 219.1µ ± 1% 193.4µ ± 1% -11.74% (p=0.000 n=10) +geomean 24.93µ 22.28µ -10.65% + + │ oldsha256 │ newsha256 │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-8 3.309Mi ± 0% 3.633Mi ± 0% +9.80% (p=0.000 n=10) +Hash8Bytes/Sum224-8 3.009Mi ± 0% 3.271Mi ± 0% +8.72% (p=0.000 n=10) +Hash8Bytes/Sum256-8 2.985Mi ± 0% 3.242Mi ± 0% +8.63% (p=0.000 n=10) +Hash1K/New-8 34.06Mi ± 0% 38.73Mi ± 0% +13.72% (p=0.000 n=10) +Hash1K/Sum224-8 33.80Mi ± 0% 38.40Mi ± 0% +13.63% (p=0.000 n=10) +Hash1K/Sum256-8 33.78Mi ± 0% 38.40Mi ± 0% +13.69% (p=0.000 n=10) +Hash8K/New-8 35.84Mi ± 1% 40.54Mi ± 2% +13.10% (p=0.000 n=10) +Hash8K/Sum224-8 35.83Mi ± 1% 40.35Mi ± 1% +12.61% (p=0.000 n=10) +Hash8K/Sum256-8 35.66Mi ± 1% 40.40Mi ± 1% +13.29% (p=0.000 n=10) +geomean 15.54Mi 17.39Mi +11.89% + +Change-Id: I9aa692fcfd70634dc6c308db9b5d06bd82ac2302 +Reviewed-on: https://go-review.googlesource.com/c/go/+/639495 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: Junyang Shao +Reviewed-by: David Chase +Reviewed-by: Meng Zhuo +--- + src/crypto/sha256/sha256block_riscv64.s | 27 ++++++++++++------------- + 1 file changed, 13 insertions(+), 14 deletions(-) + +diff --git a/src/crypto/sha256/sha256block_riscv64.s b/src/crypto/sha256/sha256block_riscv64.s +index fc7bf65e41..2cf7454ba4 100644 +--- a/src/crypto/sha256/sha256block_riscv64.s ++++ b/src/crypto/sha256/sha256block_riscv64.s +@@ -86,47 +86,46 @@ + // T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt + // BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x) + // Ch(x, y, z) = (x AND y) XOR (NOT x AND z) ++// = ((y XOR z) AND x) XOR z + #define SHA256T1(index, e, f, g, h) \ + MOVWU (index*4)(X18), X8; \ + ADD X5, h; \ + RORW $6, e, X6; \ + ADD X8, h; \ + RORW $11, e, X7; \ +- XOR X7, X6; \ + RORW $25, e, X8; \ ++ XOR X7, X6; \ ++ XOR f, g, X5; \ + XOR X8, X6; \ ++ AND e, X5; \ + ADD X6, h; \ +- AND e, f, X5; \ +- NOT e, X7; \ +- AND g, X7; \ +- XOR X7, X5; \ ++ XOR g, X5; \ + ADD h, X5 + + // Calculate T2 in X6. + // T2 = BIGSIGMA0(a) + Maj(a, b, c) + // BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x) + // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) ++// = ((y XOR z) AND x) XOR (y AND z) + #define SHA256T2(a, b, c) \ + RORW $2, a, X6; \ + RORW $13, a, X7; \ +- XOR X7, X6; \ + RORW $22, a, X8; \ ++ XOR X7, X6; \ ++ XOR b, c, X9; \ ++ AND b, c, X7; \ ++ AND a, X9; \ + XOR X8, X6; \ +- AND a, b, X7; \ +- AND a, c, X8; \ +- XOR X8, X7; \ +- AND b, c, X9; \ +- XOR X9, X7; \ +- ADD X7, X6 ++ XOR X7, X9; \ ++ ADD X9, X6 + + // Calculate T1 and T2, then e = d + T1 and a = T1 + T2. + // The values for e and a are stored in d and h, ready for rotation. 
+ #define SHA256ROUND(index, a, b, c, d, e, f, g, h) \ + SHA256T1(index, e, f, g, h); \ + SHA256T2(a, b, c); \ +- MOV X6, h; \ + ADD X5, d; \ +- ADD X5, h ++ ADD X6, X5, h + + #define SHA256ROUND0(index, a, b, c, d, e, f, g, h) \ + MSGSCHEDULE0(index); \ +-- +2.39.5 + diff --git a/2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch b/2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch new file mode 100644 index 0000000..196a397 --- /dev/null +++ b/2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch @@ -0,0 +1,81 @@ +From a21957e9bed511d3e108c993e88bfe14c3efc66b Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 093/119] cmd/link: fix cgo on riscv64 when building with + gcc-15 + +It's not currently possible to build cgo programs that are partially +compiled with gcc-15 on riscv64 using the internal linker. There are +two reasons for this. + +1. When gcc-15 compiles _cgo_export.c, which contains no actual code, + for a riscv64 target, it emits a label in the .text section called + .Letext0. This label is referred to by another section, .debug_line, + and an entry is generated in the symbol table for it. The Go linker + panics when processing the .Letext0 symbol in _cgo_export.o, as it + occurs in an empty section. +2. GCC-15 is generating additional debug symbols with the .LVUS + prefix, e.g., .LVUS33, that need to be ignored. + +We fix the issue by removing the check in +cmd/link/internal/loader/loader.go that panics if we encounter a +symbol in an empty section (the comments preceding this check suggest +it's safe to remove it) and by adding .LVUS to the list of symbol +prefixes to ignore. + +Fixes #72840 + +Change-Id: I00658b6bdd01606dde1581b5bc2f42edfc37de82 +Reviewed-on: https://go-review.googlesource.com/c/go/+/668276 +Auto-Submit: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Joel Sing +Reviewed-by: Carlos Amedee +Reviewed-by: Meng Zhuo +--- + src/cmd/link/internal/loadelf/ldelf.go | 8 +++++++- + src/cmd/link/internal/loader/loader.go | 8 -------- + 2 files changed, 7 insertions(+), 9 deletions(-) + +diff --git a/src/cmd/link/internal/loadelf/ldelf.go b/src/cmd/link/internal/loadelf/ldelf.go +index 942d54c06c..dea77bafcb 100644 +--- a/src/cmd/link/internal/loadelf/ldelf.go ++++ b/src/cmd/link/internal/loadelf/ldelf.go +@@ -604,7 +604,13 @@ func Load(l *loader.Loader, arch *sys.Arch, localSymVersion int, f *bio.Reader, + if strings.HasPrefix(elfsym.name, ".LASF") { // gcc on s390x does this + continue + } +- return errorf("%v: sym#%d (%s): ignoring symbol in section %d (type %d)", elfsym.sym, i, elfsym.name, elfsym.shndx, elfsym.type_) ++ ++ if strings.HasPrefix(elfsym.name, ".LASF") || strings.HasPrefix(elfsym.name, ".LLRL") || strings.HasPrefix(elfsym.name, ".LLST") || strings.HasPrefix(elfsym.name, ".LVUS") { ++ // gcc on s390x and riscv64 does this. 
++ continue ++ } ++ ++ return errorf("%v: sym#%d (%q): ignoring symbol in section %d (%q) (type %d)", elfsym.sym, i, elfsym.name, elfsym.shndx, sect.name, elfsym.type_) + } + + s := elfsym.sym +diff --git a/src/cmd/link/internal/loader/loader.go b/src/cmd/link/internal/loader/loader.go +index 4d0b497d8e..223ce5dd59 100644 +--- a/src/cmd/link/internal/loader/loader.go ++++ b/src/cmd/link/internal/loader/loader.go +@@ -1724,14 +1724,6 @@ func (l *Loader) GetVarDwarfAuxSym(i Sym) Sym { + // expected to have the actual content/payload) and then a set of + // interior loader.Sym's that point into a portion of the container. + func (l *Loader) AddInteriorSym(container Sym, interior Sym) { +- // Container symbols are expected to have content/data. +- // NB: this restriction may turn out to be too strict (it's possible +- // to imagine a zero-sized container with an interior symbol pointing +- // into it); it's ok to relax or remove it if we counter an +- // oddball host object that triggers this. +- if l.SymSize(container) == 0 && len(l.Data(container)) == 0 { +- panic("unexpected empty container symbol") +- } + // The interior symbols for a container are not expected to have + // content/data or relocations. + if len(l.Data(interior)) != 0 { +-- +2.39.5 + diff --git a/2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch b/2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch new file mode 100644 index 0000000..5e55e2f --- /dev/null +++ b/2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch @@ -0,0 +1,63 @@ +From 95ad58dbd546edbcc4109fb6603254c4bf60dcb1 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 094/119] internal/bytealg: deduplicate code between + Count/CountString for riscv64 + +Change-Id: I22eb4e7444e5fe5f6767cc960895f3c6e2fa13cc +Reviewed-on: https://go-review.googlesource.com/c/go/+/661615 +Reviewed-by: Keith Randall +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +Auto-Submit: Carlos Amedee +Reviewed-by: Carlos Amedee +--- + src/internal/bytealg/count_riscv64.s | 28 +++++++--------------------- + 1 file changed, 7 insertions(+), 21 deletions(-) + +diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s +index 3f255cd263..6cc49d1388 100644 +--- a/src/internal/bytealg/count_riscv64.s ++++ b/src/internal/bytealg/count_riscv64.s +@@ -5,6 +5,13 @@ + #include "go_asm.h" + #include "textflag.h" + ++TEXT ·CountString(SB),NOSPLIT,$0-32 ++ // X10 = s_base ++ // X11 = s_len ++ // X12 = byte to count ++ MOV X12, X13 ++ JMP ·Count(SB) ++ + TEXT ·Count(SB),NOSPLIT,$0-40 + // X10 = b_base + // X11 = b_len +@@ -26,24 +33,3 @@ loop: + done: + MOV X14, X10 + RET +- +-TEXT ·CountString(SB),NOSPLIT,$0-32 +- // X10 = s_base +- // X11 = s_len +- // X12 = byte to count +- AND $0xff, X12 +- MOV ZERO, X14 // count +- ADD X10, X11 // end +- +- PCALIGN $16 +-loop: +- BEQ X10, X11, done +- MOVBU (X10), X15 +- ADD $1, X10 +- BNE X12, X15, loop +- ADD $1, X14 +- JMP loop +- +-done: +- MOV X14, X10 +- RET +-- +2.39.5 + diff --git a/2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch b/2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch new file mode 100644 index 0000000..6931395 --- /dev/null +++ b/2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch @@ -0,0 +1,1735 @@ +From 6d2bdcd0a446828a3209372bfea6ccb948c40bcc Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 095/119] cmd/internal/obj/riscv: add support for 
vector + floating-point instructions + +Add support for vector floating-point instructions to the RISC-V +assembler. This includes single-width and widening addition and +subtraction, multiplication and division, fused multiply-addition, +comparison, min/max, sign-injection, classification and type +conversion instructions. + +Change-Id: I8bceb1c5d7eead0561ba5407ace00805a6144f51 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646777 +Reviewed-by: Carlos Amedee +Reviewed-by: Junyang Shao +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 224 +++++++ + .../asm/internal/asm/testdata/riscv64error.s | 500 +++++++++------ + .../internal/asm/testdata/riscv64validation.s | 603 ++++++++++-------- + src/cmd/internal/obj/riscv/anames.go | 4 + + src/cmd/internal/obj/riscv/cpu.go | 4 + + src/cmd/internal/obj/riscv/obj.go | 213 ++++++- + 6 files changed, 1076 insertions(+), 472 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 506fe2a442..7e2a070bd0 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -937,6 +937,230 @@ start: + VNCLIPWI $16, V2, V3 // d73128be + VNCLIPWI $16, V2, V0, V3 // d73128bc + ++ // 31.13.2: Vector Single-Width Floating-Point Add/Subtract Instructions ++ VFADDVV V1, V2, V3 // d7912002 ++ VFADDVV V1, V2, V0, V3 // d7912000 ++ VFADDVF F10, V2, V3 // d7512502 ++ VFADDVF F10, V2, V0, V3 // d7512500 ++ VFSUBVV V1, V2, V3 // d791200a ++ VFSUBVV V1, V2, V0, V3 // d7912008 ++ VFSUBVF F10, V2, V3 // d751250a ++ VFSUBVF F10, V2, V0, V3 // d7512508 ++ VFRSUBVF F10, V2, V3 // d751259e ++ VFRSUBVF F10, V2, V0, V3 // d751259c ++ ++ // 31.13.3: Vector Widening Floating-Point Add/Subtract Instructions ++ VFWADDVV V1, V2, V3 // d79120c2 ++ VFWADDVV V1, V2, V0, V3 // d79120c0 ++ VFWADDVF F10, V2, V3 // d75125c2 ++ VFWADDVF F10, V2, V0, V3 // d75125c0 ++ VFWSUBVV V1, V2, V3 // d79120ca ++ VFWSUBVV V1, V2, V0, V3 // d79120c8 ++ VFWSUBVF F10, V2, V3 // d75125ca ++ VFWSUBVF F10, V2, V0, V3 // d75125c8 ++ VFWADDWV V1, V2, V3 // d79120d2 ++ VFWADDWV V1, V2, V0, V3 // d79120d0 ++ VFWADDWF F10, V2, V3 // d75125d2 ++ VFWADDWF F10, V2, V0, V3 // d75125d0 ++ VFWSUBWV V1, V2, V3 // d79120da ++ VFWSUBWV V1, V2, V0, V3 // d79120d8 ++ VFWSUBWF F10, V2, V3 // d75125da ++ VFWSUBWF F10, V2, V0, V3 // d75125d8 ++ ++ // 31.13.4: Vector Single-Width Floating-Point Multiply/Divide Instructions ++ VFMULVV V1, V2, V3 // d7912092 ++ VFMULVV V1, V2, V0, V3 // d7912090 ++ VFMULVF F10, V2, V3 // d7512592 ++ VFMULVF F10, V2, V0, V3 // d7512590 ++ VFDIVVV V1, V2, V3 // d7912082 ++ VFDIVVV V1, V2, V0, V3 // d7912080 ++ VFDIVVF F10, V2, V3 // d7512582 ++ VFDIVVF F10, V2, V0, V3 // d7512580 ++ VFRDIVVF F10, V2, V3 // d7512586 ++ VFRDIVVF F10, V2, V0, V3 // d7512584 ++ ++ // 31.13.5: Vector Widening Floating-Point Multiply ++ VFWMULVV V1, V2, V3 // d79120e2 ++ VFWMULVV V1, V2, V0, V3 // d79120e0 ++ VFWMULVF F10, V2, V3 // d75125e2 ++ VFWMULVF F10, V2, V0, V3 // d75125e0 ++ ++ // 31.13.6: Vector Single-Width Floating-Point Fused Multiply-Add Instructions ++ VFMACCVV V2, V1, V3 // d79120b2 ++ VFMACCVV V2, V1, V0, V3 // d79120b0 ++ VFMACCVF V2, F10, V3 // d75125b2 ++ VFMACCVF V2, F10, V0, V3 // d75125b0 ++ VFNMACCVV V2, V1, V3 // d79120b6 ++ VFNMACCVV V2, V1, V0, V3 // d79120b4 ++ VFNMACCVF V2, F10, V3 // d75125b6 ++ VFNMACCVF V2, F10, V0, V3 // d75125b4 ++ VFMSACVV V2, V1, V3 // d79120ba ++ VFMSACVV V2, V1, V0, 
V3 // d79120b8 ++ VFMSACVF V2, F10, V3 // d75125ba ++ VFMSACVF V2, F10, V0, V3 // d75125b8 ++ VFNMSACVV V2, V1, V3 // d79120be ++ VFNMSACVV V2, V1, V0, V3 // d79120bc ++ VFNMSACVF V2, F10, V3 // d75125be ++ VFNMSACVF V2, F10, V0, V3 // d75125bc ++ VFMADDVV V2, V1, V3 // d79120a2 ++ VFMADDVV V2, V1, V0, V3 // d79120a0 ++ VFMADDVF V2, F10, V3 // d75125a2 ++ VFMADDVF V2, F10, V0, V3 // d75125a0 ++ VFNMADDVV V2, V1, V3 // d79120a6 ++ VFNMADDVV V2, V1, V0, V3 // d79120a4 ++ VFNMADDVF V2, F10, V3 // d75125a6 ++ VFNMADDVF V2, F10, V0, V3 // d75125a4 ++ VFMSUBVV V2, V1, V3 // d79120aa ++ VFMSUBVV V2, V1, V0, V3 // d79120a8 ++ VFMSUBVF V2, F10, V3 // d75125aa ++ VFMSUBVF V2, F10, V0, V3 // d75125a8 ++ VFNMSUBVV V2, V1, V3 // d79120ae ++ VFNMSUBVV V2, V1, V0, V3 // d79120ac ++ VFNMSUBVF V2, F10, V3 // d75125ae ++ VFNMSUBVF V2, F10, V0, V3 // d75125ac ++ ++ // 31.13.7: Vector Widening Floating-Point Fused Multiply-Add Instructions ++ VFWMACCVV V2, V1, V3 // d79120f2 ++ VFWMACCVV V2, V1, V0, V3 // d79120f0 ++ VFWMACCVF V2, F10, V3 // d75125f2 ++ VFWMACCVF V2, F10, V0, V3 // d75125f0 ++ VFWNMACCVV V2, V1, V3 // d79120f6 ++ VFWNMACCVV V2, V1, V0, V3 // d79120f4 ++ VFWNMACCVF V2, F10, V3 // d75125f6 ++ VFWNMACCVF V2, F10, V0, V3 // d75125f4 ++ VFWMSACVV V2, V1, V3 // d79120fa ++ VFWMSACVV V2, V1, V0, V3 // d79120f8 ++ VFWMSACVF V2, F10, V3 // d75125fa ++ VFWMSACVF V2, F10, V0, V3 // d75125f8 ++ VFWNMSACVV V2, V1, V3 // d79120fe ++ VFWNMSACVV V2, V1, V0, V3 // d79120fc ++ VFWNMSACVF V2, F10, V3 // d75125fe ++ VFWNMSACVF V2, F10, V0, V3 // d75125fc ++ ++ // 31.13.8: Vector Floating-Point Square-Root Instruction ++ VFSQRTV V2, V3 // d711204e ++ VFSQRTV V2, V0, V3 // d711204c ++ ++ // 31.13.9: Vector Floating-Point Reciprocal Square-Root Estimate Instruction ++ VFRSQRT7V V2, V3 // d711224e ++ VFRSQRT7V V2, V0, V3 // d711224c ++ ++ // 31.13.10: Vector Floating-Point Reciprocal Estimate Instruction ++ VFREC7V V2, V3 // d791224e ++ VFREC7V V2, V0, V3 // d791224c ++ ++ // 31.13.11: Vector Floating-Point MIN/MAX Instructions ++ VFMINVV V1, V2, V3 // d7912012 ++ VFMINVV V1, V2, V0, V3 // d7912010 ++ VFMINVF F10, V2, V3 // d7512512 ++ VFMINVF F10, V2, V0, V3 // d7512510 ++ VFMAXVV V1, V2, V3 // d791201a ++ VFMAXVV V1, V2, V0, V3 // d7912018 ++ VFMAXVF F10, V2, V3 // d751251a ++ VFMAXVF F10, V2, V0, V3 // d7512518 ++ ++ // 31.13.12: Vector Floating-Point Sign-Injection Instructions ++ VFSGNJVV V1, V2, V3 // d7912022 ++ VFSGNJVV V1, V2, V0, V3 // d7912020 ++ VFSGNJVF F10, V2, V3 // d7512522 ++ VFSGNJVF F10, V2, V0, V3 // d7512520 ++ VFSGNJNVV V1, V2, V3 // d7912026 ++ VFSGNJNVV V1, V2, V0, V3 // d7912024 ++ VFSGNJNVF F10, V2, V3 // d7512526 ++ VFSGNJNVF F10, V2, V0, V3 // d7512524 ++ VFSGNJXVV V1, V2, V3 // d791202a ++ VFSGNJXVV V1, V2, V0, V3 // d7912028 ++ VFSGNJXVF F10, V2, V3 // d751252a ++ VFSGNJXVF F10, V2, V0, V3 // d7512528 ++ VFNEGV V2, V3 // d7112126 ++ VFNEGV V2, V0, V3 // d7112124 ++ VFABSV V2, V3 // d711212a ++ VFABSV V2, V0, V3 // d7112128 ++ ++ // 31.13.13: Vector Floating-Point Compare Instructions ++ VMFEQVV V1, V2, V3 // d7912062 ++ VMFEQVV V1, V2, V0, V3 // d7912060 ++ VMFEQVF F10, V2, V3 // d7512562 ++ VMFEQVF F10, V2, V0, V3 // d7512560 ++ VMFNEVV V1, V2, V3 // d7912072 ++ VMFNEVV V1, V2, V0, V3 // d7912070 ++ VMFNEVF F10, V2, V3 // d7512572 ++ VMFNEVF F10, V2, V0, V3 // d7512570 ++ VMFLTVV V1, V2, V3 // d791206e ++ VMFLTVV V1, V2, V0, V3 // d791206c ++ VMFLTVF F10, V2, V3 // d751256e ++ VMFLTVF F10, V2, V0, V3 // d751256c ++ VMFLEVV V1, V2, V3 // d7912066 ++ VMFLEVV V1, V2, V0, V3 // d7912064 ++ 
VMFLEVF F10, V2, V3 // d7512566 ++ VMFLEVF F10, V2, V0, V3 // d7512564 ++ VMFGTVF F10, V2, V3 // d7512576 ++ VMFGTVF F10, V2, V0, V3 // d7512574 ++ VMFGEVF F10, V2, V3 // d751257e ++ VMFGEVF F10, V2, V0, V3 // d751257c ++ VMFGTVV V1, V2, V3 // d711116e ++ VMFGTVV V1, V2, V0, V3 // d711116c ++ VMFGEVV V1, V2, V3 // d7111166 ++ VMFGEVV V1, V2, V0, V3 // d7111164 ++ ++ // 31.13.14: Vector Floating-Point Classify Instruction ++ VFCLASSV V2, V3 // d711284e ++ VFCLASSV V2, V0, V3 // d711284c ++ ++ // 31.13.15: Vector Floating-Point Merge Instruction ++ VFMERGEVFM F10, V2, V0, V3 // d751255c ++ ++ // 31.13.16: Vector Floating-Point Move Instruction ++ VFMVVF F10, V3 // d751055e ++ ++ // 31.13.17: Single-Width Floating-Point/Integer Type-Convert Instructions ++ VFCVTXUFV V2, V3 // d711204a ++ VFCVTXUFV V2, V0, V3 // d7112048 ++ VFCVTXFV V2, V3 // d791204a ++ VFCVTXFV V2, V0, V3 // d7912048 ++ VFCVTRTZXUFV V2, V3 // d711234a ++ VFCVTRTZXUFV V2, V0, V3 // d7112348 ++ VFCVTRTZXFV V2, V3 // d791234a ++ VFCVTRTZXFV V2, V0, V3 // d7912348 ++ VFCVTFXUV V2, V3 // d711214a ++ VFCVTFXUV V2, V0, V3 // d7112148 ++ VFCVTFXV V2, V3 // d791214a ++ VFCVTFXV V2, V0, V3 // d7912148 ++ ++ // 31.13.18: Widening Floating-Point/Integer Type-Convert Instructions ++ VFWCVTXUFV V2, V3 // d711244a ++ VFWCVTXUFV V2, V0, V3 // d7112448 ++ VFWCVTXFV V2, V3 // d791244a ++ VFWCVTXFV V2, V0, V3 // d7912448 ++ VFWCVTRTZXUFV V2, V3 // d711274a ++ VFWCVTRTZXUFV V2, V0, V3 // d7112748 ++ VFWCVTRTZXFV V2, V3 // d791274a ++ VFWCVTRTZXFV V2, V0, V3 // d7912748 ++ VFWCVTFXUV V2, V3 // d711254a ++ VFWCVTFXUV V2, V0, V3 // d7112548 ++ VFWCVTFXV V2, V3 // d791254a ++ VFWCVTFXV V2, V0, V3 // d7912548 ++ VFWCVTFFV V2, V3 // d711264a ++ VFWCVTFFV V2, V0, V3 // d7112648 ++ ++ // 31.13.19: Narrowing Floating-Point/Integer Type-Convert Instructions ++ VFNCVTXUFW V2, V3 // d711284a ++ VFNCVTXUFW V2, V0, V3 // d7112848 ++ VFNCVTXFW V2, V3 // d791284a ++ VFNCVTXFW V2, V0, V3 // d7912848 ++ VFNCVTRTZXUFW V2, V3 // d7112b4a ++ VFNCVTRTZXUFW V2, V0, V3 // d7112b48 ++ VFNCVTRTZXFW V2, V3 // d7912b4a ++ VFNCVTRTZXFW V2, V0, V3 // d7912b48 ++ VFNCVTFXUW V2, V3 // d711294a ++ VFNCVTFXUW V2, V0, V3 // d7112948 ++ VFNCVTFXW V2, V3 // d791294a ++ VFNCVTFXW V2, V0, V3 // d7912948 ++ VFNCVTFFW V2, V3 // d7112a4a ++ VFNCVTFFW V2, V0, V3 // d7112a48 ++ VFNCVTRODFFW V2, V3 // d7912a4a ++ VFNCVTRODFFW V2, V0, V3 // d7912a48 ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 6a7c9b9444..3aeeadf848 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -50,208 +50,302 @@ TEXT errors(SB),$0 + // + // "V" Standard Extension for Vector Operations, Version 1.0 + // +- VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" +- VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" +- VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" +- VLSE8V (X10), X10, V1, V3 // ERROR "invalid vector mask register" +- VSSE8V V3, X11, V1, (X10) // ERROR "invalid vector mask register" +- VLUXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" +- VSUXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" +- VLOXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" +- VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" +- VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" +- VS1RV V3, V0, (X11) // ERROR "too many operands for 
instruction" +- VADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VADDVX X10, V2, V1, V3 // ERROR "invalid vector mask register" +- VADDVI $15, V4, V1, V2 // ERROR "invalid vector mask register" +- VSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" +- VRSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" +- VRSUBVI $15, V4, V1, V2 // ERROR "invalid vector mask register" +- VNEGV V2, V3, V4 // ERROR "invalid vector mask register" +- VWADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWCVTXXV V2, V1, V3 // ERROR "invalid vector mask register" +- VWCVTUXXV V2, V1, V3 // ERROR "invalid vector mask register" +- VZEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" +- VSEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" +- VZEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" +- VSEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" +- VZEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" +- VSEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" +- VADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VADCVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VADCVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" +- VADCVIM $15, V2, V3 // ERROR "invalid vector mask register" +- VMADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMADCVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VMADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMADCVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VMADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" +- VMADCVIM $15, V2, V3 // ERROR "invalid vector mask register" +- VSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VMSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VMSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VANDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VANDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VANDVI 
$15, V2, V4, V3 // ERROR "invalid vector mask register" +- VORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VXORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VXORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VXORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VNOTV V1, V2, V3 // ERROR "invalid vector mask register" +- VSLLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSLLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSLLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRLWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRLWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRLWI $31, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRAWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRAWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRAWI $31, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCVTXXW V2, V4, V3 // ERROR "invalid vector mask register" +- VMSEQVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSEQVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSEQVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSNEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSNEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSNEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMINUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMINUVX X10, V2, V4, V3 // ERROR "invalid 
vector mask register" +- VMINVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMINVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMAXUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMAXUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMAXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMAXVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VDIVUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VDIVUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VDIVVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VDIVVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VREMUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VREMUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VREMVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VREMVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNMSACVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNMSACVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNMSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNMSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCUSVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMERGEVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VMERGEVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMERGEVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VMERGEVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMERGEVIM $15, V2, V3 // ERROR "invalid vector mask register" +- VMERGEVIM $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMVVV V1, V2, V3 // ERROR "too many operands for instruction" +- VMVVX X10, V2, V3 // ERROR "too many operands for instruction" +- VMVVI $15, V2, V3 // ERROR "too many operands for instruction" +- VSADDUVV V1, V2, V4, V3 // ERROR "invalid vector 
mask register" +- VSADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSADDUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSADDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VAADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VAADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VAADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VAADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VASUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VASUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VASUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VASUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPUWI $16, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPWI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" ++ VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" ++ VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" ++ VLSE8V (X10), X10, V1, V3 // ERROR "invalid vector mask register" ++ VSSE8V V3, X11, V1, (X10) // ERROR "invalid vector mask register" ++ VLUXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" ++ VSUXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" ++ VLOXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" ++ VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" ++ VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" ++ VS1RV V3, V0, (X11) // ERROR "too many operands for instruction" ++ VADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADDVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VADDVI $15, V4, V1, V2 // ERROR "invalid vector mask register" ++ VSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VRSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VRSUBVI $15, V4, V1, V2 // ERROR "invalid vector mask register" ++ VNEGV V2, V3, V4 // ERROR "invalid vector mask register" ++ VWADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUVV V1, V2, V4, V3 // ERROR "invalid 
vector mask register" ++ VWSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWCVTXXV V2, V1, V3 // ERROR "invalid vector mask register" ++ VWCVTUXXV V2, V1, V3 // ERROR "invalid vector mask register" ++ VZEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" ++ VZEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" ++ VZEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" ++ VADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" ++ VMADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VANDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VANDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VANDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNOTV V1, V2, V3 // ERROR "invalid vector mask register" ++ VSLLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVV 
V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWI $31, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWI $31, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCVTXXW V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHUVV V1, V2, V4, V3 // ERROR 
"invalid vector mask register" ++ VMULHUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSACVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSACVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUSVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVIM $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMVVV V1, V2, V3 // ERROR "too many operands for instruction" ++ VMVVX X10, V2, V3 // ERROR "too many operands for instruction" ++ VMVVI $15, V2, V3 // ERROR "too many operands for instruction" ++ VSADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDUVX X10, V2, V4, V3 // 
ERROR "invalid vector mask register" ++ VAADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFADDVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSUBVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFRSUBVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWADDVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWSUBVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWADDWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWADDWF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWSUBWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWSUBWF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMULVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFDIVVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFDIVVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFRDIVVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWMULVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMACCVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFMACCVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFNMACCVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFNMACCVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFMSACVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFMSACVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFNMSACVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFNMSACVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFMADDVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFMADDVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFNMADDVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFNMADDVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFMSUBVV V2, 
V1, V4, V3 // ERROR "invalid vector mask register" ++ VFMSUBVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFNMSUBVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFNMSUBVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFWMACCVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFWMACCVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFWNMACCVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFWNMACCVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFWMSACVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFWMSACVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFWNMSACVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFWNMSACVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFSQRTV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFRSQRT7V V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREC7V V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMINVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMINVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMAXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMAXVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJNVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJNVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJXVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNEGV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFABSV V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFEQVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFEQVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFNEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFNEVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFLTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFLTVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFLEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFLEVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFGTVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFGEVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFGTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFGEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMERGEVFM X10, V2, V3 // ERROR "invalid vector mask register" ++ VFMERGEVFM F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTXUFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTXFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTRTZXUFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTRTZXFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTFXUV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTFXV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTXUFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTXFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTRTZXUFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTRTZXFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTFXUV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTFXV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTFFV V2, V4, V3 // ERROR "invalid 
vector mask register" ++ VFNCVTXUFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTXFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTRTZXUFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTRTZXFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTFXUW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTFXW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTFFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTRODFFW V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index c6f71e64fb..2c509a1e91 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -15,259 +15,354 @@ TEXT validation(SB),$0 + // + // "V" Standard Extension for Vector Operations, Version 1.0 + // +- VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" +- VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" +- VSETVL X10, X11 // ERROR "expected integer register in rs1 position" +- VLE8V (X10), X10 // ERROR "expected vector register in vd position" +- VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" +- VSE8V X10, (X10) // ERROR "expected vector register in vs1 position" +- VSE8V V3, (V1) // ERROR "expected integer register in rd position" +- VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" +- VLSE8V (X10), X10, X11 // ERROR "expected vector register in vd position" +- VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" +- VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" +- VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" +- VSSE8V X10, X11, (X10) // ERROR "expected vector register in vd position" +- VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" +- VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" +- VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" +- VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" +- VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" +- VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" +- VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" +- VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" +- VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" +- VL1RV (X10), X10 // ERROR "expected vector register in vd position" +- VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" +- VS1RV X11, (X11) // ERROR "expected vector register in vs1 position" +- VS1RV V3, (V1) // ERROR "expected integer register in rd position" +- VADDVV V1, X10, V3 // ERROR "expected vector register in vs2 position" +- VADDVX V1, V2, V3 // ERROR "expected integer register in rs1 
position" +- VADDVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" +- VADDVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" +- VSUBVV V1, X10, V3 // ERROR "expected vector register in vs2 position" +- VSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VRSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VRSUBVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" +- VRSUBVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" +- VNEGV X10, V3 // ERROR "expected vector register in vs2 position" +- VNEGV V2 // ERROR "expected vector register in vd position" +- VWADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWADDUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWADDUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWSUBUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWSUBUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWADDWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWADDWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWSUBWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWSUBWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWCVTXXV X10, V3 // ERROR "expected vector register in vs2 position" +- VWCVTUXXV X10, V3 // ERROR "expected vector register in vs2 position" +- VZEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VZEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" +- VSEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VSEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" +- VZEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VZEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" +- VSEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VSEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" +- VZEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VZEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" +- VSEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VSEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" +- VADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VMADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VMADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in 
range [-16, 15]" +- VMADCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMADCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" +- VMADCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMADCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" +- VMADCVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMADCVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMADCVI $15, V2, V0, V3 // ERROR "expected no register in rs3" +- VSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VMSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VMSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VMSBCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSBCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" +- VMSBCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSBCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" +- VANDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VANDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VANDVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VANDVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VXORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VXORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VXORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VXORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VNOTV V3 // ERROR "expected vector register in vd position" +- VNOTV X10, V3 // ERROR "expected vector register in vs2 position" +- VSLLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSLLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSLLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VSLLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSRLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VSRLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSRAVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VSRAVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VNSRLWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNSRLWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNSRLWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VNSRLWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VNSRAWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNSRAWX V1, V2, V3 // ERROR "expected integer register 
in rs1 position" +- VNSRAWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VNSRAWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VNCVTXXW X10, V3 // ERROR "expected vector register in vs2 position" +- VMSEQVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSEQVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSEQVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSEQVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSNEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSNEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSNEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSNEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSLTUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSLTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSLTVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSLTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSLEUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSLEUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSLEUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSLEUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSLEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSLEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSLEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSLEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGTUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" +- VMSGTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSGTUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSGTUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGTVV X10, V2, V3 // ERROR "expected vector register in vs2 position" +- VMSGTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSGTVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSGTVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGEVV X10, V2, V3 // ERROR "expected vector register in vs2 position" +- VMSGEUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" +- VMSLTVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSLTVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSLTUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSLTUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGEVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSGEVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGEUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSGEUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMINUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMINUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMINVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMINVX V1, V2, V3 // ERROR "expected integer register in rs1 position" 
+- VMAXUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMAXUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMAXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMAXVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMULHVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMULHVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMULHUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMULHUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMULHSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMULHSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VDIVUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VDIVUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VDIVVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VDIVVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VREMUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VREMUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VREMVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VREMVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMULUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMULUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMULSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMULSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNMSACVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNMSACVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNMSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNMSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMACCUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMACCSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCUSVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMERGEVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VMERGEVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VMERGEVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMERGEVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMVVV X10, V3 // ERROR "expected vector register in vs1 position" +- VMVVX V1, V2 // ERROR "expected integer 
register in rs1 position" +- VMVVI $16, V2 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMVVI $-17, V2 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VSADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSADDUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VSADDUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VSSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VAADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VAADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VAADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VAADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VASUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VASUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VASUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VASUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSSRLVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" +- VSSRLVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" +- VSSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSSRAVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" +- VSSRAVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" +- VNCLIPUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNCLIPUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNCLIPUWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" +- VNCLIPUWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" +- VNCLIPWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNCLIPWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNCLIPWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" +- VNCLIPWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVL X10, X11 // ERROR "expected integer register in rs1 position" ++ VLE8V (X10), X10 // ERROR "expected vector register in vd position" ++ VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" ++ VSE8V X10, (X10) // ERROR "expected vector register in vs1 position" ++ VSE8V V3, (V1) // ERROR "expected integer register in rd position" ++ VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" ++ VLSE8V (X10), X10, X11 // ERROR "expected vector register in vd position" ++ VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" ++ VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" ++ VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" ++ 
VSSE8V X10, X11, (X10) // ERROR "expected vector register in vd position" ++ VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" ++ VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" ++ VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" ++ VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" ++ VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" ++ VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" ++ VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" ++ VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" ++ VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" ++ VL1RV (X10), X10 // ERROR "expected vector register in vd position" ++ VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" ++ VS1RV X11, (X11) // ERROR "expected vector register in vs1 position" ++ VS1RV V3, (V1) // ERROR "expected integer register in rd position" ++ VADDVV V1, X10, V3 // ERROR "expected vector register in vs2 position" ++ VADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VADDVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" ++ VADDVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" ++ VSUBVV V1, X10, V3 // ERROR "expected vector register in vs2 position" ++ VSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRSUBVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" ++ VRSUBVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" ++ VNEGV X10, V3 // ERROR "expected vector register in vs2 position" ++ VNEGV V2 // ERROR "expected vector register in vd position" ++ VWADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ 
VWSUBWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWCVTXXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VWCVTUXXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" ++ VADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMADCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMADCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMADCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMADCVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMADCVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVI $15, V2, V0, V3 // ERROR "expected no register in rs3" ++ VSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSBCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMSBCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" ++ VANDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VANDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VANDVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VANDVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VXORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VXORVX V1, V2, V3 // ERROR "expected 
integer register in rs1 position" ++ VXORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VXORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VNOTV V3 // ERROR "expected vector register in vd position" ++ VNOTV X10, V3 // ERROR "expected vector register in vs2 position" ++ VSLLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSLLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSLLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSLLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSRLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSRLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSRAVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSRAVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNSRLWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNSRLWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNSRLWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VNSRLWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNSRAWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNSRAWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNSRAWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VNSRAWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNCVTXXW X10, V3 // ERROR "expected vector register in vs2 position" ++ VMSEQVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSEQVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSEQVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSEQVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSNEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSNEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSNEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSNEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLTUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLTVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLEUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLEUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGTUVV X10, V2, V3 // ERROR "expected 
vector register in vs2 position" ++ VMSGTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSGTUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGTUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGTVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSGTVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGTVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGEUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSLTVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLTVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLTUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLTUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGEVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGEUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMINUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMINUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMINVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMINVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMAXUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMAXUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMAXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMAXVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VDIVUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VDIVUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VDIVVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VDIVVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VREMUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREMUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VREMVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREMVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULSUVV X10, V2, V3 // ERROR "expected vector 
register in vs1 position" ++ VWMULSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNMSACVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSACVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNMSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCUSVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMERGEVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMERGEVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMERGEVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMERGEVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMVVV X10, V3 // ERROR "expected vector register in vs1 position" ++ VMVVX V1, V2 // ERROR "expected integer register in rs1 position" ++ VMVVI $16, V2 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMVVI $-17, V2 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VSADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSADDUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VSADDUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VSSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VAADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VAADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VAADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VAADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VASUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VASUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VASUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VASUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRLVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VSSRLVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VSSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 
position" ++ VSSRAVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VSSRAVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VNCLIPUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNCLIPUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNCLIPUWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VNCLIPUWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VNCLIPWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNCLIPWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNCLIPWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VNCLIPWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VFADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFADDVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFSUBVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFRSUBVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWADDVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWSUBVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWADDWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWADDWF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWSUBWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWSUBWF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFMULVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFDIVVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFDIVVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFRDIVVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWMULVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFMACCVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFNMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFNMACCVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFMSACVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFNMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFNMSACVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFMADDVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFMADDVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFNMADDVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFNMADDVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFMSUBVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFMSUBVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFNMSUBVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFNMSUBVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFWMACCVV V2, X10, V3 // ERROR 
"expected vector register in vs1 position" ++ VFWMACCVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFWNMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFWNMACCVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFWMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFWMSACVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFWNMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFWNMSACVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFSQRTV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFRSQRT7V X10, V3 // ERROR "expected vector register in vs2 position" ++ VFREC7V X10, V3 // ERROR "expected vector register in vs2 position" ++ VFMINVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFMINVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFMAXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFMAXVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFSGNJVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFSGNJVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFSGNJNVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFSGNJNVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFSGNJXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFSGNJXVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFNEGV V2, X10 // ERROR "expected vector register in vd position" ++ VFABSV V2, X10 // ERROR "expected vector register in vd position" ++ VMFEQVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMFEQVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFNEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMFNEVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFLTVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMFLTVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFLEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMFLEVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFGTVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFGEVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFGTVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMFGEVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VFCLASSV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFMERGEVFM X10, V2, V0, V3 // ERROR "expected float register in rs1 position" ++ VFMVVF X10, V3 // ERROR "expected float register in rs1 position" ++ VFCVTXUFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTXFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTRTZXUFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTRTZXFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTFXUV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTFXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFWCVTXUFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFWCVTXFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFWCVTRTZXUFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFWCVTRTZXFV X10, V3 // ERROR "expected vector register in vs2 position" ++ 
VFWCVTFXUV X10, V3 // ERROR "expected vector register in vs2 position"
++ VFWCVTFXV X10, V3 // ERROR "expected vector register in vs2 position"
++ VFWCVTFFV X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTXUFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTXFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTRTZXUFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTRTZXFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTFXUW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTFXW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTFFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTRODFFW X10, V3 // ERROR "expected vector register in vs2 position"
+
+ RET
+diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go
+index a65dfceea9..bf1fdb8b88 100644
+--- a/src/cmd/internal/obj/riscv/anames.go
++++ b/src/cmd/internal/obj/riscv/anames.go
+@@ -650,6 +650,10 @@ var Anames = []string{
+ "RDTIME",
+ "SEQZ",
+ "SNEZ",
++ "VFABSV",
++ "VFNEGV",
++ "VMFGEVV",
++ "VMFGTVV",
+ "VL1RV",
+ "VL2RV",
+ "VL4RV",
+diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go
+index 577b06f0ec..b641eadde7 100644
+--- a/src/cmd/internal/obj/riscv/cpu.go
++++ b/src/cmd/internal/obj/riscv/cpu.go
+@@ -1168,6 +1168,10 @@ const (
+ ARDTIME
+ ASEQZ
+ ASNEZ
++ AVFABSV
++ AVFNEGV
++ AVMFGEVV
++ AVMFGTVV
+ AVL1RV
+ AVL2RV
+ AVL4RV
+diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go
+index e7870000cf..6066c840ca 100644
+--- a/src/cmd/internal/obj/riscv/obj.go
++++ b/src/cmd/internal/obj/riscv/obj.go
+@@ -1189,17 +1189,24 @@ func validateRFI(ctxt *obj.Link, ins *instruction) {
+ wantNoneReg(ctxt, ins, "rs3", ins.rs3)
+ }
+
+-func validateRIF(ctxt *obj.Link, ins *instruction) {
++func validateRFF(ctxt *obj.Link, ins *instruction) {
+ wantFloatReg(ctxt, ins, "rd", ins.rd)
+ wantNoneReg(ctxt, ins, "rs1", ins.rs1)
+- wantIntReg(ctxt, ins, "rs2", ins.rs2)
++ wantFloatReg(ctxt, ins, "rs2", ins.rs2)
+ wantNoneReg(ctxt, ins, "rs3", ins.rs3)
+ }
+
+-func validateRFF(ctxt *obj.Link, ins *instruction) {
++func validateRIF(ctxt *obj.Link, ins *instruction) {
+ wantFloatReg(ctxt, ins, "rd", ins.rd)
+ wantNoneReg(ctxt, ins, "rs1", ins.rs1)
+- wantFloatReg(ctxt, ins, "rs2", ins.rs2)
++ wantIntReg(ctxt, ins, "rs2", ins.rs2)
++ wantNoneReg(ctxt, ins, "rs3", ins.rs3)
++}
++
++func validateRVFV(ctxt *obj.Link, ins *instruction) {
++ wantVectorReg(ctxt, ins, "vd", ins.rd)
++ wantFloatReg(ctxt, ins, "rs1", ins.rs1)
++ wantVectorReg(ctxt, ins, "vs2", ins.rs2)
+ wantNoneReg(ctxt, ins, "rs3", ins.rs3)
+ }
+
+@@ -1440,12 +1447,20 @@ func encodeRFI(ins *instruction) uint32 {
+ return encodeR(ins.as, regF(ins.rs2), 0, regI(ins.rd), ins.funct3, ins.funct7)
+ }
+
++func encodeRFF(ins *instruction) uint32 {
++ return encodeR(ins.as, regF(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7)
++}
++
+ func encodeRIF(ins *instruction) uint32 {
+ return encodeR(ins.as, regI(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7)
+ }
+
+-func encodeRFF(ins *instruction) uint32 {
+- return encodeR(ins.as, regF(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7)
++func encodeRVFV(ins *instruction) uint32 {
++ return encodeR(ins.as, regF(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
++}
++
++func encodeRVIV(ins *instruction) uint32 {
++ return encodeR(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
+ }
+
+ func encodeRVV(ins *instruction) uint32 {
+@@ -1460,10 +1475,6 @@ func encodeRVVu(ins *instruction) uint32 {
+ return encodeR(ins.as, immU(ins.as, ins.imm, 5), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
+ }
+
+-func encodeRVIV(ins *instruction) uint32 {
+- return encodeR(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
+-}
+-
+ func encodeRVVV(ins *instruction) uint32 {
+ return encodeR(ins.as, regV(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
+ }
+@@ -1751,10 +1762,11 @@ var (
+ rFIEncoding = encoding{encode: encodeRFI, validate: validateRFI, length: 4}
+ rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4}
+ rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4}
++ rVFVEncoding = encoding{encode: encodeRVFV, validate: validateRVFV, length: 4}
++ rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4}
+ rVVEncoding = encoding{encode: encodeRVV, validate: validateRVV, length: 4}
+ rVViEncoding = encoding{encode: encodeRVVi, validate: validateRVVi, length: 4}
+ rVVuEncoding = encoding{encode: encodeRVVu, validate: validateRVVu, length: 4}
+- rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4}
+ rVVVEncoding = encoding{encode: encodeRVVV, validate: validateRVVV, length: 4}
+
+ iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4}
+@@ -2328,6 +2340,133 @@ var instructions = [ALAST & obj.AMask]instructionData{
+ AVNCLIPWX & obj.AMask: {enc: rVIVEncoding},
+ AVNCLIPWI & obj.AMask: {enc: rVVuEncoding},
+
++ // 31.13.2: Vector Single-Width Floating-Point Add/Subtract Instructions
++ AVFADDVV & obj.AMask: {enc: rVVVEncoding},
++ AVFADDVF & obj.AMask: {enc: rVFVEncoding},
++ AVFSUBVV & obj.AMask: {enc: rVVVEncoding},
++ AVFSUBVF & obj.AMask: {enc: rVFVEncoding},
++ AVFRSUBVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.3: Vector Widening Floating-Point Add/Subtract Instructions
++ AVFWADDVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWADDVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWSUBVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWSUBVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWADDWV & obj.AMask: {enc: rVVVEncoding},
++ AVFWADDWF & obj.AMask: {enc: rVFVEncoding},
++ AVFWSUBWV & obj.AMask: {enc: rVVVEncoding},
++ AVFWSUBWF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.4: Vector Single-Width Floating-Point Multiply/Divide Instructions
++ AVFMULVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMULVF & obj.AMask: {enc: rVFVEncoding},
++ AVFDIVVV & obj.AMask: {enc: rVVVEncoding},
++ AVFDIVVF & obj.AMask: {enc: rVFVEncoding},
++ AVFRDIVVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.5: Vector Widening Floating-Point Multiply
++ AVFWMULVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWMULVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.6: Vector Single-Width Floating-Point Fused Multiply-Add Instructions
++ AVFMACCVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMACCVF & obj.AMask: {enc: rVFVEncoding},
++ AVFNMACCVV & obj.AMask: {enc: rVVVEncoding},
++ AVFNMACCVF & obj.AMask: {enc: rVFVEncoding},
++ AVFMSACVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMSACVF & obj.AMask: {enc: rVFVEncoding},
++ AVFNMSACVV & obj.AMask: {enc: rVVVEncoding},
++ AVFNMSACVF & obj.AMask: {enc: rVFVEncoding},
++ AVFMADDVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMADDVF & obj.AMask: {enc: rVFVEncoding},
++ AVFNMADDVV & obj.AMask: {enc: rVVVEncoding},
++ AVFNMADDVF & obj.AMask: {enc: rVFVEncoding},
++ AVFMSUBVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMSUBVF & obj.AMask: {enc: rVFVEncoding},
++ AVFNMSUBVV & obj.AMask: {enc: rVVVEncoding},
++ AVFNMSUBVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.7: Vector Widening Floating-Point Fused Multiply-Add Instructions
++ AVFWMACCVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWMACCVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWNMACCVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWNMACCVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWMSACVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWMSACVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWNMSACVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWNMSACVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.8: Vector Floating-Point Square-Root Instruction
++ AVFSQRTV & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.9: Vector Floating-Point Reciprocal Square-Root Estimate Instruction
++ AVFRSQRT7V & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.10: Vector Floating-Point Reciprocal Estimate Instruction
++ AVFREC7V & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.11: Vector Floating-Point MIN/MAX Instructions
++ AVFMINVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMINVF & obj.AMask: {enc: rVFVEncoding},
++ AVFMAXVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMAXVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.12: Vector Floating-Point Sign-Injection Instructions
++ AVFSGNJVV & obj.AMask: {enc: rVVVEncoding},
++ AVFSGNJVF & obj.AMask: {enc: rVFVEncoding},
++ AVFSGNJNVV & obj.AMask: {enc: rVVVEncoding},
++ AVFSGNJNVF & obj.AMask: {enc: rVFVEncoding},
++ AVFSGNJXVV & obj.AMask: {enc: rVVVEncoding},
++ AVFSGNJXVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.13: Vector Floating-Point Compare Instructions
++ AVMFEQVV & obj.AMask: {enc: rVVVEncoding},
++ AVMFEQVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFNEVV & obj.AMask: {enc: rVVVEncoding},
++ AVMFNEVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFLTVV & obj.AMask: {enc: rVVVEncoding},
++ AVMFLTVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFLEVV & obj.AMask: {enc: rVVVEncoding},
++ AVMFLEVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFGTVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFGEVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.14: Vector Floating-Point Classify Instruction
++ AVFCLASSV & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.15: Vector Floating-Point Merge Instruction
++ AVFMERGEVFM & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.16: Vector Floating-Point Move Instruction
++ AVFMVVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.17: Single-Width Floating-Point/Integer Type-Convert Instructions
++ AVFCVTXUFV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTXFV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTRTZXUFV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTRTZXFV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTFXUV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTFXV & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.18: Widening Floating-Point/Integer Type-Convert Instructions
++ AVFWCVTXUFV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTXFV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTRTZXUFV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTRTZXFV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTFXUV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTFXV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTFFV & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.19: Narrowing Floating-Point/Integer Type-Convert Instructions
++ AVFNCVTXUFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTXFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTRTZXUFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTRTZXFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTFXUW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTFXW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTFFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTRODFFW & obj.AMask: {enc: rVVEncoding},
++
+ //
+ // Privileged ISA
+ //
+@@ -3315,7 +3454,13 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ AVSADDUVV, AVSADDUVX, AVSADDUVI, AVSADDVV, AVSADDVX, AVSADDVI, AVSSUBUVV, AVSSUBUVX, AVSSUBVV, AVSSUBVX,
+ AVAADDUVV, AVAADDUVX, AVAADDVV, AVAADDVX, AVASUBUVV, AVASUBUVX, AVASUBVV, AVASUBVX,
+ AVSMULVV, AVSMULVX, AVSSRLVV, AVSSRLVX, AVSSRLVI, AVSSRAVV, AVSSRAVX, AVSSRAVI,
+- AVNCLIPUWV, AVNCLIPUWX, AVNCLIPUWI, AVNCLIPWV, AVNCLIPWX, AVNCLIPWI:
++ AVNCLIPUWV, AVNCLIPUWX, AVNCLIPUWI, AVNCLIPWV, AVNCLIPWX, AVNCLIPWI,
++ AVFADDVV, AVFADDVF, AVFSUBVV, AVFSUBVF, AVFRSUBVF,
++ AVFWADDVV, AVFWADDVF, AVFWSUBVV, AVFWSUBVF, AVFWADDWV, AVFWADDWF, AVFWSUBWV, AVFWSUBWF,
++ AVFMULVV, AVFMULVF, AVFDIVVV, AVFDIVVF, AVFRDIVVF, AVFWMULVV, AVFWMULVF,
++ AVFMINVV, AVFMINVF, AVFMAXVV, AVFMAXVF,
++ AVFSGNJVV, AVFSGNJVF, AVFSGNJNVV, AVFSGNJNVF, AVFSGNJXVV, AVFSGNJXVF,
++ AVMFEQVV, AVMFEQVF, AVMFNEVV, AVMFNEVF, AVMFLTVV, AVMFLTVF, AVMFLEVV, AVMFLEVF, AVMFGTVF, AVMFGEVF:
+ // Set mask bit
+ switch {
+ case ins.rs3 == obj.REG_NONE:
+@@ -3325,6 +3470,17 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ }
+ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg), obj.REG_NONE
+
++ case AVFMACCVV, AVFMACCVF, AVFNMACCVV, AVFNMACCVF, AVFMSACVV, AVFMSACVF, AVFNMSACVV, AVFNMSACVF,
++ AVFMADDVV, AVFMADDVF, AVFNMADDVV, AVFNMADDVF, AVFMSUBVV, AVFMSUBVF, AVFNMSUBVV, AVFNMSUBVF,
++ AVFWMACCVV, AVFWMACCVF, AVFWNMACCVV, AVFWNMACCVF, AVFWMSACVV, AVFWMSACVF, AVFWNMSACVV, AVFWNMSACVF:
++ switch {
++ case ins.rs3 == obj.REG_NONE:
++ ins.funct7 |= 1 // unmasked
++ case ins.rs3 != REG_V0:
++ p.Ctxt.Diag("%v: invalid vector mask register", p)
++ }
++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), obj.REG_NONE
++
+ case AVADDVI, AVRSUBVI, AVANDVI, AVORVI, AVXORVI, AVMSEQVI, AVMSNEVI, AVMSLEUVI, AVMSLEVI, AVMSGTUVI, AVMSGTVI,
+ AVSLLVI, AVSRLVI, AVSRAVI, AVNSRLWI, AVNSRAWI:
+ // Set mask bit
+@@ -3336,7 +3492,10 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ }
+ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), obj.REG_NONE, uint32(p.Reg), obj.REG_NONE
+
+- case AVZEXTVF2, AVSEXTVF2, AVZEXTVF4, AVSEXTVF4, AVZEXTVF8, AVSEXTVF8:
++ case AVZEXTVF2, AVSEXTVF2, AVZEXTVF4, AVSEXTVF4, AVZEXTVF8, AVSEXTVF8, AVFSQRTV, AVFRSQRT7V, AVFREC7V, AVFCLASSV,
++ AVFCVTXUFV, AVFCVTXFV, AVFCVTRTZXUFV, AVFCVTRTZXFV, AVFCVTFXUV, AVFCVTFXV,
++ AVFWCVTXUFV, AVFWCVTXFV, AVFWCVTRTZXUFV, AVFWCVTRTZXFV, AVFWCVTFXUV, AVFWCVTFXV, AVFWCVTFFV,
++ AVFNCVTXUFW, AVFNCVTXFW, AVFNCVTRTZXUFW, AVFNCVTRTZXFW, AVFNCVTFXUW, AVFNCVTFXW, AVFNCVTFFW, AVFNCVTRODFFW:
+ // Set mask bit
+ switch {
+ case ins.rs1 == obj.REG_NONE:
+@@ -3358,8 +3517,12 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ }
+ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), obj.REG_NONE, REG_V0
+
++ case AVFMVVF:
++ ins.funct7 |= 1 // unmasked
++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), REG_V0
++
+ case AVADCVVM, AVADCVXM, AVMADCVVM, AVMADCVXM, AVSBCVVM, AVSBCVXM, AVMSBCVVM, AVMSBCVXM, AVADCVIM, AVMADCVIM,
+- AVMERGEVVM, AVMERGEVXM, AVMERGEVIM:
++ AVMERGEVVM, AVMERGEVXM, AVMERGEVIM, AVFMERGEVFM:
+ if ins.rs3 != REG_V0 {
+ p.Ctxt.Diag("%v: invalid vector mask register", p)
+ }
+@@ -3399,7 +3562,7 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ ins.as = AVXORVI
+ ins.rd, ins.rs1, ins.rs2, ins.imm = uint32(p.To.Reg), obj.REG_NONE, uint32(p.From.Reg), -1
+ +- case AVMSGTVV, AVMSGTUVV, AVMSGEVV, AVMSGEUVV: ++ case AVMSGTVV, AVMSGTUVV, AVMSGEVV, AVMSGEUVV, AVMFGTVV, AVMFGEVV: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +@@ -3416,6 +3579,10 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AVMSLEVV + case AVMSGEUVV: + ins.as = AVMSLEUVV ++ case AVMFGTVV: ++ ins.as = AVMFLTVV ++ case AVMFGEVV: ++ ins.as = AVMFLEVV + } + ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), obj.REG_NONE + +@@ -3438,6 +3605,22 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AVMSGTUVI + } + ins.rd, ins.rs1, ins.rs2, ins.rs3, ins.imm = uint32(p.To.Reg), obj.REG_NONE, uint32(p.Reg), obj.REG_NONE, ins.imm-1 ++ ++ case AVFABSV, AVFNEGV: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ switch ins.as { ++ case AVFABSV: ++ ins.as = AVFSGNJXVV ++ case AVFNEGV: ++ ins.as = AVFSGNJNVV ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.From.Reg) + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch b/2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch new file mode 100644 index 0000000..ea3d014 --- /dev/null +++ b/2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch @@ -0,0 +1,176 @@ +From a65365b6a8f51e2333a4228b2aaf64cf02902177 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 096/119] cmd/internal/obj/riscv: add support for vector + reduction instructions + +Add support for vector reduction instructions to the RISC-V assembler, +including single-width integer reduction, widening integer reduction, +single-width floating-point reduction and widening floating-point +reduction. 
+ +Change-Id: I8f17bef11389f3a017e0430275023fc5d75936e3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646778 +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Carlos Amedee +Reviewed-by: Dmitri Shuralyov +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 40 +++++++++++++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 15 +++++++ + .../internal/asm/testdata/riscv64validation.s | 16 ++++++++ + src/cmd/internal/obj/riscv/obj.go | 28 ++++++++++++- + 4 files changed, 98 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 7e2a070bd0..13f0279fc7 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -1161,6 +1161,46 @@ start: + VFNCVTRODFFW V2, V3 // d7912a4a + VFNCVTRODFFW V2, V0, V3 // d7912a48 + ++ // 31.14.1: Vector Single-Width Integer Reduction Instructions ++ VREDSUMVS V1, V2, V3 // d7a12002 ++ VREDSUMVS V1, V2, V0, V3 // d7a12000 ++ VREDMAXUVS V1, V2, V3 // d7a1201a ++ VREDMAXUVS V1, V2, V0, V3 // d7a12018 ++ VREDMAXVS V1, V2, V3 // d7a1201e ++ VREDMAXVS V1, V2, V0, V3 // d7a1201c ++ VREDMINUVS V1, V2, V3 // d7a12012 ++ VREDMINUVS V1, V2, V0, V3 // d7a12010 ++ VREDMINVS V1, V2, V3 // d7a12016 ++ VREDMINVS V1, V2, V0, V3 // d7a12014 ++ VREDANDVS V1, V2, V3 // d7a12006 ++ VREDANDVS V1, V2, V0, V3 // d7a12004 ++ VREDORVS V1, V2, V3 // d7a1200a ++ VREDORVS V1, V2, V0, V3 // d7a12008 ++ VREDXORVS V1, V2, V3 // d7a1200e ++ VREDXORVS V1, V2, V0, V3 // d7a1200c ++ ++ // 31.14.2: Vector Widening Integer Reduction Instructions ++ VWREDSUMUVS V1, V2, V3 // d78120c2 ++ VWREDSUMUVS V1, V2, V0, V3 // d78120c0 ++ VWREDSUMVS V1, V2, V3 // d78120c6 ++ VWREDSUMVS V1, V2, V0, V3 // d78120c4 ++ ++ // 31.14.3: Vector Single-Width Floating-Point Reduction Instructions ++ VFREDOSUMVS V1, V2, V3 // d791200e ++ VFREDOSUMVS V1, V2, V0, V3 // d791200c ++ VFREDUSUMVS V1, V2, V3 // d7912006 ++ VFREDUSUMVS V1, V2, V0, V3 // d7912004 ++ VFREDMAXVS V1, V2, V3 // d791201e ++ VFREDMAXVS V1, V2, V0, V3 // d791201c ++ VFREDMINVS V1, V2, V3 // d7912016 ++ VFREDMINVS V1, V2, V0, V3 // d7912014 ++ ++ // 31.14.4: Vector Widening Floating-Point Reduction Instructions ++ VFWREDOSUMVS V1, V2, V3 // d79120ce ++ VFWREDOSUMVS V1, V2, V0, V3 // d79120cc ++ VFWREDUSUMVS V1, V2, V3 // d79120c6 ++ VFWREDUSUMVS V1, V2, V0, V3 // d79120c4 ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 3aeeadf848..3a4bb1c761 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -347,5 +347,20 @@ TEXT errors(SB),$0 + VFNCVTFXW V2, V4, V3 // ERROR "invalid vector mask register" + VFNCVTFFW V2, V4, V3 // ERROR "invalid vector mask register" + VFNCVTRODFFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDMAXUVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDMAXVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDMINUVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDMINVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDANDVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDORVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDXORVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWREDSUMUVS V1, V2, V4, V3 // ERROR 
"invalid vector mask register" ++ VWREDSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREDOSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREDUSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREDMAXVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREDMINVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWREDOSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 2c509a1e91..adb10823d7 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -364,5 +364,21 @@ TEXT validation(SB),$0 + VFNCVTFXW X10, V3 // ERROR "expected vector register in vs2 position" + VFNCVTFFW X10, V3 // ERROR "expected vector register in vs2 position" + VFNCVTRODFFW X10, V3 // ERROR "expected vector register in vs2 position" ++ VREDSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDMAXUVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDMAXVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDMINUVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDMINVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDANDVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDORVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDXORVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWREDSUMUVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWREDSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFREDOSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFREDUSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFREDMAXVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFREDMINVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWREDOSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWREDUSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" + + RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 6066c840ca..9b99416b95 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2467,6 +2467,30 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVFNCVTFFW & obj.AMask: {enc: rVVEncoding}, + AVFNCVTRODFFW & obj.AMask: {enc: rVVEncoding}, + ++ // 31.14.1: Vector Single-Width Integer Reduction Instructions ++ AVREDSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDMAXUVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDMAXVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDMINUVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDMINVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDANDVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDORVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDXORVS & obj.AMask: {enc: rVVVEncoding}, ++ ++ // 31.14.2: Vector Widening Integer Reduction Instructions ++ AVWREDSUMUVS & obj.AMask: {enc: rVVVEncoding}, ++ AVWREDSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ ++ // 31.14.3: Vector Single-Width Floating-Point Reduction Instructions ++ AVFREDOSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ AVFREDUSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ AVFREDMAXVS & obj.AMask: {enc: rVVVEncoding}, ++ AVFREDMINVS & obj.AMask: {enc: rVVVEncoding}, ++ ++ // 31.14.4: Vector 
Widening Floating-Point Reduction Instructions ++ AVFWREDOSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ AVFWREDUSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ + // + // Privileged ISA + // +@@ -3460,7 +3484,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + AVFMULVV, AVFMULVF, AVFDIVVV, AVFDIVVF, AVFRDIVVF, AVFWMULVV, AVFWMULVF, + AVFMINVV, AVFMINVF, AVFMAXVV, AVFMAXVF, + AVFSGNJVV, AVFSGNJVF, AVFSGNJNVV, AVFSGNJNVF, AVFSGNJXVV, AVFSGNJXVF, +- AVMFEQVV, AVMFEQVF, AVMFNEVV, AVMFNEVF, AVMFLTVV, AVMFLTVF, AVMFLEVV, AVMFLEVF, AVMFGTVF, AVMFGEVF: ++ AVMFEQVV, AVMFEQVF, AVMFNEVV, AVMFNEVF, AVMFLTVV, AVMFLTVF, AVMFLEVV, AVMFLEVF, AVMFGTVF, AVMFGEVF, ++ AVREDSUMVS, AVREDMAXUVS, AVREDMAXVS, AVREDMINUVS, AVREDMINVS, AVREDANDVS, AVREDORVS, AVREDXORVS, ++ AVWREDSUMUVS, AVWREDSUMVS, AVFREDOSUMVS, AVFREDUSUMVS, AVFREDMAXVS, AVFREDMINVS, AVFWREDOSUMVS, AVFWREDUSUMVS: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +-- +2.39.5 + diff --git a/2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch b/2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch new file mode 100644 index 0000000..fb39fe4 --- /dev/null +++ b/2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch @@ -0,0 +1,269 @@ +From 1027809e88579f1ad50ce1acb5c2eb6bea8ca51d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 097/119] cmd/internal/obj/riscv: add support for vector mask + instructions + +Add support for vector mask instructions to the RISC-V assembler. +These allow manipulation of vector masks and include mask register +logical instructions, population count and find-first bit set +instructions. + +Change-Id: I3ab3aa0f918338aee9b37ac5a2b2fdc407875072 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646779 +Reviewed-by: Carlos Amedee +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Junyang Shao +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 28 ++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 6 ++ + .../internal/asm/testdata/riscv64validation.s | 19 +++++ + src/cmd/internal/obj/riscv/anames.go | 8 ++- + src/cmd/internal/obj/riscv/cpu.go | 8 ++- + src/cmd/internal/obj/riscv/obj.go | 70 +++++++++++++++++++ + 6 files changed, 135 insertions(+), 4 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 13f0279fc7..ffffbf4bd4 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -1201,6 +1201,34 @@ start: + VFWREDUSUMVS V1, V2, V3 // d79120c6 + VFWREDUSUMVS V1, V2, V0, V3 // d79120c4 + ++ // 31.15: Vector Mask Instructions ++ VMANDMM V1, V2, V3 // d7a12066 ++ VMNANDMM V1, V2, V3 // d7a12076 ++ VMANDNMM V1, V2, V3 // d7a12062 ++ VMXORMM V1, V2, V3 // d7a1206e ++ VMORMM V1, V2, V3 // d7a1206a ++ VMNORMM V1, V2, V3 // d7a1207a ++ VMORNMM V1, V2, V3 // d7a12072 ++ VMXNORMM V1, V2, V3 // d7a1207e ++ VMMVM V2, V3 // d7212166 ++ VMCLRM V3 // d7a1316e ++ VMSETM V3 // d7a1317e ++ VMNOTM V2, V3 // d7212176 ++ VCPOPM V2, X10 // 57252842 ++ VCPOPM V2, V0, X10 // 57252840 ++ VFIRSTM V2, X10 // 57a52842 ++ VFIRSTM V2, V0, X10 // 57a52840 ++ VMSBFM V2, V3 // d7a12052 ++ VMSBFM V2, V0, V3 // d7a12050 ++ VMSIFM V2, V3 // d7a12152 ++ VMSIFM V2, V0, V3 // d7a12150 ++ VMSOFM V2, V3 // d7212152 ++ VMSOFM V2, V0, V3 // d7212150 ++ VIOTAM V2, V3 // d7212852 ++ VIOTAM V2, V0, V3 // d7212850 ++ VIDV V3 // d7a10852 ++ VIDV V0, V3 // d7a10850 ++ + // + // Privileged ISA + // +diff --git 
a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 3a4bb1c761..b076cf50e0 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -362,5 +362,11 @@ TEXT errors(SB),$0 + VFREDMAXVS V1, V2, V4, V3 // ERROR "invalid vector mask register" + VFREDMINVS V1, V2, V4, V3 // ERROR "invalid vector mask register" + VFWREDOSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VCPOPM V2, V4, X10 // ERROR "invalid vector mask register" ++ VFIRSTM V2, V4, X10 // ERROR "invalid vector mask register" ++ VMSBFM V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSIFM V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSOFM V2, V4, V3 // ERROR "invalid vector mask register" ++ VIOTAM V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index adb10823d7..8b0349584f 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -380,5 +380,24 @@ TEXT validation(SB),$0 + VFREDMINVS X10, V2, V3 // ERROR "expected vector register in vs1 position" + VFWREDOSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" + VFWREDUSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMANDMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMNANDMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMANDNMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMXORMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMORMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMNORMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMORNMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMXNORMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMMVM V3, X10 // ERROR "expected vector register in vd position" ++ VMNOTM V3, X10 // ERROR "expected vector register in vd position" ++ VCPOPM V2, V1 // ERROR "expected integer register in rd position" ++ VCPOPM X11, X10 // ERROR "expected vector register in vs2 position" ++ VFIRSTM V2, V1 // ERROR "expected integer register in rd position" ++ VFIRSTM X11, X10 // ERROR "expected vector register in vs2 position" ++ VMSBFM X10, V3 // ERROR "expected vector register in vs2 position" ++ VMSIFM X10, V3 // ERROR "expected vector register in vs2 position" ++ VMSOFM X10, V3 // ERROR "expected vector register in vs2 position" ++ VIOTAM X10, V3 // ERROR "expected vector register in vs2 position" ++ VIDV X10 // ERROR "expected vector register in vd position" + + RET +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index bf1fdb8b88..a689f2de27 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -652,12 +652,16 @@ var Anames = []string{ + "SNEZ", + "VFABSV", + "VFNEGV", +- "VMFGEVV", +- "VMFGTVV", + "VL1RV", + "VL2RV", + "VL4RV", + "VL8RV", ++ "VMCLRM", ++ "VMFGEVV", ++ "VMFGTVV", ++ "VMMVM", ++ "VMNOTM", ++ "VMSETM", + "VMSGEUVI", + "VMSGEUVV", + "VMSGEVI", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index b641eadde7..aaf5db9e75 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1170,12 +1170,16 @@ const ( + ASNEZ + AVFABSV + 
AVFNEGV +- AVMFGEVV +- AVMFGTVV + AVL1RV + AVL2RV + AVL4RV + AVL8RV ++ AVMCLRM ++ AVMFGEVV ++ AVMFGTVV ++ AVMMVM ++ AVMNOTM ++ AVMSETM + AVMSGEUVI + AVMSGEUVV + AVMSGEVI +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 9b99416b95..0e1d482f1d 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1210,6 +1210,13 @@ func validateRVFV(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateRVI(ctxt *obj.Link, ins *instruction) { ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRVIV(ctxt *obj.Link, ins *instruction) { + wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +@@ -1459,6 +1466,10 @@ func encodeRVFV(ins *instruction) uint32 { + return encodeR(ins.as, regF(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) + } + ++func encodeRVI(ins *instruction) uint32 { ++ return encodeR(ins.as, 0, regV(ins.rs2), regI(ins.rd), ins.funct3, ins.funct7) ++} ++ + func encodeRVIV(ins *instruction) uint32 { + return encodeR(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) + } +@@ -1763,6 +1774,7 @@ var ( + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} + rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} + rVFVEncoding = encoding{encode: encodeRVFV, validate: validateRVFV, length: 4} ++ rVIEncoding = encoding{encode: encodeRVI, validate: validateRVI, length: 4} + rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4} + rVVEncoding = encoding{encode: encodeRVV, validate: validateRVV, length: 4} + rVViEncoding = encoding{encode: encodeRVVi, validate: validateRVVi, length: 4} +@@ -2491,6 +2503,23 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVFWREDOSUMVS & obj.AMask: {enc: rVVVEncoding}, + AVFWREDUSUMVS & obj.AMask: {enc: rVVVEncoding}, + ++ // 31.15: Vector Mask Instructions ++ AVMANDMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMNANDMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMANDNMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMXORMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMORMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMNORMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMORNMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMXNORMM & obj.AMask: {enc: rVVVEncoding}, ++ AVCPOPM & obj.AMask: {enc: rVIEncoding}, ++ AVFIRSTM & obj.AMask: {enc: rVIEncoding}, ++ AVMSBFM & obj.AMask: {enc: rVVEncoding}, ++ AVMSIFM & obj.AMask: {enc: rVVEncoding}, ++ AVMSOFM & obj.AMask: {enc: rVVEncoding}, ++ AVIOTAM & obj.AMask: {enc: rVVEncoding}, ++ AVIDV & obj.AMask: {enc: rVVEncoding}, ++ + // + // Privileged ISA + // +@@ -3647,6 +3676,47 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AVFSGNJNVV + } + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.From.Reg) ++ ++ case AVMANDMM, AVMNANDMM, AVMANDNMM, AVMXORMM, AVMORMM, AVMNORMM, AVMORNMM, AVMXNORMM, AVMMVM, AVMNOTM: ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg) ++ switch ins.as { ++ case AVMMVM: ++ ins.as, ins.rs2 = AVMANDMM, ins.rs1 ++ case AVMNOTM: ++ ins.as, ins.rs2 = AVMNANDMM, ins.rs1 ++ } ++ ++ case AVMCLRM, AVMSETM: ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.From.Reg), uint32(p.From.Reg), uint32(p.From.Reg) ++ switch ins.as { ++ case AVMCLRM: ++ ins.as = AVMXORMM ++ case AVMSETM: ++ ins.as = AVMXNORMM ++ 
} ++ ++ case AVCPOPM, AVFIRSTM, AVMSBFM, AVMSIFM, AVMSOFM, AVIOTAM: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rs1 = obj.REG_NONE ++ ++ case AVIDV: ++ // Set mask bit ++ switch { ++ case ins.rd == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rd != obj.REG_NONE && ins.rs2 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ if ins.rd == obj.REG_NONE { ++ ins.rd = uint32(p.From.Reg) ++ } ++ ins.rs1, ins.rs2 = obj.REG_NONE, REG_V0 + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch b/2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch new file mode 100644 index 0000000..12f15e8 --- /dev/null +++ b/2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch @@ -0,0 +1,287 @@ +From 95cd5726bc0fd0b5f20049fad6b698981e404d04 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 098/119] cmd/internal/obj/riscv: add support for vector + permutation instructions + +Add support for vector permutation instructions to the RISC-V assembler. +This includes integer scalar move, floating point scalar move, slide up +and slide down, register gather, compression and whole vector register +move instructions. + +Change-Id: I1da9f393091504fd81714006355725b8b9ecadea +Reviewed-on: https://go-review.googlesource.com/c/go/+/646780 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Carlos Amedee +Reviewed-by: Mark Ryan +Reviewed-by: Junyang Shao +Reviewed-by: Meng Zhuo +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 45 +++++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 12 +++ + .../internal/asm/testdata/riscv64validation.s | 28 +++++++ + src/cmd/internal/obj/riscv/obj.go | 77 ++++++++++++++++++- + 4 files changed, 159 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index ffffbf4bd4..2bab6842e7 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -1229,6 +1229,51 @@ start: + VIDV V3 // d7a10852 + VIDV V0, V3 // d7a10850 + ++ // 31.16.1: Integer Scalar Move Instructions ++ VMVXS V2, X10 // 57252042 ++ VMVSX X10, V2 // 57610542 ++ ++ // 31.16.2: Floating-Point Scalar Move Instructions ++ VFMVFS V2, F10 // 57152042 ++ VFMVSF F10, V2 // 57510542 ++ ++ // 31.16.3: Vector Slide Instructions ++ VSLIDEUPVX X10, V2, V3 // d741253a ++ VSLIDEUPVX X10, V2, V0, V3 // d7412538 ++ VSLIDEUPVI $16, V2, V3 // d731283a ++ VSLIDEUPVI $16, V2, V0, V3 // d7312838 ++ VSLIDEDOWNVX X10, V2, V3 // d741253e ++ VSLIDEDOWNVX X10, V2, V0, V3 // d741253c ++ VSLIDEDOWNVI $16, V2, V3 // d731283e ++ VSLIDEDOWNVI $16, V2, V0, V3 // d731283c ++ VSLIDE1UPVX X10, V2, V3 // d761253a ++ VSLIDE1UPVX X10, V2, V0, V3 // d7612538 ++ VFSLIDE1UPVF F10, V2, V3 // d751253a ++ VFSLIDE1UPVF F10, V2, V0, V3 // d7512538 ++ VSLIDE1DOWNVX X10, V2, V3 // d761253e ++ VSLIDE1DOWNVX X10, V2, V0, V3 // d761253c ++ VFSLIDE1DOWNVF F10, V2, V3 // d751253e ++ VFSLIDE1DOWNVF F10, V2, V0, V3 // d751253c ++ ++ // 31.16.4: Vector Register Gather Instructions ++ VRGATHERVV V1, V2, V3 // d7812032 ++ VRGATHERVV V1, V2, V0, V3 // d7812030 ++ VRGATHEREI16VV V1, V2, V3 // d781203a ++ VRGATHEREI16VV V1, V2, V0, V3 // d7812038 ++ VRGATHERVX X10, V2, V3 // d7412532 ++ VRGATHERVX X10, V2, V0, V3 // d7412530 ++ VRGATHERVI $16, V2, V3 // 
d7312832 ++ VRGATHERVI $16, V2, V0, V3 // d7312830 ++ ++ // 31.16.5: Vector Compress Instruction ++ VCOMPRESSVM V1, V2, V3 // d7a1205e ++ ++ // 31.16.6: Whole Vector Register Move ++ VMV1RV V2, V1 // d730209e ++ VMV2RV V12, V10 // 57b5c09e ++ VMV4RV V8, V4 // 57b2819e ++ VMV8RV V8, V0 // 57b0839e ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index b076cf50e0..4238197893 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -368,5 +368,17 @@ TEXT errors(SB),$0 + VMSIFM V2, V4, V3 // ERROR "invalid vector mask register" + VMSOFM V2, V4, V3 // ERROR "invalid vector mask register" + VIOTAM V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDEUPVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDEUPVI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDEDOWNVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDEDOWNVI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDE1UPVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSLIDE1UPVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDE1DOWNVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSLIDE1DOWNVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VRGATHERVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VRGATHEREI16VV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VRGATHERVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VRGATHERVI $16, V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 8b0349584f..374a97dcfe 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -399,5 +399,33 @@ TEXT validation(SB),$0 + VMSOFM X10, V3 // ERROR "expected vector register in vs2 position" + VIOTAM X10, V3 // ERROR "expected vector register in vs2 position" + VIDV X10 // ERROR "expected vector register in vd position" ++ VMVXS X11, X10 // ERROR "expected vector register in vs2 position" ++ VMVXS V2, V1 // ERROR "expected integer register in rd position" ++ VMVSX X11, X10 // ERROR "expected vector register in vd position" ++ VMVSX V2, V1 // ERROR "expected integer register in rs2 position" ++ VFMVFS X10, F10 // ERROR "expected vector register in vs2 position" ++ VFMVFS V2, V1 // ERROR "expected float register in rd position" ++ VFMVSF X10, V2 // ERROR "expected float register in rs2 position" ++ VFMVSF V2, V1 // ERROR "expected float register in rs2 position" ++ VSLIDEUPVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSLIDEUPVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSLIDEUPVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSLIDEDOWNVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSLIDEDOWNVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSLIDEDOWNVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSLIDE1UPVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VFSLIDE1UPVF V1, V2, V3 // ERROR "expected float register in rs1 position" ++ VSLIDE1DOWNVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VFSLIDE1DOWNVF V1, V2, V3 // ERROR "expected float 
register in rs1 position" ++ VRGATHERVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VRGATHEREI16VV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VRGATHERVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRGATHERVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VRGATHERVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VCOMPRESSVM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMV1RV X10, V1 // ERROR "expected vector register in vs2 position" ++ VMV2RV X10, V10 // ERROR "expected vector register in vs2 position" ++ VMV4RV X10, V4 // ERROR "expected vector register in vs2 position" ++ VMV8RV X10, V0 // ERROR "expected vector register in vs2 position" + + RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 0e1d482f1d..4fbf4b4336 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1189,6 +1189,13 @@ func validateRFI(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateRFV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRFF(ctxt *obj.Link, ins *instruction) { + wantFloatReg(ctxt, ins, "rd", ins.rd) + wantNoneReg(ctxt, ins, "rs1", ins.rs1) +@@ -1203,6 +1210,20 @@ func validateRIF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateRIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVF(ctxt *obj.Link, ins *instruction) { ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRVFV(ctxt *obj.Link, ins *instruction) { + wantVectorReg(ctxt, ins, "vd", ins.rd) + wantFloatReg(ctxt, ins, "rs1", ins.rs1) +@@ -1458,10 +1479,22 @@ func encodeRFF(ins *instruction) uint32 { + return encodeR(ins.as, regF(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7) + } + ++func encodeRFV(ins *instruction) uint32 { ++ return encodeR(ins.as, regF(ins.rs2), 0, regV(ins.rd), ins.funct3, ins.funct7) ++} ++ + func encodeRIF(ins *instruction) uint32 { + return encodeR(ins.as, regI(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7) + } + ++func encodeRIV(ins *instruction) uint32 { ++ return encodeR(ins.as, regI(ins.rs2), 0, regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVF(ins *instruction) uint32 { ++ return encodeR(ins.as, 0, regV(ins.rs2), regF(ins.rd), ins.funct3, ins.funct7) ++} ++ + func encodeRVFV(ins *instruction) uint32 { + return encodeR(ins.as, regF(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) + } +@@ -1771,8 +1804,11 @@ var ( + rFFFFEncoding = encoding{encode: encodeRFFFF, validate: validateRFFFF, length: 4} + rFFIEncoding = encoding{encode: encodeRFFI, validate: validateRFFI, length: 4} + rFIEncoding = encoding{encode: encodeRFI, validate: validateRFI, length: 4} ++ rFVEncoding = encoding{encode: encodeRFV, validate: validateRFV, length: 4} + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} ++ rIVEncoding = encoding{encode: encodeRIV, validate: validateRIV, length: 4} + 
rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} ++ rVFEncoding = encoding{encode: encodeRVF, validate: validateRVF, length: 4} + rVFVEncoding = encoding{encode: encodeRVFV, validate: validateRVFV, length: 4} + rVIEncoding = encoding{encode: encodeRVI, validate: validateRVI, length: 4} + rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4} +@@ -2520,6 +2556,39 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVIOTAM & obj.AMask: {enc: rVVEncoding}, + AVIDV & obj.AMask: {enc: rVVEncoding}, + ++ // 31.16.1: Integer Scalar Move Instructions ++ AVMVXS & obj.AMask: {enc: rVIEncoding}, ++ AVMVSX & obj.AMask: {enc: rIVEncoding}, ++ ++ // 31.16.2: Floating-Point Scalar Move Instructions ++ AVFMVFS & obj.AMask: {enc: rVFEncoding}, ++ AVFMVSF & obj.AMask: {enc: rFVEncoding}, ++ ++ // 31.16.3: Vector Slide Instructions ++ AVSLIDEUPVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSLIDEUPVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSLIDEDOWNVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSLIDEDOWNVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSLIDE1UPVX & obj.AMask: {enc: rVIVEncoding}, ++ AVFSLIDE1UPVF & obj.AMask: {enc: rVFVEncoding}, ++ AVSLIDE1DOWNVX & obj.AMask: {enc: rVIVEncoding}, ++ AVFSLIDE1DOWNVF & obj.AMask: {enc: rVFVEncoding}, ++ ++ // 31.16.4: Vector Register Gather Instructions ++ AVRGATHERVV & obj.AMask: {enc: rVVVEncoding}, ++ AVRGATHEREI16VV & obj.AMask: {enc: rVVVEncoding}, ++ AVRGATHERVX & obj.AMask: {enc: rVIVEncoding}, ++ AVRGATHERVI & obj.AMask: {enc: rVVuEncoding}, ++ ++ // 31.16.5: Vector Compress Instruction ++ AVCOMPRESSVM & obj.AMask: {enc: rVVVEncoding}, ++ ++ // 31.16.6: Whole Vector Register Move ++ AVMV1RV & obj.AMask: {enc: rVVEncoding}, ++ AVMV2RV & obj.AMask: {enc: rVVEncoding}, ++ AVMV4RV & obj.AMask: {enc: rVVEncoding}, ++ AVMV8RV & obj.AMask: {enc: rVVEncoding}, ++ + // + // Privileged ISA + // +@@ -3515,7 +3584,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + AVFSGNJVV, AVFSGNJVF, AVFSGNJNVV, AVFSGNJNVF, AVFSGNJXVV, AVFSGNJXVF, + AVMFEQVV, AVMFEQVF, AVMFNEVV, AVMFNEVF, AVMFLTVV, AVMFLTVF, AVMFLEVV, AVMFLEVF, AVMFGTVF, AVMFGEVF, + AVREDSUMVS, AVREDMAXUVS, AVREDMAXVS, AVREDMINUVS, AVREDMINVS, AVREDANDVS, AVREDORVS, AVREDXORVS, +- AVWREDSUMUVS, AVWREDSUMVS, AVFREDOSUMVS, AVFREDUSUMVS, AVFREDMAXVS, AVFREDMINVS, AVFWREDOSUMVS, AVFWREDUSUMVS: ++ AVWREDSUMUVS, AVWREDSUMVS, AVFREDOSUMVS, AVFREDUSUMVS, AVFREDMAXVS, AVFREDMINVS, AVFWREDOSUMVS, AVFWREDUSUMVS, ++ AVSLIDEUPVX, AVSLIDEDOWNVX, AVSLIDE1UPVX, AVFSLIDE1UPVF, AVSLIDE1DOWNVX, AVFSLIDE1DOWNVF, ++ AVRGATHERVV, AVRGATHEREI16VV, AVRGATHERVX: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +@@ -3537,7 +3608,7 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), obj.REG_NONE + + case AVADDVI, AVRSUBVI, AVANDVI, AVORVI, AVXORVI, AVMSEQVI, AVMSNEVI, AVMSLEUVI, AVMSLEVI, AVMSGTUVI, AVMSGTVI, +- AVSLLVI, AVSRLVI, AVSRAVI, AVNSRLWI, AVNSRAWI: ++ AVSLLVI, AVSRLVI, AVSRAVI, AVNSRLWI, AVNSRAWI, AVRGATHERVI, AVSLIDEUPVI, AVSLIDEDOWNVI: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +@@ -3677,7 +3748,7 @@ func instructionsForProg(p *obj.Prog) []*instruction { + } + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.From.Reg) + +- case AVMANDMM, AVMNANDMM, AVMANDNMM, AVMXORMM, AVMORMM, AVMNORMM, AVMORNMM, AVMXNORMM, AVMMVM, AVMNOTM: ++ case AVMANDMM, AVMNANDMM, AVMANDNMM, AVMXORMM, AVMORMM, AVMNORMM, AVMORNMM, AVMXNORMM, AVMMVM, 
AVMNOTM, AVCOMPRESSVM: + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg) + switch ins.as { + case AVMMVM: +-- +2.39.5 + diff --git a/2099-internal-bytealg-vector-implementation-of-equal-for-.patch b/2099-internal-bytealg-vector-implementation-of-equal-for-.patch new file mode 100644 index 0000000..ba0dfc3 --- /dev/null +++ b/2099-internal-bytealg-vector-implementation-of-equal-for-.patch @@ -0,0 +1,186 @@ +From aeae24dea6c37769461cc13c64acc8e4ce27afa9 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 099/119] internal/bytealg: vector implementation of equal for + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide a vector implementation of equal for riscv64, which is used +when compiled with the rva23u64 profile, or when vector is detected +to be available. Inputs that are 8 byte aligned will still be handled +via a the non-vector code if the length is less than or equal to 64 +bytes. + +On a Banana Pi F3, with GORISCV64=rva23u64: + + │ equal.1 │ equal.2 │ + │ sec/op │ sec/op vs base │ +Equal/0-8 1.254n ± 0% 1.254n ± 0% ~ (p=1.000 n=10) +Equal/same/1-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.466 n=10) +Equal/same/6-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.689 n=10) +Equal/same/9-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.861 n=10) +Equal/same/15-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.657 n=10) +Equal/same/16-8 21.32n ± 0% 21.33n ± 0% ~ (p=0.075 n=10) +Equal/same/20-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.249 n=10) +Equal/same/32-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.303 n=10) +Equal/same/4K-8 21.32n ± 0% 21.32n ± 0% ~ (p=1.000 n=10) +Equal/same/4M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.582 n=10) +Equal/same/64M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.930 n=10) +Equal/1-8 39.16n ± 1% 38.71n ± 0% -1.15% (p=0.000 n=10) +Equal/6-8 51.49n ± 1% 50.40n ± 1% -2.12% (p=0.000 n=10) +Equal/9-8 54.46n ± 1% 53.89n ± 0% -1.04% (p=0.000 n=10) +Equal/15-8 71.81n ± 1% 70.59n ± 0% -1.71% (p=0.000 n=10) +Equal/16-8 69.14n ± 0% 68.21n ± 0% -1.34% (p=0.000 n=10) +Equal/20-8 78.59n ± 0% 77.59n ± 0% -1.26% (p=0.000 n=10) +Equal/32-8 41.55n ± 0% 41.16n ± 0% -0.96% (p=0.000 n=10) +Equal/4K-8 925.5n ± 0% 561.4n ± 1% -39.34% (p=0.000 n=10) +Equal/4M-8 3.110m ± 32% 2.463m ± 16% -20.80% (p=0.000 n=10) +Equal/64M-8 47.34m ± 30% 39.89m ± 16% -15.75% (p=0.004 n=10) +EqualBothUnaligned/64_0-8 32.17n ± 1% 32.11n ± 1% ~ (p=0.184 n=10) +EqualBothUnaligned/64_1-8 79.48n ± 0% 48.24n ± 1% -39.31% (p=0.000 n=10) +EqualBothUnaligned/64_4-8 72.71n ± 0% 48.37n ± 1% -33.48% (p=0.000 n=10) +EqualBothUnaligned/64_7-8 77.12n ± 0% 48.16n ± 1% -37.56% (p=0.000 n=10) +EqualBothUnaligned/4096_0-8 908.4n ± 0% 562.4n ± 2% -38.09% (p=0.000 n=10) +EqualBothUnaligned/4096_1-8 956.6n ± 0% 571.4n ± 3% -40.26% (p=0.000 n=10) +EqualBothUnaligned/4096_4-8 949.6n ± 0% 571.6n ± 3% -39.81% (p=0.000 n=10) +EqualBothUnaligned/4096_7-8 954.2n ± 0% 571.7n ± 3% -40.09% (p=0.000 n=10) +EqualBothUnaligned/4194304_0-8 2.935m ± 29% 2.664m ± 19% ~ (p=0.089 n=10) +EqualBothUnaligned/4194304_1-8 3.341m ± 13% 2.896m ± 34% ~ (p=0.075 n=10) +EqualBothUnaligned/4194304_4-8 3.204m ± 39% 3.352m ± 33% ~ (p=0.796 n=10) +EqualBothUnaligned/4194304_7-8 3.226m ± 30% 2.737m ± 34% -15.16% (p=0.043 n=10) +EqualBothUnaligned/67108864_0-8 49.04m ± 17% 39.94m ± 12% -18.57% (p=0.005 n=10) +EqualBothUnaligned/67108864_1-8 51.96m ± 15% 42.48m ± 15% -18.23% (p=0.015 n=10) +EqualBothUnaligned/67108864_4-8 47.67m ± 17% 37.85m ± 41% -20.61% (p=0.035 n=10) +EqualBothUnaligned/67108864_7-8 53.00m ± 22% 38.76m 
± 21% -26.87% (p=0.000 n=10) +CompareBytesEqual-8 51.71n ± 1% 52.00n ± 0% +0.57% (p=0.002 n=10) +geomean 1.469µ 1.265µ -13.93% + + │ equal.1 │ equal.2 │ + │ B/s │ B/s vs base │ +Equal/same/1-8 44.73Mi ± 0% 44.72Mi ± 0% ~ (p=0.426 n=10) +Equal/same/6-8 268.3Mi ± 0% 268.4Mi ± 0% ~ (p=0.753 n=10) +Equal/same/9-8 402.6Mi ± 0% 402.5Mi ± 0% ~ (p=0.209 n=10) +Equal/same/15-8 670.9Mi ± 0% 670.9Mi ± 0% ~ (p=0.724 n=10) +Equal/same/16-8 715.6Mi ± 0% 715.4Mi ± 0% -0.04% (p=0.022 n=10) +Equal/same/20-8 894.6Mi ± 0% 894.5Mi ± 0% ~ (p=0.060 n=10) +Equal/same/32-8 1.398Gi ± 0% 1.398Gi ± 0% ~ (p=0.986 n=10) +Equal/same/4K-8 178.9Gi ± 0% 178.9Gi ± 0% ~ (p=0.853 n=10) +Equal/same/4M-8 178.9Ti ± 0% 178.9Ti ± 0% ~ (p=0.971 n=10) +Equal/same/64M-8 2862.8Ti ± 0% 2862.6Ti ± 0% ~ (p=0.971 n=10) +Equal/1-8 24.35Mi ± 1% 24.63Mi ± 0% +1.16% (p=0.000 n=10) +Equal/6-8 111.1Mi ± 1% 113.5Mi ± 1% +2.17% (p=0.000 n=10) +Equal/9-8 157.6Mi ± 1% 159.3Mi ± 0% +1.05% (p=0.000 n=10) +Equal/15-8 199.2Mi ± 1% 202.7Mi ± 0% +1.74% (p=0.000 n=10) +Equal/16-8 220.7Mi ± 0% 223.7Mi ± 0% +1.36% (p=0.000 n=10) +Equal/20-8 242.7Mi ± 0% 245.8Mi ± 0% +1.27% (p=0.000 n=10) +Equal/32-8 734.3Mi ± 0% 741.6Mi ± 0% +0.98% (p=0.000 n=10) +Equal/4K-8 4.122Gi ± 0% 6.795Gi ± 1% +64.84% (p=0.000 n=10) +Equal/4M-8 1.258Gi ± 24% 1.586Gi ± 14% +26.12% (p=0.000 n=10) +Equal/64M-8 1.320Gi ± 23% 1.567Gi ± 14% +18.69% (p=0.004 n=10) +EqualBothUnaligned/64_0-8 1.853Gi ± 1% 1.856Gi ± 1% ~ (p=0.190 n=10) +EqualBothUnaligned/64_1-8 767.9Mi ± 0% 1265.2Mi ± 1% +64.76% (p=0.000 n=10) +EqualBothUnaligned/64_4-8 839.4Mi ± 0% 1261.9Mi ± 1% +50.33% (p=0.000 n=10) +EqualBothUnaligned/64_7-8 791.4Mi ± 0% 1267.5Mi ± 1% +60.16% (p=0.000 n=10) +EqualBothUnaligned/4096_0-8 4.199Gi ± 0% 6.784Gi ± 2% +61.54% (p=0.000 n=10) +EqualBothUnaligned/4096_1-8 3.988Gi ± 0% 6.676Gi ± 3% +67.40% (p=0.000 n=10) +EqualBothUnaligned/4096_4-8 4.017Gi ± 0% 6.674Gi ± 3% +66.14% (p=0.000 n=10) +EqualBothUnaligned/4096_7-8 3.998Gi ± 0% 6.673Gi ± 3% +66.92% (p=0.000 n=10) +EqualBothUnaligned/4194304_0-8 1.332Gi ± 22% 1.468Gi ± 16% ~ (p=0.089 n=10) +EqualBothUnaligned/4194304_1-8 1.169Gi ± 12% 1.350Gi ± 25% ~ (p=0.075 n=10) +EqualBothUnaligned/4194304_4-8 1.222Gi ± 28% 1.165Gi ± 48% ~ (p=0.796 n=10) +EqualBothUnaligned/4194304_7-8 1.211Gi ± 23% 1.427Gi ± 26% +17.88% (p=0.043 n=10) +EqualBothUnaligned/67108864_0-8 1.274Gi ± 14% 1.567Gi ± 14% +22.97% (p=0.005 n=10) +EqualBothUnaligned/67108864_1-8 1.204Gi ± 14% 1.471Gi ± 13% +22.18% (p=0.015 n=10) +EqualBothUnaligned/67108864_4-8 1.311Gi ± 14% 1.651Gi ± 29% +25.92% (p=0.035 n=10) +EqualBothUnaligned/67108864_7-8 1.179Gi ± 18% 1.612Gi ± 17% +36.73% (p=0.000 n=10) +geomean 1.870Gi 2.190Gi +17.16% + +Change-Id: I9c5270bcc6997d020a96d1e97c7e7cfc7ca7fd34 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646736 +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Mark Freeman +--- + src/internal/bytealg/bytealg.go | 10 ++++++---- + src/internal/bytealg/equal_riscv64.s | 30 ++++++++++++++++++++++++++++ + 2 files changed, 36 insertions(+), 4 deletions(-) + +diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go +index ae4b8b48d2..37881f75e6 100644 +--- a/src/internal/bytealg/bytealg.go ++++ b/src/internal/bytealg/bytealg.go +@@ -11,13 +11,15 @@ import ( + + // Offsets into internal/cpu records for use in assembly. 
+ const ( +- offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42) +- offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) +- offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) ++ offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9) ++ ++ offsetRISCV64HasV = unsafe.Offsetof(cpu.RISCV64.HasV) + + offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX) + +- offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9) ++ offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42) ++ offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) ++ offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) + ) + + // MaxLen is the maximum length of the string to be searched for (argument b) in Index. +diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s +index 87b2d79302..58e033f847 100644 +--- a/src/internal/bytealg/equal_riscv64.s ++++ b/src/internal/bytealg/equal_riscv64.s +@@ -2,6 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + ++#include "asm_riscv64.h" + #include "go_asm.h" + #include "textflag.h" + +@@ -28,6 +29,35 @@ length_check: + MOV $32, X23 + BLT X12, X23, loop4_check + ++#ifndef hasV ++ MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5 ++ BEQZ X5, equal_scalar ++#endif ++ ++ // Use vector if not 8 byte aligned. ++ OR X10, X11, X5 ++ AND $7, X5 ++ BNEZ X5, vector_loop ++ ++ // Use scalar if 8 byte aligned and <= 64 bytes. ++ SUB $64, X12, X6 ++ BLEZ X6, loop32_check ++ ++ PCALIGN $16 ++vector_loop: ++ VSETVLI X12, E8, M8, TA, MA, X5 ++ VLE8V (X10), V8 ++ VLE8V (X11), V16 ++ VMSNEVV V8, V16, V0 ++ VFIRSTM V0, X6 ++ BGEZ X6, done ++ ADD X5, X10 ++ ADD X5, X11 ++ SUB X5, X12 ++ BNEZ X12, vector_loop ++ JMP done ++ ++equal_scalar: + // Check alignment - if alignment differs we have to do one byte at a time. + AND $7, X10, X9 + AND $7, X11, X19 +-- +2.39.5 + diff --git a/2100-internal-bytealg-vector-implementation-of-indexbyte-.patch b/2100-internal-bytealg-vector-implementation-of-indexbyte-.patch new file mode 100644 index 0000000..48fd0f3 --- /dev/null +++ b/2100-internal-bytealg-vector-implementation-of-indexbyte-.patch @@ -0,0 +1,156 @@ +From 4d2345cf8ba4e61588e881af5d83cc496300c583 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 100/119] internal/bytealg: vector implementation of indexbyte + for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide a vector implementation of indexbyte for riscv64, which is used +when compiled with the rva23u64 profile, or when vector is detected +to be available. Inputs that are smaller than 24 bytes will continue +to use the non-vector path. 
+ +On a Banana Pi F3, with GORISCV64=rva23u64: + + │ indexbyte.1 │ indexbyte.2 │ + │ sec/op │ sec/op vs base │ +IndexByte/10-8 52.68n ± 0% 47.26n ± 0% -10.30% (p=0.000 n=10) +IndexByte/32-8 68.62n ± 0% 47.02n ± 0% -31.49% (p=0.000 n=10) +IndexByte/4K-8 2217.0n ± 0% 420.4n ± 0% -81.04% (p=0.000 n=10) +IndexByte/4M-8 2624.4µ ± 0% 767.5µ ± 0% -70.75% (p=0.000 n=10) +IndexByte/64M-8 68.08m ± 10% 47.84m ± 45% -29.73% (p=0.004 n=10) +geomean 17.03µ 8.073µ -52.59% + + │ indexbyte.1 │ indexbyte.2 │ + │ B/s │ B/s vs base │ +IndexByte/10-8 181.0Mi ± 0% 201.8Mi ± 0% +11.48% (p=0.000 n=10) +IndexByte/32-8 444.7Mi ± 0% 649.1Mi ± 0% +45.97% (p=0.000 n=10) +IndexByte/4K-8 1.721Gi ± 0% 9.076Gi ± 0% +427.51% (p=0.000 n=10) +IndexByte/4M-8 1.488Gi ± 0% 5.089Gi ± 0% +241.93% (p=0.000 n=10) +IndexByte/64M-8 940.3Mi ± 9% 1337.8Mi ± 31% +42.27% (p=0.004 n=10) +geomean 727.1Mi 1.498Gi +110.94% + +Change-Id: If7b0dbef38d76fa7a2021e4ecaed668a1d4b9783 +Reviewed-on: https://go-review.googlesource.com/c/go/+/648856 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Freeman +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +--- + src/internal/bytealg/indexbyte_riscv64.s | 60 ++++++++++++++++-------- + 1 file changed, 41 insertions(+), 19 deletions(-) + +diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s +index fde00da0ea..527ae6d35e 100644 +--- a/src/internal/bytealg/indexbyte_riscv64.s ++++ b/src/internal/bytealg/indexbyte_riscv64.s +@@ -2,6 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + ++#include "asm_riscv64.h" + #include "go_asm.h" + #include "textflag.h" + +@@ -11,12 +12,14 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + // X12 = b_cap (unused) + // X13 = byte to find + AND $0xff, X13, X12 // x12 byte to look for +- MOV X10, X13 // store base for later + + SLTI $24, X11, X14 +- ADD X10, X11 // end +- BEQZ X14, bigBody ++ BNEZ X14, small ++ JMP indexByteBig<>(SB) + ++small: ++ MOV X10, X13 // store base for later ++ ADD X10, X11 // end + SUB $1, X10 + loop: + ADD $1, X10 +@@ -31,21 +34,19 @@ notfound: + MOV $-1, X10 + RET + +-bigBody: +- JMP indexByteBig<>(SB) +- + TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + // X10 = b_base + // X11 = b_len + // X12 = byte to find +- + AND $0xff, X12 // x12 byte to look for +- MOV X10, X13 // store base for later + + SLTI $24, X11, X14 +- ADD X10, X11 // end +- BEQZ X14, bigBody ++ BNEZ X14, small ++ JMP indexByteBig<>(SB) + ++small: ++ MOV X10, X13 // store base for later ++ ADD X10, X11 // end + SUB $1, X10 + loop: + ADD $1, X10 +@@ -60,20 +61,41 @@ notfound: + MOV $-1, X10 + RET + +-bigBody: +- JMP indexByteBig<>(SB) +- + TEXT indexByteBig<>(SB),NOSPLIT|NOFRAME,$0 +- // On entry ++ // On entry: + // X10 = b_base +- // X11 = end ++ // X11 = b_len (at least 16 bytes) + // X12 = byte to find +- // X13 = b_base +- // X11 is at least 16 bytes > X10 +- +- // On exit ++ // On exit: + // X10 = index of first instance of sought byte, if found, or -1 otherwise + ++ MOV X10, X13 // store base for later ++ ++#ifndef hasV ++ MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5 ++ BEQZ X5, indexbyte_scalar ++#endif ++ ++ PCALIGN $16 ++vector_loop: ++ VSETVLI X11, E8, M8, TA, MA, X5 ++ VLE8V (X10), V8 ++ VMSEQVX X12, V8, V0 ++ VFIRSTM V0, X6 ++ BGEZ X6, vector_found ++ ADD X5, X10 ++ SUB X5, X11 ++ BNEZ X11, vector_loop ++ JMP notfound ++ ++vector_found: ++ SUB X13, X10 ++ ADD X6, X10 ++ RET ++ ++indexbyte_scalar: ++ ADD X10, X11 // end ++ + // Process the 
first few bytes until we get to an 8 byte boundary + // No need to check for end here as we have at least 16 bytes in + // the buffer. +-- +2.39.5 + diff --git a/2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch b/2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch new file mode 100644 index 0000000..871b3de --- /dev/null +++ b/2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch @@ -0,0 +1,123 @@ +From 73a717cb05344f5ad53e15755c2ed0146220dd1b Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 101/119] cmd/internal/obj/riscv: reject invalid vadc/vsbc + encodings + +The RISC-V Instruction Set Manual Volume states that "for vadc and +vsbc, the instruction encoding is reserved if the destination vector +register is v0". The assembler currently allows instructions like + +VADCVVM V1, V2, V0, V0 + +to be assembled. It's not clear what the behaviour of such +instructions will be on target hardware so it's best to disallow +them. + +For reference, binutils (2.44-3.fc42) allows the instruction + +vadc.vvm v0, v4, v8, v0 + +to be assembled and the instruction actually executes on a Banana PI +F3 without crashing. However, clang (20.1.2) refuses to assemble the +instruction, producing the following error. + +error: the destination vector register group cannot be V0 + vadc.vvm v0, v4, v8, v0 + ^ +Change-Id: Ia913cbd864ae8dbcf9227f69b963c93a99481cff +Reviewed-on: https://go-review.googlesource.com/c/go/+/669315 +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Joel Sing +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 10 ++++++++++ + src/cmd/asm/internal/asm/testdata/riscv64error.s | 5 +++++ + src/cmd/internal/obj/riscv/obj.go | 9 +++++++-- + 3 files changed, 22 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 2bab6842e7..01838664a3 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -623,17 +623,27 @@ start: + VADCVXM X11, V2, V0, V3 // d7c12540 + VADCVIM $15, V2, V0, V3 // d7b12740 + VMADCVVM V1, V2, V0, V3 // d7812044 ++ VMADCVVM V1, V2, V0, V0 // 57802044 + VMADCVXM X11, V2, V0, V3 // d7c12544 ++ VMADCVXM X11, V2, V0, V0 // 57c02544 + VMADCVIM $15, V2, V0, V3 // d7b12744 ++ VMADCVIM $15, V2, V0, V0 // 57b02744 + VMADCVV V1, V2, V3 // d7812046 ++ VMADCVV V1, V2, V0 // 57802046 + VMADCVX X11, V2, V3 // d7c12546 ++ VMADCVX X11, V2, V0 // 57c02546 + VMADCVI $15, V2, V3 // d7b12746 ++ VMADCVI $15, V2, V0 // 57b02746 + VSBCVVM V1, V2, V0, V3 // d7812048 + VSBCVXM X11, V2, V0, V3 // d7c12548 + VMSBCVVM V1, V2, V0, V3 // d781204c ++ VMSBCVVM V1, V2, V0, V0 // 5780204c + VMSBCVXM X11, V2, V0, V3 // d7c1254c ++ VMSBCVXM X11, V2, V0, V0 // 57c0254c + VMSBCVV V1, V2, V3 // d781204e ++ VMSBCVV V1, V2, V0 // 5780204e + VMSBCVX X11, V2, V3 // d7c1254e ++ VMSBCVX X11, V2, V0 // 57c0254e + + // 31.11.5: Vector Bitwise Logical Instructions + VANDVV V1, V2, V3 // d7812026 +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 4238197893..4e6afa0ac2 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -95,10 +95,13 @@ TEXT errors(SB),$0 + VSEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" + VADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" + VADCVVM V1, V2, V3 // ERROR "invalid 
vector mask register" ++ VADCVVM V1, V2, V0, V0 // ERROR "invalid destination register V0" + VADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" + VADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V0, V0 // ERROR "invalid destination register V0" + VADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" + VADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V0, V0 // ERROR "invalid destination register V0" + VMADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" + VMADCVVM V1, V2, V3 // ERROR "invalid vector mask register" + VMADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +@@ -107,8 +110,10 @@ TEXT errors(SB),$0 + VMADCVIM $15, V2, V3 // ERROR "invalid vector mask register" + VSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" + VSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V0, V0 // ERROR "invalid destination register V0" + VSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" + VSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V0, V0 // ERROR "invalid destination register V0" + VMSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" + VMSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" + VMSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 4fbf4b4336..592b7adba3 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -3647,8 +3647,13 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.funct7 |= 1 // unmasked + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), REG_V0 + +- case AVADCVVM, AVADCVXM, AVMADCVVM, AVMADCVXM, AVSBCVVM, AVSBCVXM, AVMSBCVVM, AVMSBCVXM, AVADCVIM, AVMADCVIM, +- AVMERGEVVM, AVMERGEVXM, AVMERGEVIM, AVFMERGEVFM: ++ case AVADCVIM, AVADCVVM, AVADCVXM, AVSBCVVM, AVSBCVXM: ++ if ins.rd == REG_V0 { ++ p.Ctxt.Diag("%v: invalid destination register V0", p) ++ } ++ fallthrough ++ ++ case AVMADCVVM, AVMADCVXM, AVMSBCVVM, AVMSBCVXM, AVMADCVIM, AVMERGEVVM, AVMERGEVXM, AVMERGEVIM, AVFMERGEVFM: + if ins.rs3 != REG_V0 { + p.Ctxt.Diag("%v: invalid vector mask register", p) + } +-- +2.39.5 + diff --git a/2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch b/2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch new file mode 100644 index 0000000..01182b5 --- /dev/null +++ b/2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch @@ -0,0 +1,68 @@ +From 7b07e3ed76af9a2e0e554cd06e9ce0f943a3f9f7 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 102/119] cmd/internal/obj/riscv: fix LMUL encoding for MF2 and + MF8 + +The encodings for the riscv64 special operands SPOP_MF2 and SPOP_MF8 +are incorrect, i.e., their values are swapped. This leads to +incorrect encodings for the VSETVLI and VSETIVLI instructions. The +assembler currently encodes + +VSETVLI X10, E32, MF8, TA, MA, X12 + +as + +VSETVLI X10, E32, MF2, TA, MA, X12 + +We update the encodings for SPOP_MF2 and SPOP_MF8 so that they match +the LMUL table in section "31.3.4. Vector type register, vtype" of +the "RISC-V Instruction Set Manual Volume 1". 
+ +Change-Id: Ic73355533d7c2a901ee060b35c2f7af6d58453e4 +Reviewed-on: https://go-review.googlesource.com/c/go/+/670016 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Carlos Amedee +Reviewed-by: Meng Zhuo +Reviewed-by: Joel Sing +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 4 ++-- + src/cmd/internal/obj/riscv/cpu.go | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 01838664a3..103f1e3272 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -438,9 +438,9 @@ start: + VSETVLI X10, E32, M2, TA, MA, X12 // 5776150d + VSETVLI X10, E32, M4, TA, MA, X12 // 5776250d + VSETVLI X10, E32, M8, TA, MA, X12 // 5776350d +- VSETVLI X10, E32, MF2, TA, MA, X12 // 5776550d ++ VSETVLI X10, E32, MF8, TA, MA, X12 // 5776550d + VSETVLI X10, E32, MF4, TA, MA, X12 // 5776650d +- VSETVLI X10, E32, MF8, TA, MA, X12 // 5776750d ++ VSETVLI X10, E32, MF2, TA, MA, X12 // 5776750d + VSETVLI X10, E32, M1, TA, MA, X12 // 5776050d + VSETVLI $15, E32, M1, TA, MA, X12 // 57f607cd + VSETIVLI $0, E32, M1, TA, MA, X12 // 577600cd +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index aaf5db9e75..a2b6a436ba 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1287,9 +1287,9 @@ var specialOperands = map[SpecialOperand]struct { + SPOP_M2: {encoding: 1, name: "M2"}, + SPOP_M4: {encoding: 2, name: "M4"}, + SPOP_M8: {encoding: 3, name: "M8"}, +- SPOP_MF2: {encoding: 5, name: "MF2"}, ++ SPOP_MF8: {encoding: 5, name: "MF8"}, + SPOP_MF4: {encoding: 6, name: "MF4"}, +- SPOP_MF8: {encoding: 7, name: "MF8"}, ++ SPOP_MF2: {encoding: 7, name: "MF2"}, + + SPOP_E8: {encoding: 0, name: "E8"}, + SPOP_E16: {encoding: 1, name: "E16"}, +-- +2.39.5 + diff --git a/2103-cmd-compile-add-generic-simplifications-on-riscv64.patch b/2103-cmd-compile-add-generic-simplifications-on-riscv64.patch new file mode 100644 index 0000000..3609d04 --- /dev/null +++ b/2103-cmd-compile-add-generic-simplifications-on-riscv64.patch @@ -0,0 +1,203 @@ +From 8a20dfdf22e664784ddbaa7c1664c1cb6898ae3e Mon Sep 17 00:00:00 2001 +From: Julian Zhu +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 103/119] cmd/compile: add generic simplifications on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +file before after Δ % +addr2line 3636263 3636215 -48 -0.001% +asm 6318110 6317966 -144 -0.002% +buildid 3463352 3463224 -128 -0.004% +cgo 5672502 5672214 -288 -0.005% +compile 26904997 26905719 +722 +0.003% +cover 6405603 6405467 -136 -0.002% +dist 4092630 4092494 -136 -0.003% +doc 9728281 9723977 -4304 -0.044% +fix 4014891 4014835 -56 -0.001% +link 8327674 8327426 -248 -0.003% +nm 3628718 3628494 -224 -0.006% +objdump 5951778 5951626 -152 -0.003% +pack 2896080 2896040 -40 -0.001% +pprof 17596796 17591908 -4888 -0.028% +test2json 3346622 3346566 -56 -0.002% +trace 16179738 16175706 -4032 -0.025% +vet 9603472 9603264 -208 -0.002% +total 156070021 156055655 -14366 -0.009% + +Change-Id: Ie4a79a3c410eb79155ce2418ae64fa670d1ccd53 +Reviewed-on: https://go-review.googlesource.com/c/go/+/673477 +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +LUCI-TryBot-Result: Go LUCI +Auto-Submit: Keith Randall +Reviewed-by: David Chase +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 9 ++ + .../compile/internal/ssa/rewriteRISCV64.go | 87 +++++++++++++++++++ 
+ 2 files changed, 96 insertions(+) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 93f4e6a948..f0d2d74b7b 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -719,6 +719,15 @@ + (ROL x (NEG y)) => (ROR x y) + (ROLW x (NEG y)) => (RORW x y) + ++// generic simplifications ++(ADD x (NEG y)) => (SUB x y) ++(SUB x (NEG y)) => (ADD x y) ++(SUB x x) => (MOVDconst [0]) ++(AND x x) => x ++(OR x x) => x ++(ORN x x) => (MOVDconst [-1]) ++(XOR x x) => (MOVDconst [0]) ++ + // Convert const subtraction into ADDI with negative immediate, where possible. + (SUB x (MOVDconst [val])) && is32Bit(-val) => (ADDI [-val] x) + (SUB (MOVDconst [val]) y) && is32Bit(-val) => (NEG (ADDI [-val] y)) +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index c3018f270c..966199c450 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -541,6 +541,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64OR(v) + case OpRISCV64ORI: + return rewriteValueRISCV64_OpRISCV64ORI(v) ++ case OpRISCV64ORN: ++ return rewriteValueRISCV64_OpRISCV64ORN(v) + case OpRISCV64ROL: + return rewriteValueRISCV64_OpRISCV64ROL(v) + case OpRISCV64ROLW: +@@ -3317,6 +3319,21 @@ func rewriteValueRISCV64_OpRISCV64ADD(v *Value) bool { + } + break + } ++ // match: (ADD x (NEG y)) ++ // result: (SUB x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ if v_1.Op != OpRISCV64NEG { ++ continue ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64SUB) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } + // match: (ADD (SLLI [1] x) y) + // cond: buildcfg.GORISCV64 >= 22 + // result: (SH1ADD x y) +@@ -3467,6 +3484,16 @@ func rewriteValueRISCV64_OpRISCV64AND(v *Value) bool { + } + break + } ++ // match: (AND x x) ++ // result: x ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.copyOf(x) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64ANDI(v *Value) bool { +@@ -6191,6 +6218,16 @@ func rewriteValueRISCV64_OpRISCV64OR(v *Value) bool { + } + break + } ++ // match: (OR x x) ++ // result: x ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.copyOf(x) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64ORI(v *Value) bool { +@@ -6243,6 +6280,22 @@ func rewriteValueRISCV64_OpRISCV64ORI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64ORN(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (ORN x x) ++ // result: (MOVDconst [-1]) ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.AuxInt = int64ToAuxInt(-1) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64ROL(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6888,6 +6941,29 @@ func rewriteValueRISCV64_OpRISCV64SUB(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block ++ // match: (SUB x (NEG y)) ++ // result: (ADD x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64NEG { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64ADD) ++ v.AddArg2(x, y) ++ return true ++ } ++ // match: (SUB x x) ++ // result: (MOVDconst [0]) ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } + // match: (SUB x (MOVDconst [val])) + // cond: 
is32Bit(-val) + // result: (ADDI [-val] x) +@@ -6999,6 +7075,17 @@ func rewriteValueRISCV64_OpRISCV64XOR(v *Value) bool { + } + break + } ++ // match: (XOR x x) ++ // result: (MOVDconst [0]) ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRotateLeft16(v *Value) bool { +-- +2.39.5 + diff --git a/2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch b/2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch new file mode 100644 index 0000000..a94d9ef --- /dev/null +++ b/2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch @@ -0,0 +1,187 @@ +From 33771744d688d61520413c2fa204cfddce74c10b Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 104/119] cmd/internal/obj/riscv: fix vector integer multiply + add + +The RISC-V integer vector multiply add instructions are not encoded +correctly; the first and second arguments are swapped. For example, +the instruction + +VMACCVV V1, V2, V3 + +encodes to + +b620a1d7 or vmacc.vv v3,v1,v2 + +and not + +b61121d7 or vmacc.vv v3,v2,v1 + +as expected. + +This is inconsistent with the argument ordering we use for 3 +argument vector instructions, in which the argument order, as given +in the RISC-V specifications, is reversed, and also with the vector +FMA instructions which have the same argument ordering as the vector +integer multiply add instructions in the "The RISC-V Instruction Set +Manual Volume I". For example, in the ISA manual we have the +following instruction definitions + +; Integer multiply-add, overwrite addend +vmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i] + +; FP multiply-accumulate, overwrites addend +vfmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i] + +It's reasonable to expect that the Go assembler would use the same +argument ordering for both of these instructions. It currently does +not. + +We fix the issue by switching the argument ordering for the vector +integer multiply add instructions to match those of the vector FMA +instructions. 
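+
+After this change the Go operand order for the integer multiply-add
+forms matches the FMA forms, i.e. (vs2, vs1, vd). For example, as in
+the updated tests:
+
+VMACCVV V2, V1, V3 // vmacc.vv v3,v1,v2: v3[i] = (v1[i] * v2[i]) + v3[i]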
+ +Change-Id: Ib98e9999617f991969e5c831734b3bb3324439f6 +Reviewed-on: https://go-review.googlesource.com/c/go/+/670335 +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 60 +++++++++---------- + .../internal/asm/testdata/riscv64validation.s | 14 ++--- + src/cmd/internal/obj/riscv/obj.go | 6 +- + 3 files changed, 40 insertions(+), 40 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 103f1e3272..1d8c2d3530 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -830,38 +830,38 @@ start: + VWMULSUVX X10, V2, V0, V3 // d76125e8 + + // 31.11.13: Vector Single-Width Integer Multiply-Add Instructions +- VMACCVV V1, V2, V3 // d7a120b6 +- VMACCVV V1, V2, V0, V3 // d7a120b4 +- VMACCVX X10, V2, V3 // d76125b6 +- VMACCVX X10, V2, V0, V3 // d76125b4 +- VNMSACVV V1, V2, V3 // d7a120be +- VNMSACVV V1, V2, V0, V3 // d7a120bc +- VNMSACVX X10, V2, V3 // d76125be +- VNMSACVX X10, V2, V0, V3 // d76125bc +- VMADDVV V1, V2, V3 // d7a120a6 +- VMADDVV V1, V2, V0, V3 // d7a120a4 +- VMADDVX X10, V2, V3 // d76125a6 +- VMADDVX X10, V2, V0, V3 // d76125a4 +- VNMSUBVV V1, V2, V3 // d7a120ae +- VNMSUBVV V1, V2, V0, V3 // d7a120ac +- VNMSUBVX X10, V2, V3 // d76125ae +- VNMSUBVX X10, V2, V0, V3 // d76125ac ++ VMACCVV V2, V1, V3 // d7a120b6 ++ VMACCVV V2, V1, V0, V3 // d7a120b4 ++ VMACCVX V2, X10, V3 // d76125b6 ++ VMACCVX V2, X10, V0, V3 // d76125b4 ++ VNMSACVV V2, V1, V3 // d7a120be ++ VNMSACVV V2, V1, V0, V3 // d7a120bc ++ VNMSACVX V2, X10, V3 // d76125be ++ VNMSACVX V2, X10, V0, V3 // d76125bc ++ VMADDVV V2, V1, V3 // d7a120a6 ++ VMADDVV V2, V1, V0, V3 // d7a120a4 ++ VMADDVX V2, X10, V3 // d76125a6 ++ VMADDVX V2, X10, V0, V3 // d76125a4 ++ VNMSUBVV V2, V1, V3 // d7a120ae ++ VNMSUBVV V2, V1, V0, V3 // d7a120ac ++ VNMSUBVX V2, X10, V3 // d76125ae ++ VNMSUBVX V2, X10, V0, V3 // d76125ac + + // 31.11.14: Vector Widening Integer Multiply-Add Instructions +- VWMACCUVV V1, V2, V3 // d7a120f2 +- VWMACCUVV V1, V2, V0, V3 // d7a120f0 +- VWMACCUVX X10, V2, V3 // d76125f2 +- VWMACCUVX X10, V2, V0, V3 // d76125f0 +- VWMACCVV V1, V2, V3 // d7a120f6 +- VWMACCVV V1, V2, V0, V3 // d7a120f4 +- VWMACCVX X10, V2, V3 // d76125f6 +- VWMACCVX X10, V2, V0, V3 // d76125f4 +- VWMACCSUVV V1, V2, V3 // d7a120fe +- VWMACCSUVV V1, V2, V0, V3 // d7a120fc +- VWMACCSUVX X10, V2, V3 // d76125fe +- VWMACCSUVX X10, V2, V0, V3 // d76125fc +- VWMACCUSVX X10, V2, V3 // d76125fa +- VWMACCUSVX X10, V2, V0, V3 // d76125f8 ++ VWMACCUVV V2, V1, V3 // d7a120f2 ++ VWMACCUVV V2, V1, V0, V3 // d7a120f0 ++ VWMACCUVX V2, X10, V3 // d76125f2 ++ VWMACCUVX V2, X10, V0, V3 // d76125f0 ++ VWMACCVV V2, V1, V3 // d7a120f6 ++ VWMACCVV V2, V1, V0, V3 // d7a120f4 ++ VWMACCVX V2, X10, V3 // d76125f6 ++ VWMACCVX V2, X10, V0, V3 // d76125f4 ++ VWMACCSUVV V2, V1, V3 // d7a120fe ++ VWMACCSUVV V2, V1, V0, V3 // d7a120fc ++ VWMACCSUVX V2, X10, V3 // d76125fe ++ VWMACCSUVX V2, X10, V0, V3 // d76125fc ++ VWMACCUSVX V2, X10, V3 // d76125fa ++ VWMACCUSVX V2, X10, V0, V3 // d76125f8 + + // 31.11.15: Vector Integer Merge Instructions + VMERGEVVM V1, V2, V0, V3 // d781205c +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 374a97dcfe..55bf518e68 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ 
-214,19 +214,19 @@ TEXT validation(SB),$0 + VWMULUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" + VWMULSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" + VWMULSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNMSACVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VNMSACVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMADDVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VMADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNMSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSUBVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VNMSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCUVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VWMACCUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VWMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCSUVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VWMACCSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" + VWMACCUSVX V1, V2, V3 // ERROR "expected integer register in rs1 position" + VMERGEVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 592b7adba3..0b09a2e79c 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -3571,8 +3571,6 @@ func instructionsForProg(p *obj.Prog) []*instruction { + AVMULVV, AVMULVX, AVMULHVV, AVMULHVX, AVMULHUVV, AVMULHUVX, AVMULHSUVV, AVMULHSUVX, + AVDIVUVV, AVDIVUVX, AVDIVVV, AVDIVVX, AVREMUVV, AVREMUVX, AVREMVV, AVREMVX, + AVWMULVV, AVWMULVX, AVWMULUVV, AVWMULUVX, AVWMULSUVV, AVWMULSUVX, AVNSRLWV, AVNSRLWX, AVNSRAWV, AVNSRAWX, +- AVMACCVV, AVMACCVX, AVNMSACVV, AVNMSACVX, AVMADDVV, AVMADDVX, AVNMSUBVV, AVNMSUBVX, +- AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX, + AVSADDUVV, AVSADDUVX, AVSADDUVI, AVSADDVV, AVSADDVX, AVSADDVI, AVSSUBUVV, AVSSUBUVX, AVSSUBVV, AVSSUBVX, + AVAADDUVV, AVAADDUVX, AVAADDVV, AVAADDVX, AVASUBUVV, AVASUBUVX, AVASUBVV, AVASUBVX, + AVSMULVV, AVSMULVX, AVSSRLVV, AVSSRLVX, AVSSRLVI, AVSSRAVV, AVSSRAVX, AVSSRAVI, +@@ -3598,7 +3596,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + + case AVFMACCVV, AVFMACCVF, AVFNMACCVV, AVFNMACCVF, AVFMSACVV, AVFMSACVF, AVFNMSACVV, AVFNMSACVF, + AVFMADDVV, AVFMADDVF, AVFNMADDVV, AVFNMADDVF, AVFMSUBVV, AVFMSUBVF, AVFNMSUBVV, AVFNMSUBVF, +- AVFWMACCVV, AVFWMACCVF, AVFWNMACCVV, AVFWNMACCVF, AVFWMSACVV, AVFWMSACVF, AVFWNMSACVV, AVFWNMSACVF: ++ AVFWMACCVV, AVFWMACCVF, AVFWNMACCVV, AVFWNMACCVF, AVFWMSACVV, AVFWMSACVF, AVFWNMSACVV, AVFWNMSACVF, ++ AVMACCVV, AVMACCVX, AVNMSACVV, AVNMSACVX, AVMADDVV, 
AVMADDVX, AVNMSUBVV, AVNMSUBVX, ++ AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX: + switch { + case ins.rs3 == obj.REG_NONE: + ins.funct7 |= 1 // unmasked +-- +2.39.5 + diff --git a/2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch b/2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch new file mode 100644 index 0000000..b47b83a --- /dev/null +++ b/2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch @@ -0,0 +1,663 @@ +From eba33bc3c20003dd122cf703a93e6f66f5c85cf2 Mon Sep 17 00:00:00 2001 +From: Michael Munday +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 105/119] cmd/compile: optimise float <-> int register moves on + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Use the FMV* instructions to move values between the floating point and +integer register files. + +Note: I'm unsure why there is a slowdown in the Float32bits benchmark, +I've checked and an FMVXS instruction is being used as expected. There +are multiple loads and other instructions in the main loop. + +goos: linux +goarch: riscv64 +pkg: math +cpu: Spacemit(R) X60 + │ fmv-before.txt │ fmv-after.txt │ + │ sec/op │ sec/op vs base │ +Acos 122.7n ± 0% 122.7n ± 0% ~ (p=1.000 n=10) +Acosh 197.2n ± 0% 191.5n ± 0% -2.89% (p=0.000 n=10) +Asin 122.7n ± 0% 122.7n ± 0% ~ (p=0.474 n=10) +Asinh 231.0n ± 0% 224.1n ± 0% -2.99% (p=0.000 n=10) +Atan 91.39n ± 0% 91.41n ± 0% ~ (p=0.465 n=10) +Atanh 210.3n ± 0% 203.4n ± 0% -3.26% (p=0.000 n=10) +Atan2 149.6n ± 0% 149.6n ± 0% ~ (p=0.721 n=10) +Cbrt 176.5n ± 0% 165.9n ± 0% -6.01% (p=0.000 n=10) +Ceil 25.67n ± 0% 24.42n ± 0% -4.87% (p=0.000 n=10) +Copysign 3.756n ± 0% 3.756n ± 0% ~ (p=0.149 n=10) +Cos 95.15n ± 0% 95.15n ± 0% ~ (p=0.374 n=10) +Cosh 228.6n ± 0% 224.7n ± 0% -1.71% (p=0.000 n=10) +Erf 115.2n ± 0% 115.2n ± 0% ~ (p=0.474 n=10) +Erfc 116.4n ± 0% 116.4n ± 0% ~ (p=0.628 n=10) +Erfinv 133.3n ± 0% 133.3n ± 0% ~ (p=1.000 n=10) +Erfcinv 133.3n ± 0% 133.3n ± 0% ~ (p=1.000 n=10) +Exp 194.1n ± 0% 190.3n ± 0% -1.93% (p=0.000 n=10) +ExpGo 204.7n ± 0% 200.3n ± 0% -2.15% (p=0.000 n=10) +Expm1 137.7n ± 0% 135.2n ± 0% -1.82% (p=0.000 n=10) +Exp2 173.4n ± 0% 169.0n ± 0% -2.54% (p=0.000 n=10) +Exp2Go 182.8n ± 0% 178.4n ± 0% -2.41% (p=0.000 n=10) +Abs 3.756n ± 0% 3.756n ± 0% ~ (p=0.157 n=10) +Dim 12.52n ± 0% 12.52n ± 0% ~ (p=0.737 n=10) +Floor 25.67n ± 0% 24.42n ± 0% -4.87% (p=0.000 n=10) +Max 21.29n ± 0% 20.03n ± 0% -5.92% (p=0.000 n=10) +Min 21.28n ± 0% 20.04n ± 0% -5.85% (p=0.000 n=10) +Mod 344.9n ± 0% 319.2n ± 0% -7.45% (p=0.000 n=10) +Frexp 55.71n ± 0% 48.85n ± 0% -12.30% (p=0.000 n=10) +Gamma 165.9n ± 0% 167.8n ± 0% +1.15% (p=0.000 n=10) +Hypot 73.24n ± 0% 70.74n ± 0% -3.41% (p=0.000 n=10) +HypotGo 84.50n ± 0% 82.63n ± 0% -2.21% (p=0.000 n=10) +Ilogb 49.45n ± 0% 45.70n ± 0% -7.59% (p=0.000 n=10) +J0 556.5n ± 0% 544.0n ± 0% -2.25% (p=0.000 n=10) +J1 555.3n ± 0% 542.8n ± 0% -2.24% (p=0.000 n=10) +Jn 1.181µ ± 0% 1.156µ ± 0% -2.12% (p=0.000 n=10) +Ldexp 59.47n ± 0% 53.84n ± 0% -9.47% (p=0.000 n=10) +Lgamma 167.2n ± 0% 154.6n ± 0% -7.51% (p=0.000 n=10) +Log 160.9n ± 0% 154.6n ± 0% -3.92% (p=0.000 n=10) +Logb 49.45n ± 0% 45.70n ± 0% -7.58% (p=0.000 n=10) +Log1p 147.1n ± 0% 137.1n ± 0% -6.80% (p=0.000 n=10) +Log10 162.1n ± 1% 154.6n ± 0% -4.63% (p=0.000 n=10) +Log2 66.99n ± 0% 60.72n ± 0% -9.36% (p=0.000 n=10) +Modf 29.42n ± 0% 26.29n ± 0% -10.64% (p=0.000 n=10) +Nextafter32 41.95n ± 0% 37.88n ± 0% -9.70% (p=0.000 n=10) +Nextafter64 38.82n ± 0% 33.49n ± 0% -13.73% 
(p=0.000 n=10) +PowInt 252.3n ± 0% 237.3n ± 0% -5.95% (p=0.000 n=10) +PowFrac 615.5n ± 0% 589.7n ± 0% -4.19% (p=0.000 n=10) +Pow10Pos 10.64n ± 0% 10.64n ± 0% ~ (p=1.000 n=10) +Pow10Neg 24.42n ± 0% 15.02n ± 0% -38.49% (p=0.000 n=10) +Round 21.91n ± 0% 18.16n ± 0% -17.12% (p=0.000 n=10) +RoundToEven 24.42n ± 0% 21.29n ± 0% -12.84% (p=0.000 n=10) +Remainder 308.0n ± 0% 291.2n ± 0% -5.44% (p=0.000 n=10) +Signbit 10.02n ± 0% 10.02n ± 0% ~ (p=1.000 n=10) +Sin 102.7n ± 0% 102.7n ± 0% ~ (p=0.211 n=10) +Sincos 124.0n ± 1% 123.3n ± 0% -0.56% (p=0.002 n=10) +Sinh 239.1n ± 0% 234.7n ± 0% -1.84% (p=0.000 n=10) +SqrtIndirect 2.504n ± 0% 2.504n ± 0% ~ (p=0.303 n=10) +SqrtLatency 15.03n ± 0% 15.02n ± 0% ~ (p=0.598 n=10) +SqrtIndirectLatency 15.02n ± 0% 15.02n ± 0% ~ (p=0.907 n=10) +SqrtGoLatency 165.3n ± 0% 157.2n ± 0% -4.90% (p=0.000 n=10) +SqrtPrime 3.801µ ± 0% 3.802µ ± 0% ~ (p=1.000 n=10) +Tan 125.2n ± 0% 125.2n ± 0% ~ (p=0.458 n=10) +Tanh 244.2n ± 0% 239.9n ± 0% -1.76% (p=0.000 n=10) +Trunc 25.67n ± 0% 24.42n ± 0% -4.87% (p=0.000 n=10) +Y0 550.2n ± 0% 538.1n ± 0% -2.21% (p=0.000 n=10) +Y1 552.8n ± 0% 540.6n ± 0% -2.21% (p=0.000 n=10) +Yn 1.168µ ± 0% 1.143µ ± 0% -2.14% (p=0.000 n=10) +Float64bits 8.139n ± 0% 4.385n ± 0% -46.13% (p=0.000 n=10) +Float64frombits 7.512n ± 0% 3.759n ± 0% -49.96% (p=0.000 n=10) +Float32bits 8.138n ± 0% 9.393n ± 0% +15.42% (p=0.000 n=10) +Float32frombits 7.513n ± 0% 3.757n ± 0% -49.98% (p=0.000 n=10) +FMA 3.756n ± 0% 3.756n ± 0% ~ (p=0.246 n=10) +geomean 77.43n 72.42n -6.47% + +Change-Id: I8dac69b1d17cb3d2af78d1c844d2b5d80000d667 +Reviewed-on: https://go-review.googlesource.com/c/go/+/599235 +Reviewed-by: Keith Randall +Auto-Submit: Michael Munday +Reviewed-by: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/riscv64/ssa.go | 2 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 23 ++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 6 +- + src/cmd/compile/internal/ssa/opGen.go | 28 ++ + .../compile/internal/ssa/rewriteRISCV64.go | 312 ++++++++++++++++++ + test/codegen/math.go | 4 + + 6 files changed, 372 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 759d8d7cf4..4aac891e13 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -416,7 +416,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case ssa.OpRISCV64FSQRTS, ssa.OpRISCV64FNEGS, ssa.OpRISCV64FABSD, ssa.OpRISCV64FSQRTD, ssa.OpRISCV64FNEGD, +- ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVDX, ++ ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVXS, ssa.OpRISCV64FMVDX, ssa.OpRISCV64FMVXD, + ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS, + ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD, + ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW: +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index f0d2d74b7b..9e39a58197 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -275,6 +275,11 @@ + (base.Op != OpSB || !config.ctxt.Flag_dynlink) => + (MOV(B|BU|H|HU|W|WU|D)load [off1+off2] {mergeSym(sym1,sym2)} base mem) + ++(FMOV(W|D)load [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && ++ is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ++ (base.Op != OpSB || 
!config.ctxt.Flag_dynlink) => ++ (FMOV(W|D)load [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ + (MOV(B|H|W|D)store [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && + is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && + (base.Op != OpSB || !config.ctxt.Flag_dynlink) => +@@ -285,15 +290,26 @@ + (base.Op != OpSB || !config.ctxt.Flag_dynlink) => + (MOV(B|H|W|D)storezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + ++(FMOV(W|D)store [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && ++ is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ++ (base.Op != OpSB || !config.ctxt.Flag_dynlink) => ++ (FMOV(W|D)store [off1+off2] {mergeSym(sym1,sym2)} base val mem) ++ + (MOV(B|BU|H|HU|W|WU|D)load [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOV(B|BU|H|HU|W|WU|D)load [off1+int32(off2)] {sym} base mem) + ++(FMOV(W|D)load [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => ++ (FMOV(W|D)load [off1+int32(off2)] {sym} base mem) ++ + (MOV(B|H|W|D)store [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => + (MOV(B|H|W|D)store [off1+int32(off2)] {sym} base val mem) + + (MOV(B|H|W|D)storezero [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOV(B|H|W|D)storezero [off1+int32(off2)] {sym} base mem) + ++(FMOV(W|D)store [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => ++ (FMOV(W|D)store [off1+int32(off2)] {sym} base val mem) ++ + // Similarly, fold ADDI into MOVaddr to avoid confusing live variable analysis + // with OffPtr -> ADDI. + (ADDI [c] (MOVaddr [d] {s} x)) && is32Bit(c+int64(d)) => (MOVaddr [int32(c)+d] {s} x) +@@ -675,6 +691,13 @@ + (MOVHUreg x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHUload [off] {sym} ptr mem) + (MOVWUreg x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWUload [off] {sym} ptr mem) + ++// Replace load from same location as preceding store with copy. ++(MOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (FMVXD x) ++(FMOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (FMVDX x) ++(MOVWload [off] {sym} ptr1 (FMOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (FMVXS x) ++(MOVWUload [off] {sym} ptr1 (FMOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVWUreg (FMVXS x)) ++(FMOVWload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (FMVSX x) ++ + // If a register move has only 1 use, just use the same register without emitting instruction + // MOVnop does not emit an instruction, only for ensuring the type. 
+ (MOVDreg x) && x.Uses == 1 => (MOVDnop x) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 7f3c4a2bf4..a69b347a84 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -440,7 +440,8 @@ func init() { + {name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS", commutative: true, typ: "Float32"}, // -(arg0 * arg1) - arg2 + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS", typ: "Float32"}, // sqrt(arg0) + {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS", typ: "Float32"}, // -arg0 +- {name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float ++ {name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float32 ++ {name: "FMVXS", argLength: 1, reg: fpgp, asm: "FMVXS", typ: "Int32"}, // reinterpret arg0 as int32, sign extended to 64 bits + {name: "FCVTSW", argLength: 1, reg: gpfp, asm: "FCVTSW", typ: "Float32"}, // float32(low 32 bits of arg0) + {name: "FCVTSL", argLength: 1, reg: gpfp, asm: "FCVTSL", typ: "Float32"}, // float32(arg0) + {name: "FCVTWS", argLength: 1, reg: fpgp, asm: "FCVTWS", typ: "Int32"}, // int32(arg0) +@@ -467,7 +468,8 @@ func init() { + {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD", typ: "Float64"}, // -arg0 + {name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD", typ: "Float64"}, // abs(arg0) + {name: "FSGNJD", argLength: 2, reg: fp21, asm: "FSGNJD", typ: "Float64"}, // copy sign of arg1 to arg0 +- {name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"}, // reinterpret arg0 as float ++ {name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"}, // reinterpret arg0 as float64 ++ {name: "FMVXD", argLength: 1, reg: fpgp, asm: "FMVXD", typ: "Int64"}, // reinterpret arg0 as int64 + {name: "FCVTDW", argLength: 1, reg: gpfp, asm: "FCVTDW", typ: "Float64"}, // float64(low 32 bits of arg0) + {name: "FCVTDL", argLength: 1, reg: gpfp, asm: "FCVTDL", typ: "Float64"}, // float64(arg0) + {name: "FCVTWD", argLength: 1, reg: fpgp, asm: "FCVTWD", typ: "Int32"}, // int32(arg0) +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index a02afc2da0..5fda7ffc2f 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2454,6 +2454,7 @@ const ( + OpRISCV64FSQRTS + OpRISCV64FNEGS + OpRISCV64FMVSX ++ OpRISCV64FMVXS + OpRISCV64FCVTSW + OpRISCV64FCVTSL + OpRISCV64FCVTWS +@@ -2479,6 +2480,7 @@ const ( + OpRISCV64FABSD + OpRISCV64FSGNJD + OpRISCV64FMVDX ++ OpRISCV64FMVXD + OpRISCV64FCVTDW + OpRISCV64FCVTDL + OpRISCV64FCVTWD +@@ -32948,6 +32950,19 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "FMVXS", ++ argLen: 1, ++ asm: riscv.AFMVXS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "FCVTSW", + argLen: 1, +@@ -33308,6 +33323,19 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "FMVXD", ++ argLen: 1, ++ asm: riscv.AFMVXD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, 
++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "FCVTDW", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 966199c450..a449ce01c6 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -471,6 +471,14 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64FMADDD(v) + case OpRISCV64FMADDS: + return rewriteValueRISCV64_OpRISCV64FMADDS(v) ++ case OpRISCV64FMOVDload: ++ return rewriteValueRISCV64_OpRISCV64FMOVDload(v) ++ case OpRISCV64FMOVDstore: ++ return rewriteValueRISCV64_OpRISCV64FMOVDstore(v) ++ case OpRISCV64FMOVWload: ++ return rewriteValueRISCV64_OpRISCV64FMOVWload(v) ++ case OpRISCV64FMOVWstore: ++ return rewriteValueRISCV64_OpRISCV64FMOVWstore(v) + case OpRISCV64FMSUBD: + return rewriteValueRISCV64_OpRISCV64FMSUBD(v) + case OpRISCV64FMSUBS: +@@ -3686,6 +3694,250 @@ func rewriteValueRISCV64_OpRISCV64FMADDS(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FMOVDload(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config ++ // match: (FMOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (FMOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym1 := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64MOVaddr { ++ break ++ } ++ off2 := auxIntToInt32(v_0.AuxInt) ++ sym2 := auxToSym(v_0.Aux) ++ base := v_0.Args[0] ++ mem := v_1 ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVDload) ++ v.AuxInt = int32ToAuxInt(off1 + off2) ++ v.Aux = symToAux(mergeSym(sym1, sym2)) ++ v.AddArg2(base, mem) ++ return true ++ } ++ // match: (FMOVDload [off1] {sym} (ADDI [off2] base) mem) ++ // cond: is32Bit(int64(off1)+off2) ++ // result: (FMOVDload [off1+int32(off2)] {sym} base mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64ADDI { ++ break ++ } ++ off2 := auxIntToInt64(v_0.AuxInt) ++ base := v_0.Args[0] ++ mem := v_1 ++ if !(is32Bit(int64(off1) + off2)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVDload) ++ v.AuxInt = int32ToAuxInt(off1 + int32(off2)) ++ v.Aux = symToAux(sym) ++ v.AddArg2(base, mem) ++ return true ++ } ++ // match: (FMOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (FMVDX x) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64MOVDstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64FMVDX) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64FMOVDstore(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config ++ // match: (FMOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (FMOVDstore [off1+off2] 
{mergeSym(sym1,sym2)} base val mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym1 := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64MOVaddr { ++ break ++ } ++ off2 := auxIntToInt32(v_0.AuxInt) ++ sym2 := auxToSym(v_0.Aux) ++ base := v_0.Args[0] ++ val := v_1 ++ mem := v_2 ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVDstore) ++ v.AuxInt = int32ToAuxInt(off1 + off2) ++ v.Aux = symToAux(mergeSym(sym1, sym2)) ++ v.AddArg3(base, val, mem) ++ return true ++ } ++ // match: (FMOVDstore [off1] {sym} (ADDI [off2] base) val mem) ++ // cond: is32Bit(int64(off1)+off2) ++ // result: (FMOVDstore [off1+int32(off2)] {sym} base val mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64ADDI { ++ break ++ } ++ off2 := auxIntToInt64(v_0.AuxInt) ++ base := v_0.Args[0] ++ val := v_1 ++ mem := v_2 ++ if !(is32Bit(int64(off1) + off2)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVDstore) ++ v.AuxInt = int32ToAuxInt(off1 + int32(off2)) ++ v.Aux = symToAux(sym) ++ v.AddArg3(base, val, mem) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64FMOVWload(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config ++ // match: (FMOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (FMOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym1 := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64MOVaddr { ++ break ++ } ++ off2 := auxIntToInt32(v_0.AuxInt) ++ sym2 := auxToSym(v_0.Aux) ++ base := v_0.Args[0] ++ mem := v_1 ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVWload) ++ v.AuxInt = int32ToAuxInt(off1 + off2) ++ v.Aux = symToAux(mergeSym(sym1, sym2)) ++ v.AddArg2(base, mem) ++ return true ++ } ++ // match: (FMOVWload [off1] {sym} (ADDI [off2] base) mem) ++ // cond: is32Bit(int64(off1)+off2) ++ // result: (FMOVWload [off1+int32(off2)] {sym} base mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64ADDI { ++ break ++ } ++ off2 := auxIntToInt64(v_0.AuxInt) ++ base := v_0.Args[0] ++ mem := v_1 ++ if !(is32Bit(int64(off1) + off2)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVWload) ++ v.AuxInt = int32ToAuxInt(off1 + int32(off2)) ++ v.Aux = symToAux(sym) ++ v.AddArg2(base, mem) ++ return true ++ } ++ // match: (FMOVWload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (FMVSX x) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64MOVWstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64FMVSX) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64FMOVWstore(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config ++ // match: (FMOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (FMOVWstore [off1+off2] 
{mergeSym(sym1,sym2)} base val mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym1 := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64MOVaddr { ++ break ++ } ++ off2 := auxIntToInt32(v_0.AuxInt) ++ sym2 := auxToSym(v_0.Aux) ++ base := v_0.Args[0] ++ val := v_1 ++ mem := v_2 ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVWstore) ++ v.AuxInt = int32ToAuxInt(off1 + off2) ++ v.Aux = symToAux(mergeSym(sym1, sym2)) ++ v.AddArg3(base, val, mem) ++ return true ++ } ++ // match: (FMOVWstore [off1] {sym} (ADDI [off2] base) val mem) ++ // cond: is32Bit(int64(off1)+off2) ++ // result: (FMOVWstore [off1+int32(off2)] {sym} base val mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64ADDI { ++ break ++ } ++ off2 := auxIntToInt64(v_0.AuxInt) ++ base := v_0.Args[0] ++ val := v_1 ++ mem := v_2 ++ if !(is32Bit(int64(off1) + off2)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVWstore) ++ v.AuxInt = int32ToAuxInt(off1 + int32(off2)) ++ v.Aux = symToAux(sym) ++ v.AddArg3(base, val, mem) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FMSUBD(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] +@@ -4739,6 +4991,25 @@ func rewriteValueRISCV64_OpRISCV64MOVDload(v *Value) bool { + v.AddArg2(base, mem) + return true + } ++ // match: (MOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (FMVXD x) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64FMOVDstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64FMVXD) ++ v.AddArg(x) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64MOVDnop(v *Value) bool { +@@ -5420,6 +5691,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWUload(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + config := b.Func.Config ++ typ := &b.Func.Config.Types + // match: (MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) + // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +@@ -5463,6 +5735,27 @@ func rewriteValueRISCV64_OpRISCV64MOVWUload(v *Value) bool { + v.AddArg2(base, mem) + return true + } ++ // match: (MOVWUload [off] {sym} ptr1 (FMOVWstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (MOVWUreg (FMVXS x)) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64FMOVWstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64MOVWUreg) ++ v0 := b.NewValue0(v_1.Pos, OpRISCV64FMVXS, typ.Int32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64MOVWUreg(v *Value) bool { +@@ -5653,6 +5946,25 @@ func rewriteValueRISCV64_OpRISCV64MOVWload(v *Value) bool { + v.AddArg2(base, mem) + return true + } ++ // match: (MOVWload [off] {sym} ptr1 (FMOVWstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (FMVXS x) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64FMOVWstore 
|| auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64FMVXS) ++ v.AddArg(x) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64MOVWreg(v *Value) bool { +diff --git a/test/codegen/math.go b/test/codegen/math.go +index 331ebbe609..3a2fac3e2e 100644 +--- a/test/codegen/math.go ++++ b/test/codegen/math.go +@@ -158,6 +158,7 @@ func fromFloat64(f64 float64) uint64 { + // arm64:"FMOVD\tF.*, R.*" + // ppc64x:"MFVSRD" + // mips64/hardfloat:"MOVV\tF.*, R.*" ++ // riscv64:"FMVXD" + return math.Float64bits(f64+1) + 1 + } + +@@ -165,6 +166,7 @@ func fromFloat32(f32 float32) uint32 { + // amd64:"MOVL\tX.*, [^X].*" + // arm64:"FMOVS\tF.*, R.*" + // mips64/hardfloat:"MOVW\tF.*, R.*" ++ // riscv64:"FMVXW" + return math.Float32bits(f32+1) + 1 + } + +@@ -173,6 +175,7 @@ func toFloat64(u64 uint64) float64 { + // arm64:"FMOVD\tR.*, F.*" + // ppc64x:"MTVSRD" + // mips64/hardfloat:"MOVV\tR.*, F.*" ++ // riscv64:"FMVDX" + return math.Float64frombits(u64+1) + 1 + } + +@@ -180,6 +183,7 @@ func toFloat32(u32 uint32) float32 { + // amd64:"MOVL\t[^X].*, X.*" + // arm64:"FMOVS\tR.*, F.*" + // mips64/hardfloat:"MOVW\tR.*, F.*" ++ // riscv64:"FMVWX" + return math.Float32frombits(u32+1) + 1 + } + +-- +2.39.5 + diff --git a/2106-internal-bytealg-vector-implementation-of-compare-fo.patch b/2106-internal-bytealg-vector-implementation-of-compare-fo.patch new file mode 100644 index 0000000..a26f057 --- /dev/null +++ b/2106-internal-bytealg-vector-implementation-of-compare-fo.patch @@ -0,0 +1,163 @@ +From c06ec43f26c1a46351bb2320dacac177db1e0d9c Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 106/119] internal/bytealg: vector implementation of compare + for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide a vector implementation of compare for riscv64, which is used +when compiled with the rva23u64 profile, or when vector is detected +to be available. Inputs that are 8 byte aligned will still be handled +via a the non-vector code if the length is less than or equal to 128 +bytes. 
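+
+The resulting dispatch is roughly (an illustrative sketch of the
+assembly below, not additional code):
+
+len < 16                     -> small scalar path
+no vector support            -> existing scalar path
+unaligned input or len > 128 -> vector loop (vle8.v/vmsne.vv/vfirst.m)
+otherwise                    -> existing 8 byte aligned scalar path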
+ +On a Banana Pi F3, with GORISCV64=rva23u64: + + │ compare.1 │ compare.2 │ + │ sec/op │ sec/op vs base │ +BytesCompare/1-8 24.36n ± 0% 24.15n ± 0% -0.84% (p=0.007 n=10) +BytesCompare/2-8 26.75n ± 0% 26.97n ± 0% +0.82% (p=0.000 n=10) +BytesCompare/4-8 27.63n ± 0% 27.80n ± 0% +0.60% (p=0.001 n=10) +BytesCompare/8-8 35.91n ± 0% 35.19n ± 0% -2.01% (p=0.000 n=10) +BytesCompare/16-8 53.22n ± 0% 24.04n ± 1% -54.82% (p=0.000 n=10) +BytesCompare/32-8 25.12n ± 0% 26.09n ± 1% +3.86% (p=0.000 n=10) +BytesCompare/64-8 32.52n ± 0% 33.43n ± 1% +2.78% (p=0.000 n=10) +BytesCompare/128-8 46.59n ± 0% 48.22n ± 1% +3.50% (p=0.000 n=10) +BytesCompare/256-8 74.25n ± 0% 50.18n ± 0% -32.42% (p=0.000 n=10) +BytesCompare/512-8 129.85n ± 0% 83.12n ± 0% -35.98% (p=0.000 n=10) +BytesCompare/1024-8 244.6n ± 0% 148.0n ± 1% -39.49% (p=0.000 n=10) +BytesCompare/2048-8 465.9n ± 0% 282.8n ± 2% -39.30% (p=0.000 n=10) +CompareBytesEqual-8 51.96n ± 0% 52.90n ± 1% +1.80% (p=0.000 n=10) +CompareBytesToNil-8 15.77n ± 1% 15.68n ± 0% -0.57% (p=0.000 n=10) +CompareBytesEmpty-8 14.21n ± 1% 14.20n ± 1% ~ (p=1.000 n=10) +CompareBytesIdentical-8 14.20n ± 1% 15.07n ± 1% +6.20% (p=0.000 n=10) +CompareBytesSameLength-8 31.38n ± 0% 30.52n ± 0% -2.74% (p=0.000 n=10) +CompareBytesDifferentLength-8 31.38n ± 0% 30.53n ± 0% -2.71% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=1-8 2401.0µ ± 0% 437.6µ ± 0% -81.77% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=2-8 2376.8µ ± 0% 437.4µ ± 0% -81.60% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=3-8 2384.1µ ± 0% 437.5µ ± 0% -81.65% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=4-8 2377.7µ ± 0% 437.4µ ± 0% -81.60% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=5-8 2366.3µ ± 0% 437.5µ ± 0% -81.51% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=6-8 2357.3µ ± 0% 437.3µ ± 0% -81.45% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=7-8 2385.3µ ± 0% 437.6µ ± 0% -81.65% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=0-8 447.2µ ± 0% 464.8µ ± 0% +3.94% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=1-8 447.7µ ± 0% 453.1µ ± 0% +1.20% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=2-8 447.9µ ± 0% 453.0µ ± 0% +1.15% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=3-8 448.0µ ± 0% 452.5µ ± 0% +1.02% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=4-8 448.0µ ± 0% 452.1µ ± 0% +0.92% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=5-8 447.8µ ± 0% 452.8µ ± 0% +1.12% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=6-8 447.9µ ± 0% 452.4µ ± 0% +1.01% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=7-8 447.9µ ± 0% 452.8µ ± 0% +1.09% (p=0.000 n=10) +CompareBytesBig-8 441.2µ ± 0% 461.8µ ± 0% +4.66% (p=0.000 n=10) +CompareBytesBigIdentical-8 13.81n ± 0% 13.80n ± 0% ~ (p=0.519 n=10) +geomean 3.980µ 2.651µ -33.40% + + │ compare.1 │ compare.2 │ + │ B/s │ B/s vs base │ +CompareBytesBigUnaligned/offset=1-8 416.5Mi ± 0% 2285.1Mi ± 0% +448.64% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=2-8 420.7Mi ± 0% 2286.4Mi ± 0% +443.43% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=3-8 419.5Mi ± 0% 2285.9Mi ± 0% +444.97% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=4-8 420.6Mi ± 0% 2286.1Mi ± 0% +443.57% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=5-8 422.6Mi ± 0% 2285.7Mi ± 0% +440.86% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=6-8 424.2Mi ± 0% 2286.8Mi ± 0% +439.07% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=7-8 419.2Mi ± 0% 2285.2Mi ± 0% +445.07% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=0-8 2.184Gi ± 0% 2.101Gi ± 0% -3.79% (p=0.000 n=10) 
+CompareBytesBigBothUnaligned/offset=1-8 2.181Gi ± 0% 2.155Gi ± 0% -1.18% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=2-8 2.180Gi ± 0% 2.156Gi ± 0% -1.13% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=3-8 2.180Gi ± 0% 2.158Gi ± 0% -1.01% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=4-8 2.180Gi ± 0% 2.160Gi ± 0% -0.91% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=5-8 2.181Gi ± 0% 2.157Gi ± 0% -1.11% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=6-8 2.181Gi ± 0% 2.159Gi ± 0% -1.00% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=7-8 2.180Gi ± 0% 2.157Gi ± 0% -1.08% (p=0.000 n=10) +CompareBytesBig-8 2.213Gi ± 0% 2.115Gi ± 0% -4.45% (p=0.000 n=10) +CompareBytesBigIdentical-8 69.06Ti ± 0% 69.09Ti ± 0% ~ (p=0.315 n=10) +geomean 2.022Gi 4.022Gi +98.95% + +Change-Id: Id3012faf8d353eb1be0e1fb01b78ac43fa4c7e8b +Reviewed-on: https://go-review.googlesource.com/c/go/+/646737 +Reviewed-by: Mark Ryan +Reviewed-by: Mark Freeman +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +--- + src/internal/bytealg/compare_riscv64.s | 47 ++++++++++++++++++++++++-- + 1 file changed, 44 insertions(+), 3 deletions(-) + +diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s +index 6388fcd209..3b1523dfbf 100644 +--- a/src/internal/bytealg/compare_riscv64.s ++++ b/src/internal/bytealg/compare_riscv64.s +@@ -2,6 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + ++#include "asm_riscv64.h" + #include "go_asm.h" + #include "textflag.h" + +@@ -35,6 +36,46 @@ TEXT compare<>(SB),NOSPLIT|NOFRAME,$0 + MIN X11, X13, X5 + BEQZ X5, cmp_len + ++ MOV $16, X6 ++ BLT X5, X6, check8_unaligned ++ ++#ifndef hasV ++ MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X6 ++ BEQZ X6, compare_scalar ++#endif ++ ++ // Use vector if not 8 byte aligned. ++ OR X10, X12, X6 ++ AND $7, X6 ++ BNEZ X6, vector_loop ++ ++ // Use scalar if 8 byte aligned and <= 128 bytes. ++ SUB $128, X5, X6 ++ BLEZ X6, compare_scalar_aligned ++ ++ PCALIGN $16 ++vector_loop: ++ VSETVLI X5, E8, M8, TA, MA, X6 ++ VLE8V (X10), V8 ++ VLE8V (X12), V16 ++ VMSNEVV V8, V16, V0 ++ VFIRSTM V0, X7 ++ BGEZ X7, vector_not_eq ++ ADD X6, X10 ++ ADD X6, X12 ++ SUB X6, X5 ++ BNEZ X5, vector_loop ++ JMP cmp_len ++ ++vector_not_eq: ++ // Load first differing bytes in X8/X9. ++ ADD X7, X10 ++ ADD X7, X12 ++ MOVBU (X10), X8 ++ MOVBU (X12), X9 ++ JMP cmp ++ ++compare_scalar: + MOV $32, X6 + BLT X5, X6, check8_unaligned + +@@ -57,9 +98,9 @@ align: + ADD $1, X12 + BNEZ X7, align + +-check32: +- // X6 contains $32 +- BLT X5, X6, compare16 ++compare_scalar_aligned: ++ MOV $32, X6 ++ BLT X5, X6, check16 + compare32: + MOV 0(X10), X15 + MOV 0(X12), X16 +-- +2.39.5 + diff --git a/2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch b/2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch new file mode 100644 index 0000000..6104747 --- /dev/null +++ b/2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch @@ -0,0 +1,101 @@ +From c395e1476dbd60c4045c1061e5ef4d0283b31603 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:41 +0800 +Subject: [PATCH 107/119] cmd/compile/internal/ssagen: improve intrinsic + architecture handling + +The architecture handling code for intrinsics is more complex than +it needs to be. sys.Archs is already an array of *sys.Arch and the +existing InFamily function can be used instead of a reimplementation. 
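+
+For example (mirroring the new test below):
+
+ArchPPC64LE.InFamily(PPC64)          // true
+ArchPPC64LE.InFamily(AMD64, RISCV64) // false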
+ +Add some test coverage for sys.Arch.InFamily while here. + +Change-Id: Ia764f211114fea65424c09a421c5ccb02b7187b0 +Reviewed-on: https://go-review.googlesource.com/c/go/+/605476 +Reviewed-by: Carlos Amedee +Reviewed-by: Keith Randall +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/compile/internal/ssagen/ssa.go | 16 ++++------------ + src/cmd/internal/sys/arch_test.go | 24 ++++++++++++++++++++++++ + 2 files changed, 28 insertions(+), 12 deletions(-) + create mode 100644 src/cmd/internal/sys/arch_test.go + +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index a0a3470ea2..cbf9587a56 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -4047,12 +4047,10 @@ type intrinsicKey struct { + func InitTables() { + intrinsics = map[intrinsicKey]intrinsicBuilder{} + +- var all []*sys.Arch + var p4 []*sys.Arch + var p8 []*sys.Arch + var lwatomics []*sys.Arch +- for _, a := range &sys.Archs { +- all = append(all, a) ++ for _, a := range sys.Archs { + if a.PtrSize == 4 { + p4 = append(p4, a) + } else { +@@ -4062,6 +4060,7 @@ func InitTables() { + lwatomics = append(lwatomics, a) + } + } ++ all := sys.Archs[:] + + // add adds the intrinsic b for pkg.fn for the given list of architectures. + add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { +@@ -4071,15 +4070,8 @@ func InitTables() { + } + // addF does the same as add but operates on architecture families. + addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { +- m := 0 +- for _, f := range archFamilies { +- if f >= 32 { +- panic("too many architecture families") +- } +- m |= 1 << uint(f) +- } +- for _, a := range all { +- if m>>uint(a.Family)&1 != 0 { ++ for _, a := range sys.Archs { ++ if a.InFamily(archFamilies...) { + intrinsics[intrinsicKey{a, pkg, fn}] = b + } + } +diff --git a/src/cmd/internal/sys/arch_test.go b/src/cmd/internal/sys/arch_test.go +new file mode 100644 +index 0000000000..011d0923d5 +--- /dev/null ++++ b/src/cmd/internal/sys/arch_test.go +@@ -0,0 +1,24 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++package sys ++ ++import ( ++ "testing" ++) ++ ++func TestArchInFamily(t *testing.T) { ++ if got, want := ArchPPC64LE.InFamily(AMD64), false; got != want { ++ t.Errorf("Got ArchPPC64LE.InFamily(AMD64) = %v, want %v", got, want) ++ } ++ if got, want := ArchPPC64LE.InFamily(PPC64), true; got != want { ++ t.Errorf("Got ArchPPC64LE.InFamily(PPC64) = %v, want %v", got, want) ++ } ++ if got, want := ArchPPC64LE.InFamily(AMD64, RISCV64), false; got != want { ++ t.Errorf("Got ArchPPC64LE.InFamily(AMD64, RISCV64) = %v, want %v", got, want) ++ } ++ if got, want := ArchPPC64LE.InFamily(AMD64, PPC64), true; got != want { ++ t.Errorf("Got ArchPPC64LE.InFamily(AMD64, PPC64) = %v, want %v", got, want) ++ } ++} +-- +2.39.5 + diff --git a/2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch b/2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch new file mode 100644 index 0000000..4128118 --- /dev/null +++ b/2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch @@ -0,0 +1,2066 @@ +From c6eff25852170f02d0526aee69ee0110ab5f9f9e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:42:51 +0800 +Subject: [PATCH 108/119] cmd/compile/internal/ssagen: factor out intrinsics + code + +The intrinsic handling code is a good thousand lines in the fairly +large ssa.go file. This code is already reasonably self-contained - factor +it out into a separate file so that future changes are easier to manage +(and it becomes easier to add/change intrinsics for an architecture). + +Change-Id: I3c18d3d1bb6332f1817d902150e736373bf1ac81 +Reviewed-on: https://go-review.googlesource.com/c/go/+/605477 +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +Reviewed-by: Cherry Mui +--- + src/cmd/compile/internal/ssagen/intrinsics.go | 1047 +++++++++++++++++ + src/cmd/compile/internal/ssagen/ssa.go | 969 +-------------- + 2 files changed, 1056 insertions(+), 960 deletions(-) + create mode 100644 src/cmd/compile/internal/ssagen/intrinsics.go + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +new file mode 100644 +index 0000000000..59eb1869bb +--- /dev/null ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -0,0 +1,1047 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++package ssagen ++ ++import ( ++ "fmt" ++ "internal/buildcfg" ++ ++ "cmd/compile/internal/base" ++ "cmd/compile/internal/ir" ++ "cmd/compile/internal/ssa" ++ "cmd/compile/internal/types" ++ "cmd/internal/sys" ++) ++ ++var intrinsics map[intrinsicKey]intrinsicBuilder ++ ++// An intrinsicBuilder converts a call node n into an ssa value that ++// implements that call as an intrinsic. args is a list of arguments to the func. ++type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value ++ ++type intrinsicKey struct { ++ arch *sys.Arch ++ pkg string ++ fn string ++} ++ ++func initIntrinsics() { ++ intrinsics = map[intrinsicKey]intrinsicBuilder{} ++ ++ var p4 []*sys.Arch ++ var p8 []*sys.Arch ++ var lwatomics []*sys.Arch ++ for _, a := range sys.Archs { ++ if a.PtrSize == 4 { ++ p4 = append(p4, a) ++ } else { ++ p8 = append(p8, a) ++ } ++ if a.Family != sys.PPC64 { ++ lwatomics = append(lwatomics, a) ++ } ++ } ++ all := sys.Archs[:] ++ ++ // add adds the intrinsic b for pkg.fn for the given list of architectures. 
++ add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { ++ for _, a := range archs { ++ intrinsics[intrinsicKey{a, pkg, fn}] = b ++ } ++ } ++ // addF does the same as add but operates on architecture families. ++ addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { ++ for _, a := range sys.Archs { ++ if a.InFamily(archFamilies...) { ++ intrinsics[intrinsicKey{a, pkg, fn}] = b ++ } ++ } ++ } ++ // alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists. ++ alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) { ++ aliased := false ++ for _, a := range archs { ++ if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok { ++ intrinsics[intrinsicKey{a, pkg, fn}] = b ++ aliased = true ++ } ++ } ++ if !aliased { ++ panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn)) ++ } ++ } ++ ++ /******** runtime ********/ ++ if !base.Flag.Cfg.Instrumenting { ++ add("runtime", "slicebytetostringtmp", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ // Compiler frontend optimizations emit OBYTES2STRTMP nodes ++ // for the backend instead of slicebytetostringtmp calls ++ // when not instrumenting. ++ return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1]) ++ }, ++ all...) ++ } ++ addF("internal/runtime/math", "MulUintptr", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) ++ } ++ return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) ++ }, ++ sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.RISCV64, sys.ARM64) ++ add("runtime", "KeepAlive", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0]) ++ s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem()) ++ return nil ++ }, ++ all...) ++ add("runtime", "getclosureptr", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr) ++ }, ++ all...) ++ ++ add("runtime", "getcallerpc", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr) ++ }, ++ all...) ++ ++ add("runtime", "getcallersp", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem()) ++ }, ++ all...) ++ ++ addF("runtime", "publicationBarrier", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem()) ++ return nil ++ }, ++ sys.ARM64, sys.PPC64, sys.RISCV64) ++ ++ brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} ++ if buildcfg.GOPPC64 >= 10 { ++ // Use only on Power10 as the new byte reverse instructions that Power10 provide ++ // make it worthwhile as an intrinsic ++ brev_arch = append(brev_arch, sys.PPC64) ++ } ++ /******** internal/runtime/sys ********/ ++ addF("internal/runtime/sys", "Bswap32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) ++ }, ++ brev_arch...) 
++ addF("internal/runtime/sys", "Bswap64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) ++ }, ++ brev_arch...) ++ ++ /****** Prefetch ******/ ++ makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem()) ++ return nil ++ } ++ } ++ ++ // Make Prefetch intrinsics for supported platforms ++ // On the unsupported platforms stub function will be eliminated ++ addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache), ++ sys.AMD64, sys.ARM64, sys.PPC64) ++ addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed), ++ sys.AMD64, sys.ARM64, sys.PPC64) ++ ++ /******** internal/runtime/atomic ********/ ++ addF("internal/runtime/atomic", "Load", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Load8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Load64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "LoadAcq", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) ++ }, ++ sys.PPC64, sys.S390X) ++ addF("internal/runtime/atomic", "LoadAcq64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) ++ }, ++ sys.PPC64) ++ addF("internal/runtime/atomic", "Loadp", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ ++ 
addF("internal/runtime/atomic", "Store", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Store8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Store64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "StorepNoWB", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "StoreRel", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.PPC64, sys.S390X) ++ addF("internal/runtime/atomic", "StoreRel64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.PPC64) ++ ++ addF("internal/runtime/atomic", "Xchg", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Xchg64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ ++ type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) ++ ++ makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder { ++ ++ return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if buildcfg.GOARM64.LSE || buildcfg.GOARM64.KPAtomicOpt { ++ emit(s, n, args, op1, typ, needReturn) ++ } else { ++ // Target Atomic feature is identified by dynamic detection ++ addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb) ++ v := s.load(types.Types[types.TBOOL], addr) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ 
b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely ++ ++ // We have atomic instructions - use it directly. ++ s.startBlock(bTrue) ++ emit(s, n, args, op1, typ, needReturn) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Use original instruction sequence. ++ s.startBlock(bFalse) ++ emit(s, n, args, op0, typ, needReturn) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. ++ s.startBlock(bEnd) ++ } ++ if needReturn { ++ return s.variable(n, types.Types[typ]) ++ } else { ++ return nil ++ } ++ } ++ } ++ makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { ++ return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true) ++ } ++ makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { ++ return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false) ++ } ++ ++ atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { ++ v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ if needReturn { ++ s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) ++ } ++ } ++ addF("internal/runtime/atomic", "Xchg", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Xchg64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64), ++ sys.ARM64) ++ ++ addF("internal/runtime/atomic", "Xadd", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Xadd64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ ++ addF("internal/runtime/atomic", "Xadd", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Xadd64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64), ++ sys.ARM64) ++ ++ addF("internal/runtime/atomic", "Cas", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Cas64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := 
s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "CasRel", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) ++ }, ++ sys.PPC64) ++ ++ atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { ++ v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ if needReturn { ++ s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) ++ } ++ } ++ ++ addF("internal/runtime/atomic", "Cas", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Cas64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64), ++ sys.ARM64) ++ ++ // Old-style atomic logical operation API (all supported archs except arm64). ++ addF("internal/runtime/atomic", "And8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "And", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Or8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Or", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ ++ // arm64 always uses the new-style atomic logical operations, for both the ++ // old and new style API. 
++ addF("internal/runtime/atomic", "And8", ++ makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Or8", ++ makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "And64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "And32", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "And", ++ makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Or64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Or32", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Or", ++ makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ ++ // New-style atomic logical operations, which return the old memory value. ++ addF("internal/runtime/atomic", "And64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) ++ p0, p1 := s.split(v) ++ s.vars[memVar] = p1 ++ return p0 ++ }, ++ sys.AMD64) ++ addF("internal/runtime/atomic", "And32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) ++ p0, p1 := s.split(v) ++ s.vars[memVar] = p1 ++ return p0 ++ }, ++ sys.AMD64) ++ addF("internal/runtime/atomic", "Or64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) ++ p0, p1 := s.split(v) ++ s.vars[memVar] = p1 ++ return p0 ++ }, ++ sys.AMD64) ++ addF("internal/runtime/atomic", "Or32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) ++ p0, p1 := s.split(v) ++ s.vars[memVar] = p1 ++ return p0 ++ }, ++ sys.AMD64) ++ ++ // Aliases for atomic load operations ++ alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...) ++ alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...) ++ alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...) ++ alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...) ++ alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...) ++ alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...) ++ alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...) 
++ alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...) ++ alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) ++ alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed ++ alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) ++ alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed ++ ++ // Aliases for atomic store operations ++ alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...) ++ alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...) ++ alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...) ++ alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...) ++ alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...) ++ alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...) ++ alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) ++ alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed ++ alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) ++ alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed ++ ++ // Aliases for atomic swap operations ++ alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...) ++ alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...) ++ alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...) ++ alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...) ++ ++ // Aliases for atomic add operations ++ alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...) ++ alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...) ++ alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...) ++ alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...) ++ ++ // Aliases for atomic CAS operations ++ alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...) ++ alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...) ++ alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...) ++ alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...) ++ alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...) ++ alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...) ++ alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...) 
++ ++ // Aliases for atomic And/Or operations ++ alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64) ++ alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64) ++ ++ /******** math ********/ ++ addF("math", "sqrt", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) ++ addF("math", "Trunc", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) ++ addF("math", "Ceil", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) ++ addF("math", "Floor", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) ++ addF("math", "Round", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.PPC64, sys.S390X) ++ addF("math", "RoundToEven", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.S390X, sys.Wasm) ++ addF("math", "Abs", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64) ++ addF("math", "Copysign", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1]) ++ }, ++ sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm) ++ addF("math", "FMA", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) ++ }, ++ sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("math", "FMA", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if !s.config.UseFMA { ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ } ++ ++ if buildcfg.GOAMD64 >= 3 { ++ return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) ++ } ++ ++ v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely // >= haswell cpus are common ++ ++ // We have the intrinsic - use it directly. ++ s.startBlock(bTrue) ++ s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Call the pure Go version. ++ s.startBlock(bFalse) ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. 
++ s.startBlock(bEnd) ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ }, ++ sys.AMD64) ++ addF("math", "FMA", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if !s.config.UseFMA { ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ } ++ addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb) ++ v := s.load(types.Types[types.TBOOL], addr) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely ++ ++ // We have the intrinsic - use it directly. ++ s.startBlock(bTrue) ++ s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Call the pure Go version. ++ s.startBlock(bFalse) ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. ++ s.startBlock(bEnd) ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ }, ++ sys.ARM) ++ ++ makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if buildcfg.GOAMD64 >= 2 { ++ return s.newValue1(op, types.Types[types.TFLOAT64], args[0]) ++ } ++ ++ v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays ++ ++ // We have the intrinsic - use it directly. ++ s.startBlock(bTrue) ++ s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0]) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Call the pure Go version. ++ s.startBlock(bFalse) ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. 
++ s.startBlock(bEnd) ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ } ++ } ++ addF("math", "RoundToEven", ++ makeRoundAMD64(ssa.OpRoundToEven), ++ sys.AMD64) ++ addF("math", "Floor", ++ makeRoundAMD64(ssa.OpFloor), ++ sys.AMD64) ++ addF("math", "Ceil", ++ makeRoundAMD64(ssa.OpCeil), ++ sys.AMD64) ++ addF("math", "Trunc", ++ makeRoundAMD64(ssa.OpTrunc), ++ sys.AMD64) ++ ++ /******** math/bits ********/ ++ addF("math/bits", "TrailingZeros64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "TrailingZeros32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "TrailingZeros16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) ++ c := s.constInt32(types.Types[types.TUINT32], 1<<16) ++ y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) ++ return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) ++ }, ++ sys.MIPS) ++ addF("math/bits", "TrailingZeros16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) ++ addF("math/bits", "TrailingZeros16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) ++ c := s.constInt64(types.Types[types.TUINT64], 1<<16) ++ y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) ++ return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) ++ }, ++ sys.S390X, sys.PPC64) ++ addF("math/bits", "TrailingZeros8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) ++ c := s.constInt32(types.Types[types.TUINT32], 1<<8) ++ y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) ++ return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) ++ }, ++ sys.MIPS) ++ addF("math/bits", "TrailingZeros8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) ++ addF("math/bits", "TrailingZeros8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) ++ c := s.constInt64(types.Types[types.TUINT64], 1<<8) ++ y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) ++ return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) ++ }, ++ sys.S390X) ++ alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...) ++ alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...) ++ // ReverseBytes inlines correctly, no need to intrinsify it. 
++ // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate ++ // On Power10, 16-bit rotate is not available so use BRH instruction ++ if buildcfg.GOPPC64 >= 10 { ++ addF("math/bits", "ReverseBytes16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) ++ }, ++ sys.PPC64) ++ } ++ ++ addF("math/bits", "Len64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "Len32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.ARM64, sys.PPC64) ++ addF("math/bits", "Len32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) ++ } ++ x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0]) ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) ++ }, ++ sys.ARM, sys.S390X, sys.MIPS, sys.Wasm) ++ addF("math/bits", "Len16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) ++ } ++ x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) ++ }, ++ sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "Len16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64) ++ addF("math/bits", "Len8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) ++ } ++ x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) ++ }, ++ sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "Len8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64) ++ addF("math/bits", "Len", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) ++ } ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ // LeadingZeros is handled because it trivially calls Len. 
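The MIPS, S390X and PPC64 TrailingZeros16/TrailingZeros8 builders above rely on a small arithmetic identity: after zero-extension, OR-ing in a constant bit just past the narrow width guarantees the wide count instruction never reports more than that width, so a zero input yields 16 (or 8) instead of 32/64. A quick standalone check of the 16-bit case (the helper name is invented):

package main

import (
	"fmt"
	"math/bits"
)

// trailingZeros16ViaCtz32 mirrors the TrailingZeros16 builder for targets
// without a native 16-bit count: zero-extend, then OR in 1<<16 so that
// x == 0 produces 16 rather than 32.
func trailingZeros16ViaCtz32(x uint16) int {
	return bits.TrailingZeros32(uint32(x) | 1<<16)
}

func main() {
	for _, x := range []uint16{0, 1, 0x0040, 0x8000} {
		fmt.Println(x, trailingZeros16ViaCtz32(x), bits.TrailingZeros16(x))
	}
}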
++ addF("math/bits", "Reverse64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "Reverse32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "Reverse16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "Reverse8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "Reverse", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "RotateLeft8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1]) ++ }, ++ sys.AMD64, sys.RISCV64) ++ addF("math/bits", "RotateLeft16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1]) ++ }, ++ sys.AMD64, sys.RISCV64) ++ addF("math/bits", "RotateLeft32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1]) ++ }, ++ sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) ++ addF("math/bits", "RotateLeft64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1]) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) ++ alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...) ++ ++ makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if buildcfg.GOAMD64 >= 2 { ++ return s.newValue1(op, types.Types[types.TINT], args[0]) ++ } ++ ++ v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely // most machines have popcnt nowadays ++ ++ // We have the intrinsic - use it directly. ++ s.startBlock(bTrue) ++ s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0]) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Call the pure Go version. ++ s.startBlock(bFalse) ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT] ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. 
++ s.startBlock(bEnd) ++ return s.variable(n, types.Types[types.TINT]) ++ } ++ } ++ addF("math/bits", "OnesCount64", ++ makeOnesCountAMD64(ssa.OpPopCount64), ++ sys.AMD64) ++ addF("math/bits", "OnesCount64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0]) ++ }, ++ sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) ++ addF("math/bits", "OnesCount32", ++ makeOnesCountAMD64(ssa.OpPopCount32), ++ sys.AMD64) ++ addF("math/bits", "OnesCount32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0]) ++ }, ++ sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) ++ addF("math/bits", "OnesCount16", ++ makeOnesCountAMD64(ssa.OpPopCount16), ++ sys.AMD64) ++ addF("math/bits", "OnesCount16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm) ++ addF("math/bits", "OnesCount8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0]) ++ }, ++ sys.S390X, sys.PPC64, sys.Wasm) ++ addF("math/bits", "OnesCount", ++ makeOnesCountAMD64(ssa.OpPopCount64), ++ sys.AMD64) ++ addF("math/bits", "Mul64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1]) ++ }, ++ sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64) ++ alias("math/bits", "Mul", "math/bits", "Mul64", p8...) ++ alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...) ++ addF("math/bits", "Add64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) ++ }, ++ sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64) ++ alias("math/bits", "Add", "math/bits", "Add64", p8...) ++ alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...) ++ addF("math/bits", "Sub64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) ++ }, ++ sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64) ++ alias("math/bits", "Sub", "math/bits", "Sub64", p8...) ++ addF("math/bits", "Div64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ // check for divide-by-zero/overflow and panic with appropriate message ++ cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64])) ++ s.check(cmpZero, ir.Syms.Panicdivide) ++ cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2]) ++ s.check(cmpOverflow, ir.Syms.Panicoverflow) ++ return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) ++ }, ++ sys.AMD64) ++ alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64) ++ ++ alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...) 
++ alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...) ++ alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...) ++ alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...) ++ alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...) ++ alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...) ++ ++ /******** sync/atomic ********/ ++ ++ // Note: these are disabled by flag_race in findIntrinsic below. ++ alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...) ++ alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...) ++ alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...) ++ alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...) ++ alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...) ++ alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...) ++ alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...) ++ ++ alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...) ++ alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...) ++ // Note: not StorePointer, that needs a write barrier. Same below for {CompareAnd}Swap. ++ alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...) ++ alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...) ++ alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...) ++ alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...) ++ ++ alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...) ++ alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...) ++ alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...) ++ alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...) ++ alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...) ++ alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...) ++ ++ alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...) ++ alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...) ++ alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...) ++ alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...) ++ alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...) ++ alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...) ++ ++ alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...) ++ alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...) ++ alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...) ++ alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...) ++ alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...) ++ alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...) 
++ ++ alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) ++ ++ /******** math/big ********/ ++ alias("math/big", "mulWW", "math/bits", "Mul64", p8...) ++} ++ ++// findIntrinsic returns a function which builds the SSA equivalent of the ++// function identified by the symbol sym. If sym is not an intrinsic call, returns nil. ++func findIntrinsic(sym *types.Sym) intrinsicBuilder { ++ if sym == nil || sym.Pkg == nil { ++ return nil ++ } ++ pkg := sym.Pkg.Path ++ if sym.Pkg == ir.Pkgs.Runtime { ++ pkg = "runtime" ++ } ++ if base.Flag.Race && pkg == "sync/atomic" { ++ // The race detector needs to be able to intercept these calls. ++ // We can't intrinsify them. ++ return nil ++ } ++ // Skip intrinsifying math functions (which may contain hard-float ++ // instructions) when soft-float ++ if Arch.SoftFloat && pkg == "math" { ++ return nil ++ } ++ ++ fn := sym.Name ++ if ssa.IntrinsicsDisable { ++ if pkg == "runtime" && (fn == "getcallerpc" || fn == "getcallersp" || fn == "getclosureptr") { ++ // These runtime functions don't have definitions, must be intrinsics. ++ } else { ++ return nil ++ } ++ } ++ return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}] ++} ++ ++func IsIntrinsicCall(n *ir.CallExpr) bool { ++ if n == nil { ++ return false ++ } ++ name, ok := n.Fun.(*ir.Name) ++ if !ok { ++ return false ++ } ++ return findIntrinsic(name.Sym()) != nil ++} +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index cbf9587a56..0f6f2de4a7 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -208,6 +208,10 @@ func InitConfig() { + ir.Syms.SigPanic = typecheck.LookupRuntimeFunc("sigpanic") + } + ++func InitTables() { ++ initIntrinsics() ++} ++ + // AbiForBodylessFuncStackMap returns the ABI for a bodyless function's stack map. + // This is not necessarily the ABI used to call it. + // Currently (1.17 dev) such a stack map is always ABI0; +@@ -4032,966 +4036,11 @@ func (s *state) sfcall(op ssa.Op, args ...*ssa.Value) (*ssa.Value, bool) { + return nil, false + } + +-var intrinsics map[intrinsicKey]intrinsicBuilder +- +-// An intrinsicBuilder converts a call node n into an ssa value that +-// implements that call as an intrinsic. args is a list of arguments to the func. 
+-type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value +- +-type intrinsicKey struct { +- arch *sys.Arch +- pkg string +- fn string +-} +- +-func InitTables() { +- intrinsics = map[intrinsicKey]intrinsicBuilder{} +- +- var p4 []*sys.Arch +- var p8 []*sys.Arch +- var lwatomics []*sys.Arch +- for _, a := range sys.Archs { +- if a.PtrSize == 4 { +- p4 = append(p4, a) +- } else { +- p8 = append(p8, a) +- } +- if a.Family != sys.PPC64 { +- lwatomics = append(lwatomics, a) +- } +- } +- all := sys.Archs[:] +- +- // add adds the intrinsic b for pkg.fn for the given list of architectures. +- add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { +- for _, a := range archs { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- } +- } +- // addF does the same as add but operates on architecture families. +- addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { +- for _, a := range sys.Archs { +- if a.InFamily(archFamilies...) { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- } +- } +- } +- // alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists. +- alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) { +- aliased := false +- for _, a := range archs { +- if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- aliased = true +- } +- } +- if !aliased { +- panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn)) +- } +- } +- +- /******** runtime ********/ +- if !base.Flag.Cfg.Instrumenting { +- add("runtime", "slicebytetostringtmp", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- // Compiler frontend optimizations emit OBYTES2STRTMP nodes +- // for the backend instead of slicebytetostringtmp calls +- // when not instrumenting. +- return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1]) +- }, +- all...) +- } +- addF("runtime/internal/math", "MulUintptr", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) +- } +- return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) +- }, +- sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.RISCV64, sys.ARM64) +- alias("runtime", "mulUintptr", "runtime/internal/math", "MulUintptr", all...) +- add("runtime", "KeepAlive", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0]) +- s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem()) +- return nil +- }, +- all...) +- add("runtime", "getclosureptr", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr) +- }, +- all...) +- +- add("runtime", "getcallerpc", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr) +- }, +- all...) +- +- add("runtime", "getcallersp", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem()) +- }, +- all...) 
+- +- addF("runtime", "publicationBarrier", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem()) +- return nil +- }, +- sys.ARM64, sys.PPC64, sys.RISCV64) +- +- brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} +- if buildcfg.GOPPC64 >= 10 { +- // Use only on Power10 as the new byte reverse instructions that Power10 provide +- // make it worthwhile as an intrinsic +- brev_arch = append(brev_arch, sys.PPC64) +- } +- /******** runtime/internal/sys ********/ +- addF("runtime/internal/sys", "Bswap32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) +- }, +- brev_arch...) +- addF("runtime/internal/sys", "Bswap64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) +- }, +- brev_arch...) +- +- /****** Prefetch ******/ +- makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem()) +- return nil +- } +- } +- +- // Make Prefetch intrinsics for supported platforms +- // On the unsupported platforms stub function will be eliminated +- addF("runtime/internal/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache), +- sys.AMD64, sys.ARM64, sys.PPC64) +- addF("runtime/internal/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed), +- sys.AMD64, sys.ARM64, sys.PPC64) +- +- /******** runtime/internal/atomic ********/ +- addF("runtime/internal/atomic", "Load", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Load8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Load64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "LoadAcq", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) +- }, +- sys.PPC64, sys.S390X) +- addF("runtime/internal/atomic", "LoadAcq64", +- func(s *state, n 
*ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) +- }, +- sys.PPC64) +- addF("runtime/internal/atomic", "Loadp", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- +- addF("runtime/internal/atomic", "Store", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Store8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Store64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "StorepNoWB", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "StoreRel", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.PPC64, sys.S390X) +- addF("runtime/internal/atomic", "StoreRel64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.PPC64) +- +- addF("runtime/internal/atomic", "Xchg", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Xchg64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- +- type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op 
ssa.Op, typ types.Kind) +- +- makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ, rtyp types.Kind, emit atomicOpEmitter) intrinsicBuilder { +- +- return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOARM64.KPAtomicOpt { +- emit(s, n, args, op0, typ) +- } else { +- // Target Atomic feature is identified by dynamic detection +- addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb) +- v := s.load(types.Types[types.TBOOL], addr) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely +- +- // We have atomic instructions - use it directly. +- s.startBlock(bTrue) +- emit(s, n, args, op1, typ) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Use original instruction sequence. +- s.startBlock(bFalse) +- emit(s, n, args, op0, typ) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. +- s.startBlock(bEnd) +- } +- if rtyp == types.TNIL { +- return nil +- } else { +- return s.variable(n, types.Types[rtyp]) +- } +- +- } +- } +- +- atomicXchgXaddEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind) { +- v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) +- } +- addF("runtime/internal/atomic", "Xchg", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, types.TUINT32, atomicXchgXaddEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Xchg64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, types.TUINT64, atomicXchgXaddEmitterARM64), +- sys.ARM64) +- +- addF("runtime/internal/atomic", "Xadd", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Xadd64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- +- addF("runtime/internal/atomic", "Xadd", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, types.TUINT32, atomicXchgXaddEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Xadd64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, types.TUINT64, atomicXchgXaddEmitterARM64), +- sys.ARM64) +- +- addF("runtime/internal/atomic", "Cas", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) +- 
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Cas64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "CasRel", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) +- }, +- sys.PPC64) +- +- atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind) { +- v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) +- } +- +- addF("runtime/internal/atomic", "Cas", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TUINT32, types.TBOOL, atomicCasEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Cas64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TUINT64, types.TBOOL, atomicCasEmitterARM64), +- sys.ARM64) +- +- addF("runtime/internal/atomic", "And8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "And", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Or8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Or", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- +- atomicAndOrEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind) { +- s.vars[memVar] = s.newValue3(op, types.TypeMem, args[0], args[1], s.mem()) +- } +- +- addF("runtime/internal/atomic", "And8", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd8, ssa.OpAtomicAnd8Variant, types.TNIL, types.TNIL, atomicAndOrEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "And", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32, ssa.OpAtomicAnd32Variant, 
types.TNIL, types.TNIL, atomicAndOrEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Or8", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr8, ssa.OpAtomicOr8Variant, types.TNIL, types.TNIL, atomicAndOrEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Or", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32, ssa.OpAtomicOr32Variant, types.TNIL, types.TNIL, atomicAndOrEmitterARM64), +- sys.ARM64) +- +- // Aliases for atomic load operations +- alias("runtime/internal/atomic", "Loadint32", "runtime/internal/atomic", "Load", all...) +- alias("runtime/internal/atomic", "Loadint64", "runtime/internal/atomic", "Load64", all...) +- alias("runtime/internal/atomic", "Loaduintptr", "runtime/internal/atomic", "Load", p4...) +- alias("runtime/internal/atomic", "Loaduintptr", "runtime/internal/atomic", "Load64", p8...) +- alias("runtime/internal/atomic", "Loaduint", "runtime/internal/atomic", "Load", p4...) +- alias("runtime/internal/atomic", "Loaduint", "runtime/internal/atomic", "Load64", p8...) +- alias("runtime/internal/atomic", "LoadAcq", "runtime/internal/atomic", "Load", lwatomics...) +- alias("runtime/internal/atomic", "LoadAcq64", "runtime/internal/atomic", "Load64", lwatomics...) +- alias("runtime/internal/atomic", "LoadAcquintptr", "runtime/internal/atomic", "LoadAcq", p4...) +- alias("sync", "runtime_LoadAcquintptr", "runtime/internal/atomic", "LoadAcq", p4...) // linknamed +- alias("runtime/internal/atomic", "LoadAcquintptr", "runtime/internal/atomic", "LoadAcq64", p8...) +- alias("sync", "runtime_LoadAcquintptr", "runtime/internal/atomic", "LoadAcq64", p8...) // linknamed +- +- // Aliases for atomic store operations +- alias("runtime/internal/atomic", "Storeint32", "runtime/internal/atomic", "Store", all...) +- alias("runtime/internal/atomic", "Storeint64", "runtime/internal/atomic", "Store64", all...) +- alias("runtime/internal/atomic", "Storeuintptr", "runtime/internal/atomic", "Store", p4...) +- alias("runtime/internal/atomic", "Storeuintptr", "runtime/internal/atomic", "Store64", p8...) +- alias("runtime/internal/atomic", "StoreRel", "runtime/internal/atomic", "Store", lwatomics...) +- alias("runtime/internal/atomic", "StoreRel64", "runtime/internal/atomic", "Store64", lwatomics...) +- alias("runtime/internal/atomic", "StoreReluintptr", "runtime/internal/atomic", "StoreRel", p4...) +- alias("sync", "runtime_StoreReluintptr", "runtime/internal/atomic", "StoreRel", p4...) // linknamed +- alias("runtime/internal/atomic", "StoreReluintptr", "runtime/internal/atomic", "StoreRel64", p8...) +- alias("sync", "runtime_StoreReluintptr", "runtime/internal/atomic", "StoreRel64", p8...) // linknamed +- +- // Aliases for atomic swap operations +- alias("runtime/internal/atomic", "Xchgint32", "runtime/internal/atomic", "Xchg", all...) +- alias("runtime/internal/atomic", "Xchgint64", "runtime/internal/atomic", "Xchg64", all...) +- alias("runtime/internal/atomic", "Xchguintptr", "runtime/internal/atomic", "Xchg", p4...) +- alias("runtime/internal/atomic", "Xchguintptr", "runtime/internal/atomic", "Xchg64", p8...) +- +- // Aliases for atomic add operations +- alias("runtime/internal/atomic", "Xaddint32", "runtime/internal/atomic", "Xadd", all...) +- alias("runtime/internal/atomic", "Xaddint64", "runtime/internal/atomic", "Xadd64", all...) +- alias("runtime/internal/atomic", "Xadduintptr", "runtime/internal/atomic", "Xadd", p4...) +- alias("runtime/internal/atomic", "Xadduintptr", "runtime/internal/atomic", "Xadd64", p8...) 
+- +- // Aliases for atomic CAS operations +- alias("runtime/internal/atomic", "Casint32", "runtime/internal/atomic", "Cas", all...) +- alias("runtime/internal/atomic", "Casint64", "runtime/internal/atomic", "Cas64", all...) +- alias("runtime/internal/atomic", "Casuintptr", "runtime/internal/atomic", "Cas", p4...) +- alias("runtime/internal/atomic", "Casuintptr", "runtime/internal/atomic", "Cas64", p8...) +- alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas", p4...) +- alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas64", p8...) +- alias("runtime/internal/atomic", "CasRel", "runtime/internal/atomic", "Cas", lwatomics...) +- +- /******** math ********/ +- addF("math", "sqrt", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) +- addF("math", "Trunc", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) +- addF("math", "Ceil", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) +- addF("math", "Floor", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) +- addF("math", "Round", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.PPC64, sys.S390X) +- addF("math", "RoundToEven", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.S390X, sys.Wasm) +- addF("math", "Abs", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.ARM, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64) +- addF("math", "Copysign", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1]) +- }, +- sys.PPC64, sys.RISCV64, sys.Wasm) +- addF("math", "FMA", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) +- }, +- sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("math", "FMA", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if !s.config.UseFMA { +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- return s.variable(n, types.Types[types.TFLOAT64]) +- } +- +- if buildcfg.GOAMD64 >= 3 { +- return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) +- } +- +- v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely // >= haswell cpus are common +- +- // We have the intrinsic 
- use it directly. +- s.startBlock(bTrue) +- s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Call the pure Go version. +- s.startBlock(bFalse) +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. +- s.startBlock(bEnd) +- return s.variable(n, types.Types[types.TFLOAT64]) +- }, +- sys.AMD64) +- addF("math", "FMA", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if !s.config.UseFMA { +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- return s.variable(n, types.Types[types.TFLOAT64]) +- } +- addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb) +- v := s.load(types.Types[types.TBOOL], addr) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely +- +- // We have the intrinsic - use it directly. +- s.startBlock(bTrue) +- s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Call the pure Go version. +- s.startBlock(bFalse) +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. +- s.startBlock(bEnd) +- return s.variable(n, types.Types[types.TFLOAT64]) +- }, +- sys.ARM) +- +- makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOAMD64 >= 2 { +- return s.newValue1(op, types.Types[types.TFLOAT64], args[0]) +- } +- +- v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays +- +- // We have the intrinsic - use it directly. +- s.startBlock(bTrue) +- s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0]) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Call the pure Go version. +- s.startBlock(bFalse) +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. 
+- s.startBlock(bEnd) +- return s.variable(n, types.Types[types.TFLOAT64]) +- } +- } +- addF("math", "RoundToEven", +- makeRoundAMD64(ssa.OpRoundToEven), +- sys.AMD64) +- addF("math", "Floor", +- makeRoundAMD64(ssa.OpFloor), +- sys.AMD64) +- addF("math", "Ceil", +- makeRoundAMD64(ssa.OpCeil), +- sys.AMD64) +- addF("math", "Trunc", +- makeRoundAMD64(ssa.OpTrunc), +- sys.AMD64) +- +- /******** math/bits ********/ +- addF("math/bits", "TrailingZeros64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "TrailingZeros32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) +- c := s.constInt32(types.Types[types.TUINT32], 1<<16) +- y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) +- }, +- sys.MIPS) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) +- c := s.constInt64(types.Types[types.TUINT64], 1<<16) +- y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) +- }, +- sys.S390X, sys.PPC64) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) +- c := s.constInt32(types.Types[types.TUINT32], 1<<8) +- y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) +- }, +- sys.MIPS) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) +- c := s.constInt64(types.Types[types.TUINT64], 1<<8) +- y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) +- }, +- sys.S390X) +- alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) +- alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) +- // ReverseBytes inlines correctly, no need to intrinsify it. 
+- // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate +- // On Power10, 16-bit rotate is not available so use BRH instruction +- if buildcfg.GOPPC64 >= 10 { +- addF("math/bits", "ReverseBytes16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) +- }, +- sys.PPC64) +- } +- +- addF("math/bits", "Len64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "Len32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.ARM64, sys.PPC64) +- addF("math/bits", "Len32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- } +- x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM, sys.S390X, sys.MIPS, sys.Wasm) +- addF("math/bits", "Len16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) +- } +- x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "Len16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64) +- addF("math/bits", "Len8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) +- } +- x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "Len8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64) +- addF("math/bits", "Len", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- } +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- // LeadingZeros is handled because it trivially calls Len. 
+- addF("math/bits", "Reverse64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "Reverse32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "Reverse16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "Reverse8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "Reverse", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "RotateLeft8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1]) +- }, +- sys.AMD64, sys.RISCV64) +- addF("math/bits", "RotateLeft16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1]) +- }, +- sys.AMD64, sys.RISCV64) +- addF("math/bits", "RotateLeft32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1]) +- }, +- sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) +- addF("math/bits", "RotateLeft64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1]) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) +- alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...) +- +- makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOAMD64 >= 2 { +- return s.newValue1(op, types.Types[types.TINT], args[0]) +- } +- +- v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely // most machines have popcnt nowadays +- +- // We have the intrinsic - use it directly. +- s.startBlock(bTrue) +- s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0]) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Call the pure Go version. +- s.startBlock(bFalse) +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT] +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. 
+- s.startBlock(bEnd) +- return s.variable(n, types.Types[types.TINT]) +- } +- } +- addF("math/bits", "OnesCount64", +- makeOnesCountAMD64(ssa.OpPopCount64), +- sys.AMD64) +- addF("math/bits", "OnesCount64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0]) +- }, +- sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) +- addF("math/bits", "OnesCount32", +- makeOnesCountAMD64(ssa.OpPopCount32), +- sys.AMD64) +- addF("math/bits", "OnesCount32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0]) +- }, +- sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) +- addF("math/bits", "OnesCount16", +- makeOnesCountAMD64(ssa.OpPopCount16), +- sys.AMD64) +- addF("math/bits", "OnesCount16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm) +- addF("math/bits", "OnesCount8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0]) +- }, +- sys.S390X, sys.PPC64, sys.Wasm) +- addF("math/bits", "OnesCount", +- makeOnesCountAMD64(ssa.OpPopCount64), +- sys.AMD64) +- addF("math/bits", "Mul64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1]) +- }, +- sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64) +- alias("math/bits", "Mul", "math/bits", "Mul64", p8...) +- alias("runtime/internal/math", "Mul64", "math/bits", "Mul64", p8...) +- addF("math/bits", "Add64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) +- }, +- sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64) +- alias("math/bits", "Add", "math/bits", "Add64", p8...) +- addF("math/bits", "Sub64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) +- }, +- sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64) +- alias("math/bits", "Sub", "math/bits", "Sub64", p8...) +- addF("math/bits", "Div64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- // check for divide-by-zero/overflow and panic with appropriate message +- cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64])) +- s.check(cmpZero, ir.Syms.Panicdivide) +- cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2]) +- s.check(cmpOverflow, ir.Syms.Panicoverflow) +- return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) +- }, +- sys.AMD64) +- alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64) +- +- alias("runtime/internal/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...) +- alias("runtime/internal/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...) 
+- alias("runtime/internal/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...) +- alias("runtime/internal/sys", "Len8", "math/bits", "Len8", all...) +- alias("runtime/internal/sys", "Len64", "math/bits", "Len64", all...) +- alias("runtime/internal/sys", "OnesCount64", "math/bits", "OnesCount64", all...) +- +- /******** sync/atomic ********/ +- +- // Note: these are disabled by flag_race in findIntrinsic below. +- alias("sync/atomic", "LoadInt32", "runtime/internal/atomic", "Load", all...) +- alias("sync/atomic", "LoadInt64", "runtime/internal/atomic", "Load64", all...) +- alias("sync/atomic", "LoadPointer", "runtime/internal/atomic", "Loadp", all...) +- alias("sync/atomic", "LoadUint32", "runtime/internal/atomic", "Load", all...) +- alias("sync/atomic", "LoadUint64", "runtime/internal/atomic", "Load64", all...) +- alias("sync/atomic", "LoadUintptr", "runtime/internal/atomic", "Load", p4...) +- alias("sync/atomic", "LoadUintptr", "runtime/internal/atomic", "Load64", p8...) +- +- alias("sync/atomic", "StoreInt32", "runtime/internal/atomic", "Store", all...) +- alias("sync/atomic", "StoreInt64", "runtime/internal/atomic", "Store64", all...) +- // Note: not StorePointer, that needs a write barrier. Same below for {CompareAnd}Swap. +- alias("sync/atomic", "StoreUint32", "runtime/internal/atomic", "Store", all...) +- alias("sync/atomic", "StoreUint64", "runtime/internal/atomic", "Store64", all...) +- alias("sync/atomic", "StoreUintptr", "runtime/internal/atomic", "Store", p4...) +- alias("sync/atomic", "StoreUintptr", "runtime/internal/atomic", "Store64", p8...) +- +- alias("sync/atomic", "SwapInt32", "runtime/internal/atomic", "Xchg", all...) +- alias("sync/atomic", "SwapInt64", "runtime/internal/atomic", "Xchg64", all...) +- alias("sync/atomic", "SwapUint32", "runtime/internal/atomic", "Xchg", all...) +- alias("sync/atomic", "SwapUint64", "runtime/internal/atomic", "Xchg64", all...) +- alias("sync/atomic", "SwapUintptr", "runtime/internal/atomic", "Xchg", p4...) +- alias("sync/atomic", "SwapUintptr", "runtime/internal/atomic", "Xchg64", p8...) +- +- alias("sync/atomic", "CompareAndSwapInt32", "runtime/internal/atomic", "Cas", all...) +- alias("sync/atomic", "CompareAndSwapInt64", "runtime/internal/atomic", "Cas64", all...) +- alias("sync/atomic", "CompareAndSwapUint32", "runtime/internal/atomic", "Cas", all...) +- alias("sync/atomic", "CompareAndSwapUint64", "runtime/internal/atomic", "Cas64", all...) +- alias("sync/atomic", "CompareAndSwapUintptr", "runtime/internal/atomic", "Cas", p4...) +- alias("sync/atomic", "CompareAndSwapUintptr", "runtime/internal/atomic", "Cas64", p8...) +- +- alias("sync/atomic", "AddInt32", "runtime/internal/atomic", "Xadd", all...) +- alias("sync/atomic", "AddInt64", "runtime/internal/atomic", "Xadd64", all...) +- alias("sync/atomic", "AddUint32", "runtime/internal/atomic", "Xadd", all...) +- alias("sync/atomic", "AddUint64", "runtime/internal/atomic", "Xadd64", all...) +- alias("sync/atomic", "AddUintptr", "runtime/internal/atomic", "Xadd", p4...) +- alias("sync/atomic", "AddUintptr", "runtime/internal/atomic", "Xadd64", p8...) +- +- /******** math/big ********/ +- alias("math/big", "mulWW", "math/bits", "Mul64", p8...) +-} +- +-// findIntrinsic returns a function which builds the SSA equivalent of the +-// function identified by the symbol sym. If sym is not an intrinsic call, returns nil. 
+-func findIntrinsic(sym *types.Sym) intrinsicBuilder { +- if sym == nil || sym.Pkg == nil { +- return nil +- } +- pkg := sym.Pkg.Path +- if sym.Pkg == ir.Pkgs.Runtime { +- pkg = "runtime" +- } +- if base.Flag.Race && pkg == "sync/atomic" { +- // The race detector needs to be able to intercept these calls. +- // We can't intrinsify them. +- return nil +- } +- // Skip intrinsifying math functions (which may contain hard-float +- // instructions) when soft-float +- if Arch.SoftFloat && pkg == "math" { +- return nil +- } +- +- fn := sym.Name +- if ssa.IntrinsicsDisable { +- if pkg == "runtime" && (fn == "getcallerpc" || fn == "getcallersp" || fn == "getclosureptr") { +- // These runtime functions don't have definitions, must be intrinsics. +- } else { +- return nil +- } +- } +- return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}] +-} +- +-func IsIntrinsicCall(n *ir.CallExpr) bool { +- if n == nil { +- return false +- } +- name, ok := n.X.(*ir.Name) +- if !ok { +- return false +- } +- return findIntrinsic(name.Sym()) != nil ++// split breaks up a tuple-typed value into its 2 parts. ++func (s *state) split(v *ssa.Value) (*ssa.Value, *ssa.Value) { ++ p0 := s.newValue1(ssa.OpSelect0, v.Type.FieldType(0), v) ++ p1 := s.newValue1(ssa.OpSelect1, v.Type.FieldType(1), v) ++ return p0, p1 + } + + // intrinsicCall converts a call to a recognized intrinsic function into the intrinsic SSA operation. +-- +2.39.5 + diff --git a/2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch b/2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch new file mode 100644 index 0000000..b48f59a --- /dev/null +++ b/2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch @@ -0,0 +1,1254 @@ +From 7ccd47a906363e0bc476ca09cbcf09dcf961149e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:42:56 +0800 +Subject: [PATCH 109/119] cmd/compile/internal/ssagen: add initial test + coverage for intrinsics + +Add basic test coverage for the intrinisic table - this at least allows +us to tell if intrinsics are added or removed unexpectedly. Code +generation changes resulting from intrinsics is not covered and is +left for test/codegen and others. + +Change-Id: I3d538708b90cd04d3f449945e0fd9388097d683e +Reviewed-on: https://go-review.googlesource.com/c/go/+/605475 +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +--- + .../internal/ssagen/intrinsics_test.go | 1224 +++++++++++++++++ + 1 file changed, 1224 insertions(+) + create mode 100644 src/cmd/compile/internal/ssagen/intrinsics_test.go + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +new file mode 100644 +index 0000000000..74ea276cc0 +--- /dev/null ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -0,0 +1,1224 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++package ssagen ++ ++import ( ++ "internal/buildcfg" ++ "testing" ++) ++ ++type testIntrinsicKey struct { ++ archName string ++ pkg string ++ fn string ++} ++ ++var wantIntrinsics = map[testIntrinsicKey]struct{}{ ++ {"386", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"386", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"386", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"386", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"386", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"386", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"386", "math", "sqrt"}: struct{}{}, ++ {"386", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"386", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"386", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"386", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"386", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"386", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"386", "runtime", "KeepAlive"}: struct{}{}, ++ {"386", "runtime", "getcallerpc"}: struct{}{}, ++ {"386", "runtime", "getcallersp"}: struct{}{}, ++ {"386", "runtime", "getclosureptr"}: struct{}{}, ++ {"386", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ 
{"amd64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"amd64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"amd64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Prefetch"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"amd64", "math", "Ceil"}: struct{}{}, ++ {"amd64", "math", "FMA"}: struct{}{}, ++ {"amd64", "math", "Floor"}: struct{}{}, ++ {"amd64", "math", "RoundToEven"}: struct{}{}, ++ {"amd64", "math", "Trunc"}: struct{}{}, ++ {"amd64", "math", "sqrt"}: struct{}{}, ++ {"amd64", "math/big", "mulWW"}: struct{}{}, ++ {"amd64", "math/bits", "Add"}: struct{}{}, ++ {"amd64", "math/bits", "Add64"}: struct{}{}, ++ {"amd64", "math/bits", "Div"}: struct{}{}, ++ {"amd64", "math/bits", "Div64"}: struct{}{}, ++ {"amd64", "math/bits", "Len"}: struct{}{}, ++ {"amd64", "math/bits", "Len16"}: struct{}{}, ++ {"amd64", "math/bits", "Len32"}: struct{}{}, ++ {"amd64", "math/bits", "Len64"}: struct{}{}, ++ {"amd64", "math/bits", "Len8"}: struct{}{}, ++ {"amd64", "math/bits", "Mul"}: struct{}{}, ++ {"amd64", "math/bits", "Mul64"}: struct{}{}, ++ {"amd64", "math/bits", "OnesCount"}: struct{}{}, ++ {"amd64", "math/bits", "OnesCount16"}: struct{}{}, ++ {"amd64", "math/bits", "OnesCount32"}: struct{}{}, ++ {"amd64", "math/bits", "OnesCount64"}: struct{}{}, ++ {"amd64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"amd64", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft16"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft8"}: struct{}{}, ++ {"amd64", "math/bits", "Sub"}: struct{}{}, ++ {"amd64", "math/bits", "Sub64"}: struct{}{}, ++ {"amd64", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"amd64", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"amd64", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"amd64", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"amd64", "runtime", "KeepAlive"}: struct{}{}, ++ {"amd64", "runtime", "getcallerpc"}: struct{}{}, ++ {"amd64", "runtime", "getcallersp"}: struct{}{}, ++ {"amd64", "runtime", "getclosureptr"}: struct{}{}, ++ {"amd64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"amd64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"amd64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", 
"AddUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"arm", "math", "Abs"}: struct{}{}, ++ {"arm", "math", "FMA"}: struct{}{}, ++ {"arm", "math", "sqrt"}: struct{}{}, ++ {"arm", "math/bits", "Len"}: struct{}{}, ++ {"arm", "math/bits", "Len16"}: struct{}{}, ++ {"arm", "math/bits", "Len32"}: struct{}{}, ++ {"arm", "math/bits", "Len64"}: struct{}{}, ++ {"arm", "math/bits", "Len8"}: struct{}{}, ++ {"arm", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"arm", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"arm", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"arm", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"arm", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"arm", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"arm", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"arm", "runtime", "KeepAlive"}: struct{}{}, ++ {"arm", "runtime", "getcallerpc"}: struct{}{}, ++ {"arm", "runtime", "getcallersp"}: struct{}{}, ++ {"arm", "runtime", "getclosureptr"}: struct{}{}, ++ {"arm", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"arm64", 
"internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"arm64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"arm64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Prefetch"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"arm64", "math", "Abs"}: struct{}{}, ++ {"arm64", "math", "Ceil"}: struct{}{}, ++ {"arm64", "math", "FMA"}: struct{}{}, ++ {"arm64", "math", "Floor"}: struct{}{}, ++ {"arm64", "math", "Round"}: struct{}{}, ++ {"arm64", "math", "RoundToEven"}: struct{}{}, ++ {"arm64", "math", "Trunc"}: struct{}{}, ++ {"arm64", "math", "sqrt"}: struct{}{}, ++ {"arm64", "math/big", "mulWW"}: struct{}{}, ++ {"arm64", "math/bits", "Add"}: struct{}{}, ++ {"arm64", "math/bits", "Add64"}: struct{}{}, ++ {"arm64", "math/bits", "Len"}: struct{}{}, ++ {"arm64", "math/bits", "Len16"}: struct{}{}, ++ {"arm64", "math/bits", "Len32"}: struct{}{}, ++ {"arm64", "math/bits", "Len64"}: struct{}{}, ++ {"arm64", "math/bits", "Len8"}: struct{}{}, ++ {"arm64", "math/bits", "Mul"}: struct{}{}, ++ {"arm64", "math/bits", "Mul64"}: struct{}{}, ++ {"arm64", "math/bits", "OnesCount16"}: struct{}{}, ++ {"arm64", "math/bits", "OnesCount32"}: struct{}{}, ++ {"arm64", "math/bits", "OnesCount64"}: struct{}{}, ++ {"arm64", "math/bits", "Reverse"}: struct{}{}, ++ {"arm64", "math/bits", "Reverse16"}: struct{}{}, ++ {"arm64", 
"math/bits", "Reverse32"}: struct{}{}, ++ {"arm64", "math/bits", "Reverse64"}: struct{}{}, ++ {"arm64", "math/bits", "Reverse8"}: struct{}{}, ++ {"arm64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"arm64", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"arm64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"arm64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"arm64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"arm64", "math/bits", "Sub"}: struct{}{}, ++ {"arm64", "math/bits", "Sub64"}: struct{}{}, ++ {"arm64", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"arm64", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"arm64", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"arm64", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"arm64", "runtime", "KeepAlive"}: struct{}{}, ++ {"arm64", "runtime", "getcallerpc"}: struct{}{}, ++ {"arm64", "runtime", "getcallersp"}: struct{}{}, ++ {"arm64", "runtime", "getclosureptr"}: struct{}{}, ++ {"arm64", "runtime", "publicationBarrier"}: struct{}{}, ++ {"arm64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"arm64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"arm64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"loong64", 
"internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"loong64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"loong64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"loong64", "math", "Abs"}: struct{}{}, ++ {"loong64", "math", "Copysign"}: struct{}{}, ++ {"loong64", "math", "sqrt"}: struct{}{}, ++ {"loong64", "math/big", "mulWW"}: struct{}{}, ++ {"loong64", "math/bits", "Add"}: struct{}{}, ++ {"loong64", "math/bits", "Add64"}: struct{}{}, ++ {"loong64", "math/bits", "Mul"}: struct{}{}, ++ {"loong64", "math/bits", "Mul64"}: struct{}{}, ++ {"loong64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"loong64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"loong64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"loong64", "math/bits", "Sub"}: struct{}{}, ++ {"loong64", "math/bits", "Sub64"}: struct{}{}, ++ {"loong64", "runtime", "KeepAlive"}: struct{}{}, ++ {"loong64", "runtime", "getcallerpc"}: struct{}{}, ++ {"loong64", "runtime", "getcallersp"}: struct{}{}, ++ {"loong64", "runtime", "getclosureptr"}: struct{}{}, ++ {"loong64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"loong64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"loong64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"loong64", 
"sync/atomic", "LoadInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"mips", "math", "Abs"}: struct{}{}, ++ {"mips", "math", "sqrt"}: struct{}{}, ++ {"mips", "math/bits", "Len"}: struct{}{}, ++ {"mips", "math/bits", "Len16"}: struct{}{}, ++ {"mips", "math/bits", "Len32"}: struct{}{}, ++ {"mips", "math/bits", "Len64"}: struct{}{}, ++ {"mips", "math/bits", "Len8"}: struct{}{}, ++ {"mips", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"mips", "math/bits", "TrailingZeros32"}: struct{}{}, ++ 
{"mips", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"mips", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"mips", "runtime", "KeepAlive"}: struct{}{}, ++ {"mips", "runtime", "getcallerpc"}: struct{}{}, ++ {"mips", "runtime", "getcallersp"}: struct{}{}, ++ {"mips", "runtime", "getclosureptr"}: struct{}{}, ++ {"mips", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"mips", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"mips", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"mips", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ 
{"mips64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"mips64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"mips64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"mips64", "math", "Abs"}: struct{}{}, ++ {"mips64", "math", "sqrt"}: struct{}{}, ++ {"mips64", "math/big", "mulWW"}: struct{}{}, ++ {"mips64", "math/bits", "Add"}: struct{}{}, ++ {"mips64", "math/bits", "Add64"}: struct{}{}, ++ {"mips64", "math/bits", "Mul"}: struct{}{}, ++ {"mips64", "math/bits", "Mul64"}: struct{}{}, ++ {"mips64", "math/bits", "Sub"}: struct{}{}, ++ {"mips64", "math/bits", "Sub64"}: struct{}{}, ++ {"mips64", "runtime", "KeepAlive"}: struct{}{}, ++ {"mips64", "runtime", "getcallerpc"}: struct{}{}, ++ {"mips64", "runtime", "getcallersp"}: struct{}{}, ++ {"mips64", "runtime", "getclosureptr"}: struct{}{}, ++ {"mips64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"mips64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"mips64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Casint64"}: 
struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"mips64le", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"mips64le", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"mips64le", "math", "Abs"}: struct{}{}, ++ {"mips64le", "math", "sqrt"}: struct{}{}, ++ {"mips64le", "math/big", "mulWW"}: struct{}{}, ++ {"mips64le", "math/bits", "Add"}: struct{}{}, ++ {"mips64le", "math/bits", "Add64"}: struct{}{}, ++ {"mips64le", "math/bits", "Mul"}: struct{}{}, ++ {"mips64le", "math/bits", "Mul64"}: struct{}{}, ++ {"mips64le", "math/bits", "Sub"}: struct{}{}, ++ {"mips64le", "math/bits", "Sub64"}: struct{}{}, ++ {"mips64le", "runtime", "KeepAlive"}: struct{}{}, ++ {"mips64le", "runtime", "getcallerpc"}: struct{}{}, ++ {"mips64le", "runtime", "getcallersp"}: struct{}{}, ++ {"mips64le", "runtime", "getclosureptr"}: struct{}{}, ++ {"mips64le", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"mips64le", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"mips64le", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "AddInt64"}: struct{}{}, ++ 
{"mips64le", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"mipsle", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"mipsle", 
"internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"mipsle", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"mipsle", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"mipsle", "math", "Abs"}: struct{}{}, ++ {"mipsle", "math", "sqrt"}: struct{}{}, ++ {"mipsle", "math/bits", "Len"}: struct{}{}, ++ {"mipsle", "math/bits", "Len16"}: struct{}{}, ++ {"mipsle", "math/bits", "Len32"}: struct{}{}, ++ {"mipsle", "math/bits", "Len64"}: struct{}{}, ++ {"mipsle", "math/bits", "Len8"}: struct{}{}, ++ {"mipsle", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"mipsle", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"mipsle", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"mipsle", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"mipsle", "runtime", "KeepAlive"}: struct{}{}, ++ {"mipsle", "runtime", "getcallerpc"}: struct{}{}, ++ {"mipsle", "runtime", "getcallersp"}: struct{}{}, ++ {"mipsle", "runtime", "getclosureptr"}: struct{}{}, ++ {"mipsle", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"mipsle", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"mipsle", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"mipsle", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"ppc64", 
"internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"ppc64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "Prefetch"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"ppc64", "math", "Abs"}: struct{}{}, ++ {"ppc64", "math", "Ceil"}: struct{}{}, ++ {"ppc64", "math", "Copysign"}: struct{}{}, ++ {"ppc64", "math", "FMA"}: struct{}{}, ++ {"ppc64", "math", "Floor"}: struct{}{}, ++ {"ppc64", "math", "Round"}: struct{}{}, ++ {"ppc64", "math", "Trunc"}: struct{}{}, ++ {"ppc64", "math", "sqrt"}: struct{}{}, ++ {"ppc64", "math/big", "mulWW"}: struct{}{}, ++ {"ppc64", "math/bits", "Add"}: struct{}{}, ++ {"ppc64", "math/bits", "Add64"}: struct{}{}, ++ {"ppc64", "math/bits", "Len"}: struct{}{}, ++ {"ppc64", "math/bits", "Len16"}: struct{}{}, ++ {"ppc64", "math/bits", "Len32"}: struct{}{}, ++ {"ppc64", "math/bits", "Len64"}: struct{}{}, ++ {"ppc64", "math/bits", "Len8"}: struct{}{}, ++ {"ppc64", "math/bits", "Mul"}: struct{}{}, ++ {"ppc64", "math/bits", "Mul64"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount16"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount32"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount64"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount8"}: struct{}{}, ++ {"ppc64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"ppc64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"ppc64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"ppc64", "math/bits", "Sub"}: struct{}{}, ++ {"ppc64", "math/bits", "Sub64"}: struct{}{}, ++ {"ppc64", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"ppc64", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"ppc64", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"ppc64", "runtime", "KeepAlive"}: struct{}{}, ++ {"ppc64", "runtime", "getcallerpc"}: struct{}{}, ++ {"ppc64", "runtime", "getcallersp"}: struct{}{}, ++ {"ppc64", "runtime", "getclosureptr"}: struct{}{}, ++ {"ppc64", "runtime", "publicationBarrier"}: struct{}{}, ++ {"ppc64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"ppc64", "sync", 
"runtime_LoadAcquintptr"}: struct{}{}, ++ {"ppc64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Storeint64"}: 
struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Prefetch"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"ppc64le", "math", "Abs"}: struct{}{}, ++ {"ppc64le", "math", "Ceil"}: struct{}{}, ++ {"ppc64le", "math", "Copysign"}: struct{}{}, ++ {"ppc64le", "math", "FMA"}: struct{}{}, ++ {"ppc64le", "math", "Floor"}: struct{}{}, ++ {"ppc64le", "math", "Round"}: struct{}{}, ++ {"ppc64le", "math", "Trunc"}: struct{}{}, ++ {"ppc64le", "math", "sqrt"}: struct{}{}, ++ {"ppc64le", "math/big", "mulWW"}: struct{}{}, ++ {"ppc64le", "math/bits", "Add"}: struct{}{}, ++ {"ppc64le", "math/bits", "Add64"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len16"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len32"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len64"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len8"}: struct{}{}, ++ {"ppc64le", "math/bits", "Mul"}: struct{}{}, ++ {"ppc64le", "math/bits", "Mul64"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount16"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount32"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount64"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount8"}: struct{}{}, ++ {"ppc64le", "math/bits", "RotateLeft"}: struct{}{}, ++ {"ppc64le", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"ppc64le", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"ppc64le", "math/bits", "Sub"}: struct{}{}, ++ {"ppc64le", "math/bits", "Sub64"}: struct{}{}, ++ {"ppc64le", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"ppc64le", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"ppc64le", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"ppc64le", "runtime", "KeepAlive"}: struct{}{}, ++ {"ppc64le", "runtime", "getcallerpc"}: struct{}{}, ++ {"ppc64le", "runtime", "getcallersp"}: struct{}{}, ++ {"ppc64le", "runtime", "getclosureptr"}: struct{}{}, ++ {"ppc64le", "runtime", "publicationBarrier"}: struct{}{}, ++ {"ppc64le", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"ppc64le", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"ppc64le", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "AddUint64"}: struct{}{}, ++ 
{"ppc64le", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xadd64"}: 
struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"riscv64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"riscv64", "math", "Abs"}: struct{}{}, ++ {"riscv64", "math", "Copysign"}: struct{}{}, ++ {"riscv64", "math", "FMA"}: struct{}{}, ++ {"riscv64", "math", "sqrt"}: struct{}{}, ++ {"riscv64", "math/big", "mulWW"}: struct{}{}, ++ {"riscv64", "math/bits", "Add"}: struct{}{}, ++ {"riscv64", "math/bits", "Add64"}: struct{}{}, ++ {"riscv64", "math/bits", "Mul"}: struct{}{}, ++ {"riscv64", "math/bits", "Mul64"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft16"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft8"}: struct{}{}, ++ {"riscv64", "math/bits", "Sub"}: struct{}{}, ++ {"riscv64", "math/bits", "Sub64"}: struct{}{}, ++ {"riscv64", "runtime", "KeepAlive"}: struct{}{}, ++ {"riscv64", "runtime", "getcallerpc"}: struct{}{}, ++ {"riscv64", "runtime", "getcallersp"}: struct{}{}, ++ {"riscv64", "runtime", "getclosureptr"}: struct{}{}, ++ {"riscv64", "runtime", "publicationBarrier"}: struct{}{}, ++ {"riscv64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"riscv64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"riscv64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"s390x", 
"internal/runtime/atomic", "And"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"s390x", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"s390x", "math", "Ceil"}: struct{}{}, ++ {"s390x", "math", "FMA"}: struct{}{}, ++ {"s390x", "math", "Floor"}: struct{}{}, ++ {"s390x", "math", "Round"}: struct{}{}, ++ {"s390x", "math", "RoundToEven"}: struct{}{}, ++ 
{"s390x", "math", "Trunc"}: struct{}{}, ++ {"s390x", "math", "sqrt"}: struct{}{}, ++ {"s390x", "math/big", "mulWW"}: struct{}{}, ++ {"s390x", "math/bits", "Add"}: struct{}{}, ++ {"s390x", "math/bits", "Add64"}: struct{}{}, ++ {"s390x", "math/bits", "Len"}: struct{}{}, ++ {"s390x", "math/bits", "Len16"}: struct{}{}, ++ {"s390x", "math/bits", "Len32"}: struct{}{}, ++ {"s390x", "math/bits", "Len64"}: struct{}{}, ++ {"s390x", "math/bits", "Len8"}: struct{}{}, ++ {"s390x", "math/bits", "Mul"}: struct{}{}, ++ {"s390x", "math/bits", "Mul64"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount16"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount32"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount64"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount8"}: struct{}{}, ++ {"s390x", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"s390x", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"s390x", "math/bits", "RotateLeft"}: struct{}{}, ++ {"s390x", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"s390x", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"s390x", "math/bits", "Sub"}: struct{}{}, ++ {"s390x", "math/bits", "Sub64"}: struct{}{}, ++ {"s390x", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"s390x", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"s390x", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"s390x", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"s390x", "runtime", "KeepAlive"}: struct{}{}, ++ {"s390x", "runtime", "getcallerpc"}: struct{}{}, ++ {"s390x", "runtime", "getcallersp"}: struct{}{}, ++ {"s390x", "runtime", "getclosureptr"}: struct{}{}, ++ {"s390x", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"s390x", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"s390x", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ 
{"wasm", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"wasm", "math", "Abs"}: struct{}{}, ++ {"wasm", "math", "Ceil"}: struct{}{}, ++ {"wasm", "math", "Copysign"}: struct{}{}, ++ {"wasm", "math", "Floor"}: struct{}{}, ++ {"wasm", "math", "RoundToEven"}: struct{}{}, ++ {"wasm", "math", "Trunc"}: struct{}{}, ++ {"wasm", "math", "sqrt"}: struct{}{}, ++ {"wasm", "math/bits", "Len"}: struct{}{}, ++ {"wasm", "math/bits", "Len16"}: struct{}{}, ++ {"wasm", "math/bits", "Len32"}: struct{}{}, ++ {"wasm", "math/bits", "Len64"}: struct{}{}, ++ {"wasm", "math/bits", "Len8"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount16"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount32"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount64"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount8"}: struct{}{}, ++ {"wasm", "math/bits", "RotateLeft"}: struct{}{}, ++ {"wasm", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"wasm", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"wasm", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"wasm", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"wasm", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"wasm", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"wasm", "runtime", "KeepAlive"}: struct{}{}, ++ {"wasm", "runtime", "getcallerpc"}: struct{}{}, ++ {"wasm", "runtime", "getcallersp"}: struct{}{}, ++ {"wasm", "runtime", "getclosureptr"}: struct{}{}, ++ {"wasm", "runtime", "slicebytetostringtmp"}: struct{}{}, ++} ++ ++var wantIntrinsicsPower10 = map[testIntrinsicKey]struct{}{ ++ {"ppc64", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes64"}: struct{}{}, ++} ++ ++func TestIntrinsics(t *testing.T) { ++ initIntrinsics() ++ ++ want := make(map[testIntrinsicKey]struct{}) ++ for ik, iv := range wantIntrinsics { ++ want[ik] = iv ++ } ++ if buildcfg.GOPPC64 >= 10 { ++ for ik, iv := range wantIntrinsicsPower10 { ++ want[ik] = iv ++ } ++ } ++ ++ got := make(map[testIntrinsicKey]struct{}) ++ for ik, _ := range intrinsics { ++ got[testIntrinsicKey{ik.arch.Name, ik.pkg, ik.fn}] = struct{}{} ++ } ++ for ik, _ := range got { ++ if _, found := want[ik]; !found { ++ t.Errorf("Got unwanted intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) ++ } ++ } ++ for ik, _ := range want { ++ if _, found := got[ik]; !found { ++ t.Errorf("Want intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) ++ } ++ } ++} +-- +2.39.5 + diff --git a/2110-cmd-dist-internal-add-GOARM64-environment-variable.patch b/2110-cmd-dist-internal-add-GOARM64-environment-variable.patch new file mode 100644 index 0000000..847f7bb --- /dev/null +++ b/2110-cmd-dist-internal-add-GOARM64-environment-variable.patch @@ -0,0 +1,232 @@ +From 8891253e664aabdeb96a60aab2097b34076f0b18 Mon Sep 17 00:00:00 2001 +From: Andrey Bokhanko +Date: Fri, 26 Sep 2025 17:47:28 +0800 +Subject: [PATCH 110/119] cmd/dist,internal: add GOARM64 environment variable + +Adds GOARM64 environment variable with accepted range of values "v8.{0-9}", +"v9.{0-5}" and optional ",lse" and ",crypto" suffixes. 
+ +Right now it doesn't affect anything, but can be used in the future to +selectively target specific versions of different ARM64 hardware. + +For #60905 + +Change-Id: I6d530041b6931aa884e34f719f8ec41b1cb03ece +Reviewed-on: https://go-review.googlesource.com/c/go/+/559555 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mauri de Souza Meneguzzo +Reviewed-by: Cherry Mui +Reviewed-by: Shu-Chun Weng +Reviewed-by: Fannie Zhang +--- + src/cmd/go/alldocs.go | 2 + + src/cmd/go/internal/help/helpdoc.go | 2 + + src/internal/buildcfg/cfg.go | 29 ++++++++-- + src/internal/buildcfg/cfg_test.go | 85 +++++++++++++++++++++++++++++ + 4 files changed, 114 insertions(+), 4 deletions(-) + +diff --git a/src/cmd/go/alldocs.go b/src/cmd/go/alldocs.go +index 32e2ba15e9..fd9602a9b3 100644 +--- a/src/cmd/go/alldocs.go ++++ b/src/cmd/go/alldocs.go +@@ -1965,6 +1965,8 @@ + // correspond to the amd64.v1, amd64.v2, and amd64.v3 feature build tags. + // - For GOARCH=arm, GOARM=5, 6, and 7 + // correspond to the arm.5, arm.6, and arm.7 feature build tags. ++// - For GOARCH=arm64, GOARM64=v8.{0-9} and v9.{0-5} ++// correspond to the arm64.v8.{0-9} and arm64.v9.{0-5} feature build tags. + // - For GOARCH=mips or mipsle, + // GOMIPS=hardfloat and softfloat + // correspond to the mips.hardfloat and mips.softfloat +diff --git a/src/cmd/go/internal/help/helpdoc.go b/src/cmd/go/internal/help/helpdoc.go +index 12b667e9be..b5f820c159 100644 +--- a/src/cmd/go/internal/help/helpdoc.go ++++ b/src/cmd/go/internal/help/helpdoc.go +@@ -897,6 +897,8 @@ The defined architecture feature build tags are: + correspond to the amd64.v1, amd64.v2, and amd64.v3 feature build tags. + - For GOARCH=arm, GOARM=5, 6, and 7 + correspond to the arm.5, arm.6, and arm.7 feature build tags. ++ - For GOARCH=arm64, GOARM64=v8.{0-9} and v9.{0-5} ++ correspond to the arm64.v8.{0-9} and arm64.v9.{0-5} feature build tags. + - For GOARCH=mips or mipsle, + GOMIPS=hardfloat and softfloat + correspond to the mips.hardfloat and mips.softfloat +diff --git a/src/internal/buildcfg/cfg.go b/src/internal/buildcfg/cfg.go +index f6fb2d232f..97d078e954 100644 +--- a/src/internal/buildcfg/cfg.go ++++ b/src/internal/buildcfg/cfg.go +@@ -93,6 +93,12 @@ type Goarm64Features struct { + Version string + // Large System Extension + LSE bool ++ // ARM v8.0 Cryptographic Extension. It includes the following features: ++ // * FEAT_AES, which includes the AESD and AESE instructions. ++ // * FEAT_PMULL, which includes the PMULL, PMULL2 instructions. ++ // * FEAT_SHA1, which includes the SHA1* instructions. ++ // * FEAT_SHA256, which includes the SHA256* instructions. 
++ Crypto bool + // Kunpeng atomic optimize + KPAtomicOpt bool + } +@@ -102,7 +108,9 @@ func (g Goarm64Features) String() string { + if g.LSE { + arm64Str += ",lse" + } +- ++ if g.Crypto { ++ arm64Str += ",crypto" ++ } + if g.KPAtomicOpt { + arm64Str += ",kpatomicopt" + } +@@ -113,10 +121,12 @@ func (g Goarm64Features) String() string { + func ParseGoarm64(v string) (g Goarm64Features, e error) { + const ( + lseOpt = ",lse" ++ cryptoOpt = ",crypto" + kpAtomicOpt = ",kpatomicopt" + ) + + g.LSE = false ++ g.Crypto = false + g.KPAtomicOpt = false + + // We allow any combination of suffixes, in any order +@@ -127,6 +137,12 @@ func ParseGoarm64(v string) (g Goarm64Features, e error) { + continue + } + ++ if strings.HasSuffix(v, cryptoOpt) { ++ g.Crypto = true ++ v = v[:len(v)-len(cryptoOpt)] ++ continue ++ } ++ + if strings.HasSuffix(v, kpAtomicOpt) { + if os.Getenv("AI_OPT") == "1" { + g.KPAtomicOpt = true +@@ -139,12 +155,16 @@ func ParseGoarm64(v string) (g Goarm64Features, e error) { + } + + switch v { +- case "v8.0", "v8.1", "v8.2", "v8.3", "v8.4", "v8.5", "v8.6", "v8.7", "v8.8", "v8.9", +- "v9.0", "v9.1", "v9.2", "v9.4", "v9.5": ++ case "v8.0": ++ g.Version = v ++ case "v8.1", "v8.2", "v8.3", "v8.4", "v8.5", "v8.6", "v8.7", "v8.8", "v8.9", ++ "v9.0", "v9.1", "v9.2", "v9.3", "v9.4", "v9.5": + g.Version = v ++ // LSE extension is mandatory starting from 8.1 ++ g.LSE = true + default: + e = fmt.Errorf("invalid GOARM64: must start with v8.{0-9} or v9.{0-5} and may optionally end in %q and/or %q", +- lseOpt, kpAtomicOpt) ++ lseOpt, cryptoOpt, kpAtomicOpt) + g.Version = defaultGOARM64 + } + +@@ -177,6 +197,7 @@ func (g Goarm64Features) Supports(s string) bool { + if major == g_major { + return minor <= g_minor + } else if g_major == '9' { ++ // v9.0 diverged from v8.5. This means we should compare with g_minor increased by five. 
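// For illustration (not part of the patch): a minimal sketch of what ParseGoarm64
// accepts after this change, assuming the internal/buildcfg package context; the
// exampleParseGoarm64 name is hypothetical and the inputs are examples, with the
// behaviour following the suffix handling and switch in the hunk above.
func exampleParseGoarm64() {
	g, err := ParseGoarm64("v8.0,lse,crypto")
	if err != nil || g.Version != "v8.0" || !g.LSE || !g.Crypto {
		panic("suffixes should set LSE and Crypto on a v8.0 base")
	}

	g, err = ParseGoarm64("v9.3") // newly accepted by this patch
	if err != nil || !g.LSE {
		panic("LSE is implied from v8.1 onwards")
	}

	if _, err := ParseGoarm64("v7.0"); err == nil {
		panic("only v8.{0-9} and v9.{0-5}, plus optional suffixes, are valid")
	}
}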
+ return minor <= g_minor+5 + } else { + return false +diff --git a/src/internal/buildcfg/cfg_test.go b/src/internal/buildcfg/cfg_test.go +index 1513cdc9b0..67f9d82d92 100644 +--- a/src/internal/buildcfg/cfg_test.go ++++ b/src/internal/buildcfg/cfg_test.go +@@ -41,4 +41,89 @@ func TestConfigFlags(t *testing.T) { + if _ = goriscv64(); Error == nil { + t.Errorf("Wrong parsing of RISCV64=rva22") + } ++ Error = nil ++ os.Setenv("GOARM64", "v7.0") ++ if _ = goarm64(); Error == nil { ++ t.Errorf("Wrong parsing of GOARM64=7.0") ++ } ++ Error = nil ++ os.Setenv("GOARM64", "8.0") ++ if _ = goarm64(); Error == nil { ++ t.Errorf("Wrong parsing of GOARM64=8.0") ++ } ++ Error = nil ++ os.Setenv("GOARM64", "v8.0,lsb") ++ if _ = goarm64(); Error == nil { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,lsb") ++ } ++ os.Setenv("GOARM64", "v8.0,lse") ++ if goarm64().Version != "v8.0" || goarm64().LSE != true || goarm64().Crypto != false { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,lse") ++ } ++ os.Setenv("GOARM64", "v8.0,crypto") ++ if goarm64().Version != "v8.0" || goarm64().LSE != false || goarm64().Crypto != true { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,crypto") ++ } ++ os.Setenv("GOARM64", "v8.0,crypto,lse") ++ if goarm64().Version != "v8.0" || goarm64().LSE != true || goarm64().Crypto != true { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,crypto,lse") ++ } ++ os.Setenv("GOARM64", "v8.0,lse,crypto") ++ if goarm64().Version != "v8.0" || goarm64().LSE != true || goarm64().Crypto != true { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,lse,crypto") ++ } ++ os.Setenv("GOARM64", "v9.0") ++ if goarm64().Version != "v9.0" || goarm64().LSE != true || goarm64().Crypto != false { ++ t.Errorf("Wrong parsing of GOARM64=v9.0") ++ } ++} ++ ++func TestGoarm64FeaturesSupports(t *testing.T) { ++ g := parseGoarm64("v9.3") ++ ++ if !g.Supports("v9.3") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, v9.3") ++ } ++ ++ if g.Supports("v9.4") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, v9.4") ++ } ++ ++ if !g.Supports("v8.8") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, v8.8") ++ } ++ ++ if g.Supports("v8.9") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, v8.9") ++ } ++ ++ if g.Supports(",lse") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, ,lse") ++ } ++} ++ ++func TestGogoarchTags(t *testing.T) { ++ old_goarch := GOARCH ++ old_goarm64 := GOARM64 ++ ++ GOARCH = "arm64" ++ ++ os.Setenv("GOARM64", "v9.5") ++ GOARM64 = goarm64() ++ tags := gogoarchTags() ++ want := []string{"arm64.v9.0", "arm64.v9.1", "arm64.v9.2", "arm64.v9.3", "arm64.v9.4", "arm64.v9.5", ++ "arm64.v8.0", "arm64.v8.1", "arm64.v8.2", "arm64.v8.3", "arm64.v8.4", "arm64.v8.5", "arm64.v8.6", "arm64.v8.7", "arm64.v8.8", "arm64.v8.9"} ++ if len(tags) != len(want) { ++ t.Errorf("Wrong number of tags for GOARM64=v9.5") ++ } else { ++ for i, v := range tags { ++ if v != want[i] { ++ t.Error("Wrong tags for GOARM64=v9.5") ++ break ++ } ++ } ++ } ++ ++ GOARCH = old_goarch ++ GOARM64 = old_goarm64 + } +-- +2.39.5 + diff --git a/2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch b/2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch new file mode 100644 index 0000000..8975dbe --- /dev/null +++ b/2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch @@ -0,0 +1,706 @@ +From 6f517fe363d2bc8ce99f839f6736f6802790fa1a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:20 +0800 +Subject: [PATCH 111/119] cmd/compile/internal/ssagen: provide + intrinsicBuilders + +Create an 
intrinsicBuilders type that has functions for adding and +looking up intrinsics. This makes the implementation more self contained, +readable and testable. Additionally, pass an *intrinsicBuildConfig to +initIntrinsics to improve testability without needing to modify package +level variables. + +Change-Id: I0ee0a19c192dd6da9f1c5f1c29b98a3ad8161fe2 +Reviewed-on: https://go-review.googlesource.com/c/go/+/605478 +Reviewed-by: David Chase +Reviewed-by: Keith Randall +Auto-Submit: Joel Sing +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/ssagen/intrinsics.go | 227 ++++++++---------- + .../internal/ssagen/intrinsics_test.go | 195 +++++++++------ + src/cmd/compile/internal/ssagen/ssa.go | 2 +- + 3 files changed, 224 insertions(+), 200 deletions(-) + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index 59eb1869bb..c62837cd5b 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -15,7 +15,7 @@ import ( + "cmd/internal/sys" + ) + +-var intrinsics map[intrinsicKey]intrinsicBuilder ++var intrinsics intrinsicBuilders + + // An intrinsicBuilder converts a call node n into an ssa value that + // implements that call as an intrinsic. args is a list of arguments to the func. +@@ -27,8 +27,80 @@ type intrinsicKey struct { + fn string + } + +-func initIntrinsics() { +- intrinsics = map[intrinsicKey]intrinsicBuilder{} ++// intrinsicBuildConfig specifies the config to use for intrinsic building. ++type intrinsicBuildConfig struct { ++ instrumenting bool ++ ++ go386 string ++ goamd64 int ++ goarm int ++ goarm64 buildcfg.Goarm64Features ++ gomips string ++ gomips64 string ++ goppc64 int ++ goriscv64 int ++} ++ ++type intrinsicBuilders map[intrinsicKey]intrinsicBuilder ++ ++// add adds the intrinsic builder b for pkg.fn for the given architecture. ++func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) { ++ ib[intrinsicKey{arch, pkg, fn}] = b ++} ++ ++// addForArchs adds the intrinsic builder b for pkg.fn for the given architectures. ++func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { ++ for _, arch := range archs { ++ ib.add(arch, pkg, fn, b) ++ } ++} ++ ++// addForFamilies does the same as addForArchs but operates on architecture families. ++func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { ++ for _, arch := range sys.Archs { ++ if arch.InFamily(archFamilies...) { ++ intrinsics.add(arch, pkg, fn, b) ++ } ++ } ++} ++ ++// alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs ++// for which targetPkg.targetFn already exists. ++func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) { ++ // TODO(jsing): Consider making this work even if the alias is added ++ // before the intrinsic. ++ aliased := false ++ for _, arch := range archs { ++ if b := intrinsics.lookup(arch, targetPkg, targetFn); b != nil { ++ intrinsics.add(arch, pkg, fn, b) ++ aliased = true ++ } ++ } ++ if !aliased { ++ panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn)) ++ } ++} ++ ++// lookup looks up the intrinsic for a pkg.fn on the specified architecture. 
++func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder { ++ return intrinsics[intrinsicKey{arch, pkg, fn}] ++} ++ ++func initIntrinsics(cfg *intrinsicBuildConfig) { ++ if cfg == nil { ++ cfg = &intrinsicBuildConfig{ ++ instrumenting: base.Flag.Cfg.Instrumenting, ++ go386: buildcfg.GO386, ++ goamd64: buildcfg.GOAMD64, ++ goarm: buildcfg.GOARM, ++ goarm64: buildcfg.GOARM64, ++ gomips: buildcfg.GOMIPS, ++ gomips64: buildcfg.GOMIPS64, ++ goppc64: buildcfg.GOPPC64, ++ goriscv64: buildcfg.GORISCV64, ++ } ++ } ++ intrinsics = intrinsicBuilders{} + + var p4 []*sys.Arch + var p8 []*sys.Arch +@@ -45,36 +117,18 @@ func initIntrinsics() { + } + all := sys.Archs[:] + +- // add adds the intrinsic b for pkg.fn for the given list of architectures. + add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { +- for _, a := range archs { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- } ++ intrinsics.addForArchs(pkg, fn, b, archs...) + } +- // addF does the same as add but operates on architecture families. + addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { +- for _, a := range sys.Archs { +- if a.InFamily(archFamilies...) { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- } +- } ++ intrinsics.addForFamilies(pkg, fn, b, archFamilies...) + } +- // alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists. + alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) { +- aliased := false +- for _, a := range archs { +- if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- aliased = true +- } +- } +- if !aliased { +- panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn)) +- } ++ intrinsics.alias(pkg, fn, pkg2, fn2, archs...) + } + + /******** runtime ********/ +- if !base.Flag.Cfg.Instrumenting { ++ if !cfg.instrumenting { + add("runtime", "slicebytetostringtmp", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + // Compiler frontend optimizations emit OBYTES2STRTMP nodes +@@ -125,18 +179,18 @@ func initIntrinsics() { + sys.ARM64, sys.PPC64, sys.RISCV64) + + brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} +- if buildcfg.GOPPC64 >= 10 { ++ if cfg.goppc64 >= 10 { + // Use only on Power10 as the new byte reverse instructions that Power10 provide + // make it worthwhile as an intrinsic + brev_arch = append(brev_arch, sys.PPC64) + } +- /******** internal/runtime/sys ********/ +- addF("internal/runtime/sys", "Bswap32", ++ /******** runtime/internal/sys ********/ ++ addF("runtime/internal/sys", "Bswap32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) + }, + brev_arch...) 
+- addF("internal/runtime/sys", "Bswap64", ++ addF("runtime/internal/sys", "Bswap64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) + }, +@@ -152,9 +206,9 @@ func initIntrinsics() { + + // Make Prefetch intrinsics for supported platforms + // On the unsupported platforms stub function will be eliminated +- addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache), ++ addF("runtime/internal/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache), + sys.AMD64, sys.ARM64, sys.PPC64) +- addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed), ++ addF("runtime/internal/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed), + sys.AMD64, sys.ARM64, sys.PPC64) + + /******** internal/runtime/atomic ********/ +@@ -258,7 +312,7 @@ func initIntrinsics() { + makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder { + + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOARM64.LSE || buildcfg.GOARM64.KPAtomicOpt { ++ if cfg.goarm64.LSE || cfg.goarm64.KPAtomicOpt { + emit(s, n, args, op1, typ, needReturn) + } else { + // Target Atomic feature is identified by dynamic detection +@@ -297,9 +351,6 @@ func initIntrinsics() { + makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { + return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true) + } +- makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { +- return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false) +- } + + atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { + v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem()) +@@ -400,67 +451,6 @@ func initIntrinsics() { + }, + sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) + +- // arm64 always uses the new-style atomic logical operations, for both the +- // old and new style API. 
+- addF("internal/runtime/atomic", "And8", +- makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "Or8", +- makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "And64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "And32", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "And", +- makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "Or64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "Or32", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "Or", +- makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), +- sys.ARM64) +- +- // New-style atomic logical operations, which return the old memory value. +- addF("internal/runtime/atomic", "And64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) +- p0, p1 := s.split(v) +- s.vars[memVar] = p1 +- return p0 +- }, +- sys.AMD64) +- addF("internal/runtime/atomic", "And32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) +- p0, p1 := s.split(v) +- s.vars[memVar] = p1 +- return p0 +- }, +- sys.AMD64) +- addF("internal/runtime/atomic", "Or64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) +- p0, p1 := s.split(v) +- s.vars[memVar] = p1 +- return p0 +- }, +- sys.AMD64) +- addF("internal/runtime/atomic", "Or32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) +- p0, p1 := s.split(v) +- s.vars[memVar] = p1 +- return p0 +- }, +- sys.AMD64) +- + // Aliases for atomic load operations + alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...) + alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...) +@@ -508,9 +498,9 @@ func initIntrinsics() { + alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...) + alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...) 
+ +- // Aliases for atomic And/Or operations +- alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64) +- alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64) ++ // // Aliases for atomic And/Or operations ++ // alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64) ++ // alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64) + + /******** math ********/ + addF("math", "sqrt", +@@ -565,7 +555,7 @@ func initIntrinsics() { + return s.variable(n, types.Types[types.TFLOAT64]) + } + +- if buildcfg.GOAMD64 >= 3 { ++ if cfg.goamd64 >= 3 { + return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) + } + +@@ -631,7 +621,7 @@ func initIntrinsics() { + + makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOAMD64 >= 2 { ++ if cfg.goamd64 >= 2 { + return s.newValue1(op, types.Types[types.TFLOAT64], args[0]) + } + +@@ -727,12 +717,12 @@ func initIntrinsics() { + return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) + }, + sys.S390X) +- alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...) +- alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...) ++ alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) ++ alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) + // ReverseBytes inlines correctly, no need to intrinsify it. + // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate + // On Power10, 16-bit rotate is not available so use BRH instruction +- if buildcfg.GOPPC64 >= 10 { ++ if cfg.goppc64 >= 10 { + addF("math/bits", "ReverseBytes16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) +@@ -847,7 +837,7 @@ func initIntrinsics() { + + makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOAMD64 >= 2 { ++ if cfg.goamd64 >= 2 { + return s.newValue1(op, types.Types[types.TINT], args[0]) + } + +@@ -941,12 +931,12 @@ func initIntrinsics() { + sys.AMD64) + alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64) + +- alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...) +- alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...) +- alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...) +- alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...) +- alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...) +- alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...) ++ alias("runtime/internal/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...) ++ alias("runtime/internal/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...) ++ alias("runtime/internal/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...) ++ alias("runtime/internal/sys", "Len8", "math/bits", "Len8", all...) ++ alias("runtime/internal/sys", "Len64", "math/bits", "Len64", all...) ++ alias("runtime/internal/sys", "OnesCount64", "math/bits", "OnesCount64", all...) 
+ + /******** sync/atomic ********/ + +@@ -988,17 +978,6 @@ func initIntrinsics() { + alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...) + alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...) + +- alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) +- + /******** math/big ********/ + alias("math/big", "mulWW", "math/bits", "Mul64", p8...) + } +@@ -1032,14 +1011,14 @@ func findIntrinsic(sym *types.Sym) intrinsicBuilder { + return nil + } + } +- return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}] ++ return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn) + } + + func IsIntrinsicCall(n *ir.CallExpr) bool { + if n == nil { + return false + } +- name, ok := n.Fun.(*ir.Name) ++ name, ok := n.X.(*ir.Name) + if !ok { + return false + } +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index 74ea276cc0..d725e4092a 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -7,6 +7,8 @@ package ssagen + import ( + "internal/buildcfg" + "testing" ++ ++ "cmd/internal/sys" + ) + + type testIntrinsicKey struct { +@@ -17,11 +19,11 @@ type testIntrinsicKey struct { + + var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"386", "internal/runtime/math", "MulUintptr"}: struct{}{}, +- {"386", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"386", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"386", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"386", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"386", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"386", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"386", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"386", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"386", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"386", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"386", "math", "sqrt"}: struct{}{}, + {"386", "math/bits", "ReverseBytes32"}: struct{}{}, + {"386", "math/bits", "ReverseBytes64"}: struct{}{}, +@@ -79,16 +81,16 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"amd64", "internal/runtime/math", "Add64"}: struct{}{}, + {"amd64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"amd64", "internal/runtime/math", "MulUintptr"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "Len64"}: 
struct{}{}, +- {"amd64", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "Prefetch"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Prefetch"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"amd64", "math", "Ceil"}: struct{}{}, + {"amd64", "math", "FMA"}: struct{}{}, + {"amd64", "math", "Floor"}: struct{}{}, +@@ -157,13 +159,13 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"amd64", "sync/atomic", "SwapUint32"}: struct{}{}, + {"amd64", "sync/atomic", "SwapUint64"}: struct{}{}, + {"amd64", "sync/atomic", "SwapUintptr"}: struct{}{}, +- {"arm", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"arm", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"arm", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"arm", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"arm", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"arm", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"arm", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"arm", "math", "Abs"}: struct{}{}, + {"arm", "math", "FMA"}: struct{}{}, + {"arm", "math", "sqrt"}: struct{}{}, +@@ -225,16 +227,16 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"arm64", "internal/runtime/math", "Add64"}: struct{}{}, + {"arm64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"arm64", "internal/runtime/math", "MulUintptr"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Prefetch"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"arm64", 
"runtime/internal/sys", "Len8"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "Prefetch"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"arm64", "math", "Abs"}: struct{}{}, + {"arm64", "math", "Ceil"}: struct{}{}, + {"arm64", "math", "FMA"}: struct{}{}, +@@ -423,11 +425,11 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"mips", "internal/runtime/atomic", "Xchg"}: struct{}{}, + {"mips", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, + {"mips", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, +- {"mips", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"mips", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"mips", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"mips", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"mips", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"mips", "math", "Abs"}: struct{}{}, + {"mips", "math", "sqrt"}: struct{}{}, + {"mips", "math/bits", "Len"}: struct{}{}, +@@ -666,11 +668,11 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"mipsle", "internal/runtime/atomic", "Xchg"}: struct{}{}, + {"mipsle", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, + {"mipsle", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"mipsle", "math", "Abs"}: struct{}{}, + {"mipsle", "math", "sqrt"}: struct{}{}, + {"mipsle", "math/bits", "Len"}: struct{}{}, +@@ -748,13 +750,13 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"ppc64", "internal/runtime/math", "Add64"}: struct{}{}, + {"ppc64", "internal/runtime/math", "Mul64"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "Prefetch"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"ppc64", 
"runtime/internal/sys", "Prefetch"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, + {"ppc64", "math", "Abs"}: struct{}{}, + {"ppc64", "math", "Ceil"}: struct{}{}, + {"ppc64", "math", "Copysign"}: struct{}{}, +@@ -862,13 +864,13 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"ppc64le", "internal/runtime/math", "Add64"}: struct{}{}, + {"ppc64le", "internal/runtime/math", "Mul64"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Prefetch"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Prefetch"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, + {"ppc64le", "math", "Abs"}: struct{}{}, + {"ppc64le", "math", "Ceil"}: struct{}{}, + {"ppc64le", "math", "Copysign"}: struct{}{}, +@@ -1072,14 +1074,14 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"s390x", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"s390x", "internal/runtime/math", "Add64"}: struct{}{}, + {"s390x", "internal/runtime/math", "Mul64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"s390x", "math", "Ceil"}: struct{}{}, + {"s390x", "math", "FMA"}: struct{}{}, + {"s390x", "math", "Floor"}: struct{}{}, +@@ -1145,12 +1147,12 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"s390x", "sync/atomic", "SwapUint32"}: struct{}{}, + {"s390x", "sync/atomic", "SwapUint64"}: struct{}{}, + {"s390x", "sync/atomic", "SwapUintptr"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"wasm", 
"internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"wasm", "math", "Abs"}: struct{}{}, + {"wasm", "math", "Ceil"}: struct{}{}, + {"wasm", "math", "Copysign"}: struct{}{}, +@@ -1182,20 +1184,20 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + } + + var wantIntrinsicsPower10 = map[testIntrinsicKey]struct{}{ +- {"ppc64", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"ppc64", "math/bits", "ReverseBytes16"}: struct{}{}, + {"ppc64", "math/bits", "ReverseBytes32"}: struct{}{}, + {"ppc64", "math/bits", "ReverseBytes64"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"ppc64le", "math/bits", "ReverseBytes16"}: struct{}{}, + {"ppc64le", "math/bits", "ReverseBytes32"}: struct{}{}, + {"ppc64le", "math/bits", "ReverseBytes64"}: struct{}{}, + } + + func TestIntrinsics(t *testing.T) { +- initIntrinsics() ++ initIntrinsics(nil) + + want := make(map[testIntrinsicKey]struct{}) + for ik, iv := range wantIntrinsics { +@@ -1222,3 +1224,46 @@ func TestIntrinsics(t *testing.T) { + } + } + } ++ ++func TestIntrinsicBuilders(t *testing.T) { ++ cfg := &intrinsicBuildConfig{} ++ initIntrinsics(cfg) ++ ++ for _, arch := range sys.Archs { ++ if intrinsics.lookup(arch, "runtime", "getcallersp") == nil { ++ t.Errorf("No intrinsic for runtime.getcallersp on arch %v", arch) ++ } ++ } ++ ++ if intrinsics.lookup(sys.ArchAMD64, "runtime", "slicebytetostringtmp") == nil { ++ t.Error("No intrinsic for runtime.slicebytetostringtmp") ++ } ++ ++ if intrinsics.lookup(sys.ArchRISCV64, "runtime", "publicationBarrier") == nil { ++ t.Errorf("No intrinsic for runtime.publicationBarrier on arch %v", sys.ArchRISCV64) ++ } ++ ++ if intrinsics.lookup(sys.ArchAMD64, "runtime/internal/sys", "Bswap32") == nil { ++ t.Errorf("No intrinsic for runtime/internal/sys.Bswap32 on arch %v", sys.ArchAMD64) ++ } ++ if intrinsics.lookup(sys.ArchAMD64, "runtime/internal/sys", "Bswap64") == nil { ++ t.Errorf("No intrinsic for runtime/internal/sys.Bswap64 on arch %v", sys.ArchAMD64) ++ } ++ ++ if intrinsics.lookup(sys.ArchPPC64, "runtime/internal/sys", "Bswap64") != nil { ++ t.Errorf("Found intrinsic for runtime/internal/sys.Bswap64 on arch %v", sys.ArchPPC64) ++ } ++ ++ cfg.goppc64 = 10 ++ cfg.instrumenting = true ++ ++ initIntrinsics(cfg) ++ ++ if intrinsics.lookup(sys.ArchAMD64, "runtime", "slicebytetostringtmp") != nil { ++ t.Error("Intrinsic incorrectly exists for runtime.slicebytetostringtmp") ++ } ++ ++ if intrinsics.lookup(sys.ArchPPC64, "runtime/internal/sys", "Bswap64") == nil { ++ t.Errorf("No intrinsic for runtime/internal/sys.Bswap64 on arch %v", sys.ArchPPC64) ++ } ++} +diff --git a/src/cmd/compile/internal/ssagen/ssa.go 
b/src/cmd/compile/internal/ssagen/ssa.go +index 0f6f2de4a7..71ad2df0a7 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -209,7 +209,7 @@ func InitConfig() { + } + + func InitTables() { +- initIntrinsics() ++ initIntrinsics(nil) + } + + // AbiForBodylessFuncStackMap returns the ABI for a bodyless function's stack map. +-- +2.39.5 + diff --git a/2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch b/2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch new file mode 100644 index 0000000..02c47f7 --- /dev/null +++ b/2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch @@ -0,0 +1,155 @@ +From e22dc7c7ba5db0c2850f8d11cc5bacc0c99598dd Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 112/119] cmd/compile/internal/ssagen: improve intrinsic test + +Now that we can pass configuration to initIntrinsics, clean up the +intrinsic test and always enable power10. Additionally, provide an +-update flag that prints out updated golden values. + +Change-Id: Ibfef339d513a4d67d53a5a310a82165592ca338f +Reviewed-on: https://go-review.googlesource.com/c/go/+/607055 +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +--- + .../internal/ssagen/intrinsics_test.go | 72 +++++++++++-------- + 1 file changed, 44 insertions(+), 28 deletions(-) + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index d725e4092a..4bf5fce2a5 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -5,12 +5,17 @@ + package ssagen + + import ( +- "internal/buildcfg" ++ "flag" ++ "fmt" ++ "slices" ++ "strings" + "testing" + + "cmd/internal/sys" + ) + ++var updateIntrinsics = flag.Bool("update", false, "Print an updated intrinsics table") ++ + type testIntrinsicKey struct { + archName string + pkg string +@@ -750,6 +755,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"ppc64", "internal/runtime/math", "Add64"}: struct{}{}, + {"ppc64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "Len64"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "Len8"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, +@@ -779,6 +786,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "math/bits", "OnesCount32"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount64"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount8"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes64"}: struct{}{}, + {"ppc64", "math/bits", "RotateLeft"}: struct{}{}, + {"ppc64", "math/bits", "RotateLeft32"}: struct{}{}, + {"ppc64", "math/bits", "RotateLeft64"}: struct{}{}, +@@ -864,6 +874,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"ppc64le", "internal/runtime/math", "Add64"}: struct{}{}, + {"ppc64le", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", 
"Len64"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "Len8"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "OnesCount64"}: struct{}{}, +@@ -893,6 +905,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "math/bits", "OnesCount32"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount64"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount8"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes64"}: struct{}{}, + {"ppc64le", "math/bits", "RotateLeft"}: struct{}{}, + {"ppc64le", "math/bits", "RotateLeft32"}: struct{}{}, + {"ppc64le", "math/bits", "RotateLeft64"}: struct{}{}, +@@ -1183,43 +1198,44 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"wasm", "runtime", "slicebytetostringtmp"}: struct{}{}, + } + +-var wantIntrinsicsPower10 = map[testIntrinsicKey]struct{}{ +- {"ppc64", "runtime/internal/sys", "Bswap32"}: struct{}{}, +- {"ppc64", "runtime/internal/sys", "Bswap64"}: struct{}{}, +- {"ppc64", "math/bits", "ReverseBytes16"}: struct{}{}, +- {"ppc64", "math/bits", "ReverseBytes32"}: struct{}{}, +- {"ppc64", "math/bits", "ReverseBytes64"}: struct{}{}, +- {"ppc64le", "runtime/internal/sys", "Bswap32"}: struct{}{}, +- {"ppc64le", "runtime/internal/sys", "Bswap64"}: struct{}{}, +- {"ppc64le", "math/bits", "ReverseBytes16"}: struct{}{}, +- {"ppc64le", "math/bits", "ReverseBytes32"}: struct{}{}, +- {"ppc64le", "math/bits", "ReverseBytes64"}: struct{}{}, +-} +- + func TestIntrinsics(t *testing.T) { +- initIntrinsics(nil) +- +- want := make(map[testIntrinsicKey]struct{}) +- for ik, iv := range wantIntrinsics { +- want[ik] = iv ++ cfg := &intrinsicBuildConfig{ ++ goppc64: 10, + } +- if buildcfg.GOPPC64 >= 10 { +- for ik, iv := range wantIntrinsicsPower10 { +- want[ik] = iv ++ initIntrinsics(cfg) ++ ++ if *updateIntrinsics { ++ var updatedIntrinsics []*testIntrinsicKey ++ for ik, _ := range intrinsics { ++ updatedIntrinsics = append(updatedIntrinsics, &testIntrinsicKey{ik.arch.Name, ik.pkg, ik.fn}) + } ++ slices.SortFunc(updatedIntrinsics, func(a, b *testIntrinsicKey) int { ++ if n := strings.Compare(a.archName, b.archName); n != 0 { ++ return n ++ } ++ if n := strings.Compare(a.pkg, b.pkg); n != 0 { ++ return n ++ } ++ return strings.Compare(a.fn, b.fn) ++ }) ++ for _, tik := range updatedIntrinsics { ++ fmt.Printf("\t{%q, %q, %q}: struct{}{},\n", tik.archName, tik.pkg, tik.fn) ++ } ++ return + } + +- got := make(map[testIntrinsicKey]struct{}) ++ gotIntrinsics := make(map[testIntrinsicKey]struct{}) + for ik, _ := range intrinsics { +- got[testIntrinsicKey{ik.arch.Name, ik.pkg, ik.fn}] = struct{}{} ++ gotIntrinsics[testIntrinsicKey{ik.arch.Name, ik.pkg, ik.fn}] = struct{}{} + } +- for ik, _ := range got { +- if _, found := want[ik]; !found { ++ for ik, _ := range gotIntrinsics { ++ if _, found := wantIntrinsics[ik]; !found { + t.Errorf("Got unwanted intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) + } + } +- for ik, _ := range want { +- if _, found := got[ik]; !found { ++ ++ for ik, _ := range wantIntrinsics { ++ if _, found := gotIntrinsics[ik]; !found { + t.Errorf("Want intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) + } + } +-- +2.39.5 + diff --git a/2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch b/2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch new file mode 100644 index 0000000..a1ac37c --- /dev/null +++ b/2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch @@ -0,0 +1,582 @@ +From 
0967ad259be6b3a768723327135dcbe368f655a2 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 113/119] cmd/compile: simplify intrinsification of BitLen16 + and BitLen8 + +Decompose BitLen16 and BitLen8 within the SSA rules for architectures that +support BitLen32 or BitLen64, rather than having a custom intrinsic. + +Change-Id: Ie4188ce69d1021e63cec27a8e7418efb0714812b +Reviewed-on: https://go-review.googlesource.com/c/go/+/651817 +Reviewed-by: Keith Randall +Reviewed-by: Michael Pratt +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Junyang Shao +TryBot-Result: Gopher Robot +Run-TryBot: Joel Sing +Reviewed-by: Michael Knyszek +--- + src/cmd/compile/internal/ssa/_gen/ARM.rules | 1 + + src/cmd/compile/internal/ssa/_gen/ARM64.rules | 1 + + src/cmd/compile/internal/ssa/_gen/MIPS.rules | 1 + + src/cmd/compile/internal/ssa/_gen/PPC64.rules | 1 + + src/cmd/compile/internal/ssa/_gen/S390X.rules | 1 + + src/cmd/compile/internal/ssa/_gen/Wasm.rules | 1 + + src/cmd/compile/internal/ssa/rewriteARM.go | 34 +++++++++++++ + src/cmd/compile/internal/ssa/rewriteARM64.go | 34 +++++++++++++ + src/cmd/compile/internal/ssa/rewriteMIPS.go | 34 +++++++++++++ + src/cmd/compile/internal/ssa/rewritePPC64.go | 34 +++++++++++++ + src/cmd/compile/internal/ssa/rewriteS390X.go | 51 +++++++++++++++++++ + src/cmd/compile/internal/ssa/rewriteWasm.go | 51 +++++++++++++++++++ + src/cmd/compile/internal/ssagen/intrinsics.go | 39 ++------------ + 13 files changed, 249 insertions(+), 34 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/ARM.rules b/src/cmd/compile/internal/ssa/_gen/ARM.rules +index a60afb000a..46dbe31f3a 100644 +--- a/src/cmd/compile/internal/ssa/_gen/ARM.rules ++++ b/src/cmd/compile/internal/ssa/_gen/ARM.rules +@@ -80,6 +80,7 @@ + + // bit length + (BitLen32 x) => (RSBconst [32] (CLZ x)) ++(BitLen(16|8) x) => (BitLen32 (ZeroExt(16|8)to32 x)) + + // byte swap for ARMv5 + // let (a, b, c, d) be the bytes of x from high to low +diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules +index 94032d6ca4..4d0affae79 100644 +--- a/src/cmd/compile/internal/ssa/_gen/ARM64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules +@@ -103,6 +103,7 @@ + + (BitLen64 x) => (SUB (MOVDconst [64]) (CLZ x)) + (BitLen32 x) => (SUB (MOVDconst [32]) (CLZW x)) ++(BitLen(16|8) x) => (BitLen64 (ZeroExt(16|8)to64 x)) + + (Bswap64 ...) => (REV ...) + (Bswap32 ...) => (REVW ...) +diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS.rules b/src/cmd/compile/internal/ssa/_gen/MIPS.rules +index d6ae0101cb..e10cf359e3 100644 +--- a/src/cmd/compile/internal/ssa/_gen/MIPS.rules ++++ b/src/cmd/compile/internal/ssa/_gen/MIPS.rules +@@ -135,6 +135,7 @@ + + // bit length + (BitLen32 x) => (SUB (MOVWconst [32]) (CLZ x)) ++(BitLen(16|8) x) => (BitLen32 (ZeroExt(16|8)to32 x)) + + // boolean ops -- booleans are represented with 0=false, 1=true + (AndB ...) => (AND ...) +diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64.rules b/src/cmd/compile/internal/ssa/_gen/PPC64.rules +index 97e592fd7e..d1e8bba7ef 100644 +--- a/src/cmd/compile/internal/ssa/_gen/PPC64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/PPC64.rules +@@ -252,6 +252,7 @@ + + (BitLen64 x) => (SUBFCconst [64] (CNTLZD x)) + (BitLen32 x) => (SUBFCconst [32] (CNTLZW x)) ++(BitLen(16|8) x) => (BitLen64 (ZeroExt(16|8)to64 x)) + + (PopCount64 ...) => (POPCNTD ...) 
+ (PopCount(32|16|8) x) => (POPCNT(W|W|B) (MOV(W|H|B)Zreg x)) +diff --git a/src/cmd/compile/internal/ssa/_gen/S390X.rules b/src/cmd/compile/internal/ssa/_gen/S390X.rules +index a9d62c79ce..78ef1214d7 100644 +--- a/src/cmd/compile/internal/ssa/_gen/S390X.rules ++++ b/src/cmd/compile/internal/ssa/_gen/S390X.rules +@@ -89,6 +89,7 @@ + (Ctz32 x) => (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW (SUBWconst [1] x) (NOTW x))))) + + (BitLen64 x) => (SUB (MOVDconst [64]) (FLOGR x)) ++(BitLen(32|16|8) x) => (BitLen64 (ZeroExt(32|16|8)to64 x)) + + // POPCNT treats the input register as a vector of 8 bytes, producing + // a population count for each individual byte. For inputs larger than +diff --git a/src/cmd/compile/internal/ssa/_gen/Wasm.rules b/src/cmd/compile/internal/ssa/_gen/Wasm.rules +index 91a9fc5e4a..03c681f440 100644 +--- a/src/cmd/compile/internal/ssa/_gen/Wasm.rules ++++ b/src/cmd/compile/internal/ssa/_gen/Wasm.rules +@@ -329,6 +329,7 @@ + (Ctz(64|32|16|8)NonZero ...) => (I64Ctz ...) + + (BitLen64 x) => (I64Sub (I64Const [64]) (I64Clz x)) ++(BitLen(32|16|8) x) => (BitLen64 (ZeroExt(32|16|8)to64 x)) + + (PopCount64 ...) => (I64Popcnt ...) + (PopCount32 x) => (I64Popcnt (ZeroExt32to64 x)) +diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go +index 70cacb90ed..d622a0cd3a 100644 +--- a/src/cmd/compile/internal/ssa/rewriteARM.go ++++ b/src/cmd/compile/internal/ssa/rewriteARM.go +@@ -466,8 +466,12 @@ func rewriteValueARM(v *Value) bool { + return true + case OpAvg32u: + return rewriteValueARM_OpAvg32u(v) ++ case OpBitLen16: ++ return rewriteValueARM_OpBitLen16(v) + case OpBitLen32: + return rewriteValueARM_OpBitLen32(v) ++ case OpBitLen8: ++ return rewriteValueARM_OpBitLen8(v) + case OpBswap32: + return rewriteValueARM_OpBswap32(v) + case OpClosureCall: +@@ -13042,6 +13046,21 @@ func rewriteValueARM_OpAvg32u(v *Value) bool { + return true + } + } ++func rewriteValueARM_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen32 (ZeroExt16to32 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen32) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to32, typ.UInt32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueARM_OpBitLen32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -13058,6 +13077,21 @@ func rewriteValueARM_OpBitLen32(v *Value) bool { + return true + } + } ++func rewriteValueARM_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen32 (ZeroExt8to32 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen32) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueARM_OpBswap32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go +index 93a741ad87..65d99f5a9f 100644 +--- a/src/cmd/compile/internal/ssa/rewriteARM64.go ++++ b/src/cmd/compile/internal/ssa/rewriteARM64.go +@@ -535,10 +535,14 @@ func rewriteValueARM64(v *Value) bool { + return true + case OpAvg64u: + return rewriteValueARM64_OpAvg64u(v) ++ case OpBitLen16: ++ return rewriteValueARM64_OpBitLen16(v) + case OpBitLen32: + return rewriteValueARM64_OpBitLen32(v) + case OpBitLen64: + return rewriteValueARM64_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValueARM64_OpBitLen8(v) + case OpBitRev16: + return rewriteValueARM64_OpBitRev16(v) 
+ case OpBitRev32: +@@ -18425,6 +18429,21 @@ func rewriteValueARM64_OpAvg64u(v *Value) bool { + return true + } + } ++func rewriteValueARM64_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueARM64_OpBitLen32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -18459,6 +18478,21 @@ func rewriteValueARM64_OpBitLen64(v *Value) bool { + return true + } + } ++func rewriteValueARM64_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueARM64_OpBitRev16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssa/rewriteMIPS.go b/src/cmd/compile/internal/ssa/rewriteMIPS.go +index 6a259f5a47..978be79417 100644 +--- a/src/cmd/compile/internal/ssa/rewriteMIPS.go ++++ b/src/cmd/compile/internal/ssa/rewriteMIPS.go +@@ -82,8 +82,12 @@ func rewriteValueMIPS(v *Value) bool { + return true + case OpAvg32u: + return rewriteValueMIPS_OpAvg32u(v) ++ case OpBitLen16: ++ return rewriteValueMIPS_OpBitLen16(v) + case OpBitLen32: + return rewriteValueMIPS_OpBitLen32(v) ++ case OpBitLen8: ++ return rewriteValueMIPS_OpBitLen8(v) + case OpClosureCall: + v.Op = OpMIPSCALLclosure + return true +@@ -792,6 +796,21 @@ func rewriteValueMIPS_OpAvg32u(v *Value) bool { + return true + } + } ++func rewriteValueMIPS_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen32 (ZeroExt16to32 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen32) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to32, typ.UInt32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueMIPS_OpBitLen32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -810,6 +829,21 @@ func rewriteValueMIPS_OpBitLen32(v *Value) bool { + return true + } + } ++func rewriteValueMIPS_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen32 (ZeroExt8to32 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen32) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueMIPS_OpCom16(v *Value) bool { + v_0 := v.Args[0] + // match: (Com16 x) +diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go +index d1c0c2b07f..bdc690212e 100644 +--- a/src/cmd/compile/internal/ssa/rewritePPC64.go ++++ b/src/cmd/compile/internal/ssa/rewritePPC64.go +@@ -103,10 +103,14 @@ func rewriteValuePPC64(v *Value) bool { + return rewriteValuePPC64_OpAtomicStoreRel64(v) + case OpAvg64u: + return rewriteValuePPC64_OpAvg64u(v) ++ case OpBitLen16: ++ return rewriteValuePPC64_OpBitLen16(v) + case OpBitLen32: + return rewriteValuePPC64_OpBitLen32(v) + case OpBitLen64: + return rewriteValuePPC64_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValuePPC64_OpBitLen8(v) + case OpBswap16: + return rewriteValuePPC64_OpBswap16(v) + case OpBswap32: +@@ -1106,6 +1110,21 @@ func rewriteValuePPC64_OpAvg64u(v *Value) bool { + return 
true + } + } ++func rewriteValuePPC64_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValuePPC64_OpBitLen32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -1138,6 +1157,21 @@ func rewriteValuePPC64_OpBitLen64(v *Value) bool { + return true + } + } ++func rewriteValuePPC64_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValuePPC64_OpBswap16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go +index a3d621898f..bf3073eea9 100644 +--- a/src/cmd/compile/internal/ssa/rewriteS390X.go ++++ b/src/cmd/compile/internal/ssa/rewriteS390X.go +@@ -88,8 +88,14 @@ func rewriteValueS390X(v *Value) bool { + return rewriteValueS390X_OpAtomicStoreRel32(v) + case OpAvg64u: + return rewriteValueS390X_OpAvg64u(v) ++ case OpBitLen16: ++ return rewriteValueS390X_OpBitLen16(v) ++ case OpBitLen32: ++ return rewriteValueS390X_OpBitLen32(v) + case OpBitLen64: + return rewriteValueS390X_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValueS390X_OpBitLen8(v) + case OpBswap16: + return rewriteValueS390X_OpBswap16(v) + case OpBswap32: +@@ -1261,6 +1267,36 @@ func rewriteValueS390X_OpAvg64u(v *Value) bool { + return true + } + } ++func rewriteValueS390X_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueS390X_OpBitLen32(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen32 x) ++ // result: (BitLen64 (ZeroExt32to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueS390X_OpBitLen64(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -1278,6 +1314,21 @@ func rewriteValueS390X_OpBitLen64(v *Value) bool { + return true + } + } ++func rewriteValueS390X_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueS390X_OpBswap16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssa/rewriteWasm.go b/src/cmd/compile/internal/ssa/rewriteWasm.go +index 6f83aea13a..f3b8205b24 100644 +--- a/src/cmd/compile/internal/ssa/rewriteWasm.go ++++ b/src/cmd/compile/internal/ssa/rewriteWasm.go +@@ -49,8 +49,14 @@ func rewriteValueWasm(v *Value) bool { + case OpAndB: + v.Op = OpWasmI64And + return true ++ case OpBitLen16: ++ return rewriteValueWasm_OpBitLen16(v) ++ case OpBitLen32: ++ return 
rewriteValueWasm_OpBitLen32(v) + case OpBitLen64: + return rewriteValueWasm_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValueWasm_OpBitLen8(v) + case OpCeil: + v.Op = OpWasmF64Ceil + return true +@@ -679,6 +685,36 @@ func rewriteValueWasm_OpAddr(v *Value) bool { + return true + } + } ++func rewriteValueWasm_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueWasm_OpBitLen32(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen32 x) ++ // result: (BitLen64 (ZeroExt32to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueWasm_OpBitLen64(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -696,6 +732,21 @@ func rewriteValueWasm_OpBitLen64(v *Value) bool { + return true + } + } ++func rewriteValueWasm_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueWasm_OpCom16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index c62837cd5b..6fcdcf57ed 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -734,51 +734,22 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "Len32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.ARM64, sys.PPC64) +- addF("math/bits", "Len32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- } +- x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM, sys.S390X, sys.MIPS, sys.Wasm) +- addF("math/bits", "Len16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) +- } +- x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "Len16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0]) + }, +- sys.AMD64) 
+- addF("math/bits", "Len8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) +- } +- x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "Len8", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) + }, +- sys.AMD64) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "Len", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + if s.config.PtrSize == 4 { +@@ -786,7 +757,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + } + return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + // LeadingZeros is handled because it trivially calls Len. + addF("math/bits", "Reverse64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +-- +2.39.5 + diff --git a/2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch b/2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch new file mode 100644 index 0000000..937cab4 --- /dev/null +++ b/2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch @@ -0,0 +1,563 @@ +From a0cbf6b18dada4ea1b2a86df54d509c1151f47cd Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 114/119] cmd/compile: simplify intrinsification of + TrailingZeros16 and TrailingZeros8 + +Decompose Ctz16 and Ctz8 within the SSA rules for LOONG64, MIPS, PPC64 +and S390X, rather than having a custom intrinsic. Note that for PPC64 this +actually allows the existing Ctz16 and Ctz8 rules to be used. + +Change-Id: I27a5e978f852b9d75396d2a80f5d7dfcb5ef7dd4 +Reviewed-on: https://go-review.googlesource.com/c/go/+/651816 +Reviewed-by: Paul Murphy +TryBot-Result: Gopher Robot +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Michael Pratt +Run-TryBot: Joel Sing +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/ssa/_gen/MIPS.rules | 5 +- + src/cmd/compile/internal/ssa/_gen/PPC64.rules | 21 +++--- + src/cmd/compile/internal/ssa/_gen/S390X.rules | 6 +- + src/cmd/compile/internal/ssa/rewriteMIPS.go | 44 +++++++++++++ + src/cmd/compile/internal/ssa/rewritePPC64.go | 64 ++++++++++++++++++- + src/cmd/compile/internal/ssa/rewriteS390X.go | 46 ++++++++++++- + src/cmd/compile/internal/ssagen/intrinsics.go | 47 ++------------ + .../internal/ssagen/intrinsics_test.go | 6 +- + test/codegen/mathbits.go | 6 +- + 9 files changed, 181 insertions(+), 64 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS.rules b/src/cmd/compile/internal/ssa/_gen/MIPS.rules +index e10cf359e3..80c7f71685 100644 +--- a/src/cmd/compile/internal/ssa/_gen/MIPS.rules ++++ b/src/cmd/compile/internal/ssa/_gen/MIPS.rules +@@ -126,12 +126,13 @@ + (Sqrt ...) => (SQRTD ...) + (Sqrt32 ...) => (SQRTF ...) + +-// TODO: optimize this case? +-(Ctz32NonZero ...) => (Ctz32 ...) ++(Ctz(32|16|8)NonZero ...) => (Ctz32 ...) 
+ + // count trailing zero + // 32 - CLZ(x&-x - 1) + (Ctz32 x) => (SUB (MOVWconst [32]) (CLZ (SUBconst [1] (AND x (NEG x))))) ++(Ctz16 x) => (Ctz32 (Or32 x (MOVWconst [1<<16]))) ++(Ctz8 x) => (Ctz32 (Or32 x (MOVWconst [1<<8]))) + + // bit length + (BitLen32 x) => (SUB (MOVWconst [32]) (CLZ x)) +diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64.rules b/src/cmd/compile/internal/ssa/_gen/PPC64.rules +index d1e8bba7ef..1a34de9aad 100644 +--- a/src/cmd/compile/internal/ssa/_gen/PPC64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/PPC64.rules +@@ -239,16 +239,17 @@ + (LocalAddr {sym} base _) && !t.Elem().HasPointers() => (MOVDaddr {sym} base) + (OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr) + +-// TODO: optimize these cases? +-(Ctz32NonZero ...) => (Ctz32 ...) +-(Ctz64NonZero ...) => (Ctz64 ...) +- +-(Ctz64 x) && buildcfg.GOPPC64<=8 => (POPCNTD (ANDN (ADDconst [-1] x) x)) +-(Ctz64 x) => (CNTTZD x) +-(Ctz32 x) && buildcfg.GOPPC64<=8 => (POPCNTW (MOVWZreg (ANDN (ADDconst [-1] x) x))) +-(Ctz32 x) => (CNTTZW (MOVWZreg x)) +-(Ctz16 x) => (POPCNTW (MOVHZreg (ANDN (ADDconst [-1] x) x))) +-(Ctz8 x) => (POPCNTB (MOVBZreg (ANDN (ADDconst [-1] x) x))) ++(Ctz(64|32|16|8)NonZero ...) => (Ctz64 ...) ++ ++(Ctz64 x) && buildcfg.GOPPC64 <= 8 => (POPCNTD (ANDN (ADDconst [-1] x) x)) ++(Ctz32 x) && buildcfg.GOPPC64 <= 8 => (POPCNTW (MOVWZreg (ANDN (ADDconst [-1] x) x))) ++(Ctz16 x) && buildcfg.GOPPC64 <= 8 => (POPCNTW (MOVHZreg (ANDN (ADDconst [-1] x) x))) ++(Ctz8 x) && buildcfg.GOPPC64 <= 8 => (POPCNTB (MOVBZreg (ANDN (ADDconst [-1] x) x))) ++ ++(Ctz64 x) && buildcfg.GOPPC64 >= 9 => (CNTTZD x) ++(Ctz32 x) && buildcfg.GOPPC64 >= 9 => (CNTTZW (MOVWZreg x)) ++(Ctz16 x) && buildcfg.GOPPC64 >= 9 => (CNTTZD (OR x (MOVDconst [1<<16]))) ++(Ctz8 x) && buildcfg.GOPPC64 >= 9 => (CNTTZD (OR x (MOVDconst [1<<8]))) + + (BitLen64 x) => (SUBFCconst [64] (CNTLZD x)) + (BitLen32 x) => (SUBFCconst [32] (CNTLZW x)) +diff --git a/src/cmd/compile/internal/ssa/_gen/S390X.rules b/src/cmd/compile/internal/ssa/_gen/S390X.rules +index 78ef1214d7..7505a5ff06 100644 +--- a/src/cmd/compile/internal/ssa/_gen/S390X.rules ++++ b/src/cmd/compile/internal/ssa/_gen/S390X.rules +@@ -80,13 +80,13 @@ + (OffPtr [off] ptr) && is32Bit(off) => (ADDconst [int32(off)] ptr) + (OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr) + +-// TODO: optimize these cases? +-(Ctz64NonZero ...) => (Ctz64 ...) +-(Ctz32NonZero ...) => (Ctz32 ...) ++(Ctz(64|32|16|8)NonZero ...) => (Ctz64 ...) 
+ + // Ctz(x) = 64 - findLeftmostOne((x-1)&^x) + (Ctz64 x) => (SUB (MOVDconst [64]) (FLOGR (AND (SUBconst [1] x) (NOT x)))) + (Ctz32 x) => (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW (SUBWconst [1] x) (NOTW x))))) ++(Ctz16 x) => (Ctz64 (Or64 x (MOVDconst [1<<16]))) ++(Ctz8 x) => (Ctz64 (Or64 x (MOVDconst [1<<8]))) + + (BitLen64 x) => (SUB (MOVDconst [64]) (FLOGR x)) + (BitLen(32|16|8) x) => (BitLen64 (ZeroExt(32|16|8)to64 x)) +diff --git a/src/cmd/compile/internal/ssa/rewriteMIPS.go b/src/cmd/compile/internal/ssa/rewriteMIPS.go +index 978be79417..eb34dfd03e 100644 +--- a/src/cmd/compile/internal/ssa/rewriteMIPS.go ++++ b/src/cmd/compile/internal/ssa/rewriteMIPS.go +@@ -113,11 +113,21 @@ func rewriteValueMIPS(v *Value) bool { + return rewriteValueMIPS_OpConstBool(v) + case OpConstNil: + return rewriteValueMIPS_OpConstNil(v) ++ case OpCtz16: ++ return rewriteValueMIPS_OpCtz16(v) ++ case OpCtz16NonZero: ++ v.Op = OpCtz32 ++ return true + case OpCtz32: + return rewriteValueMIPS_OpCtz32(v) + case OpCtz32NonZero: + v.Op = OpCtz32 + return true ++ case OpCtz8: ++ return rewriteValueMIPS_OpCtz8(v) ++ case OpCtz8NonZero: ++ v.Op = OpCtz32 ++ return true + case OpCvt32Fto32: + v.Op = OpMIPSTRUNCFW + return true +@@ -929,6 +939,23 @@ func rewriteValueMIPS_OpConstNil(v *Value) bool { + return true + } + } ++func rewriteValueMIPS_OpCtz16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz16 x) ++ // result: (Ctz32 (Or32 x (MOVWconst [1<<16]))) ++ for { ++ x := v_0 ++ v.reset(OpCtz32) ++ v0 := b.NewValue0(v.Pos, OpOr32, typ.UInt32) ++ v1 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32) ++ v1.AuxInt = int32ToAuxInt(1 << 16) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueMIPS_OpCtz32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -954,6 +981,23 @@ func rewriteValueMIPS_OpCtz32(v *Value) bool { + return true + } + } ++func rewriteValueMIPS_OpCtz8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz8 x) ++ // result: (Ctz32 (Or32 x (MOVWconst [1<<8]))) ++ for { ++ x := v_0 ++ v.reset(OpCtz32) ++ v0 := b.NewValue0(v.Pos, OpOr32, typ.UInt32) ++ v1 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32) ++ v1.AuxInt = int32ToAuxInt(1 << 8) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueMIPS_OpDiv16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go +index bdc690212e..19b779e02c 100644 +--- a/src/cmd/compile/internal/ssa/rewritePPC64.go ++++ b/src/cmd/compile/internal/ssa/rewritePPC64.go +@@ -155,10 +155,13 @@ func rewriteValuePPC64(v *Value) bool { + return rewriteValuePPC64_OpCopysign(v) + case OpCtz16: + return rewriteValuePPC64_OpCtz16(v) ++ case OpCtz16NonZero: ++ v.Op = OpCtz64 ++ return true + case OpCtz32: + return rewriteValuePPC64_OpCtz32(v) + case OpCtz32NonZero: +- v.Op = OpCtz32 ++ v.Op = OpCtz64 + return true + case OpCtz64: + return rewriteValuePPC64_OpCtz64(v) +@@ -167,6 +170,9 @@ func rewriteValuePPC64(v *Value) bool { + return true + case OpCtz8: + return rewriteValuePPC64_OpCtz8(v) ++ case OpCtz8NonZero: ++ v.Op = OpCtz64 ++ return true + case OpCvt32Fto32: + return rewriteValuePPC64_OpCvt32Fto32(v) + case OpCvt32Fto64: +@@ -1520,9 +1526,13 @@ func rewriteValuePPC64_OpCtz16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz16 x) ++ // cond: buildcfg.GOPPC64 <= 8 + // result: (POPCNTW 
(MOVHZreg (ANDN (ADDconst [-1] x) x))) + for { + x := v_0 ++ if !(buildcfg.GOPPC64 <= 8) { ++ break ++ } + v.reset(OpPPC64POPCNTW) + v0 := b.NewValue0(v.Pos, OpPPC64MOVHZreg, typ.Int64) + v1 := b.NewValue0(v.Pos, OpPPC64ANDN, typ.Int16) +@@ -1534,13 +1544,30 @@ func rewriteValuePPC64_OpCtz16(v *Value) bool { + v.AddArg(v0) + return true + } ++ // match: (Ctz16 x) ++ // cond: buildcfg.GOPPC64 >= 9 ++ // result: (CNTTZD (OR x (MOVDconst [1<<16]))) ++ for { ++ x := v_0 ++ if !(buildcfg.GOPPC64 >= 9) { ++ break ++ } ++ v.reset(OpPPC64CNTTZD) ++ v0 := b.NewValue0(v.Pos, OpPPC64OR, typ.UInt64) ++ v1 := b.NewValue0(v.Pos, OpPPC64MOVDconst, typ.Int64) ++ v1.AuxInt = int64ToAuxInt(1 << 16) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++ return false + } + func rewriteValuePPC64_OpCtz32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz32 x) +- // cond: buildcfg.GOPPC64<=8 ++ // cond: buildcfg.GOPPC64 <= 8 + // result: (POPCNTW (MOVWZreg (ANDN (ADDconst [-1] x) x))) + for { + x := v_0 +@@ -1559,22 +1586,27 @@ func rewriteValuePPC64_OpCtz32(v *Value) bool { + return true + } + // match: (Ctz32 x) ++ // cond: buildcfg.GOPPC64 >= 9 + // result: (CNTTZW (MOVWZreg x)) + for { + x := v_0 ++ if !(buildcfg.GOPPC64 >= 9) { ++ break ++ } + v.reset(OpPPC64CNTTZW) + v0 := b.NewValue0(v.Pos, OpPPC64MOVWZreg, typ.Int64) + v0.AddArg(x) + v.AddArg(v0) + return true + } ++ return false + } + func rewriteValuePPC64_OpCtz64(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz64 x) +- // cond: buildcfg.GOPPC64<=8 ++ // cond: buildcfg.GOPPC64 <= 8 + // result: (POPCNTD (ANDN (ADDconst [-1] x) x)) + for { + x := v_0 +@@ -1591,22 +1623,31 @@ func rewriteValuePPC64_OpCtz64(v *Value) bool { + return true + } + // match: (Ctz64 x) ++ // cond: buildcfg.GOPPC64 >= 9 + // result: (CNTTZD x) + for { + x := v_0 ++ if !(buildcfg.GOPPC64 >= 9) { ++ break ++ } + v.reset(OpPPC64CNTTZD) + v.AddArg(x) + return true + } ++ return false + } + func rewriteValuePPC64_OpCtz8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz8 x) ++ // cond: buildcfg.GOPPC64 <= 8 + // result: (POPCNTB (MOVBZreg (ANDN (ADDconst [-1] x) x))) + for { + x := v_0 ++ if !(buildcfg.GOPPC64 <= 8) { ++ break ++ } + v.reset(OpPPC64POPCNTB) + v0 := b.NewValue0(v.Pos, OpPPC64MOVBZreg, typ.Int64) + v1 := b.NewValue0(v.Pos, OpPPC64ANDN, typ.UInt8) +@@ -1618,6 +1659,23 @@ func rewriteValuePPC64_OpCtz8(v *Value) bool { + v.AddArg(v0) + return true + } ++ // match: (Ctz8 x) ++ // cond: buildcfg.GOPPC64 >= 9 ++ // result: (CNTTZD (OR x (MOVDconst [1<<8]))) ++ for { ++ x := v_0 ++ if !(buildcfg.GOPPC64 >= 9) { ++ break ++ } ++ v.reset(OpPPC64CNTTZD) ++ v0 := b.NewValue0(v.Pos, OpPPC64OR, typ.UInt64) ++ v1 := b.NewValue0(v.Pos, OpPPC64MOVDconst, typ.Int64) ++ v1.AuxInt = int64ToAuxInt(1 << 8) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++ return false + } + func rewriteValuePPC64_OpCvt32Fto32(v *Value) bool { + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go +index bf3073eea9..e54e3ba7fc 100644 +--- a/src/cmd/compile/internal/ssa/rewriteS390X.go ++++ b/src/cmd/compile/internal/ssa/rewriteS390X.go +@@ -139,16 +139,26 @@ func rewriteValueS390X(v *Value) bool { + return rewriteValueS390X_OpConstBool(v) + case OpConstNil: + return rewriteValueS390X_OpConstNil(v) ++ case OpCtz16: ++ return rewriteValueS390X_OpCtz16(v) ++ case OpCtz16NonZero: ++ 
v.Op = OpCtz64 ++ return true + case OpCtz32: + return rewriteValueS390X_OpCtz32(v) + case OpCtz32NonZero: +- v.Op = OpCtz32 ++ v.Op = OpCtz64 + return true + case OpCtz64: + return rewriteValueS390X_OpCtz64(v) + case OpCtz64NonZero: + v.Op = OpCtz64 + return true ++ case OpCtz8: ++ return rewriteValueS390X_OpCtz8(v) ++ case OpCtz8NonZero: ++ v.Op = OpCtz64 ++ return true + case OpCvt32Fto32: + v.Op = OpS390XCFEBRA + return true +@@ -1449,6 +1459,23 @@ func rewriteValueS390X_OpConstNil(v *Value) bool { + return true + } + } ++func rewriteValueS390X_OpCtz16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz16 x) ++ // result: (Ctz64 (Or64 x (MOVDconst [1<<16]))) ++ for { ++ x := v_0 ++ v.reset(OpCtz64) ++ v0 := b.NewValue0(v.Pos, OpOr64, typ.UInt64) ++ v1 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64) ++ v1.AuxInt = int64ToAuxInt(1 << 16) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueS390X_OpCtz32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -1501,6 +1528,23 @@ func rewriteValueS390X_OpCtz64(v *Value) bool { + return true + } + } ++func rewriteValueS390X_OpCtz8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz8 x) ++ // result: (Ctz64 (Or64 x (MOVDconst [1<<8]))) ++ for { ++ x := v_0 ++ v.reset(OpCtz64) ++ v0 := b.NewValue0(v.Pos, OpOr64, typ.UInt64) ++ v1 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64) ++ v1.AuxInt = int64ToAuxInt(1 << 8) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueS390X_OpDiv16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index 6fcdcf57ed..a337ef7f1b 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -675,48 +675,16 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) + }, + sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) +- c := s.constInt32(types.Types[types.TUINT32], 1<<16) +- y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) +- }, +- sys.MIPS) + addF("math/bits", "TrailingZeros16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) +- c := s.constInt64(types.Types[types.TUINT64], 1<<16) +- y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) +- }, +- sys.S390X, sys.PPC64) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) +- c := s.constInt32(types.Types[types.TUINT32], 1<<8) +- y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) +- }, +- sys.MIPS) ++ 
sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "TrailingZeros8", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) +- c := s.constInt64(types.Types[types.TUINT64], 1<<8) +- y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) +- }, +- sys.S390X) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) + alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) + // ReverseBytes inlines correctly, no need to intrinsify it. +@@ -750,14 +718,9 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) + }, + sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) +- addF("math/bits", "Len", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- } +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) ++ alias("math/bits", "Len", "math/bits", "Len64", p8...) ++ alias("math/bits", "Len", "math/bits", "Len32", p4...) ++ + // LeadingZeros is handled because it trivially calls Len. + addF("math/bits", "Reverse64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index 4bf5fce2a5..b3f43eff5e 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -762,6 +762,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "Prefetch"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, + {"ppc64", "math", "Abs"}: struct{}{}, +@@ -794,6 +795,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "math/bits", "RotateLeft64"}: struct{}{}, + {"ppc64", "math/bits", "Sub"}: struct{}{}, + {"ppc64", "math/bits", "Sub64"}: struct{}{}, ++ {"ppc64", "math/bits", "TrailingZeros8"}: struct{}{}, + {"ppc64", "math/bits", "TrailingZeros16"}: struct{}{}, + {"ppc64", "math/bits", "TrailingZeros32"}: struct{}{}, + {"ppc64", "math/bits", "TrailingZeros64"}: struct{}{}, +@@ -881,6 +883,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "runtime/internal/sys", "OnesCount64"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "Prefetch"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", 
"TrailingZeros64"}: struct{}{}, + {"ppc64le", "math", "Abs"}: struct{}{}, +@@ -913,6 +916,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "math/bits", "RotateLeft64"}: struct{}{}, + {"ppc64le", "math/bits", "Sub"}: struct{}{}, + {"ppc64le", "math/bits", "Sub64"}: struct{}{}, ++ {"ppc64le", "math/bits", "TrailingZeros8"}: struct{}{}, + {"ppc64le", "math/bits", "TrailingZeros16"}: struct{}{}, + {"ppc64le", "math/bits", "TrailingZeros32"}: struct{}{}, + {"ppc64le", "math/bits", "TrailingZeros64"}: struct{}{}, +@@ -1236,7 +1240,7 @@ func TestIntrinsics(t *testing.T) { + + for ik, _ := range wantIntrinsics { + if _, found := gotIntrinsics[ik]; !found { +- t.Errorf("Want intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) ++ t.Errorf("Want missing intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) + } + } + } +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index bf2e8130c4..b6375c5e7a 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -345,8 +345,8 @@ func TrailingZeros16(n uint16) int { + // arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR" + // arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t" + // s390x:"FLOGR","OR\t\\$65536" +- // ppc64x/power8:"POPCNTD","OR\\t\\$65536" +- // ppc64x/power9:"CNTTZD","OR\\t\\$65536" ++ // ppc64x/power8:"POPCNTW","ADD\t\\$-1" ++ // ppc64x/power9:"CNTTZD","ORIS\\t\\$1" + // wasm:"I64Ctz" + return bits.TrailingZeros16(n) + } +@@ -356,6 +356,8 @@ func TrailingZeros8(n uint8) int { + // 386:"BSFL" + // arm:"ORR\t\\$256","CLZ",-"MOVBU\tR" + // arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t" ++ // ppc64x/power8:"POPCNTB","ADD\t\\$-1" ++ // ppc64x/power9:"CNTTZD","OR\t\\$256" + // s390x:"FLOGR","OR\t\\$256" + // wasm:"I64Ctz" + return bits.TrailingZeros8(n) +-- +2.39.5 + diff --git a/2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch b/2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch new file mode 100644 index 0000000..3f5b7ef --- /dev/null +++ b/2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch @@ -0,0 +1,375 @@ +From 49515bbf06a159e4af4681ce5b8081af7462766d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 115/119] cmd/compile: intrinsify math/bits.TrailingZeros on + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For riscv64/rva22u64 and above, we can intrinsify math/bits.TrailingZeros +using the CTZ/CTZW machine instructions. 
+ +On a StarFive VisionFive 2 with GORISCV64=rva22u64: + + │ ctz.b.1 │ ctz.b.2 │ + │ sec/op │ sec/op vs base │ +TrailingZeros-4 25.500n ± 0% 8.052n ± 0% -68.42% (p=0.000 n=10) +TrailingZeros8-4 14.76n ± 0% 10.74n ± 0% -27.24% (p=0.000 n=10) +TrailingZeros16-4 26.84n ± 0% 10.74n ± 0% -59.99% (p=0.000 n=10) +TrailingZeros32-4 25.500n ± 0% 8.052n ± 0% -68.42% (p=0.000 n=10) +TrailingZeros64-4 25.500n ± 0% 8.052n ± 0% -68.42% (p=0.000 n=10) +geomean 23.09n 9.035n -60.88% + +Change-Id: I71edf2b988acb7a68e797afda4ee66d7a57d587e +Reviewed-on: https://go-review.googlesource.com/c/go/+/652320 +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +--- + src/cmd/compile/internal/riscv64/ssa.go | 2 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 7 +++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 2 + + src/cmd/compile/internal/ssa/opGen.go | 28 ++++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 54 +++++++++++++++++++ + src/cmd/compile/internal/ssagen/intrinsics.go | 24 +++++++++ + .../internal/ssagen/intrinsics_test.go | 10 +++- + test/codegen/mathbits.go | 19 +++++-- + 8 files changed, 141 insertions(+), 5 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 4aac891e13..ba982a13cd 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -419,7 +419,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVXS, ssa.OpRISCV64FMVDX, ssa.OpRISCV64FMVXD, + ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS, + ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD, +- ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW: ++ ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 9e39a58197..72e4e8d7b3 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -218,6 +218,13 @@ + (RotateLeft32 ...) => (ROLW ...) + (RotateLeft64 ...) => (ROL ...) + ++// Count trailing zeros (note that these will only be emitted for rva22u64 and above). ++(Ctz(64|32|16|8)NonZero ...) => (Ctz64 ...) ++(Ctz64 ...) => (CTZ ...) ++(Ctz32 ...) => (CTZW ...) ++(Ctz16 x) => (CTZW (ORI [1<<16] x)) ++(Ctz8 x) => (CTZW (ORI [1<<8] x)) ++ + (Less64 ...) => (SLT ...) 
+ (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) + (Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y)) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index a69b347a84..f62bce8980 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -229,6 +229,8 @@ func init() { + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // ^arg0 & arg1 + {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint ++ {name: "CTZ", argLength: 1, reg: gp11, asm: "CTZ"}, // count trailing zeros ++ {name: "CTZW", argLength: 1, reg: gp11, asm: "CTZW"}, // count trailing zeros of least significant word + {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 + {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // ^arg0 | arg1 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 5fda7ffc2f..6afa1662c3 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2387,6 +2387,8 @@ const ( + OpRISCV64AND + OpRISCV64ANDN + OpRISCV64ANDI ++ OpRISCV64CTZ ++ OpRISCV64CTZW + OpRISCV64NOT + OpRISCV64OR + OpRISCV64ORN +@@ -32041,6 +32043,32 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "CTZ", ++ argLen: 1, ++ asm: riscv.ACTZ, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "CTZW", ++ argLen: 1, ++ asm: riscv.ACTZW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "NOT", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index a449ce01c6..85b6a05d7c 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -138,6 +138,28 @@ func rewriteValueRISCV64(v *Value) bool { + case OpCopysign: + v.Op = OpRISCV64FSGNJD + return true ++ case OpCtz16: ++ return rewriteValueRISCV64_OpCtz16(v) ++ case OpCtz16NonZero: ++ v.Op = OpCtz64 ++ return true ++ case OpCtz32: ++ v.Op = OpRISCV64CTZW ++ return true ++ case OpCtz32NonZero: ++ v.Op = OpCtz64 ++ return true ++ case OpCtz64: ++ v.Op = OpRISCV64CTZ ++ return true ++ case OpCtz64NonZero: ++ v.Op = OpCtz64 ++ return true ++ case OpCtz8: ++ return rewriteValueRISCV64_OpCtz8(v) ++ case OpCtz8NonZero: ++ v.Op = OpCtz64 ++ return true + case OpCvt32Fto32: + v.Op = OpRISCV64FCVTWS + return true +@@ -1005,6 +1027,38 @@ func rewriteValueRISCV64_OpConstNil(v *Value) bool { + return true + } + } ++func rewriteValueRISCV64_OpCtz16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz16 x) ++ // result: (CTZW (ORI [1<<16] x)) ++ for { ++ x := v_0 ++ v.reset(OpRISCV64CTZW) ++ v0 := b.NewValue0(v.Pos, 
OpRISCV64ORI, typ.UInt32) ++ v0.AuxInt = int64ToAuxInt(1 << 16) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpCtz8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz8 x) ++ // result: (CTZW (ORI [1<<8] x)) ++ for { ++ x := v_0 ++ v.reset(OpRISCV64CTZW) ++ v0 := b.NewValue0(v.Pos, OpRISCV64ORI, typ.UInt32) ++ v0.AuxInt = int64ToAuxInt(1 << 8) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueRISCV64_OpDiv16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index a337ef7f1b..3554558519 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -685,6 +685,30 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) + }, + sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) ++ ++ if cfg.goriscv64 >= 22 { ++ addF("math/bits", "TrailingZeros64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "TrailingZeros32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "TrailingZeros16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "TrailingZeros8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ } ++ + alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) + alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) + // ReverseBytes inlines correctly, no need to intrinsify it. 
+diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index b3f43eff5e..e8803f1ddf 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -999,6 +999,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "internal/runtime/math", "Add64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, + {"riscv64", "math", "Abs"}: struct{}{}, + {"riscv64", "math", "Copysign"}: struct{}{}, + {"riscv64", "math", "FMA"}: struct{}{}, +@@ -1015,6 +1018,10 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "math/bits", "RotateLeft8"}: struct{}{}, + {"riscv64", "math/bits", "Sub"}: struct{}{}, + {"riscv64", "math/bits", "Sub64"}: struct{}{}, ++ {"riscv64", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"riscv64", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"riscv64", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"riscv64", "math/bits", "TrailingZeros8"}: struct{}{}, + {"riscv64", "runtime", "KeepAlive"}: struct{}{}, + {"riscv64", "runtime", "getcallerpc"}: struct{}{}, + {"riscv64", "runtime", "getcallersp"}: struct{}{}, +@@ -1204,7 +1211,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + + func TestIntrinsics(t *testing.T) { + cfg := &intrinsicBuildConfig{ +- goppc64: 10, ++ goppc64: 10, ++ goriscv64: 23, + } + initIntrinsics(cfg) + +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index b6375c5e7a..97926c4ddc 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -296,26 +296,30 @@ func RotateLeftVariable32(n uint32, m int) uint32 { + // ------------------------ // + + func TrailingZeros(n uint) int { ++ // 386:"BSFL" + // amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ" + // amd64/v3:"TZCNTQ" +- // 386:"BSFL" + // arm:"CLZ" + // arm64:"RBIT","CLZ" + // s390x:"FLOGR" + // ppc64x/power8:"ANDN","POPCNTD" + // ppc64x/power9: "CNTTZD" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" ++ // s390x:"FLOGR" + // wasm:"I64Ctz" + return bits.TrailingZeros(n) + } + + func TrailingZeros64(n uint64) int { ++ // 386:"BSFL","JNE" + // amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ" + // amd64/v3:"TZCNTQ" +- // 386:"BSFL","JNE" + // arm64:"RBIT","CLZ" + // s390x:"FLOGR" + // ppc64x/power8:"ANDN","POPCNTD" + // ppc64x/power9: "CNTTZD" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" ++ // s390x:"FLOGR" + // wasm:"I64Ctz" + return bits.TrailingZeros64(n) + } +@@ -327,14 +331,16 @@ func TrailingZeros64Subtract(n uint64) int { + } + + func TrailingZeros32(n uint32) int { ++ // 386:"BSFL" + // amd64/v1,amd64/v2:"BTSQ\\t\\$32","BSFQ" + // amd64/v3:"TZCNTL" +- // 386:"BSFL" + // arm:"CLZ" + // arm64:"RBITW","CLZW" + // s390x:"FLOGR","MOVWZ" + // ppc64x/power8:"ANDN","POPCNTW" + // ppc64x/power9: "CNTTZW" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZW" ++ // s390x:"FLOGR","MOVWZ" + // wasm:"I64Ctz" + return bits.TrailingZeros32(n) + } +@@ -342,11 +348,14 @@ func TrailingZeros32(n uint32) int { + func TrailingZeros16(n uint16) int { + // amd64:"BSFL","BTSL\\t\\$16" + // 386:"BSFL\t" ++ // amd64:"BSFL","ORL\\t\\$65536" + // arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR" + // 
arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t" + // s390x:"FLOGR","OR\t\\$65536" + // ppc64x/power8:"POPCNTW","ADD\t\\$-1" + // ppc64x/power9:"CNTTZD","ORIS\\t\\$1" ++ // riscv64/rva22u64,riscv64/rva23u64: "ORI\t\\$65536","CTZW" ++ // s390x:"FLOGR","OR\t\\$65536" + // wasm:"I64Ctz" + return bits.TrailingZeros16(n) + } +@@ -354,10 +363,12 @@ func TrailingZeros16(n uint16) int { + func TrailingZeros8(n uint8) int { + // amd64:"BSFL","BTSL\\t\\$8" + // 386:"BSFL" ++ // amd64:"BSFL","ORL\\t\\$256" + // arm:"ORR\t\\$256","CLZ",-"MOVBU\tR" + // arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t" + // ppc64x/power8:"POPCNTB","ADD\t\\$-1" + // ppc64x/power9:"CNTTZD","OR\t\\$256" ++ // riscv64/rva22u64,riscv64/rva23u64: "ORI\t\\$256","CTZW" + // s390x:"FLOGR","OR\t\\$256" + // wasm:"I64Ctz" + return bits.TrailingZeros8(n) +@@ -404,6 +415,7 @@ func IterateBits16(n uint16) int { + // amd64/v1,amd64/v2:"BSFL",-"BTSL" + // amd64/v3:"TZCNTL" + // arm64:"RBITW","CLZW",-"ORR" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t",-"ORR" + i += bits.TrailingZeros16(n) + n &= n - 1 + } +@@ -416,6 +428,7 @@ func IterateBits8(n uint8) int { + // amd64/v1,amd64/v2:"BSFL",-"BTSL" + // amd64/v3:"TZCNTL" + // arm64:"RBITW","CLZW",-"ORR" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t",-"ORR" + i += bits.TrailingZeros8(n) + n &= n - 1 + } +-- +2.39.5 + diff --git a/2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch b/2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch new file mode 100644 index 0000000..a92b41f --- /dev/null +++ b/2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch @@ -0,0 +1,86 @@ +From 7cd780b2c6e1b1d3b58a407b98e8efbaf25ca56a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 116/119] cmd/compile/internal/ssagen: use an alias for + math/bits.OnesCount + +Currently, only amd64 has an intrinsic for math/bits.OnesCount, which +generates the same code as math/bits.OnesCount64. Replace this with +an alias that maps math/bits.OnesCount to math/bits.OnesCount64 on +64 bit platforms. + +Change-Id: Ifa12a2173a201aacd52c3c22b9a948be6e314405 +Reviewed-on: https://go-review.googlesource.com/c/go/+/659215 +Reviewed-by: Keith Randall +Reviewed-by: Cherry Mui +Reviewed-by: Keith Randall +Auto-Submit: Keith Randall +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/compile/internal/ssagen/intrinsics.go | 5 ++--- + src/cmd/compile/internal/ssagen/intrinsics_test.go | 5 +++++ + 2 files changed, 7 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index 3554558519..52c0db64a9 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -854,9 +854,8 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0]) + }, + sys.S390X, sys.PPC64, sys.Wasm) +- addF("math/bits", "OnesCount", +- makeOnesCountAMD64(ssa.OpPopCount64), +- sys.AMD64) ++ alias("math/bits", "OnesCount", "math/bits", "OnesCount64", p8...) 
++ + addF("math/bits", "Mul64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1]) +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index e8803f1ddf..23a0b1678f 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -260,6 +260,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"arm64", "math/bits", "Len8"}: struct{}{}, + {"arm64", "math/bits", "Mul"}: struct{}{}, + {"arm64", "math/bits", "Mul64"}: struct{}{}, ++ {"arm64", "math/bits", "OnesCount"}: struct{}{}, + {"arm64", "math/bits", "OnesCount16"}: struct{}{}, + {"arm64", "math/bits", "OnesCount32"}: struct{}{}, + {"arm64", "math/bits", "OnesCount64"}: struct{}{}, +@@ -783,6 +784,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "math/bits", "Len8"}: struct{}{}, + {"ppc64", "math/bits", "Mul"}: struct{}{}, + {"ppc64", "math/bits", "Mul64"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount16"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount32"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount64"}: struct{}{}, +@@ -904,6 +906,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "math/bits", "Len8"}: struct{}{}, + {"ppc64le", "math/bits", "Mul"}: struct{}{}, + {"ppc64le", "math/bits", "Mul64"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount16"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount32"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount64"}: struct{}{}, +@@ -1125,6 +1128,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"s390x", "math/bits", "Len8"}: struct{}{}, + {"s390x", "math/bits", "Mul"}: struct{}{}, + {"s390x", "math/bits", "Mul64"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount"}: struct{}{}, + {"s390x", "math/bits", "OnesCount16"}: struct{}{}, + {"s390x", "math/bits", "OnesCount32"}: struct{}{}, + {"s390x", "math/bits", "OnesCount64"}: struct{}{}, +@@ -1191,6 +1195,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"wasm", "math/bits", "Len32"}: struct{}{}, + {"wasm", "math/bits", "Len64"}: struct{}{}, + {"wasm", "math/bits", "Len8"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount"}: struct{}{}, + {"wasm", "math/bits", "OnesCount16"}: struct{}{}, + {"wasm", "math/bits", "OnesCount32"}: struct{}{}, + {"wasm", "math/bits", "OnesCount64"}: struct{}{}, +-- +2.39.5 + diff --git a/2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch b/2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch new file mode 100644 index 0000000..e4d725a --- /dev/null +++ b/2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch @@ -0,0 +1,446 @@ +From 2406dc38b01afcf8c11c3a1e87f76613bc64684b Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 117/119] cmd/compile: intrinsify math/bits.Len on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For riscv64/rva22u64 and above, we can intrinsify math/bits.Len using the +CLZ/CLZW machine instructions. 
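+
+For illustration, a minimal, hypothetical snippet of the kind this helps
+(the constants are arbitrary examples; the single-instruction lowering
+assumes a build with GORISCV64=rva22u64 or above):
+
+    package main
+
+    import (
+        "fmt"
+        "math/bits"
+    )
+
+    func main() {
+        // Lowered as 64 - CLZ(x) per the new BitLen64 rule.
+        fmt.Println(bits.Len64(1 << 40)) // 41
+        // CLZW of zero is 32, so 32 - 32 gives the documented result.
+        fmt.Println(bits.Len32(0)) // 0
+    }
+
+Because CLZ/CLZW of zero return the full register or word width, the
+subtraction alone handles a zero input with no extra branches.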
+ +On a StarFive VisionFive 2 with GORISCV64=rva22u64: + + │ clz.b.1 │ clz.b.2 │ + │ sec/op │ sec/op vs base │ +LeadingZeros-4 28.89n ± 0% 12.08n ± 0% -58.19% (p=0.000 n=10) +LeadingZeros8-4 18.79n ± 0% 14.76n ± 0% -21.45% (p=0.000 n=10) +LeadingZeros16-4 25.27n ± 0% 14.76n ± 0% -41.59% (p=0.000 n=10) +LeadingZeros32-4 25.12n ± 0% 12.08n ± 0% -51.92% (p=0.000 n=10) +LeadingZeros64-4 25.89n ± 0% 12.08n ± 0% -53.35% (p=0.000 n=10) +geomean 24.55n 13.09n -46.70% + +Change-Id: I0dda684713dbdf5336af393f5ccbdae861c4f694 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652321 +Reviewed-by: David Chase +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +--- + src/cmd/compile/internal/riscv64/ssa.go | 2 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 6 ++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 2 + + src/cmd/compile/internal/ssa/opGen.go | 28 +++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 74 +++++++++++++++++++ + src/cmd/compile/internal/ssagen/intrinsics.go | 24 ++++++ + .../internal/ssagen/intrinsics_test.go | 7 ++ + test/codegen/mathbits.go | 45 ++++++++--- + 8 files changed, 175 insertions(+), 13 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index ba982a13cd..1f0880f80b 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -419,7 +419,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVXS, ssa.OpRISCV64FMVDX, ssa.OpRISCV64FMVXD, + ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS, + ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD, +- ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW: ++ ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 72e4e8d7b3..36c9b53eef 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -225,6 +225,12 @@ + (Ctz16 x) => (CTZW (ORI [1<<16] x)) + (Ctz8 x) => (CTZW (ORI [1<<8] x)) + ++// Bit length (note that these will only be emitted for rva22u64 and above). ++(BitLen64 x) => (SUB (MOVDconst [64]) (CLZ x)) ++(BitLen32 x) => (SUB (MOVDconst [32]) (CLZW x)) ++(BitLen16 x) => (BitLen64 (ZeroExt16to64 x)) ++(BitLen8 x) => (BitLen64 (ZeroExt8to64 x)) ++ + (Less64 ...) => (SLT ...) 
+ (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) + (Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y)) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index f62bce8980..b411766354 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -229,6 +229,8 @@ func init() { + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // ^arg0 & arg1 + {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint ++ {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zeros ++ {name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // count leading zeros of least significant word + {name: "CTZ", argLength: 1, reg: gp11, asm: "CTZ"}, // count trailing zeros + {name: "CTZW", argLength: 1, reg: gp11, asm: "CTZW"}, // count trailing zeros of least significant word + {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 6afa1662c3..28bd5a34bf 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2387,6 +2387,8 @@ const ( + OpRISCV64AND + OpRISCV64ANDN + OpRISCV64ANDI ++ OpRISCV64CLZ ++ OpRISCV64CLZW + OpRISCV64CTZ + OpRISCV64CTZW + OpRISCV64NOT +@@ -32043,6 +32045,32 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "CLZ", ++ argLen: 1, ++ asm: riscv.ACLZ, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "CLZW", ++ argLen: 1, ++ asm: riscv.ACLZW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "CTZ", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 85b6a05d7c..474deeede6 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -104,6 +104,14 @@ func rewriteValueRISCV64(v *Value) bool { + return true + case OpAvg64u: + return rewriteValueRISCV64_OpAvg64u(v) ++ case OpBitLen16: ++ return rewriteValueRISCV64_OpBitLen16(v) ++ case OpBitLen32: ++ return rewriteValueRISCV64_OpBitLen32(v) ++ case OpBitLen64: ++ return rewriteValueRISCV64_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValueRISCV64_OpBitLen8(v) + case OpClosureCall: + v.Op = OpRISCV64CALLclosure + return true +@@ -940,6 +948,72 @@ func rewriteValueRISCV64_OpAvg64u(v *Value) bool { + return true + } + } ++func rewriteValueRISCV64_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpBitLen32(v *Value) bool { ++ v_0 
:= v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen32 x) ++ // result: (SUB (MOVDconst [32]) (CLZW x)) ++ for { ++ t := v.Type ++ x := v_0 ++ v.reset(OpRISCV64SUB) ++ v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(32) ++ v1 := b.NewValue0(v.Pos, OpRISCV64CLZW, t) ++ v1.AddArg(x) ++ v.AddArg2(v0, v1) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpBitLen64(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen64 x) ++ // result: (SUB (MOVDconst [64]) (CLZ x)) ++ for { ++ t := v.Type ++ x := v_0 ++ v.reset(OpRISCV64SUB) ++ v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(64) ++ v1 := b.NewValue0(v.Pos, OpRISCV64CLZ, t) ++ v1.AddArg(x) ++ v.AddArg2(v0, v1) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueRISCV64_OpConst16(v *Value) bool { + // match: (Const16 [val]) + // result: (MOVDconst [int64(val)]) +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index 52c0db64a9..0888ef27d5 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -742,6 +742,30 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) + }, + sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) ++ ++ if cfg.goriscv64 >= 22 { ++ addF("math/bits", "Len64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "Len32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "Len16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "Len8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ } ++ + alias("math/bits", "Len", "math/bits", "Len64", p8...) + alias("math/bits", "Len", "math/bits", "Len32", p4...) 
+ +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index 23a0b1678f..c31a6ee609 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -1002,6 +1002,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "internal/runtime/math", "Add64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "Len8"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, +@@ -1012,6 +1014,11 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "math/big", "mulWW"}: struct{}{}, + {"riscv64", "math/bits", "Add"}: struct{}{}, + {"riscv64", "math/bits", "Add64"}: struct{}{}, ++ {"riscv64", "math/bits", "Len"}: struct{}{}, ++ {"riscv64", "math/bits", "Len16"}: struct{}{}, ++ {"riscv64", "math/bits", "Len32"}: struct{}{}, ++ {"riscv64", "math/bits", "Len64"}: struct{}{}, ++ {"riscv64", "math/bits", "Len8"}: struct{}{}, + {"riscv64", "math/bits", "Mul"}: struct{}{}, + {"riscv64", "math/bits", "Mul64"}: struct{}{}, + {"riscv64", "math/bits", "RotateLeft"}: struct{}{}, +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index 97926c4ddc..45048f86eb 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -18,8 +18,10 @@ func LeadingZeros(n uint) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"SUB" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros(n) + } + +@@ -29,8 +31,10 @@ func LeadingZeros64(n uint64) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"ADDI" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros64(n) + } + +@@ -40,8 +44,10 @@ func LeadingZeros32(n uint32) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZW" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZW" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZW",-"ADDI" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros32(n) + } + +@@ -51,8 +57,10 @@ func LeadingZeros16(n uint16) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-48",-"NEG" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros16(n) + } + +@@ -62,8 +70,10 @@ func LeadingZeros8(n uint8) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-56",-"NEG" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros8(n) + } + +@@ -77,8 +87,10 @@ func Len(n uint) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"SUBC","CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len(n) + } + +@@ -88,13 +100,16 @@ func Len64(n uint64) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // 
ppc64x:"SUBC","CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len64(n) + } + + func SubFromLen64(n uint64) int { + // ppc64x:"CNTLZD",-"SUBC" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"ADDI",-"NEG" + return 64 - bits.Len64(n) + } + +@@ -104,8 +119,10 @@ func Len32(n uint32) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x: "CNTLZW" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZW","ADDI\t\\$-32" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len32(n) + } + +@@ -115,8 +132,10 @@ func Len16(n uint16) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"SUBC","CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len16(n) + } + +@@ -126,8 +145,10 @@ func Len8(n uint8) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"SUBC","CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len8(n) + } + +@@ -348,7 +369,6 @@ func TrailingZeros32(n uint32) int { + func TrailingZeros16(n uint16) int { + // amd64:"BSFL","BTSL\\t\\$16" + // 386:"BSFL\t" +- // amd64:"BSFL","ORL\\t\\$65536" + // arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR" + // arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t" + // s390x:"FLOGR","OR\t\\$65536" +@@ -363,7 +383,6 @@ func TrailingZeros16(n uint16) int { + func TrailingZeros8(n uint8) int { + // amd64:"BSFL","BTSL\\t\\$8" + // 386:"BSFL" +- // amd64:"BSFL","ORL\\t\\$256" + // arm:"ORR\t\\$256","CLZ",-"MOVBU\tR" + // arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t" + // ppc64x/power8:"POPCNTB","ADD\t\\$-1" +@@ -392,6 +411,7 @@ func IterateBits64(n uint64) int { + for n != 0 { + // amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ" + // amd64/v3:"TZCNTQ" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" + i += bits.TrailingZeros64(n) + n &= n - 1 + } +@@ -403,6 +423,7 @@ func IterateBits32(n uint32) int { + for n != 0 { + // amd64/v1,amd64/v2:"BSFL",-"BTSQ" + // amd64/v3:"TZCNTL" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" + i += bits.TrailingZeros32(n) + n &= n - 1 + } +-- +2.39.5 + diff --git a/2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch b/2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch new file mode 100644 index 0000000..01733ef --- /dev/null +++ b/2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch @@ -0,0 +1,331 @@ +From 4f4047a3396eeedb4e6972e7ac23073706bd9f57 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 118/119] cmd/compile: intrinsify math/bits.Bswap on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For riscv64/rva22u64 and above, we can intrinsify math/bits.Bswap +using the REV8 machine instruction. 
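+
+As a sketch of the user-visible effect, a minimal, hypothetical snippet
+(the constants are arbitrary examples; the REV8 lowering assumes a build
+with GORISCV64=rva22u64 or above):
+
+    package main
+
+    import (
+        "fmt"
+        "math/bits"
+    )
+
+    func main() {
+        // REV8 alone: all eight bytes of the register are reversed.
+        fmt.Printf("%#x\n", bits.ReverseBytes64(0x1122334455667788)) // 0x8877665544332211
+        // REV8 then SRLI $32 per the new Bswap32 rule.
+        fmt.Printf("%#x\n", bits.ReverseBytes32(0x11223344)) // 0x44332211
+    }
+
+The 32- and 16-bit forms shift the reversed register right by 32 and 48
+bits respectively, keeping only the bytes that belonged to the original
+narrow value.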
+ +On a StarFive VisionFive 2 with GORISCV64=rva22u64: + + │ rb.1 │ rb.2 │ + │ sec/op │ sec/op vs base │ +ReverseBytes-4 18.790n ± 0% 4.026n ± 0% -78.57% (p=0.000 n=10) +ReverseBytes16-4 6.710n ± 0% 5.368n ± 0% -20.00% (p=0.000 n=10) +ReverseBytes32-4 13.420n ± 0% 5.368n ± 0% -60.00% (p=0.000 n=10) +ReverseBytes64-4 17.450n ± 0% 4.026n ± 0% -76.93% (p=0.000 n=10) +geomean 13.11n 4.649n -64.54% + +Change-Id: I26eee34270b1721f7304bb1cddb0fda129b20ece +Reviewed-on: https://go-review.googlesource.com/c/go/+/660855 +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Carlos Amedee +Reviewed-by: Junyang Shao +--- + src/cmd/compile/internal/riscv64/ssa.go | 3 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 5 ++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 1 + + src/cmd/compile/internal/ssa/opGen.go | 14 ++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 39 +++++++++++++++ + src/cmd/compile/internal/ssagen/intrinsics.go | 50 +++++++++++++++---- + .../internal/ssagen/intrinsics_test.go | 5 ++ + test/codegen/mathbits.go | 13 +++-- + 8 files changed, 111 insertions(+), 19 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 1f0880f80b..96b0e605e8 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -419,7 +419,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVXS, ssa.OpRISCV64FMVDX, ssa.OpRISCV64FMVXD, + ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS, + ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD, +- ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW: ++ ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW, ++ ssa.OpRISCV64REV8: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 36c9b53eef..93c4e790f8 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -231,6 +231,11 @@ + (BitLen16 x) => (BitLen64 (ZeroExt16to64 x)) + (BitLen8 x) => (BitLen64 (ZeroExt8to64 x)) + ++// Byte swap (note that these will only be emitted for rva22u64 and above). ++(Bswap64 ...) => (REV8 ...) ++(Bswap32 x) => (SRLI [32] (REV8 x)) ++(Bswap16 x) => (SRLI [48] (REV8 x)) ++ + (Less64 ...) => (SLT ...) 
+ (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) + (Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y)) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index b411766354..0a46dc7d5b 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -237,6 +237,7 @@ func init() { + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 + {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // ^arg0 | arg1 + {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint ++ {name: "REV8", argLength: 1, reg: gp11, asm: "REV8"}, // reverse bytes + {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63) + {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended + {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63) +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 28bd5a34bf..f906f86ec1 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2395,6 +2395,7 @@ const ( + OpRISCV64OR + OpRISCV64ORN + OpRISCV64ORI ++ OpRISCV64REV8 + OpRISCV64ROL + OpRISCV64ROLW + OpRISCV64ROR +@@ -32153,6 +32154,19 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "REV8", ++ argLen: 1, ++ asm: riscv.AREV8, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "ROL", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 474deeede6..5a0c9b70c4 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -112,6 +112,13 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpBitLen64(v) + case OpBitLen8: + return rewriteValueRISCV64_OpBitLen8(v) ++ case OpBswap16: ++ return rewriteValueRISCV64_OpBswap16(v) ++ case OpBswap32: ++ return rewriteValueRISCV64_OpBswap32(v) ++ case OpBswap64: ++ v.Op = OpRISCV64REV8 ++ return true + case OpClosureCall: + v.Op = OpRISCV64CALLclosure + return true +@@ -1014,6 +1021,38 @@ func rewriteValueRISCV64_OpBitLen8(v *Value) bool { + return true + } + } ++func rewriteValueRISCV64_OpBswap16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Bswap16 x) ++ // result: (SRLI [48] (REV8 x)) ++ for { ++ t := v.Type ++ x := v_0 ++ v.reset(OpRISCV64SRLI) ++ v.AuxInt = int64ToAuxInt(48) ++ v0 := b.NewValue0(v.Pos, OpRISCV64REV8, t) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpBswap32(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Bswap32 x) ++ // result: (SRLI [32] (REV8 x)) ++ for { ++ t := v.Type ++ x := v_0 ++ v.reset(OpRISCV64SRLI) ++ v.AuxInt = int64ToAuxInt(32) ++ v0 := b.NewValue0(v.Pos, OpRISCV64REV8, t) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueRISCV64_OpConst16(v *Value) bool { + // match: (Const16 [val]) + // result: (MOVDconst [int64(val)]) +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go 
b/src/cmd/compile/internal/ssagen/intrinsics.go +index 0888ef27d5..019b76195d 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -178,23 +178,44 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + }, + sys.ARM64, sys.PPC64, sys.RISCV64) + +- brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} +- if cfg.goppc64 >= 10 { +- // Use only on Power10 as the new byte reverse instructions that Power10 provide +- // make it worthwhile as an intrinsic +- brev_arch = append(brev_arch, sys.PPC64) +- } +- /******** runtime/internal/sys ********/ + addF("runtime/internal/sys", "Bswap32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) + }, +- brev_arch...) ++ sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X) + addF("runtime/internal/sys", "Bswap64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) + }, +- brev_arch...) ++ sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X) ++ ++ if cfg.goppc64 >= 10 { ++ // Use only on Power10 as the new byte reverse instructions that Power10 provide ++ // make it worthwhile as an intrinsic ++ addF("runtime/internal/sys", "Bswap32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) ++ }, ++ sys.PPC64) ++ addF("runtime/internal/sys", "Bswap64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) ++ }, ++ sys.PPC64) ++ } ++ ++ if cfg.goriscv64 >= 22 { ++ addF("runtime/internal/sys", "Bswap32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) ++ }, ++ sys.RISCV64) ++ addF("runtime/internal/sys", "Bswap64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) ++ }, ++ sys.RISCV64) ++ } + + /****** Prefetch ******/ + makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +@@ -709,18 +730,25 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + sys.RISCV64) + } + ++ // ReverseBytes inlines correctly, no need to intrinsify it. + alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) + alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) +- // ReverseBytes inlines correctly, no need to intrinsify it. 
+ // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate +- // On Power10, 16-bit rotate is not available so use BRH instruction + if cfg.goppc64 >= 10 { ++ // On Power10, 16-bit rotate is not available so use BRH instruction + addF("math/bits", "ReverseBytes16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) + }, + sys.PPC64) + } ++ if cfg.goriscv64 >= 22 { ++ addF("math/bits", "ReverseBytes16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0]) ++ }, ++ sys.RISCV64) ++ } + + addF("math/bits", "Len64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index c31a6ee609..3062c4e489 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -1002,6 +1002,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "internal/runtime/math", "Add64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "Len64"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "Len8"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, +@@ -1021,6 +1023,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "math/bits", "Len8"}: struct{}{}, + {"riscv64", "math/bits", "Mul"}: struct{}{}, + {"riscv64", "math/bits", "Mul64"}: struct{}{}, ++ {"riscv64", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"riscv64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"riscv64", "math/bits", "ReverseBytes64"}: struct{}{}, + {"riscv64", "math/bits", "RotateLeft"}: struct{}{}, + {"riscv64", "math/bits", "RotateLeft16"}: struct{}{}, + {"riscv64", "math/bits", "RotateLeft32"}: struct{}{}, +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index 45048f86eb..4ddb87a980 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -209,38 +209,37 @@ func OnesCount8(n uint8) int { + // ----------------------- // + + func ReverseBytes(n uint) uint { +- // amd64:"BSWAPQ" + // 386:"BSWAPL" +- // s390x:"MOVDBR" ++ // amd64:"BSWAPQ" + // arm64:"REV" + return bits.ReverseBytes(n) + } + + func ReverseBytes64(n uint64) uint64 { +- // amd64:"BSWAPQ" + // 386:"BSWAPL" +- // s390x:"MOVDBR" ++ // amd64:"BSWAPQ" + // arm64:"REV" + // ppc64x/power10: "BRD" + return bits.ReverseBytes64(n) + } + + func ReverseBytes32(n uint32) uint32 { +- // amd64:"BSWAPL" + // 386:"BSWAPL" +- // s390x:"MOVWBR" ++ // amd64:"BSWAPL" + // arm64:"REVW" + // ppc64x/power10: "BRW" ++ // riscv64/rva22u64,riscv64/rva23u64:"REV8","SRLI\t\\$32" ++ // s390x:"MOVWBR" + return bits.ReverseBytes32(n) + } + + func ReverseBytes16(n uint16) uint16 { + // amd64:"ROLW" +- // arm64:"REV16W",-"UBFX",-"ORR" + // arm/5:"SLL","SRL","ORR" + // arm/6:"REV16" + // arm/7:"REV16" + // ppc64x/power10: "BRH" ++ // riscv64/rva22u64,riscv64/rva23u64:"REV8","SRLI\t\\$48" + return bits.ReverseBytes16(n) + } + +-- +2.39.5 + diff --git a/golang.spec b/golang.spec index aa468c1..946155f 100644 --- a/golang.spec +++ b/golang.spec @@ -66,7 +66,7 @@ Name: golang Version: 
1.21.4 -Release: 37 +Release: 38 Summary: The Go Programming Language License: BSD and Public Domain URL: https://golang.org/ @@ -164,6 +164,130 @@ Patch8002: 8002-cmd-go-Use-AI-to-guide-optimization.patch Patch8003: 8003-internal-buildcfg-add-Kunpeng-atomic-optimize.patch Patch8004: 8004-runtime-add-gcRatio-option.patch +# Part 2000 ~ 2119 +# RISC-V RVA23 support backport +%ifarch riscv64 +Patch2000: 2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch +Patch2001: 2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch +Patch2002: 2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch +Patch2003: 2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch +Patch2004: 2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch +Patch2005: 2005-cmd-compile-add-single-precision-FMA-code-generation.patch +Patch2006: 2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch +Patch2007: 2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch +Patch2008: 2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch +Patch2009: 2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch +Patch2010: 2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch +Patch2011: 2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch +Patch2012: 2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch +Patch2013: 2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch +Patch2014: 2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch +Patch2015: 2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch +Patch2016: 2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch +Patch2017: 2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch +Patch2018: 2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch +Patch2019: 2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch +Patch2020: 2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch +Patch2021: 2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch +Patch2022: 2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch +Patch2023: 2023-cmd-compile-correct-code-generation-for-right-shifts.patch +Patch2024: 2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch +Patch2025: 2025-cmd-go-add-GORISCV64-environment-variable.patch +Patch2026: 2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch +Patch2027: 2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch +Patch2028: 2028-cmd-compile-improve-rotations-for-riscv64.patch +Patch2029: 2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch +Patch2030: 2030-math-add-round-assembly-implementations-on-riscv64.patch +Patch2031: 2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch +Patch2032: 2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch +Patch2033: 2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch +Patch2034: 2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch +Patch2035: 2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch +Patch2036: 2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch +Patch2037: 2037-test-codegen-add-Mul-test-for-riscv64.patch +Patch2038: 2038-math-remove-riscv64-assembly-implementations-of-roun.patch +Patch2039: 2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch +Patch2040: 2040-math-big-implement-addVV-in-riscv64-assembly.patch +Patch2041: 2041-math-big-implement-subVV-in-riscv64-assembly.patch +Patch2042: 
2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch +Patch2043: 2043-math-big-implement-addVW-in-riscv64-assembly.patch +Patch2044: 2044-math-big-implement-subVW-in-riscv64-assembly.patch +Patch2045: 2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch +Patch2046: 2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch +Patch2047: 2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch +Patch2048: 2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch +Patch2049: 2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch +Patch2050: 2050-math-add-round-assembly-implementations-on-riscv64.patch +Patch2051: 2051-test-codegen-add-Rotate-test-for-riscv64.patch +Patch2052: 2052-runtime-add-asm_riscv64.h.patch +Patch2053: 2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch +Patch2054: 2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch +Patch2055: 2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch +Patch2056: 2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch +Patch2057: 2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch +Patch2058: 2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch +Patch2059: 2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch +Patch2060: 2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch +Patch2061: 2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch +Patch2062: 2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch +Patch2063: 2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch +Patch2064: 2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch +Patch2065: 2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch +Patch2066: 2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch +Patch2067: 2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch +Patch2068: 2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch +Patch2069: 2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch +Patch2070: 2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch +Patch2071: 2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch +Patch2072: 2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch +Patch2073: 2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch +Patch2074: 2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch +Patch2075: 2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch +Patch2076: 2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch +Patch2077: 2077-test-codegen-add-a-test-for-negation-and-conversion-.patch +Patch2078: 2078-cmd-compile-combine-negation-and-word-sign-extension.patch +Patch2079: 2079-cmd-compile-internal-ssa-remove-double-negation-with.patch +Patch2080: 2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch +Patch2081: 2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch +Patch2082: 2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch +Patch2083: 2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch +Patch2084: 2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch +Patch2085: 2085-test-codegen-add-combined-conversion-and-shift-tests.patch +Patch2086: 2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch +Patch2087: 2087-cmd-internal-obj-riscv-improve-constant-construction.patch +Patch2088: 2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch +Patch2089: 2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch +Patch2090: 
2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch +Patch2091: 2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch +Patch2092: 2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch +Patch2093: 2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch +Patch2094: 2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch +Patch2095: 2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch +Patch2096: 2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch +Patch2097: 2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch +Patch2098: 2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch +Patch2099: 2099-internal-bytealg-vector-implementation-of-equal-for-.patch +Patch2100: 2100-internal-bytealg-vector-implementation-of-indexbyte-.patch +Patch2101: 2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch +Patch2102: 2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch +Patch2103: 2103-cmd-compile-add-generic-simplifications-on-riscv64.patch +Patch2104: 2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch +Patch2105: 2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch +Patch2106: 2106-internal-bytealg-vector-implementation-of-compare-fo.patch +Patch2107: 2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch +Patch2108: 2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch +Patch2109: 2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch +Patch2110: 2110-cmd-dist-internal-add-GOARM64-environment-variable.patch +Patch2111: 2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch +Patch2112: 2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch +Patch2113: 2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch +Patch2114: 2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch +Patch2115: 2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch +Patch2116: 2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch +Patch2117: 2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch +Patch2118: 2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch +%endif + ExclusiveArch: %{golang_arches} %description @@ -401,6 +525,12 @@ fi %files devel -f go-tests.list -f go-misc.list -f go-src.list %changelog +* Fri Sep 26 2025 Julian Zhu - 1.21.4-38 +- Type:Feature +- CVE:NA +- SUG:NA +- DESC: Backport RISC-V RVA23 support for RISC-V 64 + * Mon Sep 15 2025 songliyang - 1.21.4-37 - Type:CVE - CVE:CVE-2025-22871 -- Gitee
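
For reference, below is a minimal, illustrative Go sketch (not part of the patch series, and deliberately kept outside the diffs above) of the math/bits calls that Patch2118 intrinsifies on riscv64. Per the RISCV64.rules and test/codegen/mathbits.go hunks in that patch, with GORISCV64=rva22u64 or rva23u64 these calls are expected to lower to REV8, plus SRLI $32 / SRLI $48 for the 32- and 16-bit variants; the package layout and output formatting here are arbitrary choices for the example.

// Illustrative only; assumes a toolchain built with these backports and
// GORISCV64=rva22u64 (or rva23u64).
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// Bswap64 lowers to a single REV8 (rule: (Bswap64 ...) => (REV8 ...)).
	fmt.Printf("%#016x\n", bits.ReverseBytes64(0x0102030405060708))
	// Bswap32 lowers to REV8 followed by SRLI $32 (rule: (Bswap32 x) => (SRLI [32] (REV8 x))).
	fmt.Printf("%#08x\n", bits.ReverseBytes32(0x01020304))
	// Bswap16 lowers to REV8 followed by SRLI $48 (rule: (Bswap16 x) => (SRLI [48] (REV8 x))).
	fmt.Printf("%#04x\n", bits.ReverseBytes16(0x0102))
}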