From 881ac8a472c301bf423a50d36ef6b7c55e51962b Mon Sep 17 00:00:00 2001
From: Julian Zhu
Date: Fri, 26 Sep 2025 17:24:06 +0800
Subject: [PATCH] - Backport RISC-V RVA23 support for RISC-V 64

Signed-off-by: Julian Zhu
---
 ...rnal-obj-riscv-cmd-link-improve-TLS-.patch | 335 +++
 ...-most-repetitive-operations-to-simpl.patch | 209 ++
 ...bigmod-provide-assembly-addMulVVW-fo.patch | 169 ++
 ...-or-zero-extend-for-32-bit-equality-.patch | 242 ++
 ...mprove-FP-FMA-performance-on-riscv64.patch | 276 ++
 ...single-precision-FMA-code-generation.patch | 512 ++++
 ...T-cmd-internal-obj-riscv-cmd-link-ad.patch | 180 ++
 ...-riscv-clean-up-error-checking-for-e.patch | 41 +
 ...-riscv-correct-message-in-regVal-pan.patch | 34 +
 ...bj-riscv-simplify-instructionsForMOV.patch | 56 +
 ...fix-wrong-cache-line-size-of-riscv64.patch | 34 +
 ...bj-riscv-clean-up-immediate-checking.patch | 206 ++
 ...rnal-intrinsify-publicationBarrier-o.patch | 145 +
 ...rnal-stop-lowering-OpConvert-on-risc.patch | 117 +
 ...mize-right-shifts-of-uint32-on-riscv.patch | 558 ++++
 ...l-ld-assign-temporary-addresses-to-p.patch | 267 ++
 ...mize-right-shifts-of-int32-on-riscv6.patch | 540 ++++
 ...-riscv-support-subtraction-with-a-co.patch | 72 +
 ...-riscv-fix-the-offset-of-JALR-transf.patch | 119 +
 ...-riscv-improve-handling-of-invalid-a.patch | 376 +++
 ...ition-of-constants-in-riscv64-assemb.patch | 555 ++++
 ...-riscv-add-support-of-PCALIGN-direct.patch | 152 +
 ...-optimize-Count-with-PCALIGN-in-risc.patch | 94 +
 ...ect-code-generation-for-right-shifts.patch | 980 +++++++
 ...ovide-optimised-assembly-for-riscv64.patch | 380 +++
 ...o-add-GORISCV64-environment-variable.patch | 396 +++
 ...ement-float-min-max-in-hardware-for-.patch | 520 ++++
 ...ement-float-min-max-in-hardware-for-.patch | 348 +++
 ...ompile-improve-rotations-for-riscv64.patch | 596 ++++
 ...rnal-obj-enable-rounding-mode-suffix.patch | 308 +++
 ...-assembly-implementations-on-riscv64.patch | 113 +
 ...l-riscv64-generate-local-text-symbol.patch | 47 +
 ...internal-obj-provide-rotation-pseudo.patch | 864 ++++++
 ...-support-Zba-Zbb-Zbs-extensions-in-r.patch | 617 +++++
 ...-riscv-improve-register-MOVB-MOVH-MO.patch | 118 +
 ...-riscv-use-native-rotation-instructi.patch | 62 +
 ...-riscv-check-immediate-for-rotation-.patch | 102 +
 ...est-codegen-add-Mul-test-for-riscv64.patch | 31 +
 ...v64-assembly-implementations-of-roun.patch | 127 +
 ...le-drop-TODO-in-NilCheck-for-riscv64.patch | 49 +
 ...-implement-addVV-in-riscv64-assembly.patch | 148 +
 ...-implement-subVV-in-riscv64-assembly.patch | 148 +
 ...integer-min-max-instructions-on-risc.patch | 360 +++
 ...-implement-addVW-in-riscv64-assembly.patch | 146 +
 ...-implement-subVW-in-riscv64-assembly.patch | 146 +
 ...ovide-optimised-assembly-for-riscv64.patch | 349 +++
 ...lement-mulAddVWW-in-riscv64-assembly.patch | 141 +
 ...lement-addMulVVW-in-riscv64-assembly.patch | 158 ++
 ...-initial-codegen-tests-for-integer-m.patch | 63 +
 ...rnal-ssa-combine-shift-and-addition-.patch | 232 ++
 ...-assembly-implementations-on-riscv64.patch | 125 +
 ...-codegen-add-Rotate-test-for-riscv64.patch | 62 +
 2052-runtime-add-asm_riscv64.h.patch          | 67 +
 ...internal-obj-riscv-always-provide-AN.patch | 387 +++
 ...ovide-optimised-assembly-for-riscv64.patch | 385 +++
 ...rnal-obj-riscv-rename-the-iIEncoding.patch | 200 ++
 ...-riscv-add-vector-instruction-encodi.patch | 2444 +++++++++++++++++
 ...-cmd-asm-add-vector-registers-to-ris.patch | 139 +
 ...-riscv-update-references-to-RISC-V-s.patch | 578 ++++
 ...-add-prologue_end-DWARF-stmt-for-ris.patch | 58 +
 ...-riscv-update-RISC-V-instruction-tab.patch | 371 +++
 ...prove-performance-of-riscv64-assembl.patch | 110 +
 ...tealg-optimize-IndexByte-for-riscv64.patch | 466 ++++
 ...-riscv-rework-instruction-encoding-i.patch | 624 +++++
 ...vide-runtime-detection-of-RISC-V-ext.patch | 255 ++
 ...23u64-as-a-valid-value-for-GORISCV64.patch | 190 ++
 ...-riscv-update-references-to-RISC-V-s.patch | 671 +++++
 ...t-merge-symbols-on-riscv64-when-dyna.patch | 589 ++++
 ...-riscv-support-MOVD-with-floating-po.patch | 83 +
 ...rnal-obj-riscv-implement-vector-conf.patch | 618 +++++
 ...-clean-up-and-simplify-the-riscv64-e.patch | 160 ++
 ...ytealg-eliminate-HashStrBytes-HashSt.patch | 126 +
 ...-riscv-implement-vector-load-store-i.patch | 539 ++++
 ...ternal-obj-riscv-add-riscv64-CSR-map.patch | 363 +++
 ...hten-the-TrailingZeros64-test-on-386.patch | 36 +
 ...-riscv64-codegen-for-arithmetic-test.patch | 102 +
 ...-riscv64-rva23u64-specifiers-to-exis.patch | 84 +
 ...-a-test-for-negation-and-conversion-.patch | 39 +
 ...ine-negation-and-word-sign-extension.patch | 80 +
 ...rnal-ssa-remove-double-negation-with.patch | 97 +
 ...-riscv-prevent-duplicate-error-repor.patch | 189 ++
 ...-riscv-prevent-panics-on-bad-branche.patch | 74 +
 ...-riscv-fix-the-encoding-for-REV8-and.patch | 41 +
 ...-riscv-factor-out-shift-constant-cod.patch | 151 +
 ...-asm-add-additional-tests-for-consta.patch | 75 +
 ...-combined-conversion-and-shift-tests.patch | 95 +
 ...-riscv-internal-bytealg-synthesize-M.patch | 453 +++
 ...-riscv-improve-constant-construction.patch | 239 ++
 ...rnal-ssa-optimise-more-branches-with.patch | 125 +
 ...-riscv-add-support-for-vector-intege.patch | 1327 +++++++++
 ...-riscv-add-support-for-vector-fixed-.patch | 266 ++
 ...move-unnecessary-move-op-replace-wit.patch | 66 +
 ...prove-performance-of-riscv64-assembl.patch | 120 +
 ...-on-riscv64-when-building-with-gcc-1.patch | 81 +
 ...-deduplicate-code-between-Count-Coun.patch | 63 +
 ...-riscv-add-support-for-vector-floati.patch | 1735 ++++++++++++
 ...-riscv-add-support-for-vector-reduct.patch | 176 ++
 ...-riscv-add-support-for-vector-mask-i.patch | 269 ++
 ...-riscv-add-support-for-vector-permut.patch | 287 ++
 ...-vector-implementation-of-equal-for-.patch | 186 ++
 ...-vector-implementation-of-indexbyte-.patch | 156 ++
 ...-riscv-reject-invalid-vadc-vsbc-enco.patch | 123 +
 ...-riscv-fix-LMUL-encoding-for-MF2-and.patch | 68 +
 ...d-generic-simplifications-on-riscv64.patch | 203 ++
 ...-riscv-fix-vector-integer-multiply-a.patch | 187 ++
 ...mise-float-int-register-moves-on-ris.patch | 663 +++++
 ...-vector-implementation-of-compare-fo.patch | 163 ++
 ...rnal-ssagen-improve-intrinsic-archit.patch | 101 +
 ...rnal-ssagen-factor-out-intrinsics-co.patch | 2066 ++++++++++++++
 ...rnal-ssagen-add-initial-test-coverag.patch | 1254 +++++++++
 ...nal-add-GOARM64-environment-variable.patch | 232 ++
 ...rnal-ssagen-provide-intrinsicBuilder.patch | 706 +++++
 ...ternal-ssagen-improve-intrinsic-test.patch | 155 ++
 ...lify-intrinsification-of-BitLen16-an.patch | 582 ++++
 ...lify-intrinsification-of-TrailingZer.patch | 563 ++++
 ...insify-math-bits.TrailingZeros-on-ri.patch | 375 +++
 ...rnal-ssagen-use-an-alias-for-math-bi.patch | 86 +
 ...-intrinsify-math-bits.Len-on-riscv64.patch | 446 +++
 ...ntrinsify-math-bits.Bswap-on-riscv64.patch | 331 +++
 golang.spec                                   | 132 +-
 120 files changed, 37495 insertions(+), 1 deletion(-)
 create mode 100644 2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch
 create mode 100644 2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch
 create mode 100644
2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch create mode 100644 2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch create mode 100644 2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch create mode 100644 2005-cmd-compile-add-single-precision-FMA-code-generation.patch create mode 100644 2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch create mode 100644 2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch create mode 100644 2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch create mode 100644 2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch create mode 100644 2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch create mode 100644 2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch create mode 100644 2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch create mode 100644 2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch create mode 100644 2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch create mode 100644 2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch create mode 100644 2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch create mode 100644 2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch create mode 100644 2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch create mode 100644 2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch create mode 100644 2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch create mode 100644 2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch create mode 100644 2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch create mode 100644 2023-cmd-compile-correct-code-generation-for-right-shifts.patch create mode 100644 2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch create mode 100644 2025-cmd-go-add-GORISCV64-environment-variable.patch create mode 100644 2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch create mode 100644 2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch create mode 100644 2028-cmd-compile-improve-rotations-for-riscv64.patch create mode 100644 2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch create mode 100644 2030-math-add-round-assembly-implementations-on-riscv64.patch create mode 100644 2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch create mode 100644 2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch create mode 100644 2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch create mode 100644 2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch create mode 100644 2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch create mode 100644 2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch create mode 100644 2037-test-codegen-add-Mul-test-for-riscv64.patch create mode 100644 2038-math-remove-riscv64-assembly-implementations-of-roun.patch create mode 100644 2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch create mode 100644 2040-math-big-implement-addVV-in-riscv64-assembly.patch create mode 100644 2041-math-big-implement-subVV-in-riscv64-assembly.patch create mode 100644 2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch create mode 100644 2043-math-big-implement-addVW-in-riscv64-assembly.patch create mode 100644 2044-math-big-implement-subVW-in-riscv64-assembly.patch create mode 100644 2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch create 
mode 100644 2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch create mode 100644 2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch create mode 100644 2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch create mode 100644 2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch create mode 100644 2050-math-add-round-assembly-implementations-on-riscv64.patch create mode 100644 2051-test-codegen-add-Rotate-test-for-riscv64.patch create mode 100644 2052-runtime-add-asm_riscv64.h.patch create mode 100644 2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch create mode 100644 2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch create mode 100644 2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch create mode 100644 2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch create mode 100644 2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch create mode 100644 2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch create mode 100644 2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch create mode 100644 2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch create mode 100644 2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch create mode 100644 2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch create mode 100644 2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch create mode 100644 2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch create mode 100644 2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch create mode 100644 2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch create mode 100644 2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch create mode 100644 2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch create mode 100644 2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch create mode 100644 2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch create mode 100644 2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch create mode 100644 2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch create mode 100644 2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch create mode 100644 2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch create mode 100644 2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch create mode 100644 2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch create mode 100644 2077-test-codegen-add-a-test-for-negation-and-conversion-.patch create mode 100644 2078-cmd-compile-combine-negation-and-word-sign-extension.patch create mode 100644 2079-cmd-compile-internal-ssa-remove-double-negation-with.patch create mode 100644 2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch create mode 100644 2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch create mode 100644 2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch create mode 100644 2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch create mode 100644 2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch create mode 100644 2085-test-codegen-add-combined-conversion-and-shift-tests.patch create mode 100644 2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch create mode 100644 2087-cmd-internal-obj-riscv-improve-constant-construction.patch create mode 100644 2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch create mode 100644 
2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch create mode 100644 2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch create mode 100644 2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch create mode 100644 2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch create mode 100644 2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch create mode 100644 2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch create mode 100644 2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch create mode 100644 2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch create mode 100644 2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch create mode 100644 2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch create mode 100644 2099-internal-bytealg-vector-implementation-of-equal-for-.patch create mode 100644 2100-internal-bytealg-vector-implementation-of-indexbyte-.patch create mode 100644 2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch create mode 100644 2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch create mode 100644 2103-cmd-compile-add-generic-simplifications-on-riscv64.patch create mode 100644 2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch create mode 100644 2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch create mode 100644 2106-internal-bytealg-vector-implementation-of-compare-fo.patch create mode 100644 2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch create mode 100644 2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch create mode 100644 2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch create mode 100644 2110-cmd-dist-internal-add-GOARM64-environment-variable.patch create mode 100644 2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch create mode 100644 2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch create mode 100644 2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch create mode 100644 2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch create mode 100644 2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch create mode 100644 2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch create mode 100644 2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch create mode 100644 2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch diff --git a/2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch b/2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch new file mode 100644 index 0000000..e3da792 --- /dev/null +++ b/2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch @@ -0,0 +1,335 @@ +From 4c97a50488b7e40651b55e440792a2840a6269db Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 001/120] cmd/asm,cmd/internal/obj/riscv,cmd/link: improve TLS + handling on riscv64 + +The existing Thread Local Storage (TLS) implementation for riscv64 uses +initial-exec (IE) mode, however a MOV of a TLS symbol currently loads the +thread pointer offset and not the actual address or memory location. + +Rework TLS on riscv64 to generate the full instruction sequence needed to +load from or store to a TLS symbol. Additionally, provide support for both +initial-exec (IE) and local-exec (LE) TLS - in many cases we can use LE, +which is slightly more efficient and easier to support in the linker. 
+ +Change-Id: I1b43f8888b3b6b10354bbb79d604771e64d92645 +Reviewed-on: https://go-review.googlesource.com/c/go/+/431103 +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: David Chase +Run-TryBot: Joel Sing +--- + src/cmd/asm/internal/asm/endtoend_test.go | 5 ++ + src/cmd/asm/internal/asm/testdata/riscv64.s | 10 ++++ + src/cmd/internal/obj/riscv/obj.go | 63 +++++++++++++++++++-- + src/cmd/internal/objabi/reloctype.go | 16 +++--- + src/cmd/internal/objabi/reloctype_string.go | 8 +-- + src/cmd/link/internal/riscv64/asm.go | 22 ++++--- + src/runtime/tls_riscv64.s | 11 +--- + 7 files changed, 103 insertions(+), 32 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/endtoend_test.go b/src/cmd/asm/internal/asm/endtoend_test.go +index ef41667c8e..02bc6b7923 100644 +--- a/src/cmd/asm/internal/asm/endtoend_test.go ++++ b/src/cmd/asm/internal/asm/endtoend_test.go +@@ -68,6 +68,11 @@ Diff: + continue + } + ++ // Ignore GLOBL. ++ if strings.HasPrefix(line, "GLOBL ") { ++ continue ++ } ++ + // The general form of a test input line is: + // // comment + // INST args [// printed form] [// hex encoding] +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 53538320f0..9899ec9e7b 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -354,6 +354,14 @@ start: + MOVD F0, 4(X5) // 27b20200 + MOVD F0, F1 // d3000022 + ++ // TLS load with local-exec (LUI + ADDIW + ADD of TP + load) ++ MOV tls(SB), X5 // b70f00009b8f0f00b38f4f0083b20f00 ++ MOVB tls(SB), X5 // b70f00009b8f0f00b38f4f0083820f00 ++ ++ // TLS store with local-exec (LUI + ADDIW + ADD of TP + store) ++ MOV X5, tls(SB) // b70f00009b8f0f00b38f4f0023b05f00 ++ MOVB X5, tls(SB) // b70f00009b8f0f00b38f4f0023805f00 ++ + // NOT pseudo-instruction + NOT X5 // 93c2f2ff + NOT X5, X6 // 13c3f2ff +@@ -407,3 +415,5 @@ start: + FLTD F0, F1, X5 // d39200a2 + FLED F0, F1, X5 // d38200a2 + FEQD F0, F1, X5 // d3a200a2 ++ ++GLOBL tls(SB), TLSBSS, $8 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 43fa7351bf..2e55fac812 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1827,6 +1827,53 @@ func instructionsForStore(p *obj.Prog, as obj.As, rd int16) []*instruction { + return []*instruction{insLUI, insADD, ins} + } + ++func instructionsForTLS(p *obj.Prog, ins *instruction) []*instruction { ++ insAddTP := &instruction{as: AADD, rd: REG_TMP, rs1: REG_TMP, rs2: REG_TP} ++ ++ var inss []*instruction ++ if p.Ctxt.Flag_shared { ++ // TLS initial-exec mode - load TLS offset from GOT, add the thread pointer ++ // register, then load from or store to the resulting memory location. ++ insAUIPC := &instruction{as: AAUIPC, rd: REG_TMP} ++ insLoadTLSOffset := &instruction{as: ALD, rd: REG_TMP, rs1: REG_TMP} ++ inss = []*instruction{insAUIPC, insLoadTLSOffset, insAddTP, ins} ++ } else { ++ // TLS local-exec mode - load upper TLS offset, add the lower TLS offset, ++ // add the thread pointer register, then load from or store to the resulting ++ // memory location. Note that this differs from the suggested three ++ // instruction sequence, as the Go linker does not currently have an ++ // easy way to handle relocation across 12 bytes of machine code. 
++ insLUI := &instruction{as: ALUI, rd: REG_TMP} ++ insADDIW := &instruction{as: AADDIW, rd: REG_TMP, rs1: REG_TMP} ++ inss = []*instruction{insLUI, insADDIW, insAddTP, ins} ++ } ++ return inss ++} ++ ++func instructionsForTLSLoad(p *obj.Prog) []*instruction { ++ if p.From.Sym.Type != objabi.STLSBSS { ++ p.Ctxt.Diag("%v: %v is not a TLS symbol", p, p.From.Sym) ++ return nil ++ } ++ ++ ins := instructionForProg(p) ++ ins.as, ins.rs1, ins.rs2, ins.imm = movToLoad(p.As), REG_TMP, obj.REG_NONE, 0 ++ ++ return instructionsForTLS(p, ins) ++} ++ ++func instructionsForTLSStore(p *obj.Prog) []*instruction { ++ if p.To.Sym.Type != objabi.STLSBSS { ++ p.Ctxt.Diag("%v: %v is not a TLS symbol", p, p.To.Sym) ++ return nil ++ } ++ ++ ins := instructionForProg(p) ++ ins.as, ins.rd, ins.rs1, ins.rs2, ins.imm = movToStore(p.As), REG_TMP, uint32(p.From.Reg), obj.REG_NONE, 0 ++ ++ return instructionsForTLS(p, ins) ++} ++ + // instructionsForMOV returns the machine instructions for an *obj.Prog that + // uses a MOV pseudo-instruction. + func instructionsForMOV(p *obj.Prog) []*instruction { +@@ -1939,6 +1986,10 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + inss = instructionsForLoad(p, movToLoad(p.As), addrToReg(p.From)) + + case obj.NAME_EXTERN, obj.NAME_STATIC: ++ if p.From.Sym.Type == objabi.STLSBSS { ++ return instructionsForTLSLoad(p) ++ } ++ + // Note that the values for $off_hi and $off_lo are currently + // zero and will be assigned during relocation. + // +@@ -1966,6 +2017,10 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + inss = instructionsForStore(p, movToStore(p.As), addrToReg(p.To)) + + case obj.NAME_EXTERN, obj.NAME_STATIC: ++ if p.To.Sym.Type == objabi.STLSBSS { ++ return instructionsForTLSStore(p) ++ } ++ + // Note that the values for $off_hi and $off_lo are currently + // zero and will be assigned during relocation. + // +@@ -2244,10 +2299,10 @@ func assemble(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + break + } + if addr.Sym.Type == objabi.STLSBSS { +- if rt == objabi.R_RISCV_PCREL_ITYPE { +- rt = objabi.R_RISCV_TLS_IE_ITYPE +- } else if rt == objabi.R_RISCV_PCREL_STYPE { +- rt = objabi.R_RISCV_TLS_IE_STYPE ++ if ctxt.Flag_shared { ++ rt = objabi.R_RISCV_TLS_IE ++ } else { ++ rt = objabi.R_RISCV_TLS_LE + } + } + +diff --git a/src/cmd/internal/objabi/reloctype.go b/src/cmd/internal/objabi/reloctype.go +index 996c300d95..3eaa5824e6 100644 +--- a/src/cmd/internal/objabi/reloctype.go ++++ b/src/cmd/internal/objabi/reloctype.go +@@ -269,21 +269,21 @@ const ( + // only used by the linker and are not emitted by the compiler or assembler. + R_RISCV_CALL_TRAMP + +- // R_RISCV_PCREL_ITYPE resolves a 32-bit PC-relative address using an ++ // R_RISCV_PCREL_ITYPE resolves a 32 bit PC-relative address using an + // AUIPC + I-type instruction pair. + R_RISCV_PCREL_ITYPE + +- // R_RISCV_PCREL_STYPE resolves a 32-bit PC-relative address using an ++ // R_RISCV_PCREL_STYPE resolves a 32 bit PC-relative address using an + // AUIPC + S-type instruction pair. + R_RISCV_PCREL_STYPE + +- // R_RISCV_TLS_IE_ITYPE resolves a 32-bit TLS initial-exec TOC offset +- // address using an AUIPC + I-type instruction pair. +- R_RISCV_TLS_IE_ITYPE ++ // R_RISCV_TLS_IE resolves a 32 bit TLS initial-exec address using an ++ // AUIPC + I-type instruction pair. ++ R_RISCV_TLS_IE + +- // R_RISCV_TLS_IE_STYPE resolves a 32-bit TLS initial-exec TOC offset +- // address using an AUIPC + S-type instruction pair. 
+- R_RISCV_TLS_IE_STYPE ++ // R_RISCV_TLS_LE resolves a 32 bit TLS local-exec address using an ++ // LUI + I-type instruction sequence. ++ R_RISCV_TLS_LE + + // R_PCRELDBL relocates s390x 2-byte aligned PC-relative addresses. + // TODO(mundaym): remove once variants can be serialized - see issue 14218. +diff --git a/src/cmd/internal/objabi/reloctype_string.go b/src/cmd/internal/objabi/reloctype_string.go +index c7441efa28..bc8fb6b73c 100644 +--- a/src/cmd/internal/objabi/reloctype_string.go ++++ b/src/cmd/internal/objabi/reloctype_string.go +@@ -71,8 +71,8 @@ func _() { + _ = x[R_RISCV_CALL_TRAMP-61] + _ = x[R_RISCV_PCREL_ITYPE-62] + _ = x[R_RISCV_PCREL_STYPE-63] +- _ = x[R_RISCV_TLS_IE_ITYPE-64] +- _ = x[R_RISCV_TLS_IE_STYPE-65] ++ _ = x[R_RISCV_TLS_IE-64] ++ _ = x[R_RISCV_TLS_LE-65] + _ = x[R_PCRELDBL-66] + _ = x[R_ADDRLOONG64-67] + _ = x[R_ADDRLOONG64U-68] +@@ -91,9 +91,9 @@ func _() { + _ = x[R_INITORDER-81] + } + +-const _RelocType_name = "R_ADDRR_ADDRPOWERR_ADDRARM64R_ADDRMIPSR_ADDROFFR_SIZER_CALLR_CALLARMR_CALLARM64R_CALLINDR_CALLPOWERR_CALLMIPSR_CONSTR_PCRELR_TLS_LER_TLS_IER_GOTOFFR_PLT0R_PLT1R_PLT2R_USEFIELDR_USETYPER_USEIFACER_USEIFACEMETHODR_USEGENERICIFACEMETHODR_METHODOFFR_KEEPR_POWER_TOCR_GOTPCRELR_JMPMIPSR_DWARFSECREFR_DWARFFILEREFR_ARM64_TLS_LER_ARM64_TLS_IER_ARM64_GOTPCRELR_ARM64_GOTR_ARM64_PCRELR_ARM64_PCREL_LDST8R_ARM64_PCREL_LDST16R_ARM64_PCREL_LDST32R_ARM64_PCREL_LDST64R_ARM64_LDST8R_ARM64_LDST16R_ARM64_LDST32R_ARM64_LDST64R_ARM64_LDST128R_POWER_TLS_LER_POWER_TLS_IER_POWER_TLSR_POWER_TLS_IE_PCREL34R_POWER_TLS_LE_TPREL34R_ADDRPOWER_DSR_ADDRPOWER_GOTR_ADDRPOWER_GOT_PCREL34R_ADDRPOWER_PCRELR_ADDRPOWER_TOCRELR_ADDRPOWER_TOCREL_DSR_ADDRPOWER_D34R_ADDRPOWER_PCREL34R_RISCV_CALLR_RISCV_CALL_TRAMPR_RISCV_PCREL_ITYPER_RISCV_PCREL_STYPER_RISCV_TLS_IE_ITYPER_RISCV_TLS_IE_STYPER_PCRELDBLR_ADDRLOONG64R_ADDRLOONG64UR_ADDRLOONG64TLSR_ADDRLOONG64TLSUR_CALLLOONG64R_LOONG64_TLS_IE_PCREL_HIR_LOONG64_TLS_IE_LOR_JMPLOONG64R_ADDRMIPSUR_ADDRMIPSTLSR_ADDRCUOFFR_WASMIMPORTR_XCOFFREFR_PEIMAGEOFFR_INITORDER" ++const _RelocType_name = "R_ADDRR_ADDRPOWERR_ADDRARM64R_ADDRMIPSR_ADDROFFR_SIZER_CALLR_CALLARMR_CALLARM64R_CALLINDR_CALLPOWERR_CALLMIPSR_CONSTR_PCRELR_TLS_LER_TLS_IER_GOTOFFR_PLT0R_PLT1R_PLT2R_USEFIELDR_USETYPER_USEIFACER_USEIFACEMETHODR_USEGENERICIFACEMETHODR_METHODOFFR_KEEPR_POWER_TOCR_GOTPCRELR_JMPMIPSR_DWARFSECREFR_DWARFFILEREFR_ARM64_TLS_LER_ARM64_TLS_IER_ARM64_GOTPCRELR_ARM64_GOTR_ARM64_PCRELR_ARM64_PCREL_LDST8R_ARM64_PCREL_LDST16R_ARM64_PCREL_LDST32R_ARM64_PCREL_LDST64R_ARM64_LDST8R_ARM64_LDST16R_ARM64_LDST32R_ARM64_LDST64R_ARM64_LDST128R_POWER_TLS_LER_POWER_TLS_IER_POWER_TLSR_POWER_TLS_IE_PCREL34R_POWER_TLS_LE_TPREL34R_ADDRPOWER_DSR_ADDRPOWER_GOTR_ADDRPOWER_GOT_PCREL34R_ADDRPOWER_PCRELR_ADDRPOWER_TOCRELR_ADDRPOWER_TOCREL_DSR_ADDRPOWER_D34R_ADDRPOWER_PCREL34R_RISCV_CALLR_RISCV_CALL_TRAMPR_RISCV_PCREL_ITYPER_RISCV_PCREL_STYPER_RISCV_TLS_IER_RISCV_TLS_LER_PCRELDBLR_ADDRLOONG64R_ADDRLOONG64UR_ADDRLOONG64TLSR_ADDRLOONG64TLSUR_CALLLOONG64R_LOONG64_TLS_IE_PCREL_HIR_LOONG64_TLS_IE_LOR_JMPLOONG64R_ADDRMIPSUR_ADDRMIPSTLSR_ADDRCUOFFR_WASMIMPORTR_XCOFFREFR_PEIMAGEOFFR_INITORDER" + +-var _RelocType_index = [...]uint16{0, 6, 17, 28, 38, 47, 53, 59, 68, 79, 88, 99, 109, 116, 123, 131, 139, 147, 153, 159, 165, 175, 184, 194, 210, 233, 244, 250, 261, 271, 280, 293, 307, 321, 335, 351, 362, 375, 394, 414, 434, 454, 467, 481, 495, 509, 524, 538, 552, 563, 585, 607, 621, 636, 659, 676, 694, 715, 730, 749, 761, 779, 798, 817, 837, 857, 867, 880, 894, 910, 927, 940, 965, 984, 996, 1007, 1020, 1031, 1043, 
1053, 1065, 1076} ++var _RelocType_index = [...]uint16{0, 6, 17, 28, 38, 47, 53, 59, 68, 79, 88, 99, 109, 116, 123, 131, 139, 147, 153, 159, 165, 175, 184, 194, 210, 233, 244, 250, 261, 271, 280, 293, 307, 321, 335, 351, 362, 375, 394, 414, 434, 454, 467, 481, 495, 509, 524, 538, 552, 563, 585, 607, 621, 636, 659, 676, 694, 715, 730, 749, 761, 779, 798, 817, 831, 845, 855, 868, 882, 898, 915, 928, 953, 972, 984, 995, 1008, 1019, 1031, 1041, 1053, 1064} + + func (i RelocType) String() string { + i -= 1 +diff --git a/src/cmd/link/internal/riscv64/asm.go b/src/cmd/link/internal/riscv64/asm.go +index 6b5c0cbe5a..f3186398eb 100644 +--- a/src/cmd/link/internal/riscv64/asm.go ++++ b/src/cmd/link/internal/riscv64/asm.go +@@ -39,7 +39,7 @@ func genSymsLate(ctxt *ld.Link, ldr *loader.Loader) { + for ri := 0; ri < relocs.Count(); ri++ { + r := relocs.At(ri) + if r.Type() != objabi.R_RISCV_PCREL_ITYPE && r.Type() != objabi.R_RISCV_PCREL_STYPE && +- r.Type() != objabi.R_RISCV_TLS_IE_ITYPE && r.Type() != objabi.R_RISCV_TLS_IE_STYPE { ++ r.Type() != objabi.R_RISCV_TLS_IE { + continue + } + if r.Off() == 0 && ldr.SymType(s) == sym.STEXT { +@@ -101,7 +101,7 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, + out.Write64(uint64(elf.R_RISCV_JAL) | uint64(elfsym)<<32) + out.Write64(uint64(r.Xadd)) + +- case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE_ITYPE, objabi.R_RISCV_TLS_IE_STYPE: ++ case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE: + // Find the text symbol for the AUIPC instruction targeted + // by this relocation. + relocs := ldr.Relocs(s) +@@ -127,10 +127,8 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, + hiRel, loRel = elf.R_RISCV_PCREL_HI20, elf.R_RISCV_PCREL_LO12_I + case objabi.R_RISCV_PCREL_STYPE: + hiRel, loRel = elf.R_RISCV_PCREL_HI20, elf.R_RISCV_PCREL_LO12_S +- case objabi.R_RISCV_TLS_IE_ITYPE: ++ case objabi.R_RISCV_TLS_IE: + hiRel, loRel = elf.R_RISCV_TLS_GOT_HI20, elf.R_RISCV_PCREL_LO12_I +- case objabi.R_RISCV_TLS_IE_STYPE: +- hiRel, loRel = elf.R_RISCV_TLS_GOT_HI20, elf.R_RISCV_PCREL_LO12_S + } + out.Write64(uint64(sectoff)) + out.Write64(uint64(hiRel) | uint64(elfsym)<<32) +@@ -139,6 +137,14 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, + out.Write64(uint64(loRel) | uint64(hi20ElfSym)<<32) + out.Write64(uint64(0)) + ++ case objabi.R_RISCV_TLS_LE: ++ out.Write64(uint64(sectoff)) ++ out.Write64(uint64(elf.R_RISCV_TPREL_HI20) | uint64(elfsym)<<32) ++ out.Write64(uint64(r.Xadd)) ++ out.Write64(uint64(sectoff + 4)) ++ out.Write64(uint64(elf.R_RISCV_TPREL_LO12_I) | uint64(elfsym)<<32) ++ out.Write64(uint64(r.Xadd)) ++ + default: + return false + } +@@ -189,7 +195,7 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade + case objabi.R_RISCV_CALL, objabi.R_RISCV_CALL_TRAMP: + return val, 1, true + +- case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE_ITYPE, objabi.R_RISCV_TLS_IE_STYPE: ++ case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE, objabi.R_RISCV_TLS_LE: + return val, 2, true + } + +@@ -211,7 +217,7 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade + + return val, 0, true + +- case objabi.R_RISCV_TLS_IE_ITYPE, objabi.R_RISCV_TLS_IE_STYPE: ++ case objabi.R_RISCV_TLS_IE, objabi.R_RISCV_TLS_LE: + // TLS relocations are not currently handled for internal linking. 
+ // For now, TLS is only used when cgo is in use and cgo currently + // requires external linking. However, we need to accept these +@@ -273,7 +279,7 @@ func extreloc(target *ld.Target, ldr *loader.Loader, r loader.Reloc, s loader.Sy + case objabi.R_RISCV_CALL, objabi.R_RISCV_CALL_TRAMP: + return ld.ExtrelocSimple(ldr, r), true + +- case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE_ITYPE, objabi.R_RISCV_TLS_IE_STYPE: ++ case objabi.R_RISCV_PCREL_ITYPE, objabi.R_RISCV_PCREL_STYPE, objabi.R_RISCV_TLS_IE, objabi.R_RISCV_TLS_LE: + return ld.ExtrelocViaOuterSym(ldr, r, s), true + } + return loader.ExtReloc{}, false +diff --git a/src/runtime/tls_riscv64.s b/src/runtime/tls_riscv64.s +index 397919aeba..a0a58ea4a0 100644 +--- a/src/runtime/tls_riscv64.s ++++ b/src/runtime/tls_riscv64.s +@@ -12,19 +12,14 @@ + // NOTE: mcall() assumes this clobbers only X31 (REG_TMP). + TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0 + MOVB runtime·iscgo(SB), X31 +- BEQ X0, X31, nocgo +- +- MOV runtime·tls_g(SB), X31 +- ADD TP, X31 // add offset to thread pointer (X4) +- MOV g, (X31) ++ BEQZ X31, nocgo + ++ MOV g, runtime·tls_g(SB) + nocgo: + RET + + TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0 +- MOV runtime·tls_g(SB), X31 +- ADD TP, X31 // add offset to thread pointer (X4) +- MOV (X31), g ++ MOV runtime·tls_g(SB), g + RET + + GLOBL runtime·tls_g(SB), TLSBSS, $8 +-- +2.39.5 + diff --git a/2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch b/2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch new file mode 100644 index 0000000..cca54e3 --- /dev/null +++ b/2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch @@ -0,0 +1,209 @@ +From f1ab206096dedb2c0920ae2aa154323d443b2c65 Mon Sep 17 00:00:00 2001 +From: Junxian Zhu +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 001/119] cmd/compile: fold most repetitive operations to + simplify riscv64 rules + +Most of repetitive rules in riscv64 are simple, so that we can simplify and fold it with | without losting rules readability. + +No change in the actual compiler code after running rulegen. + +Change-Id: Id0bbfd93e63b49b7f66ecb62eb9440b4900c7938 +Reviewed-on: https://go-review.googlesource.com/c/go/+/498455 +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Michael Knyszek +Reviewed-by: M Zhuo +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 109 +++++------------- + 1 file changed, 28 insertions(+), 81 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 9a6fcebdc5..d90427132c 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -3,21 +3,11 @@ + // license that can be found in the LICENSE file. + + // Lowering arithmetic +-(Add64 ...) => (ADD ...) +-(AddPtr ...) => (ADD ...) +-(Add32 ...) => (ADD ...) +-(Add16 ...) => (ADD ...) +-(Add8 ...) => (ADD ...) +-(Add32F ...) => (FADDS ...) +-(Add64F ...) => (FADDD ...) +- +-(Sub64 ...) => (SUB ...) +-(SubPtr ...) => (SUB ...) +-(Sub32 ...) => (SUB ...) +-(Sub16 ...) => (SUB ...) +-(Sub8 ...) => (SUB ...) +-(Sub32F ...) => (FSUBS ...) +-(Sub64F ...) => (FSUBD ...) ++(Add(Ptr|64|32|16|8) ...) => (ADD ...) ++(Add(64|32)F ...) => (FADD(D|S) ...) ++ ++(Sub(Ptr|64|32|16|8) ...) => (SUB ...) ++(Sub(64|32)F ...) => (FSUB(D|S) ...) + + (Mul64 ...) => (MUL ...) + (Mul64uhilo ...) => (LoweredMuluhilo ...) +@@ -25,11 +15,9 @@ + (Mul32 ...) => (MULW ...) 
+ (Mul16 x y) => (MULW (SignExt16to32 x) (SignExt16to32 y)) + (Mul8 x y) => (MULW (SignExt8to32 x) (SignExt8to32 y)) +-(Mul32F ...) => (FMULS ...) +-(Mul64F ...) => (FMULD ...) ++(Mul(64|32)F ...) => (FMUL(D|S) ...) + +-(Div32F ...) => (FDIVS ...) +-(Div64F ...) => (FDIVD ...) ++(Div(64|32)F ...) => (FDIV(D|S) ...) + + (Div64 x y [false]) => (DIV x y) + (Div64u ...) => (DIVU ...) +@@ -65,32 +53,15 @@ + (Mod8 x y) => (REMW (SignExt8to32 x) (SignExt8to32 y)) + (Mod8u x y) => (REMUW (ZeroExt8to32 x) (ZeroExt8to32 y)) + +-(And64 ...) => (AND ...) +-(And32 ...) => (AND ...) +-(And16 ...) => (AND ...) +-(And8 ...) => (AND ...) +- +-(Or64 ...) => (OR ...) +-(Or32 ...) => (OR ...) +-(Or16 ...) => (OR ...) +-(Or8 ...) => (OR ...) +- +-(Xor64 ...) => (XOR ...) +-(Xor32 ...) => (XOR ...) +-(Xor16 ...) => (XOR ...) +-(Xor8 ...) => (XOR ...) +- +-(Neg64 ...) => (NEG ...) +-(Neg32 ...) => (NEG ...) +-(Neg16 ...) => (NEG ...) +-(Neg8 ...) => (NEG ...) +-(Neg32F ...) => (FNEGS ...) +-(Neg64F ...) => (FNEGD ...) +- +-(Com64 ...) => (NOT ...) +-(Com32 ...) => (NOT ...) +-(Com16 ...) => (NOT ...) +-(Com8 ...) => (NOT ...) ++(And(64|32|16|8) ...) => (AND ...) ++(Or(64|32|16|8) ...) => (OR ...) ++(Xor(64|32|16|8) ...) => (XOR ...) ++ ++(Neg(64|32|16|8) ...) => (NEG ...) ++(Neg(64|32)F ...) => (FNEG(D|S) ...) ++ ++(Com(64|32|16|8) ...) => (NOT ...) ++ + + (Sqrt ...) => (FSQRTD ...) + (Sqrt32 ...) => (FSQRTS ...) +@@ -132,8 +103,7 @@ + + (CvtBoolToUint8 ...) => (Copy ...) + +-(Round32F ...) => (Copy ...) +-(Round64F ...) => (Copy ...) ++(Round(64|32)F ...) => (Copy ...) + + (Slicemask x) => (SRAI [63] (NEG x)) + +@@ -250,36 +220,26 @@ + (Less32U x y) => (SLTU (ZeroExt32to64 x) (ZeroExt32to64 y)) + (Less16U x y) => (SLTU (ZeroExt16to64 x) (ZeroExt16to64 y)) + (Less8U x y) => (SLTU (ZeroExt8to64 x) (ZeroExt8to64 y)) +-(Less64F ...) => (FLTD ...) +-(Less32F ...) => (FLTS ...) ++(Less(64|32)F ...) => (FLT(D|S) ...) + + // Convert x <= y to !(y > x). +-(Leq64 x y) => (Not (Less64 y x)) +-(Leq32 x y) => (Not (Less32 y x)) +-(Leq16 x y) => (Not (Less16 y x)) +-(Leq8 x y) => (Not (Less8 y x)) +-(Leq64U x y) => (Not (Less64U y x)) +-(Leq32U x y) => (Not (Less32U y x)) +-(Leq16U x y) => (Not (Less16U y x)) +-(Leq8U x y) => (Not (Less8U y x)) +-(Leq64F ...) => (FLED ...) +-(Leq32F ...) => (FLES ...) ++(Leq(64|32|16|8) x y) => (Not (Less(64|32|16|8) y x)) ++(Leq(64|32|16|8)U x y) => (Not (Less(64|32|16|8)U y x)) ++(Leq(64|32)F ...) => (FLE(D|S) ...) + + (EqPtr x y) => (SEQZ (SUB x y)) + (Eq64 x y) => (SEQZ (SUB x y)) + (Eq32 x y) => (SEQZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) + (Eq16 x y) => (SEQZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) + (Eq8 x y) => (SEQZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) +-(Eq64F ...) => (FEQD ...) +-(Eq32F ...) => (FEQS ...) ++(Eq(64|32)F ...) => (FEQ(D|S) ...) + + (NeqPtr x y) => (SNEZ (SUB x y)) + (Neq64 x y) => (SNEZ (SUB x y)) + (Neq32 x y) => (SNEZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) + (Neq16 x y) => (SNEZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) + (Neq8 x y) => (SNEZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) +-(Neq64F ...) => (FNED ...) +-(Neq32F ...) => (FNES ...) ++(Neq(64|32)F ...) => (FNE(D|S) ...) 
+ + // Loads + (Load ptr mem) && t.IsBoolean() => (MOVBUload ptr mem) +@@ -537,10 +497,7 @@ + (OffPtr [off] ptr) && is32Bit(off) => (ADDI [off] ptr) + (OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr) + +-(Const8 [val]) => (MOVDconst [int64(val)]) +-(Const16 [val]) => (MOVDconst [int64(val)]) +-(Const32 [val]) => (MOVDconst [int64(val)]) +-(Const64 [val]) => (MOVDconst [int64(val)]) ++(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)]) + (Const32F [val]) => (FMVSX (MOVDconst [int64(math.Float32bits(val))])) + (Const64F [val]) => (FMVDX (MOVDconst [int64(math.Float64bits(val))])) + (ConstNil) => (MOVDconst [0]) +@@ -557,18 +514,9 @@ + (TailCall ...) => (CALLtail ...) + + // Atomic Intrinsics +-(AtomicLoad8 ...) => (LoweredAtomicLoad8 ...) +-(AtomicLoad32 ...) => (LoweredAtomicLoad32 ...) +-(AtomicLoad64 ...) => (LoweredAtomicLoad64 ...) +-(AtomicLoadPtr ...) => (LoweredAtomicLoad64 ...) +- +-(AtomicStore8 ...) => (LoweredAtomicStore8 ...) +-(AtomicStore32 ...) => (LoweredAtomicStore32 ...) +-(AtomicStore64 ...) => (LoweredAtomicStore64 ...) +-(AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...) +- +-(AtomicAdd32 ...) => (LoweredAtomicAdd32 ...) +-(AtomicAdd64 ...) => (LoweredAtomicAdd64 ...) ++(AtomicLoad(Ptr|64|32|8) ...) => (LoweredAtomicLoad(64|64|32|8) ...) ++(AtomicStore(PtrNoWB|64|32|8) ...) => (LoweredAtomicStore(64|64|32|8) ...) ++(AtomicAdd(64|32) ...) => (LoweredAtomicAdd(64|32) ...) + + // AtomicAnd8(ptr,val) => LoweredAtomicAnd32(ptr&^3, ^((uint8(val) ^ 0xff) << ((ptr & 3) * 8))) + (AtomicAnd8 ptr val mem) => +@@ -581,8 +529,7 @@ + (AtomicCompareAndSwap32 ptr old new mem) => (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem) + (AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...) + +-(AtomicExchange32 ...) => (LoweredAtomicExchange32 ...) +-(AtomicExchange64 ...) => (LoweredAtomicExchange64 ...) ++(AtomicExchange(64|32) ...) => (LoweredAtomicExchange(64|32) ...) 
+ + // AtomicOr8(ptr,val) => LoweredAtomicOr32(ptr&^3, uint32(val)<<((ptr&3)*8)) + (AtomicOr8 ptr val mem) => +-- +2.39.5 + diff --git a/2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch b/2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch new file mode 100644 index 0000000..49c6a49 --- /dev/null +++ b/2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch @@ -0,0 +1,169 @@ +From c63ac393ef890036d861a284e7404e1758b40113 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 002/119] crypto/internal/bigmod: provide assembly addMulVVW* + for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of addMulVVW* for riscv64, +processing four words per loop, resulting in a performance gain +of 23%+ for RSA decryption/signing on a StarFive VisionFive 2: + + │ rsa1 │ rsa2 │ + │ sec/op │ sec/op vs base │ +DecryptPKCS1v15/2048-4 24.29m ± 0% 18.65m ± 0% -23.24% (p=0.000 n=10) +DecryptPKCS1v15/3072-4 73.28m ± 0% 54.08m ± 0% -26.20% (p=0.000 n=10) +DecryptPKCS1v15/4096-4 163.5m ± 0% 119.1m ± 0% -27.17% (p=0.000 n=10) +EncryptPKCS1v15/2048-4 1.505m ± 0% 1.446m ± 0% -3.93% (p=0.000 n=10) +DecryptOAEP/2048-4 24.37m ± 0% 18.72m ± 0% -23.17% (p=0.000 n=10) +EncryptOAEP/2048-4 1.570m ± 0% 1.510m ± 0% -3.84% (p=0.000 n=10) +SignPKCS1v15/2048-4 24.52m ± 0% 18.80m ± 0% -23.36% (p=0.000 n=10) +VerifyPKCS1v15/2048-4 1.491m ± 0% 1.431m ± 0% -4.00% (p=0.000 n=10) +SignPSS/2048-4 24.60m ± 0% 18.89m ± 0% -23.21% (p=0.000 n=10) +VerifyPSS/2048-4 1.565m ± 0% 1.504m ± 0% -3.87% (p=0.000 n=10) +geomean 10.90m 9.066m -16.79% + +Change-Id: I8414ba0028b0781a945610abe02c285d2387aef3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/516536 +Reviewed-by: Mark Ryan +Reviewed-by: Filippo Valsorda +Reviewed-by: Dmitri Shuralyov +Reviewed-by: M Zhuo +Reviewed-by: Michael Knyszek +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +--- + src/crypto/internal/bigmod/nat_asm.go | 2 +- + src/crypto/internal/bigmod/nat_noasm.go | 2 +- + src/crypto/internal/bigmod/nat_riscv64.s | 91 ++++++++++++++++++++++++ + 3 files changed, 93 insertions(+), 2 deletions(-) + create mode 100644 src/crypto/internal/bigmod/nat_riscv64.s + +diff --git a/src/crypto/internal/bigmod/nat_asm.go b/src/crypto/internal/bigmod/nat_asm.go +index 5eb91e1c6c..0283b07e68 100644 +--- a/src/crypto/internal/bigmod/nat_asm.go ++++ b/src/crypto/internal/bigmod/nat_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !purego && (386 || amd64 || arm || arm64 || ppc64 || ppc64le || s390x) ++//go:build !purego && (386 || amd64 || arm || arm64 || ppc64 || ppc64le || riscv64 || s390x) + + package bigmod + +diff --git a/src/crypto/internal/bigmod/nat_noasm.go b/src/crypto/internal/bigmod/nat_noasm.go +index eff12536f9..71f38da754 100644 +--- a/src/crypto/internal/bigmod/nat_noasm.go ++++ b/src/crypto/internal/bigmod/nat_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
+ +-//go:build purego || !(386 || amd64 || arm || arm64 || ppc64 || ppc64le || s390x) ++//go:build purego || !(386 || amd64 || arm || arm64 || ppc64 || ppc64le || riscv64 || s390x) + + package bigmod + +diff --git a/src/crypto/internal/bigmod/nat_riscv64.s b/src/crypto/internal/bigmod/nat_riscv64.s +new file mode 100644 +index 0000000000..1d8c8c8900 +--- /dev/null ++++ b/src/crypto/internal/bigmod/nat_riscv64.s +@@ -0,0 +1,91 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build !purego ++ ++#include "textflag.h" ++ ++// func addMulVVW1024(z, x *uint, y uint) (c uint) ++TEXT ·addMulVVW1024(SB),$0-32 ++ MOV $16, X30 ++ JMP addMulVVWx(SB) ++ ++// func addMulVVW1536(z, x *uint, y uint) (c uint) ++TEXT ·addMulVVW1536(SB),$0-32 ++ MOV $24, X30 ++ JMP addMulVVWx(SB) ++ ++// func addMulVVW2048(z, x *uint, y uint) (c uint) ++TEXT ·addMulVVW2048(SB),$0-32 ++ MOV $32, X30 ++ JMP addMulVVWx(SB) ++ ++TEXT addMulVVWx(SB),NOFRAME|NOSPLIT,$0 ++ MOV z+0(FP), X5 ++ MOV x+8(FP), X7 ++ MOV y+16(FP), X6 ++ MOV $0, X29 ++ ++ BEQZ X30, done ++loop: ++ MOV 0*8(X5), X10 // z[0] ++ MOV 1*8(X5), X13 // z[1] ++ MOV 2*8(X5), X16 // z[2] ++ MOV 3*8(X5), X19 // z[3] ++ ++ MOV 0*8(X7), X8 // x[0] ++ MOV 1*8(X7), X11 // x[1] ++ MOV 2*8(X7), X14 // x[2] ++ MOV 3*8(X7), X17 // x[3] ++ ++ MULHU X8, X6, X9 // z_hi[0] = x[0] * y ++ MUL X8, X6, X8 // z_lo[0] = x[0] * y ++ ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0] ++ SLTU X8, X21, X22 ++ ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0] ++ ADD X21, X29, X10 // z_lo[0] = x[0] * y + z[0] + c ++ SLTU X21, X10, X22 ++ ADD X9, X22, X29 // next c ++ ++ MULHU X11, X6, X12 // z_hi[1] = x[1] * y ++ MUL X11, X6, X11 // z_lo[1] = x[1] * y ++ ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1] ++ SLTU X11, X21, X22 ++ ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1] ++ ADD X21, X29, X13 // z_lo[1] = x[1] * y + z[1] + c ++ SLTU X21, X13, X22 ++ ADD X12, X22, X29 // next c ++ ++ MULHU X14, X6, X15 // z_hi[2] = x[2] * y ++ MUL X14, X6, X14 // z_lo[2] = x[2] * y ++ ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2] ++ SLTU X14, X21, X22 ++ ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2] ++ ADD X21, X29, X16 // z_lo[2] = x[2] * y + z[2] + c ++ SLTU X21, X16, X22 ++ ADD X15, X22, X29 // next c ++ ++ MULHU X17, X6, X18 // z_hi[3] = x[3] * y ++ MUL X17, X6, X17 // z_lo[3] = x[3] * y ++ ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3] ++ SLTU X17, X21, X22 ++ ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3] ++ ADD X21, X29, X19 // z_lo[3] = x[3] * y + z[3] + c ++ SLTU X21, X19, X22 ++ ADD X18, X22, X29 // next c ++ ++ MOV X10, 0*8(X5) // z[0] ++ MOV X13, 1*8(X5) // z[1] ++ MOV X16, 2*8(X5) // z[2] ++ MOV X19, 3*8(X5) // z[3] ++ ++ ADDI $32, X5 ++ ADDI $32, X7 ++ ++ ADDI $-4, X30 ++ BNEZ X30, loop ++ ++done: ++ MOV X29, c+24(FP) ++ RET +-- +2.39.5 + diff --git a/2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch b/2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch new file mode 100644 index 0000000..f42c95c --- /dev/null +++ b/2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch @@ -0,0 +1,242 @@ +From 8d8ed2bb0d3c76380a641adec7ff5ee9a26e000e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 003/119] cmd/compile: sign or zero extend for 32 bit equality + on riscv64 + +For 32 bit equality (Eq32), rather than always zero extending to 64 bits, +sign extend for signed types and zero extend for unsigned types. 
This makes +no difference to the equality test (via SUB), however it increases the +likelihood of avoiding unnecessary sign or zero extension simply for the +purpose of equality testing. + +While here, replace the Neq* rules with (Not (Eq*)) - this makes no +difference to the generated code (as the intermediates get expanded and +eliminated), however it means that changes to the equality rules also +reflect in the inequality rules. + +As an example, the following: + + lw t0,956(t0) + slli t0,t0,0x20 + srli t0,t0,0x20 + li t1,1 + bne t1,t0,278fc + +Becomes: + + lw t0,1024(t0) + li t1,1 + bne t1,t0,278b0 + +Removes almost 1000 instructions from the Go binary on riscv64. + +Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f +Reviewed-on: https://go-review.googlesource.com/c/go/+/516595 +Run-TryBot: Joel Sing +Reviewed-by: Dmitri Shuralyov +TryBot-Result: Gopher Robot +Reviewed-by: M Zhuo +Reviewed-by: Michael Knyszek +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 13 +-- + .../compile/internal/ssa/rewriteRISCV64.go | 101 +++++++++++------- + 2 files changed, 67 insertions(+), 47 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index d90427132c..181b46a7ce 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -229,16 +229,17 @@ + + (EqPtr x y) => (SEQZ (SUB x y)) + (Eq64 x y) => (SEQZ (SUB x y)) +-(Eq32 x y) => (SEQZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) ++(Eq32 x y) && x.Type.IsSigned() => (SEQZ (SUB (SignExt32to64 x) (SignExt32to64 y))) ++(Eq32 x y) && !x.Type.IsSigned() => (SEQZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) + (Eq16 x y) => (SEQZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) + (Eq8 x y) => (SEQZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) + (Eq(64|32)F ...) => (FEQ(D|S) ...) + +-(NeqPtr x y) => (SNEZ (SUB x y)) +-(Neq64 x y) => (SNEZ (SUB x y)) +-(Neq32 x y) => (SNEZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) +-(Neq16 x y) => (SNEZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) +-(Neq8 x y) => (SNEZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) ++(NeqPtr x y) => (Not (EqPtr x y)) ++(Neq64 x y) => (Not (Eq64 x y)) ++(Neq32 x y) => (Not (Eq32 x y)) ++(Neq16 x y) => (Not (Eq16 x y)) ++(Neq8 x y) => (Not (Eq8 x y)) + (Neq(64|32)F ...) => (FNE(D|S) ...) 
+ + // Loads +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index ffbeb1df47..e8002599ef 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -1081,20 +1081,50 @@ func rewriteValueRISCV64_OpEq32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Eq32 x y) ++ // cond: x.Type.IsSigned() ++ // result: (SEQZ (SUB (SignExt32to64 x) (SignExt32to64 y))) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ y := v_1 ++ if !(x.Type.IsSigned()) { ++ continue ++ } ++ v.reset(OpRISCV64SEQZ) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) ++ v1 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) ++ v1.AddArg(x) ++ v2 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) ++ v2.AddArg(y) ++ v0.AddArg2(v1, v2) ++ v.AddArg(v0) ++ return true ++ } ++ break ++ } ++ // match: (Eq32 x y) ++ // cond: !x.Type.IsSigned() + // result: (SEQZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) + for { +- x := v_0 +- y := v_1 +- v.reset(OpRISCV64SEQZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) +- v.AddArg(v0) +- return true ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ y := v_1 ++ if !(!x.Type.IsSigned()) { ++ continue ++ } ++ v.reset(OpRISCV64SEQZ) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) ++ v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v1.AddArg(x) ++ v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v2.AddArg(y) ++ v0.AddArg2(v1, v2) ++ v.AddArg(v0) ++ return true ++ } ++ break + } ++ return false + } + func rewriteValueRISCV64_OpEq64(v *Value) bool { + v_1 := v.Args[1] +@@ -2942,17 +2972,13 @@ func rewriteValueRISCV64_OpNeq16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Neq16 x y) +- // result: (SNEZ (SUB (ZeroExt16to64 x) (ZeroExt16to64 y))) ++ // result: (Not (Eq16 x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) +- v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEq16, typ.Bool) ++ v0.AddArg2(x, y) + v.AddArg(v0) + return true + } +@@ -2963,17 +2989,13 @@ func rewriteValueRISCV64_OpNeq32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Neq32 x y) +- // result: (SNEZ (SUB (ZeroExt32to64 x) (ZeroExt32to64 y))) ++ // result: (Not (Eq32 x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEq32, typ.Bool) ++ v0.AddArg2(x, y) + v.AddArg(v0) + return true + } +@@ -2982,13 +3004,14 @@ func rewriteValueRISCV64_OpNeq64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block ++ typ := &b.Func.Config.Types + // match: (Neq64 x y) +- // result: (SNEZ (SUB x y)) ++ // result: (Not (Eq64 x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEq64, typ.Bool) + 
v0.AddArg2(x, y) + v.AddArg(v0) + return true +@@ -3000,17 +3023,13 @@ func rewriteValueRISCV64_OpNeq8(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Neq8 x y) +- // result: (SNEZ (SUB (ZeroExt8to64 x) (ZeroExt8to64 y))) ++ // result: (Not (Eq8 x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type) +- v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEq8, typ.Bool) ++ v0.AddArg2(x, y) + v.AddArg(v0) + return true + } +@@ -3038,12 +3057,12 @@ func rewriteValueRISCV64_OpNeqPtr(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (NeqPtr x y) +- // result: (SNEZ (SUB x y)) ++ // result: (Not (EqPtr x y)) + for { + x := v_0 + y := v_1 +- v.reset(OpRISCV64SNEZ) +- v0 := b.NewValue0(v.Pos, OpRISCV64SUB, typ.Uintptr) ++ v.reset(OpNot) ++ v0 := b.NewValue0(v.Pos, OpEqPtr, typ.Bool) + v0.AddArg2(x, y) + v.AddArg(v0) + return true +-- +2.39.5 + diff --git a/2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch b/2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch new file mode 100644 index 0000000..a0f302c --- /dev/null +++ b/2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch @@ -0,0 +1,276 @@ +From a2f69cbaaae63c86b4e8f29085414a237c24def4 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 004/119] cmd/compile: improve FP FMA performance on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +FMADD/FMSUB/FNSUB are an efficient FP FMA instructions, which can +be used by the compiler to improve FP performance. + +Erf 188.0n ± 2% 139.5n ± 2% -25.82% (p=0.000 n=10) +Erfc 193.6n ± 1% 143.2n ± 1% -26.01% (p=0.000 n=10) +Erfinv 244.4n ± 2% 172.6n ± 0% -29.40% (p=0.000 n=10) +Erfcinv 244.7n ± 2% 173.0n ± 1% -29.31% (p=0.000 n=10) +geomean 216.0n 156.3n -27.65% + +Ref: The RISC-V Instruction Set Manual Volume I: Unprivileged ISA +11.6 Single-Precision Floating-Point Computational Instructions + +Change-Id: I89aa3a4df7576fdd47f4a6ee608ac16feafd093c +Reviewed-on: https://go-review.googlesource.com/c/go/+/506036 +Reviewed-by: Joel Sing +Run-TryBot: M Zhuo +Reviewed-by: David Chase +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +TryBot-Result: Gopher Robot +--- + src/cmd/compile/internal/riscv64/ssa.go | 3 + + .../compile/internal/ssa/_gen/RISCV64.rules | 5 +- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 4 ++ + src/cmd/compile/internal/ssa/opGen.go | 28 ++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 72 ++++++++++++++++++- + test/codegen/floats.go | 3 + + 6 files changed, 112 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 2eb1e7ffa0..143e7c525a 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -694,6 +694,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Sym = ir.Syms.Duffcopy + p.To.Offset = v.AuxInt + ++ case ssa.OpRISCV64LoweredRound32F, ssa.OpRISCV64LoweredRound64F: ++ // input is already rounded ++ + case ssa.OpClobber, ssa.OpClobberReg: + // TODO: implement for clobberdead experiment. Nop is ok for now. 
+ +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 181b46a7ce..ac68dfed76 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -103,7 +103,7 @@ + + (CvtBoolToUint8 ...) => (Copy ...) + +-(Round(64|32)F ...) => (Copy ...) ++(Round(32|64)F ...) => (LoweredRound(32|64)F ...) + + (Slicemask x) => (SRAI [63] (NEG x)) + +@@ -780,6 +780,9 @@ + (Select0 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MULHU x y) + (Select1 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MUL x y) + ++(FADDD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FMADDD x y a) ++(FSUBD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FNMSUBD x y a) ++(FSUBD (FMULD x y) a) && a.Block.Func.useFMA(v) => (FMSUBD x y a) + // Merge negation into fused multiply-add and multiply-subtract. + // + // Key: +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 52e87cbe72..69f2950a88 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -237,6 +237,10 @@ func init() { + // gets correctly ordered with respect to GC safepoints. + {name: "MOVconvert", argLength: 2, reg: gp11, asm: "MOV"}, // arg0, but converted to int/ptr as appropriate; arg1=mem + ++ // Round ops to block fused-multiply-add extraction. ++ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true}, ++ {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true}, ++ + // Calls + {name: "CALLstatic", argLength: -1, reg: call, aux: "CallOff", call: true}, // call static function aux.(*gc.Sym). last arg=mem, auxint=argsize, returns mem + {name: "CALLtail", argLength: -1, reg: call, aux: "CallOff", call: true, tailCall: true}, // tail call static function aux.(*gc.Sym). 
last arg=mem, auxint=argsize, returns mem +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 1480fcf45b..e838a26f79 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2382,6 +2382,8 @@ const ( + OpRISCV64SLTU + OpRISCV64SLTIU + OpRISCV64MOVconvert ++ OpRISCV64LoweredRound32F ++ OpRISCV64LoweredRound64F + OpRISCV64CALLstatic + OpRISCV64CALLtail + OpRISCV64CALLclosure +@@ -31916,6 +31918,32 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "LoweredRound32F", ++ argLen: 1, ++ resultInArg0: true, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "LoweredRound64F", ++ argLen: 1, ++ resultInArg0: true, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + { + name: "CALLstatic", + auxType: auxCallOff, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index e8002599ef..17af023db3 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -440,6 +440,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64AND(v) + case OpRISCV64ANDI: + return rewriteValueRISCV64_OpRISCV64ANDI(v) ++ case OpRISCV64FADDD: ++ return rewriteValueRISCV64_OpRISCV64FADDD(v) + case OpRISCV64FMADDD: + return rewriteValueRISCV64_OpRISCV64FMADDD(v) + case OpRISCV64FMSUBD: +@@ -448,6 +450,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64FNMADDD(v) + case OpRISCV64FNMSUBD: + return rewriteValueRISCV64_OpRISCV64FNMSUBD(v) ++ case OpRISCV64FSUBD: ++ return rewriteValueRISCV64_OpRISCV64FSUBD(v) + case OpRISCV64MOVBUload: + return rewriteValueRISCV64_OpRISCV64MOVBUload(v) + case OpRISCV64MOVBUreg: +@@ -541,10 +545,10 @@ func rewriteValueRISCV64(v *Value) bool { + case OpRotateLeft8: + return rewriteValueRISCV64_OpRotateLeft8(v) + case OpRound32F: +- v.Op = OpCopy ++ v.Op = OpRISCV64LoweredRound32F + return true + case OpRound64F: +- v.Op = OpCopy ++ v.Op = OpRISCV64LoweredRound64F + return true + case OpRsh16Ux16: + return rewriteValueRISCV64_OpRsh16Ux16(v) +@@ -3335,6 +3339,31 @@ func rewriteValueRISCV64_OpRISCV64ANDI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FADDD(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FADDD a (FMULD x y)) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FMADDD x y a) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ a := v_0 ++ if v_1.Op != OpRISCV64FMULD { ++ continue ++ } ++ y := v_1.Args[1] ++ x := v_1.Args[0] ++ if !(a.Block.Func.useFMA(v)) { ++ continue ++ } ++ v.reset(OpRISCV64FMADDD) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ break ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FMADDD(v *Value) bool { + v_2 := v.Args[2] + v_1 := 
v.Args[1] +@@ -3515,6 +3544,45 @@ func rewriteValueRISCV64_OpRISCV64FNMSUBD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FSUBD(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FSUBD a (FMULD x y)) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FNMSUBD x y a) ++ for { ++ a := v_0 ++ if v_1.Op != OpRISCV64FMULD { ++ break ++ } ++ y := v_1.Args[1] ++ x := v_1.Args[0] ++ if !(a.Block.Func.useFMA(v)) { ++ break ++ } ++ v.reset(OpRISCV64FNMSUBD) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ // match: (FSUBD (FMULD x y) a) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FMSUBD x y a) ++ for { ++ if v_0.Op != OpRISCV64FMULD { ++ break ++ } ++ y := v_0.Args[1] ++ x := v_0.Args[0] ++ a := v_1 ++ if !(a.Block.Func.useFMA(v)) { ++ break ++ } ++ v.reset(OpRISCV64FMSUBD) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64MOVBUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/test/codegen/floats.go b/test/codegen/floats.go +index 9cb62e031a..1c5fc8a31a 100644 +--- a/test/codegen/floats.go ++++ b/test/codegen/floats.go +@@ -88,17 +88,20 @@ func FusedAdd64(x, y, z float64) float64 { + // s390x:"FMADD\t" + // ppc64x:"FMADD\t" + // arm64:"FMADDD" ++ // riscv64:"FMADDD\t" + return x*y + z + } + + func FusedSub64_a(x, y, z float64) float64 { + // s390x:"FMSUB\t" + // ppc64x:"FMSUB\t" ++ // riscv64:"FMSUBD\t" + return x*y - z + } + + func FusedSub64_b(x, y, z float64) float64 { + // arm64:"FMSUBD" ++ // riscv64:"FNMSUBD\t" + return z - x*y + } + +-- +2.39.5 + diff --git a/2005-cmd-compile-add-single-precision-FMA-code-generation.patch b/2005-cmd-compile-add-single-precision-FMA-code-generation.patch new file mode 100644 index 0000000..d5a7daa --- /dev/null +++ b/2005-cmd-compile-add-single-precision-FMA-code-generation.patch @@ -0,0 +1,512 @@ +From 567178ee5e574611b048418b56905c63e98c9658 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 005/119] cmd/compile: add single-precision FMA code generation + for riscv64 + +This CL adds FMADDS,FMSUBS,FNMADDS,FNMSUBS SSA support for riscv + +Change-Id: I1e7dd322b46b9e0f4923dbba256303d69ed12066 +Reviewed-on: https://go-review.googlesource.com/c/go/+/506616 +Reviewed-by: Joel Sing +Reviewed-by: David Chase +TryBot-Result: Gopher Robot +Reviewed-by: Keith Randall +Run-TryBot: M Zhuo +--- + src/cmd/compile/internal/riscv64/ssa.go | 3 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 9 +- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 4 + + src/cmd/compile/internal/ssa/opGen.go | 68 +++++ + .../compile/internal/ssa/rewriteRISCV64.go | 256 ++++++++++++++++++ + test/codegen/floats.go | 3 + + 6 files changed, 339 insertions(+), 4 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 143e7c525a..f8cf786920 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -332,7 +332,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p2.From.Reg = v.Reg1() + p2.To.Type = obj.TYPE_REG + p2.To.Reg = v.Reg1() +- case ssa.OpRISCV64FMADDD, ssa.OpRISCV64FMSUBD, ssa.OpRISCV64FNMADDD, ssa.OpRISCV64FNMSUBD: ++ case ssa.OpRISCV64FMADDD, ssa.OpRISCV64FMSUBD, ssa.OpRISCV64FNMADDD, ssa.OpRISCV64FNMSUBD, ++ ssa.OpRISCV64FMADDS, ssa.OpRISCV64FMSUBS, ssa.OpRISCV64FNMADDS, ssa.OpRISCV64FNMSUBS: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() +diff --git 
a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index ac68dfed76..e0bf00d45d 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -780,9 +780,10 @@ + (Select0 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MULHU x y) + (Select1 m:(LoweredMuluhilo x y)) && m.Uses == 1 => (MUL x y) + +-(FADDD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FMADDD x y a) +-(FSUBD a (FMULD x y)) && a.Block.Func.useFMA(v) => (FNMSUBD x y a) +-(FSUBD (FMULD x y) a) && a.Block.Func.useFMA(v) => (FMSUBD x y a) ++(FADD(S|D) a (FMUL(S|D) x y)) && a.Block.Func.useFMA(v) => (FMADD(S|D) x y a) ++(FSUB(S|D) a (FMUL(S|D) x y)) && a.Block.Func.useFMA(v) => (FNMSUB(S|D) x y a) ++(FSUB(S|D) (FMUL(S|D) x y) a) && a.Block.Func.useFMA(v) => (FMSUB(S|D) x y a) ++ + // Merge negation into fused multiply-add and multiply-subtract. + // + // Key: +@@ -793,5 +794,7 @@ + // D B + // + // Note: multiplication commutativity handled by rule generator. ++(F(MADD|NMADD|MSUB|NMSUB)S neg:(FNEGS x) y z) && neg.Uses == 1 => (F(NMSUB|MSUB|NMADD|MADD)S x y z) ++(F(MADD|NMADD|MSUB|NMSUB)S x y neg:(FNEGS z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)S x y z) + (F(MADD|NMADD|MSUB|NMSUB)D neg:(FNEGD x) y z) && neg.Uses == 1 => (F(NMSUB|MSUB|NMADD|MADD)D x y z) + (F(MADD|NMADD|MSUB|NMSUB)D x y neg:(FNEGD z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)D x y z) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 69f2950a88..317e9150c9 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -411,6 +411,10 @@ func init() { + {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS", commutative: false, typ: "Float32"}, // arg0 - arg1 + {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, typ: "Float32"}, // arg0 * arg1 + {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", commutative: false, typ: "Float32"}, // arg0 / arg1 ++ {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS", commutative: true, typ: "Float32"}, // (arg0 * arg1) + arg2 ++ {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS", commutative: true, typ: "Float32"}, // (arg0 * arg1) - arg2 ++ {name: "FNMADDS", argLength: 3, reg: fp31, asm: "FNMADDS", commutative: true, typ: "Float32"}, // -(arg0 * arg1) + arg2 ++ {name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS", commutative: true, typ: "Float32"}, // -(arg0 * arg1) - arg2 + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS", typ: "Float32"}, // sqrt(arg0) + {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS", typ: "Float32"}, // -arg0 + {name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index e838a26f79..5af047c38f 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2418,6 +2418,10 @@ const ( + OpRISCV64FSUBS + OpRISCV64FMULS + OpRISCV64FDIVS ++ OpRISCV64FMADDS ++ OpRISCV64FMSUBS ++ OpRISCV64FNMADDS ++ OpRISCV64FNMSUBS + OpRISCV64FSQRTS + OpRISCV64FNEGS + OpRISCV64FMVSX +@@ -32391,6 +32395,70 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "FMADDS", ++ argLen: 3, ++ commutative: true, ++ asm: riscv.AFMADDS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 
F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FMSUBS", ++ argLen: 3, ++ commutative: true, ++ asm: riscv.AFMSUBS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FNMADDS", ++ argLen: 3, ++ commutative: true, ++ asm: riscv.AFNMADDS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FNMSUBS", ++ argLen: 3, ++ commutative: true, ++ asm: riscv.AFNMSUBS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + { + name: "FSQRTS", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 17af023db3..0ad6433bf4 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -442,16 +442,28 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64ANDI(v) + case OpRISCV64FADDD: + return rewriteValueRISCV64_OpRISCV64FADDD(v) ++ case OpRISCV64FADDS: ++ return rewriteValueRISCV64_OpRISCV64FADDS(v) + case OpRISCV64FMADDD: + return rewriteValueRISCV64_OpRISCV64FMADDD(v) ++ case OpRISCV64FMADDS: ++ return 
rewriteValueRISCV64_OpRISCV64FMADDS(v) + case OpRISCV64FMSUBD: + return rewriteValueRISCV64_OpRISCV64FMSUBD(v) ++ case OpRISCV64FMSUBS: ++ return rewriteValueRISCV64_OpRISCV64FMSUBS(v) + case OpRISCV64FNMADDD: + return rewriteValueRISCV64_OpRISCV64FNMADDD(v) ++ case OpRISCV64FNMADDS: ++ return rewriteValueRISCV64_OpRISCV64FNMADDS(v) + case OpRISCV64FNMSUBD: + return rewriteValueRISCV64_OpRISCV64FNMSUBD(v) ++ case OpRISCV64FNMSUBS: ++ return rewriteValueRISCV64_OpRISCV64FNMSUBS(v) + case OpRISCV64FSUBD: + return rewriteValueRISCV64_OpRISCV64FSUBD(v) ++ case OpRISCV64FSUBS: ++ return rewriteValueRISCV64_OpRISCV64FSUBS(v) + case OpRISCV64MOVBUload: + return rewriteValueRISCV64_OpRISCV64MOVBUload(v) + case OpRISCV64MOVBUreg: +@@ -3364,6 +3376,31 @@ func rewriteValueRISCV64_OpRISCV64FADDD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FADDS(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FADDS a (FMULS x y)) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FMADDS x y a) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ a := v_0 ++ if v_1.Op != OpRISCV64FMULS { ++ continue ++ } ++ y := v_1.Args[1] ++ x := v_1.Args[0] ++ if !(a.Block.Func.useFMA(v)) { ++ continue ++ } ++ v.reset(OpRISCV64FMADDS) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ break ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FMADDD(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] +@@ -3409,6 +3446,51 @@ func rewriteValueRISCV64_OpRISCV64FMADDD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FMADDS(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FMADDS neg:(FNEGS x) y z) ++ // cond: neg.Uses == 1 ++ // result: (FNMSUBS x y z) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ neg := v_0 ++ if neg.Op != OpRISCV64FNEGS { ++ continue ++ } ++ x := neg.Args[0] ++ y := v_1 ++ z := v_2 ++ if !(neg.Uses == 1) { ++ continue ++ } ++ v.reset(OpRISCV64FNMSUBS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ break ++ } ++ // match: (FMADDS x y neg:(FNEGS z)) ++ // cond: neg.Uses == 1 ++ // result: (FMSUBS x y z) ++ for { ++ x := v_0 ++ y := v_1 ++ neg := v_2 ++ if neg.Op != OpRISCV64FNEGS { ++ break ++ } ++ z := neg.Args[0] ++ if !(neg.Uses == 1) { ++ break ++ } ++ v.reset(OpRISCV64FMSUBS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FMSUBD(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] +@@ -3454,6 +3536,51 @@ func rewriteValueRISCV64_OpRISCV64FMSUBD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FMSUBS(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FMSUBS neg:(FNEGS x) y z) ++ // cond: neg.Uses == 1 ++ // result: (FNMADDS x y z) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ neg := v_0 ++ if neg.Op != OpRISCV64FNEGS { ++ continue ++ } ++ x := neg.Args[0] ++ y := v_1 ++ z := v_2 ++ if !(neg.Uses == 1) { ++ continue ++ } ++ v.reset(OpRISCV64FNMADDS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ break ++ } ++ // match: (FMSUBS x y neg:(FNEGS z)) ++ // cond: neg.Uses == 1 ++ // result: (FMADDS x y z) ++ for { ++ x := v_0 ++ y := v_1 ++ neg := v_2 ++ if neg.Op != OpRISCV64FNEGS { ++ break ++ } ++ z := neg.Args[0] ++ if !(neg.Uses == 1) { ++ break ++ } ++ v.reset(OpRISCV64FMADDS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FNMADDD(v *Value) bool { + v_2 := 
v.Args[2] + v_1 := v.Args[1] +@@ -3499,6 +3626,51 @@ func rewriteValueRISCV64_OpRISCV64FNMADDD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FNMADDS(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FNMADDS neg:(FNEGS x) y z) ++ // cond: neg.Uses == 1 ++ // result: (FMSUBS x y z) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ neg := v_0 ++ if neg.Op != OpRISCV64FNEGS { ++ continue ++ } ++ x := neg.Args[0] ++ y := v_1 ++ z := v_2 ++ if !(neg.Uses == 1) { ++ continue ++ } ++ v.reset(OpRISCV64FMSUBS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ break ++ } ++ // match: (FNMADDS x y neg:(FNEGS z)) ++ // cond: neg.Uses == 1 ++ // result: (FNMSUBS x y z) ++ for { ++ x := v_0 ++ y := v_1 ++ neg := v_2 ++ if neg.Op != OpRISCV64FNEGS { ++ break ++ } ++ z := neg.Args[0] ++ if !(neg.Uses == 1) { ++ break ++ } ++ v.reset(OpRISCV64FNMSUBS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FNMSUBD(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] +@@ -3544,6 +3716,51 @@ func rewriteValueRISCV64_OpRISCV64FNMSUBD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FNMSUBS(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FNMSUBS neg:(FNEGS x) y z) ++ // cond: neg.Uses == 1 ++ // result: (FMADDS x y z) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ neg := v_0 ++ if neg.Op != OpRISCV64FNEGS { ++ continue ++ } ++ x := neg.Args[0] ++ y := v_1 ++ z := v_2 ++ if !(neg.Uses == 1) { ++ continue ++ } ++ v.reset(OpRISCV64FMADDS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ break ++ } ++ // match: (FNMSUBS x y neg:(FNEGS z)) ++ // cond: neg.Uses == 1 ++ // result: (FNMADDS x y z) ++ for { ++ x := v_0 ++ y := v_1 ++ neg := v_2 ++ if neg.Op != OpRISCV64FNEGS { ++ break ++ } ++ z := neg.Args[0] ++ if !(neg.Uses == 1) { ++ break ++ } ++ v.reset(OpRISCV64FNMADDS) ++ v.AddArg3(x, y, z) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FSUBD(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -3583,6 +3800,45 @@ func rewriteValueRISCV64_OpRISCV64FSUBD(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FSUBS(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (FSUBS a (FMULS x y)) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FNMSUBS x y a) ++ for { ++ a := v_0 ++ if v_1.Op != OpRISCV64FMULS { ++ break ++ } ++ y := v_1.Args[1] ++ x := v_1.Args[0] ++ if !(a.Block.Func.useFMA(v)) { ++ break ++ } ++ v.reset(OpRISCV64FNMSUBS) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ // match: (FSUBS (FMULS x y) a) ++ // cond: a.Block.Func.useFMA(v) ++ // result: (FMSUBS x y a) ++ for { ++ if v_0.Op != OpRISCV64FMULS { ++ break ++ } ++ y := v_0.Args[1] ++ x := v_0.Args[0] ++ a := v_1 ++ if !(a.Block.Func.useFMA(v)) { ++ break ++ } ++ v.reset(OpRISCV64FMSUBS) ++ v.AddArg3(x, y, a) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64MOVBUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/test/codegen/floats.go b/test/codegen/floats.go +index 1c5fc8a31a..7991174b66 100644 +--- a/test/codegen/floats.go ++++ b/test/codegen/floats.go +@@ -70,17 +70,20 @@ func FusedAdd32(x, y, z float32) float32 { + // s390x:"FMADDS\t" + // ppc64x:"FMADDS\t" + // arm64:"FMADDS" ++ // riscv64:"FMADDS\t" + return x*y + z + } + + func FusedSub32_a(x, y, z float32) float32 { + // s390x:"FMSUBS\t" + // 
ppc64x:"FMSUBS\t" ++ // riscv64:"FMSUBS\t" + return x*y - z + } + + func FusedSub32_b(x, y, z float32) float32 { + // arm64:"FMSUBS" ++ // riscv64:"FNMSUBS\t" + return z - x*y + } + +-- +2.39.5 + diff --git a/2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch b/2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch new file mode 100644 index 0000000..9b7f6a0 --- /dev/null +++ b/2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch @@ -0,0 +1,180 @@ +From 40c441967b1e581a63ab416802def74d5af22c5d Mon Sep 17 00:00:00 2001 +From: Julian Zhu +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 006/119] [NOT FULL BACKPORT] cmd/internal/obj/riscv,cmd/link: + add support for internal cgo linking on riscv64 + +--- + src/cmd/internal/obj/riscv/cpu.go | 18 +++++-- + src/cmd/internal/obj/riscv/obj.go | 82 +++++++++++++++++++++++++++++-- + 2 files changed, 92 insertions(+), 8 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index dde1231e15..bfd5153da4 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -619,14 +619,26 @@ var unaryDst = map[obj.As]bool{ + + // Instruction encoding masks. + const ( +- // JTypeImmMask is a mask including only the immediate portion of +- // J-type instructions. +- JTypeImmMask = 0xfffff000 ++ // BTypeImmMask is a mask including only the immediate portion of ++ // B-type instructions. ++ BTypeImmMask = 0xfe000f80 ++ ++ // CBTypeImmMask is a mask including only the immediate portion of ++ // CB-type instructions. ++ CBTypeImmMask = 0x1c7c ++ ++ // CJTypeImmMask is a mask including only the immediate portion of ++ // CJ-type instructions. ++ CJTypeImmMask = 0x1f7c + + // ITypeImmMask is a mask including only the immediate portion of + // I-type instructions. + ITypeImmMask = 0xfff00000 + ++ // JTypeImmMask is a mask including only the immediate portion of ++ // J-type instructions. ++ JTypeImmMask = 0xfffff000 ++ + // STypeImmMask is a mask including only the immediate portion of + // S-type instructions. + STypeImmMask = 0xfe000f80 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 2e55fac812..776c3a8df6 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1181,6 +1181,12 @@ func validateRaw(ctxt *obj.Link, ins *instruction) { + } + } + ++// extractBitAndShift extracts the specified bit from the given immediate, ++// before shifting it to the requested position and returning it. ++func extractBitAndShift(imm uint32, bit, pos int) uint32 { ++ return ((imm >> bit) & 1) << pos ++} ++ + // encodeR encodes an R-type RISC-V instruction. + func encodeR(as obj.As, rs1, rs2, rd, funct3, funct7 uint32) uint32 { + enc := encode(as) +@@ -1272,6 +1278,11 @@ func encodeSF(ins *instruction) uint32 { + return encodeS(ins.as, regI(ins.rd), regF(ins.rs1), uint32(ins.imm)) + } + ++// encodeBImmediate encodes an immediate for a B-type RISC-V instruction. ++func encodeBImmediate(imm uint32) uint32 { ++ return (imm>>12)<<31 | ((imm>>5)&0x3f)<<25 | ((imm>>1)&0xf)<<8 | ((imm>>11)&0x1)<<7 ++} ++ + // encodeB encodes a B-type RISC-V instruction. 
+ func encodeB(ins *instruction) uint32 { + imm := immI(ins.as, ins.imm, 13) +@@ -1281,7 +1292,7 @@ func encodeB(ins *instruction) uint32 { + if enc == nil { + panic("encodeB: could not encode instruction") + } +- return (imm>>12)<<31 | ((imm>>5)&0x3f)<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | ((imm>>1)&0xf)<<8 | ((imm>>11)&0x1)<<7 | enc.opcode ++ return encodeBImmediate(imm) | rs2<<20 | rs1<<15 | enc.funct3<<12 | enc.opcode + } + + // encodeU encodes a U-type RISC-V instruction. +@@ -1315,6 +1326,37 @@ func encodeJ(ins *instruction) uint32 { + return encodeJImmediate(imm) | rd<<7 | enc.opcode + } + ++// encodeCBImmediate encodes an immediate for a CB-type RISC-V instruction. ++func encodeCBImmediate(imm uint32) uint32 { ++ // Bit order - [8|4:3|7:6|2:1|5] ++ bits := extractBitAndShift(imm, 8, 7) ++ bits |= extractBitAndShift(imm, 4, 6) ++ bits |= extractBitAndShift(imm, 3, 5) ++ bits |= extractBitAndShift(imm, 7, 4) ++ bits |= extractBitAndShift(imm, 6, 3) ++ bits |= extractBitAndShift(imm, 2, 2) ++ bits |= extractBitAndShift(imm, 1, 1) ++ bits |= extractBitAndShift(imm, 5, 0) ++ return (bits>>5)<<10 | (bits&0x1f)<<2 ++} ++ ++// encodeCJImmediate encodes an immediate for a CJ-type RISC-V instruction. ++func encodeCJImmediate(imm uint32) uint32 { ++ // Bit order - [11|4|9:8|10|6|7|3:1|5] ++ bits := extractBitAndShift(imm, 11, 10) ++ bits |= extractBitAndShift(imm, 4, 9) ++ bits |= extractBitAndShift(imm, 9, 8) ++ bits |= extractBitAndShift(imm, 8, 7) ++ bits |= extractBitAndShift(imm, 10, 6) ++ bits |= extractBitAndShift(imm, 6, 5) ++ bits |= extractBitAndShift(imm, 7, 4) ++ bits |= extractBitAndShift(imm, 3, 3) ++ bits |= extractBitAndShift(imm, 2, 2) ++ bits |= extractBitAndShift(imm, 1, 1) ++ bits |= extractBitAndShift(imm, 5, 0) ++ return bits << 2 ++} ++ + func encodeRawIns(ins *instruction) uint32 { + // Treat the raw value specially as a 32-bit unsigned integer. + // Nobody wants to enter negative machine code. 
+@@ -1324,14 +1366,34 @@ func encodeRawIns(ins *instruction) uint32 { + return uint32(ins.imm) + } + +-func EncodeJImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 21) { +- return 0, fmt.Errorf("immediate %#x does not fit in 21 bits", imm) ++func EncodeBImmediate(imm int64) (int64, error) { ++ if !immIFits(imm, 13) { ++ return 0, fmt.Errorf("immediate %#x does not fit in 13 bits", imm) + } + if imm&1 != 0 { + return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) + } +- return int64(encodeJImmediate(uint32(imm))), nil ++ return int64(encodeBImmediate(uint32(imm))), nil ++} ++ ++func EncodeCBImmediate(imm int64) (int64, error) { ++ if !immIFits(imm, 9) { ++ return 0, fmt.Errorf("immediate %#x does not fit in 9 bits", imm) ++ } ++ if imm&1 != 0 { ++ return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ } ++ return int64(encodeCBImmediate(uint32(imm))), nil ++} ++ ++func EncodeCJImmediate(imm int64) (int64, error) { ++ if !immIFits(imm, 12) { ++ return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) ++ } ++ if imm&1 != 0 { ++ return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ } ++ return int64(encodeCJImmediate(uint32(imm))), nil + } + + func EncodeIImmediate(imm int64) (int64, error) { +@@ -1341,6 +1403,16 @@ func EncodeIImmediate(imm int64) (int64, error) { + return imm << 20, nil + } + ++func EncodeJImmediate(imm int64) (int64, error) { ++ if !immIFits(imm, 21) { ++ return 0, fmt.Errorf("immediate %#x does not fit in 21 bits", imm) ++ } ++ if imm&1 != 0 { ++ return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ } ++ return int64(encodeJImmediate(uint32(imm))), nil ++} ++ + func EncodeSImmediate(imm int64) (int64, error) { + if !immIFits(imm, 12) { + return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) +-- +2.39.5 + diff --git a/2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch b/2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch new file mode 100644 index 0000000..1a725e2 --- /dev/null +++ b/2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch @@ -0,0 +1,41 @@ +From b74c6eef59b684a8c9b65084399050aaaa6ac162 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 007/119] cmd/internal/obj/riscv: clean up error checking for + encoding + +Replace a "fixme" with a more appropriate error. Also invert the condition +so that the error returns early, which is more Go idiomatic. 
+ +Change-Id: I03006572c4010fb47037bed3ee1fd7f92bfc20d3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/523457 +TryBot-Result: Gopher Robot +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Run-TryBot: Joel Sing +Reviewed-by: M Zhuo +--- + src/cmd/internal/obj/riscv/obj.go | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 776c3a8df6..4a386eb1fc 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1722,10 +1722,10 @@ func (ins *instruction) encode() (uint32, error) { + if err != nil { + return 0, err + } +- if enc.length > 0 { +- return enc.encode(ins), nil ++ if enc.length <= 0 { ++ return 0, fmt.Errorf("%v: encoding called for a pseudo instruction", ins.as) + } +- return 0, fmt.Errorf("fixme") ++ return enc.encode(ins), nil + } + + func (ins *instruction) length() int { +-- +2.39.5 + diff --git a/2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch b/2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch new file mode 100644 index 0000000..b01e6ea --- /dev/null +++ b/2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch @@ -0,0 +1,34 @@ +From 0d075e31f49e99beab462ae9115f6a6438e38b61 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 008/119] cmd/internal/obj/riscv: correct message in regVal + panic + +Change-Id: I68be4110216145ad1fb2e5095e1f2b143f9e69ac +Reviewed-on: https://go-review.googlesource.com/c/go/+/523456 +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +TryBot-Result: Gopher Robot +Reviewed-by: Mark Ryan +Reviewed-by: M Zhuo +Run-TryBot: Joel Sing +--- + src/cmd/internal/obj/riscv/obj.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 4a386eb1fc..cf80c82f79 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -973,7 +973,7 @@ func Split32BitImmediate(imm int64) (low, high int64, err error) { + + func regVal(r, min, max uint32) uint32 { + if r < min || r > max { +- panic(fmt.Sprintf("register out of range, want %d < %d < %d", min, r, max)) ++ panic(fmt.Sprintf("register out of range, want %d <= %d <= %d", min, r, max)) + } + return r - min + } +-- +2.39.5 + diff --git a/2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch b/2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch new file mode 100644 index 0000000..f291cae --- /dev/null +++ b/2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch @@ -0,0 +1,56 @@ +From 6f5dfd0c04b3433056eea4d6193a1b04423dd43f Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 009/119] cmd/internal/obj/riscv: simplify instructionsForMOV + +Rather than handling shift based scaling in two locations, rework logic +so there is a single exit path. 
+ +Change-Id: I832b4932d53183736050059a11019ced08281b3b +Reviewed-on: https://go-review.googlesource.com/c/go/+/523455 +Reviewed-by: M Zhuo +Reviewed-by: Cherry Mui +Run-TryBot: Joel Sing +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Mark Ryan +TryBot-Result: Gopher Robot +--- + src/cmd/internal/obj/riscv/obj.go | 21 ++++++++------------- + 1 file changed, 8 insertions(+), 13 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index cf80c82f79..7b5621f650 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1992,20 +1992,15 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + ins.as, ins.rs1, ins.rs2, ins.imm = AADDI, REG_ZERO, obj.REG_NONE, low + + // LUI is only necessary if the constant does not fit in 12 bits. +- if high == 0 { +- if insSLLI != nil { +- inss = append(inss, insSLLI) ++ if high != 0 { ++ // LUI top20bits(c), R ++ // ADD bottom12bits(c), R, R ++ insLUI := &instruction{as: ALUI, rd: ins.rd, imm: high} ++ inss = []*instruction{insLUI} ++ if low != 0 { ++ ins.as, ins.rs1 = AADDIW, ins.rd ++ inss = append(inss, ins) + } +- break +- } +- +- // LUI top20bits(c), R +- // ADD bottom12bits(c), R, R +- insLUI := &instruction{as: ALUI, rd: ins.rd, imm: high} +- inss = []*instruction{insLUI} +- if low != 0 { +- ins.as, ins.rs1 = AADDIW, ins.rd +- inss = append(inss, ins) + } + if insSLLI != nil { + inss = append(inss, insSLLI) +-- +2.39.5 + diff --git a/2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch b/2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch new file mode 100644 index 0000000..97a69b7 --- /dev/null +++ b/2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch @@ -0,0 +1,34 @@ +From 8b2675740c122a84bf74d387aaa595d76d5b3192 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 010/119] internal/cpu: fix wrong cache line size of riscv64 + +All of riscv CPU using 64B for cache-line size. +i.e. U540 of Hifive Unleashed (https://www.sifive.com/boards/hifive-unleashed) + +Change-Id: I0d72d88ac026f45383c3b3eb3a77233d3c2e4004 +Reviewed-on: https://go-review.googlesource.com/c/go/+/526659 +Run-TryBot: M Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Heschi Kreinick +TryBot-Result: Gopher Robot +--- + src/internal/cpu/cpu_riscv64.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/internal/cpu/cpu_riscv64.go b/src/internal/cpu/cpu_riscv64.go +index 54b8c3378b..2173fe8886 100644 +--- a/src/internal/cpu/cpu_riscv64.go ++++ b/src/internal/cpu/cpu_riscv64.go +@@ -4,7 +4,7 @@ + + package cpu + +-const CacheLinePadSize = 32 ++const CacheLinePadSize = 64 + + func doinit() { + } +-- +2.39.5 + diff --git a/2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch b/2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch new file mode 100644 index 0000000..9d56a30 --- /dev/null +++ b/2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch @@ -0,0 +1,206 @@ +From e7c39e53ac0c6aeb715ddeb724f4324cecc19ef5 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 011/119] cmd/internal/obj/riscv: clean up immediate checking + +Change immIFits to return an error in the case that it does not fit. +This allows for deduplication and consistency of error messages. +Additionally, since we've already calculated the min and max values, +we can easily include these in the message. Also provide and use +immEven, for the same reasons. 
+ +Change-Id: Ie680558744f3e9bc19d6913c4144ce9ddbd0429c +Reviewed-on: https://go-review.googlesource.com/c/go/+/523458 +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: M Zhuo +Reviewed-by: Matthew Dempsky +--- + src/cmd/internal/obj/riscv/obj.go | 93 ++++++++++++++++++------------- + 1 file changed, 54 insertions(+), 39 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 7b5621f650..ab41e53b8c 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -941,12 +941,12 @@ func signExtend(val int64, bit uint) int64 { + // result. For example, high may be used in LUI and low in a following ADDI to + // generate a full 32-bit constant. + func Split32BitImmediate(imm int64) (low, high int64, err error) { +- if !immIFits(imm, 32) { +- return 0, 0, fmt.Errorf("immediate does not fit in 32 bits: %d", imm) ++ if err := immIFits(imm, 32); err != nil { ++ return 0, 0, err + } + + // Nothing special needs to be done if the immediate fits in 12 bits. +- if immIFits(imm, 12) { ++ if err := immIFits(imm, 12); err == nil { + return imm, 0, nil + } + +@@ -1006,26 +1006,41 @@ func regFAddr(a obj.Addr) uint32 { + return regAddr(a, REG_F0, REG_F31) + } + +-// immIFits reports whether immediate value x fits in nbits bits +-// as a signed integer. +-func immIFits(x int64, nbits uint) bool { ++// immEven checks that the immediate is a multiple of two. If it ++// is not, an error is returned. ++func immEven(x int64) error { ++ if x&1 != 0 { ++ return fmt.Errorf("immediate %#x is not a multiple of two", x) ++ } ++ return nil ++} ++ ++// immIFits checks whether the immediate value x fits in nbits bits ++// as a signed integer. If it does not, an error is returned. ++func immIFits(x int64, nbits uint) error { + nbits-- +- var min int64 = -1 << nbits +- var max int64 = 1< max { ++ if nbits <= 16 { ++ return fmt.Errorf("signed immediate %d must be in range [%d, %d] (%d bits)", x, min, max, nbits) ++ } ++ return fmt.Errorf("signed immediate %#x must be in range [%#x, %#x] (%d bits)", x, min, max, nbits) ++ } ++ return nil + } + + // immI extracts the signed integer of the specified size from an immediate. + func immI(as obj.As, imm int64, nbits uint) uint32 { +- if !immIFits(imm, nbits) { +- panic(fmt.Sprintf("%v: signed immediate %d cannot fit in %d bits", as, imm, nbits)) ++ if err := immIFits(imm, nbits); err != nil { ++ panic(fmt.Sprintf("%v: %v", as, err)) + } + return uint32(imm) + } + + func wantImmI(ctxt *obj.Link, as obj.As, imm int64, nbits uint) { +- if !immIFits(imm, nbits) { +- ctxt.Diag("%v: signed immediate %d cannot be larger than %d bits", as, imm, nbits) ++ if err := immIFits(imm, nbits); err != nil { ++ ctxt.Diag("%v: %v", as, err) + } + } + +@@ -1057,8 +1072,8 @@ func wantFloatReg(ctxt *obj.Link, as obj.As, pos string, r uint32) { + + // wantEvenOffset checks that the offset is a multiple of two. 
+ func wantEvenOffset(ctxt *obj.Link, as obj.As, offset int64) { +- if offset%1 != 0 { +- ctxt.Diag("%v: jump offset %d must be a multiple of two", as, offset) ++ if err := immEven(offset); err != nil { ++ ctxt.Diag("%v: %v", as, err) + } + } + +@@ -1367,62 +1382,62 @@ func encodeRawIns(ins *instruction) uint32 { + } + + func EncodeBImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 13) { +- return 0, fmt.Errorf("immediate %#x does not fit in 13 bits", imm) ++ if err := immIFits(imm, 13); err != nil { ++ return 0, err + } +- if imm&1 != 0 { +- return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ if err := immEven(imm); err != nil { ++ return 0, err + } + return int64(encodeBImmediate(uint32(imm))), nil + } + + func EncodeCBImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 9) { +- return 0, fmt.Errorf("immediate %#x does not fit in 9 bits", imm) ++ if err := immIFits(imm, 9); err != nil { ++ return 0, err + } +- if imm&1 != 0 { +- return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ if err := immEven(imm); err != nil { ++ return 0, err + } + return int64(encodeCBImmediate(uint32(imm))), nil + } + + func EncodeCJImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 12) { +- return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) ++ if err := immIFits(imm, 12); err != nil { ++ return 0, err + } +- if imm&1 != 0 { +- return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ if err := immEven(imm); err != nil { ++ return 0, err + } + return int64(encodeCJImmediate(uint32(imm))), nil + } + + func EncodeIImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 12) { +- return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) ++ if err := immIFits(imm, 12); err != nil { ++ return 0, err + } + return imm << 20, nil + } + + func EncodeJImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 21) { +- return 0, fmt.Errorf("immediate %#x does not fit in 21 bits", imm) ++ if err := immIFits(imm, 21); err != nil { ++ return 0, err + } +- if imm&1 != 0 { +- return 0, fmt.Errorf("immediate %#x is not a multiple of two", imm) ++ if err := immEven(imm); err != nil { ++ return 0, err + } + return int64(encodeJImmediate(uint32(imm))), nil + } + + func EncodeSImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 12) { +- return 0, fmt.Errorf("immediate %#x does not fit in 12 bits", imm) ++ if err := immIFits(imm, 12); err != nil { ++ return 0, err + } + return ((imm >> 5) << 25) | ((imm & 0x1f) << 7), nil + } + + func EncodeUImmediate(imm int64) (int64, error) { +- if !immIFits(imm, 20) { +- return 0, fmt.Errorf("immediate %#x does not fit in 20 bits", imm) ++ if err := immIFits(imm, 20); err != nil { ++ return 0, err + } + return imm << 12, nil + } +@@ -1974,9 +1989,9 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + // MOV $1, X10 + // SLLI $63, X10, X10 + var insSLLI *instruction +- if !immIFits(ins.imm, 32) { ++ if err := immIFits(ins.imm, 32); err != nil { + ctz := bits.TrailingZeros64(uint64(ins.imm)) +- if immIFits(ins.imm>>ctz, 32) { ++ if err := immIFits(ins.imm>>ctz, 32); err == nil { + ins.imm = ins.imm >> ctz + insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(ctz)} + } +-- +2.39.5 + diff --git a/2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch b/2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch new file mode 100644 index 0000000..9e690b0 --- /dev/null +++ b/2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch @@ -0,0 +1,145 @@ 
+From b0843e50edcaff87cfd59e6f70433faf00321bc9 Mon Sep 17 00:00:00 2001 +From: Xianmiao Qu +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 012/119] cmd/compile/internal: intrinsify publicationBarrier + on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This enables publicationBarrier to be used as an intrinsic +on riscv64, optimizing the required function call and return +instructions for invoking the "runtime.publicationBarrier" +function. + +This function is called by mallocgc. The benchmark results for malloc tested on Lichee-Pi-4A(TH1520, RISC-V 2.0G C910 x4) are as follows. + +goos: linux +goarch: riscv64 +pkg: runtime + │ old.txt │ new.txt │ + │ sec/op │ sec/op vs base │ +Malloc8-4 92.78n ± 1% 90.77n ± 1% -2.17% (p=0.001 n=10) +Malloc16-4 156.5n ± 1% 151.7n ± 2% -3.10% (p=0.000 n=10) +MallocTypeInfo8-4 131.7n ± 1% 130.6n ± 2% ~ (p=0.165 n=10) +MallocTypeInfo16-4 186.5n ± 2% 186.2n ± 1% ~ (p=0.956 n=10) +MallocLargeStruct-4 1.345µ ± 1% 1.355µ ± 1% ~ (p=0.093 n=10) +geomean 216.9n 214.5n -1.10% + + +Change-Id: Ieab6c02309614bac5c1b12b5ee3311f988ff644d +Reviewed-on: https://go-review.googlesource.com/c/go/+/531719 +Reviewed-by: Michael Pratt +Auto-Submit: Michael Pratt +Reviewed-by: Cherry Mui +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Joel Sing +--- + src/cmd/compile/internal/riscv64/ssa.go | 4 ++++ + src/cmd/compile/internal/ssa/_gen/RISCV64.rules | 3 +++ + src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go | 3 +++ + src/cmd/compile/internal/ssa/opGen.go | 8 ++++++++ + src/cmd/compile/internal/ssa/rewriteRISCV64.go | 3 +++ + src/cmd/compile/internal/ssagen/ssa.go | 2 +- + 6 files changed, 22 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index f8cf786920..1100878794 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -695,6 +695,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Sym = ir.Syms.Duffcopy + p.To.Offset = v.AuxInt + ++ case ssa.OpRISCV64LoweredPubBarrier: ++ // FENCE ++ s.Prog(v.Op.Asm()) ++ + case ssa.OpRISCV64LoweredRound32F, ssa.OpRISCV64LoweredRound64F: + // input is already rounded + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index e0bf00d45d..e498218c60 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -412,6 +412,9 @@ + // Write barrier. + (WB ...) => (LoweredWB ...) + ++// Publication barrier as intrinsic ++(PubBarrier ...) => (LoweredPubBarrier ...) ++ + (PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem) + (PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem) + (PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 317e9150c9..741769f036 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -399,6 +399,9 @@ func init() { + // Returns a pointer to a write barrier buffer in X24. + {name: "LoweredWB", argLength: 1, reg: regInfo{clobbers: (callerSave &^ (gpMask | regNamed["g"])) | regNamed["X1"], outputs: []regMask{regNamed["X24"]}}, clobberFlags: true, aux: "Int64"}, + ++ // Do data barrier. 
arg0=memorys ++ {name: "LoweredPubBarrier", argLength: 1, asm: "FENCE", hasSideEffects: true}, ++ + // There are three of these functions so that they can have three different register inputs. + // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the + // default registers to match so we don't need to copy registers around unnecessarily. +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 5af047c38f..1e99c2bc07 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2411,6 +2411,7 @@ const ( + OpRISCV64LoweredGetCallerSP + OpRISCV64LoweredGetCallerPC + OpRISCV64LoweredWB ++ OpRISCV64LoweredPubBarrier + OpRISCV64LoweredPanicBoundsA + OpRISCV64LoweredPanicBoundsB + OpRISCV64LoweredPanicBoundsC +@@ -32301,6 +32302,13 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "LoweredPubBarrier", ++ argLen: 1, ++ hasSideEffects: true, ++ asm: riscv.AFENCE, ++ reg: regInfo{}, ++ }, + { + name: "LoweredPanicBoundsA", + auxType: auxInt64, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 0ad6433bf4..1ca03a58a9 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -432,6 +432,9 @@ func rewriteValueRISCV64(v *Value) bool { + return true + case OpPanicBounds: + return rewriteValueRISCV64_OpPanicBounds(v) ++ case OpPubBarrier: ++ v.Op = OpRISCV64LoweredPubBarrier ++ return true + case OpRISCV64ADD: + return rewriteValueRISCV64_OpRISCV64ADD(v) + case OpRISCV64ADDI: +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index d243ebb7cd..cc70dc8f7d 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -4108,7 +4108,7 @@ func InitTables() { + s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem()) + return nil + }, +- sys.ARM64, sys.PPC64) ++ sys.ARM64, sys.PPC64, sys.RISCV64) + + brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} + if buildcfg.GOPPC64 >= 10 { +-- +2.39.5 + diff --git a/2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch b/2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch new file mode 100644 index 0000000..c670ed2 --- /dev/null +++ b/2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch @@ -0,0 +1,117 @@ +From 1d9850b6b89b6a5ca5558bfc44b1c1ff7777b4ca Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 013/119] cmd/compile/internal: stop lowering OpConvert on + riscv64 + +Lowering for OpConvert was removed for all architectures in CL#108496, +prior to the riscv64 port being upstreamed. Remove lowering of OpConvert +on riscv64, which brings it inline with all other architectures. This +results in 1,600+ instructions being removed from the riscv64 go binary. 
+ +Change-Id: Iaaf1f8b397875926604048b66ad8ac91a98c871e +Reviewed-on: https://go-review.googlesource.com/c/go/+/533335 +Run-TryBot: Joel Sing +Reviewed-by: Cherry Mui +TryBot-Result: Gopher Robot +Reviewed-by: Michael Pratt +--- + src/cmd/compile/internal/riscv64/ssa.go | 2 +- + src/cmd/compile/internal/ssa/_gen/RISCV64.rules | 2 -- + src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go | 6 ------ + src/cmd/compile/internal/ssa/opGen.go | 14 -------------- + src/cmd/compile/internal/ssa/rewriteRISCV64.go | 3 --- + 5 files changed, 1 insertion(+), 26 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 1100878794..182cd8690e 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -193,7 +193,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + // input args need no code + case ssa.OpPhi: + ssagen.CheckLoweredPhi(v) +- case ssa.OpCopy, ssa.OpRISCV64MOVconvert, ssa.OpRISCV64MOVDreg: ++ case ssa.OpCopy, ssa.OpRISCV64MOVDreg: + if v.Type.IsMemory() { + return + } +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index e498218c60..031c68c8a0 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -396,8 +396,6 @@ + (ADD ptr (MOVDconst [s-moveSize(t.Alignment(), config)])) + mem) + +-(Convert ...) => (MOVconvert ...) +- + // Checks + (IsNonNil ...) => (SNEZ ...) + (IsInBounds ...) => (Less64U ...) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 741769f036..e8194be1df 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -231,12 +231,6 @@ func init() { + {name: "SLTU", argLength: 2, reg: gp21, asm: "SLTU"}, // arg0 < arg1, unsigned, result is 0 or 1 + {name: "SLTIU", argLength: 1, reg: gp11, asm: "SLTIU", aux: "Int64"}, // arg0 < auxint, unsigned, result is 0 or 1 + +- // MOVconvert converts between pointers and integers. +- // We have a special op for this so as to not confuse GC +- // (particularly stack maps). It takes a memory arg so it +- // gets correctly ordered with respect to GC safepoints. +- {name: "MOVconvert", argLength: 2, reg: gp11, asm: "MOV"}, // arg0, but converted to int/ptr as appropriate; arg1=mem +- + // Round ops to block fused-multiply-add extraction. 
+ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true}, + {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true}, +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 1e99c2bc07..14453a4532 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2381,7 +2381,6 @@ const ( + OpRISCV64SLTI + OpRISCV64SLTU + OpRISCV64SLTIU +- OpRISCV64MOVconvert + OpRISCV64LoweredRound32F + OpRISCV64LoweredRound64F + OpRISCV64CALLstatic +@@ -31910,19 +31909,6 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, +- { +- name: "MOVconvert", +- argLen: 2, +- asm: riscv.AMOV, +- reg: regInfo{ +- inputs: []inputInfo{ +- {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +- }, +- outputs: []outputInfo{ +- {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +- }, +- }, +- }, + { + name: "LoweredRound32F", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 1ca03a58a9..e71102d27e 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -132,9 +132,6 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpConstBool(v) + case OpConstNil: + return rewriteValueRISCV64_OpConstNil(v) +- case OpConvert: +- v.Op = OpRISCV64MOVconvert +- return true + case OpCopysign: + v.Op = OpRISCV64FSGNJD + return true +-- +2.39.5 + diff --git a/2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch b/2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch new file mode 100644 index 0000000..c6c5189 --- /dev/null +++ b/2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch @@ -0,0 +1,558 @@ +From 1e9c0a4876d93e28c0f078d5f9194628c4ed0470 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 014/119] cmd/compile: optimize right shifts of uint32 on riscv +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The compiler is currently zero extending 32 bit unsigned integers to +64 bits before right shifting them using a 64 bit shift instruction. +There's no need to do this as RISC-V has instructions for right +shifting 32 bit unsigned values (srlw and srliw) which zero extend +the result of the shift to 64 bits. Change the compiler so that +it uses srlw and srliw for 32 bit unsigned shifts reducing in most +cases the number of instructions needed to perform the shift. 
+ +Here are some examples of code sequences that are changed by this +patch: + +uint32(a) >> 2 + + before: + + sll x5,x10,0x20 + srl x10,x5,0x22 + + after: + + srlw x10,x10,0x2 + +uint32(a) >> int(b) + + before: + + sll x5,x10,0x20 + srl x5,x5,0x20 + srl x5,x5,x11 + sltiu x6,x11,64 + neg x6,x6 + and x10,x5,x6 + + after: + + srlw x5,x10,x11 + sltiu x6,x11,32 + neg x6,x6 + and x10,x5,x6 + +bits.RotateLeft32(uint32(a), 1) + + before: + + sll x5,x10,0x1 + sll x6,x10,0x20 + srl x7,x6,0x3f + or x5,x5,x7 + + after: + + sll x5,x10,0x1 + srlw x6,x10,0x1f + or x10,x5,x6 + +bits.RotateLeft32(uint32(a), int(b)) + + before: + and x6,x11,31 + sll x7,x10,x6 + sll x8,x10,0x20 + srl x8,x8,0x20 + add x6,x6,-32 + neg x6,x6 + srl x9,x8,x6 + sltiu x6,x6,64 + neg x6,x6 + and x6,x9,x6 + or x6,x6,x7 + + after: + + and x5,x11,31 + sll x6,x10,x5 + add x5,x5,-32 + neg x5,x5 + srlw x7,x10,x5 + sltiu x5,x5,32 + neg x5,x5 + and x5,x7,x5 + or x10,x6,x5 + +The one regression observed is the following case, an unbounded right +shift of a uint32 where the value we're shifting by is known to be +< 64 but > 31. As this is an unusual case this commit does not +optimize for it, although the existing code does. + +uint32(a) >> (b & 63) + + before: + + sll x5,x10,0x20 + srl x5,x5,0x20 + and x6,x11,63 + srl x10,x5,x6 + + after + + and x5,x11,63 + srlw x6,x10,x5 + sltiu x5,x5,32 + neg x5,x5 + and x10,x6,x5 + +Here we have one extra instruction. + +Some benchmark highlights, generated on a VisionFive2 8GB running +Ubuntu 23.04. + +pkg: math/bits +LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10) +LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10) +TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10) +TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10) +TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10) +OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10) +RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10) +RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10) +Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10) +ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10) +Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10) +geomean 11.50n 11.33n -1.45% + +pkg: crypto/md5 +Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10) +Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10) +Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10) +Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10) +Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10) +Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10) +Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10) +Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10) +Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10) +Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10) +Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10) +Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10) +geomean 28.32µ 26.42µ -6.72% + +Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693 +Reviewed-on: https://go-review.googlesource.com/c/go/+/528975 +Reviewed-by: Michael Pratt +Reviewed-by: Cherry Mui +Reviewed-by: Joel Sing +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +--- + src/cmd/compile/internal/riscv64/ssa.go | 4 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 18 +++-- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 14 ++-- + src/cmd/compile/internal/ssa/opGen.go | 30 +++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 65 ++++++++++++++++--- + 
test/codegen/shift.go | 21 +++--- + 6 files changed, 122 insertions(+), 30 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 182cd8690e..332f5841b7 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -278,7 +278,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = rd + case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XOR, ssa.OpRISCV64OR, ssa.OpRISCV64AND, +- ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRL, ++ ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, + ssa.OpRISCV64SLT, ssa.OpRISCV64SLTU, ssa.OpRISCV64MUL, ssa.OpRISCV64MULW, ssa.OpRISCV64MULH, + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, + ssa.OpRISCV64DIVUW, ssa.OpRISCV64REM, ssa.OpRISCV64REMU, ssa.OpRISCV64REMW, +@@ -356,7 +356,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpRISCV64ADDI, ssa.OpRISCV64ADDIW, ssa.OpRISCV64XORI, ssa.OpRISCV64ORI, ssa.OpRISCV64ANDI, +- ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRLI, ssa.OpRISCV64SLTI, ++ ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, + ssa.OpRISCV64SLTIU: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 031c68c8a0..4cacabb236 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -150,8 +150,9 @@ + (Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) + (Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) + +-// SRL only considers the bottom 6 bits of y. If y > 64, the result should +-// always be 0. See Lsh above for a detailed description. ++// SRL only considers the bottom 6 bits of y, similarly SRLW only considers the ++// bottom 5 bits of y. Ensure that the result is always zero if the shift exceeds ++// the maximum value. See Lsh above for a detailed description. 
+ (Rsh8Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt8to64 y)))) + (Rsh8Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt16to64 y)))) + (Rsh8Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt32to64 y)))) +@@ -160,10 +161,10 @@ + (Rsh16Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt16to64 y)))) + (Rsh16Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt32to64 y)))) + (Rsh16Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] y))) +-(Rsh32Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt8to64 y)))) +-(Rsh32Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt16to64 y)))) +-(Rsh32Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt32to64 y)))) +-(Rsh32Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] y))) ++(Rsh32Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) ++(Rsh32Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) ++(Rsh32Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) ++(Rsh32Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] y))) + (Rsh64Ux8 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt8to64 y)))) + (Rsh64Ux16 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt16to64 y)))) + (Rsh64Ux32 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt32to64 y)))) +@@ -705,6 +706,10 @@ + // But for now, this is enough to get rid of lots of them. + (MOVDnop (MOVDconst [c])) => (MOVDconst [c]) + ++// Avoid unnecessary zero extension when right shifting. ++(SRL (MOVWUreg x) y) => (SRLW x y) ++(SRLI [x] (MOVWUreg y)) => (SRLIW [int64(x&31)] y) ++ + // Fold constant into immediate instructions where possible. 
+ (ADD (MOVDconst [val]) x) && is32Bit(val) && !t.IsPtr() => (ADDI [val] x) + (AND (MOVDconst [val]) x) && is32Bit(val) => (ANDI [val] x) +@@ -712,6 +717,7 @@ + (XOR (MOVDconst [val]) x) && is32Bit(val) => (XORI [val] x) + (SLL x (MOVDconst [val])) => (SLLI [int64(val&63)] x) + (SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x) ++(SRLW x (MOVDconst [val])) => (SRLIW [int64(val&31)] x) + (SRA x (MOVDconst [val])) => (SRAI [int64(val&63)] x) + (SLT x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTI [val] x) + (SLTU x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTIU [val] x) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index e8194be1df..360eff6bcf 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -207,12 +207,14 @@ func init() { + {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}, resultInArg0: true}, // nop, return arg0 in same register + + // Shift ops +- {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63) +- {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed +- {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned +- {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63 +- {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63 +- {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63 ++ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63) ++ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed ++ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned ++ {name: "SRLW", argLength: 2, reg: gp21, asm: "SRLW"}, // arg0 >> (aux1 & 31), unsigned ++ {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63 ++ {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63 ++ {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63 ++ {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-31 + + // Bitwise ops + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 14453a4532..dadf1f20c7 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2365,9 +2365,11 @@ const ( + OpRISCV64SLL + OpRISCV64SRA + OpRISCV64SRL ++ OpRISCV64SRLW + OpRISCV64SLLI + OpRISCV64SRAI + OpRISCV64SRLI ++ OpRISCV64SRLIW + OpRISCV64XOR + OpRISCV64XORI + OpRISCV64OR +@@ -31685,6 +31687,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRLW", ++ argLen: 2, ++ asm: riscv.ASRLW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 
X29 X30 ++ }, ++ }, ++ }, + { + name: "SLLI", + auxType: auxInt64, +@@ -31727,6 +31743,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRLIW", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.ASRLIW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "XOR", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index e71102d27e..7d16fe887f 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -542,6 +542,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64SRL(v) + case OpRISCV64SRLI: + return rewriteValueRISCV64_OpRISCV64SRLI(v) ++ case OpRISCV64SRLW: ++ return rewriteValueRISCV64_OpRISCV64SRLW(v) + case OpRISCV64SUB: + return rewriteValueRISCV64_OpRISCV64SUB(v) + case OpRISCV64SUBW: +@@ -6290,6 +6292,20 @@ func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ // match: (SRL (MOVWUreg x) y) ++ // result: (SRLW x y) ++ for { ++ t := v.Type ++ if v_0.Op != OpRISCV64MOVWUreg { ++ break ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ v.reset(OpRISCV64SRLW) ++ v.Type = t ++ v.AddArg2(x, y) ++ return true ++ } + // match: (SRL x (MOVDconst [val])) + // result: (SRLI [int64(val&63)] x) + for { +@@ -6307,6 +6323,21 @@ func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + } + func rewriteValueRISCV64_OpRISCV64SRLI(v *Value) bool { + v_0 := v.Args[0] ++ // match: (SRLI [x] (MOVWUreg y)) ++ // result: (SRLIW [x] y) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVWUreg { ++ break ++ } ++ y := v_0.Args[0] ++ v.reset(OpRISCV64SRLIW) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(x) ++ v.AddArg(y) ++ return true ++ } + // match: (SRLI [x] (MOVDconst [y])) + // result: (MOVDconst [int64(uint64(y) >> uint32(x))]) + for { +@@ -6321,6 +6352,24 @@ func rewriteValueRISCV64_OpRISCV64SRLI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64SRLW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SRLW x (MOVDconst [val])) ++ // result: (SRLIW [int64(val&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64SRLIW) ++ v.AuxInt = int64ToAuxInt(int64(val & 31)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64SUB(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6937,7 +6986,7 @@ func rewriteValueRISCV64_OpRsh32Ux16(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux16 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt16to64 y)))) ++ // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) + for { + t := v.Type + x := v_0 +@@ -6952,7 +7001,7 @@ func rewriteValueRISCV64_OpRsh32Ux16(v *Value) bool { + v0.AddArg2(v1, y) + v2 := b.NewValue0(v.Pos, OpNeg32, t) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +@@ 
-6984,7 +7033,7 @@ func rewriteValueRISCV64_OpRsh32Ux32(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux32 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt32to64 y)))) ++ // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) + for { + t := v.Type + x := v_0 +@@ -6999,7 +7048,7 @@ func rewriteValueRISCV64_OpRsh32Ux32(v *Value) bool { + v0.AddArg2(v1, y) + v2 := b.NewValue0(v.Pos, OpNeg32, t) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +@@ -7031,7 +7080,7 @@ func rewriteValueRISCV64_OpRsh32Ux64(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux64 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] y))) ++ // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] y))) + for { + t := v.Type + x := v_0 +@@ -7046,7 +7095,7 @@ func rewriteValueRISCV64_OpRsh32Ux64(v *Value) bool { + v0.AddArg2(v1, y) + v2 := b.NewValue0(v.Pos, OpNeg32, t) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v3.AddArg(y) + v2.AddArg(v3) + v.AddArg2(v0, v2) +@@ -7076,7 +7125,7 @@ func rewriteValueRISCV64_OpRsh32Ux8(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux8 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [64] (ZeroExt8to64 y)))) ++ // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) + for { + t := v.Type + x := v_0 +@@ -7091,7 +7140,7 @@ func rewriteValueRISCV64_OpRsh32Ux8(v *Value) bool { + v0.AddArg2(v1, y) + v2 := b.NewValue0(v.Pos, OpNeg32, t) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index d34ff9b428..302560d5b0 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -18,7 +18,7 @@ func lshConst64x64(v int64) int64 { + + func rshConst64Ux64(v uint64) uint64 { + // ppc64x:"SRD" +- // riscv64:"SRLI",-"AND",-"SLTIU" ++ // riscv64:"SRLI\t",-"AND",-"SLTIU" + return v >> uint64(33) + } + +@@ -36,7 +36,7 @@ func lshConst32x64(v int32) int32 { + + func rshConst32Ux64(v uint32) uint32 { + // ppc64x:"SRW" +- // riscv64:"SRLI",-"AND",-"SLTIU", -"MOVW" ++ // riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW" + return v >> uint64(29) + } + +@@ -54,7 +54,7 @@ func lshConst64x32(v int64) int64 { + + func rshConst64Ux32(v uint64) uint64 { + // ppc64x:"SRD" +- // riscv64:"SRLI",-"AND",-"SLTIU" ++ // riscv64:"SRLI\t",-"AND",-"SLTIU" + return v >> uint32(33) + } + +@@ -79,7 +79,7 @@ func lshMask64x64(v int64, s uint64) int64 { + func rshMask64Ux64(v uint64, s uint64) uint64 { + // arm64:"LSR",-"AND",-"CSEL" + // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRL",-"AND\t",-"SLTIU" ++ // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } +@@ -103,11 +103,16 @@ func lshMask32x64(v int32, s uint64) int32 { + func rshMask32Ux64(v uint32, s uint64) uint32 { + // arm64:"LSR",-"AND" + // ppc64x:"ISEL",-"ORN" +- // riscv64:"SRL",-"AND\t",-"SLTIU" ++ // riscv64:"SRLW","SLTIU","NEG","AND\t",-"SRL\t" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } + ++func rsh5Mask32Ux64(v uint32, s 
uint64) uint32 { ++ // riscv64:"SRLW",-"AND\t",-"SLTIU",-"SRL\t" ++ return v >> (s & 31) ++} ++ + func rshMask32x64(v int32, s uint64) int32 { + // arm64:"ASR",-"AND" + // ppc64x:"ISEL",-"ORN" +@@ -127,7 +132,7 @@ func lshMask64x32(v int64, s uint32) int64 { + func rshMask64Ux32(v uint64, s uint32) uint64 { + // arm64:"LSR",-"AND",-"CSEL" + // ppc64x:"ANDCC",-"ORN" +- // riscv64:"SRL",-"AND\t",-"SLTIU" ++ // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } +@@ -149,7 +154,7 @@ func lshMask64x32Ext(v int64, s int32) int64 { + + func rshMask64Ux32Ext(v uint64, s int32) uint64 { + // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRL",-"AND\t",-"SLTIU" ++ // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> uint(s&63) + } +@@ -206,7 +211,7 @@ func lshGuarded64(v int64, s uint) int64 { + + func rshGuarded64U(v uint64, s uint) uint64 { + if s < 64 { +- // riscv64:"SRL",-"AND",-"SLTIU" ++ // riscv64:"SRL\t",-"AND",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" + // arm64:"LSR",-"CSEL" +-- +2.39.5 + diff --git a/2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch b/2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch new file mode 100644 index 0000000..fd56038 --- /dev/null +++ b/2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch @@ -0,0 +1,267 @@ +From b8f30343204b5a19577d25b00614d08e26a77947 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 015/119] cmd/link/internal/ld: assign temporary addresses to + per-package text + +If trampolines may be required, the current text addressing second +pass resets all assigned addresses, before assigning addresses and +laying down trampolines in a linear fashion. However, this approach +means that intra-package calls are to a symbol that has not yet +been assigned an address, when the symbol is ahead of the current +function. + +In the case of RISC-V the JAL instruction is limited to +/-1MiB. +As such, if a call is to a symbol with no address currently assigned, +we have to assume that a trampoline will be required. During the +relocation phase we can fix up and avoid trampolines in some cases, +however this results in unused trampolines that are still present +in the binary (since removing them would change text addresses). + +In order to significantly reduce the number of unused trampolines, +assign temporary addresses to functions within the same package, +based on the maximum number of trampolines that may be required by +a function. This allows for better decisions to be made regarding +the requirement for intra-package trampolines, as we reset the +addressing for a function, assign its final address and lay down +any resulting trampolines. + +This results in ~2,300 unused trampolines being removed from the +Go binary and ~5,600 unused trampolines being removed from the +compile binary, on linux/riscv64. + +This reapplies CL 349650, however does not pass big to assignAddress +when assigning temporary addresses, as this can result in side +effects such as section splitting. 
+ +Change-Id: Id7febdb65d962d6b1297a91294a8dc27c94d8696 +Reviewed-on: https://go-review.googlesource.com/c/go/+/534760 +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +Reviewed-by: Than McIntosh +--- + src/cmd/link/internal/ld/data.go | 89 +++++++++++++++++++++-------- + src/cmd/link/internal/ld/ld_test.go | 68 ++++++++++++++++++++++ + 2 files changed, 132 insertions(+), 25 deletions(-) + +diff --git a/src/cmd/link/internal/ld/data.go b/src/cmd/link/internal/ld/data.go +index 0550f07d5c..02905f9f42 100644 +--- a/src/cmd/link/internal/ld/data.go ++++ b/src/cmd/link/internal/ld/data.go +@@ -84,14 +84,15 @@ func maxSizeTrampolines(ctxt *Link, ldr *loader.Loader, s loader.Sym, isTramp bo + } + } + +- if ctxt.IsARM() { ++ switch { ++ case ctxt.IsARM(): + return n * 20 // Trampolines in ARM range from 3 to 5 instructions. +- } +- if ctxt.IsPPC64() { +- return n * 16 // Trampolines in PPC64 are 4 instructions. +- } +- if ctxt.IsARM64() { ++ case ctxt.IsARM64(): + return n * 12 // Trampolines in ARM64 are 3 instructions. ++ case ctxt.IsPPC64(): ++ return n * 16 // Trampolines in PPC64 are 4 instructions. ++ case ctxt.IsRISCV64(): ++ return n * 8 // Trampolines in RISCV64 are 2 instructions. + } + panic("unreachable") + } +@@ -118,18 +119,21 @@ func trampoline(ctxt *Link, s loader.Sym) { + continue // something is wrong. skip it here and we'll emit a better error later + } + +- // RISC-V is only able to reach +/-1MiB via a JAL instruction, +- // which we can readily exceed in the same package. As such, we +- // need to generate trampolines when the address is unknown. +- if ldr.SymValue(rs) == 0 && !ctxt.Target.IsRISCV64() && ldr.SymType(rs) != sym.SDYNIMPORT && ldr.SymType(rs) != sym.SUNDEFEXT { ++ if ldr.SymValue(rs) == 0 && ldr.SymType(rs) != sym.SDYNIMPORT && ldr.SymType(rs) != sym.SUNDEFEXT { ++ // Symbols in the same package are laid out together. ++ // Except that if SymPkg(s) == "", it is a host object symbol ++ // which may call an external symbol via PLT. + if ldr.SymPkg(s) != "" && ldr.SymPkg(rs) == ldr.SymPkg(s) { +- // Symbols in the same package are laid out together. +- // Except that if SymPkg(s) == "", it is a host object symbol +- // which may call an external symbol via PLT. +- continue ++ // RISC-V is only able to reach +/-1MiB via a JAL instruction. ++ // We need to generate a trampoline when an address is ++ // currently unknown. ++ if !ctxt.Target.IsRISCV64() { ++ continue ++ } + } ++ // Runtime packages are laid out together. + if isRuntimeDepPkg(ldr.SymPkg(s)) && isRuntimeDepPkg(ldr.SymPkg(rs)) { +- continue // runtime packages are laid out together ++ continue + } + } + thearch.Trampoline(ctxt, ldr, ri, rs, s) +@@ -2419,8 +2423,8 @@ func (ctxt *Link) textaddress() { + limit = 1 + } + +- // First pass: assign addresses assuming the program is small and +- // don't generate trampolines. ++ // First pass: assign addresses assuming the program is small and will ++ // not require trampoline generation. 
+ big := false + for _, s := range ctxt.Textp { + sect, n, va = assignAddress(ctxt, sect, n, s, va, false, big) +@@ -2435,21 +2439,45 @@ func (ctxt *Link) textaddress() { + if big { + // reset addresses + for _, s := range ctxt.Textp { +- if ldr.OuterSym(s) != 0 || s == text { +- continue +- } +- oldv := ldr.SymValue(s) +- for sub := s; sub != 0; sub = ldr.SubSym(sub) { +- ldr.SetSymValue(sub, ldr.SymValue(sub)-oldv) ++ if s != text { ++ resetAddress(ctxt, s) + } + } + va = start + + ntramps := 0 +- for _, s := range ctxt.Textp { ++ var curPkg string ++ for i, s := range ctxt.Textp { ++ // When we find the first symbol in a package, perform a ++ // single iteration that assigns temporary addresses to all ++ // of the text in the same package, using the maximum possible ++ // number of trampolines. This allows for better decisions to ++ // be made regarding reachability and the need for trampolines. ++ if symPkg := ldr.SymPkg(s); symPkg != "" && curPkg != symPkg { ++ curPkg = symPkg ++ vaTmp := va ++ for j := i; j < len(ctxt.Textp); j++ { ++ curSym := ctxt.Textp[j] ++ if symPkg := ldr.SymPkg(curSym); symPkg == "" || curPkg != symPkg { ++ break ++ } ++ // We do not pass big to assignAddress here, as this ++ // can result in side effects such as section splitting. ++ sect, n, vaTmp = assignAddress(ctxt, sect, n, curSym, vaTmp, false, false) ++ vaTmp += maxSizeTrampolines(ctxt, ldr, curSym, false) ++ } ++ } ++ ++ // Reset address for current symbol. ++ if s != text { ++ resetAddress(ctxt, s) ++ } ++ ++ // Assign actual address for current symbol. + sect, n, va = assignAddress(ctxt, sect, n, s, va, false, big) + +- trampoline(ctxt, s) // resolve jumps, may add trampolines if jump too far ++ // Resolve jumps, adding trampolines if they are needed. ++ trampoline(ctxt, s) + + // lay down trampolines after each function + for ; ntramps < len(ctxt.tramps); ntramps++ { +@@ -2597,6 +2625,17 @@ func assignAddress(ctxt *Link, sect *sym.Section, n int, s loader.Sym, va uint64 + return sect, n, va + } + ++func resetAddress(ctxt *Link, s loader.Sym) { ++ ldr := ctxt.loader ++ if ldr.OuterSym(s) != 0 { ++ return ++ } ++ oldv := ldr.SymValue(s) ++ for sub := s; sub != 0; sub = ldr.SubSym(sub) { ++ ldr.SetSymValue(sub, ldr.SymValue(sub)-oldv) ++ } ++} ++ + // Return whether we may need to split text sections. + // + // On PPC64x, when external linking, a text section should not be +diff --git a/src/cmd/link/internal/ld/ld_test.go b/src/cmd/link/internal/ld/ld_test.go +index a7a6082f54..1767667759 100644 +--- a/src/cmd/link/internal/ld/ld_test.go ++++ b/src/cmd/link/internal/ld/ld_test.go +@@ -344,3 +344,71 @@ func main() { + }) + } + } ++ ++func TestRISCVTrampolines(t *testing.T) { ++ testenv.MustHaveGoBuild(t) ++ t.Parallel() ++ ++ tmpDir := t.TempDir() ++ tmpFile := filepath.Join(tmpDir, "x.s") ++ ++ // Calling b from a or c should not use trampolines, however ++ // calling from d to a will require one. 
++ buf := new(bytes.Buffer) ++ fmt.Fprintf(buf, "TEXT a(SB),$0-0\n") ++ for i := 0; i < 1<<17; i++ { ++ fmt.Fprintf(buf, "\tADD $0, X0, X0\n") ++ } ++ fmt.Fprintf(buf, "\tCALL b(SB)\n") ++ fmt.Fprintf(buf, "\tRET\n") ++ fmt.Fprintf(buf, "TEXT b(SB),$0-0\n") ++ fmt.Fprintf(buf, "\tRET\n") ++ fmt.Fprintf(buf, "TEXT c(SB),$0-0\n") ++ fmt.Fprintf(buf, "\tCALL b(SB)\n") ++ fmt.Fprintf(buf, "\tRET\n") ++ fmt.Fprintf(buf, "TEXT ·d(SB),0,$0-0\n") ++ for i := 0; i < 1<<17; i++ { ++ fmt.Fprintf(buf, "\tADD $0, X0, X0\n") ++ } ++ fmt.Fprintf(buf, "\tCALL a(SB)\n") ++ fmt.Fprintf(buf, "\tCALL c(SB)\n") ++ fmt.Fprintf(buf, "\tRET\n") ++ if err := os.WriteFile(tmpFile, buf.Bytes(), 0644); err != nil { ++ t.Fatalf("Failed to write assembly file: %v", err) ++ } ++ ++ if err := os.WriteFile(filepath.Join(tmpDir, "go.mod"), []byte("module riscvtramp"), 0644); err != nil { ++ t.Fatalf("Failed to write file: %v\n", err) ++ } ++ main := `package main ++func main() { ++ d() ++} ++ ++func d() ++` ++ if err := os.WriteFile(filepath.Join(tmpDir, "x.go"), []byte(main), 0644); err != nil { ++ t.Fatalf("failed to write main: %v\n", err) ++ } ++ cmd := testenv.Command(t, testenv.GoToolPath(t), "build", "-ldflags=-linkmode=internal") ++ cmd.Dir = tmpDir ++ cmd.Env = append(os.Environ(), "GOARCH=riscv64", "GOOS=linux") ++ out, err := cmd.CombinedOutput() ++ if err != nil { ++ t.Fatalf("Build failed: %v, output: %s", err, out) ++ } ++ ++ // Check what trampolines exist. ++ cmd = testenv.Command(t, testenv.GoToolPath(t), "tool", "nm", filepath.Join(tmpDir, "riscvtramp")) ++ cmd.Env = append(os.Environ(), "GOARCH=riscv64", "GOOS=linux") ++ out, err = cmd.CombinedOutput() ++ if err != nil { ++ t.Fatalf("nm failure: %s\n%s\n", err, string(out)) ++ } ++ if !bytes.Contains(out, []byte(" T a-tramp0")) { ++ t.Errorf("Trampoline a-tramp0 is missing") ++ } ++ if bytes.Contains(out, []byte(" T b-tramp0")) { ++ t.Errorf("Trampoline b-tramp0 exists unnecessarily") ++ } ++} +-- +2.39.5 + diff --git a/2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch b/2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch new file mode 100644 index 0000000..42867e1 --- /dev/null +++ b/2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch @@ -0,0 +1,540 @@ +From 3018460e65fe8c0a0afe5b0bf09db3c9b0d909c9 Mon Sep 17 00:00:00 2001 +From: Ubuntu +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 016/119] cmd/compile: optimize right shifts of int32 on + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The compiler is currently sign extending 32 bit signed integers to +64 bits before right shifting them using a 64 bit shift instruction. +There's no need to do this as RISC-V has instructions for right +shifting 32 bit signed values (sraw and sraiw) which sign extend +the result of the shift to 64 bits. Change the compiler so that +it uses sraw and sraiw for shifts of signed 32 bit integers reducing +in most cases the number of instructions needed to perform the shift. 
+ +Here are some examples of code sequences that are changed by this +patch: + +int32(a) >> 2 + + before: + + sll x5,x10,0x20 + sra x10,x5,0x22 + + after: + + sraw x10,x10,0x2 + +int32(v) >> int(s) + + before: + + sext.w x5,x10 + sltiu x6,x11,64 + add x6,x6,-1 + or x6,x11,x6 + sra x10,x5,x6 + + after: + + sltiu x5,x11,32 + add x5,x5,-1 + or x5,x11,x5 + sraw x10,x10,x5 + +int32(v) >> (int(s) & 31) + + before: + + sext.w x5,x10 + and x6,x11,63 + sra x10,x5,x6 + +after: + + and x5,x11,31 + sraw x10,x10,x5 + +int32(100) >> int(a) + + before: + + bltz x10, + sltiu x5,x10,64 + add x5,x5,-1 + or x5,x10,x5 + li x6,100 + sra x10,x6,x5 + + after: + + bltz x10, + sltiu x5,x10,32 + add x5,x5,-1 + or x5,x10,x5 + li x6,100 + sraw x10,x6,x5 + +int32(v) >> (int(s) & 63) + + before: + + sext.w x5,x10 + and x6,x11,63 + sra x10,x5,x6 + + after: + + and x5,x11,63 + sltiu x6,x5,32 + add x6,x6,-1 + or x5,x5,x6 + sraw x10,x10,x5 + +In most cases we eliminate one instruction. In the case where +we shift a int32 constant by a variable the number of instructions +generated is identical. A sra is simply replaced by a sraw. In the +unusual case where we shift right by a variable anded with a constant +> 31 but < 64, we generate two additional instructions. As this is +an unusual case we do not try to optimize for it. + +Some improvements can be seen in some of the existing benchmarks, +notably in the utf8 package which performs right shifts of runes +which are signed 32 bit integers. + + | utf8-old | utf8-new | + | sec/op | sec/op vs base | +EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10) +EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10) +AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10) +AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10) +DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10) +DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10) + +Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2 +Reviewed-on: https://go-review.googlesource.com/c/go/+/535115 +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +Reviewed-by: Joel Sing +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +Reviewed-by: David Chase +--- + src/cmd/compile/internal/riscv64/ssa.go | 4 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 26 +++++--- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 2 + + src/cmd/compile/internal/ssa/opGen.go | 30 +++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 65 ++++++++++++++++--- + test/codegen/shift.go | 23 ++++--- + 6 files changed, 121 insertions(+), 29 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 332f5841b7..22338188e5 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -278,7 +278,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = rd + case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XOR, ssa.OpRISCV64OR, ssa.OpRISCV64AND, +- ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, ++ ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRAW, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, + ssa.OpRISCV64SLT, ssa.OpRISCV64SLTU, ssa.OpRISCV64MUL, ssa.OpRISCV64MULW, ssa.OpRISCV64MULH, + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, + ssa.OpRISCV64DIVUW, ssa.OpRISCV64REM, ssa.OpRISCV64REMU, ssa.OpRISCV64REMW, +@@ -356,7 +356,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + 
p.To.Reg = v.Reg() + case ssa.OpRISCV64ADDI, ssa.OpRISCV64ADDIW, ssa.OpRISCV64XORI, ssa.OpRISCV64ORI, ssa.OpRISCV64ANDI, +- ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, ++ ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRAIW, ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, + ssa.OpRISCV64SLTIU: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 4cacabb236..9afe5995ae 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -175,16 +175,19 @@ + (Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt32to64 x) y) + (Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y) + +-// SRA only considers the bottom 6 bits of y. If y > 64, the result should +-// be either 0 or -1 based on the sign bit. ++// SRA only considers the bottom 6 bits of y, similarly SRAW only considers the ++// bottom 5 bits. If y is greater than the maximum value (either 63 or 31 ++// depending on the instruction), the result of the shift should be either 0 ++// or -1 based on the sign bit of x. + // +-// We implement this by performing the max shift (-1) if y >= 64. ++// We implement this by performing the max shift (-1) if y > the maximum value. + // + // We OR (uint64(y < 64) - 1) into y before passing it to SRA. This leaves +-// us with -1 (0xffff...) if y >= 64. ++// us with -1 (0xffff...) if y >= 64. Similarly, we OR (uint64(y < 32) - 1) into y ++// before passing it to SRAW. + // + // We don't need to sign-extend the OR result, as it will be at minimum 8 bits, +-// more than the 6 bits SRA cares about. ++// more than the 5 or 6 bits SRAW and SRA care about. 
+ (Rsh8x8 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) + (Rsh8x16 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) + (Rsh8x32 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +@@ -193,10 +196,10 @@ + (Rsh16x16 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) + (Rsh16x32 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) + (Rsh16x64 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) +-(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) +-(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) +-(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +-(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) ++(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) ++(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) ++(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) ++(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] y)))) + (Rsh64x8 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) + (Rsh64x16 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) + (Rsh64x32 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +@@ -706,9 +709,11 @@ + // But for now, this is enough to get rid of lots of them. + (MOVDnop (MOVDconst [c])) => (MOVDconst [c]) + +-// Avoid unnecessary zero extension when right shifting. ++// Avoid unnecessary zero and sign extension when right shifting. + (SRL (MOVWUreg x) y) => (SRLW x y) + (SRLI [x] (MOVWUreg y)) => (SRLIW [int64(x&31)] y) ++(SRA (MOVWreg x) y) => (SRAW x y) ++(SRAI [x] (MOVWreg y)) => (SRAIW [int64(x&31)] y) + + // Fold constant into immediate instructions where possible. 
+ (ADD (MOVDconst [val]) x) && is32Bit(val) && !t.IsPtr() => (ADDI [val] x) +@@ -719,6 +724,7 @@ + (SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x) + (SRLW x (MOVDconst [val])) => (SRLIW [int64(val&31)] x) + (SRA x (MOVDconst [val])) => (SRAI [int64(val&63)] x) ++(SRAW x (MOVDconst [val])) => (SRAIW [int64(val&31)] x) + (SLT x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTI [val] x) + (SLTU x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTIU [val] x) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 360eff6bcf..93f20f8a99 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -209,10 +209,12 @@ func init() { + // Shift ops + {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63) + {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed ++ {name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >> (aux1 & 31), signed + {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned + {name: "SRLW", argLength: 2, reg: gp21, asm: "SRLW"}, // arg0 >> (aux1 & 31), unsigned + {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63 + {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63 ++ {name: "SRAIW", argLength: 1, reg: gp11, asm: "SRAIW", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-31 + {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63 + {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-31 + +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index dadf1f20c7..62b516ce61 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2364,10 +2364,12 @@ const ( + OpRISCV64MOVDnop + OpRISCV64SLL + OpRISCV64SRA ++ OpRISCV64SRAW + OpRISCV64SRL + OpRISCV64SRLW + OpRISCV64SLLI + OpRISCV64SRAI ++ OpRISCV64SRAIW + OpRISCV64SRLI + OpRISCV64SRLIW + OpRISCV64XOR +@@ -31673,6 +31675,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRAW", ++ argLen: 2, ++ asm: riscv.ASRAW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SRL", + argLen: 2, +@@ -31729,6 +31745,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRAIW", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.ASRAIW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SRLI", + auxType: auxInt64, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 7d16fe887f..572dac249e 100644 +--- 
a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -538,6 +538,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64SRA(v) + case OpRISCV64SRAI: + return rewriteValueRISCV64_OpRISCV64SRAI(v) ++ case OpRISCV64SRAW: ++ return rewriteValueRISCV64_OpRISCV64SRAW(v) + case OpRISCV64SRL: + return rewriteValueRISCV64_OpRISCV64SRL(v) + case OpRISCV64SRLI: +@@ -6258,6 +6260,20 @@ func rewriteValueRISCV64_OpRISCV64SNEZ(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRA(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ // match: (SRA (MOVWreg x) y) ++ // result: (SRAW x y) ++ for { ++ t := v.Type ++ if v_0.Op != OpRISCV64MOVWreg { ++ break ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ v.reset(OpRISCV64SRAW) ++ v.Type = t ++ v.AddArg2(x, y) ++ return true ++ } + // match: (SRA x (MOVDconst [val])) + // result: (SRAI [int64(val&63)] x) + for { +@@ -6275,6 +6291,21 @@ func rewriteValueRISCV64_OpRISCV64SRA(v *Value) bool { + } + func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + v_0 := v.Args[0] ++ // match: (SRAI [x] (MOVWreg y)) ++ // result: (SRAIW [int64(x&31)] y) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVWreg { ++ break ++ } ++ y := v_0.Args[0] ++ v.reset(OpRISCV64SRAIW) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(int64(x & 31)) ++ v.AddArg(y) ++ return true ++ } + // match: (SRAI [x] (MOVDconst [y])) + // result: (MOVDconst [int64(y) >> uint32(x)]) + for { +@@ -6289,6 +6320,24 @@ func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64SRAW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SRAW x (MOVDconst [val])) ++ // result: (SRAIW [int64(val&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64SRAIW) ++ v.AuxInt = int64ToAuxInt(int64(val & 31)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -7172,7 +7221,7 @@ func rewriteValueRISCV64_OpRsh32x16(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x16 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) ++ // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7188,7 +7237,7 @@ func rewriteValueRISCV64_OpRsh32x16(v *Value) bool { + v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) + v2.AuxInt = int64ToAuxInt(-1) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +@@ -7221,7 +7270,7 @@ func rewriteValueRISCV64_OpRsh32x32(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x32 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) ++ // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7237,7 +7286,7 @@ func rewriteValueRISCV64_OpRsh32x32(v *Value) bool { + v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) + v2.AuxInt = int64ToAuxInt(-1) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt32to64, 
typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +@@ -7270,7 +7319,7 @@ func rewriteValueRISCV64_OpRsh32x64(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x64 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) ++ // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] y)))) + for { + t := v.Type + x := v_0 +@@ -7286,7 +7335,7 @@ func rewriteValueRISCV64_OpRsh32x64(v *Value) bool { + v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) + v2.AuxInt = int64ToAuxInt(-1) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v3.AddArg(y) + v2.AddArg(v3) + v1.AddArg2(y, v2) +@@ -7317,7 +7366,7 @@ func rewriteValueRISCV64_OpRsh32x8(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x8 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) ++ // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7333,7 +7382,7 @@ func rewriteValueRISCV64_OpRsh32x8(v *Value) bool { + v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) + v2.AuxInt = int64ToAuxInt(-1) + v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) + v4.AddArg(y) + v3.AddArg(v4) +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 302560d5b0..b9d888ca6c 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -24,7 +24,7 @@ func rshConst64Ux64(v uint64) uint64 { + + func rshConst64x64(v int64) int64 { + // ppc64x:"SRAD" +- // riscv64:"SRAI",-"OR",-"SLTIU" ++ // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint64(33) + } + +@@ -42,7 +42,7 @@ func rshConst32Ux64(v uint32) uint32 { + + func rshConst32x64(v int32) int32 { + // ppc64x:"SRAW" +- // riscv64:"SRAI",-"OR",-"SLTIU", -"MOVW" ++ // riscv64:"SRAIW",-"OR",-"SLTIU", -"MOVW" + return v >> uint64(29) + } + +@@ -60,7 +60,7 @@ func rshConst64Ux32(v uint64) uint64 { + + func rshConst64x32(v int64) int64 { + // ppc64x:"SRAD" +- // riscv64:"SRAI",-"OR",-"SLTIU" ++ // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint32(33) + } + +@@ -87,7 +87,7 @@ func rshMask64Ux64(v uint64, s uint64) uint64 { + func rshMask64x64(v int64, s uint64) int64 { + // arm64:"ASR",-"AND",-"CSEL" + // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } +@@ -116,11 +116,16 @@ func rsh5Mask32Ux64(v uint32, s uint64) uint32 { + func rshMask32x64(v int32, s uint64) int32 { + // arm64:"ASR",-"AND" + // ppc64x:"ISEL",-"ORN" +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // riscv64:"SRAW","OR","SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } + ++func rsh5Mask32x64(v int32, s uint64) int32 { ++ // riscv64:"SRAW",-"OR",-"SLTIU" ++ return v >> (s & 31) ++} ++ + func lshMask64x32(v int64, s uint32) int64 { + // arm64:"LSL",-"AND" + // ppc64x:"ANDCC",-"ORN" +@@ -139,8 +144,8 @@ func rshMask64Ux32(v uint64, s uint32) uint64 { + + func rshMask64x32(v int64, s uint32) int64 { + // arm64:"ASR",-"AND",-"CSEL" +- // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // ppc64x:"RLDICL",-"ORN",-"ISEL" ++ // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> (s & 63) + } +@@ -161,7 +166,7 @@ func rshMask64Ux32Ext(v uint64, s int32) uint64 { + + func 
rshMask64x32Ext(v int64, s int32) int64 { + // ppc64x:"ANDCC",-"ORN",-"ISEL" +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v >> uint(s&63) + } +@@ -222,7 +227,7 @@ func rshGuarded64U(v uint64, s uint) uint64 { + + func rshGuarded64(v int64, s uint) int64 { + if s < 64 { +- // riscv64:"SRA",-"OR",-"SLTIU" ++ // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" + // arm64:"ASR",-"CSEL" +-- +2.39.5 + diff --git a/2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch b/2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch new file mode 100644 index 0000000..df82012 --- /dev/null +++ b/2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch @@ -0,0 +1,72 @@ +From 2a7bd73050a6314f4ba9ab19ce6b85764612f5e6 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 017/119] cmd/internal/obj/riscv: support subtraction with a + constant + +Allow SUB and SUBW to be specified with a constant, which are mapped +to ADDI and ADDIW with negated values. + +Change-Id: I7dc55692febc81ea87393b0a3a7d23a43c30313b +Reviewed-on: https://go-review.googlesource.com/c/go/+/538915 +Run-TryBot: Joel Sing +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Heschi Kreinick +Reviewed-by: Mark Ryan +Reviewed-by: Wang Yaduo +Reviewed-by: Mauri de Souza Meneguzzo +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 5 +++++ + src/cmd/internal/obj/riscv/obj.go | 4 ++++ + 2 files changed, 9 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 9899ec9e7b..11a9e30080 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -94,6 +94,10 @@ start: + + SUB X6, X5, X7 // b3836240 + SUB X5, X6 // 33035340 ++ SUB $-2047, X5, X6 // 1383f27f ++ SUB $2048, X5, X6 // 13830280 ++ SUB $-2047, X5 // 9382f27f ++ SUB $2048, X5 // 93820280 + + SRA X6, X5, X7 // b3d36240 + SRA X5, X6 // 33535340 +@@ -157,6 +161,7 @@ start: + ADDW $1, X6 // 1b031300 + SLLW $1, X6 // 1b131300 + SRLW $1, X6 // 1b531300 ++ SUBW $1, X6 // 1b03f3ff + SRAW $1, X6 // 1b531340 + + // 5.3: Load and Store Instructions (RV64I) +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index ab41e53b8c..997c962bdd 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -69,6 +69,8 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + switch p.As { + case AADD: + p.As = AADDI ++ case ASUB: ++ p.As, p.From.Offset = AADDI, -p.From.Offset + case ASLT: + p.As = ASLTI + case ASLTU: +@@ -87,6 +89,8 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.As = ASRAI + case AADDW: + p.As = AADDIW ++ case ASUBW: ++ p.As, p.From.Offset = AADDIW, -p.From.Offset + case ASLLW: + p.As = ASLLIW + case ASRLW: +-- +2.39.5 + diff --git a/2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch b/2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch new file mode 100644 index 0000000..03e335e --- /dev/null +++ b/2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch @@ -0,0 +1,119 @@ +From 2ca7a420f327934e43c481827c30348ff2dc2340 Mon Sep 17 00:00:00 2001 +From: Wang Yaduo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 018/119] cmd/internal/obj/riscv: fix the offset of JALR + transformed from JAL +MIME-Version: 1.0 
+Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Currently, the offset of JALR is zero all the time, which is transformed +from JAL with over ±1MB offset. This causes the segment fault for the +wrong address. + +Change-Id: I4dcb3eb13bd1ea71e9eb27f07c03ffec376608ab +Reviewed-on: https://go-review.googlesource.com/c/go/+/538135 +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Heschi Kreinick +Reviewed-by: M Zhuo +Reviewed-by: Joel Sing +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/asm_test.go | 66 ++++++++++++++++++++++++++ + src/cmd/internal/obj/riscv/obj.go | 2 +- + 2 files changed, 67 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/internal/obj/riscv/asm_test.go b/src/cmd/internal/obj/riscv/asm_test.go +index c22428cdc5..afe0525532 100644 +--- a/src/cmd/internal/obj/riscv/asm_test.go ++++ b/src/cmd/internal/obj/riscv/asm_test.go +@@ -126,6 +126,72 @@ func genLargeCall(buf *bytes.Buffer) { + fmt.Fprintln(buf, "RET") + } + ++// TestLargeJump generates a large jump (>1MB of text) with a JMP to the ++// end of the function, in order to ensure that it assembles correctly. ++func TestLargeJump(t *testing.T) { ++ if testing.Short() { ++ t.Skip("Skipping test in short mode") ++ } ++ if runtime.GOARCH != "riscv64" { ++ t.Skip("Require riscv64 to run") ++ } ++ testenv.MustHaveGoBuild(t) ++ ++ dir := t.TempDir() ++ ++ if err := os.WriteFile(filepath.Join(dir, "go.mod"), []byte("module largejump"), 0644); err != nil { ++ t.Fatalf("Failed to write file: %v\n", err) ++ } ++ main := `package main ++ ++import "fmt" ++ ++func main() { ++ fmt.Print(x()) ++} ++ ++func x() uint64 ++` ++ if err := os.WriteFile(filepath.Join(dir, "x.go"), []byte(main), 0644); err != nil { ++ t.Fatalf("failed to write main: %v\n", err) ++ } ++ ++ // Generate a very large jump instruction. ++ buf := bytes.NewBuffer(make([]byte, 0, 7000000)) ++ genLargeJump(buf) ++ ++ if err := os.WriteFile(filepath.Join(dir, "x.s"), buf.Bytes(), 0644); err != nil { ++ t.Fatalf("Failed to write file: %v\n", err) ++ } ++ ++ // Build generated files. ++ cmd := testenv.Command(t, testenv.GoToolPath(t), "build", "-o", "x.exe") ++ cmd.Dir = dir ++ out, err := cmd.CombinedOutput() ++ if err != nil { ++ t.Errorf("Build failed: %v, output: %s", err, out) ++ } ++ ++ cmd = testenv.Command(t, filepath.Join(dir, "x.exe")) ++ out, err = cmd.CombinedOutput() ++ if string(out) != "1" { ++ t.Errorf(`Got test output %q, want "1"`, string(out)) ++ } ++} ++ ++func genLargeJump(buf *bytes.Buffer) { ++ fmt.Fprintln(buf, "TEXT ·x(SB),0,$0-8") ++ fmt.Fprintln(buf, "MOV X0, X10") ++ fmt.Fprintln(buf, "JMP end") ++ for i := 0; i < 1<<18; i++ { ++ fmt.Fprintln(buf, "ADD $1, X10, X10") ++ } ++ fmt.Fprintln(buf, "end:") ++ fmt.Fprintln(buf, "ADD $1, X10, X10") ++ fmt.Fprintln(buf, "MOV X10, r+0(FP)") ++ fmt.Fprintln(buf, "RET") ++} ++ + // Issue 20348. 
+ func TestNoRet(t *testing.T) { + dir, err := os.MkdirTemp("", "testnoret") +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 997c962bdd..3ab1ae94b9 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -731,7 +731,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + ctxt.Diag("%v: jump displacement %d too large", p, p.To.Target().Pc-p.Pc) + } + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: high, Sym: cursym} +- p.Link.From.Offset = low ++ p.Link.To.Offset = low + } + } + } +-- +2.39.5 + diff --git a/2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch b/2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch new file mode 100644 index 0000000..06ad9c9 --- /dev/null +++ b/2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch @@ -0,0 +1,376 @@ +From 74d9867e13eaa1dea10c5eddedc88bedb4fbf865 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 019/119] cmd/internal/obj/riscv: improve handling of invalid + assembly + +Currently, instruction validation failure will result in a panic during +encoding. Furthermore, the errors generated do not include the PC or +file/line information that is normally present. + +Fix this by: + +- Tracking and printing the *obj.Prog associated with the instruction, + including the assembly instruction/opcode if it differs. This provides + the standard PC and file/line prefix, which is also expected by assembly + error end-to-end tests. + +- Not proceeding with assembly if errors exist - with the current design, + errors are identified during validation, which is run via preprocess. + Attempts to encode invalid instructions will intentionally panic. + +Add some additional riscv64 encoding errors, now that we can actually do so. 
+ +Change-Id: I64a7b83680c4d12aebdc96c67f9df625b5ef90d3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/523459 +Run-TryBot: Joel Sing +Reviewed-by: Mark Ryan +Reviewed-by: Heschi Kreinick +TryBot-Result: Gopher Robot +Run-TryBot: M Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +--- + .../asm/internal/asm/testdata/riscv64error.s | 5 +- + src/cmd/internal/obj/riscv/obj.go | 201 ++++++++++-------- + 2 files changed, 116 insertions(+), 90 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index cdb8a028bd..2dc9db3fb1 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -38,5 +38,8 @@ TEXT errors(SB),$0 + SLLIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" + SRLIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" + SRAIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" +- ++ SD X5, 4294967296(X6) // ERROR "constant 4294967296 too large" ++ SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" ++ SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" ++ FNES F1, (X5) // ERROR "needs an integer register output" + RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 3ab1ae94b9..195cd26413 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1042,154 +1042,154 @@ func immI(as obj.As, imm int64, nbits uint) uint32 { + return uint32(imm) + } + +-func wantImmI(ctxt *obj.Link, as obj.As, imm int64, nbits uint) { ++func wantImmI(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) { + if err := immIFits(imm, nbits); err != nil { +- ctxt.Diag("%v: %v", as, err) ++ ctxt.Diag("%v: %v", ins, err) + } + } + +-func wantReg(ctxt *obj.Link, as obj.As, pos string, descr string, r, min, max uint32) { ++func wantReg(ctxt *obj.Link, ins *instruction, pos string, descr string, r, min, max uint32) { + if r < min || r > max { + var suffix string + if r != obj.REG_NONE { + suffix = fmt.Sprintf(" but got non-%s register %s", descr, RegName(int(r))) + } +- ctxt.Diag("%v: expected %s register in %s position%s", as, descr, pos, suffix) ++ ctxt.Diag("%v: expected %s register in %s position%s", ins, descr, pos, suffix) + } + } + +-func wantNoneReg(ctxt *obj.Link, as obj.As, pos string, r uint32) { ++func wantNoneReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) { + if r != obj.REG_NONE { +- ctxt.Diag("%v: expected no register in %s but got register %s", as, pos, RegName(int(r))) ++ ctxt.Diag("%v: expected no register in %s but got register %s", ins, pos, RegName(int(r))) + } + } + + // wantIntReg checks that r is an integer register. +-func wantIntReg(ctxt *obj.Link, as obj.As, pos string, r uint32) { +- wantReg(ctxt, as, pos, "integer", r, REG_X0, REG_X31) ++func wantIntReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) { ++ wantReg(ctxt, ins, pos, "integer", r, REG_X0, REG_X31) + } + + // wantFloatReg checks that r is a floating-point register. +-func wantFloatReg(ctxt *obj.Link, as obj.As, pos string, r uint32) { +- wantReg(ctxt, as, pos, "float", r, REG_F0, REG_F31) ++func wantFloatReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) { ++ wantReg(ctxt, ins, pos, "float", r, REG_F0, REG_F31) + } + + // wantEvenOffset checks that the offset is a multiple of two. 
+-func wantEvenOffset(ctxt *obj.Link, as obj.As, offset int64) { ++func wantEvenOffset(ctxt *obj.Link, ins *instruction, offset int64) { + if err := immEven(offset); err != nil { +- ctxt.Diag("%v: %v", as, err) ++ ctxt.Diag("%v: %v", ins, err) + } + } + + func validateRIII(ctxt *obj.Link, ins *instruction) { +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantIntReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFFF(ctxt *obj.Link, ins *instruction) { +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantFloatReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantFloatReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFFFF(ctxt *obj.Link, ins *instruction) { +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantFloatReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantFloatReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantFloatReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantFloatReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFFI(ctxt *obj.Link, ins *instruction) { +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantFloatReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantFloatReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFI(ctxt *obj.Link, ins *instruction) { +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRIF(ctxt *obj.Link, ins *instruction) { +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantIntReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRFF(ctxt *obj.Link, ins *instruction) { +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantFloatReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateII(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 12) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 12) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ 
wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateIF(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 12) +- wantFloatReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 12) ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateSI(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 12) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 12) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateSF(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 12) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantFloatReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 12) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantFloatReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateB(ctxt *obj.Link, ins *instruction) { + // Offsets are multiples of two, so accept 13 bit immediates for the + // 12 bit slot. We implicitly drop the least significant bit in encodeB. +- wantEvenOffset(ctxt, ins.as, ins.imm) +- wantImmI(ctxt, ins.as, ins.imm, 13) +- wantNoneReg(ctxt, ins.as, "rd", ins.rd) +- wantIntReg(ctxt, ins.as, "rs1", ins.rs1) +- wantIntReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantEvenOffset(ctxt, ins, ins.imm) ++ wantImmI(ctxt, ins, ins.imm, 13) ++ wantNoneReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateU(ctxt *obj.Link, ins *instruction) { +- wantImmI(ctxt, ins.as, ins.imm, 20) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantImmI(ctxt, ins, ins.imm, 20) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateJ(ctxt *obj.Link, ins *instruction) { + // Offsets are multiples of two, so accept 21 bit immediates for the + // 20 bit slot. We implicitly drop the least significant bit in encodeJ. 
+- wantEvenOffset(ctxt, ins.as, ins.imm) +- wantImmI(ctxt, ins.as, ins.imm, 21) +- wantIntReg(ctxt, ins.as, "rd", ins.rd) +- wantNoneReg(ctxt, ins.as, "rs1", ins.rs1) +- wantNoneReg(ctxt, ins.as, "rs2", ins.rs2) +- wantNoneReg(ctxt, ins.as, "rs3", ins.rs3) ++ wantEvenOffset(ctxt, ins, ins.imm) ++ wantImmI(ctxt, ins, ins.imm, 21) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateRaw(ctxt *obj.Link, ins *instruction) { +@@ -1726,14 +1726,26 @@ func encodingForAs(as obj.As) (encoding, error) { + } + + type instruction struct { +- as obj.As // Assembler opcode +- rd uint32 // Destination register +- rs1 uint32 // Source register 1 +- rs2 uint32 // Source register 2 +- rs3 uint32 // Source register 3 +- imm int64 // Immediate +- funct3 uint32 // Function 3 +- funct7 uint32 // Function 7 (or Function 2) ++ p *obj.Prog // Prog that instruction is for ++ as obj.As // Assembler opcode ++ rd uint32 // Destination register ++ rs1 uint32 // Source register 1 ++ rs2 uint32 // Source register 2 ++ rs3 uint32 // Source register 3 ++ imm int64 // Immediate ++ funct3 uint32 // Function 3 ++ funct7 uint32 // Function 7 (or Function 2) ++} ++ ++func (ins *instruction) String() string { ++ if ins.p == nil { ++ return ins.as.String() ++ } ++ var suffix string ++ if ins.p.As != ins.as { ++ suffix = fmt.Sprintf(" (%v)", ins.as) ++ } ++ return fmt.Sprintf("%v%v", ins.p, suffix) + } + + func (ins *instruction) encode() (uint32, error) { +@@ -2199,13 +2211,13 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.imm = p.To.Offset + + case AMOV, AMOVB, AMOVH, AMOVW, AMOVBU, AMOVHU, AMOVWU, AMOVF, AMOVD: +- return instructionsForMOV(p) ++ inss = instructionsForMOV(p) + + case ALW, ALWU, ALH, ALHU, ALB, ALBU, ALD, AFLW, AFLD: +- return instructionsForLoad(p, ins.as, p.From.Reg) ++ inss = instructionsForLoad(p, ins.as, p.From.Reg) + + case ASW, ASH, ASB, ASD, AFSW, AFSD: +- return instructionsForStore(p, ins.as, p.To.Reg) ++ inss = instructionsForStore(p, ins.as, p.To.Reg) + + case ALRW, ALRD: + // Set aq to use acquire access ordering +@@ -2245,7 +2257,7 @@ func instructionsForProg(p *obj.Prog) []*instruction { + case AFNES, AFNED: + // Replace FNE[SD] with FEQ[SD] and NOT. + if p.To.Type != obj.TYPE_REG { +- p.Ctxt.Diag("%v needs an integer register output", ins.as) ++ p.Ctxt.Diag("%v needs an integer register output", p) + return nil + } + if ins.as == AFNES { +@@ -2334,6 +2346,11 @@ func instructionsForProg(p *obj.Prog) []*instruction { + p.Ctxt.Diag("%v: shift amount out of range 0 to 31", p) + } + } ++ ++ for _, ins := range inss { ++ ins.p = p ++ } ++ + return inss + } + +@@ -2345,6 +2362,12 @@ func assemble(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + ctxt.Retpoline = false // don't keep printing + } + ++ // If errors were encountered during preprocess/validation, proceeding ++ // and attempting to encode said instructions will only lead to panics. 
++ if ctxt.Errors > 0 { ++ return ++ } ++ + for p := cursym.Func().Text; p != nil; p = p.Link { + switch p.As { + case AJAL: +-- +2.39.5 + diff --git a/2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch b/2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch new file mode 100644 index 0000000..091f569 --- /dev/null +++ b/2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch @@ -0,0 +1,555 @@ +From 29b9498db7d1bf02c85eb98cba4e2bb63237ba05 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 020/119] all: clean up addition of constants in riscv64 + assembly + +Use ADD with constants, instead of ADDI. Also use SUB with a positive constant +rather than ADD with a negative constant. The resulting assembly is still the +same. + +Change-Id: Ife10bf5ae4122e525f0e7d41b5e463e748236a9c +Reviewed-on: https://go-review.googlesource.com/c/go/+/540136 +TryBot-Result: Gopher Robot +Reviewed-by: M Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Reviewed-by: Heschi Kreinick +Run-TryBot: Joel Sing +--- + src/crypto/internal/bigmod/nat_riscv64.s | 6 +-- + src/internal/bytealg/compare_riscv64.s | 12 ++--- + src/internal/bytealg/equal_riscv64.s | 10 ++-- + src/internal/bytealg/indexbyte_riscv64.s | 4 +- + src/runtime/asm_riscv64.s | 10 ++-- + src/runtime/memclr_riscv64.s | 14 +++--- + src/runtime/memmove_riscv64.s | 64 ++++++++++++------------ + src/runtime/mkpreempt.go | 2 +- + src/runtime/preempt_riscv64.s | 2 +- + src/runtime/sys_linux_riscv64.s | 4 +- + 10 files changed, 64 insertions(+), 64 deletions(-) + +diff --git a/src/crypto/internal/bigmod/nat_riscv64.s b/src/crypto/internal/bigmod/nat_riscv64.s +index 1d8c8c8900..c1d9cc0dd4 100644 +--- a/src/crypto/internal/bigmod/nat_riscv64.s ++++ b/src/crypto/internal/bigmod/nat_riscv64.s +@@ -80,10 +80,10 @@ loop: + MOV X16, 2*8(X5) // z[2] + MOV X19, 3*8(X5) // z[3] + +- ADDI $32, X5 +- ADDI $32, X7 ++ ADD $32, X5 ++ ADD $32, X7 + +- ADDI $-4, X30 ++ SUB $4, X30 + BNEZ X30, loop + + done: +diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s +index a4164a2b81..b1e1f7bcc7 100644 +--- a/src/internal/bytealg/compare_riscv64.s ++++ b/src/internal/bytealg/compare_riscv64.s +@@ -53,7 +53,7 @@ use_a_len: + ADD $8, X7, X7 + SUB X7, X5, X5 + align: +- ADD $-1, X7 ++ SUB $1, X7 + MOVBU 0(X10), X8 + MOVBU 0(X12), X9 + BNE X8, X9, cmp +@@ -79,7 +79,7 @@ compare32: + BNE X17, X18, cmp8b + ADD $32, X10 + ADD $32, X12 +- ADD $-32, X5 ++ SUB $32, X5 + BGE X5, X6, compare32 + BEQZ X5, cmp_len + +@@ -95,7 +95,7 @@ compare16: + BNE X17, X18, cmp8b + ADD $16, X10 + ADD $16, X12 +- ADD $-16, X5 ++ SUB $16, X5 + BEQZ X5, cmp_len + + check8_unaligned: +@@ -128,7 +128,7 @@ compare8_unaligned: + BNE X29, X30, cmp1h + ADD $8, X10 + ADD $8, X12 +- ADD $-8, X5 ++ SUB $8, X5 + BGE X5, X6, compare8_unaligned + BEQZ X5, cmp_len + +@@ -150,7 +150,7 @@ compare4_unaligned: + BNE X19, X20, cmp1d + ADD $4, X10 + ADD $4, X12 +- ADD $-4, X5 ++ SUB $4, X5 + BGE X5, X6, compare4_unaligned + + compare1: +@@ -160,7 +160,7 @@ compare1: + BNE X8, X9, cmp + ADD $1, X10 + ADD $1, X12 +- ADD $-1, X5 ++ SUB $1, X5 + JMP compare1 + + // Compare 8 bytes of memory in X15/X16 that are known to differ. 
+diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s +index 503aac5751..7f470ce0a0 100644 +--- a/src/internal/bytealg/equal_riscv64.s ++++ b/src/internal/bytealg/equal_riscv64.s +@@ -41,7 +41,7 @@ TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0 + ADD $8, X9, X9 + SUB X9, X12, X12 + align: +- ADD $-1, X9 ++ SUB $1, X9 + MOVBU 0(X10), X19 + MOVBU 0(X11), X20 + BNE X19, X20, not_eq +@@ -67,7 +67,7 @@ loop32: + BNE X16, X17, not_eq + ADD $32, X10 + ADD $32, X11 +- ADD $-32, X12 ++ SUB $32, X12 + BGE X12, X9, loop32 + BEQZ X12, eq + +@@ -83,7 +83,7 @@ loop16: + BNE X21, X22, not_eq + ADD $16, X10 + ADD $16, X11 +- ADD $-16, X12 ++ SUB $16, X12 + BGE X12, X23, loop16 + BEQZ X12, eq + +@@ -105,7 +105,7 @@ loop4: + BNE X16, X17, not_eq + ADD $4, X10 + ADD $4, X11 +- ADD $-4, X12 ++ SUB $4, X12 + BGE X12, X23, loop4 + + loop1: +@@ -115,7 +115,7 @@ loop1: + BNE X19, X20, not_eq + ADD $1, X10 + ADD $1, X11 +- ADD $-1, X12 ++ SUB $1, X12 + JMP loop1 + + not_eq: +diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s +index 8be78ed950..de00983c7b 100644 +--- a/src/internal/bytealg/indexbyte_riscv64.s ++++ b/src/internal/bytealg/indexbyte_riscv64.s +@@ -13,7 +13,7 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + AND $0xff, X13 + MOV X10, X12 // store base for later + ADD X10, X11 // end +- ADD $-1, X10 ++ SUB $1, X10 + + loop: + ADD $1, X10 +@@ -35,7 +35,7 @@ TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + AND $0xff, X12 + MOV X10, X13 // store base for later + ADD X10, X11 // end +- ADD $-1, X10 ++ SUB $1, X10 + + loop: + ADD $1, X10 +diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s +index eb53cbbf47..bb0d161ad4 100644 +--- a/src/runtime/asm_riscv64.s ++++ b/src/runtime/asm_riscv64.s +@@ -9,7 +9,7 @@ + // func rt0_go() + TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0 + // X2 = stack; A0 = argc; A1 = argv +- ADD $-24, X2 ++ SUB $24, X2 + MOV A0, 8(X2) // argc + MOV A1, 16(X2) // argv + +@@ -57,7 +57,7 @@ nocgo: + + // create a new goroutine to start program + MOV $runtime·mainPC(SB), T0 // entry +- ADD $-16, X2 ++ SUB $16, X2 + MOV T0, 8(X2) + MOV ZERO, 0(X2) + CALL runtime·newproc(SB) +@@ -200,7 +200,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 + MOV (g_sched+gobuf_sp)(g), X2 + // Create a stack frame on g0 to call newstack. + MOV ZERO, -8(X2) // Zero saved LR in frame +- ADD $-8, X2 ++ SUB $8, X2 + CALL runtime·newstack(SB) + + // Not reached, but make sure the return PC from the call to newstack +@@ -285,7 +285,7 @@ TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8 + MOV 0(CTXT), T1 // code pointer + MOV (g_sched+gobuf_sp)(g), X2 // sp = m->g0->sched.sp + // we don't need special macro for regabi since arg0(X10) = g +- ADD $-16, X2 ++ SUB $16, X2 + MOV X10, 8(X2) // setup g + MOV ZERO, 0(X2) // clear return address + JALR RA, T1 +@@ -338,7 +338,7 @@ TEXT ·asmcgocall(SB),NOSPLIT,$0-20 + // Now on a scheduling stack (a pthread-created stack). + g0: + // Save room for two of our pointers. 
+- ADD $-16, X2 ++ SUB $16, X2 + MOV X9, 0(X2) // save old g on stack + MOV (g_stack+stack_hi)(X9), X9 + SUB X8, X9, X8 +diff --git a/src/runtime/memclr_riscv64.s b/src/runtime/memclr_riscv64.s +index 1c1e6ab54d..16c511c603 100644 +--- a/src/runtime/memclr_riscv64.s ++++ b/src/runtime/memclr_riscv64.s +@@ -23,7 +23,7 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16 + SUB X5, X9, X5 + SUB X5, X11, X11 + align: +- ADD $-1, X5 ++ SUB $1, X5 + MOVB ZERO, 0(X10) + ADD $1, X10 + BNEZ X5, align +@@ -47,7 +47,7 @@ loop64: + MOV ZERO, 48(X10) + MOV ZERO, 56(X10) + ADD $64, X10 +- ADD $-64, X11 ++ SUB $64, X11 + BGE X11, X9, loop64 + BEQZ X11, done + +@@ -60,7 +60,7 @@ zero32: + MOV ZERO, 16(X10) + MOV ZERO, 24(X10) + ADD $32, X10 +- ADD $-32, X11 ++ SUB $32, X11 + BEQZ X11, done + + check16: +@@ -70,7 +70,7 @@ zero16: + MOV ZERO, 0(X10) + MOV ZERO, 8(X10) + ADD $16, X10 +- ADD $-16, X11 ++ SUB $16, X11 + BEQZ X11, done + + check8: +@@ -79,7 +79,7 @@ check8: + zero8: + MOV ZERO, 0(X10) + ADD $8, X10 +- ADD $-8, X11 ++ SUB $8, X11 + BEQZ X11, done + + check4: +@@ -91,13 +91,13 @@ zero4: + MOVB ZERO, 2(X10) + MOVB ZERO, 3(X10) + ADD $4, X10 +- ADD $-4, X11 ++ SUB $4, X11 + + loop1: + BEQZ X11, done + MOVB ZERO, 0(X10) + ADD $1, X10 +- ADD $-1, X11 ++ SUB $1, X11 + JMP loop1 + + done: +diff --git a/src/runtime/memmove_riscv64.s b/src/runtime/memmove_riscv64.s +index f5db86562b..e099a64100 100644 +--- a/src/runtime/memmove_riscv64.s ++++ b/src/runtime/memmove_riscv64.s +@@ -32,7 +32,7 @@ TEXT runtime·memmove(SB),NOSPLIT,$-0-24 + SUB X5, X9, X5 + SUB X5, X12, X12 + f_align: +- ADD $-1, X5 ++ SUB $1, X5 + MOVB 0(X11), X14 + MOVB X14, 0(X10) + ADD $1, X10 +@@ -65,7 +65,7 @@ f_loop64: + MOV X21, 56(X10) + ADD $64, X10 + ADD $64, X11 +- ADD $-64, X12 ++ SUB $64, X12 + BGE X12, X9, f_loop64 + BEQZ X12, done + +@@ -83,7 +83,7 @@ f_loop32: + MOV X17, 24(X10) + ADD $32, X10 + ADD $32, X11 +- ADD $-32, X12 ++ SUB $32, X12 + BGE X12, X9, f_loop32 + BEQZ X12, done + +@@ -97,7 +97,7 @@ f_loop16: + MOV X15, 8(X10) + ADD $16, X10 + ADD $16, X11 +- ADD $-16, X12 ++ SUB $16, X12 + BGE X12, X9, f_loop16 + BEQZ X12, done + +@@ -109,7 +109,7 @@ f_loop8: + MOV X14, 0(X10) + ADD $8, X10 + ADD $8, X11 +- ADD $-8, X12 ++ SUB $8, X12 + BGE X12, X9, f_loop8 + BEQZ X12, done + JMP f_loop4_check +@@ -136,7 +136,7 @@ f_loop8_unaligned: + MOVB X21, 7(X10) + ADD $8, X10 + ADD $8, X11 +- ADD $-8, X12 ++ SUB $8, X12 + BGE X12, X9, f_loop8_unaligned + + f_loop4_check: +@@ -153,7 +153,7 @@ f_loop4: + MOVB X17, 3(X10) + ADD $4, X10 + ADD $4, X11 +- ADD $-4, X12 ++ SUB $4, X12 + BGE X12, X9, f_loop4 + + f_loop1: +@@ -162,7 +162,7 @@ f_loop1: + MOVB X14, 0(X10) + ADD $1, X10 + ADD $1, X11 +- ADD $-1, X12 ++ SUB $1, X12 + JMP f_loop1 + + backward: +@@ -182,9 +182,9 @@ backward: + // Move one byte at a time until we reach 8 byte alignment. 
+ SUB X5, X12, X12 + b_align: +- ADD $-1, X5 +- ADD $-1, X10 +- ADD $-1, X11 ++ SUB $1, X5 ++ SUB $1, X10 ++ SUB $1, X11 + MOVB 0(X11), X14 + MOVB X14, 0(X10) + BNEZ X5, b_align +@@ -197,8 +197,8 @@ b_loop_check: + MOV $64, X9 + BLT X12, X9, b_loop32_check + b_loop64: +- ADD $-64, X10 +- ADD $-64, X11 ++ SUB $64, X10 ++ SUB $64, X11 + MOV 0(X11), X14 + MOV 8(X11), X15 + MOV 16(X11), X16 +@@ -215,7 +215,7 @@ b_loop64: + MOV X19, 40(X10) + MOV X20, 48(X10) + MOV X21, 56(X10) +- ADD $-64, X12 ++ SUB $64, X12 + BGE X12, X9, b_loop64 + BEQZ X12, done + +@@ -223,8 +223,8 @@ b_loop32_check: + MOV $32, X9 + BLT X12, X9, b_loop16_check + b_loop32: +- ADD $-32, X10 +- ADD $-32, X11 ++ SUB $32, X10 ++ SUB $32, X11 + MOV 0(X11), X14 + MOV 8(X11), X15 + MOV 16(X11), X16 +@@ -233,7 +233,7 @@ b_loop32: + MOV X15, 8(X10) + MOV X16, 16(X10) + MOV X17, 24(X10) +- ADD $-32, X12 ++ SUB $32, X12 + BGE X12, X9, b_loop32 + BEQZ X12, done + +@@ -241,13 +241,13 @@ b_loop16_check: + MOV $16, X9 + BLT X12, X9, b_loop8_check + b_loop16: +- ADD $-16, X10 +- ADD $-16, X11 ++ SUB $16, X10 ++ SUB $16, X11 + MOV 0(X11), X14 + MOV 8(X11), X15 + MOV X14, 0(X10) + MOV X15, 8(X10) +- ADD $-16, X12 ++ SUB $16, X12 + BGE X12, X9, b_loop16 + BEQZ X12, done + +@@ -255,11 +255,11 @@ b_loop8_check: + MOV $8, X9 + BLT X12, X9, b_loop4_check + b_loop8: +- ADD $-8, X10 +- ADD $-8, X11 ++ SUB $8, X10 ++ SUB $8, X11 + MOV 0(X11), X14 + MOV X14, 0(X10) +- ADD $-8, X12 ++ SUB $8, X12 + BGE X12, X9, b_loop8 + BEQZ X12, done + JMP b_loop4_check +@@ -268,8 +268,8 @@ b_loop8_unaligned_check: + MOV $8, X9 + BLT X12, X9, b_loop4_check + b_loop8_unaligned: +- ADD $-8, X10 +- ADD $-8, X11 ++ SUB $8, X10 ++ SUB $8, X11 + MOVB 0(X11), X14 + MOVB 1(X11), X15 + MOVB 2(X11), X16 +@@ -286,15 +286,15 @@ b_loop8_unaligned: + MOVB X19, 5(X10) + MOVB X20, 6(X10) + MOVB X21, 7(X10) +- ADD $-8, X12 ++ SUB $8, X12 + BGE X12, X9, b_loop8_unaligned + + b_loop4_check: + MOV $4, X9 + BLT X12, X9, b_loop1 + b_loop4: +- ADD $-4, X10 +- ADD $-4, X11 ++ SUB $4, X10 ++ SUB $4, X11 + MOVB 0(X11), X14 + MOVB 1(X11), X15 + MOVB 2(X11), X16 +@@ -303,16 +303,16 @@ b_loop4: + MOVB X15, 1(X10) + MOVB X16, 2(X10) + MOVB X17, 3(X10) +- ADD $-4, X12 ++ SUB $4, X12 + BGE X12, X9, b_loop4 + + b_loop1: + BEQZ X12, done +- ADD $-1, X10 +- ADD $-1, X11 ++ SUB $1, X10 ++ SUB $1, X11 + MOVB 0(X11), X14 + MOVB X14, 0(X10) +- ADD $-1, X12 ++ SUB $1, X12 + JMP b_loop1 + + done: +diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go +index 0bfbd379e0..a96ae59c15 100644 +--- a/src/runtime/mkpreempt.go ++++ b/src/runtime/mkpreempt.go +@@ -576,7 +576,7 @@ func genRISCV64() { + } + + p("MOV X1, -%d(X2)", l.stack) +- p("ADD $-%d, X2", l.stack) ++ p("SUB $%d, X2", l.stack) + l.save() + p("CALL ·asyncPreempt2(SB)") + l.restore() +diff --git a/src/runtime/preempt_riscv64.s b/src/runtime/preempt_riscv64.s +index 56df6c30e0..bbb6447dc5 100644 +--- a/src/runtime/preempt_riscv64.s ++++ b/src/runtime/preempt_riscv64.s +@@ -5,7 +5,7 @@ + + TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 + MOV X1, -464(X2) +- ADD $-464, X2 ++ SUB $464, X2 + MOV X5, 8(X2) + MOV X6, 16(X2) + MOV X7, 24(X2) +diff --git a/src/runtime/sys_linux_riscv64.s b/src/runtime/sys_linux_riscv64.s +index d1558fd6f7..ffec2b5b75 100644 +--- a/src/runtime/sys_linux_riscv64.s ++++ b/src/runtime/sys_linux_riscv64.s +@@ -256,7 +256,7 @@ TEXT runtime·walltime(SB),NOSPLIT,$40-12 + MOV (g_sched+gobuf_sp)(T1), X2 + + noswitch: +- ADDI $-24, X2 // Space for result ++ SUB $24, X2 // Space for result + ANDI $~7, X2 // Align for C code + MOV 
$8(X2), A1 + +@@ -328,7 +328,7 @@ TEXT runtime·nanotime1(SB),NOSPLIT,$40-8 + MOV (g_sched+gobuf_sp)(T1), X2 + + noswitch: +- ADDI $-24, X2 // Space for result ++ SUB $24, X2 // Space for result + ANDI $~7, X2 // Align for C code + MOV $8(X2), A1 + +-- +2.39.5 + diff --git a/2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch b/2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch new file mode 100644 index 0000000..c262b58 --- /dev/null +++ b/2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch @@ -0,0 +1,152 @@ +From 035407e3926edc25a042c9dd90d489ca2d4f1cfe Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 021/119] cmd/internal/obj/riscv: add support of PCALIGN + directive + +Add support for PCALIGN directive on riscv. +This directive can be used within Go asm to align instruction +by padding NOP directives. + +This patch also adds a test to verify the correctness of the PCALIGN +directive. + +Original credit by Cooper Qu (Alibaba) +https://gitee.com/xuantie_riscv/xuantie-patch + +Change-Id: I8b6524a2bf81a1baf7c9d04b7da2db6c1a7b428f +Reviewed-on: https://go-review.googlesource.com/c/go/+/541740 +Run-TryBot: M Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Wang Yaduo +Reviewed-by: David Chase +Reviewed-by: Mark Ryan +TryBot-Result: Gopher Robot +--- + src/cmd/internal/obj/riscv/asm_test.go | 32 ++++++++++++++++++++++++++ + src/cmd/internal/obj/riscv/obj.go | 32 ++++++++++++++++++++++++++ + 2 files changed, 64 insertions(+) + +diff --git a/src/cmd/internal/obj/riscv/asm_test.go b/src/cmd/internal/obj/riscv/asm_test.go +index afe0525532..96ea230841 100644 +--- a/src/cmd/internal/obj/riscv/asm_test.go ++++ b/src/cmd/internal/obj/riscv/asm_test.go +@@ -9,8 +9,10 @@ import ( + "fmt" + "internal/testenv" + "os" ++ "os/exec" + "path/filepath" + "runtime" ++ "strings" + "testing" + ) + +@@ -277,3 +279,33 @@ func TestBranch(t *testing.T) { + t.Errorf("Branch test failed: %v\n%s", err, out) + } + } ++ ++func TestPCAlign(t *testing.T) { ++ dir := t.TempDir() ++ tmpfile := filepath.Join(dir, "x.s") ++ asm := ` ++TEXT _stub(SB),$0-0 ++ FENCE ++ PCALIGN $8 ++ FENCE ++ RET ++` ++ if err := os.WriteFile(tmpfile, []byte(asm), 0644); err != nil { ++ t.Fatal(err) ++ } ++ cmd := exec.Command(testenv.GoToolPath(t), "tool", "asm", "-o", filepath.Join(dir, "x.o"), "-S", tmpfile) ++ cmd.Env = append(os.Environ(), "GOARCH=riscv64", "GOOS=linux") ++ out, err := cmd.CombinedOutput() ++ if err != nil { ++ t.Errorf("Failed to assemble: %v\n%s", err, out) ++ } ++ // The expected instruction sequence after alignment: ++ // FENCE ++ // NOP ++ // FENCE ++ // RET ++ want := "0f 00 f0 0f 13 00 00 00 0f 00 f0 0f 67 80 00 00" ++ if !strings.Contains(string(out), want) { ++ t.Errorf("PCALIGN test failed - got %s\nwant %s", out, want) ++ } ++} +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 195cd26413..02d08fec76 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -308,6 +308,12 @@ func setPCs(p *obj.Prog, pc int64) int64 { + for _, ins := range instructionsForProg(p) { + pc += int64(ins.length()) + } ++ ++ if p.As == obj.APCALIGN { ++ alignedValue := p.From.Offset ++ v := pcAlignPadLength(pc, alignedValue) ++ pc += int64(v) ++ } + } + return pc + } +@@ -733,6 +739,16 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: high, Sym: cursym} + p.Link.To.Offset = low + } ++ ++ case obj.APCALIGN: ++ 
alignedValue := p.From.Offset ++ if (alignedValue&(alignedValue-1) != 0) || 4 > alignedValue || alignedValue > 2048 { ++ ctxt.Diag("alignment value of an instruction must be a power of two and in the range [4, 2048], got %d\n", alignedValue) ++ } ++ // Update the current text symbol alignment value. ++ if int32(alignedValue) > cursym.Func().Align { ++ cursym.Func().Align = int32(alignedValue) ++ } + } + } + +@@ -744,6 +760,10 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + } + } + ++func pcAlignPadLength(pc int64, alignedValue int64) int { ++ return int(-pc & (alignedValue - 1)) ++} ++ + func stacksplit(ctxt *obj.Link, p *obj.Prog, cursym *obj.LSym, newprog obj.ProgAlloc, framesize int64) *obj.Prog { + // Leaf function with no frame is effectively NOSPLIT. + if framesize == 0 { +@@ -1707,6 +1727,7 @@ var encodings = [ALAST & obj.AMask]encoding{ + obj.ANOP: pseudoOpEncoding, + obj.ADUFFZERO: pseudoOpEncoding, + obj.ADUFFCOPY: pseudoOpEncoding, ++ obj.APCALIGN: pseudoOpEncoding, + } + + // encodingForAs returns the encoding for an obj.As. +@@ -2421,6 +2442,17 @@ func assemble(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + rel.Sym = addr.Sym + rel.Add = addr.Offset + rel.Type = rt ++ ++ case obj.APCALIGN: ++ alignedValue := p.From.Offset ++ v := pcAlignPadLength(p.Pc, alignedValue) ++ offset := p.Pc ++ for ; v >= 4; v -= 4 { ++ // NOP ++ cursym.WriteBytes(ctxt, offset, []byte{0x13, 0, 0, 0}) ++ offset += 4 ++ } ++ continue + } + + offset := p.Pc +-- +2.39.5 + diff --git a/2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch b/2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch new file mode 100644 index 0000000..a7edc82 --- /dev/null +++ b/2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch @@ -0,0 +1,94 @@ +From d8a1c916d517a531b1862fe55a22086b3e5767c7 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 022/119] internal/bytealg: optimize Count with PCALIGN in + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For #63678 + +Benchmark on Milk-V Mars CM eMMC (Starfive/JH7110 SoC) + +goos: linux +goarch: riscv64 +pkg: bytes + │ /root/bytes.old.bench │ /root/bytes.pc16.bench │ + │ sec/op │ sec/op vs base │ +Count/10 223.9n ± 1% 220.8n ± 1% -1.36% (p=0.001 n=10) +Count/32 571.6n ± 0% 571.3n ± 0% ~ (p=0.054 n=10) +Count/4K 38.56µ ± 0% 38.55µ ± 0% -0.01% (p=0.010 n=10) +Count/4M 40.13m ± 0% 39.21m ± 0% -2.28% (p=0.000 n=10) +Count/64M 627.5m ± 0% 627.4m ± 0% -0.01% (p=0.019 n=10) +CountEasy/10 101.3n ± 0% 101.3n ± 0% ~ (p=1.000 n=10) ¹ +CountEasy/32 139.3n ± 0% 139.3n ± 0% ~ (p=1.000 n=10) ¹ +CountEasy/4K 5.565µ ± 0% 5.564µ ± 0% -0.02% (p=0.001 n=10) +CountEasy/4M 5.619m ± 0% 5.619m ± 0% ~ (p=0.190 n=10) +CountEasy/64M 89.94m ± 0% 89.93m ± 0% ~ (p=0.436 n=10) +CountSingle/10 53.80n ± 0% 46.06n ± 0% -14.39% (p=0.000 n=10) +CountSingle/32 104.30n ± 0% 79.64n ± 0% -23.64% (p=0.000 n=10) +CountSingle/4K 10.413µ ± 0% 7.247µ ± 0% -30.40% (p=0.000 n=10) +CountSingle/4M 11.603m ± 0% 8.388m ± 0% -27.71% (p=0.000 n=10) +CountSingle/64M 230.9m ± 0% 172.3m ± 0% -25.40% (p=0.000 n=10) +CountHard1 9.981m ± 0% 9.981m ± 0% ~ (p=0.810 n=10) +CountHard2 9.981m ± 0% 9.981m ± 0% ~ (p=0.315 n=10) +CountHard3 9.981m ± 0% 9.981m ± 0% ~ (p=0.159 n=10) +geomean 144.6µ 133.5µ -7.70% +¹ all samples are equal + + │ /root/bytes.old.bench │ /root/bytes.pc16.bench │ + │ B/s │ B/s vs base │ +Count/10 42.60Mi ± 1% 43.19Mi ± 1% +1.39% (p=0.001 n=10) 
+Count/32 53.38Mi ± 0% 53.42Mi ± 0% +0.06% (p=0.049 n=10) +Count/4K 101.3Mi ± 0% 101.3Mi ± 0% ~ (p=0.077 n=10) +Count/4M 99.68Mi ± 0% 102.01Mi ± 0% +2.34% (p=0.000 n=10) +Count/64M 102.0Mi ± 0% 102.0Mi ± 0% ~ (p=0.076 n=10) +CountEasy/10 94.18Mi ± 0% 94.18Mi ± 0% ~ (p=0.054 n=10) +CountEasy/32 219.1Mi ± 0% 219.1Mi ± 0% +0.01% (p=0.016 n=10) +CountEasy/4K 702.0Mi ± 0% 702.0Mi ± 0% +0.00% (p=0.000 n=10) +CountEasy/4M 711.9Mi ± 0% 711.9Mi ± 0% ~ (p=0.133 n=10) +CountEasy/64M 711.6Mi ± 0% 711.7Mi ± 0% ~ (p=0.447 n=10) +CountSingle/10 177.2Mi ± 0% 207.0Mi ± 0% +16.81% (p=0.000 n=10) +CountSingle/32 292.7Mi ± 0% 383.2Mi ± 0% +30.91% (p=0.000 n=10) +CountSingle/4K 375.1Mi ± 0% 539.0Mi ± 0% +43.70% (p=0.000 n=10) +CountSingle/4M 344.7Mi ± 0% 476.9Mi ± 0% +38.33% (p=0.000 n=10) +CountSingle/64M 277.2Mi ± 0% 371.5Mi ± 0% +34.05% (p=0.000 n=10) +geomean 199.7Mi 219.8Mi +10.10% + +Change-Id: I1abf6b220b9802028f8ad5eebc8d3b7cfa3e89ea +Reviewed-on: https://go-review.googlesource.com/c/go/+/541756 +Reviewed-by: David Chase +Reviewed-by: Cherry Mui +Reviewed-by: Joel Sing +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +Reviewed-by: Wang Yaduo +Reviewed-by: Mark Ryan +--- + src/internal/bytealg/count_riscv64.s | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s +index d123cbd7c6..3f255cd263 100644 +--- a/src/internal/bytealg/count_riscv64.s ++++ b/src/internal/bytealg/count_riscv64.s +@@ -14,6 +14,7 @@ TEXT ·Count(SB),NOSPLIT,$0-40 + MOV ZERO, X14 // count + ADD X10, X11 // end + ++ PCALIGN $16 + loop: + BEQ X10, X11, done + MOVBU (X10), X15 +@@ -34,6 +35,7 @@ TEXT ·CountString(SB),NOSPLIT,$0-32 + MOV ZERO, X14 // count + ADD X10, X11 // end + ++ PCALIGN $16 + loop: + BEQ X10, X11, done + MOVBU (X10), X15 +-- +2.39.5 + diff --git a/2023-cmd-compile-correct-code-generation-for-right-shifts.patch b/2023-cmd-compile-correct-code-generation-for-right-shifts.patch new file mode 100644 index 0000000..9f62ab0 --- /dev/null +++ b/2023-cmd-compile-correct-code-generation-for-right-shifts.patch @@ -0,0 +1,980 @@ +From 5cd157e29e4ea9dec96f4a7e6d35eb80ebdbee98 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 023/119] cmd/compile: correct code generation for right shifts + on riscv64 + +The code generation on riscv64 will currently result in incorrect +assembly when a 32 bit integer is right shifted by an amount that +exceeds the size of the type. In particular, this occurs when an +int32 or uint32 is cast to a 64 bit type and right shifted by a +value larger than 31. + +Fix this by moving the SRAW/SRLW conversion into the right shift +rules and removing the SignExt32to64/ZeroExt32to64. Add additional +rules that rewrite to SRAIW/SRLIW when the shift is less than the +size of the type, or replace/eliminate the shift when it exceeds +the size of the type. + +Add SSA tests that would have caught this issue. Also add additional +codegen tests to ensure that the resulting assembly is what we +expect in these overflow cases. 
+ +Fixes #64285 + +Change-Id: Ie97b05668597cfcb91413afefaab18ee1aa145ec +Reviewed-on: https://go-review.googlesource.com/c/go/+/545035 +Reviewed-by: Russ Cox +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +Reviewed-by: Mark Ryan +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 100 +++-- + .../compile/internal/ssa/rewriteRISCV64.go | 415 ++++++++++-------- + .../internal/test/testdata/arith_test.go | 66 +++ + test/codegen/shift.go | 30 ++ + 4 files changed, 387 insertions(+), 224 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 9afe5995ae..fc206c42d3 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -153,27 +153,27 @@ + // SRL only considers the bottom 6 bits of y, similarly SRLW only considers the + // bottom 5 bits of y. Ensure that the result is always zero if the shift exceeds + // the maximum value. See Lsh above for a detailed description. +-(Rsh8Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt8to64 y)))) +-(Rsh8Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt16to64 y)))) +-(Rsh8Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt32to64 y)))) +-(Rsh8Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] y))) +-(Rsh16Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt8to64 y)))) +-(Rsh16Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt16to64 y)))) +-(Rsh16Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt32to64 y)))) +-(Rsh16Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] y))) +-(Rsh32Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) +-(Rsh32Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) +-(Rsh32Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) +-(Rsh32Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] y))) +-(Rsh64Ux8 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt8to64 y)))) +-(Rsh64Ux16 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt16to64 y)))) +-(Rsh64Ux32 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt32to64 y)))) +-(Rsh64Ux64 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] y))) +- +-(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt8to64 x) y) +-(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt16to64 x) y) +-(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt32to64 x) y) +-(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y) ++(Rsh8Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt8to64 y)))) ++(Rsh8Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt16to64 y)))) ++(Rsh8Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] (ZeroExt32to64 y)))) ++(Rsh8Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt8to64 x) y) (Neg8 (SLTIU [64] y))) ++(Rsh16Ux8 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU 
[64] (ZeroExt8to64 y)))) ++(Rsh16Ux16 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt16to64 y)))) ++(Rsh16Ux32 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] (ZeroExt32to64 y)))) ++(Rsh16Ux64 x y) && !shiftIsBounded(v) => (AND (SRL (ZeroExt16to64 x) y) (Neg16 (SLTIU [64] y))) ++(Rsh32Ux8 x y) && !shiftIsBounded(v) => (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) ++(Rsh32Ux16 x y) && !shiftIsBounded(v) => (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) ++(Rsh32Ux32 x y) && !shiftIsBounded(v) => (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) ++(Rsh32Ux64 x y) && !shiftIsBounded(v) => (AND (SRLW x y) (Neg32 (SLTIU [32] y))) ++(Rsh64Ux8 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt8to64 y)))) ++(Rsh64Ux16 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt16to64 y)))) ++(Rsh64Ux32 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] (ZeroExt32to64 y)))) ++(Rsh64Ux64 x y) && !shiftIsBounded(v) => (AND (SRL x y) (Neg64 (SLTIU [64] y))) ++ ++(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt8to64 x) y) ++(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL (ZeroExt16to64 x) y) ++(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRLW x y) ++(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y) + + // SRA only considers the bottom 6 bits of y, similarly SRAW only considers the + // bottom 5 bits. If y is greater than the maximum value (either 63 or 31 +@@ -188,27 +188,27 @@ + // + // We don't need to sign-extend the OR result, as it will be at minimum 8 bits, + // more than the 5 or 6 bits SRAW and SRA care about. +-(Rsh8x8 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) +-(Rsh8x16 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) +-(Rsh8x32 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +-(Rsh8x64 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) +-(Rsh16x8 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) +-(Rsh16x16 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) +-(Rsh16x32 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +-(Rsh16x64 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) +-(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) +-(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) +-(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) +-(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] y)))) +-(Rsh64x8 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) +-(Rsh64x16 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) +-(Rsh64x32 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) +-(Rsh64x64 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] y)))) +- +-(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt8to64 x) y) +-(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => 
(SRA (SignExt16to64 x) y) +-(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt32to64 x) y) +-(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y) ++(Rsh8x8 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) ++(Rsh8x16 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) ++(Rsh8x32 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) ++(Rsh8x64 x y) && !shiftIsBounded(v) => (SRA (SignExt8to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) ++(Rsh16x8 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) ++(Rsh16x16 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) ++(Rsh16x32 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) ++(Rsh16x64 x y) && !shiftIsBounded(v) => (SRA (SignExt16to64 x) (OR y (ADDI [-1] (SLTIU [64] y)))) ++(Rsh32x8 x y) && !shiftIsBounded(v) => (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) ++(Rsh32x16 x y) && !shiftIsBounded(v) => (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) ++(Rsh32x32 x y) && !shiftIsBounded(v) => (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) ++(Rsh32x64 x y) && !shiftIsBounded(v) => (SRAW x (OR y (ADDI [-1] (SLTIU [32] y)))) ++(Rsh64x8 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt8to64 y))))) ++(Rsh64x16 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt16to64 y))))) ++(Rsh64x32 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] (ZeroExt32to64 y))))) ++(Rsh64x64 x y) && !shiftIsBounded(v) => (SRA x (OR y (ADDI [-1] (SLTIU [64] y)))) ++ ++(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt8to64 x) y) ++(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA (SignExt16to64 x) y) ++(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y) ++(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y) + + // Rotates. + (RotateLeft8 x (MOVDconst [c])) => (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) +@@ -710,10 +710,18 @@ + (MOVDnop (MOVDconst [c])) => (MOVDconst [c]) + + // Avoid unnecessary zero and sign extension when right shifting. +-(SRL (MOVWUreg x) y) => (SRLW x y) +-(SRLI [x] (MOVWUreg y)) => (SRLIW [int64(x&31)] y) +-(SRA (MOVWreg x) y) => (SRAW x y) +-(SRAI [x] (MOVWreg y)) => (SRAIW [int64(x&31)] y) ++(SRAI [x] (MOVWreg y)) && x >= 0 && x <= 31 => (SRAIW [int64(x)] y) ++(SRLI [x] (MOVWUreg y)) && x >= 0 && x <= 31 => (SRLIW [int64(x)] y) ++ ++// Replace right shifts that exceed size of signed type. ++(SRAI [x] (MOVBreg y)) && x >= 8 => (SRAI [63] (SLLI [56] y)) ++(SRAI [x] (MOVHreg y)) && x >= 16 => (SRAI [63] (SLLI [48] y)) ++(SRAI [x] (MOVWreg y)) && x >= 32 => (SRAIW [31] y) ++ ++// Eliminate right shifts that exceed size of unsigned type. ++(SRLI [x] (MOVBUreg y)) && x >= 8 => (MOVDconst [0]) ++(SRLI [x] (MOVHUreg y)) && x >= 16 => (MOVDconst [0]) ++(SRLI [x] (MOVWUreg y)) && x >= 32 => (MOVDconst [0]) + + // Fold constant into immediate instructions where possible. 
+ (ADD (MOVDconst [val]) x) && is32Bit(val) && !t.IsPtr() => (ADDI [val] x) +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 572dac249e..41edcdf8b8 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -2,8 +2,10 @@ + + package ssa + +-import "math" +-import "cmd/compile/internal/types" ++import ( ++ "cmd/compile/internal/types" ++ "math" ++) + + func rewriteValueRISCV64(v *Value) bool { + switch v.Op { +@@ -6260,20 +6262,6 @@ func rewriteValueRISCV64_OpRISCV64SNEZ(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRA(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (SRA (MOVWreg x) y) +- // result: (SRAW x y) +- for { +- t := v.Type +- if v_0.Op != OpRISCV64MOVWreg { +- break +- } +- x := v_0.Args[0] +- y := v_1 +- v.reset(OpRISCV64SRAW) +- v.Type = t +- v.AddArg2(x, y) +- return true +- } + // match: (SRA x (MOVDconst [val])) + // result: (SRAI [int64(val&63)] x) + for { +@@ -6291,8 +6279,10 @@ func rewriteValueRISCV64_OpRISCV64SRA(v *Value) bool { + } + func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + v_0 := v.Args[0] ++ b := v.Block + // match: (SRAI [x] (MOVWreg y)) +- // result: (SRAIW [int64(x&31)] y) ++ // cond: x >= 0 && x <= 31 ++ // result: (SRAIW [int64(x)] y) + for { + t := v.Type + x := auxIntToInt64(v.AuxInt) +@@ -6300,9 +6290,71 @@ func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool { + break + } + y := v_0.Args[0] ++ if !(x >= 0 && x <= 31) { ++ break ++ } + v.reset(OpRISCV64SRAIW) + v.Type = t +- v.AuxInt = int64ToAuxInt(int64(x & 31)) ++ v.AuxInt = int64ToAuxInt(int64(x)) ++ v.AddArg(y) ++ return true ++ } ++ // match: (SRAI [x] (MOVBreg y)) ++ // cond: x >= 8 ++ // result: (SRAI [63] (SLLI [56] y)) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVBreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(x >= 8) { ++ break ++ } ++ v.reset(OpRISCV64SRAI) ++ v.AuxInt = int64ToAuxInt(63) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLLI, t) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(y) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAI [x] (MOVHreg y)) ++ // cond: x >= 16 ++ // result: (SRAI [63] (SLLI [48] y)) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVHreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(x >= 16) { ++ break ++ } ++ v.reset(OpRISCV64SRAI) ++ v.AuxInt = int64ToAuxInt(63) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLLI, t) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(y) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAI [x] (MOVWreg y)) ++ // cond: x >= 32 ++ // result: (SRAIW [31] y) ++ for { ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVWreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(x >= 32) { ++ break ++ } ++ v.reset(OpRISCV64SRAIW) ++ v.AuxInt = int64ToAuxInt(31) + v.AddArg(y) + return true + } +@@ -6341,20 +6393,6 @@ func rewriteValueRISCV64_OpRISCV64SRAW(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (SRL (MOVWUreg x) y) +- // result: (SRLW x y) +- for { +- t := v.Type +- if v_0.Op != OpRISCV64MOVWUreg { +- break +- } +- x := v_0.Args[0] +- y := v_1 +- v.reset(OpRISCV64SRLW) +- v.Type = t +- v.AddArg2(x, y) +- return true +- } + // match: (SRL x (MOVDconst [val])) + // result: (SRLI [int64(val&63)] x) + for { +@@ -6373,7 +6411,8 @@ func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64SRLI(v *Value) 
bool { + v_0 := v.Args[0] + // match: (SRLI [x] (MOVWUreg y)) +- // result: (SRLIW [x] y) ++ // cond: x >= 0 && x <= 31 ++ // result: (SRLIW [int64(x)] y) + for { + t := v.Type + x := auxIntToInt64(v.AuxInt) +@@ -6381,12 +6420,66 @@ func rewriteValueRISCV64_OpRISCV64SRLI(v *Value) bool { + break + } + y := v_0.Args[0] ++ if !(x >= 0 && x <= 31) { ++ break ++ } + v.reset(OpRISCV64SRLIW) + v.Type = t +- v.AuxInt = int64ToAuxInt(x) ++ v.AuxInt = int64ToAuxInt(int64(x)) + v.AddArg(y) + return true + } ++ // match: (SRLI [x] (MOVBUreg y)) ++ // cond: x >= 8 ++ // result: (MOVDconst [0]) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVBUreg { ++ break ++ } ++ if !(x >= 8) { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } ++ // match: (SRLI [x] (MOVHUreg y)) ++ // cond: x >= 16 ++ // result: (MOVDconst [0]) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVHUreg { ++ break ++ } ++ if !(x >= 16) { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } ++ // match: (SRLI [x] (MOVWUreg y)) ++ // cond: x >= 32 ++ // result: (MOVDconst [0]) ++ for { ++ t := v.Type ++ x := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpRISCV64MOVWUreg { ++ break ++ } ++ if !(x >= 32) { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } + // match: (SRLI [x] (MOVDconst [y])) + // result: (MOVDconst [int64(uint64(y) >> uint32(x))]) + for { +@@ -7035,7 +7128,7 @@ func rewriteValueRISCV64_OpRsh32Ux16(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux16 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) ++ // result: (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt16to64 y)))) + for { + t := v.Type + x := v_0 +@@ -7044,33 +7137,29 @@ func rewriteValueRISCV64_OpRsh32Ux16(v *Value) bool { + break + } + v.reset(OpRISCV64AND) +- v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpNeg32, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpNeg32, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v.AddArg2(v0, v2) ++ v1.AddArg(v2) ++ v.AddArg2(v0, v1) + return true + } + // match: (Rsh32Ux16 x y) + // cond: shiftIsBounded(v) +- // result: (SRL (ZeroExt32to64 x) y) ++ // result: (SRLW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRL) +- v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRLW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7082,7 +7171,7 @@ func rewriteValueRISCV64_OpRsh32Ux32(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux32 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) ++ // result: (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt32to64 y)))) + for { + t := v.Type + x := v_0 +@@ -7091,33 +7180,29 @@ func rewriteValueRISCV64_OpRsh32Ux32(v *Value) 
bool { + break + } + v.reset(OpRISCV64AND) +- v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpNeg32, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpNeg32, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v.AddArg2(v0, v2) ++ v1.AddArg(v2) ++ v.AddArg2(v0, v1) + return true + } + // match: (Rsh32Ux32 x y) + // cond: shiftIsBounded(v) +- // result: (SRL (ZeroExt32to64 x) y) ++ // result: (SRLW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRL) +- v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRLW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7126,10 +7211,9 @@ func rewriteValueRISCV64_OpRsh32Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block +- typ := &b.Func.Config.Types + // match: (Rsh32Ux64 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] y))) ++ // result: (AND (SRLW x y) (Neg32 (SLTIU [32] y))) + for { + t := v.Type + x := v_0 +@@ -7138,31 +7222,27 @@ func rewriteValueRISCV64_OpRsh32Ux64(v *Value) bool { + break + } + v.reset(OpRISCV64AND) +- v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpNeg32, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(32) +- v3.AddArg(y) +- v2.AddArg(v3) +- v.AddArg2(v0, v2) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpNeg32, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) ++ v2.AuxInt = int64ToAuxInt(32) ++ v2.AddArg(y) ++ v1.AddArg(v2) ++ v.AddArg2(v0, v1) + return true + } + // match: (Rsh32Ux64 x y) + // cond: shiftIsBounded(v) +- // result: (SRL (ZeroExt32to64 x) y) ++ // result: (SRLW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRL) +- v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRLW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7174,7 +7254,7 @@ func rewriteValueRISCV64_OpRsh32Ux8(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32Ux8 x y) + // cond: !shiftIsBounded(v) +- // result: (AND (SRL (ZeroExt32to64 x) y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) ++ // result: (AND (SRLW x y) (Neg32 (SLTIU [32] (ZeroExt8to64 y)))) + for { + t := v.Type + x := v_0 +@@ -7183,33 +7263,29 @@ func rewriteValueRISCV64_OpRsh32Ux8(v *Value) bool { + break + } + v.reset(OpRISCV64AND) +- v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpNeg32, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpNeg32, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t) ++ 
v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v.AddArg2(v0, v2) ++ v1.AddArg(v2) ++ v.AddArg2(v0, v1) + return true + } + // match: (Rsh32Ux8 x y) + // cond: shiftIsBounded(v) +- // result: (SRL (ZeroExt32to64 x) y) ++ // result: (SRLW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRL) +- v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRLW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7221,7 +7297,7 @@ func rewriteValueRISCV64_OpRsh32x16(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x16 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) ++ // result: (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt16to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7229,36 +7305,32 @@ func rewriteValueRISCV64_OpRsh32x16(v *Value) bool { + if !(!shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) ++ v.reset(OpRISCV64SRAW) + v.Type = t +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) +- v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) +- v2.AuxInt = int64ToAuxInt(-1) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(-1) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v1.AddArg2(y, v2) +- v.AddArg2(v0, v1) ++ v1.AddArg(v2) ++ v0.AddArg2(y, v1) ++ v.AddArg2(x, v0) + return true + } + // match: (Rsh32x16 x y) + // cond: shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) y) ++ // result: (SRAW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRAW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7270,7 +7342,7 @@ func rewriteValueRISCV64_OpRsh32x32(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x32 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) ++ // result: (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt32to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7278,36 +7350,32 @@ func rewriteValueRISCV64_OpRsh32x32(v *Value) bool { + if !(!shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) ++ v.reset(OpRISCV64SRAW) + v.Type = t +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) +- v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) +- v2.AuxInt = int64ToAuxInt(-1) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(-1) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- 
v1.AddArg2(y, v2) +- v.AddArg2(v0, v1) ++ v1.AddArg(v2) ++ v0.AddArg2(y, v1) ++ v.AddArg2(x, v0) + return true + } + // match: (Rsh32x32 x y) + // cond: shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) y) ++ // result: (SRAW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRAW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7316,10 +7384,9 @@ func rewriteValueRISCV64_OpRsh32x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block +- typ := &b.Func.Config.Types + // match: (Rsh32x64 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] y)))) ++ // result: (SRAW x (OR y (ADDI [-1] (SLTIU [32] y)))) + for { + t := v.Type + x := v_0 +@@ -7327,34 +7394,30 @@ func rewriteValueRISCV64_OpRsh32x64(v *Value) bool { + if !(!shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) ++ v.reset(OpRISCV64SRAW) + v.Type = t +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) +- v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) +- v2.AuxInt = int64ToAuxInt(-1) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(32) +- v3.AddArg(y) +- v2.AddArg(v3) +- v1.AddArg2(y, v2) +- v.AddArg2(v0, v1) ++ v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(-1) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) ++ v2.AuxInt = int64ToAuxInt(32) ++ v2.AddArg(y) ++ v1.AddArg(v2) ++ v0.AddArg2(y, v1) ++ v.AddArg2(x, v0) + return true + } + // match: (Rsh32x64 x y) + // cond: shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) y) ++ // result: (SRAW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRAW) ++ v.AddArg2(x, y) + return true + } + return false +@@ -7366,7 +7429,7 @@ func rewriteValueRISCV64_OpRsh32x8(v *Value) bool { + typ := &b.Func.Config.Types + // match: (Rsh32x8 x y) + // cond: !shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) ++ // result: (SRAW x (OR y (ADDI [-1] (SLTIU [32] (ZeroExt8to64 y))))) + for { + t := v.Type + x := v_0 +@@ -7374,36 +7437,32 @@ func rewriteValueRISCV64_OpRsh32x8(v *Value) bool { + if !(!shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) ++ v.reset(OpRISCV64SRAW) + v.Type = t +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) +- v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) +- v2.AuxInt = int64ToAuxInt(-1) +- v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) +- v3.AuxInt = int64ToAuxInt(32) +- v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v4.AddArg(y) +- v3.AddArg(v4) ++ v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(-1) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type) ++ v2.AuxInt = int64ToAuxInt(32) ++ v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v3.AddArg(y) + v2.AddArg(v3) +- v1.AddArg2(y, v2) +- v.AddArg2(v0, v1) ++ v1.AddArg(v2) ++ v0.AddArg2(y, v1) ++ v.AddArg2(x, v0) + return true + } + // match: (Rsh32x8 x y) + // cond: shiftIsBounded(v) +- // result: (SRA (SignExt32to64 x) 
y) ++ // result: (SRAW x y) + for { + x := v_0 + y := v_1 + if !(shiftIsBounded(v)) { + break + } +- v.reset(OpRISCV64SRA) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v.AddArg2(v0, y) ++ v.reset(OpRISCV64SRAW) ++ v.AddArg2(x, y) + return true + } + return false +diff --git a/src/cmd/compile/internal/test/testdata/arith_test.go b/src/cmd/compile/internal/test/testdata/arith_test.go +index 2b8cd9fad3..cd7b5bc2c4 100644 +--- a/src/cmd/compile/internal/test/testdata/arith_test.go ++++ b/src/cmd/compile/internal/test/testdata/arith_test.go +@@ -268,6 +268,70 @@ func testOverflowConstShift(t *testing.T) { + } + } + ++//go:noinline ++func rsh64x64ConstOverflow8(x int8) int64 { ++ return int64(x) >> 9 ++} ++ ++//go:noinline ++func rsh64x64ConstOverflow16(x int16) int64 { ++ return int64(x) >> 17 ++} ++ ++//go:noinline ++func rsh64x64ConstOverflow32(x int32) int64 { ++ return int64(x) >> 33 ++} ++ ++func testArithRightShiftConstOverflow(t *testing.T) { ++ allSet := int64(-1) ++ if got, want := rsh64x64ConstOverflow8(0x7f), int64(0); got != want { ++ t.Errorf("rsh64x64ConstOverflow8 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow16(0x7fff), int64(0); got != want { ++ t.Errorf("rsh64x64ConstOverflow16 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow32(0x7ffffff), int64(0); got != want { ++ t.Errorf("rsh64x64ConstOverflow32 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow8(int8(-1)), allSet; got != want { ++ t.Errorf("rsh64x64ConstOverflow8 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow16(int16(-1)), allSet; got != want { ++ t.Errorf("rsh64x64ConstOverflow16 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64x64ConstOverflow32(int32(-1)), allSet; got != want { ++ t.Errorf("rsh64x64ConstOverflow32 failed: got %v, want %v", got, want) ++ } ++} ++ ++//go:noinline ++func rsh64Ux64ConstOverflow8(x uint8) uint64 { ++ return uint64(x) >> 9 ++} ++ ++//go:noinline ++func rsh64Ux64ConstOverflow16(x uint16) uint64 { ++ return uint64(x) >> 17 ++} ++ ++//go:noinline ++func rsh64Ux64ConstOverflow32(x uint32) uint64 { ++ return uint64(x) >> 33 ++} ++ ++func testRightShiftConstOverflow(t *testing.T) { ++ if got, want := rsh64Ux64ConstOverflow8(0xff), uint64(0); got != want { ++ t.Errorf("rsh64Ux64ConstOverflow8 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64Ux64ConstOverflow16(0xffff), uint64(0); got != want { ++ t.Errorf("rsh64Ux64ConstOverflow16 failed: got %v, want %v", got, want) ++ } ++ if got, want := rsh64Ux64ConstOverflow32(0xffffffff), uint64(0); got != want { ++ t.Errorf("rsh64Ux64ConstOverflow32 failed: got %v, want %v", got, want) ++ } ++} ++ + // test64BitConstMult tests that rewrite rules don't fold 64 bit constants + // into multiply instructions. 
+ func test64BitConstMult(t *testing.T) { +@@ -918,6 +982,8 @@ func TestArithmetic(t *testing.T) { + testShiftCX(t) + testSubConst(t) + testOverflowConstShift(t) ++ testArithRightShiftConstOverflow(t) ++ testRightShiftConstOverflow(t) + testArithConstShift(t) + testArithRshConst(t) + testLargeConst(t) +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index b9d888ca6c..51b9b2e39c 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -22,12 +22,42 @@ func rshConst64Ux64(v uint64) uint64 { + return v >> uint64(33) + } + ++func rshConst64Ux64Overflow32(v uint32) uint64 { ++ // riscv64:"MOV\t\\$0,",-"SRL" ++ return uint64(v) >> 32 ++} ++ ++func rshConst64Ux64Overflow16(v uint16) uint64 { ++ // riscv64:"MOV\t\\$0,",-"SRL" ++ return uint64(v) >> 16 ++} ++ ++func rshConst64Ux64Overflow8(v uint8) uint64 { ++ // riscv64:"MOV\t\\$0,",-"SRL" ++ return uint64(v) >> 8 ++} ++ + func rshConst64x64(v int64) int64 { + // ppc64x:"SRAD" + // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint64(33) + } + ++func rshConst64x64Overflow32(v int32) int64 { ++ // riscv64:"SRAIW",-"SLLI",-"SRAI\t" ++ return int64(v) >> 32 ++} ++ ++func rshConst64x64Overflow16(v int16) int64 { ++ // riscv64:"SLLI","SRAI",-"SRAIW" ++ return int64(v) >> 16 ++} ++ ++func rshConst64x64Overflow8(v int8) int64 { ++ // riscv64:"SLLI","SRAI",-"SRAIW" ++ return int64(v) >> 8 ++} ++ + func lshConst32x64(v int32) int32 { + // ppc64x:"SLW" + // riscv64:"SLLI",-"AND",-"SLTIU", -"MOVW" +-- +2.39.5 + diff --git a/2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch b/2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch new file mode 100644 index 0000000..5463752 --- /dev/null +++ b/2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch @@ -0,0 +1,380 @@ +From 981c83755a2aba3e61156f28dd483d8a555dcaa7 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:34:22 +0800 +Subject: [PATCH 024/119] crypto/sha512: provide optimised assembly for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide an optimised assembly implementation of sha512 for riscv64. +This results in significant performance gains. 
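+
+As a rough illustration (a hypothetical, minimal benchmark sketch, not part of
+this CL), the numbers below come from crypto/sha512 benchmarks of this general
+shape:
+
+	package sha512bench
+
+	import (
+		"crypto/sha512"
+		"testing"
+	)
+
+	var buf = make([]byte, 8192) // 8K input, matching the Hash8K results
+
+	func BenchmarkHash8K(b *testing.B) {
+		b.SetBytes(int64(len(buf)))
+		for i := 0; i < b.N; i++ {
+			sha512.Sum512(buf)
+		}
+	}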
+ +On a StarFive VisionFive 2: + + │ sha512a │ sha512b │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-4 7.998µ ± 0% 6.962µ ± 0% -12.96% (p=0.000 n=10) +Hash8Bytes/Sum384-4 8.113µ ± 0% 6.651µ ± 0% -18.02% (p=0.000 n=10) +Hash8Bytes/Sum512-4 8.269µ ± 0% 6.748µ ± 0% -18.39% (p=0.000 n=10) +Hash1K/New-4 57.38µ ± 0% 36.92µ ± 0% -35.66% (p=0.000 n=10) +Hash1K/Sum384-4 57.47µ ± 0% 36.57µ ± 0% -36.37% (p=0.000 n=10) +Hash1K/Sum512-4 57.61µ ± 0% 36.75µ ± 0% -36.21% (p=0.000 n=10) +Hash8K/New-4 402.5µ ± 0% 245.4µ ± 0% -39.02% (p=0.000 n=10) +Hash8K/Sum384-4 402.5µ ± 0% 245.1µ ± 0% -39.12% (p=0.000 n=10) +Hash8K/Sum512-4 402.7µ ± 0% 245.3µ ± 0% -39.09% (p=0.000 n=10) + + │ sha512a │ sha512b │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 976.6Ki ± 0% 1123.0Ki ± 0% +15.00% (p=0.000 n=10) +Hash8Bytes/Sum384-4 966.8Ki ± 0% 1171.9Ki ± 0% +21.21% (p=0.000 n=10) +Hash8Bytes/Sum512-4 947.3Ki ± 0% 1162.1Ki ± 1% +22.68% (p=0.000 n=10) +Hash1K/New-4 17.01Mi ± 0% 26.45Mi ± 0% +55.47% (p=0.000 n=10) +Hash1K/Sum384-4 16.99Mi ± 0% 26.70Mi ± 0% +57.13% (p=0.000 n=10) +Hash1K/Sum512-4 16.95Mi ± 0% 26.57Mi ± 0% +56.74% (p=0.000 n=10) +Hash8K/New-4 19.41Mi ± 0% 31.83Mi ± 0% +63.99% (p=0.000 n=10) +Hash8K/Sum384-4 19.41Mi ± 0% 31.88Mi ± 0% +64.28% (p=0.000 n=10) +Hash8K/Sum512-4 19.40Mi ± 0% 31.85Mi ± 0% +64.21% (p=0.000 n=10) + +Change-Id: I92629a106b75b0526e9f2a8fe3cc4a6f7fc63c8c +Reviewed-on: https://go-review.googlesource.com/c/go/+/518631 +Auto-Submit: Dmitri Shuralyov +Reviewed-by: Dmitri Shuralyov +Run-TryBot: Joel Sing +Reviewed-by: M Zhuo +Reviewed-by: Wang Yaduo +TryBot-Result: Gopher Robot +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +--- + src/crypto/sha512/sha512block_decl.go | 2 +- + src/crypto/sha512/sha512block_generic.go | 2 +- + src/crypto/sha512/sha512block_riscv64.s | 291 +++++++++++++++++++++++ + 3 files changed, 293 insertions(+), 2 deletions(-) + create mode 100644 src/crypto/sha512/sha512block_riscv64.s + +diff --git a/src/crypto/sha512/sha512block_decl.go b/src/crypto/sha512/sha512block_decl.go +index 4ad4418bc0..d5d03d0f3c 100644 +--- a/src/crypto/sha512/sha512block_decl.go ++++ b/src/crypto/sha512/sha512block_decl.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build s390x || ppc64le || ppc64 ++//go:build ppc64le || ppc64 || riscv64 || s390x + + package sha512 + +diff --git a/src/crypto/sha512/sha512block_generic.go b/src/crypto/sha512/sha512block_generic.go +index 02ecc2c794..f11c0980bd 100644 +--- a/src/crypto/sha512/sha512block_generic.go ++++ b/src/crypto/sha512/sha512block_generic.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64 ++//go:build !amd64 && !arm64 && !ppc64 && !ppc64le && !riscv64 && !s390x + + package sha512 + +diff --git a/src/crypto/sha512/sha512block_riscv64.s b/src/crypto/sha512/sha512block_riscv64.s +new file mode 100644 +index 0000000000..361aafe49d +--- /dev/null ++++ b/src/crypto/sha512/sha512block_riscv64.s +@@ -0,0 +1,291 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "textflag.h" ++ ++// SHA512 block routine. See sha512block.go for Go equivalent. 
++// ++// The algorithm is detailed in FIPS 180-4: ++// ++// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf ++// ++// Wt = Mt; for 0 <= t <= 15 ++// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79 ++// ++// a = H0 ++// b = H1 ++// c = H2 ++// d = H3 ++// e = H4 ++// f = H5 ++// g = H6 ++// h = H7 ++// ++// for t = 0 to 79 { ++// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt ++// T2 = BIGSIGMA0(a) + Maj(a,b,c) ++// h = g ++// g = f ++// f = e ++// e = d + T1 ++// d = c ++// c = b ++// b = a ++// a = T1 + T2 ++// } ++// ++// H0 = a + H0 ++// H1 = b + H1 ++// H2 = c + H2 ++// H3 = d + H3 ++// H4 = e + H4 ++// H5 = f + H5 ++// H6 = g + H6 ++// H7 = h + H7 ++ ++#define ROR(s, r, d, t1, t2) \ ++ SLL $(64-s), r, t1; \ ++ SRL $(s), r, t2; \ ++ OR t1, t2, d ++ ++// Wt = Mt; for 0 <= t <= 15 ++#define MSGSCHEDULE0(index) \ ++ MOVBU ((index*8)+0)(X29), X5; \ ++ MOVBU ((index*8)+1)(X29), X6; \ ++ MOVBU ((index*8)+2)(X29), X7; \ ++ MOVBU ((index*8)+3)(X29), X8; \ ++ SLL $56, X5; \ ++ SLL $48, X6; \ ++ OR X5, X6, X5; \ ++ SLL $40, X7; \ ++ OR X5, X7, X5; \ ++ SLL $32, X8; \ ++ OR X5, X8, X5; \ ++ MOVBU ((index*8)+4)(X29), X9; \ ++ MOVBU ((index*8)+5)(X29), X6; \ ++ MOVBU ((index*8)+6)(X29), X7; \ ++ MOVBU ((index*8)+7)(X29), X8; \ ++ SLL $24, X9; \ ++ OR X5, X9, X5; \ ++ SLL $16, X6; \ ++ OR X5, X6, X5; \ ++ SLL $8, X7; \ ++ OR X5, X7, X5; \ ++ OR X5, X8, X5; \ ++ MOV X5, (index*8)(X19) ++ ++// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79 ++// SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x) ++// SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x) ++#define MSGSCHEDULE1(index) \ ++ MOV (((index-2)&0xf)*8)(X19), X5; \ ++ MOV (((index-15)&0xf)*8)(X19), X6; \ ++ MOV (((index-7)&0xf)*8)(X19), X9; \ ++ MOV (((index-16)&0xf)*8)(X19), X21; \ ++ ROR(19, X5, X7, X23, X24); \ ++ ROR(61, X5, X8, X23, X24); \ ++ SRL $6, X5; \ ++ XOR X7, X5; \ ++ XOR X8, X5; \ ++ ADD X9, X5; \ ++ ROR(1, X6, X7, X23, X24); \ ++ ROR(8, X6, X8, X23, X24); \ ++ SRL $7, X6; \ ++ XOR X7, X6; \ ++ XOR X8, X6; \ ++ ADD X6, X5; \ ++ ADD X21, X5; \ ++ MOV X5, ((index&0xf)*8)(X19) ++ ++// Calculate T1 in X5. ++// h is also used as an accumulator. Wt is passed in X5. ++// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt ++// BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x) ++// Ch(x, y, z) = (x AND y) XOR (NOT x AND z) ++#define SHA512T1(index, e, f, g, h) \ ++ MOV (index*8)(X18), X8; \ ++ ADD X5, h; \ ++ ROR(14, e, X6, X23, X24); \ ++ ADD X8, h; \ ++ ROR(18, e, X7, X23, X24); \ ++ XOR X7, X6; \ ++ ROR(41, e, X8, X23, X24); \ ++ XOR X8, X6; \ ++ ADD X6, h; \ ++ AND e, f, X5; \ ++ NOT e, X7; \ ++ AND g, X7; \ ++ XOR X7, X5; \ ++ ADD h, X5 ++ ++// Calculate T2 in X6. ++// T2 = BIGSIGMA0(a) + Maj(a, b, c) ++// BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x) ++// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) ++#define SHA512T2(a, b, c) \ ++ ROR(28, a, X6, X23, X24); \ ++ ROR(34, a, X7, X23, X24); \ ++ XOR X7, X6; \ ++ ROR(39, a, X8, X23, X24); \ ++ XOR X8, X6; \ ++ AND a, b, X7; \ ++ AND a, c, X8; \ ++ XOR X8, X7; \ ++ AND b, c, X9; \ ++ XOR X9, X7; \ ++ ADD X7, X6 ++ ++// Calculate T1 and T2, then e = d + T1 and a = T1 + T2. ++// The values for e and a are stored in d and h, ready for rotation. 
++#define SHA512ROUND(index, a, b, c, d, e, f, g, h) \ ++ SHA512T1(index, e, f, g, h); \ ++ SHA512T2(a, b, c); \ ++ MOV X6, h; \ ++ ADD X5, d; \ ++ ADD X5, h ++ ++#define SHA512ROUND0(index, a, b, c, d, e, f, g, h) \ ++ MSGSCHEDULE0(index); \ ++ SHA512ROUND(index, a, b, c, d, e, f, g, h) ++ ++#define SHA512ROUND1(index, a, b, c, d, e, f, g, h) \ ++ MSGSCHEDULE1(index); \ ++ SHA512ROUND(index, a, b, c, d, e, f, g, h) ++ ++// func block(dig *digest, p []byte) ++TEXT ·block(SB),0,$128-32 ++ MOV p_base+8(FP), X29 ++ MOV p_len+16(FP), X30 ++ SRL $7, X30 ++ SLL $7, X30 ++ ++ ADD X29, X30, X28 ++ BEQ X28, X29, end ++ ++ MOV ·_K(SB), X18 // const table ++ ADD $8, X2, X19 // message schedule ++ ++ MOV dig+0(FP), X20 ++ MOV (0*8)(X20), X10 // a = H0 ++ MOV (1*8)(X20), X11 // b = H1 ++ MOV (2*8)(X20), X12 // c = H2 ++ MOV (3*8)(X20), X13 // d = H3 ++ MOV (4*8)(X20), X14 // e = H4 ++ MOV (5*8)(X20), X15 // f = H5 ++ MOV (6*8)(X20), X16 // g = H6 ++ MOV (7*8)(X20), X17 // h = H7 ++ ++loop: ++ SHA512ROUND0(0, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND0(1, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND0(2, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND0(3, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND0(4, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND0(5, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND0(6, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND0(7, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND0(8, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND0(9, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND0(10, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND0(11, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND0(12, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND0(13, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND0(14, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND0(15, X11, X12, X13, X14, X15, X16, X17, X10) ++ ++ SHA512ROUND1(16, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(17, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(18, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(19, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(20, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(21, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(22, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(23, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(24, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(25, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(26, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(27, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(28, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(29, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(30, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(31, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(32, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(33, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(34, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(35, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(36, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(37, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(38, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(39, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(40, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(41, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(42, X16, X17, X10, X11, X12, X13, 
X14, X15) ++ SHA512ROUND1(43, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(44, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(45, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(46, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(47, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(48, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(49, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(50, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(51, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(52, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(53, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(54, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(55, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(56, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(57, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(58, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(59, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(60, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(61, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(62, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(63, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(64, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(65, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(66, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(67, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(68, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(69, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(70, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(71, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA512ROUND1(72, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA512ROUND1(73, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA512ROUND1(74, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA512ROUND1(75, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA512ROUND1(76, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA512ROUND1(77, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA512ROUND1(78, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA512ROUND1(79, X11, X12, X13, X14, X15, X16, X17, X10) ++ ++ MOV (0*8)(X20), X5 ++ MOV (1*8)(X20), X6 ++ MOV (2*8)(X20), X7 ++ MOV (3*8)(X20), X8 ++ ADD X5, X10 // H0 = a + H0 ++ ADD X6, X11 // H1 = b + H1 ++ ADD X7, X12 // H2 = c + H2 ++ ADD X8, X13 // H3 = d + H3 ++ MOV X10, (0*8)(X20) ++ MOV X11, (1*8)(X20) ++ MOV X12, (2*8)(X20) ++ MOV X13, (3*8)(X20) ++ MOV (4*8)(X20), X5 ++ MOV (5*8)(X20), X6 ++ MOV (6*8)(X20), X7 ++ MOV (7*8)(X20), X8 ++ ADD X5, X14 // H4 = e + H4 ++ ADD X6, X15 // H5 = f + H5 ++ ADD X7, X16 // H6 = g + H6 ++ ADD X8, X17 // H7 = h + H7 ++ MOV X14, (4*8)(X20) ++ MOV X15, (5*8)(X20) ++ MOV X16, (6*8)(X20) ++ MOV X17, (7*8)(X20) ++ ++ ADD $128, X29 ++ BNE X28, X29, loop ++ ++end: ++ RET +-- +2.39.5 + diff --git a/2025-cmd-go-add-GORISCV64-environment-variable.patch b/2025-cmd-go-add-GORISCV64-environment-variable.patch new file mode 100644 index 0000000..285fb4b --- /dev/null +++ b/2025-cmd-go-add-GORISCV64-environment-variable.patch @@ -0,0 +1,396 @@ +From 75cafea7e262d3f3d23ea8a5e90172354558a1e3 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:33 +0800 +Subject: [PATCH 025/119] cmd/go: add GORISCV64 environment variable + +The variable represents the RISC-V user-mode application profile for +which to compile. Valid values are rva20u64 (the default) and +rva22u64. 
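+
+As an illustrative sketch (hypothetical file, not part of this CL), Go code can
+select a profile-specific implementation via the build tags described in the
+following paragraphs:
+
+	//go:build riscv64.rva22u64
+
+	package impl
+
+	// This file is only built when GORISCV64=rva22u64 is selected, so it
+	// may assume the extensions mandated by the RVA22U64 profile.
+	const haveRVA22 = true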
+ +Setting GORISCV64=rva20u64 defines the riscv64.rva20u64 build tag, +sets the internal variable buildcfg.GORISCV64 to 20 and defines the +macro GORISCV64_rva20u64 for use in assembly language code. + +Setting GORISCV64=rva22u64 defines the riscv64.rva20u64 and +riscv64.rva22u64 build tags, sets the internal variable +buildcfg.GORISCV64 to 22 and defines the macro GORISCV64_rva22u64 +for use in assembly language code. + +This patch only provides a mechanism for the compiler and hand-coded +assembly language functions to take advantage of the RISC-V +extensions mandated by the application profiles. Further patches +will be required to get the compiler/assembler and assembly language +functions to actually generate and use these extensions. + +Fixes #61476 + +Change-Id: I9195ae6ee71703cd2112160e89157ab63b8391af +Reviewed-on: https://go-review.googlesource.com/c/go/+/541135 +Reviewed-by: M Zhuo +Reviewed-by: Joel Sing +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Wang Yaduo +Reviewed-by: Cherry Mui +Reviewed-by: Bryan Mills +Run-TryBot: M Zhuo +TryBot-Result: Gopher Robot +--- + src/cmd/dist/build.go | 15 +++++++ + src/cmd/dist/buildruntime.go | 1 + + src/cmd/go/alldocs.go | 9 +++- + src/cmd/go/internal/cfg/cfg.go | 19 +++++---- + src/cmd/go/internal/help/helpdoc.go | 9 +++- + src/cmd/go/internal/work/gc.go | 5 +++ + src/cmd/go/testdata/script/tooltags.txt | 20 +++++++++ + src/cmd/internal/testdir/testdir_test.go | 2 +- + src/internal/buildcfg/cfg.go | 53 +++++++++++++++++------- + src/internal/buildcfg/cfg_test.go | 14 +++++++ + src/internal/cfg/cfg.go | 1 + + 11 files changed, 122 insertions(+), 26 deletions(-) + +diff --git a/src/cmd/dist/build.go b/src/cmd/dist/build.go +index ce573686ec..7d720cc5e1 100644 +--- a/src/cmd/dist/build.go ++++ b/src/cmd/dist/build.go +@@ -39,6 +39,7 @@ var ( + gomips string + gomips64 string + goppc64 string ++ goriscv64 string + goroot string + goroot_final string + goextlinkenabled string +@@ -184,6 +185,12 @@ func xinit() { + } + goppc64 = b + ++ b = os.Getenv("GORISCV64") ++ if b == "" { ++ b = "rva20u64" ++ } ++ goriscv64 = b ++ + if p := pathf("%s/src/all.bash", goroot); !isfile(p) { + fatalf("$GOROOT is not set correctly or not exported\n"+ + "\tGOROOT=%s\n"+ +@@ -244,6 +251,7 @@ func xinit() { + os.Setenv("GOMIPS", gomips) + os.Setenv("GOMIPS64", gomips64) + os.Setenv("GOPPC64", goppc64) ++ os.Setenv("GORISCV64", goriscv64) + os.Setenv("GOROOT", goroot) + os.Setenv("GOROOT_FINAL", goroot_final) + +@@ -899,6 +907,10 @@ func runInstall(pkg string, ch chan struct{}) { + asmArgs = append(asmArgs, "-D", "GOPPC64_power8") + } + } ++ if goarch == "riscv64" { ++ // Define GORISCV64_value from goriscv64 ++ asmArgs = append(asmArgs, "-D", "GORISCV64_"+goriscv64) ++ } + goasmh := pathf("%s/go_asm.h", workdir) + if IsRuntimePackagePath(pkg) { + asmArgs = append(asmArgs, "-compiling-runtime") +@@ -1253,6 +1265,9 @@ func cmdenv() { + if goarch == "ppc64" || goarch == "ppc64le" { + xprintf(format, "GOPPC64", goppc64) + } ++ if goarch == "riscv64" { ++ xprintf(format, "GORISCV64", goriscv64) ++ } + xprintf(format, "GOWORK", "off") + + if *path { +diff --git a/src/cmd/dist/buildruntime.go b/src/cmd/dist/buildruntime.go +index 816b944400..7095f43772 100644 +--- a/src/cmd/dist/buildruntime.go ++++ b/src/cmd/dist/buildruntime.go +@@ -58,6 +58,7 @@ func mkbuildcfg(file string) { + fmt.Fprintf(&buf, "const defaultGOMIPS = `%s`\n", gomips) + fmt.Fprintf(&buf, "const defaultGOMIPS64 = `%s`\n", gomips64) + fmt.Fprintf(&buf, "const defaultGOPPC64 = `%s`\n", goppc64) ++ 
fmt.Fprintf(&buf, "const defaultGORISCV64 = `%s`\n", goriscv64) + fmt.Fprintf(&buf, "const defaultGOEXPERIMENT = `%s`\n", goexperiment) + fmt.Fprintf(&buf, "const defaultGO_EXTLINK_ENABLED = `%s`\n", goextlinkenabled) + fmt.Fprintf(&buf, "const defaultGO_LDSO = `%s`\n", defaultldso) +diff --git a/src/cmd/go/alldocs.go b/src/cmd/go/alldocs.go +index bb28756133..db737b062e 100644 +--- a/src/cmd/go/alldocs.go ++++ b/src/cmd/go/alldocs.go +@@ -1978,10 +1978,13 @@ + // ppc64.power8, ppc64.power9, and ppc64.power10 + // (or ppc64le.power8, ppc64le.power9, and ppc64le.power10) + // feature build tags. ++// - For GOARCH=riscv64, ++// GORISCV64=rva20u64 and rva22u64 correspond to the riscv64.rva20u64 ++// and riscv64.rva22u64 build tags. + // - For GOARCH=wasm, GOWASM=satconv and signext + // correspond to the wasm.satconv and wasm.signext feature build tags. + // +-// For GOARCH=amd64, arm, ppc64, and ppc64le, a particular feature level ++// For GOARCH=amd64, arm, ppc64, ppc64le, and riscv64, a particular feature level + // sets the feature build tags for all previous levels as well. + // For example, GOAMD64=v2 sets the amd64.v1 and amd64.v2 feature flags. + // This ensures that code making use of v2 features continues to compile +@@ -2275,6 +2278,10 @@ + // GOPPC64 + // For GOARCH=ppc64{,le}, the target ISA (Instruction Set Architecture). + // Valid values are power8 (default), power9, power10. ++// GORISCV64 ++// For GOARCH=riscv64, the RISC-V user-mode application profile for which ++// to compile. Valid values are rva20u64 (default), rva22u64. ++// See https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc + // GOWASM + // For GOARCH=wasm, comma-separated list of experimental WebAssembly features to use. + // Valid values are satconv, signext. +diff --git a/src/cmd/go/internal/cfg/cfg.go b/src/cmd/go/internal/cfg/cfg.go +index 3b591a17d0..da7174c153 100644 +--- a/src/cmd/go/internal/cfg/cfg.go ++++ b/src/cmd/go/internal/cfg/cfg.go +@@ -409,14 +409,15 @@ var ( + GOMODCACHE = envOr("GOMODCACHE", gopathDir("pkg/mod")) + + // Used in envcmd.MkEnv and build ID computations. 
+- GOARM = envOr("GOARM", fmt.Sprint(buildcfg.GOARM)) +- GOARM64 = envOr("GOARM64", fmt.Sprint(buildcfg.GOARM64)) +- GO386 = envOr("GO386", buildcfg.GO386) +- GOAMD64 = envOr("GOAMD64", fmt.Sprintf("%s%d", "v", buildcfg.GOAMD64)) +- GOMIPS = envOr("GOMIPS", buildcfg.GOMIPS) +- GOMIPS64 = envOr("GOMIPS64", buildcfg.GOMIPS64) +- GOPPC64 = envOr("GOPPC64", fmt.Sprintf("%s%d", "power", buildcfg.GOPPC64)) +- GOWASM = envOr("GOWASM", fmt.Sprint(buildcfg.GOWASM)) ++ GOARM = envOr("GOARM", fmt.Sprint(buildcfg.GOARM)) ++ GOARM64 = envOr("GOARM64", fmt.Sprint(buildcfg.GOARM64)) ++ GO386 = envOr("GO386", buildcfg.GO386) ++ GOAMD64 = envOr("GOAMD64", fmt.Sprintf("%s%d", "v", buildcfg.GOAMD64)) ++ GOMIPS = envOr("GOMIPS", buildcfg.GOMIPS) ++ GOMIPS64 = envOr("GOMIPS64", buildcfg.GOMIPS64) ++ GOPPC64 = envOr("GOPPC64", fmt.Sprintf("%s%d", "power", buildcfg.GOPPC64)) ++ GORISCV64 = envOr("GORISCV64", fmt.Sprintf("rva%du64", buildcfg.GORISCV64)) ++ GOWASM = envOr("GOWASM", fmt.Sprint(buildcfg.GOWASM)) + + GOPROXY = envOr("GOPROXY", "") + GOSUMDB = envOr("GOSUMDB", "") +@@ -449,6 +450,8 @@ func GetArchEnv() (key, val string) { + return "GOMIPS64", GOMIPS64 + case "ppc64", "ppc64le": + return "GOPPC64", GOPPC64 ++ case "riscv64": ++ return "GORISCV64", GORISCV64 + case "wasm": + return "GOWASM", GOWASM + } +diff --git a/src/cmd/go/internal/help/helpdoc.go b/src/cmd/go/internal/help/helpdoc.go +index 68ac4d229d..55701bac46 100644 +--- a/src/cmd/go/internal/help/helpdoc.go ++++ b/src/cmd/go/internal/help/helpdoc.go +@@ -617,6 +617,10 @@ Architecture-specific environment variables: + GOPPC64 + For GOARCH=ppc64{,le}, the target ISA (Instruction Set Architecture). + Valid values are power8 (default), power9, power10. ++ GORISCV64 ++ For GOARCH=riscv64, the RISC-V user-mode application profile for which ++ to compile. Valid values are rva20u64 (default), rva22u64. ++ See https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc + GOWASM + For GOARCH=wasm, comma-separated list of experimental WebAssembly features to use. + Valid values are satconv, signext. +@@ -905,10 +909,13 @@ The defined architecture feature build tags are: + ppc64.power8, ppc64.power9, and ppc64.power10 + (or ppc64le.power8, ppc64le.power9, and ppc64le.power10) + feature build tags. ++ - For GOARCH=riscv64, ++ GORISCV64=rva20u64 and rva22u64 correspond to the riscv64.rva20u64 ++ and riscv64.rva22u64 build tags. + - For GOARCH=wasm, GOWASM=satconv and signext + correspond to the wasm.satconv and wasm.signext feature build tags. + +-For GOARCH=amd64, arm, ppc64, and ppc64le, a particular feature level ++For GOARCH=amd64, arm, ppc64, ppc64le, and riscv64, a particular feature level + sets the feature build tags for all previous levels as well. + For example, GOAMD64=v2 sets the amd64.v1 and amd64.v2 feature flags. + This ensures that code making use of v2 features continues to compile +diff --git a/src/cmd/go/internal/work/gc.go b/src/cmd/go/internal/work/gc.go +index f682219b3b..a8bd121472 100644 +--- a/src/cmd/go/internal/work/gc.go ++++ b/src/cmd/go/internal/work/gc.go +@@ -409,6 +409,11 @@ func asmArgs(a *Action, p *load.Package) []any { + } + } + ++ if cfg.Goarch == "riscv64" { ++ // Define GORISCV64_value from cfg.GORISCV64. 
++ args = append(args, "-D", "GORISCV64_"+cfg.GORISCV64) ++ } ++ + return args + } + +diff --git a/src/cmd/go/testdata/script/tooltags.txt b/src/cmd/go/testdata/script/tooltags.txt +index 27068eebae..1f6f54563c 100644 +--- a/src/cmd/go/testdata/script/tooltags.txt ++++ b/src/cmd/go/testdata/script/tooltags.txt +@@ -40,6 +40,26 @@ env GOPPC64=power10 + go list -f '{{context.ToolTags}}' + stdout 'ppc64le.power8 ppc64le.power9 ppc64le.power10' + ++env GOARCH=riscv64 ++env GORISCV64=rva20u64 ++go list -f '{{context.ToolTags}}' ++stdout 'riscv64.rva20u64' ++ ++env GOARCH=riscv64 ++env GORISCV64=rva22u64 ++go list -f '{{context.ToolTags}}' ++stdout 'riscv64.rva20u64 riscv64.rva22u64' ++ ++env GOARCH=riscv64 ++env GORISCV64=rva22 ++! go list -f '{{context.ToolTags}}' ++stderr 'go: invalid GORISCV64: must be rva20u64, rva22u64' ++ ++env GOARCH=riscv64 ++env GORISCV64= ++go list -f '{{context.ToolTags}}' ++stdout 'riscv64.rva20u64' ++ + env GOARCH=386 + env GO386=sse2 + go list -f '{{context.ToolTags}}' +diff --git a/src/cmd/internal/testdir/testdir_test.go b/src/cmd/internal/testdir/testdir_test.go +index bd7785900c..1677191d96 100644 +--- a/src/cmd/internal/testdir/testdir_test.go ++++ b/src/cmd/internal/testdir/testdir_test.go +@@ -1464,7 +1464,7 @@ var ( + "ppc64x": {}, // A pseudo-arch representing both ppc64 and ppc64le + "s390x": {}, + "wasm": {}, +- "riscv64": {}, ++ "riscv64": {"GORISCV64", "rva20u64", "rva22u64"}, + } + ) + +diff --git a/src/internal/buildcfg/cfg.go b/src/internal/buildcfg/cfg.go +index dbb1f70ec3..599e782c7a 100644 +--- a/src/internal/buildcfg/cfg.go ++++ b/src/internal/buildcfg/cfg.go +@@ -21,20 +21,21 @@ import ( + ) + + var ( +- GOROOT = runtime.GOROOT() // cached for efficiency +- GOARCH = envOr("GOARCH", defaultGOARCH) +- GOOS = envOr("GOOS", defaultGOOS) +- GO386 = envOr("GO386", defaultGO386) +- GOAMD64 = goamd64() +- GOARM = goarm() +- GOARM64 = goarm64() +- GOMIPS = gomips() +- GOMIPS64 = gomips64() +- GOPPC64 = goppc64() +- GOWASM = gowasm() +- ToolTags = toolTags() +- GO_LDSO = defaultGO_LDSO +- Version = version ++ GOROOT = runtime.GOROOT() // cached for efficiency ++ GOARCH = envOr("GOARCH", defaultGOARCH) ++ GOOS = envOr("GOOS", defaultGOOS) ++ GO386 = envOr("GO386", defaultGO386) ++ GOAMD64 = goamd64() ++ GOARM = goarm() ++ GOARM64 = goarm64() ++ GOMIPS = gomips() ++ GOMIPS64 = gomips64() ++ GOPPC64 = goppc64() ++ GORISCV64 = goriscv64() ++ GOWASM = gowasm() ++ ToolTags = toolTags() ++ GO_LDSO = defaultGO_LDSO ++ Version = version + ) + + // Error is one of the errors found (if any) in the build configuration. 
+@@ -139,7 +140,7 @@ func ParseGoarm64(v string) (g Goarm64Features, e error) { + + switch v { + case "v8.0", "v8.1", "v8.2", "v8.3", "v8.4", "v8.5", "v8.6", "v8.7", "v8.8", "v8.9", +- "v9.0", "v9.1", "v9.2", "v9.4", "v9.5": ++ "v9.0", "v9.1", "v9.2", "v9.4", "v9.5": + g.Version = v + default: + e = fmt.Errorf("invalid GOARM64: must start with v8.{0-9} or v9.{0-5} and may optionally end in %q and/or %q", +@@ -213,6 +214,22 @@ func goppc64() int { + return int(defaultGOPPC64[len("power")] - '0') + } + ++func goriscv64() int { ++ switch v := envOr("GORISCV64", defaultGORISCV64); v { ++ case "rva20u64": ++ return 20 ++ case "rva22u64": ++ return 22 ++ } ++ Error = fmt.Errorf("invalid GORISCV64: must be rva20u64, rva22u64") ++ v := defaultGORISCV64[len("rva"):] ++ i := strings.IndexFunc(v, func(r rune) bool { ++ return r < '0' || r > '9' ++ }) ++ year, _ := strconv.Atoi(v[:i]) ++ return year ++} ++ + type gowasmFeatures struct { + SatConv bool + SignExt bool +@@ -331,6 +348,12 @@ func gogoarchTags() []string { + list = append(list, fmt.Sprintf("%s.power%d", GOARCH, i)) + } + return list ++ case "riscv64": ++ list := []string{GOARCH + "." + "rva20u64"} ++ if GORISCV64 >= 22 { ++ list = append(list, GOARCH+"."+"rva22u64") ++ } ++ return list + case "wasm": + var list []string + if GOWASM.SatConv { +diff --git a/src/internal/buildcfg/cfg_test.go b/src/internal/buildcfg/cfg_test.go +index 0123593317..69eeef2422 100644 +--- a/src/internal/buildcfg/cfg_test.go ++++ b/src/internal/buildcfg/cfg_test.go +@@ -23,4 +23,18 @@ func TestConfigFlags(t *testing.T) { + if goamd64(); Error == nil { + t.Errorf("Wrong parsing of GOAMD64=1") + } ++ ++ os.Setenv("GORISCV64", "rva20u64") ++ if goriscv64() != 20 { ++ t.Errorf("Wrong parsing of RISCV64=rva20u64") ++ } ++ os.Setenv("GORISCV64", "rva22u64") ++ if goriscv64() != 22 { ++ t.Errorf("Wrong parsing of RISCV64=rva22u64") ++ } ++ Error = nil ++ os.Setenv("GORISCV64", "rva22") ++ if _ = goriscv64(); Error == nil { ++ t.Errorf("Wrong parsing of RISCV64=rva22") ++ } + } +diff --git a/src/internal/cfg/cfg.go b/src/internal/cfg/cfg.go +index 7ef5bb7be6..08d210b797 100644 +--- a/src/internal/cfg/cfg.go ++++ b/src/internal/cfg/cfg.go +@@ -58,6 +58,7 @@ const KnownEnv = ` + GOPPC64 + GOPRIVATE + GOPROXY ++ GORISCV64 + GOROOT + GOSUMDB + GOTMPDIR +-- +2.39.5 + diff --git a/2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch b/2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch new file mode 100644 index 0000000..9074965 --- /dev/null +++ b/2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch @@ -0,0 +1,520 @@ +From 0c99ca5b172774e907aa32f188236266d1770712 Mon Sep 17 00:00:00 2001 +From: Keith Randall +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 026/119] cmd/compile: implement float min/max in hardware for + amd64 and arm64 + +Update #59488 + +Change-Id: I89f5ea494cbcc887f6fae8560e57bcbd8749be86 +Reviewed-on: https://go-review.googlesource.com/c/go/+/514596 +Reviewed-by: Keith Randall +TryBot-Result: Gopher Robot +Run-TryBot: Keith Randall +Reviewed-by: Cherry Mui +--- + src/cmd/compile/internal/amd64/ssa.go | 3 +- + src/cmd/compile/internal/arm64/ssa.go | 4 + + src/cmd/compile/internal/ssa/_gen/AMD64.rules | 14 ++ + src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 9 +- + src/cmd/compile/internal/ssa/_gen/ARM64.rules | 3 + + src/cmd/compile/internal/ssa/_gen/ARM64Ops.go | 4 + + .../compile/internal/ssa/_gen/genericOps.go | 6 + + src/cmd/compile/internal/ssa/opGen.go | 133 ++++++++++++++++++ + 
src/cmd/compile/internal/ssa/rewriteAMD64.go | 90 ++++++++++++ + src/cmd/compile/internal/ssa/rewriteARM64.go | 12 ++ + src/cmd/compile/internal/ssagen/ssa.go | 23 ++- + 11 files changed, 298 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go +index 113875861c..174ea1ffc8 100644 +--- a/src/cmd/compile/internal/amd64/ssa.go ++++ b/src/cmd/compile/internal/amd64/ssa.go +@@ -252,7 +252,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB, + ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD, + ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD, +- ssa.OpAMD64PXOR, ++ ssa.OpAMD64MINSS, ssa.OpAMD64MINSD, ++ ssa.OpAMD64POR, ssa.OpAMD64PXOR, + ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ, + ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ, + ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ: +diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go +index a0b432bd97..27b4e881c0 100644 +--- a/src/cmd/compile/internal/arm64/ssa.go ++++ b/src/cmd/compile/internal/arm64/ssa.go +@@ -215,6 +215,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpARM64FNMULD, + ssa.OpARM64FDIVS, + ssa.OpARM64FDIVD, ++ ssa.OpARM64FMINS, ++ ssa.OpARM64FMIND, ++ ssa.OpARM64FMAXS, ++ ssa.OpARM64FMAXD, + ssa.OpARM64ROR, + ssa.OpARM64RORW: + r := v.Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules +index 5f9b85fc41..fbbeff6dc2 100644 +--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules +@@ -172,6 +172,20 @@ + + (Round(32|64)F ...) => (Copy ...) + ++// Floating-point min is tricky, as the hardware op isn't right for various special ++// cases (-0 and NaN). We use two hardware ops organized just right to make the ++// result come out how we want it. See https://github.com/golang/go/issues/59488#issuecomment-1553493207 ++// (although that comment isn't exactly right, as the value overwritten is not simulated correctly). ++// t1 = MINSD x, y => incorrect if x==NaN or x==-0,y==+0 ++// t2 = MINSD t1, x => fixes x==NaN case ++// res = POR t1, t2 => fixes x==-0,y==+0 case ++// Note that this trick depends on the special property that (NaN OR x) produces a NaN (although ++// it might not produce the same NaN as the input). ++(Min(64|32)F x y) => (POR (MINS(D|S) (MINS(D|S) x y) x) (MINS(D|S) x y)) ++// Floating-point max is even trickier. Punt to using min instead. ++// max(x,y) == -min(-x,-y) ++(Max(64|32)F x y) => (Neg(64|32)F (Min(64|32)F (Neg(64|32)F x) (Neg(64|32)F y))) ++ + (CvtBoolToUint8 ...) => (Copy ...) + + // Lowering shifts +diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +index d8d0225fc3..27a6844b77 100644 +--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +@@ -681,6 +681,12 @@ func init() { + // Any use must be preceded by a successful check of runtime.support_fma. + {name: "VFMADD231SD", argLength: 3, reg: fp31, resultInArg0: true, asm: "VFMADD231SD"}, + ++ // Note that these operations don't exactly match the semantics of Go's ++ // builtin min. In particular, these aren't commutative, because on various ++ // special cases the 2nd argument is preferred. 
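++	// (Specifically, if either argument is a NaN, or if both arguments are
++	// zeros of either sign, the x86 MINSD/MINSS instructions return the
++	// second argument, which is what the fix-up rules in AMD64.rules
++	// compensate for.)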
++ {name: "MINSD", argLength: 2, reg: fp21, resultInArg0: true, asm: "MINSD"}, // min(arg0,arg1) ++ {name: "MINSS", argLength: 2, reg: fp21, resultInArg0: true, asm: "MINSS"}, // min(arg0,arg1) ++ + {name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear. + {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear. + // Note: SBBW and SBBB are subsumed by SBBL +@@ -746,7 +752,8 @@ func init() { + {name: "MOVLi2f", argLength: 1, reg: gpfp, typ: "Float32"}, // move 32 bits from int to float reg + {name: "MOVLf2i", argLength: 1, reg: fpgp, typ: "UInt32"}, // move 32 bits from float to int reg, zero extend + +- {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation. ++ {name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs (for float negation). ++ {name: "POR", argLength: 2, reg: fp21, asm: "POR", commutative: true, resultInArg0: true}, // inclusive or, applied to X regs (for float min/max). + + {name: "LEAQ", argLength: 1, reg: gp11sb, asm: "LEAQ", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux + {name: "LEAL", argLength: 1, reg: gp11sb, asm: "LEAL", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux +diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules +index 8cf6f6740e..94032d6ca4 100644 +--- a/src/cmd/compile/internal/ssa/_gen/ARM64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules +@@ -61,6 +61,9 @@ + + (Sqrt32 ...) => (FSQRTS ...) + ++(Min(64|32)F ...) => (FMIN(D|S) ...) ++(Max(64|32)F ...) => (FMAX(D|S) ...) ++ + // lowering rotates + // we do rotate detection in generic rules, if the following rules need to be changed, check generic rules first. 
+ (RotateLeft8 x (MOVDconst [c])) => (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) +diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go +index 2853e62540..c0c7cbbe61 100644 +--- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go +@@ -234,6 +234,10 @@ func init() { + {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64 + {name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64 + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0), float32 ++ {name: "FMIND", argLength: 2, reg: fp21, asm: "FMIND"}, // min(arg0, arg1) ++ {name: "FMINS", argLength: 2, reg: fp21, asm: "FMINS"}, // min(arg0, arg1) ++ {name: "FMAXD", argLength: 2, reg: fp21, asm: "FMAXD"}, // max(arg0, arg1) ++ {name: "FMAXS", argLength: 2, reg: fp21, asm: "FMAXS"}, // max(arg0, arg1) + {name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // byte reverse, 64-bit + {name: "REVW", argLength: 1, reg: gp11, asm: "REVW"}, // byte reverse, 32-bit + {name: "REV16", argLength: 1, reg: gp11, asm: "REV16"}, // byte reverse in each 16-bit halfword, 64-bit +diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go +index 53ff57f6b1..fb18319263 100644 +--- a/src/cmd/compile/internal/ssa/_gen/genericOps.go ++++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go +@@ -285,6 +285,12 @@ var genericOps = []opData{ + {name: "Abs", argLength: 1}, // absolute value arg0 + {name: "Copysign", argLength: 2}, // copy sign from arg0 to arg1 + ++ // Float min/max implementation, if hardware is available. ++ {name: "Min64F", argLength: 2}, // min(arg0,arg1) ++ {name: "Min32F", argLength: 2}, // min(arg0,arg1) ++ {name: "Max64F", argLength: 2}, // max(arg0,arg1) ++ {name: "Max32F", argLength: 2}, // max(arg0,arg1) ++ + // 3-input opcode. + // Fused-multiply-add, float64 only. + // When a*b+c is exactly zero (before rounding), then the result is +0 or -0. 
+diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 62b516ce61..c811c4e020 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -912,6 +912,8 @@ const ( + OpAMD64SQRTSS + OpAMD64ROUNDSD + OpAMD64VFMADD231SD ++ OpAMD64MINSD ++ OpAMD64MINSS + OpAMD64SBBQcarrymask + OpAMD64SBBLcarrymask + OpAMD64SETEQ +@@ -964,6 +966,7 @@ const ( + OpAMD64MOVLi2f + OpAMD64MOVLf2i + OpAMD64PXOR ++ OpAMD64POR + OpAMD64LEAQ + OpAMD64LEAL + OpAMD64LEAW +@@ -1441,6 +1444,10 @@ const ( + OpARM64FNEGD + OpARM64FSQRTD + OpARM64FSQRTS ++ OpARM64FMIND ++ OpARM64FMINS ++ OpARM64FMAXD ++ OpARM64FMAXS + OpARM64REV + OpARM64REVW + OpARM64REV16 +@@ -3016,6 +3023,10 @@ const ( + OpRoundToEven + OpAbs + OpCopysign ++ OpMin64F ++ OpMin32F ++ OpMax64F ++ OpMax32F + OpFMA + OpPhi + OpCopy +@@ -11900,6 +11911,36 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "MINSD", ++ argLen: 2, ++ resultInArg0: true, ++ asm: x86.AMINSD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ outputs: []outputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ }, ++ }, ++ { ++ name: "MINSS", ++ argLen: 2, ++ resultInArg0: true, ++ asm: x86.AMINSS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ outputs: []outputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ }, ++ }, + { + name: "SBBQcarrymask", + argLen: 1, +@@ -12520,6 +12561,22 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "POR", ++ argLen: 2, ++ commutative: true, ++ resultInArg0: true, ++ asm: x86.APOR, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ outputs: []outputInfo{ ++ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 ++ }, ++ }, ++ }, + { + name: "LEAQ", + auxType: auxSymOff, +@@ -19287,6 +19344,62 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "FMIND", ++ argLen: 2, ++ asm: arm64.AFMIND, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FMINS", ++ argLen: 2, ++ asm: arm64.AFMINS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ 
name: "FMAXD", ++ argLen: 2, ++ asm: arm64.AFMAXD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "FMAXS", ++ argLen: 2, ++ asm: arm64.AFMAXS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + { + name: "REV", + argLen: 1, +@@ -39072,6 +39185,26 @@ var opcodeTable = [...]opInfo{ + argLen: 2, + generic: true, + }, ++ { ++ name: "Min64F", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Min32F", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Max64F", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Max32F", ++ argLen: 2, ++ generic: true, ++ }, + { + name: "FMA", + argLen: 3, +diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go +index 88bd48f331..23a1b11ddd 100644 +--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go ++++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go +@@ -871,6 +871,14 @@ func rewriteValueAMD64(v *Value) bool { + return rewriteValueAMD64_OpLsh8x64(v) + case OpLsh8x8: + return rewriteValueAMD64_OpLsh8x8(v) ++ case OpMax32F: ++ return rewriteValueAMD64_OpMax32F(v) ++ case OpMax64F: ++ return rewriteValueAMD64_OpMax64F(v) ++ case OpMin32F: ++ return rewriteValueAMD64_OpMin32F(v) ++ case OpMin64F: ++ return rewriteValueAMD64_OpMin64F(v) + case OpMod16: + return rewriteValueAMD64_OpMod16(v) + case OpMod16u: +@@ -27481,6 +27489,88 @@ func rewriteValueAMD64_OpLsh8x8(v *Value) bool { + } + return false + } ++func rewriteValueAMD64_OpMax32F(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Max32F x y) ++ // result: (Neg32F (Min32F (Neg32F x) (Neg32F y))) ++ for { ++ t := v.Type ++ x := v_0 ++ y := v_1 ++ v.reset(OpNeg32F) ++ v.Type = t ++ v0 := b.NewValue0(v.Pos, OpMin32F, t) ++ v1 := b.NewValue0(v.Pos, OpNeg32F, t) ++ v1.AddArg(x) ++ v2 := b.NewValue0(v.Pos, OpNeg32F, t) ++ v2.AddArg(y) ++ v0.AddArg2(v1, v2) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueAMD64_OpMax64F(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Max64F x y) ++ // result: (Neg64F (Min64F (Neg64F x) (Neg64F y))) ++ for { ++ t := v.Type ++ x := v_0 ++ y := v_1 ++ v.reset(OpNeg64F) ++ v.Type = t ++ v0 := b.NewValue0(v.Pos, OpMin64F, t) ++ v1 := b.NewValue0(v.Pos, OpNeg64F, t) ++ v1.AddArg(x) ++ v2 := b.NewValue0(v.Pos, OpNeg64F, t) ++ v2.AddArg(y) ++ v0.AddArg2(v1, v2) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueAMD64_OpMin32F(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Min32F x y) ++ // result: (POR (MINSS (MINSS x y) x) 
(MINSS x y)) ++ for { ++ t := v.Type ++ x := v_0 ++ y := v_1 ++ v.reset(OpAMD64POR) ++ v0 := b.NewValue0(v.Pos, OpAMD64MINSS, t) ++ v1 := b.NewValue0(v.Pos, OpAMD64MINSS, t) ++ v1.AddArg2(x, y) ++ v0.AddArg2(v1, x) ++ v.AddArg2(v0, v1) ++ return true ++ } ++} ++func rewriteValueAMD64_OpMin64F(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Min64F x y) ++ // result: (POR (MINSD (MINSD x y) x) (MINSD x y)) ++ for { ++ t := v.Type ++ x := v_0 ++ y := v_1 ++ v.reset(OpAMD64POR) ++ v0 := b.NewValue0(v.Pos, OpAMD64MINSD, t) ++ v1 := b.NewValue0(v.Pos, OpAMD64MINSD, t) ++ v1.AddArg2(x, y) ++ v0.AddArg2(v1, x) ++ v.AddArg2(v0, v1) ++ return true ++ } ++} + func rewriteValueAMD64_OpMod16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go +index 3b8fe30371..93a741ad87 100644 +--- a/src/cmd/compile/internal/ssa/rewriteARM64.go ++++ b/src/cmd/compile/internal/ssa/rewriteARM64.go +@@ -820,6 +820,18 @@ func rewriteValueARM64(v *Value) bool { + return rewriteValueARM64_OpLsh8x64(v) + case OpLsh8x8: + return rewriteValueARM64_OpLsh8x8(v) ++ case OpMax32F: ++ v.Op = OpARM64FMAXS ++ return true ++ case OpMax64F: ++ v.Op = OpARM64FMAXD ++ return true ++ case OpMin32F: ++ v.Op = OpARM64FMINS ++ return true ++ case OpMin64F: ++ v.Op = OpARM64FMIND ++ return true + case OpMod16: + return rewriteValueARM64_OpMod16(v) + case OpMod16u: +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index cc70dc8f7d..b668f1847c 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -3567,11 +3567,32 @@ func (s *state) minMax(n *ir.CallExpr) *ssa.Value { + + if typ.IsFloat() || typ.IsString() { + // min/max semantics for floats are tricky because of NaNs and +- // negative zero, so we let the runtime handle this instead. ++ // negative zero. Some architectures have instructions which ++ // we can use to generate the right result. For others we must ++ // call into the runtime instead. + // + // Strings are conceptually simpler, but we currently desugar + // string comparisons during walk, not ssagen. + ++ if typ.IsFloat() { ++ switch Arch.LinkArch.Family { ++ case sys.AMD64, sys.ARM64: ++ var op ssa.Op ++ switch { ++ case typ.Kind() == types.TFLOAT64 && n.Op() == ir.OMIN: ++ op = ssa.OpMin64F ++ case typ.Kind() == types.TFLOAT64 && n.Op() == ir.OMAX: ++ op = ssa.OpMax64F ++ case typ.Kind() == types.TFLOAT32 && n.Op() == ir.OMIN: ++ op = ssa.OpMin32F ++ case typ.Kind() == types.TFLOAT32 && n.Op() == ir.OMAX: ++ op = ssa.OpMax32F ++ } ++ return fold(func(x, a *ssa.Value) *ssa.Value { ++ return s.newValue2(op, typ, x, a) ++ }) ++ } ++ } + var name string + switch typ.Kind() { + case types.TFLOAT32: +-- +2.39.5 + diff --git a/2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch b/2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch new file mode 100644 index 0000000..764882c --- /dev/null +++ b/2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch @@ -0,0 +1,348 @@ +From aed69da05999cc02f56ecf3736ed81ca6bc113e1 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 027/119] cmd/compile: implement float min/max in hardware for + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CL 514596 adds float min/max for amd64, this CL adds it for riscv64. 
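+
+For reference, the Go-level requirement discussed below can be illustrated with
+a small snippet using the min/max builtins (hypothetical, illustrative only):
+
+	package main
+
+	import (
+		"fmt"
+		"math"
+	)
+
+	func main() {
+		// Go requires NaN to propagate: if any argument is a NaN,
+		// the result is a NaN.
+		fmt.Println(math.IsNaN(min(math.NaN(), 1.0))) // true
+		fmt.Println(math.IsNaN(max(1.0, math.NaN()))) // true
+	}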
+ +The behavior of the RISC-V FMIN/FMAX instructions almost match Go's +requirements. + +However according to RISCV spec 8.3 "NaN Generation and Propagation" +>> if at least one input is a signaling NaN, or if both inputs are quiet +>> NaNs, the result is the canonical NaN. If one operand is a quiet NaN +>> and the other is not a NaN, the result is the non-NaN operand. + +Go using quiet NaN as NaN and according to Go spec +>> if any argument is a NaN, the result is a NaN + +This requires the float min/max implementation to check whether one +of operand is qNaN before float mix/max actually execute. + +This CL also fix a typo in minmax test. + +Benchmark on Visionfive2 +goos: linux +goarch: riscv64 +pkg: runtime + │ float_minmax.old.bench │ float_minmax.new.bench │ + │ sec/op │ sec/op vs base │ +MinFloat 158.20n ± 0% 28.13n ± 0% -82.22% (p=0.000 n=10) +MaxFloat 158.10n ± 0% 28.12n ± 0% -82.21% (p=0.000 n=10) +geomean 158.1n 28.12n -82.22% + +Update #59488 + +Change-Id: Iab48be6d32b8882044fb8c821438ca8840e5493d +Reviewed-on: https://go-review.googlesource.com/c/go/+/514775 +Reviewed-by: Mauri de Souza Meneguzzo +Run-TryBot: M Zhuo +Reviewed-by: Joel Sing +Reviewed-by: Cherry Mui +TryBot-Result: Gopher Robot +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/riscv64/ssa.go | 66 ++++++++++++++++++ + .../compile/internal/ssa/_gen/RISCV64.rules | 3 + + .../compile/internal/ssa/_gen/RISCV64Ops.go | 4 ++ + src/cmd/compile/internal/ssa/opGen.go | 68 +++++++++++++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 12 ++++ + src/cmd/compile/internal/ssagen/ssa.go | 2 +- + src/runtime/minmax_test.go | 22 +++++- + 7 files changed, 174 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 22338188e5..caca504d28 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -297,6 +297,72 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r ++ ++ case ssa.OpRISCV64LoweredFMAXD, ssa.OpRISCV64LoweredFMIND, ssa.OpRISCV64LoweredFMAXS, ssa.OpRISCV64LoweredFMINS: ++ // Most of FMIN/FMAX result match Go's required behaviour, unless one of the ++ // inputs is a NaN. As such, we need to explicitly test for NaN ++ // before using FMIN/FMAX. ++ ++ // FADD Rarg0, Rarg1, Rout // FADD is used to propagate a NaN to the result in these cases. 
++ // FEQ Rarg0, Rarg0, Rtmp ++ // BEQZ Rtmp, end ++ // FEQ Rarg1, Rarg1, Rtmp ++ // BEQZ Rtmp, end ++ // F(MIN | MAX) ++ ++ r0 := v.Args[0].Reg() ++ r1 := v.Args[1].Reg() ++ out := v.Reg() ++ add, feq := riscv.AFADDD, riscv.AFEQD ++ if v.Op == ssa.OpRISCV64LoweredFMAXS || v.Op == ssa.OpRISCV64LoweredFMINS { ++ add = riscv.AFADDS ++ feq = riscv.AFEQS ++ } ++ ++ p1 := s.Prog(add) ++ p1.From.Type = obj.TYPE_REG ++ p1.From.Reg = r0 ++ p1.Reg = r1 ++ p1.To.Type = obj.TYPE_REG ++ p1.To.Reg = out ++ ++ p2 := s.Prog(feq) ++ p2.From.Type = obj.TYPE_REG ++ p2.From.Reg = r0 ++ p2.Reg = r0 ++ p2.To.Type = obj.TYPE_REG ++ p2.To.Reg = riscv.REG_TMP ++ ++ p3 := s.Prog(riscv.ABEQ) ++ p3.From.Type = obj.TYPE_REG ++ p3.From.Reg = riscv.REG_ZERO ++ p3.Reg = riscv.REG_TMP ++ p3.To.Type = obj.TYPE_BRANCH ++ ++ p4 := s.Prog(feq) ++ p4.From.Type = obj.TYPE_REG ++ p4.From.Reg = r1 ++ p4.Reg = r1 ++ p4.To.Type = obj.TYPE_REG ++ p4.To.Reg = riscv.REG_TMP ++ ++ p5 := s.Prog(riscv.ABEQ) ++ p5.From.Type = obj.TYPE_REG ++ p5.From.Reg = riscv.REG_ZERO ++ p5.Reg = riscv.REG_TMP ++ p5.To.Type = obj.TYPE_BRANCH ++ ++ p6 := s.Prog(v.Op.Asm()) ++ p6.From.Type = obj.TYPE_REG ++ p6.From.Reg = r1 ++ p6.Reg = r0 ++ p6.To.Type = obj.TYPE_REG ++ p6.To.Reg = out ++ ++ nop := s.Prog(obj.ANOP) ++ p3.To.SetTarget(nop) ++ p5.To.SetTarget(nop) ++ + case ssa.OpRISCV64LoweredMuluhilo: + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index fc206c42d3..4fef20a565 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -72,6 +72,9 @@ + + (FMA ...) => (FMADDD ...) + ++(Min(64|32)F ...) => (LoweredFMIN(D|S) ...) ++(Max(64|32)F ...) => (LoweredFMAX(D|S) ...) ++ + // Sign and zero extension. + + (SignExt8to16 ...) => (MOVBreg ...) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 93f20f8a99..9ce6450166 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -429,6 +429,8 @@ func init() { + {name: "FNES", argLength: 2, reg: fp2gp, asm: "FNES", commutative: true}, // arg0 != arg1 + {name: "FLTS", argLength: 2, reg: fp2gp, asm: "FLTS"}, // arg0 < arg1 + {name: "FLES", argLength: 2, reg: fp2gp, asm: "FLES"}, // arg0 <= arg1 ++ {name: "LoweredFMAXS", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMAXS", commutative: true, typ: "Float32"}, // max(arg0, arg1) ++ {name: "LoweredFMINS", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMINS", commutative: true, typ: "Float32"}, // min(arg0, arg1) + + // D extension. 
+ {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true, typ: "Float64"}, // arg0 + arg1 +@@ -456,6 +458,8 @@ func init() { + {name: "FNED", argLength: 2, reg: fp2gp, asm: "FNED", commutative: true}, // arg0 != arg1 + {name: "FLTD", argLength: 2, reg: fp2gp, asm: "FLTD"}, // arg0 < arg1 + {name: "FLED", argLength: 2, reg: fp2gp, asm: "FLED"}, // arg0 <= arg1 ++ {name: "LoweredFMIND", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMIND", commutative: true, typ: "Float64"}, // min(arg0, arg1) ++ {name: "LoweredFMAXD", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMAXD", commutative: true, typ: "Float64"}, // max(arg0, arg1) + } + + RISCV64blocks := []blockData{ +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index c811c4e020..e10b054214 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2446,6 +2446,8 @@ const ( + OpRISCV64FNES + OpRISCV64FLTS + OpRISCV64FLES ++ OpRISCV64LoweredFMAXS ++ OpRISCV64LoweredFMINS + OpRISCV64FADDD + OpRISCV64FSUBD + OpRISCV64FMULD +@@ -2471,6 +2473,8 @@ const ( + OpRISCV64FNED + OpRISCV64FLTD + OpRISCV64FLED ++ OpRISCV64LoweredFMIND ++ OpRISCV64LoweredFMAXD + + OpS390XFADDS + OpS390XFADD +@@ -32805,6 +32809,38 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "LoweredFMAXS", ++ argLen: 2, ++ commutative: true, ++ resultNotInArgs: true, ++ asm: riscv.AFMAXS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "LoweredFMINS", ++ argLen: 2, ++ commutative: true, ++ resultNotInArgs: true, ++ asm: riscv.AFMINS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + { + name: "FADDD", + argLen: 2, +@@ -33159,6 +33195,38 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "LoweredFMIND", ++ argLen: 2, ++ commutative: true, ++ resultNotInArgs: true, ++ asm: riscv.AFMIND, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, ++ { ++ name: "LoweredFMAXD", ++ argLen: 2, ++ commutative: true, ++ resultNotInArgs: true, ++ asm: riscv.AFMAXD, ++ reg: regInfo{ ++ inputs: 
[]inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ }, ++ }, + + { + name: "FADDS", +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 41edcdf8b8..230033c7af 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -328,6 +328,18 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpLsh8x64(v) + case OpLsh8x8: + return rewriteValueRISCV64_OpLsh8x8(v) ++ case OpMax32F: ++ v.Op = OpRISCV64LoweredFMAXS ++ return true ++ case OpMax64F: ++ v.Op = OpRISCV64LoweredFMAXD ++ return true ++ case OpMin32F: ++ v.Op = OpRISCV64LoweredFMINS ++ return true ++ case OpMin64F: ++ v.Op = OpRISCV64LoweredFMIND ++ return true + case OpMod16: + return rewriteValueRISCV64_OpMod16(v) + case OpMod16u: +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index b668f1847c..ec89a45701 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -3576,7 +3576,7 @@ func (s *state) minMax(n *ir.CallExpr) *ssa.Value { + + if typ.IsFloat() { + switch Arch.LinkArch.Family { +- case sys.AMD64, sys.ARM64: ++ case sys.AMD64, sys.ARM64, sys.RISCV64: + var op ssa.Op + switch { + case typ.Kind() == types.TFLOAT64 && n.Op() == ir.OMIN: +diff --git a/src/runtime/minmax_test.go b/src/runtime/minmax_test.go +index e0bc28fbf6..1f815a84c3 100644 +--- a/src/runtime/minmax_test.go ++++ b/src/runtime/minmax_test.go +@@ -66,10 +66,10 @@ func TestMaxFloat(t *testing.T) { + } + for _, x := range all { + if z := max(nan, x); !math.IsNaN(z) { +- t.Errorf("min(%v, %v) = %v, want %v", nan, x, z, nan) ++ t.Errorf("max(%v, %v) = %v, want %v", nan, x, z, nan) + } + if z := max(x, nan); !math.IsNaN(z) { +- t.Errorf("min(%v, %v) = %v, want %v", nan, x, z, nan) ++ t.Errorf("max(%v, %v) = %v, want %v", nan, x, z, nan) + } + } + } +@@ -127,3 +127,21 @@ func TestMinMaxStringTies(t *testing.T) { + test(2, 0, 1) + test(2, 1, 0) + } ++ ++func BenchmarkMinFloat(b *testing.B) { ++ var m float64 = 0 ++ for i := 0; i < b.N; i++ { ++ for _, f := range all { ++ m = min(m, f) ++ } ++ } ++} ++ ++func BenchmarkMaxFloat(b *testing.B) { ++ var m float64 = 0 ++ for i := 0; i < b.N; i++ { ++ for _, f := range all { ++ m = max(m, f) ++ } ++ } ++} +-- +2.39.5 + diff --git a/2028-cmd-compile-improve-rotations-for-riscv64.patch b/2028-cmd-compile-improve-rotations-for-riscv64.patch new file mode 100644 index 0000000..71c6a7a --- /dev/null +++ b/2028-cmd-compile-improve-rotations-for-riscv64.patch @@ -0,0 +1,596 @@ +From 94e798b40448d2d8a1f21ee7f711d92e546a8bd7 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 028/119] cmd/compile: improve rotations for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Enable canRotate for riscv64, enable rotation intrinsics and provide +better rewrite implementations for rotations. 
By avoiding Lsh*x64 +and Rsh*Ux64 we can produce better code, especially for 32 and 64 +bit rotations. By enabling canRotate we also benefit from the generic +rotation rewrite rules. + +Benchmark on a StarFive VisionFive 2: + + │ rotate.1 │ rotate.2 │ + │ sec/op │ sec/op vs base │ +RotateLeft-4 14.700n ± 0% 8.016n ± 0% -45.47% (p=0.000 n=10) +RotateLeft8-4 14.70n ± 0% 10.69n ± 0% -27.28% (p=0.000 n=10) +RotateLeft16-4 14.70n ± 0% 12.02n ± 0% -18.23% (p=0.000 n=10) +RotateLeft32-4 13.360n ± 0% 8.016n ± 0% -40.00% (p=0.000 n=10) +RotateLeft64-4 13.360n ± 0% 8.016n ± 0% -40.00% (p=0.000 n=10) +geomean 14.15n 9.208n -34.92% + +Change-Id: I1a2036fdc57cf88ebb6617eb8d92e1d187e183b2 +Reviewed-on: https://go-review.googlesource.com/c/go/+/560315 +Reviewed-by: M Zhuo +Run-TryBot: Joel Sing +TryBot-Result: Gopher Robot +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +Reviewed-by: David Chase +--- + src/cmd/compile/internal/riscv64/ssa.go | 6 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 9 +- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 22 +-- + src/cmd/compile/internal/ssa/opGen.go | 30 ++++ + src/cmd/compile/internal/ssa/rewrite.go | 2 +- + .../compile/internal/ssa/rewriteRISCV64.go | 142 ++++++++++-------- + src/cmd/compile/internal/ssagen/ssa.go | 8 +- + test/codegen/rotate.go | 22 +++ + 8 files changed, 153 insertions(+), 88 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index caca504d28..17f0d98532 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -278,7 +278,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = rd + case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XOR, ssa.OpRISCV64OR, ssa.OpRISCV64AND, +- ssa.OpRISCV64SLL, ssa.OpRISCV64SRA, ssa.OpRISCV64SRAW, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, ++ ssa.OpRISCV64SLL, ssa.OpRISCV64SLLW, ssa.OpRISCV64SRA, ssa.OpRISCV64SRAW, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, + ssa.OpRISCV64SLT, ssa.OpRISCV64SLTU, ssa.OpRISCV64MUL, ssa.OpRISCV64MULW, ssa.OpRISCV64MULH, + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, + ssa.OpRISCV64DIVUW, ssa.OpRISCV64REM, ssa.OpRISCV64REMU, ssa.OpRISCV64REMW, +@@ -422,8 +422,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpRISCV64ADDI, ssa.OpRISCV64ADDIW, ssa.OpRISCV64XORI, ssa.OpRISCV64ORI, ssa.OpRISCV64ANDI, +- ssa.OpRISCV64SLLI, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRAIW, ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, +- ssa.OpRISCV64SLTIU: ++ ssa.OpRISCV64SLLI, ssa.OpRISCV64SLLIW, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRAIW, ++ ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, ssa.OpRISCV64SLTIU: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 4fef20a565..135d70bc47 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -214,10 +214,10 @@ + (Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y) + + // Rotates. 
+-(RotateLeft8 x (MOVDconst [c])) => (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) +-(RotateLeft16 x (MOVDconst [c])) => (Or16 (Lsh16x64 x (MOVDconst [c&15])) (Rsh16Ux64 x (MOVDconst [-c&15]))) +-(RotateLeft32 x (MOVDconst [c])) => (Or32 (Lsh32x64 x (MOVDconst [c&31])) (Rsh32Ux64 x (MOVDconst [-c&31]))) +-(RotateLeft64 x (MOVDconst [c])) => (Or64 (Lsh64x64 x (MOVDconst [c&63])) (Rsh64Ux64 x (MOVDconst [-c&63]))) ++(RotateLeft8 x y) => (OR (SLL x (ANDI [7] y)) (SRL (ZeroExt8to64 x) (ANDI [7] (NEG y)))) ++(RotateLeft16 x y) => (OR (SLL x (ANDI [15] y)) (SRL (ZeroExt16to64 x) (ANDI [15] (NEG y)))) ++(RotateLeft32 x y) => (OR (SLLW x y) (SRLW x (NEG y))) ++(RotateLeft64 x y) => (OR (SLL x y) (SRL x (NEG y))) + + (Less64 ...) => (SLT ...) + (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) +@@ -733,6 +733,7 @@ + (XOR (MOVDconst [val]) x) && is32Bit(val) => (XORI [val] x) + (SLL x (MOVDconst [val])) => (SLLI [int64(val&63)] x) + (SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x) ++(SLLW x (MOVDconst [val])) => (SLLIW [int64(val&31)] x) + (SRLW x (MOVDconst [val])) => (SRLIW [int64(val&31)] x) + (SRA x (MOVDconst [val])) => (SRAI [int64(val&63)] x) + (SRAW x (MOVDconst [val])) => (SRAIW [int64(val&31)] x) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 9ce6450166..e9f1df0d58 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -207,16 +207,18 @@ func init() { + {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}, resultInArg0: true}, // nop, return arg0 in same register + + // Shift ops +- {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63) +- {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), signed +- {name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >> (aux1 & 31), signed +- {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), unsigned +- {name: "SRLW", argLength: 2, reg: gp21, asm: "SRLW"}, // arg0 >> (aux1 & 31), unsigned +- {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, shift amount 0-63 +- {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-63 +- {name: "SRAIW", argLength: 1, reg: gp11, asm: "SRAIW", aux: "Int64"}, // arg0 >> auxint, signed, shift amount 0-31 +- {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-63 +- {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, unsigned, shift amount 0-31 ++ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (aux1 & 63), logical left shift ++ {name: "SLLW", argLength: 2, reg: gp21, asm: "SLLW"}, // arg0 << (aux1 & 31), logical left shift of 32 bit value, sign extended to 64 bits ++ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (aux1 & 63), arithmetic right shift ++ {name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >> (aux1 & 31), arithmetic right shift of 32 bit value, sign extended to 64 bits ++ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> (aux1 & 63), logical right shift ++ {name: "SRLW", argLength: 2, reg: gp21, asm: "SRLW"}, // arg0 >> (aux1 & 31), logical right shift of 32 bit value, sign extended to 64 bits ++ {name: "SLLI", argLength: 1, reg: gp11, asm: "SLLI", aux: "Int64"}, // arg0 << auxint, 
shift amount 0-63, logical left shift ++ {name: "SLLIW", argLength: 1, reg: gp11, asm: "SLLIW", aux: "Int64"}, // arg0 << auxint, shift amount 0-31, logical left shift of 32 bit value, sign extended to 64 bits ++ {name: "SRAI", argLength: 1, reg: gp11, asm: "SRAI", aux: "Int64"}, // arg0 >> auxint, shift amount 0-63, arithmetic right shift ++ {name: "SRAIW", argLength: 1, reg: gp11, asm: "SRAIW", aux: "Int64"}, // arg0 >> auxint, shift amount 0-31, arithmetic right shift of 32 bit value, sign extended to 64 bits ++ {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, shift amount 0-63, logical right shift ++ {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, shift amount 0-31, logical right shift of 32 bit value, sign extended to 64 bits + + // Bitwise ops + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index e10b054214..29ca9f5c0f 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2370,11 +2370,13 @@ const ( + OpRISCV64MOVWUreg + OpRISCV64MOVDnop + OpRISCV64SLL ++ OpRISCV64SLLW + OpRISCV64SRA + OpRISCV64SRAW + OpRISCV64SRL + OpRISCV64SRLW + OpRISCV64SLLI ++ OpRISCV64SLLIW + OpRISCV64SRAI + OpRISCV64SRAIW + OpRISCV64SRLI +@@ -31778,6 +31780,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SLLW", ++ argLen: 2, ++ asm: riscv.ASLLW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SRA", + argLen: 2, +@@ -31848,6 +31864,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SLLIW", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.ASLLIW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SRAI", + auxType: auxInt64, +diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go +index 43843bda55..63d13bf6c4 100644 +--- a/src/cmd/compile/internal/ssa/rewrite.go ++++ b/src/cmd/compile/internal/ssa/rewrite.go +@@ -2066,7 +2066,7 @@ func canRotate(c *Config, bits int64) bool { + return false + } + switch c.arch { +- case "386", "amd64", "arm64": ++ case "386", "amd64", "arm64", "riscv64": + return true + case "arm", "s390x", "ppc64", "ppc64le", "wasm", "loong64": + return bits >= 32 +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 230033c7af..ca0e108915 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -538,6 +538,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64SLL(v) + case OpRISCV64SLLI: + return rewriteValueRISCV64_OpRISCV64SLLI(v) ++ case OpRISCV64SLLW: ++ return rewriteValueRISCV64_OpRISCV64SLLW(v) + case OpRISCV64SLT: + 
return rewriteValueRISCV64_OpRISCV64SLT(v) + case OpRISCV64SLTI: +@@ -6072,6 +6074,24 @@ func rewriteValueRISCV64_OpRISCV64SLLI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64SLLW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SLLW x (MOVDconst [val])) ++ // result: (SLLIW [int64(val&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64SLLIW) ++ v.AuxInt = int64ToAuxInt(int64(val & 31)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64SLT(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6646,112 +6666,102 @@ func rewriteValueRISCV64_OpRotateLeft16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types +- // match: (RotateLeft16 x (MOVDconst [c])) +- // result: (Or16 (Lsh16x64 x (MOVDconst [c&15])) (Rsh16Ux64 x (MOVDconst [-c&15]))) ++ // match: (RotateLeft16 x y) ++ // result: (OR (SLL x (ANDI [15] y)) (SRL (ZeroExt16to64 x) (ANDI [15] (NEG y)))) + for { + t := v.Type + x := v_0 +- if v_1.Op != OpRISCV64MOVDconst { +- break +- } +- c := auxIntToInt64(v_1.AuxInt) +- v.reset(OpOr16) +- v0 := b.NewValue0(v.Pos, OpLsh16x64, t) +- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v1.AuxInt = int64ToAuxInt(c & 15) ++ y := v_1 ++ v.reset(OpRISCV64OR) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ANDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(15) ++ v1.AddArg(y) + v0.AddArg2(x, v1) +- v2 := b.NewValue0(v.Pos, OpRsh16Ux64, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(-c & 15) +- v2.AddArg2(x, v3) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SRL, t) ++ v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v3.AddArg(x) ++ v4 := b.NewValue0(v.Pos, OpRISCV64ANDI, y.Type) ++ v4.AuxInt = int64ToAuxInt(15) ++ v5 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) ++ v5.AddArg(y) ++ v4.AddArg(v5) ++ v2.AddArg2(v3, v4) + v.AddArg2(v0, v2) + return true + } +- return false + } + func rewriteValueRISCV64_OpRotateLeft32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block +- typ := &b.Func.Config.Types +- // match: (RotateLeft32 x (MOVDconst [c])) +- // result: (Or32 (Lsh32x64 x (MOVDconst [c&31])) (Rsh32Ux64 x (MOVDconst [-c&31]))) ++ // match: (RotateLeft32 x y) ++ // result: (OR (SLLW x y) (SRLW x (NEG y))) + for { + t := v.Type + x := v_0 +- if v_1.Op != OpRISCV64MOVDconst { +- break +- } +- c := auxIntToInt64(v_1.AuxInt) +- v.reset(OpOr32) +- v0 := b.NewValue0(v.Pos, OpLsh32x64, t) +- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v1.AuxInt = int64ToAuxInt(c & 31) +- v0.AddArg2(x, v1) +- v2 := b.NewValue0(v.Pos, OpRsh32Ux64, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(-c & 31) +- v2.AddArg2(x, v3) +- v.AddArg2(v0, v2) ++ y := v_1 ++ v.reset(OpRISCV64OR) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLLW, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) ++ v2.AddArg(y) ++ v1.AddArg2(x, v2) ++ v.AddArg2(v0, v1) + return true + } +- return false + } + func rewriteValueRISCV64_OpRotateLeft64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block +- typ := &b.Func.Config.Types +- // match: (RotateLeft64 x (MOVDconst [c])) +- // result: (Or64 (Lsh64x64 x (MOVDconst [c&63])) (Rsh64Ux64 x (MOVDconst [-c&63]))) ++ // match: (RotateLeft64 x y) ++ // result: (OR (SLL x y) (SRL x 
(NEG y))) + for { + t := v.Type + x := v_0 +- if v_1.Op != OpRISCV64MOVDconst { +- break +- } +- c := auxIntToInt64(v_1.AuxInt) +- v.reset(OpOr64) +- v0 := b.NewValue0(v.Pos, OpLsh64x64, t) +- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v1.AuxInt = int64ToAuxInt(c & 63) +- v0.AddArg2(x, v1) +- v2 := b.NewValue0(v.Pos, OpRsh64Ux64, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(-c & 63) +- v2.AddArg2(x, v3) +- v.AddArg2(v0, v2) ++ y := v_1 ++ v.reset(OpRISCV64OR) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpRISCV64SRL, t) ++ v2 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) ++ v2.AddArg(y) ++ v1.AddArg2(x, v2) ++ v.AddArg2(v0, v1) + return true + } +- return false + } + func rewriteValueRISCV64_OpRotateLeft8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types +- // match: (RotateLeft8 x (MOVDconst [c])) +- // result: (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) ++ // match: (RotateLeft8 x y) ++ // result: (OR (SLL x (ANDI [7] y)) (SRL (ZeroExt8to64 x) (ANDI [7] (NEG y)))) + for { + t := v.Type + x := v_0 +- if v_1.Op != OpRISCV64MOVDconst { +- break +- } +- c := auxIntToInt64(v_1.AuxInt) +- v.reset(OpOr8) +- v0 := b.NewValue0(v.Pos, OpLsh8x64, t) +- v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v1.AuxInt = int64ToAuxInt(c & 7) ++ y := v_1 ++ v.reset(OpRISCV64OR) ++ v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t) ++ v1 := b.NewValue0(v.Pos, OpRISCV64ANDI, y.Type) ++ v1.AuxInt = int64ToAuxInt(7) ++ v1.AddArg(y) + v0.AddArg2(x, v1) +- v2 := b.NewValue0(v.Pos, OpRsh8Ux64, t) +- v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(-c & 7) +- v2.AddArg2(x, v3) ++ v2 := b.NewValue0(v.Pos, OpRISCV64SRL, t) ++ v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v3.AddArg(x) ++ v4 := b.NewValue0(v.Pos, OpRISCV64ANDI, y.Type) ++ v4.AuxInt = int64ToAuxInt(7) ++ v5 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) ++ v5.AddArg(y) ++ v4.AddArg(v5) ++ v2.AddArg2(v3, v4) + v.AddArg2(v0, v2) + return true + } +- return false + } + func rewriteValueRISCV64_OpRsh16Ux16(v *Value) bool { + v_1 := v.Args[1] +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index ec89a45701..178ccfb59b 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -4776,22 +4776,22 @@ func InitTables() { + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1]) + }, +- sys.AMD64) ++ sys.AMD64, sys.RISCV64) + addF("math/bits", "RotateLeft16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1]) + }, +- sys.AMD64) ++ sys.AMD64, sys.RISCV64) + addF("math/bits", "RotateLeft32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1]) + }, +- sys.AMD64, sys.ARM, sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm, sys.Loong64) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) + addF("math/bits", "RotateLeft64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1]) + }, +- sys.AMD64, sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm, 
sys.Loong64) ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) + alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...) + + makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +diff --git a/test/codegen/rotate.go b/test/codegen/rotate.go +index 5495f86b79..109e55763c 100644 +--- a/test/codegen/rotate.go ++++ b/test/codegen/rotate.go +@@ -18,6 +18,7 @@ func rot64(x uint64) uint64 { + // amd64:"ROLQ\t[$]7" + // ppc64x:"ROTL\t[$]7" + // loong64: "ROTRV\t[$]57" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<7 | x>>57 + + // amd64:"ROLQ\t[$]8" +@@ -25,6 +26,7 @@ func rot64(x uint64) uint64 { + // s390x:"RISBGZ\t[$]0, [$]63, [$]8, " + // ppc64x:"ROTL\t[$]8" + // loong64: "ROTRV\t[$]56" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<8 + x>>56 + + // amd64:"ROLQ\t[$]9" +@@ -32,6 +34,7 @@ func rot64(x uint64) uint64 { + // s390x:"RISBGZ\t[$]0, [$]63, [$]9, " + // ppc64x:"ROTL\t[$]9" + // loong64: "ROTRV\t[$]55" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<9 ^ x>>55 + + // amd64:"ROLQ\t[$]10" +@@ -41,6 +44,7 @@ func rot64(x uint64) uint64 { + // arm64:"ROR\t[$]54" + // s390x:"RISBGZ\t[$]0, [$]63, [$]10, " + // loong64: "ROTRV\t[$]54" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += bits.RotateLeft64(x, 10) + + return a +@@ -53,6 +57,7 @@ func rot32(x uint32) uint32 { + // arm:"MOVW\tR\\d+@>25" + // ppc64x:"ROTLW\t[$]7" + // loong64: "ROTR\t[$]25" ++ // riscv64: "OR","SLLIW","SRLIW",-"AND" + a += x<<7 | x>>25 + + // amd64:`ROLL\t[$]8` +@@ -61,6 +66,7 @@ func rot32(x uint32) uint32 { + // s390x:"RLL\t[$]8" + // ppc64x:"ROTLW\t[$]8" + // loong64: "ROTR\t[$]24" ++ // riscv64: "OR","SLLIW","SRLIW",-"AND" + a += x<<8 + x>>24 + + // amd64:"ROLL\t[$]9" +@@ -69,6 +75,7 @@ func rot32(x uint32) uint32 { + // s390x:"RLL\t[$]9" + // ppc64x:"ROTLW\t[$]9" + // loong64: "ROTR\t[$]23" ++ // riscv64: "OR","SLLIW","SRLIW",-"AND" + a += x<<9 ^ x>>23 + + // amd64:"ROLL\t[$]10" +@@ -79,6 +86,7 @@ func rot32(x uint32) uint32 { + // arm64:"RORW\t[$]22" + // s390x:"RLL\t[$]10" + // loong64: "ROTR\t[$]22" ++ // riscv64: "OR","SLLIW","SRLIW",-"AND" + a += bits.RotateLeft32(x, 10) + + return a +@@ -88,12 +96,15 @@ func rot16(x uint16) uint16 { + var a uint16 + + // amd64:"ROLW\t[$]7" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<7 | x>>9 + + // amd64:`ROLW\t[$]8` ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<8 + x>>8 + + // amd64:"ROLW\t[$]9" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<9 ^ x>>7 + + return a +@@ -103,12 +114,15 @@ func rot8(x uint8) uint8 { + var a uint8 + + // amd64:"ROLB\t[$]5" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<5 | x>>3 + + // amd64:`ROLB\t[$]6` ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<6 + x>>2 + + // amd64:"ROLB\t[$]7" ++ // riscv64: "OR","SLLI","SRLI",-"AND" + a += x<<7 ^ x>>1 + + return a +@@ -127,12 +141,14 @@ func rot64nc(x uint64, z uint) uint64 { + // arm64:"ROR","NEG",-"AND" + // ppc64x:"ROTL",-"NEG",-"AND" + // loong64: "ROTRV", -"AND" ++ // riscv64: "OR","SLL","SRL",-"AND" + a += x<>(64-z) + + // amd64:"RORQ",-"AND" + // arm64:"ROR",-"NEG",-"AND" + // ppc64x:"ROTL","NEG",-"AND" + // loong64: "ROTRV", -"AND" ++ // riscv64: "OR","SLL","SRL",-"AND" + a += x>>z | x<<(64-z) + + return a +@@ -147,12 +163,14 @@ func rot32nc(x uint32, z uint) uint32 { + // arm64:"ROR","NEG",-"AND" + // ppc64x:"ROTLW",-"NEG",-"AND" + // loong64: "ROTR", -"AND" ++ // riscv64: "OR","SLLW","SRLW",-"AND" + a += x<>(32-z) + + // amd64:"RORL",-"AND" + // arm64:"ROR",-"NEG",-"AND" 
+ // ppc64x:"ROTLW","NEG",-"AND" + // loong64: "ROTR", -"AND" ++ // riscv64: "OR","SLLW","SRLW",-"AND" + a += x>>z | x<<(32-z) + + return a +@@ -164,9 +182,11 @@ func rot16nc(x uint16, z uint) uint16 { + z &= 15 + + // amd64:"ROLW",-"ANDQ" ++ // riscv64: "OR","SLL","SRL",-"AND\t" + a += x<>(16-z) + + // amd64:"RORW",-"ANDQ" ++ // riscv64: "OR","SLL","SRL",-"AND\t" + a += x>>z | x<<(16-z) + + return a +@@ -178,9 +198,11 @@ func rot8nc(x uint8, z uint) uint8 { + z &= 7 + + // amd64:"ROLB",-"ANDQ" ++ // riscv64: "OR","SLL","SRL",-"AND\t" + a += x<>(8-z) + + // amd64:"RORB",-"ANDQ" ++ // riscv64: "OR","SLL","SRL",-"AND\t" + a += x>>z | x<<(8-z) + + return a +-- +2.39.5 + diff --git a/2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch b/2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch new file mode 100644 index 0000000..15583f0 --- /dev/null +++ b/2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch @@ -0,0 +1,308 @@ +From 7788f5dcc7c9046892c55c74e751b0409b4631d7 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 029/119] cmd/asm, cmd/internal/obj: enable rounding mode + suffix for riscv64 + +This CL adds rounding modes for riscv64 floating point conversion +instructions by suffix with 5 modes: RNE, RTZ, RDN, RUP and RMM. + +For example, for round to nearest (RNE), we can use `FCVTLD.RNE` +According to RISCV manual 8.7 and 9.5, we changed these +conversion instructions: + +FCVTWS +FCVTLS +FCVTWUS +FCVTLUS +FCVTWD +FCVTLD +FCVTWUD +FCVTLUD + +Note: Round towards zero (RTZ) by default for all these instructions above. + +Change-Id: I491e522e14d721e24aa7f528ee0c4640c54c5808 +Reviewed-on: https://go-review.googlesource.com/c/go/+/504736 +Reviewed-by: Joel Sing +Run-TryBot: M Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +TryBot-Result: Gopher Robot +Reviewed-by: Than McIntosh +--- + src/cmd/asm/internal/asm/asm.go | 7 ++- + src/cmd/asm/internal/asm/parse.go | 4 +- + src/cmd/asm/internal/asm/testdata/riscv64.s | 40 ++++++++++++++++ + src/cmd/internal/obj/link.go | 2 +- + src/cmd/internal/obj/riscv/cpu.go | 51 ++++++++++++++++++++- + src/cmd/internal/obj/riscv/list.go | 16 +++++++ + src/cmd/internal/obj/riscv/obj.go | 17 ++++++- + 7 files changed, 130 insertions(+), 7 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/asm.go b/src/cmd/asm/internal/asm/asm.go +index 563e794706..223c613bd9 100644 +--- a/src/cmd/asm/internal/asm/asm.go ++++ b/src/cmd/asm/internal/asm/asm.go +@@ -16,6 +16,7 @@ import ( + "cmd/asm/internal/lex" + "cmd/internal/obj" + "cmd/internal/obj/ppc64" ++ "cmd/internal/obj/riscv" + "cmd/internal/obj/x86" + "cmd/internal/sys" + ) +@@ -46,7 +47,11 @@ func (p *Parser) append(prog *obj.Prog, cond string, doLabel bool) { + p.errorf("%v", err) + return + } +- ++ case sys.RISCV64: ++ if err := riscv.ParseSuffix(prog, cond); err != nil { ++ p.errorf("unrecognized suffix .%q", cond) ++ return ++ } + default: + p.errorf("unrecognized suffix .%q", cond) + return +diff --git a/src/cmd/asm/internal/asm/parse.go b/src/cmd/asm/internal/asm/parse.go +index 37f8e6c0bc..ecee98593d 100644 +--- a/src/cmd/asm/internal/asm/parse.go ++++ b/src/cmd/asm/internal/asm/parse.go +@@ -209,8 +209,8 @@ next: + for { + tok = p.nextToken() + if len(operands) == 0 && len(items) == 0 { +- if p.arch.InFamily(sys.ARM, sys.ARM64, sys.AMD64, sys.I386) && tok == '.' { +- // Suffixes: ARM conditionals or x86 modifiers. ++ if p.arch.InFamily(sys.ARM, sys.ARM64, sys.AMD64, sys.I386, sys.RISCV64) && tok == '.' 
{ ++ // Suffixes: ARM conditionals, RISCV rounding mode or x86 modifiers. + tok = p.nextToken() + str := p.lex.Text() + if tok != scanner.Ident { +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 11a9e30080..837351508f 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -233,11 +233,31 @@ start: + + // 11.7: Single-Precision Floating-Point Conversion and Move Instructions + FCVTWS F0, X5 // d31200c0 ++ FCVTWS.RNE F0, X5 // d30200c0 ++ FCVTWS.RTZ F0, X5 // d31200c0 ++ FCVTWS.RDN F0, X5 // d32200c0 ++ FCVTWS.RUP F0, X5 // d33200c0 ++ FCVTWS.RMM F0, X5 // d34200c0 + FCVTLS F0, X5 // d31220c0 ++ FCVTLS.RNE F0, X5 // d30220c0 ++ FCVTLS.RTZ F0, X5 // d31220c0 ++ FCVTLS.RDN F0, X5 // d32220c0 ++ FCVTLS.RUP F0, X5 // d33220c0 ++ FCVTLS.RMM F0, X5 // d34220c0 + FCVTSW X5, F0 // 538002d0 + FCVTSL X5, F0 // 538022d0 + FCVTWUS F0, X5 // d31210c0 ++ FCVTWUS.RNE F0, X5 // d30210c0 ++ FCVTWUS.RTZ F0, X5 // d31210c0 ++ FCVTWUS.RDN F0, X5 // d32210c0 ++ FCVTWUS.RUP F0, X5 // d33210c0 ++ FCVTWUS.RMM F0, X5 // d34210c0 + FCVTLUS F0, X5 // d31230c0 ++ FCVTLUS.RNE F0, X5 // d30230c0 ++ FCVTLUS.RTZ F0, X5 // d31230c0 ++ FCVTLUS.RDN F0, X5 // d32230c0 ++ FCVTLUS.RUP F0, X5 // d33230c0 ++ FCVTLUS.RMM F0, X5 // d34230c0 + FCVTSWU X5, F0 // 538012d0 + FCVTSLU X5, F0 // 538032d0 + FSGNJS F1, F0, F2 // 53011020 +@@ -277,11 +297,31 @@ start: + + // 12.5: Double-Precision Floating-Point Conversion and Move Instructions + FCVTWD F0, X5 // d31200c2 ++ FCVTWD.RNE F0, X5 // d30200c2 ++ FCVTWD.RTZ F0, X5 // d31200c2 ++ FCVTWD.RDN F0, X5 // d32200c2 ++ FCVTWD.RUP F0, X5 // d33200c2 ++ FCVTWD.RMM F0, X5 // d34200c2 + FCVTLD F0, X5 // d31220c2 ++ FCVTLD.RNE F0, X5 // d30220c2 ++ FCVTLD.RTZ F0, X5 // d31220c2 ++ FCVTLD.RDN F0, X5 // d32220c2 ++ FCVTLD.RUP F0, X5 // d33220c2 ++ FCVTLD.RMM F0, X5 // d34220c2 + FCVTDW X5, F0 // 538002d2 + FCVTDL X5, F0 // 538022d2 + FCVTWUD F0, X5 // d31210c2 ++ FCVTWUD.RNE F0, X5 // d30210c2 ++ FCVTWUD.RTZ F0, X5 // d31210c2 ++ FCVTWUD.RDN F0, X5 // d32210c2 ++ FCVTWUD.RUP F0, X5 // d33210c2 ++ FCVTWUD.RMM F0, X5 // d34210c2 + FCVTLUD F0, X5 // d31230c2 ++ FCVTLUD.RNE F0, X5 // d30230c2 ++ FCVTLUD.RTZ F0, X5 // d31230c2 ++ FCVTLUD.RDN F0, X5 // d32230c2 ++ FCVTLUD.RUP F0, X5 // d33230c2 ++ FCVTLUD.RMM F0, X5 // d34230c2 + FCVTDWU X5, F0 // 538012d2 + FCVTDLU X5, F0 // 538032d2 + FCVTSD F0, F1 // d3001040 +diff --git a/src/cmd/internal/obj/link.go b/src/cmd/internal/obj/link.go +index f13f9b4c70..b12bf2399a 100644 +--- a/src/cmd/internal/obj/link.go ++++ b/src/cmd/internal/obj/link.go +@@ -314,7 +314,7 @@ type Prog struct { + RegTo2 int16 // 2nd destination operand + Mark uint16 // bitmask of arch-specific items + Optab uint16 // arch-specific opcode index +- Scond uint8 // bits that describe instruction suffixes (e.g. ARM conditions) ++ Scond uint8 // bits that describe instruction suffixes (e.g. 
ARM conditions, RISCV Rounding Mode) + Back uint8 // for x86 back end: backwards branch state + Ft uint8 // for x86 back end: type index of Prog.From + Tt uint8 // for x86 back end: type index of Prog.To +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index bfd5153da4..0f63a616f7 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -28,7 +28,12 @@ + + package riscv + +-import "cmd/internal/obj" ++import ( ++ "errors" ++ "fmt" ++ ++ "cmd/internal/obj" ++) + + //go:generate go run ../stringer.go -i $GOFILE -o anames.go -p riscv + +@@ -602,6 +607,50 @@ const ( + ALAST + ) + ++// opSuffix encoding to uint8 which fit into p.Scond ++var rmSuffixSet = map[string]uint8{ ++ "RNE": RM_RNE, ++ "RTZ": RM_RTZ, ++ "RDN": RM_RDN, ++ "RUP": RM_RUP, ++ "RMM": RM_RMM, ++} ++ ++const rmSuffixBit uint8 = 1 << 7 ++ ++func rmSuffixEncode(s string) (uint8, error) { ++ if s == "" { ++ return 0, errors.New("empty suffix") ++ } ++ enc, ok := rmSuffixSet[s] ++ if !ok { ++ return 0, fmt.Errorf("invalid encoding for unknown suffix:%q", s) ++ } ++ return enc | rmSuffixBit, nil ++} ++ ++func rmSuffixString(u uint8) (string, error) { ++ if u&rmSuffixBit == 0 { ++ return "", fmt.Errorf("invalid suffix, require round mode bit:%x", u) ++ } ++ ++ u &^= rmSuffixBit ++ for k, v := range rmSuffixSet { ++ if v == u { ++ return k, nil ++ } ++ } ++ return "", fmt.Errorf("unknown suffix:%x", u) ++} ++ ++const ( ++ RM_RNE uint8 = iota // Round to Nearest, ties to Even ++ RM_RTZ // Round towards Zero ++ RM_RDN // Round Down ++ RM_RUP // Round Up ++ RM_RMM // Round to Nearest, ties to Max Magnitude ++) ++ + // All unary instructions which write to their arguments (as opposed to reading + // from them) go here. The assembly parser uses this information to populate + // its AST in a semantically reasonable way. +diff --git a/src/cmd/internal/obj/riscv/list.go b/src/cmd/internal/obj/riscv/list.go +index de90961e32..bc87539f27 100644 +--- a/src/cmd/internal/obj/riscv/list.go ++++ b/src/cmd/internal/obj/riscv/list.go +@@ -13,6 +13,7 @@ import ( + func init() { + obj.RegisterRegister(obj.RBaseRISCV, REG_END, RegName) + obj.RegisterOpcode(obj.ABaseRISCV, Anames) ++ obj.RegisterOpSuffix("riscv64", opSuffixString) + } + + func RegName(r int) string { +@@ -31,3 +32,18 @@ func RegName(r int) string { + return fmt.Sprintf("Rgok(%d)", r-obj.RBaseRISCV) + } + } ++ ++func opSuffixString(s uint8) string { ++ if s&rmSuffixBit == 0 { ++ return "" ++ } ++ ++ ss, err := rmSuffixString(s) ++ if err != nil { ++ ss = fmt.Sprintf("", s) ++ } ++ if ss == "" { ++ return ss ++ } ++ return fmt.Sprintf(".%s", ss) ++} +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 02d08fec76..8020624c70 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -28,6 +28,7 @@ import ( + "internal/abi" + "log" + "math/bits" ++ "strings" + ) + + func buildop(ctxt *obj.Link) {} +@@ -2272,8 +2273,12 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.imm = 0x0ff + + case AFCVTWS, AFCVTLS, AFCVTWUS, AFCVTLUS, AFCVTWD, AFCVTLD, AFCVTWUD, AFCVTLUD: +- // Set the rounding mode in funct3 to round to zero. +- ins.funct3 = 1 ++ // Set the default rounding mode in funct3 to round to zero. ++ if p.Scond&rmSuffixBit == 0 { ++ ins.funct3 = uint32(RM_RTZ) ++ } else { ++ ins.funct3 = uint32(p.Scond &^ rmSuffixBit) ++ } + + case AFNES, AFNED: + // Replace FNE[SD] with FEQ[SD] and NOT. 
+@@ -2474,6 +2479,14 @@ func isUnsafePoint(p *obj.Prog) bool { + return p.Mark&USES_REG_TMP == USES_REG_TMP || p.From.Reg == REG_TMP || p.To.Reg == REG_TMP || p.Reg == REG_TMP + } + ++func ParseSuffix(prog *obj.Prog, cond string) (err error) { ++ switch prog.As { ++ case AFCVTWS, AFCVTLS, AFCVTWUS, AFCVTLUS, AFCVTWD, AFCVTLD, AFCVTWUD, AFCVTLUD: ++ prog.Scond, err = rmSuffixEncode(strings.TrimPrefix(cond, ".")) ++ } ++ return ++} ++ + var LinkRISCV64 = obj.LinkArch{ + Arch: sys.ArchRISCV64, + Init: buildop, +-- +2.39.5 + diff --git a/2030-math-add-round-assembly-implementations-on-riscv64.patch b/2030-math-add-round-assembly-implementations-on-riscv64.patch new file mode 100644 index 0000000..49c0dc2 --- /dev/null +++ b/2030-math-add-round-assembly-implementations-on-riscv64.patch @@ -0,0 +1,113 @@ +From b72a91d17ed5ef4149ad33f7c5ad4e80ba7a861e Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 030/119] math: add round assembly implementations on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: riscv64 +pkg: math + │ floor_old.bench │ floor_new.bench │ + │ sec/op │ sec/op vs base │ +Ceil 54.12n ± 0% 22.05n ± 0% -59.26% (p=0.000 n=10) +Floor 40.80n ± 0% 22.05n ± 0% -45.96% (p=0.000 n=10) +Round 20.73n ± 0% 20.74n ± 0% ~ (p=0.441 n=10) +RoundToEven 24.07n ± 0% 24.07n ± 0% ~ (p=1.000 n=10) +Trunc 38.73n ± 0% 22.05n ± 0% -43.07% (p=0.000 n=10) +geomean 33.58n 22.17n -33.98% + +Change-Id: I24fb9e3bbf8146da253b6791b21377bea1afbd16 +Reviewed-on: https://go-review.googlesource.com/c/go/+/504737 +TryBot-Result: Gopher Robot +Reviewed-by: Russ Cox +Reviewed-by: M Zhuo +Reviewed-by: Cherry Mui +Run-TryBot: M Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Joel Sing +--- + src/math/floor_asm.go | 2 +- + src/math/floor_noasm.go | 2 +- + src/math/floor_riscv64.s | 41 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 43 insertions(+), 2 deletions(-) + create mode 100644 src/math/floor_riscv64.s + +diff --git a/src/math/floor_asm.go b/src/math/floor_asm.go +index fb419d6da2..5cb45f5a7e 100644 +--- a/src/math/floor_asm.go ++++ b/src/math/floor_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || s390x || wasm ++//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || riscv64 || s390x || wasm + + package math + +diff --git a/src/math/floor_noasm.go b/src/math/floor_noasm.go +index 5641c7ea0a..6754ca8fc8 100644 +--- a/src/math/floor_noasm.go ++++ b/src/math/floor_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !s390x && !wasm ++//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !riscv64 && !s390x && !wasm + + package math + +diff --git a/src/math/floor_riscv64.s b/src/math/floor_riscv64.s +new file mode 100644 +index 0000000000..62ce963781 +--- /dev/null ++++ b/src/math/floor_riscv64.s +@@ -0,0 +1,41 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "textflag.h" ++ ++#define PosInf 0x7FF0000000000000 ++ ++// The rounding mode of RISC-V is different from Go spec. 
++ ++#define ROUNDFN(NAME, MODE) \ ++TEXT NAME(SB),NOSPLIT,$0; \ ++ MOVD x+0(FP), F0; \ ++ /* whether x is NaN */; \ ++ FEQD F0, F0, X6; \ ++ BNEZ X6, 3(PC); \ ++ /* return NaN if x is NaN */; \ ++ MOVD F0, ret+8(FP); \ ++ RET; \ ++ MOV $PosInf, X6; \ ++ FMVDX X6, F1; \ ++ FABSD F0, F2; \ ++ /* if abs(x) > +Inf, return Inf instead of round(x) */; \ ++ FLTD F1, F2, X6; \ ++ /* Inf should keep same signed with x then return */; \ ++ BEQZ X6, 3(PC); \ ++ FCVTLD.MODE F0, X6; \ ++ FCVTDL X6, F1; \ ++ /* rounding will drop signed bit in RISCV, restore it */; \ ++ FSGNJD F0, F1, F0; \ ++ MOVD F0, ret+8(FP); \ ++ RET ++ ++// func archFloor(x float64) float64 ++ROUNDFN(·archFloor, RDN) ++ ++// func archCeil(x float64) float64 ++ROUNDFN(·archCeil, RUP) ++ ++// func archTrunc(x float64) float64 ++ROUNDFN(·archTrunc, RTZ) +-- +2.39.5 + diff --git a/2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch b/2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch new file mode 100644 index 0000000..32f3aba --- /dev/null +++ b/2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch @@ -0,0 +1,47 @@ +From f5bd8318d01641e10dfcd98d22e07030d2189beb Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 031/119] cmd/link/internal/riscv64: generate local text + symbols for R_RISCV_CALL + +Correctly generate local text symbols needed for R_RISCV_CALL when +external linking. R_RISCV_CALL was added in CL #520095 as a way of +marking AUIPC+JALR pairs, instead of overloading R_RISCV_PCREL_ITYPE. +However, genSymsLate was not updated to generate local text symbols +for the new relocation type, leading to HI20 symbol lookup failures. + +This issue is detected by cmd/internal/obj/riscv.TestLargeCall, +however this is unfortunately skipped in short mode. 
+ +Fixes #65646 + +Change-Id: I8ee0f13791e0628f31657bf7dae2be8482b689b5 +Reviewed-on: https://go-review.googlesource.com/c/go/+/567375 +Reviewed-by: Mauri de Souza Meneguzzo +Reviewed-by: Carlos Amedee +Run-TryBot: Joel Sing +Reviewed-by: Cherry Mui +TryBot-Result: Gopher Robot +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/link/internal/riscv64/asm.go | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/link/internal/riscv64/asm.go b/src/cmd/link/internal/riscv64/asm.go +index f3186398eb..55fb98199e 100644 +--- a/src/cmd/link/internal/riscv64/asm.go ++++ b/src/cmd/link/internal/riscv64/asm.go +@@ -38,8 +38,8 @@ func genSymsLate(ctxt *ld.Link, ldr *loader.Loader) { + relocs := ldr.Relocs(s) + for ri := 0; ri < relocs.Count(); ri++ { + r := relocs.At(ri) +- if r.Type() != objabi.R_RISCV_PCREL_ITYPE && r.Type() != objabi.R_RISCV_PCREL_STYPE && +- r.Type() != objabi.R_RISCV_TLS_IE { ++ if r.Type() != objabi.R_RISCV_CALL && r.Type() != objabi.R_RISCV_PCREL_ITYPE && ++ r.Type() != objabi.R_RISCV_PCREL_STYPE && r.Type() != objabi.R_RISCV_TLS_IE { + continue + } + if r.Off() == 0 && ldr.SymType(s) == sym.STEXT { +-- +2.39.5 + diff --git a/2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch b/2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch new file mode 100644 index 0000000..dc8b2d3 --- /dev/null +++ b/2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch @@ -0,0 +1,864 @@ +From 3d3ebd4e882f711158e197cd7e95033e014b2216 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 032/119] cmd/compile,cmd/internal/obj: provide rotation + pseudo-instructions for riscv64 + +Provide and use rotation pseudo-instructions for riscv64. The RISC-V bitmanip +extension adds support for hardware rotation instructions in the form of ROL, +ROLW, ROR, RORI, RORIW and RORW. These are easily implemented in the assembler +as pseudo-instructions for CPUs that do not support the bitmanip extension. + +This approach provides a number of advantages, including reducing the rewrite +rules needed in the compiler, simplifying codegen tests and most importantly, +allowing these instructions to be used in assembly (for example, riscv64 +optimised versions of SHA-256 and SHA-512). When bitmanip support is added, +these instruction sequences can simply be replaced with a single instruction +if permitted by the GORISCV64 profile. 
+ +Change-Id: Ia23402e1a82f211ac760690deb063386056ae1fa +Reviewed-on: https://go-review.googlesource.com/c/go/+/565015 +TryBot-Result: Gopher Robot +Reviewed-by: Michael Knyszek +Reviewed-by: M Zhuo +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Run-TryBot: Joel Sing +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 18 ++ + src/cmd/compile/internal/riscv64/ssa.go | 4 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 15 +- + .../compile/internal/ssa/_gen/RISCV64Ops.go | 14 +- + src/cmd/compile/internal/ssa/opGen.go | 124 ++++++++++-- + .../compile/internal/ssa/rewriteRISCV64.go | 185 +++++++++++++----- + src/cmd/internal/obj/riscv/anames.go | 6 + + src/cmd/internal/obj/riscv/cpu.go | 6 + + src/cmd/internal/obj/riscv/obj.go | 51 ++++- + src/crypto/sha512/sha512block_riscv64.s | 25 +-- + test/codegen/rotate.go | 24 +-- + 11 files changed, 376 insertions(+), 96 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 837351508f..ed691f4d9e 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -417,6 +417,24 @@ start: + NEGW X5 // bb025040 + NEGW X5, X6 // 3b035040 + ++ // Bitwise rotation pseudo-instructions ++ ROL X5, X6, X7 // b30f5040b35ff301b3135300b3e37f00 ++ ROL X5, X6 // b30f5040b35ff3013313530033e36f00 ++ ROLW X5, X6, X7 // b30f5040bb5ff301bb135300b3e37f00 ++ ROLW X5, X6 // b30f5040bb5ff3013b13530033e36f00 ++ ROR X5, X6, X7 // b30f5040b31ff301b3535300b3e37f00 ++ ROR X5, X6 // b30f5040b31ff3013353530033e36f00 ++ RORW X5, X6, X7 // b30f5040bb1ff301bb535300b3e37f00 ++ RORW X5, X6 // b30f5040bb1ff3013b53530033e36f00 ++ RORI $5, X6, X7 // 935f53009313b303b3e37f00 ++ RORI $5, X6 // 935f53001313b30333e36f00 ++ RORIW $5, X6, X7 // 9b5f53009b13b301b3e37f00 ++ RORIW $5, X6 // 9b5f53001b13b30133e36f00 ++ ROR $5, X6, X7 // 935f53009313b303b3e37f00 ++ ROR $5, X6 // 935f53001313b30333e36f00 ++ RORW $5, X6, X7 // 9b5f53009b13b301b3e37f00 ++ RORW $5, X6 // 9b5f53001b13b30133e36f00 ++ + // This jumps to the second instruction in the function (the + // first instruction is an invisible stack pointer adjustment). 
+ JMP start // JMP 2 +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 17f0d98532..c9e75b2180 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -283,6 +283,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, + ssa.OpRISCV64DIVUW, ssa.OpRISCV64REM, ssa.OpRISCV64REMU, ssa.OpRISCV64REMW, + ssa.OpRISCV64REMUW, ++ ssa.OpRISCV64ROL, ssa.OpRISCV64ROLW, ssa.OpRISCV64ROR, ssa.OpRISCV64RORW, + ssa.OpRISCV64FADDS, ssa.OpRISCV64FSUBS, ssa.OpRISCV64FMULS, ssa.OpRISCV64FDIVS, + ssa.OpRISCV64FEQS, ssa.OpRISCV64FNES, ssa.OpRISCV64FLTS, ssa.OpRISCV64FLES, + ssa.OpRISCV64FADDD, ssa.OpRISCV64FSUBD, ssa.OpRISCV64FMULD, ssa.OpRISCV64FDIVD, +@@ -423,7 +424,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Reg = v.Reg() + case ssa.OpRISCV64ADDI, ssa.OpRISCV64ADDIW, ssa.OpRISCV64XORI, ssa.OpRISCV64ORI, ssa.OpRISCV64ANDI, + ssa.OpRISCV64SLLI, ssa.OpRISCV64SLLIW, ssa.OpRISCV64SRAI, ssa.OpRISCV64SRAIW, +- ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, ssa.OpRISCV64SLTIU: ++ ssa.OpRISCV64SRLI, ssa.OpRISCV64SRLIW, ssa.OpRISCV64SLTI, ssa.OpRISCV64SLTIU, ++ ssa.OpRISCV64RORI, ssa.OpRISCV64RORIW: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 135d70bc47..c2df433315 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -216,8 +216,8 @@ + // Rotates. + (RotateLeft8 x y) => (OR (SLL x (ANDI [7] y)) (SRL (ZeroExt8to64 x) (ANDI [7] (NEG y)))) + (RotateLeft16 x y) => (OR (SLL x (ANDI [15] y)) (SRL (ZeroExt16to64 x) (ANDI [15] (NEG y)))) +-(RotateLeft32 x y) => (OR (SLLW x y) (SRLW x (NEG y))) +-(RotateLeft64 x y) => (OR (SLL x y) (SRL x (NEG y))) ++(RotateLeft32 ...) => (ROLW ...) ++(RotateLeft64 ...) => (ROL ...) + + (Less64 ...) => (SLT ...) + (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) +@@ -665,6 +665,9 @@ + (MOVWreg x:(DIVUW _ _)) => (MOVDreg x) + (MOVWreg x:(REMW _ _)) => (MOVDreg x) + (MOVWreg x:(REMUW _ _)) => (MOVDreg x) ++(MOVWreg x:(ROLW _ _)) => (MOVDreg x) ++(MOVWreg x:(RORW _ _)) => (MOVDreg x) ++(MOVWreg x:(RORIW _)) => (MOVDreg x) + + // Fold double extensions. + (MOVBreg x:(MOVBreg _)) => (MOVDreg x) +@@ -731,6 +734,10 @@ + (AND (MOVDconst [val]) x) && is32Bit(val) => (ANDI [val] x) + (OR (MOVDconst [val]) x) && is32Bit(val) => (ORI [val] x) + (XOR (MOVDconst [val]) x) && is32Bit(val) => (XORI [val] x) ++(ROL x (MOVDconst [val])) => (RORI [int64(int8(-val)&63)] x) ++(ROLW x (MOVDconst [val])) => (RORIW [int64(int8(-val)&31)] x) ++(ROR x (MOVDconst [val])) => (RORI [int64(val&63)] x) ++(RORW x (MOVDconst [val])) => (RORIW [int64(val&31)] x) + (SLL x (MOVDconst [val])) => (SLLI [int64(val&63)] x) + (SRL x (MOVDconst [val])) => (SRLI [int64(val&63)] x) + (SLLW x (MOVDconst [val])) => (SLLIW [int64(val&31)] x) +@@ -740,6 +747,10 @@ + (SLT x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTI [val] x) + (SLTU x (MOVDconst [val])) && val >= -2048 && val <= 2047 => (SLTIU [val] x) + ++// Replace negated left rotation with right rotation. ++(ROL x (NEG y)) => (ROR x y) ++(ROLW x (NEG y)) => (RORW x y) ++ + // Convert const subtraction into ADDI with negative immediate, where possible. 
+ (SUB x (MOVDconst [val])) && is32Bit(-val) => (ADDI [-val] x) + (SUB (MOVDconst [val]) y) && is32Bit(-val) => (NEG (ADDI [-val] y)) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index e9f1df0d58..13fa91864b 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -221,13 +221,19 @@ func init() { + {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, shift amount 0-31, logical right shift of 32 bit value, sign extended to 64 bits + + // Bitwise ops +- {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 +- {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint +- {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 +- {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint + {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 ++ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 ++ {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint ++ {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63) ++ {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended ++ {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63) ++ {name: "RORI", argLength: 1, reg: gp11, asm: "RORI", aux: "Int64"}, // rotate right arg0 by auxint, shift amount 0-63 ++ {name: "RORIW", argLength: 1, reg: gp11, asm: "RORIW", aux: "Int64"}, // rotate right least significant word of arg0 by auxint, shift amount 0-31, sign extended ++ {name: "RORW", argLength: 2, reg: gp21, asm: "RORW"}, // rotate right least significant word of arg0 by (arg1 & 31), sign extended ++ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 ++ {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint + + // Generate boolean values + {name: "SEQZ", argLength: 1, reg: gp11, asm: "SEQZ"}, // arg0 == 0, result is 0 or 1 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 29ca9f5c0f..dd80a2c52a 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2381,13 +2381,19 @@ const ( + OpRISCV64SRAIW + OpRISCV64SRLI + OpRISCV64SRLIW +- OpRISCV64XOR +- OpRISCV64XORI +- OpRISCV64OR +- OpRISCV64ORI + OpRISCV64AND + OpRISCV64ANDI + OpRISCV64NOT ++ OpRISCV64OR ++ OpRISCV64ORI ++ OpRISCV64ROL ++ OpRISCV64ROLW ++ OpRISCV64ROR ++ OpRISCV64RORI ++ OpRISCV64RORIW ++ OpRISCV64RORW ++ OpRISCV64XOR ++ OpRISCV64XORI + OpRISCV64SEQZ + OpRISCV64SNEZ + OpRISCV64SLT +@@ -31935,10 +31941,10 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "XOR", ++ name: "AND", + argLen: 2, + commutative: true, +- asm: riscv.AXOR, ++ asm: riscv.AAND, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +@@ -31950,10 +31956,23 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "XORI", ++ name: "ANDI", + auxType: auxInt64, + argLen: 1, +- asm: riscv.AXORI, ++ asm: riscv.AANDI, ++ 
reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "NOT", ++ argLen: 1, ++ asm: riscv.ANOT, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +@@ -31993,10 +32012,9 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "AND", +- argLen: 2, +- commutative: true, +- asm: riscv.AAND, ++ name: "ROL", ++ argLen: 2, ++ asm: riscv.AROL, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +@@ -32008,10 +32026,38 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "ANDI", ++ name: "ROLW", ++ argLen: 2, ++ asm: riscv.AROLW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "ROR", ++ argLen: 2, ++ asm: riscv.AROR, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "RORI", + auxType: auxInt64, + argLen: 1, +- asm: riscv.AANDI, ++ asm: riscv.ARORI, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +@@ -32022,9 +32068,53 @@ var opcodeTable = [...]opInfo{ + }, + }, + { +- name: "NOT", +- argLen: 1, +- asm: riscv.ANOT, ++ name: "RORIW", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.ARORIW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "RORW", ++ argLen: 2, ++ asm: riscv.ARORW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "XOR", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AXOR, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 
X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "XORI", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: riscv.AXORI, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index ca0e108915..28c44da5a8 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -532,6 +532,14 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64OR(v) + case OpRISCV64ORI: + return rewriteValueRISCV64_OpRISCV64ORI(v) ++ case OpRISCV64ROL: ++ return rewriteValueRISCV64_OpRISCV64ROL(v) ++ case OpRISCV64ROLW: ++ return rewriteValueRISCV64_OpRISCV64ROLW(v) ++ case OpRISCV64ROR: ++ return rewriteValueRISCV64_OpRISCV64ROR(v) ++ case OpRISCV64RORW: ++ return rewriteValueRISCV64_OpRISCV64RORW(v) + case OpRISCV64SEQZ: + return rewriteValueRISCV64_OpRISCV64SEQZ(v) + case OpRISCV64SLL: +@@ -571,9 +579,11 @@ func rewriteValueRISCV64(v *Value) bool { + case OpRotateLeft16: + return rewriteValueRISCV64_OpRotateLeft16(v) + case OpRotateLeft32: +- return rewriteValueRISCV64_OpRotateLeft32(v) ++ v.Op = OpRISCV64ROLW ++ return true + case OpRotateLeft64: +- return rewriteValueRISCV64_OpRotateLeft64(v) ++ v.Op = OpRISCV64ROL ++ return true + case OpRotateLeft8: + return rewriteValueRISCV64_OpRotateLeft8(v) + case OpRound32F: +@@ -5626,6 +5636,39 @@ func rewriteValueRISCV64_OpRISCV64MOVWreg(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (MOVWreg x:(ROLW _ _)) ++ // result: (MOVDreg x) ++ for { ++ x := v_0 ++ if x.Op != OpRISCV64ROLW { ++ break ++ } ++ v.reset(OpRISCV64MOVDreg) ++ v.AddArg(x) ++ return true ++ } ++ // match: (MOVWreg x:(RORW _ _)) ++ // result: (MOVDreg x) ++ for { ++ x := v_0 ++ if x.Op != OpRISCV64RORW { ++ break ++ } ++ v.reset(OpRISCV64MOVDreg) ++ v.AddArg(x) ++ return true ++ } ++ // match: (MOVWreg x:(RORIW _)) ++ // result: (MOVDreg x) ++ for { ++ x := v_0 ++ if x.Op != OpRISCV64RORIW { ++ break ++ } ++ v.reset(OpRISCV64MOVDreg) ++ v.AddArg(x) ++ return true ++ } + // match: (MOVWreg x:(MOVBreg _)) + // result: (MOVDreg x) + for { +@@ -5999,6 +6042,102 @@ func rewriteValueRISCV64_OpRISCV64ORI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64ROL(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (ROL x (MOVDconst [val])) ++ // result: (RORI [int64(int8(-val)&63)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64RORI) ++ v.AuxInt = int64ToAuxInt(int64(int8(-val) & 63)) ++ v.AddArg(x) ++ return true ++ } ++ // match: (ROL x (NEG y)) ++ // result: (ROR x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64NEG { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64ROR) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64ROLW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (ROLW x (MOVDconst [val])) ++ // result: (RORIW [int64(int8(-val)&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64RORIW) ++ v.AuxInt = int64ToAuxInt(int64(int8(-val) & 31)) ++ 
v.AddArg(x) ++ return true ++ } ++ // match: (ROLW x (NEG y)) ++ // result: (RORW x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64NEG { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64RORW) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64ROR(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (ROR x (MOVDconst [val])) ++ // result: (RORI [int64(val&63)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64RORI) ++ v.AuxInt = int64ToAuxInt(int64(val & 63)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64RORW(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (RORW x (MOVDconst [val])) ++ // result: (RORIW [int64(val&31)] x) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64MOVDconst { ++ break ++ } ++ val := auxIntToInt64(v_1.AuxInt) ++ v.reset(OpRISCV64RORIW) ++ v.AuxInt = int64ToAuxInt(int64(val & 31)) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64SEQZ(v *Value) bool { + v_0 := v.Args[0] + // match: (SEQZ (NEG x)) +@@ -6691,48 +6830,6 @@ func rewriteValueRISCV64_OpRotateLeft16(v *Value) bool { + return true + } + } +-func rewriteValueRISCV64_OpRotateLeft32(v *Value) bool { +- v_1 := v.Args[1] +- v_0 := v.Args[0] +- b := v.Block +- // match: (RotateLeft32 x y) +- // result: (OR (SLLW x y) (SRLW x (NEG y))) +- for { +- t := v.Type +- x := v_0 +- y := v_1 +- v.reset(OpRISCV64OR) +- v0 := b.NewValue0(v.Pos, OpRISCV64SLLW, t) +- v0.AddArg2(x, y) +- v1 := b.NewValue0(v.Pos, OpRISCV64SRLW, t) +- v2 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) +- v2.AddArg(y) +- v1.AddArg2(x, v2) +- v.AddArg2(v0, v1) +- return true +- } +-} +-func rewriteValueRISCV64_OpRotateLeft64(v *Value) bool { +- v_1 := v.Args[1] +- v_0 := v.Args[0] +- b := v.Block +- // match: (RotateLeft64 x y) +- // result: (OR (SLL x y) (SRL x (NEG y))) +- for { +- t := v.Type +- x := v_0 +- y := v_1 +- v.reset(OpRISCV64OR) +- v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t) +- v0.AddArg2(x, y) +- v1 := b.NewValue0(v.Pos, OpRISCV64SRL, t) +- v2 := b.NewValue0(v.Pos, OpRISCV64NEG, y.Type) +- v2.AddArg(y) +- v1.AddArg2(x, v2) +- v.AddArg2(v0, v1) +- return true +- } +-} + func rewriteValueRISCV64_OpRotateLeft8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index d2c41971b8..e547c6d5e9 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -246,6 +246,12 @@ var Anames = []string{ + "NEG", + "NEGW", + "NOT", ++ "ROL", ++ "ROLW", ++ "ROR", ++ "RORI", ++ "RORIW", ++ "RORW", + "SEQZ", + "SNEZ", + "LAST", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 0f63a616f7..24026561ee 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -600,6 +600,12 @@ const ( + ANEG + ANEGW + ANOT ++ AROL ++ AROLW ++ AROR ++ ARORI ++ ARORIW ++ ARORW + ASEQZ + ASNEZ + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 8020624c70..9b81768b85 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -59,7 +59,8 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + AADDIW, ASLLIW, ASRLIW, ASRAIW, AADDW, ASUBW, ASLLW, ASRLW, ASRAW, + AADD, AAND, AOR, AXOR, ASLL, ASRL, ASUB, ASRA, + AMUL, AMULH, AMULHU, AMULHSU, AMULW, 
ADIV, ADIVU, ADIVW, ADIVUW, +- AREM, AREMU, AREMW, AREMUW: ++ AREM, AREMU, AREMW, AREMUW, ++ AROL, AROLW, AROR, ARORW, ARORI, ARORIW: + p.Reg = p.To.Reg + } + } +@@ -90,6 +91,10 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.As = ASRAI + case AADDW: + p.As = AADDIW ++ case AROR: ++ p.As = ARORI ++ case ARORW: ++ p.As = ARORIW + case ASUBW: + p.As, p.From.Offset = AADDIW, -p.From.Offset + case ASLLW: +@@ -2192,6 +2197,47 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + return inss + } + ++// instructionsForRotate returns the machine instructions for a bitwise rotation. ++func instructionsForRotate(p *obj.Prog, ins *instruction) []*instruction { ++ switch ins.as { ++ case AROL, AROLW, AROR, ARORW: ++ // ROL -> OR (SLL x y) (SRL x (NEG y)) ++ // ROR -> OR (SRL x y) (SLL x (NEG y)) ++ sllOp, srlOp := ASLL, ASRL ++ if ins.as == AROLW || ins.as == ARORW { ++ sllOp, srlOp = ASLLW, ASRLW ++ } ++ shift1, shift2 := sllOp, srlOp ++ if ins.as == AROR || ins.as == ARORW { ++ shift1, shift2 = shift2, shift1 ++ } ++ return []*instruction{ ++ &instruction{as: ASUB, rs1: REG_ZERO, rs2: ins.rs2, rd: REG_TMP}, ++ &instruction{as: shift2, rs1: ins.rs1, rs2: REG_TMP, rd: REG_TMP}, ++ &instruction{as: shift1, rs1: ins.rs1, rs2: ins.rs2, rd: ins.rd}, ++ &instruction{as: AOR, rs1: REG_TMP, rs2: ins.rd, rd: ins.rd}, ++ } ++ ++ case ARORI, ARORIW: ++ // ROR -> OR (SLLI -x y) (SRLI x y) ++ sllOp, srlOp := ASLLI, ASRLI ++ sllImm := int64(int8(-ins.imm) & 63) ++ if ins.as == ARORIW { ++ sllOp, srlOp = ASLLIW, ASRLIW ++ sllImm = int64(int8(-ins.imm) & 31) ++ } ++ return []*instruction{ ++ &instruction{as: srlOp, rs1: ins.rs1, rd: REG_TMP, imm: ins.imm}, ++ &instruction{as: sllOp, rs1: ins.rs1, rd: ins.rd, imm: sllImm}, ++ &instruction{as: AOR, rs1: REG_TMP, rs2: ins.rd, rd: ins.rd}, ++ } ++ ++ default: ++ p.Ctxt.Diag("%v: unknown rotation", p) ++ return nil ++ } ++} ++ + // instructionsForProg returns the machine instructions for an *obj.Prog. 
+ func instructionsForProg(p *obj.Prog) []*instruction { + ins := instructionForProg(p) +@@ -2362,6 +2408,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AFSGNJND + ins.rs1 = uint32(p.From.Reg) + ++ case AROL, AROLW, AROR, ARORW, ARORI, ARORIW: ++ inss = instructionsForRotate(p, ins) ++ + case ASLLI, ASRLI, ASRAI: + if ins.imm < 0 || ins.imm > 63 { + p.Ctxt.Diag("%v: shift amount out of range 0 to 63", p) +diff --git a/src/crypto/sha512/sha512block_riscv64.s b/src/crypto/sha512/sha512block_riscv64.s +index 361aafe49d..e3a240f70e 100644 +--- a/src/crypto/sha512/sha512block_riscv64.s ++++ b/src/crypto/sha512/sha512block_riscv64.s +@@ -44,11 +44,6 @@ + // H6 = g + H6 + // H7 = h + H7 + +-#define ROR(s, r, d, t1, t2) \ +- SLL $(64-s), r, t1; \ +- SRL $(s), r, t2; \ +- OR t1, t2, d +- + // Wt = Mt; for 0 <= t <= 15 + #define MSGSCHEDULE0(index) \ + MOVBU ((index*8)+0)(X29), X5; \ +@@ -83,14 +78,14 @@ + MOV (((index-15)&0xf)*8)(X19), X6; \ + MOV (((index-7)&0xf)*8)(X19), X9; \ + MOV (((index-16)&0xf)*8)(X19), X21; \ +- ROR(19, X5, X7, X23, X24); \ +- ROR(61, X5, X8, X23, X24); \ ++ ROR $19, X5, X7; \ ++ ROR $61, X5, X8; \ + SRL $6, X5; \ + XOR X7, X5; \ + XOR X8, X5; \ + ADD X9, X5; \ +- ROR(1, X6, X7, X23, X24); \ +- ROR(8, X6, X8, X23, X24); \ ++ ROR $1, X6, X7; \ ++ ROR $8, X6, X8; \ + SRL $7, X6; \ + XOR X7, X6; \ + XOR X8, X6; \ +@@ -106,11 +101,11 @@ + #define SHA512T1(index, e, f, g, h) \ + MOV (index*8)(X18), X8; \ + ADD X5, h; \ +- ROR(14, e, X6, X23, X24); \ ++ ROR $14, e, X6; \ + ADD X8, h; \ +- ROR(18, e, X7, X23, X24); \ ++ ROR $18, e, X7; \ + XOR X7, X6; \ +- ROR(41, e, X8, X23, X24); \ ++ ROR $41, e, X8; \ + XOR X8, X6; \ + ADD X6, h; \ + AND e, f, X5; \ +@@ -124,10 +119,10 @@ + // BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x) + // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) + #define SHA512T2(a, b, c) \ +- ROR(28, a, X6, X23, X24); \ +- ROR(34, a, X7, X23, X24); \ ++ ROR $28, a, X6; \ ++ ROR $34, a, X7; \ + XOR X7, X6; \ +- ROR(39, a, X8, X23, X24); \ ++ ROR $39, a, X8; \ + XOR X8, X6; \ + AND a, b, X7; \ + AND a, c, X8; \ +diff --git a/test/codegen/rotate.go b/test/codegen/rotate.go +index 109e55763c..121ce4cc0a 100644 +--- a/test/codegen/rotate.go ++++ b/test/codegen/rotate.go +@@ -18,7 +18,7 @@ func rot64(x uint64) uint64 { + // amd64:"ROLQ\t[$]7" + // ppc64x:"ROTL\t[$]7" + // loong64: "ROTRV\t[$]57" +- // riscv64: "OR","SLLI","SRLI",-"AND" ++ // riscv64: "RORI\t[$]57" + a += x<<7 | x>>57 + + // amd64:"ROLQ\t[$]8" +@@ -26,7 +26,7 @@ func rot64(x uint64) uint64 { + // s390x:"RISBGZ\t[$]0, [$]63, [$]8, " + // ppc64x:"ROTL\t[$]8" + // loong64: "ROTRV\t[$]56" +- // riscv64: "OR","SLLI","SRLI",-"AND" ++ // riscv64: "RORI\t[$]56" + a += x<<8 + x>>56 + + // amd64:"ROLQ\t[$]9" +@@ -34,7 +34,7 @@ func rot64(x uint64) uint64 { + // s390x:"RISBGZ\t[$]0, [$]63, [$]9, " + // ppc64x:"ROTL\t[$]9" + // loong64: "ROTRV\t[$]55" +- // riscv64: "OR","SLLI","SRLI",-"AND" ++ // riscv64: "RORI\t[$]55" + a += x<<9 ^ x>>55 + + // amd64:"ROLQ\t[$]10" +@@ -44,7 +44,7 @@ func rot64(x uint64) uint64 { + // arm64:"ROR\t[$]54" + // s390x:"RISBGZ\t[$]0, [$]63, [$]10, " + // loong64: "ROTRV\t[$]54" +- // riscv64: "OR","SLLI","SRLI",-"AND" ++ // riscv64: "RORI\t[$]54" + a += bits.RotateLeft64(x, 10) + + return a +@@ -57,7 +57,7 @@ func rot32(x uint32) uint32 { + // arm:"MOVW\tR\\d+@>25" + // ppc64x:"ROTLW\t[$]7" + // loong64: "ROTR\t[$]25" +- // riscv64: "OR","SLLIW","SRLIW",-"AND" ++ // riscv64: "RORIW\t[$]25" + a += x<<7 | x>>25 + + // amd64:`ROLL\t[$]8` +@@ -66,7 
+66,7 @@ func rot32(x uint32) uint32 {
+ // s390x:"RLL\t[$]8"
+ // ppc64x:"ROTLW\t[$]8"
+ // loong64: "ROTR\t[$]24"
+- // riscv64: "OR","SLLIW","SRLIW",-"AND"
++ // riscv64: "RORIW\t[$]24"
+ a += x<<8 + x>>24
+
+ // amd64:"ROLL\t[$]9"
+@@ -75,7 +75,7 @@ func rot32(x uint32) uint32 {
+ // s390x:"RLL\t[$]9"
+ // ppc64x:"ROTLW\t[$]9"
+ // loong64: "ROTR\t[$]23"
+- // riscv64: "OR","SLLIW","SRLIW",-"AND"
++ // riscv64: "RORIW\t[$]23"
+ a += x<<9 ^ x>>23
+
+ // amd64:"ROLL\t[$]10"
+@@ -86,7 +86,7 @@ func rot32(x uint32) uint32 {
+ // arm64:"RORW\t[$]22"
+ // s390x:"RLL\t[$]10"
+ // loong64: "ROTR\t[$]22"
+- // riscv64: "OR","SLLIW","SRLIW",-"AND"
++ // riscv64: "RORIW\t[$]22"
+ a += bits.RotateLeft32(x, 10)
+
+ return a
+@@ -141,14 +141,14 @@ func rot64nc(x uint64, z uint) uint64 {
+ // arm64:"ROR","NEG",-"AND"
+ // ppc64x:"ROTL",-"NEG",-"AND"
+ // loong64: "ROTRV", -"AND"
+- // riscv64: "OR","SLL","SRL",-"AND"
++ // riscv64: "ROL",-"AND"
+ a += x<<z | x>>(64-z)
+
+ // amd64:"RORQ",-"AND"
+ // arm64:"ROR",-"NEG",-"AND"
+ // ppc64x:"ROTL","NEG",-"AND"
+ // loong64: "ROTRV", -"AND"
+- // riscv64: "OR","SLL","SRL",-"AND"
++ // riscv64: "ROR",-"AND"
+ a += x>>z | x<<(64-z)
+
+ return a
+@@ -163,14 +163,14 @@ func rot32nc(x uint32, z uint) uint32 {
+ // arm64:"ROR","NEG",-"AND"
+ // ppc64x:"ROTLW",-"NEG",-"AND"
+ // loong64: "ROTR", -"AND"
+- // riscv64: "OR","SLLW","SRLW",-"AND"
++ // riscv64: "ROLW",-"AND"
+ a += x<<z | x>>(32-z)
+
+ // amd64:"RORL",-"AND"
+ // arm64:"ROR",-"NEG",-"AND"
+ // ppc64x:"ROTLW","NEG",-"AND"
+ // loong64: "ROTR", -"AND"
+- // riscv64: "OR","SLLW","SRLW",-"AND"
++ // riscv64: "RORW",-"AND"
+ a += x>>z | x<<(32-z)
+
+ return a
+--
+2.39.5
+
diff --git a/2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch b/2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch
new file mode 100644
index 0000000..4825528
--- /dev/null
+++ b/2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch
@@ -0,0 +1,617 @@
+From e7609bdfce949e57151a4ea7e11ffcd4fc3cc485 Mon Sep 17 00:00:00 2001
+From: Wang Yaduo
+Date: Fri, 26 Sep 2025 17:38:39 +0800
+Subject: [PATCH 033/119] cmd/internal/obj: support Zba, Zbb, Zbs extensions in
+ riscv64 assembler
+
+Add assembler support for Zba, Zbb, Zbs extensions, which are
+mandatory in the rva22u64 profile. These can be used to accelerate
+address computation and bit manipulation.
+ +Change-Id: Ie90fe6b76b1382cf69984a0e71a72d3cba0e750a +Reviewed-on: https://go-review.googlesource.com/c/go/+/559655 +Reviewed-by: M Zhuo +Run-TryBot: Joel Sing +Reviewed-by: David Chase +Reviewed-by: Joel Sing +Reviewed-by: Keith Randall +TryBot-Result: Gopher Robot +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 96 +++++++++++++++++---- + src/cmd/internal/obj/riscv/anames.go | 46 ++++++++-- + src/cmd/internal/obj/riscv/cpu.go | 58 +++++++++++-- + src/cmd/internal/obj/riscv/inst.go | 84 +++++++++++++++++- + src/cmd/internal/obj/riscv/obj.go | 90 +++++++++++++++++-- + 5 files changed, 336 insertions(+), 38 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index ed691f4d9e..64170340dc 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -339,6 +339,84 @@ start: + // 12.6: Double-Precision Floating-Point Classify Instruction + FCLASSD F0, X5 // d31200e2 + ++ // RISC-V Bit-Manipulation ISA-extensions (1.0) ++ // 1.1: Address Generation Instructions (Zba) ++ ADDUW X10, X11, X12 // 3b86a508 ++ ADDUW X10, X11 // bb85a508 ++ SH1ADD X11, X12, X13 // b326b620 ++ SH1ADD X11, X12 // 3326b620 ++ SH1ADDUW X12, X13, X14 // 3ba7c620 ++ SH1ADDUW X12, X13 // bba6c620 ++ SH2ADD X13, X14, X15 // b347d720 ++ SH2ADD X13, X14 // 3347d720 ++ SH2ADDUW X14, X15, X16 // 3bc8e720 ++ SH2ADDUW X14, X15 // bbc7e720 ++ SH3ADD X15, X16, X17 // b368f820 ++ SH3ADD X15, X16 // 3368f820 ++ SH3ADDUW X16, X17, X18 // 3be90821 ++ SH3ADDUW X16, X17 // bbe80821 ++ SLLIUW $31, X17, X18 // 1b99f809 ++ SLLIUW $63, X17 // 9b98f80b ++ SLLIUW $63, X17, X18 // 1b99f80b ++ SLLIUW $1, X18, X19 // 9b191908 ++ ++ // 1.2: Basic Bit Manipulation (Zbb) ++ ANDN X19, X20, X21 // b37a3a41 ++ ANDN X19, X20 // 337a3a41 ++ CLZ X20, X21 // 931a0a60 ++ CLZW X21, X22 // 1b9b0a60 ++ CPOP X22, X23 // 931b2b60 ++ CPOPW X23, X24 // 1b9c2b60 ++ CTZ X24, X25 // 931c1c60 ++ CTZW X25, X26 // 1b9d1c60 ++ MAX X26, X28, X29 // b36eae0b ++ MAX X26, X28 // 336eae0b ++ MAXU X28, X29, X30 // 33ffce0b ++ MAXU X28, X29 // b3fece0b ++ MIN X29, X30, X5 // b342df0b ++ MIN X29, X30 // 334fdf0b ++ MINU X30, X5, X6 // 33d3e20b ++ MINU X30, X5 // b3d2e20b ++ ORN X6, X7, X8 // 33e46340 ++ ORN X6, X7 // b3e36340 ++ SEXTB X16, X17 // 93184860 ++ SEXTH X17, X18 // 13995860 ++ XNOR X18, X19, X20 // 33ca2941 ++ XNOR X18, X19 // b3c92941 ++ ZEXTH X19, X20 // 3bca0908 ++ ++ // 1.3: Bitwise Rotation (Zbb) ++ ROL X8, X9, X10 // 33958460 or b30f8040b3dff4013395840033e5af00 ++ ROL X8, X9 // b3948460 or b30f8040b3dff401b3948400b3e49f00 ++ ROLW X9, X10, X11 // bb159560 or b30f9040bb5ff501bb159500b3e5bf00 ++ ROLW X9, X10 // 3b159560 or b30f9040bb5ff5013b15950033e5af00 ++ ROR X10, X11, X12 // 33d6a560 or b30fa040b39ff50133d6a50033e6cf00 ++ ROR X10, X11 // b3d5a560 or b30fa040b39ff501b3d5a500b3e5bf00 ++ ROR $63, X11 // 93d5f563 or 93dff50393951500b3e5bf00 ++ RORI $63, X11, X12 // 13d6f563 or 93dff5031396150033e6cf00 ++ RORI $1, X12, X13 // 93561660 or 935f16009316f603b3e6df00 ++ RORIW $31, X13, X14 // 1bd7f661 or 9bdff6011b97160033e7ef00 ++ RORIW $1, X14, X15 // 9b571760 or 9b5f17009b17f701b3e7ff00 ++ RORW X15, X16, X17 // bb58f860 or b30ff040bb1ff801bb58f800b3e81f01 ++ RORW X15, X16 // 3b58f860 or b30ff040bb1ff8013b58f80033e80f01 ++ RORW $31, X13 // 9bd6f661 or 9bdff6019b961600b3e6df00 ++ ORCB X5, X6 // 13d37228 ++ REV8 X7, X8 // 13d4836b ++ ++ // 1.5: Single-bit Instructions (Zbs) ++ BCLR X23, X24, X25 // b31c7c49 ++ BCLR $63, 
X24 // 131cfc4b ++ BCLRI $1, X25, X26 // 139d1c48 ++ BEXT X26, X28, X29 // b35eae49 ++ BEXT $63, X28 // 135efe4b ++ BEXTI $1, X29, X30 // 13df1e48 ++ BINV X30, X5, X6 // 3393e269 ++ BINV $63, X6 // 1313f36b ++ BINVI $1, X7, X8 // 13941368 ++ BSET X8, X9, X10 // 33958428 ++ BSET $63, X9 // 9394f42b ++ BSETI $1, X10, X11 // 93151528 ++ + // Privileged ISA + + // 3.2.1: Environment Call and Breakpoint +@@ -417,24 +495,6 @@ start: + NEGW X5 // bb025040 + NEGW X5, X6 // 3b035040 + +- // Bitwise rotation pseudo-instructions +- ROL X5, X6, X7 // b30f5040b35ff301b3135300b3e37f00 +- ROL X5, X6 // b30f5040b35ff3013313530033e36f00 +- ROLW X5, X6, X7 // b30f5040bb5ff301bb135300b3e37f00 +- ROLW X5, X6 // b30f5040bb5ff3013b13530033e36f00 +- ROR X5, X6, X7 // b30f5040b31ff301b3535300b3e37f00 +- ROR X5, X6 // b30f5040b31ff3013353530033e36f00 +- RORW X5, X6, X7 // b30f5040bb1ff301bb535300b3e37f00 +- RORW X5, X6 // b30f5040bb1ff3013b53530033e36f00 +- RORI $5, X6, X7 // 935f53009313b303b3e37f00 +- RORI $5, X6 // 935f53001313b30333e36f00 +- RORIW $5, X6, X7 // 9b5f53009b13b301b3e37f00 +- RORIW $5, X6 // 9b5f53001b13b30133e36f00 +- ROR $5, X6, X7 // 935f53009313b303b3e37f00 +- ROR $5, X6 // 935f53001313b30333e36f00 +- RORW $5, X6, X7 // 9b5f53009b13b301b3e37f00 +- RORW $5, X6 // 9b5f53001b13b30133e36f00 +- + // This jumps to the second instruction in the function (the + // first instruction is an invisible stack pointer adjustment). + JMP start // JMP 2 +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index e547c6d5e9..60c7b48620 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -217,6 +217,46 @@ var Anames = []string{ + "DRET", + "WFI", + "SFENCEVMA", ++ "ADDUW", ++ "SH1ADD", ++ "SH1ADDUW", ++ "SH2ADD", ++ "SH2ADDUW", ++ "SH3ADD", ++ "SH3ADDUW", ++ "SLLIUW", ++ "ANDN", ++ "ORN", ++ "XNOR", ++ "CLZ", ++ "CLZW", ++ "CTZ", ++ "CTZW", ++ "CPOP", ++ "CPOPW", ++ "MAX", ++ "MAXU", ++ "MIN", ++ "MINU", ++ "SEXTB", ++ "SEXTH", ++ "ZEXTH", ++ "ROL", ++ "ROLW", ++ "ROR", ++ "RORI", ++ "RORIW", ++ "RORW", ++ "ORCB", ++ "REV8", ++ "BCLR", ++ "BCLRI", ++ "BEXT", ++ "BEXTI", ++ "BINV", ++ "BINVI", ++ "BSET", ++ "BSETI", + "WORD", + "BEQZ", + "BGEZ", +@@ -246,12 +286,6 @@ var Anames = []string{ + "NEG", + "NEGW", + "NOT", +- "ROL", +- "ROLW", +- "ROR", +- "RORI", +- "RORIW", +- "RORW", + "SEQZ", + "SNEZ", + "LAST", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 24026561ee..07d5ccff87 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -567,6 +567,58 @@ const ( + // 4.2.1: Supervisor Memory-Management Fence Instruction + ASFENCEVMA + ++ // ++ // RISC-V Bit-Manipulation ISA-extensions (1.0) ++ // ++ ++ // 1.1: Address Generation Instructions (Zba) ++ AADDUW ++ ASH1ADD ++ ASH1ADDUW ++ ASH2ADD ++ ASH2ADDUW ++ ASH3ADD ++ ASH3ADDUW ++ ASLLIUW ++ ++ // 1.2: Basic Bit Manipulation (Zbb) ++ AANDN ++ AORN ++ AXNOR ++ ACLZ ++ ACLZW ++ ACTZ ++ ACTZW ++ ACPOP ++ ACPOPW ++ AMAX ++ AMAXU ++ AMIN ++ AMINU ++ ASEXTB ++ ASEXTH ++ AZEXTH ++ ++ // 1.3: Bitwise Rotation (Zbb) ++ AROL ++ AROLW ++ AROR ++ ARORI ++ ARORIW ++ ARORW ++ AORCB ++ AREV8 ++ ++ // 1.5: Single-bit Instructions (Zbs) ++ ABCLR ++ ABCLRI ++ ABEXT ++ ABEXTI ++ ABINV ++ ABINVI ++ ABSET ++ ABSETI ++ + // The escape hatch. Inserts a single 32-bit word. 
+ AWORD + +@@ -600,12 +652,6 @@ const ( + ANEG + ANEGW + ANOT +- AROL +- AROLW +- AROR +- ARORI +- ARORIW +- ARORW + ASEQZ + ASNEZ + +diff --git a/src/cmd/internal/obj/riscv/inst.go b/src/cmd/internal/obj/riscv/inst.go +index 6cb11cdfb5..223ddd15b2 100644 +--- a/src/cmd/internal/obj/riscv/inst.go ++++ b/src/cmd/internal/obj/riscv/inst.go +@@ -1,4 +1,4 @@ +-// Code generated by parse.py -go rv64_a rv64_d rv64_f rv64_i rv64_m rv64_q rv_a rv_d rv_f rv_i rv_m rv_q rv_s rv_system rv_zicsr; DO NOT EDIT. ++// Code generated by ./parse.py -go rv64_a rv64_d rv64_f rv64_i rv64_m rv64_q rv64_zba rv64_zbb rv64_zbs rv_a rv_d rv_f rv_i rv_m rv_q rv_zba rv_zbb rv_zbs rv_s rv_system rv_zicsr; DO NOT EDIT. + package riscv + + import "cmd/internal/obj" +@@ -15,6 +15,8 @@ func encode(a obj.As) *inst { + switch a { + case AADD: + return &inst{0x33, 0x0, 0x0, 0, 0x0} ++ case AADDUW: ++ return &inst{0x3b, 0x0, 0x0, 128, 0x4} + case AADDI: + return &inst{0x13, 0x0, 0x0, 0, 0x0} + case AADDIW: +@@ -61,20 +63,46 @@ func encode(a obj.As) *inst { + return &inst{0x33, 0x7, 0x0, 0, 0x0} + case AANDI: + return &inst{0x13, 0x7, 0x0, 0, 0x0} ++ case AANDN: ++ return &inst{0x33, 0x7, 0x0, 1024, 0x20} + case AAUIPC: + return &inst{0x17, 0x0, 0x0, 0, 0x0} ++ case ABCLR: ++ return &inst{0x33, 0x1, 0x0, 1152, 0x24} ++ case ABCLRI: ++ return &inst{0x13, 0x1, 0x0, 1152, 0x24} + case ABEQ: + return &inst{0x63, 0x0, 0x0, 0, 0x0} ++ case ABEXT: ++ return &inst{0x33, 0x5, 0x0, 1152, 0x24} ++ case ABEXTI: ++ return &inst{0x13, 0x5, 0x0, 1152, 0x24} + case ABGE: + return &inst{0x63, 0x5, 0x0, 0, 0x0} + case ABGEU: + return &inst{0x63, 0x7, 0x0, 0, 0x0} ++ case ABINV: ++ return &inst{0x33, 0x1, 0x0, 1664, 0x34} ++ case ABINVI: ++ return &inst{0x13, 0x1, 0x0, 1664, 0x34} + case ABLT: + return &inst{0x63, 0x4, 0x0, 0, 0x0} + case ABLTU: + return &inst{0x63, 0x6, 0x0, 0, 0x0} + case ABNE: + return &inst{0x63, 0x1, 0x0, 0, 0x0} ++ case ABSET: ++ return &inst{0x33, 0x1, 0x0, 640, 0x14} ++ case ABSETI: ++ return &inst{0x13, 0x1, 0x0, 640, 0x14} ++ case ACLZ: ++ return &inst{0x13, 0x1, 0x0, 1536, 0x30} ++ case ACLZW: ++ return &inst{0x1b, 0x1, 0x0, 1536, 0x30} ++ case ACPOP: ++ return &inst{0x13, 0x1, 0x2, 1538, 0x30} ++ case ACPOPW: ++ return &inst{0x1b, 0x1, 0x2, 1538, 0x30} + case ACSRRC: + return &inst{0x73, 0x3, 0x0, 0, 0x0} + case ACSRRCI: +@@ -87,6 +115,10 @@ func encode(a obj.As) *inst { + return &inst{0x73, 0x1, 0x0, 0, 0x0} + case ACSRRWI: + return &inst{0x73, 0x5, 0x0, 0, 0x0} ++ case ACTZ: ++ return &inst{0x13, 0x1, 0x1, 1537, 0x30} ++ case ACTZW: ++ return &inst{0x1b, 0x1, 0x1, 1537, 0x30} + case ADIV: + return &inst{0x33, 0x4, 0x0, 32, 0x1} + case ADIVU: +@@ -95,8 +127,6 @@ func encode(a obj.As) *inst { + return &inst{0x3b, 0x5, 0x0, 32, 0x1} + case ADIVW: + return &inst{0x3b, 0x4, 0x0, 32, 0x1} +- case ADRET: +- return &inst{0x73, 0x0, 0x12, 1970, 0x3d} + case AEBREAK: + return &inst{0x73, 0x0, 0x1, 1, 0x0} + case AECALL: +@@ -337,6 +367,14 @@ func encode(a obj.As) *inst { + return &inst{0x3, 0x2, 0x0, 0, 0x0} + case ALWU: + return &inst{0x3, 0x6, 0x0, 0, 0x0} ++ case AMAX: ++ return &inst{0x33, 0x6, 0x0, 160, 0x5} ++ case AMAXU: ++ return &inst{0x33, 0x7, 0x0, 160, 0x5} ++ case AMIN: ++ return &inst{0x33, 0x4, 0x0, 160, 0x5} ++ case AMINU: ++ return &inst{0x33, 0x5, 0x0, 160, 0x5} + case AMRET: + return &inst{0x73, 0x0, 0x2, 770, 0x18} + case AMUL: +@@ -351,8 +389,12 @@ func encode(a obj.As) *inst { + return &inst{0x3b, 0x0, 0x0, 32, 0x1} + case AOR: + return &inst{0x33, 0x6, 0x0, 0, 0x0} ++ case AORCB: ++ return &inst{0x13, 
0x5, 0x7, 647, 0x14} + case AORI: + return &inst{0x13, 0x6, 0x0, 0, 0x0} ++ case AORN: ++ return &inst{0x33, 0x6, 0x0, 1024, 0x20} + case APAUSE: + return &inst{0xf, 0x0, 0x10, 16, 0x0} + case ARDCYCLE: +@@ -375,6 +417,20 @@ func encode(a obj.As) *inst { + return &inst{0x3b, 0x7, 0x0, 32, 0x1} + case AREMW: + return &inst{0x3b, 0x6, 0x0, 32, 0x1} ++ case AREV8: ++ return &inst{0x13, 0x5, 0x18, 1720, 0x35} ++ case AROL: ++ return &inst{0x33, 0x1, 0x0, 1536, 0x30} ++ case AROLW: ++ return &inst{0x3b, 0x1, 0x0, 1536, 0x30} ++ case AROR: ++ return &inst{0x33, 0x5, 0x0, 1536, 0x30} ++ case ARORI: ++ return &inst{0x13, 0x5, 0x0, 1536, 0x30} ++ case ARORIW: ++ return &inst{0x1b, 0x5, 0x0, 1536, 0x30} ++ case ARORW: ++ return &inst{0x3b, 0x5, 0x0, 1536, 0x30} + case ASB: + return &inst{0x23, 0x0, 0x0, 0, 0x0} + case ASBREAK: +@@ -387,14 +443,32 @@ func encode(a obj.As) *inst { + return &inst{0x73, 0x0, 0x0, 0, 0x0} + case ASD: + return &inst{0x23, 0x3, 0x0, 0, 0x0} ++ case ASEXTB: ++ return &inst{0x13, 0x1, 0x4, 1540, 0x30} ++ case ASEXTH: ++ return &inst{0x13, 0x1, 0x5, 1541, 0x30} + case ASFENCEVMA: + return &inst{0x73, 0x0, 0x0, 288, 0x9} + case ASH: + return &inst{0x23, 0x1, 0x0, 0, 0x0} ++ case ASH1ADD: ++ return &inst{0x33, 0x2, 0x0, 512, 0x10} ++ case ASH1ADDUW: ++ return &inst{0x3b, 0x2, 0x0, 512, 0x10} ++ case ASH2ADD: ++ return &inst{0x33, 0x4, 0x0, 512, 0x10} ++ case ASH2ADDUW: ++ return &inst{0x3b, 0x4, 0x0, 512, 0x10} ++ case ASH3ADD: ++ return &inst{0x33, 0x6, 0x0, 512, 0x10} ++ case ASH3ADDUW: ++ return &inst{0x3b, 0x6, 0x0, 512, 0x10} + case ASLL: + return &inst{0x33, 0x1, 0x0, 0, 0x0} + case ASLLI: + return &inst{0x13, 0x1, 0x0, 0, 0x0} ++ case ASLLIUW: ++ return &inst{0x1b, 0x1, 0x0, 128, 0x4} + case ASLLIW: + return &inst{0x1b, 0x1, 0x0, 0, 0x0} + case ASLLW: +@@ -433,10 +507,14 @@ func encode(a obj.As) *inst { + return &inst{0x23, 0x2, 0x0, 0, 0x0} + case AWFI: + return &inst{0x73, 0x0, 0x5, 261, 0x8} ++ case AXNOR: ++ return &inst{0x33, 0x4, 0x0, 1024, 0x20} + case AXOR: + return &inst{0x33, 0x4, 0x0, 0, 0x0} + case AXORI: + return &inst{0x13, 0x4, 0x0, 0, 0x0} ++ case AZEXTH: ++ return &inst{0x3b, 0x4, 0x0, 128, 0x4} + } + return nil + } +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 9b81768b85..8115350a9e 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -60,7 +60,9 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + AADD, AAND, AOR, AXOR, ASLL, ASRL, ASUB, ASRA, + AMUL, AMULH, AMULHU, AMULHSU, AMULW, ADIV, ADIVU, ADIVW, ADIVUW, + AREM, AREMU, AREMW, AREMUW, +- AROL, AROLW, AROR, ARORW, ARORI, ARORIW: ++ AADDUW, ASH1ADD, ASH1ADDUW, ASH2ADD, ASH2ADDUW, ASH3ADD, ASH3ADDUW, ASLLIUW, ++ AANDN, AORN, AXNOR, AMAX, AMAXU, AMIN, AMINU, AROL, AROLW, AROR, ARORW, ARORI, ARORIW, ++ ABCLR, ABCLRI, ABEXT, ABEXTI, ABINV, ABINVI, ABSET, ABSETI: + p.Reg = p.To.Reg + } + } +@@ -91,10 +93,6 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.As = ASRAI + case AADDW: + p.As = AADDIW +- case AROR: +- p.As = ARORI +- case ARORW: +- p.As = ARORIW + case ASUBW: + p.As, p.From.Offset = AADDIW, -p.From.Offset + case ASLLW: +@@ -103,6 +101,18 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.As = ASRLIW + case ASRAW: + p.As = ASRAIW ++ case AROR: ++ p.As = ARORI ++ case ARORW: ++ p.As = ARORIW ++ case ABCLR: ++ p.As = ABCLRI ++ case ABEXT: ++ p.As = ABEXTI ++ case ABINV: ++ p.As = ABINVI ++ case ABSET: ++ p.As = ABSETI + } + } + +@@ -1107,6 
+1117,13 @@ func wantEvenOffset(ctxt *obj.Link, ins *instruction, offset int64) { + } + } + ++func validateRII(ctxt *obj.Link, ins *instruction) { ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRIII(ctxt *obj.Link, ins *instruction) { + wantIntReg(ctxt, ins, "rd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +@@ -1260,6 +1277,10 @@ func encodeR4(as obj.As, rs1, rs2, rs3, rd, funct3, funct2 uint32) uint32 { + return rs3<<27 | funct2<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | funct3<<12 | rd<<7 | enc.opcode + } + ++func encodeRII(ins *instruction) uint32 { ++ return encodeR(ins.as, regI(ins.rs1), 0, regI(ins.rd), ins.funct3, ins.funct7) ++} ++ + func encodeRIII(ins *instruction) uint32 { + return encodeR(ins.as, regI(ins.rs1), regI(ins.rs2), regI(ins.rd), ins.funct3, ins.funct7) + } +@@ -1491,6 +1512,7 @@ var ( + // indicates an S-type instruction with rs2 being a float register. + + rIIIEncoding = encoding{encode: encodeRIII, validate: validateRIII, length: 4} ++ rIIEncoding = encoding{encode: encodeRII, validate: validateRII, length: 4} + rFFFEncoding = encoding{encode: encodeRFFF, validate: validateRFFF, length: 4} + rFFFFEncoding = encoding{encode: encodeRFFFF, validate: validateRFFFF, length: 4} + rFFIEncoding = encoding{encode: encodeRFFI, validate: validateRFFI, length: 4} +@@ -1723,6 +1745,58 @@ var encodings = [ALAST & obj.AMask]encoding{ + AECALL & obj.AMask: iIEncoding, + AEBREAK & obj.AMask: iIEncoding, + ++ // ++ // RISC-V Bit-Manipulation ISA-extensions (1.0) ++ // ++ ++ // 1.1: Address Generation Instructions (Zba) ++ AADDUW & obj.AMask: rIIIEncoding, ++ ASH1ADD & obj.AMask: rIIIEncoding, ++ ASH1ADDUW & obj.AMask: rIIIEncoding, ++ ASH2ADD & obj.AMask: rIIIEncoding, ++ ASH2ADDUW & obj.AMask: rIIIEncoding, ++ ASH3ADD & obj.AMask: rIIIEncoding, ++ ASH3ADDUW & obj.AMask: rIIIEncoding, ++ ASLLIUW & obj.AMask: iIEncoding, ++ ++ // 1.2: Basic Bit Manipulation (Zbb) ++ AANDN & obj.AMask: rIIIEncoding, ++ ACLZ & obj.AMask: rIIEncoding, ++ ACLZW & obj.AMask: rIIEncoding, ++ ACPOP & obj.AMask: rIIEncoding, ++ ACPOPW & obj.AMask: rIIEncoding, ++ ACTZ & obj.AMask: rIIEncoding, ++ ACTZW & obj.AMask: rIIEncoding, ++ AMAX & obj.AMask: rIIIEncoding, ++ AMAXU & obj.AMask: rIIIEncoding, ++ AMIN & obj.AMask: rIIIEncoding, ++ AMINU & obj.AMask: rIIIEncoding, ++ AORN & obj.AMask: rIIIEncoding, ++ ASEXTB & obj.AMask: rIIEncoding, ++ ASEXTH & obj.AMask: rIIEncoding, ++ AXNOR & obj.AMask: rIIIEncoding, ++ AZEXTH & obj.AMask: rIIEncoding, ++ ++ // 1.3: Bitwise Rotation (Zbb) ++ AROL & obj.AMask: rIIIEncoding, ++ AROLW & obj.AMask: rIIIEncoding, ++ AROR & obj.AMask: rIIIEncoding, ++ ARORI & obj.AMask: iIEncoding, ++ ARORIW & obj.AMask: iIEncoding, ++ ARORW & obj.AMask: rIIIEncoding, ++ AORCB & obj.AMask: iIEncoding, ++ AREV8 & obj.AMask: iIEncoding, ++ ++ // 1.5: Single-bit Instructions (Zbs) ++ ABCLR & obj.AMask: rIIIEncoding, ++ ABCLRI & obj.AMask: iIEncoding, ++ ABEXT & obj.AMask: rIIIEncoding, ++ ABEXTI & obj.AMask: iIEncoding, ++ ABINV & obj.AMask: rIIIEncoding, ++ ABINVI & obj.AMask: iIEncoding, ++ ABSET & obj.AMask: rIIIEncoding, ++ ABSETI & obj.AMask: iIEncoding, ++ + // Escape hatch + AWORD & obj.AMask: rawEncoding, + +@@ -2420,6 +2494,12 @@ func instructionsForProg(p *obj.Prog) []*instruction { + if ins.imm < 0 || ins.imm > 31 { + p.Ctxt.Diag("%v: shift amount out of range 0 to 31", p) + } ++ ++ case ACLZ, ACLZW, ACTZ, ACTZW, ACPOP, ACPOPW, ASEXTB, 
ASEXTH, AZEXTH: ++ ins.rs1, ins.rs2 = uint32(p.From.Reg), obj.REG_NONE ++ ++ case AORCB, AREV8: ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch b/2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch new file mode 100644 index 0000000..5b43913 --- /dev/null +++ b/2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch @@ -0,0 +1,118 @@ +From 15597b92557425f079d2a780cb4a592ad4288f0a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 034/119] cmd/internal/obj/riscv: improve register + MOVB/MOVH/MOVBU/MOVHU for rva22u64 + +When GORISCV64 enables rva22u64, use SEXTB for MOVB, SEXTH for MOVH, ZEXTH +for MOVHU and ADDUW for MOVWU. These are single instruction alternatives +to the two instruction shift sequences that are needed otherwise. + +Change-Id: Iea5e394f57e238ae8771400a87287c1ee507d44c +Reviewed-on: https://go-review.googlesource.com/c/go/+/572736 +Reviewed-by: David Chase +Run-TryBot: Joel Sing +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +TryBot-Result: Gopher Robot +Reviewed-by: Cherry Mui +Reviewed-by: M Zhuo +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 8 ++-- + src/cmd/internal/obj/riscv/obj.go | 49 ++++++++++++++------- + 2 files changed, 37 insertions(+), 20 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 64170340dc..5296a34d09 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -462,12 +462,12 @@ start: + MOVW X5, (X6) // 23205300 + MOVW X5, 4(X6) // 23225300 + +- MOVB X5, X6 // 1393820313538343 +- MOVH X5, X6 // 1393020313530343 ++ MOVB X5, X6 // 1393820313538343 or 13934260 ++ MOVH X5, X6 // 1393020313530343 or 13935260 + MOVW X5, X6 // 1b830200 + MOVBU X5, X6 // 13f3f20f +- MOVHU X5, X6 // 1393020313530303 +- MOVWU X5, X6 // 1393020213530302 ++ MOVHU X5, X6 // 1393020313530303 or 3bc30208 ++ MOVWU X5, X6 // 1393020213530302 or 3b830208 + + MOVF 4(X5), F0 // 07a04200 + MOVF F0, 4(X5) // 27a20200 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 8115350a9e..f731359f7f 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -26,6 +26,7 @@ import ( + "cmd/internal/sys" + "fmt" + "internal/abi" ++ "internal/buildcfg" + "log" + "math/bits" + "strings" +@@ -2156,25 +2157,41 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + case AMOVD: // MOVD Ra, Rb -> FSGNJD Ra, Ra, Rb + ins.as, ins.rs1 = AFSGNJD, uint32(p.From.Reg) + case AMOVB, AMOVH: +- // Use SLLI/SRAI to extend. +- ins.as, ins.rs1, ins.rs2 = ASLLI, uint32(p.From.Reg), obj.REG_NONE +- if p.As == AMOVB { +- ins.imm = 56 +- } else if p.As == AMOVH { +- ins.imm = 48 ++ if buildcfg.GORISCV64 >= 22 { ++ // Use SEXTB or SEXTH to extend. ++ ins.as, ins.rs1, ins.rs2 = ASEXTB, uint32(p.From.Reg), obj.REG_NONE ++ if p.As == AMOVH { ++ ins.as = ASEXTH ++ } ++ } else { ++ // Use SLLI/SRAI sequence to extend. 
++ ins.as, ins.rs1, ins.rs2 = ASLLI, uint32(p.From.Reg), obj.REG_NONE ++ if p.As == AMOVB { ++ ins.imm = 56 ++ } else if p.As == AMOVH { ++ ins.imm = 48 ++ } ++ ins2 := &instruction{as: ASRAI, rd: ins.rd, rs1: ins.rd, imm: ins.imm} ++ inss = append(inss, ins2) + } +- ins2 := &instruction{as: ASRAI, rd: ins.rd, rs1: ins.rd, imm: ins.imm} +- inss = append(inss, ins2) + case AMOVHU, AMOVWU: +- // Use SLLI/SRLI to extend. +- ins.as, ins.rs1, ins.rs2 = ASLLI, uint32(p.From.Reg), obj.REG_NONE +- if p.As == AMOVHU { +- ins.imm = 48 +- } else if p.As == AMOVWU { +- ins.imm = 32 ++ if buildcfg.GORISCV64 >= 22 { ++ // Use ZEXTH or ADDUW to extend. ++ ins.as, ins.rs1, ins.rs2, ins.imm = AZEXTH, uint32(p.From.Reg), obj.REG_NONE, 0 ++ if p.As == AMOVWU { ++ ins.as, ins.rs2 = AADDUW, REG_ZERO ++ } ++ } else { ++ // Use SLLI/SRLI sequence to extend. ++ ins.as, ins.rs1, ins.rs2 = ASLLI, uint32(p.From.Reg), obj.REG_NONE ++ if p.As == AMOVHU { ++ ins.imm = 48 ++ } else if p.As == AMOVWU { ++ ins.imm = 32 ++ } ++ ins2 := &instruction{as: ASRLI, rd: ins.rd, rs1: ins.rd, imm: ins.imm} ++ inss = append(inss, ins2) + } +- ins2 := &instruction{as: ASRLI, rd: ins.rd, rs1: ins.rd, imm: ins.imm} +- inss = append(inss, ins2) + } + + case p.From.Type == obj.TYPE_MEM && p.To.Type == obj.TYPE_REG: +-- +2.39.5 + diff --git a/2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch b/2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch new file mode 100644 index 0000000..bf23dda --- /dev/null +++ b/2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch @@ -0,0 +1,62 @@ +From b230277ae2437913fb5e906aeeab9748e48a1dfd Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 035/119] cmd/internal/obj/riscv: use native rotation + instructions for rva22u64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When rva22u64 is available, we can now use the native rotation instructions +from the Zbb extension. Use these instead of synthesising rotation +instructions. 
+ +This provides a significant performance gain for SHA-512, the following +benchmarked on a StarFive VisionFive 2: + + │ sha512.rva20u64 │ sha512.rva22u64 │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 859.4Ki ± 0% 1337.9Ki ± 0% +55.68% (p=0.000 n=10) +Hash8Bytes/Sum384-4 888.7Ki ± 1% 1308.6Ki ± 1% +47.25% (p=0.000 n=10) +Hash8Bytes/Sum512-4 869.1Ki ± 0% 1269.5Ki ± 1% +46.07% (p=0.000 n=10) +Hash1K/New-4 19.83Mi ± 0% 29.03Mi ± 0% +46.38% (p=0.000 n=10) +Hash1K/Sum384-4 20.00Mi ± 0% 28.86Mi ± 0% +44.30% (p=0.000 n=10) +Hash1K/Sum512-4 19.93Mi ± 0% 28.72Mi ± 0% +44.11% (p=0.000 n=10) +Hash8K/New-4 23.85Mi ± 0% 34.12Mi ± 0% +43.09% (p=0.000 n=10) +Hash8K/Sum384-4 23.88Mi ± 0% 34.09Mi ± 0% +42.77% (p=0.000 n=10) +Hash8K/Sum512-4 23.87Mi ± 0% 34.07Mi ± 0% +42.71% (p=0.000 n=10) +geomean 7.399Mi 10.78Mi +45.77% + +Change-Id: I9dca8e3f311eea101684c806cb998872dc697288 +Reviewed-on: https://go-review.googlesource.com/c/go/+/572716 +Run-TryBot: Joel Sing +Reviewed-by: David Chase +TryBot-Result: Gopher Robot +Reviewed-by: M Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Matthew Dempsky +Reviewed-by: Mark Ryan +Auto-Submit: Emmanuel Odeke +--- + src/cmd/internal/obj/riscv/obj.go | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index f731359f7f..579ac43810 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2290,6 +2290,11 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + + // instructionsForRotate returns the machine instructions for a bitwise rotation. + func instructionsForRotate(p *obj.Prog, ins *instruction) []*instruction { ++ if buildcfg.GORISCV64 >= 22 { ++ // Rotation instructions are supported natively. ++ return []*instruction{ins} ++ } ++ + switch ins.as { + case AROL, AROLW, AROR, ARORW: + // ROL -> OR (SLL x y) (SRL x (NEG y)) +-- +2.39.5 + diff --git a/2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch b/2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch new file mode 100644 index 0000000..99ae0cb --- /dev/null +++ b/2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch @@ -0,0 +1,102 @@ +From 234c3da75367155836a25d8317d349ec0e2010cc Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 036/119] cmd/internal/obj/riscv: check immediate for rotation + instructions + +Ensure that the immediate for a RORI or RORIW instruction are within range, +adding test coverage. Also use a consistent "immediate out of range" error +for both rotations and shifts. 
+ +Change-Id: Id687d7c6e028786f607e9519bbb64dab62b6cf3d +Reviewed-on: https://go-review.googlesource.com/c/go/+/572735 +Reviewed-by: M Zhuo +Run-TryBot: Joel Sing +Reviewed-by: Dmitri Shuralyov +TryBot-Result: Gopher Robot +Reviewed-by: Than McIntosh +LUCI-TryBot-Result: Go LUCI +--- + .../asm/internal/asm/testdata/riscv64error.s | 28 +++++++++++-------- + src/cmd/internal/obj/riscv/obj.go | 18 ++++++++++-- + 2 files changed, 31 insertions(+), 15 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 2dc9db3fb1..0b0184aaa7 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -26,18 +26,22 @@ TEXT errors(SB),$0 + MOVD F0, F1, F2 // ERROR "illegal MOV instruction" + MOV X10, X11, X12 // ERROR "illegal MOV instruction" + MOVW X10, X11, X12 // ERROR "illegal MOV instruction" +- SLLI $64, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SRLI $64, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SRAI $64, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SLLI $-1, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SRLI $-1, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SRAI $-1, X5, X6 // ERROR "shift amount out of range 0 to 63" +- SLLIW $32, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SRLIW $32, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SRAIW $32, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SLLIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SRLIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" +- SRAIW $-1, X5, X6 // ERROR "shift amount out of range 0 to 31" ++ RORI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SLLI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SRLI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SRAI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ RORI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SLLI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SRLI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" ++ SRAI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" ++ RORIW $32, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SLLIW $32, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SRLIW $32, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SRAIW $32, X5, X6 // ERROR "immediate out of range 0 to 31" ++ RORIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SLLIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SRLIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" ++ SRAIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" + SD X5, 4294967296(X6) // ERROR "constant 4294967296 too large" + SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" + SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 579ac43810..d396264a05 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2504,17 +2504,29 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AFSGNJND + ins.rs1 = uint32(p.From.Reg) + +- case AROL, AROLW, AROR, ARORW, ARORI, ARORIW: ++ case AROL, AROLW, AROR, ARORW: ++ inss = instructionsForRotate(p, ins) ++ ++ case ARORI: ++ if ins.imm < 0 || ins.imm > 63 { ++ p.Ctxt.Diag("%v: immediate out of range 0 to 63", 
p) ++ } ++ inss = instructionsForRotate(p, ins) ++ ++ case ARORIW: ++ if ins.imm < 0 || ins.imm > 31 { ++ p.Ctxt.Diag("%v: immediate out of range 0 to 31", p) ++ } + inss = instructionsForRotate(p, ins) + + case ASLLI, ASRLI, ASRAI: + if ins.imm < 0 || ins.imm > 63 { +- p.Ctxt.Diag("%v: shift amount out of range 0 to 63", p) ++ p.Ctxt.Diag("%v: immediate out of range 0 to 63", p) + } + + case ASLLIW, ASRLIW, ASRAIW: + if ins.imm < 0 || ins.imm > 31 { +- p.Ctxt.Diag("%v: shift amount out of range 0 to 31", p) ++ p.Ctxt.Diag("%v: immediate out of range 0 to 31", p) + } + + case ACLZ, ACLZW, ACTZ, ACTZW, ACPOP, ACPOPW, ASEXTB, ASEXTH, AZEXTH: +-- +2.39.5 + diff --git a/2037-test-codegen-add-Mul-test-for-riscv64.patch b/2037-test-codegen-add-Mul-test-for-riscv64.patch new file mode 100644 index 0000000..3797d36 --- /dev/null +++ b/2037-test-codegen-add-Mul-test-for-riscv64.patch @@ -0,0 +1,31 @@ +From 1c7896634d95fb272aaeabc4be0e3f72ef4aeae0 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 037/119] test/codegen: add Mul test for riscv64 + +Change-Id: I51e9832317e5dee1e3fe0772e7592b3dae95a625 +Reviewed-on: https://go-review.googlesource.com/c/go/+/586797 +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +Auto-Submit: Keith Randall +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +--- + test/codegen/mathbits.go | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index 8c971cf760..80fe9d2e0c 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -804,6 +804,7 @@ func Mul(x, y uint) (hi, lo uint) { + // ppc64x:"MULHDU","MULLD" + // s390x:"MLGR" + // mips64: "MULVU" ++ // riscv64:"MULHU","MUL" + return bits.Mul(x, y) + } + +-- +2.39.5 + diff --git a/2038-math-remove-riscv64-assembly-implementations-of-roun.patch b/2038-math-remove-riscv64-assembly-implementations-of-roun.patch new file mode 100644 index 0000000..157b13e --- /dev/null +++ b/2038-math-remove-riscv64-assembly-implementations-of-roun.patch @@ -0,0 +1,127 @@ +From c38f77412975029cb112bf435bcefdf79391bc80 Mon Sep 17 00:00:00 2001 +From: Jorropo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 038/119] math: remove riscv64 assembly implementations of + rounding + +Fixes #68322 + +This reverts commit ad377e906a8ee6f27545d83de280206dacec1e58. + +Change-Id: Ifa4811e2c679d789cc830dbff5e50301410e24d0 +Reviewed-on: https://go-review.googlesource.com/c/go/+/596516 +Reviewed-by: Than McIntosh +Reviewed-by: Keith Randall +Commit-Queue: Cuong Manh Le +Auto-Submit: Cuong Manh Le +Reviewed-by: Keith Randall +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cuong Manh Le +--- + src/math/floor_asm.go | 2 +- + src/math/floor_noasm.go | 2 +- + src/math/floor_riscv64.s | 41 ------------------------------------ + test/fixedbugs/issue68322.go | 17 +++++++++++++++ + 4 files changed, 19 insertions(+), 43 deletions(-) + delete mode 100644 src/math/floor_riscv64.s + create mode 100644 test/fixedbugs/issue68322.go + +diff --git a/src/math/floor_asm.go b/src/math/floor_asm.go +index 5cb45f5a7e..fb419d6da2 100644 +--- a/src/math/floor_asm.go ++++ b/src/math/floor_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
+ +-//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || riscv64 || s390x || wasm ++//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || s390x || wasm + + package math + +diff --git a/src/math/floor_noasm.go b/src/math/floor_noasm.go +index 6754ca8fc8..5641c7ea0a 100644 +--- a/src/math/floor_noasm.go ++++ b/src/math/floor_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !riscv64 && !s390x && !wasm ++//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !s390x && !wasm + + package math + +diff --git a/src/math/floor_riscv64.s b/src/math/floor_riscv64.s +deleted file mode 100644 +index 62ce963781..0000000000 +--- a/src/math/floor_riscv64.s ++++ /dev/null +@@ -1,41 +0,0 @@ +-// Copyright 2023 The Go Authors. All rights reserved. +-// Use of this source code is governed by a BSD-style +-// license that can be found in the LICENSE file. +- +-#include "textflag.h" +- +-#define PosInf 0x7FF0000000000000 +- +-// The rounding mode of RISC-V is different from Go spec. +- +-#define ROUNDFN(NAME, MODE) \ +-TEXT NAME(SB),NOSPLIT,$0; \ +- MOVD x+0(FP), F0; \ +- /* whether x is NaN */; \ +- FEQD F0, F0, X6; \ +- BNEZ X6, 3(PC); \ +- /* return NaN if x is NaN */; \ +- MOVD F0, ret+8(FP); \ +- RET; \ +- MOV $PosInf, X6; \ +- FMVDX X6, F1; \ +- FABSD F0, F2; \ +- /* if abs(x) > +Inf, return Inf instead of round(x) */; \ +- FLTD F1, F2, X6; \ +- /* Inf should keep same signed with x then return */; \ +- BEQZ X6, 3(PC); \ +- FCVTLD.MODE F0, X6; \ +- FCVTDL X6, F1; \ +- /* rounding will drop signed bit in RISCV, restore it */; \ +- FSGNJD F0, F1, F0; \ +- MOVD F0, ret+8(FP); \ +- RET +- +-// func archFloor(x float64) float64 +-ROUNDFN(·archFloor, RDN) +- +-// func archCeil(x float64) float64 +-ROUNDFN(·archCeil, RUP) +- +-// func archTrunc(x float64) float64 +-ROUNDFN(·archTrunc, RTZ) +diff --git a/test/fixedbugs/issue68322.go b/test/fixedbugs/issue68322.go +new file mode 100644 +index 0000000000..9b3e713d59 +--- /dev/null ++++ b/test/fixedbugs/issue68322.go +@@ -0,0 +1,17 @@ ++// run ++ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++package main ++ ++import "math" ++ ++var doNotFold = 18446744073709549568.0 ++ ++func main() { ++ if math.Trunc(doNotFold) != doNotFold { ++ panic("big (over 2**63-1) math.Trunc is incorrect") ++ } ++} +-- +2.39.5 + diff --git a/2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch b/2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch new file mode 100644 index 0000000..fa52e1a --- /dev/null +++ b/2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch @@ -0,0 +1,49 @@ +From 946d929609b1ad9b3a96ed3f02464469c6989084 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 039/119] cmd/compile: drop TODO in NilCheck for riscv64 + +Also add log as arm/amd64 do. 
+ +Change-Id: I3698993e2df0ebf3bfcf8bad5fe389affa0e8eff +Reviewed-on: https://go-review.googlesource.com/c/go/+/595355 +Reviewed-by: Keith Randall +Reviewed-by: Robert Griesemer +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/riscv64/ssa.go | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index c9e75b2180..3c57bc93bc 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -7,6 +7,7 @@ package riscv64 + import ( + "cmd/compile/internal/base" + "cmd/compile/internal/ir" ++ "cmd/compile/internal/logopt" + "cmd/compile/internal/objw" + "cmd/compile/internal/ssa" + "cmd/compile/internal/ssagen" +@@ -720,13 +721,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + + case ssa.OpRISCV64LoweredNilCheck: + // Issue a load which will fault if arg is nil. +- // TODO: optimizations. See arm and amd64 LoweredNilCheck. + p := s.Prog(riscv.AMOVB) + p.From.Type = obj.TYPE_MEM + p.From.Reg = v.Args[0].Reg() + ssagen.AddAux(&p.From, v) + p.To.Type = obj.TYPE_REG + p.To.Reg = riscv.REG_ZERO ++ if logopt.Enabled() { ++ logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name) ++ } + if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos == 1 in generated wrappers + base.WarnfAt(v.Pos, "generated nil check") + } +-- +2.39.5 + diff --git a/2040-math-big-implement-addVV-in-riscv64-assembly.patch b/2040-math-big-implement-addVV-in-riscv64-assembly.patch new file mode 100644 index 0000000..ec5aa2a --- /dev/null +++ b/2040-math-big-implement-addVV-in-riscv64-assembly.patch @@ -0,0 +1,148 @@ +From 2ce9d3b1f780c80d0dcbcf2efe3972ba26b5ba7a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 040/119] math/big: implement addVV in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of addVV for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. 
+ +On a StarFive VisionFive 2: + + │ addvv.1 │ addvv.2 │ + │ sec/op │ sec/op vs base │ +AddVV/1-4 73.45n ± 0% 48.08n ± 0% -34.54% (p=0.000 n=10) +AddVV/2-4 88.14n ± 0% 58.76n ± 0% -33.33% (p=0.000 n=10) +AddVV/3-4 102.80n ± 0% 69.44n ± 0% -32.45% (p=0.000 n=10) +AddVV/4-4 117.50n ± 0% 72.18n ± 0% -38.57% (p=0.000 n=10) +AddVV/5-4 132.20n ± 0% 82.79n ± 0% -37.38% (p=0.000 n=10) +AddVV/10-4 216.3n ± 0% 126.8n ± 0% -41.35% (p=0.000 n=10) +AddVV/100-4 1659.0n ± 0% 885.2n ± 0% -46.64% (p=0.000 n=10) +AddVV/1000-4 16.089µ ± 0% 8.400µ ± 0% -47.79% (p=0.000 n=10) +AddVV/10000-4 245.3µ ± 0% 176.9µ ± 0% -27.88% (p=0.000 n=10) +AddVV/100000-4 2.537m ± 0% 1.873m ± 0% -26.17% (p=0.000 n=10) +geomean 1.435µ 904.5n -36.99% + + │ addvv.1 │ addvv.2 │ + │ B/s │ B/s vs base │ +AddVV/1-4 830.9Mi ± 0% 1269.5Mi ± 0% +52.78% (p=0.000 n=10) +AddVV/2-4 1.353Gi ± 0% 2.029Gi ± 0% +50.00% (p=0.000 n=10) +AddVV/3-4 1.739Gi ± 0% 2.575Gi ± 0% +48.09% (p=0.000 n=10) +AddVV/4-4 2.029Gi ± 0% 3.303Gi ± 0% +62.82% (p=0.000 n=10) +AddVV/5-4 2.254Gi ± 0% 3.600Gi ± 0% +59.69% (p=0.000 n=10) +AddVV/10-4 2.755Gi ± 0% 4.699Gi ± 0% +70.54% (p=0.000 n=10) +AddVV/100-4 3.594Gi ± 0% 6.734Gi ± 0% +87.37% (p=0.000 n=10) +AddVV/1000-4 3.705Gi ± 0% 7.096Gi ± 0% +91.54% (p=0.000 n=10) +AddVV/10000-4 2.430Gi ± 0% 3.369Gi ± 0% +38.65% (p=0.000 n=10) +AddVV/100000-4 2.350Gi ± 0% 3.183Gi ± 0% +35.44% (p=0.000 n=10) +geomean 2.119Gi 3.364Gi +58.71% + +Change-Id: I727b3d9f8ab01eada7270046480b1430d56d0a96 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595395 +Reviewed-by: Cherry Mui +Reviewed-by: David Chase +Reviewed-by: M Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Than McIntosh +--- + src/math/big/arith_riscv64.s | 81 +++++++++++++++++++++++++++++++++++- + 1 file changed, 80 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index cb9ac18292..56e7a2bf4f 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -11,7 +11,86 @@ + // arithmetic operations on vectors implemented in arith.go. 
+ + TEXT ·addVV(SB),NOSPLIT,$0 +- JMP ·addVV_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV $0, X29 // c = 0 ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 0(X6), X9 // y[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 8(X6), X12 // y[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 16(X6), X15 // y[2] ++ MOV 24(X5), X17 // x[3] ++ MOV 24(X6), X18 // y[3] ++ ++ ADD X8, X9, X21 // z[0] = x[0] + y[0] ++ SLTU X8, X21, X22 ++ ADD X21, X29, X10 // z[0] = x[0] + y[0] + c ++ SLTU X21, X10, X23 ++ ADD X22, X23, X29 // next c ++ ++ ADD X11, X12, X24 // z[1] = x[1] + y[1] ++ SLTU X11, X24, X25 ++ ADD X24, X29, X13 // z[1] = x[1] + y[1] + c ++ SLTU X24, X13, X26 ++ ADD X25, X26, X29 // next c ++ ++ ADD X14, X15, X21 // z[2] = x[2] + y[2] ++ SLTU X14, X21, X22 ++ ADD X21, X29, X16 // z[2] = x[2] + y[2] + c ++ SLTU X21, X16, X23 ++ ADD X22, X23, X29 // next c ++ ++ ADD X17, X18, X21 // z[3] = x[3] + y[3] ++ SLTU X17, X21, X22 ++ ADD X21, X29, X19 // z[3] = x[3] + y[3] + c ++ SLTU X21, X19, X23 ++ ADD X22, X23, X29 // next c ++ ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X6 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ MOV 0(X6), X11 // y ++ ++ ADD X10, X11, X12 // z = x + y ++ SLTU X10, X12, X14 ++ ADD X12, X29, X13 // z = x + y + c ++ SLTU X12, X13, X15 ++ ADD X14, X15, X29 // next c ++ ++ MOV X13, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X6 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+72(FP) // return c ++ RET + + TEXT ·subVV(SB),NOSPLIT,$0 + JMP ·subVV_g(SB) +-- +2.39.5 + diff --git a/2041-math-big-implement-subVV-in-riscv64-assembly.patch b/2041-math-big-implement-subVV-in-riscv64-assembly.patch new file mode 100644 index 0000000..b7b4d7c --- /dev/null +++ b/2041-math-big-implement-subVV-in-riscv64-assembly.patch @@ -0,0 +1,148 @@ +From e89ee68e880af855a61c4b29f10d6db4aa86598d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 041/119] math/big: implement subVV in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of subVV for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. 
+ +On a StarFive VisionFive 2: + + │ subvv.1 │ subvv.2 │ + │ sec/op │ sec/op vs base │ +SubVV/1-4 73.46n ± 0% 48.08n ± 0% -34.55% (p=0.000 n=10) +SubVV/2-4 88.13n ± 0% 58.76n ± 0% -33.33% (p=0.000 n=10) +SubVV/3-4 102.80n ± 0% 69.45n ± 0% -32.44% (p=0.000 n=10) +SubVV/4-4 117.50n ± 0% 72.11n ± 0% -38.63% (p=0.000 n=10) +SubVV/5-4 132.20n ± 0% 82.80n ± 0% -37.37% (p=0.000 n=10) +SubVV/10-4 216.3n ± 0% 126.9n ± 0% -41.33% (p=0.000 n=10) +SubVV/100-4 1659.0n ± 0% 886.5n ± 0% -46.56% (p=0.000 n=10) +SubVV/1000-4 16.089µ ± 0% 8.401µ ± 0% -47.78% (p=0.000 n=10) +SubVV/10000-4 244.7µ ± 0% 176.8µ ± 0% -27.74% (p=0.000 n=10) +SubVV/100000-4 2.562m ± 0% 1.871m ± 0% -26.96% (p=0.000 n=10) +geomean 1.436µ 904.4n -37.04% + + │ subvv.1 │ subvv.2 │ + │ B/s │ B/s vs base │ +SubVV/1-4 830.9Mi ± 0% 1269.5Mi ± 0% +52.79% (p=0.000 n=10) +SubVV/2-4 1.353Gi ± 0% 2.029Gi ± 0% +49.99% (p=0.000 n=10) +SubVV/3-4 1.739Gi ± 0% 2.575Gi ± 0% +48.06% (p=0.000 n=10) +SubVV/4-4 2.029Gi ± 0% 3.306Gi ± 0% +62.96% (p=0.000 n=10) +SubVV/5-4 2.254Gi ± 0% 3.600Gi ± 0% +59.67% (p=0.000 n=10) +SubVV/10-4 2.755Gi ± 0% 4.699Gi ± 0% +70.53% (p=0.000 n=10) +SubVV/100-4 3.594Gi ± 0% 6.723Gi ± 0% +87.08% (p=0.000 n=10) +SubVV/1000-4 3.705Gi ± 0% 7.095Gi ± 0% +91.52% (p=0.000 n=10) +SubVV/10000-4 2.436Gi ± 0% 3.372Gi ± 0% +38.39% (p=0.000 n=10) +SubVV/100000-4 2.327Gi ± 0% 3.185Gi ± 0% +36.91% (p=0.000 n=10) +geomean 2.118Gi 3.364Gi +58.84% + +Change-Id: I361cb3f4195b27a9f1e9486c9e1fdbeaa94d32b4 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595396 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +Reviewed-by: Carlos Amedee +--- + src/math/big/arith_riscv64.s | 81 +++++++++++++++++++++++++++++++++++- + 1 file changed, 80 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index 56e7a2bf4f..f29933d2a1 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -93,7 +93,86 @@ done: + RET + + TEXT ·subVV(SB),NOSPLIT,$0 +- JMP ·subVV_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV $0, X29 // b = 0 ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 0(X6), X9 // y[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 8(X6), X12 // y[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 16(X6), X15 // y[2] ++ MOV 24(X5), X17 // x[3] ++ MOV 24(X6), X18 // y[3] ++ ++ SUB X9, X8, X21 // z[0] = x[0] - y[0] ++ SLTU X21, X8, X22 ++ SUB X29, X21, X10 // z[0] = x[0] - y[0] - b ++ SLTU X10, X21, X23 ++ ADD X22, X23, X29 // next b ++ ++ SUB X12, X11, X24 // z[1] = x[1] - y[1] ++ SLTU X24, X11, X25 ++ SUB X29, X24, X13 // z[1] = x[1] - y[1] - b ++ SLTU X13, X24, X26 ++ ADD X25, X26, X29 // next b ++ ++ SUB X15, X14, X21 // z[2] = x[2] - y[2] ++ SLTU X21, X14, X22 ++ SUB X29, X21, X16 // z[2] = x[2] - y[2] - b ++ SLTU X16, X21, X23 ++ ADD X22, X23, X29 // next b ++ ++ SUB X18, X17, X21 // z[3] = x[3] - y[3] ++ SLTU X21, X17, X22 ++ SUB X29, X21, X19 // z[3] = x[3] - y[3] - b ++ SLTU X19, X21, X23 ++ ADD X22, X23, X29 // next b ++ ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X6 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ MOV 0(X6), X11 // y ++ ++ SUB X11, X10, X12 // z = x - y ++ SLTU X12, X10, X14 ++ SUB X29, X12, X13 // z = x - y - b ++ SLTU X13, X12, X15 ++ ADD X14, X15, X29 // next b ++ ++ 
MOV X13, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X6 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+72(FP) // return b ++ RET + + TEXT ·addVW(SB),NOSPLIT,$0 + JMP ·addVW_g(SB) +-- +2.39.5 + diff --git a/2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch b/2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch new file mode 100644 index 0000000..f2d97ff --- /dev/null +++ b/2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch @@ -0,0 +1,360 @@ +From 27901e4870c6f988a9a1dbdb0a6aedda301a66bc Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 042/119] cmd/compile: use integer min/max instructions on + riscv64 + +When GORISCV64 enables rva22u64, make use of integer MIN/MINU/MAX/MAXU +instructions in compiler rewrite rules. + +Change-Id: I4e7c514516acad03f2869d4c8936f06582cf7ea9 +Reviewed-on: https://go-review.googlesource.com/c/go/+/559660 +Reviewed-by: David Chase +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +--- + src/cmd/compile/internal/riscv64/ssa.go | 4 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 10 +++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 6 ++ + .../compile/internal/ssa/_gen/genericOps.go | 6 ++ + src/cmd/compile/internal/ssa/opGen.go | 88 +++++++++++++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 81 +++++++++++++++++ + src/cmd/compile/internal/ssagen/ssa.go | 19 ++++ + 7 files changed, 212 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 3c57bc93bc..10fea07e60 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -288,8 +288,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FADDS, ssa.OpRISCV64FSUBS, ssa.OpRISCV64FMULS, ssa.OpRISCV64FDIVS, + ssa.OpRISCV64FEQS, ssa.OpRISCV64FNES, ssa.OpRISCV64FLTS, ssa.OpRISCV64FLES, + ssa.OpRISCV64FADDD, ssa.OpRISCV64FSUBD, ssa.OpRISCV64FMULD, ssa.OpRISCV64FDIVD, +- ssa.OpRISCV64FEQD, ssa.OpRISCV64FNED, ssa.OpRISCV64FLTD, ssa.OpRISCV64FLED, +- ssa.OpRISCV64FSGNJD: ++ ssa.OpRISCV64FEQD, ssa.OpRISCV64FNED, ssa.OpRISCV64FLTD, ssa.OpRISCV64FLED, ssa.OpRISCV64FSGNJD, ++ ssa.OpRISCV64MIN, ssa.OpRISCV64MAX, ssa.OpRISCV64MINU, ssa.OpRISCV64MAXU: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index c2df433315..7d8fb79e17 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -834,3 +834,13 @@ + (F(MADD|NMADD|MSUB|NMSUB)S x y neg:(FNEGS z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)S x y z) + (F(MADD|NMADD|MSUB|NMSUB)D neg:(FNEGD x) y z) && neg.Uses == 1 => (F(NMSUB|MSUB|NMADD|MADD)D x y z) + (F(MADD|NMADD|MSUB|NMSUB)D x y neg:(FNEGD z)) && neg.Uses == 1 => (F(MSUB|NMSUB|MADD|NMADD)D x y z) ++ ++// ++// Optimisations for rva22u64 and above. ++// ++ ++// Integer minimum and maximum. 
++(Min64 x y) && buildcfg.GORISCV64 >= 22 => (MIN x y) ++(Max64 x y) && buildcfg.GORISCV64 >= 22 => (MAX x y) ++(Min64u x y) && buildcfg.GORISCV64 >= 22 => (MINU x y) ++(Max64u x y) && buildcfg.GORISCV64 >= 22 => (MAXU x y) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 13fa91864b..7323cb119c 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -235,6 +235,12 @@ func init() { + {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 + {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint + ++ // Minimum and maximum ++ {name: "MIN", argLength: 2, reg: gp21, asm: "MIN", commutative: true}, // min(arg0,arg1), signed ++ {name: "MAX", argLength: 2, reg: gp21, asm: "MAX", commutative: true}, // max(arg0,arg1), signed ++ {name: "MINU", argLength: 2, reg: gp21, asm: "MINU", commutative: true}, // min(arg0,arg1), unsigned ++ {name: "MAXU", argLength: 2, reg: gp21, asm: "MAXU", commutative: true}, // max(arg0,arg1), unsigned ++ + // Generate boolean values + {name: "SEQZ", argLength: 1, reg: gp11, asm: "SEQZ"}, // arg0 == 0, result is 0 or 1 + {name: "SNEZ", argLength: 1, reg: gp11, asm: "SNEZ"}, // arg0 != 0, result is 0 or 1 +diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go +index fb18319263..95a5a4dda9 100644 +--- a/src/cmd/compile/internal/ssa/_gen/genericOps.go ++++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go +@@ -285,6 +285,12 @@ var genericOps = []opData{ + {name: "Abs", argLength: 1}, // absolute value arg0 + {name: "Copysign", argLength: 2}, // copy sign from arg0 to arg1 + ++ // Integer min/max implementation, if hardware is available. ++ {name: "Min64", argLength: 2}, // min(arg0,arg1), signed ++ {name: "Max64", argLength: 2}, // max(arg0,arg1), signed ++ {name: "Min64u", argLength: 2}, // min(arg0,arg1), unsigned ++ {name: "Max64u", argLength: 2}, // max(arg0,arg1), unsigned ++ + // Float min/max implementation, if hardware is available. 
+ {name: "Min64F", argLength: 2}, // min(arg0,arg1) + {name: "Min32F", argLength: 2}, // min(arg0,arg1) +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index dd80a2c52a..600b8d9e30 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2394,6 +2394,10 @@ const ( + OpRISCV64RORW + OpRISCV64XOR + OpRISCV64XORI ++ OpRISCV64MIN ++ OpRISCV64MAX ++ OpRISCV64MINU ++ OpRISCV64MAXU + OpRISCV64SEQZ + OpRISCV64SNEZ + OpRISCV64SLT +@@ -3035,6 +3039,10 @@ const ( + OpRoundToEven + OpAbs + OpCopysign ++ OpMin64 ++ OpMax64 ++ OpMin64u ++ OpMax64u + OpMin64F + OpMin32F + OpMax64F +@@ -32124,6 +32132,66 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "MIN", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AMIN, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "MAX", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AMAX, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "MINU", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AMINU, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "MAXU", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AMAXU, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "SEQZ", + argLen: 1, +@@ -39373,6 +39441,26 @@ var opcodeTable = [...]opInfo{ + argLen: 2, + generic: true, + }, ++ { ++ name: "Min64", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Max64", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Min64u", ++ argLen: 2, ++ generic: true, ++ }, ++ { ++ name: "Max64u", ++ argLen: 2, ++ generic: true, ++ }, + { + name: "Min64F", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 28c44da5a8..9a13955689 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -4,6 +4,7 @@ package ssa + + import ( + 
"cmd/compile/internal/types" ++ "internal/buildcfg" + "math" + ) + +@@ -331,15 +332,23 @@ func rewriteValueRISCV64(v *Value) bool { + case OpMax32F: + v.Op = OpRISCV64LoweredFMAXS + return true ++ case OpMax64: ++ return rewriteValueRISCV64_OpMax64(v) + case OpMax64F: + v.Op = OpRISCV64LoweredFMAXD + return true ++ case OpMax64u: ++ return rewriteValueRISCV64_OpMax64u(v) + case OpMin32F: + v.Op = OpRISCV64LoweredFMINS + return true ++ case OpMin64: ++ return rewriteValueRISCV64_OpMin64(v) + case OpMin64F: + v.Op = OpRISCV64LoweredFMIND + return true ++ case OpMin64u: ++ return rewriteValueRISCV64_OpMin64u(v) + case OpMod16: + return rewriteValueRISCV64_OpMod16(v) + case OpMod16u: +@@ -2398,6 +2407,78 @@ func rewriteValueRISCV64_OpLsh8x8(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpMax64(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (Max64 x y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (MAX x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ break ++ } ++ v.reset(OpRISCV64MAX) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpMax64u(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (Max64u x y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (MAXU x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ break ++ } ++ v.reset(OpRISCV64MAXU) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpMin64(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (Min64 x y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (MIN x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ break ++ } ++ v.reset(OpRISCV64MIN) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpMin64u(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (Min64u x y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (MINU x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ break ++ } ++ v.reset(OpRISCV64MINU) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpMod16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index 178ccfb59b..a0a3470ea2 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -3624,6 +3624,25 @@ func (s *state) minMax(n *ir.CallExpr) *ssa.Value { + }) + } + ++ if typ.IsInteger() { ++ if Arch.LinkArch.Family == sys.RISCV64 && buildcfg.GORISCV64 >= 22 && typ.Size() == 8 { ++ var op ssa.Op ++ switch { ++ case typ.IsSigned() && n.Op() == ir.OMIN: ++ op = ssa.OpMin64 ++ case typ.IsSigned() && n.Op() == ir.OMAX: ++ op = ssa.OpMax64 ++ case typ.IsUnsigned() && n.Op() == ir.OMIN: ++ op = ssa.OpMin64u ++ case typ.IsUnsigned() && n.Op() == ir.OMAX: ++ op = ssa.OpMax64u ++ } ++ return fold(func(x, a *ssa.Value) *ssa.Value { ++ return s.newValue2(op, typ, x, a) ++ }) ++ } ++ } ++ + lt := s.ssaOp(ir.OLT, typ) + + return fold(func(x, a *ssa.Value) *ssa.Value { +-- +2.39.5 + diff --git a/2043-math-big-implement-addVW-in-riscv64-assembly.patch b/2043-math-big-implement-addVW-in-riscv64-assembly.patch new file mode 100644 index 0000000..6ce9221 --- /dev/null +++ b/2043-math-big-implement-addVW-in-riscv64-assembly.patch @@ -0,0 +1,146 @@ +From bb58f5dd1ad9a3c5f95a372a9e2ee6c97032a442 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: 
Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 043/119] math/big: implement addVW in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of addVW for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. + +On a StarFive VisionFive 2: + + │ addvw.1 │ addvw.2 │ + │ sec/op │ sec/op vs base │ +AddVW/1-4 57.43n ± 0% 41.45n ± 0% -27.83% (p=0.000 n=10) +AddVW/2-4 69.31n ± 0% 48.15n ± 0% -30.53% (p=0.000 n=10) +AddVW/3-4 76.12n ± 0% 54.97n ± 0% -27.79% (p=0.000 n=10) +AddVW/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) +AddVW/5-4 96.16n ± 0% 62.82n ± 0% -34.67% (p=0.000 n=10) +AddVW/10-4 149.60n ± 0% 89.55n ± 0% -40.14% (p=0.000 n=10) +AddVW/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) +AddVW/1000-4 10.732µ ± 0% 5.060µ ± 0% -52.85% (p=0.000 n=10) +AddVW/10000-4 151.7µ ± 0% 103.7µ ± 0% -31.63% (p=0.000 n=10) +AddVW/100000-4 1.523m ± 0% 1.050m ± 0% -31.03% (p=0.000 n=10) +AddVWext/1-4 57.42n ± 0% 41.45n ± 0% -27.81% (p=0.000 n=10) +AddVWext/2-4 69.32n ± 0% 48.15n ± 0% -30.54% (p=0.000 n=10) +AddVWext/3-4 76.12n ± 0% 54.87n ± 0% -27.92% (p=0.000 n=10) +AddVWext/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) +AddVWext/5-4 96.15n ± 0% 62.82n ± 0% -34.66% (p=0.000 n=10) +AddVWext/10-4 149.60n ± 0% 89.55n ± 0% -40.14% (p=0.000 n=10) +AddVWext/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) +AddVWext/1000-4 10.732µ ± 0% 5.060µ ± 0% -52.85% (p=0.000 n=10) +AddVWext/10000-4 150.5µ ± 0% 103.7µ ± 0% -31.10% (p=0.000 n=10) +AddVWext/100000-4 1.530m ± 0% 1.049m ± 0% -31.41% (p=0.000 n=10) +geomean 1.003µ 633.9n -36.79% + + │ addvw.1 │ addvw.2 │ + │ B/s │ B/s vs base │ +AddVW/1-4 132.8Mi ± 0% 184.1Mi ± 0% +38.55% (p=0.000 n=10) +AddVW/2-4 220.1Mi ± 0% 316.9Mi ± 0% +43.96% (p=0.000 n=10) +AddVW/3-4 300.7Mi ± 0% 416.4Mi ± 0% +38.48% (p=0.000 n=10) +AddVW/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.25% (p=0.000 n=10) +AddVW/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.06% (p=0.000 n=10) +AddVW/10-4 510.1Mi ± 0% 852.0Mi ± 0% +67.02% (p=0.000 n=10) +AddVW/100-4 684.1Mi ± 0% 1389.0Mi ± 0% +103.03% (p=0.000 n=10) +AddVW/1000-4 710.9Mi ± 0% 1507.8Mi ± 0% +112.08% (p=0.000 n=10) +AddVW/10000-4 503.1Mi ± 0% 735.8Mi ± 0% +46.26% (p=0.000 n=10) +AddVW/100000-4 501.0Mi ± 0% 726.5Mi ± 0% +45.00% (p=0.000 n=10) +AddVWext/1-4 132.9Mi ± 0% 184.1Mi ± 0% +38.55% (p=0.000 n=10) +AddVWext/2-4 220.1Mi ± 0% 316.9Mi ± 0% +43.98% (p=0.000 n=10) +AddVWext/3-4 300.7Mi ± 0% 417.1Mi ± 0% +38.73% (p=0.000 n=10) +AddVWext/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.25% (p=0.000 n=10) +AddVWext/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.05% (p=0.000 n=10) +AddVWext/10-4 510.1Mi ± 0% 852.0Mi ± 0% +67.02% (p=0.000 n=10) +AddVWext/100-4 684.2Mi ± 0% 1389.0Mi ± 0% +103.02% (p=0.000 n=10) +AddVWext/1000-4 710.9Mi ± 0% 1507.7Mi ± 0% +112.08% (p=0.000 n=10) +AddVWext/10000-4 506.9Mi ± 0% 735.8Mi ± 0% +45.15% (p=0.000 n=10) +AddVWext/100000-4 498.6Mi ± 0% 727.0Mi ± 0% +45.79% (p=0.000 n=10) +geomean 388.3Mi 614.3Mi +58.19% + +Change-Id: Ib14a4b8c1d81e710753bbf6dd5546bbca44fe3f1 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595397 +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +--- + src/math/big/arith_riscv64.s | 59 +++++++++++++++++++++++++++++++++++- + 1 file changed, 58 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index f29933d2a1..eb11de7a2c 100644 +--- 
a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -175,7 +175,64 @@ done: + RET + + TEXT ·addVW(SB),NOSPLIT,$0 +- JMP ·addVW_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV X6, X29 // c = y ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 24(X5), X17 // x[3] ++ ++ ADD X8, X29, X10 // z[0] = x[0] + c ++ SLTU X8, X10, X29 // next c ++ ++ ADD X11, X29, X13 // z[1] = x[1] + c ++ SLTU X11, X13, X29 // next c ++ ++ ADD X14, X29, X16 // z[2] = x[2] + c ++ SLTU X14, X16, X29 // next c ++ ++ ADD X17, X29, X19 // z[3] = x[3] + c ++ SLTU X17, X19, X29 // next c ++ ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ ++ ADD X10, X29, X12 // z = x + c ++ SLTU X10, X12, X29 // next c ++ ++ MOV X12, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+56(FP) // return c ++ RET + + TEXT ·subVW(SB),NOSPLIT,$0 + JMP ·subVW_g(SB) +-- +2.39.5 + diff --git a/2044-math-big-implement-subVW-in-riscv64-assembly.patch b/2044-math-big-implement-subVW-in-riscv64-assembly.patch new file mode 100644 index 0000000..18d93ff --- /dev/null +++ b/2044-math-big-implement-subVW-in-riscv64-assembly.patch @@ -0,0 +1,146 @@ +From 5b10906c01a84e75ff5eb8a95cec158d53bea3ee Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 044/119] math/big: implement subVW in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of subVW for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. 
+ +On a StarFive VisionFive 2: + + │ subvw.1 │ subvw.2 │ + │ sec/op │ sec/op vs base │ +SubVW/1-4 57.43n ± 0% 41.45n ± 0% -27.82% (p=0.000 n=10) +SubVW/2-4 69.31n ± 0% 48.15n ± 0% -30.53% (p=0.000 n=10) +SubVW/3-4 76.12n ± 0% 54.87n ± 0% -27.92% (p=0.000 n=10) +SubVW/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) +SubVW/5-4 96.15n ± 0% 62.83n ± 0% -34.65% (p=0.000 n=10) +SubVW/10-4 149.60n ± 0% 89.55n ± 0% -40.14% (p=0.000 n=10) +SubVW/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) +SubVW/1000-4 10.732µ ± 0% 5.071µ ± 0% -52.75% (p=0.000 n=10) +SubVW/10000-4 153.0µ ± 0% 103.7µ ± 0% -32.21% (p=0.000 n=10) +SubVW/100000-4 1.542m ± 0% 1.046m ± 0% -32.13% (p=0.000 n=10) +SubVWext/1-4 57.42n ± 0% 41.45n ± 0% -27.81% (p=0.000 n=10) +SubVWext/2-4 69.33n ± 0% 48.15n ± 0% -30.55% (p=0.000 n=10) +SubVWext/3-4 76.12n ± 0% 54.93n ± 0% -27.84% (p=0.000 n=10) +SubVWext/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) +SubVWext/5-4 96.15n ± 0% 62.83n ± 0% -34.65% (p=0.000 n=10) +SubVWext/10-4 149.60n ± 0% 89.56n ± 0% -40.14% (p=0.000 n=10) +SubVWext/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) +SubVWext/1000-4 10.732µ ± 0% 5.061µ ± 0% -52.84% (p=0.000 n=10) +SubVWext/10000-4 152.5µ ± 0% 103.7µ ± 0% -32.02% (p=0.000 n=10) +SubVWext/100000-4 1.533m ± 0% 1.046m ± 0% -31.75% (p=0.000 n=10) +geomean 1.005µ 633.7n -36.92% + + │ subvw.1 │ subvw.2 │ + │ B/s │ B/s vs base │ +SubVW/1-4 132.9Mi ± 0% 184.1Mi ± 0% +38.54% (p=0.000 n=10) +SubVW/2-4 220.1Mi ± 0% 316.9Mi ± 0% +43.95% (p=0.000 n=10) +SubVW/3-4 300.7Mi ± 0% 417.1Mi ± 0% +38.72% (p=0.000 n=10) +SubVW/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.24% (p=0.000 n=10) +SubVW/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.03% (p=0.000 n=10) +SubVW/10-4 510.1Mi ± 0% 851.9Mi ± 0% +67.01% (p=0.000 n=10) +SubVW/100-4 684.2Mi ± 0% 1388.9Mi ± 0% +102.99% (p=0.000 n=10) +SubVW/1000-4 710.9Mi ± 0% 1504.5Mi ± 0% +111.63% (p=0.000 n=10) +SubVW/10000-4 498.7Mi ± 0% 735.7Mi ± 0% +47.52% (p=0.000 n=10) +SubVW/100000-4 494.8Mi ± 0% 729.1Mi ± 0% +47.34% (p=0.000 n=10) +SubVWext/1-4 132.9Mi ± 0% 184.1Mi ± 0% +38.53% (p=0.000 n=10) +SubVWext/2-4 220.1Mi ± 0% 316.9Mi ± 0% +44.00% (p=0.000 n=10) +SubVWext/3-4 300.7Mi ± 0% 416.7Mi ± 0% +38.57% (p=0.000 n=10) +SubVWext/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.24% (p=0.000 n=10) +SubVWext/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.04% (p=0.000 n=10) +SubVWext/10-4 510.1Mi ± 0% 851.9Mi ± 0% +67.01% (p=0.000 n=10) +SubVWext/100-4 684.2Mi ± 0% 1388.9Mi ± 0% +102.99% (p=0.000 n=10) +SubVWext/1000-4 710.9Mi ± 0% 1507.6Mi ± 0% +112.07% (p=0.000 n=10) +SubVWext/10000-4 500.1Mi ± 0% 735.7Mi ± 0% +47.10% (p=0.000 n=10) +SubVWext/100000-4 497.8Mi ± 0% 729.4Mi ± 0% +46.52% (p=0.000 n=10) +geomean 387.6Mi 614.5Mi +58.51% + +Change-Id: I9d7fac719e977710ad9db9121fa298db6df605de +Reviewed-on: https://go-review.googlesource.com/c/go/+/595398 +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +--- + src/math/big/arith_riscv64.s | 59 +++++++++++++++++++++++++++++++++++- + 1 file changed, 58 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index eb11de7a2c..6aca1b6d6c 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -235,7 +235,64 @@ done: + RET + + TEXT ·subVW(SB),NOSPLIT,$0 +- JMP ·subVW_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV X6, X29 // b = y ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ 
MOV 8(X5), X11 // x[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 24(X5), X17 // x[3] ++ ++ SUB X29, X8, X10 // z[0] = x[0] - b ++ SLTU X10, X8, X29 // next b ++ ++ SUB X29, X11, X13 // z[1] = x[1] - b ++ SLTU X13, X11, X29 // next b ++ ++ SUB X29, X14, X16 // z[2] = x[2] - b ++ SLTU X16, X14, X29 // next b ++ ++ SUB X29, X17, X19 // z[3] = x[3] - b ++ SLTU X19, X17, X29 // next b ++ ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ ++ SUB X29, X10, X12 // z = x - b ++ SLTU X12, X10, X29 // next b ++ ++ MOV X12, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+56(FP) // return b ++ RET + + TEXT ·shlVU(SB),NOSPLIT,$0 + JMP ·shlVU_g(SB) +-- +2.39.5 + diff --git a/2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch b/2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch new file mode 100644 index 0000000..543143f --- /dev/null +++ b/2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch @@ -0,0 +1,349 @@ +From 6cbde165e70d9890431c1e452bb29a6fc5c963c8 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 045/119] crypto/sha256: provide optimised assembly for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide an optimised assembly implementation of sha256 for riscv64. +This results in considerable performance gains. + +On a StarFive VisionFive 2: + + │ sha256.1 │ sha256.2 │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-4 7.820µ ± 0% 5.193µ ± 0% -33.59% (p=0.000 n=10) +Hash8Bytes/Sum224-4 7.918µ ± 0% 5.294µ ± 0% -33.15% (p=0.000 n=10) +Hash8Bytes/Sum256-4 7.950µ ± 0% 5.320µ ± 0% -33.08% (p=0.000 n=10) +Hash1K/New-4 108.03µ ± 0% 66.12µ ± 0% -38.79% (p=0.000 n=10) +Hash1K/Sum224-4 108.12µ ± 0% 66.22µ ± 0% -38.76% (p=0.000 n=10) +Hash1K/Sum256-4 108.15µ ± 0% 66.24µ ± 0% -38.75% (p=0.000 n=10) +Hash8K/New-4 808.5µ ± 0% 493.0µ ± 0% -39.02% (p=0.000 n=10) +Hash8K/Sum224-4 808.6µ ± 0% 493.1µ ± 0% -39.02% (p=0.000 n=10) +Hash8K/Sum256-4 808.6µ ± 0% 493.1µ ± 0% -39.02% (p=0.000 n=10) +geomean 88.37µ 55.61µ -37.08% + + │ sha256.1 │ sha256.2 │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 996.1Ki ± 0% 1503.9Ki ± 0% +50.98% (p=0.000 n=10) +Hash8Bytes/Sum224-4 986.3Ki ± 0% 1474.6Ki ± 0% +49.50% (p=0.000 n=10) +Hash8Bytes/Sum256-4 986.3Ki ± 0% 1464.8Ki ± 0% +48.51% (p=0.000 n=10) +Hash1K/New-4 9.041Mi ± 0% 14.772Mi ± 0% +63.40% (p=0.000 n=10) +Hash1K/Sum224-4 9.031Mi ± 0% 14.744Mi ± 0% +63.25% (p=0.000 n=10) +Hash1K/Sum256-4 9.031Mi ± 0% 14.744Mi ± 0% +63.25% (p=0.000 n=10) +Hash8K/New-4 9.661Mi ± 0% 15.850Mi ± 0% +64.07% (p=0.000 n=10) +Hash8K/Sum224-4 9.661Mi ± 0% 15.841Mi ± 0% +63.97% (p=0.000 n=10) +Hash8K/Sum256-4 9.661Mi ± 0% 15.841Mi ± 0% +63.97% (p=0.000 n=10) +geomean 4.386Mi 6.966Mi +58.85% + +Change-Id: Ieead7b7c02291d70ddc472a7a8cf3c044c1da4b3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/519695 +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +--- + src/crypto/sha256/sha256block_decl.go | 2 +- + src/crypto/sha256/sha256block_generic.go | 2 +- + src/crypto/sha256/sha256block_riscv64.s | 261 +++++++++++++++++++++++ + 3 files changed, 263 insertions(+), 2 deletions(-) + create mode 100644 src/crypto/sha256/sha256block_riscv64.s + +diff --git 
a/src/crypto/sha256/sha256block_decl.go b/src/crypto/sha256/sha256block_decl.go +index 7d68cd95fe..0646ef3685 100644 +--- a/src/crypto/sha256/sha256block_decl.go ++++ b/src/crypto/sha256/sha256block_decl.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build 386 || amd64 || s390x || ppc64le || ppc64 ++//go:build 386 || amd64 || s390x || ppc64le || ppc64 || riscv64 + + package sha256 + +diff --git a/src/crypto/sha256/sha256block_generic.go b/src/crypto/sha256/sha256block_generic.go +index fd098bec89..125eb8effb 100644 +--- a/src/crypto/sha256/sha256block_generic.go ++++ b/src/crypto/sha256/sha256block_generic.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !386 && !s390x && !ppc64le && !ppc64 && !arm64 ++//go:build !amd64 && !386 && !s390x && !ppc64le && !ppc64 && !arm64 && !riscv64 + + package sha256 + +diff --git a/src/crypto/sha256/sha256block_riscv64.s b/src/crypto/sha256/sha256block_riscv64.s +new file mode 100644 +index 0000000000..fc7bf65e41 +--- /dev/null ++++ b/src/crypto/sha256/sha256block_riscv64.s +@@ -0,0 +1,261 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "textflag.h" ++ ++// SHA256 block routine. See sha256block.go for Go equivalent. ++// ++// The algorithm is detailed in FIPS 180-4: ++// ++// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf ++// ++// Wt = Mt; for 0 <= t <= 15 ++// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 ++// ++// a = H0 ++// b = H1 ++// c = H2 ++// d = H3 ++// e = H4 ++// f = H5 ++// g = H6 ++// h = H7 ++// ++// for t = 0 to 63 { ++// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt ++// T2 = BIGSIGMA0(a) + Maj(a,b,c) ++// h = g ++// g = f ++// f = e ++// e = d + T1 ++// d = c ++// c = b ++// b = a ++// a = T1 + T2 ++// } ++// ++// H0 = a + H0 ++// H1 = b + H1 ++// H2 = c + H2 ++// H3 = d + H3 ++// H4 = e + H4 ++// H5 = f + H5 ++// H6 = g + H6 ++// H7 = h + H7 ++ ++// Wt = Mt; for 0 <= t <= 15 ++#define MSGSCHEDULE0(index) \ ++ MOVBU ((index*4)+0)(X29), X5; \ ++ MOVBU ((index*4)+1)(X29), X6; \ ++ MOVBU ((index*4)+2)(X29), X7; \ ++ MOVBU ((index*4)+3)(X29), X8; \ ++ SLL $24, X5; \ ++ SLL $16, X6; \ ++ OR X5, X6, X5; \ ++ SLL $8, X7; \ ++ OR X5, X7, X5; \ ++ OR X5, X8, X5; \ ++ MOVW X5, (index*4)(X19) ++ ++// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 ++// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x) ++// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x) ++#define MSGSCHEDULE1(index) \ ++ MOVWU (((index-2)&0xf)*4)(X19), X5; \ ++ MOVWU (((index-15)&0xf)*4)(X19), X6; \ ++ MOVWU (((index-7)&0xf)*4)(X19), X9; \ ++ MOVWU (((index-16)&0xf)*4)(X19), X21; \ ++ RORW $17, X5, X7; \ ++ RORW $19, X5, X8; \ ++ SRL $10, X5; \ ++ XOR X7, X5; \ ++ XOR X8, X5; \ ++ ADD X9, X5; \ ++ RORW $7, X6, X7; \ ++ RORW $18, X6, X8; \ ++ SRL $3, X6; \ ++ XOR X7, X6; \ ++ XOR X8, X6; \ ++ ADD X6, X5; \ ++ ADD X21, X5; \ ++ MOVW X5, ((index&0xf)*4)(X19) ++ ++// Calculate T1 in X5. ++// h is also used as an accumulator. Wt is passed in X5. 
++// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt ++// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x) ++// Ch(x, y, z) = (x AND y) XOR (NOT x AND z) ++#define SHA256T1(index, e, f, g, h) \ ++ MOVWU (index*4)(X18), X8; \ ++ ADD X5, h; \ ++ RORW $6, e, X6; \ ++ ADD X8, h; \ ++ RORW $11, e, X7; \ ++ XOR X7, X6; \ ++ RORW $25, e, X8; \ ++ XOR X8, X6; \ ++ ADD X6, h; \ ++ AND e, f, X5; \ ++ NOT e, X7; \ ++ AND g, X7; \ ++ XOR X7, X5; \ ++ ADD h, X5 ++ ++// Calculate T2 in X6. ++// T2 = BIGSIGMA0(a) + Maj(a, b, c) ++// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x) ++// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) ++#define SHA256T2(a, b, c) \ ++ RORW $2, a, X6; \ ++ RORW $13, a, X7; \ ++ XOR X7, X6; \ ++ RORW $22, a, X8; \ ++ XOR X8, X6; \ ++ AND a, b, X7; \ ++ AND a, c, X8; \ ++ XOR X8, X7; \ ++ AND b, c, X9; \ ++ XOR X9, X7; \ ++ ADD X7, X6 ++ ++// Calculate T1 and T2, then e = d + T1 and a = T1 + T2. ++// The values for e and a are stored in d and h, ready for rotation. ++#define SHA256ROUND(index, a, b, c, d, e, f, g, h) \ ++ SHA256T1(index, e, f, g, h); \ ++ SHA256T2(a, b, c); \ ++ MOV X6, h; \ ++ ADD X5, d; \ ++ ADD X5, h ++ ++#define SHA256ROUND0(index, a, b, c, d, e, f, g, h) \ ++ MSGSCHEDULE0(index); \ ++ SHA256ROUND(index, a, b, c, d, e, f, g, h) ++ ++#define SHA256ROUND1(index, a, b, c, d, e, f, g, h) \ ++ MSGSCHEDULE1(index); \ ++ SHA256ROUND(index, a, b, c, d, e, f, g, h) ++ ++// Note that 64 bytes of stack space is used as a circular buffer ++// for the message schedule (4 bytes * 16 entries). ++// ++// func block(dig *digest, p []byte) ++TEXT ·block(SB),0,$64-32 ++ MOV p_base+8(FP), X29 ++ MOV p_len+16(FP), X30 ++ SRL $6, X30 ++ SLL $6, X30 ++ ++ ADD X29, X30, X28 ++ BEQ X28, X29, end ++ ++ MOV ·_K(SB), X18 // const table ++ ADD $8, X2, X19 // message schedule ++ ++ MOV dig+0(FP), X20 ++ MOVWU (0*4)(X20), X10 // a = H0 ++ MOVWU (1*4)(X20), X11 // b = H1 ++ MOVWU (2*4)(X20), X12 // c = H2 ++ MOVWU (3*4)(X20), X13 // d = H3 ++ MOVWU (4*4)(X20), X14 // e = H4 ++ MOVWU (5*4)(X20), X15 // f = H5 ++ MOVWU (6*4)(X20), X16 // g = H6 ++ MOVWU (7*4)(X20), X17 // h = H7 ++ ++loop: ++ SHA256ROUND0(0, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND0(1, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND0(2, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND0(3, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND0(4, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND0(5, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND0(6, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND0(7, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND0(8, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND0(9, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND0(10, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND0(11, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND0(12, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND0(13, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND0(14, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND0(15, X11, X12, X13, X14, X15, X16, X17, X10) ++ ++ SHA256ROUND1(16, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(17, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(18, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(19, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(20, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(21, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(22, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(23, X11, X12, X13, X14, 
X15, X16, X17, X10) ++ SHA256ROUND1(24, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(25, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(26, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(27, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(28, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(29, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(30, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(31, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND1(32, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(33, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(34, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(35, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(36, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(37, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(38, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(39, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND1(40, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(41, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(42, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(43, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(44, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(45, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(46, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(47, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND1(48, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(49, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(50, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(51, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(52, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(53, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(54, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(55, X11, X12, X13, X14, X15, X16, X17, X10) ++ SHA256ROUND1(56, X10, X11, X12, X13, X14, X15, X16, X17) ++ SHA256ROUND1(57, X17, X10, X11, X12, X13, X14, X15, X16) ++ SHA256ROUND1(58, X16, X17, X10, X11, X12, X13, X14, X15) ++ SHA256ROUND1(59, X15, X16, X17, X10, X11, X12, X13, X14) ++ SHA256ROUND1(60, X14, X15, X16, X17, X10, X11, X12, X13) ++ SHA256ROUND1(61, X13, X14, X15, X16, X17, X10, X11, X12) ++ SHA256ROUND1(62, X12, X13, X14, X15, X16, X17, X10, X11) ++ SHA256ROUND1(63, X11, X12, X13, X14, X15, X16, X17, X10) ++ ++ MOVWU (0*4)(X20), X5 ++ MOVWU (1*4)(X20), X6 ++ MOVWU (2*4)(X20), X7 ++ MOVWU (3*4)(X20), X8 ++ ADD X5, X10 // H0 = a + H0 ++ ADD X6, X11 // H1 = b + H1 ++ ADD X7, X12 // H2 = c + H2 ++ ADD X8, X13 // H3 = d + H3 ++ MOVW X10, (0*4)(X20) ++ MOVW X11, (1*4)(X20) ++ MOVW X12, (2*4)(X20) ++ MOVW X13, (3*4)(X20) ++ MOVWU (4*4)(X20), X5 ++ MOVWU (5*4)(X20), X6 ++ MOVWU (6*4)(X20), X7 ++ MOVWU (7*4)(X20), X8 ++ ADD X5, X14 // H4 = e + H4 ++ ADD X6, X15 // H5 = f + H5 ++ ADD X7, X16 // H6 = g + H6 ++ ADD X8, X17 // H7 = h + H7 ++ MOVW X14, (4*4)(X20) ++ MOVW X15, (5*4)(X20) ++ MOVW X16, (6*4)(X20) ++ MOVW X17, (7*4)(X20) ++ ++ ADD $64, X29 ++ BNE X28, X29, loop ++ ++end: ++ RET +-- +2.39.5 + diff --git a/2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch b/2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch new file mode 100644 index 0000000..c0151ac --- /dev/null +++ b/2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch @@ -0,0 +1,141 @@ +From e24c5f2b3d687d5f7870107fa2c4ec28833b142a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 046/119] 
math/big: implement mulAddVWW in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of mulAddVWW for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. + +On a StarFive VisionFive 2: + + │ muladdvww.1 │ muladdvww.2 │ + │ sec/op │ sec/op vs base │ +MulAddVWW/1-4 68.18n ± 0% 65.49n ± 0% -3.95% (p=0.000 n=10) +MulAddVWW/2-4 82.81n ± 0% 78.85n ± 0% -4.78% (p=0.000 n=10) +MulAddVWW/3-4 97.49n ± 0% 72.18n ± 0% -25.96% (p=0.000 n=10) +MulAddVWW/4-4 112.20n ± 0% 85.54n ± 0% -23.76% (p=0.000 n=10) +MulAddVWW/5-4 126.90n ± 0% 98.90n ± 0% -22.06% (p=0.000 n=10) +MulAddVWW/10-4 200.3n ± 0% 144.3n ± 0% -27.96% (p=0.000 n=10) +MulAddVWW/100-4 1532.0n ± 0% 860.0n ± 0% -43.86% (p=0.000 n=10) +MulAddVWW/1000-4 14.757µ ± 0% 8.076µ ± 0% -45.27% (p=0.000 n=10) +MulAddVWW/10000-4 204.0µ ± 0% 137.1µ ± 0% -32.77% (p=0.000 n=10) +MulAddVWW/100000-4 2.066m ± 0% 1.382m ± 0% -33.12% (p=0.000 n=10) +geomean 1.311µ 950.0n -27.51% + + │ muladdvww.1 │ muladdvww.2 │ + │ B/s │ B/s vs base │ +MulAddVWW/1-4 895.1Mi ± 0% 932.0Mi ± 0% +4.11% (p=0.000 n=10) +MulAddVWW/2-4 1.440Gi ± 0% 1.512Gi ± 0% +5.02% (p=0.000 n=10) +MulAddVWW/3-4 1.834Gi ± 0% 2.477Gi ± 0% +35.07% (p=0.000 n=10) +MulAddVWW/4-4 2.125Gi ± 0% 2.787Gi ± 0% +31.15% (p=0.000 n=10) +MulAddVWW/5-4 2.349Gi ± 0% 3.013Gi ± 0% +28.28% (p=0.000 n=10) +MulAddVWW/10-4 2.975Gi ± 0% 4.130Gi ± 0% +38.79% (p=0.000 n=10) +MulAddVWW/100-4 3.891Gi ± 0% 6.930Gi ± 0% +78.11% (p=0.000 n=10) +MulAddVWW/1000-4 4.039Gi ± 0% 7.380Gi ± 0% +82.72% (p=0.000 n=10) +MulAddVWW/10000-4 2.922Gi ± 0% 4.346Gi ± 0% +48.74% (p=0.000 n=10) +MulAddVWW/100000-4 2.884Gi ± 0% 4.313Gi ± 0% +49.52% (p=0.000 n=10) +geomean 2.321Gi 3.202Gi +37.95% + +Change-Id: If08191607913ce5c7641f34bae8fa5c9dfb44777 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595399 +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +--- + src/math/big/arith_riscv64.s | 74 +++++++++++++++++++++++++++++++++++- + 1 file changed, 73 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index 6aca1b6d6c..c6b32ae1cc 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -301,7 +301,79 @@ TEXT ·shrVU(SB),NOSPLIT,$0 + JMP ·shrVU_g(SB) + + TEXT ·mulAddVWW(SB),NOSPLIT,$0 +- JMP ·mulAddVWW_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV z_len+8(FP), X30 ++ MOV r+56(FP), X29 ++ ++ MOV $4, X28 ++ ++ BEQ ZERO, X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 24(X5), X17 // x[3] ++ ++ MULHU X8, X6, X9 // z_hi[0] = x[0] * y ++ MUL X8, X6, X8 // z_lo[0] = x[0] * y ++ ADD X8, X29, X10 // z[0] = z_lo[0] + c ++ SLTU X8, X10, X23 ++ ADD X23, X9, X29 // next c ++ ++ MULHU X11, X6, X12 // z_hi[1] = x[1] * y ++ MUL X11, X6, X11 // z_lo[1] = x[1] * y ++ ADD X11, X29, X13 // z[1] = z_lo[1] + c ++ SLTU X11, X13, X23 ++ ADD X23, X12, X29 // next c ++ ++ MULHU X14, X6, X15 // z_hi[2] = x[2] * y ++ MUL X14, X6, X14 // z_lo[2] = x[2] * y ++ ADD X14, X29, X16 // z[2] = z_lo[2] + c ++ SLTU X14, X16, X23 ++ ADD X23, X15, X29 // next c ++ ++ MULHU X17, X6, X18 // z_hi[3] = x[3] * y ++ MUL X17, X6, X17 // z_lo[3] = x[3] * y ++ ADD X17, X29, X19 // z[3] = z_lo[3] + c ++ SLTU X17, X19, X23 ++ ADD X23, X18, X29 // next c ++ ++ MOV X10, 0(X7) // z[0] ++ MOV 
X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ ++ MULHU X10, X6, X12 // z_hi = x * y ++ MUL X10, X6, X10 // z_lo = x * y ++ ADD X10, X29, X13 // z_lo + c ++ SLTU X10, X13, X15 ++ ADD X12, X15, X29 // next c ++ ++ MOV X13, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+64(FP) // return c ++ RET + + TEXT ·addMulVVW(SB),NOSPLIT,$0 + JMP ·addMulVVW_g(SB) +-- +2.39.5 + diff --git a/2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch b/2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch new file mode 100644 index 0000000..41ae569 --- /dev/null +++ b/2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch @@ -0,0 +1,158 @@ +From e959b79c7113f225df7e89d50c965fae3d25ad1e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 047/119] math/big: implement addMulVVW in riscv64 assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides an assembly implementation of addMulVVW for riscv64, +processing up to four words per loop, resulting in a significant +performance gain. + +On a StarFive VisionFive 2: + + │ addmulvvw.1 │ addmulvvw.2 │ + │ sec/op │ sec/op vs base │ +AddMulVVW/1-4 65.49n ± 0% 50.79n ± 0% -22.44% (p=0.000 n=10) +AddMulVVW/2-4 82.81n ± 0% 66.83n ± 0% -19.29% (p=0.000 n=10) +AddMulVVW/3-4 100.20n ± 0% 82.87n ± 0% -17.30% (p=0.000 n=10) +AddMulVVW/4-4 117.50n ± 0% 84.20n ± 0% -28.34% (p=0.000 n=10) +AddMulVVW/5-4 134.9n ± 0% 100.3n ± 0% -25.69% (p=0.000 n=10) +AddMulVVW/10-4 221.7n ± 0% 164.4n ± 0% -25.85% (p=0.000 n=10) +AddMulVVW/100-4 1.794µ ± 0% 1.250µ ± 0% -30.32% (p=0.000 n=10) +AddMulVVW/1000-4 17.42µ ± 0% 12.08µ ± 0% -30.68% (p=0.000 n=10) +AddMulVVW/10000-4 254.9µ ± 0% 214.8µ ± 0% -15.75% (p=0.000 n=10) +AddMulVVW/100000-4 2.569m ± 0% 2.178m ± 0% -15.20% (p=0.000 n=10) +geomean 1.443µ 1.107µ -23.29% + + │ addmulvvw.1 │ addmulvvw.2 │ + │ B/s │ B/s vs base │ +AddMulVVW/1-4 932.0Mi ± 0% 1201.6Mi ± 0% +28.93% (p=0.000 n=10) +AddMulVVW/2-4 1.440Gi ± 0% 1.784Gi ± 0% +23.90% (p=0.000 n=10) +AddMulVVW/3-4 1.785Gi ± 0% 2.158Gi ± 0% +20.87% (p=0.000 n=10) +AddMulVVW/4-4 2.029Gi ± 0% 2.832Gi ± 0% +39.59% (p=0.000 n=10) +AddMulVVW/5-4 2.209Gi ± 0% 2.973Gi ± 0% +34.55% (p=0.000 n=10) +AddMulVVW/10-4 2.689Gi ± 0% 3.626Gi ± 0% +34.86% (p=0.000 n=10) +AddMulVVW/100-4 3.323Gi ± 0% 4.770Gi ± 0% +43.54% (p=0.000 n=10) +AddMulVVW/1000-4 3.421Gi ± 0% 4.936Gi ± 0% +44.27% (p=0.000 n=10) +AddMulVVW/10000-4 2.338Gi ± 0% 2.776Gi ± 0% +18.69% (p=0.000 n=10) +AddMulVVW/100000-4 2.320Gi ± 0% 2.736Gi ± 0% +17.93% (p=0.000 n=10) +geomean 2.109Gi 2.749Gi +30.36% + +Change-Id: I6c7ee48233c53ff9b6a5a9002675886cd9bff5af +Reviewed-on: https://go-review.googlesource.com/c/go/+/595400 +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +--- + src/math/big/arith_riscv64.s | 93 +++++++++++++++++++++++++++++++++++- + 1 file changed, 92 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s +index c6b32ae1cc..cc96d3145c 100644 +--- a/src/math/big/arith_riscv64.s ++++ b/src/math/big/arith_riscv64.s +@@ -376,5 +376,96 @@ done: + RET + + TEXT ·addMulVVW(SB),NOSPLIT,$0 +- JMP ·addMulVVW_g(SB) ++ MOV x+24(FP), X5 ++ MOV y+48(FP), X6 ++ MOV z+0(FP), X7 ++ MOV 
z_len+8(FP), X30 ++ ++ MOV $4, X28 ++ MOV $0, X29 // c = 0 ++ ++ BEQZ X30, done ++ BLTU X30, X28, loop1 ++ ++loop4: ++ MOV 0(X5), X8 // x[0] ++ MOV 0(X7), X10 // z[0] ++ MOV 8(X5), X11 // x[1] ++ MOV 8(X7), X13 // z[1] ++ MOV 16(X5), X14 // x[2] ++ MOV 16(X7), X16 // z[2] ++ MOV 24(X5), X17 // x[3] ++ MOV 24(X7), X19 // z[3] ++ ++ MULHU X8, X6, X9 // z_hi[0] = x[0] * y ++ MUL X8, X6, X8 // z_lo[0] = x[0] * y ++ ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0] ++ SLTU X8, X21, X22 ++ ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0] ++ ADD X21, X29, X10 // z[0] = x[0] * y + z[0] + c ++ SLTU X21, X10, X22 ++ ADD X9, X22, X29 // next c ++ ++ MULHU X11, X6, X12 // z_hi[1] = x[1] * y ++ MUL X11, X6, X11 // z_lo[1] = x[1] * y ++ ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1] ++ SLTU X11, X21, X22 ++ ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1] ++ ADD X21, X29, X13 // z[1] = x[1] * y + z[1] + c ++ SLTU X21, X13, X22 ++ ADD X12, X22, X29 // next c ++ ++ MULHU X14, X6, X15 // z_hi[2] = x[2] * y ++ MUL X14, X6, X14 // z_lo[2] = x[2] * y ++ ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2] ++ SLTU X14, X21, X22 ++ ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2] ++ ADD X21, X29, X16 // z[2] = x[2] * y + z[2] + c ++ SLTU X21, X16, X22 ++ ADD X15, X22, X29 // next c ++ ++ MULHU X17, X6, X18 // z_hi[3] = x[3] * y ++ MUL X17, X6, X17 // z_lo[3] = x[3] * y ++ ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3] ++ SLTU X17, X21, X22 ++ ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3] ++ ADD X21, X29, X19 // z[3] = x[3] * y + z[3] + c ++ SLTU X21, X19, X22 ++ ADD X18, X22, X29 // next c + ++ MOV X10, 0(X7) // z[0] ++ MOV X13, 8(X7) // z[1] ++ MOV X16, 16(X7) // z[2] ++ MOV X19, 24(X7) // z[3] ++ ++ ADD $32, X5 ++ ADD $32, X7 ++ SUB $4, X30 ++ ++ BGEU X30, X28, loop4 ++ BEQZ X30, done ++ ++loop1: ++ MOV 0(X5), X10 // x ++ MOV 0(X7), X11 // z ++ ++ MULHU X10, X6, X12 // z_hi = x * y ++ MUL X10, X6, X10 // z_lo = x * y ++ ADD X10, X11, X13 // z_lo = x * y + z ++ SLTU X10, X13, X15 ++ ADD X12, X15, X12 // z_hi = x * y + z ++ ADD X13, X29, X10 // z = x * y + z + c ++ SLTU X13, X10, X15 ++ ADD X12, X15, X29 // next c ++ ++ MOV X10, 0(X7) // z ++ ++ ADD $8, X5 ++ ADD $8, X7 ++ SUB $1, X30 ++ ++ BNEZ X30, loop1 ++ ++done: ++ MOV X29, c+56(FP) // return c ++ RET +-- +2.39.5 + diff --git a/2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch b/2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch new file mode 100644 index 0000000..6500317 --- /dev/null +++ b/2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch @@ -0,0 +1,63 @@ +From 0626074d153ea7ade203d2946d97919e7c682700 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 048/119] test/codegen: add initial codegen tests for integer + min/max + +Change-Id: I006370053748edbec930c7279ee88a805009aa0d +Reviewed-on: https://go-review.googlesource.com/c/go/+/606976 +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +Reviewed-by: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +--- + test/codegen/arithmetic.go | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index f381b34ade..5f4ce9c76f 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -588,3 +588,39 @@ func constantFold3(i, j int) int { + r := (5 * i) * (6 * j) + return r + } ++ ++// ----------------- // ++// Integer Min/Max // ++// ----------------- // ++ ++func Int64Min(a, b int64) int64 { ++ // amd64: "CMPQ","CMOVQLT" ++ // arm64: 
"CMP","CSEL" ++ // riscv64/rva20u64:"BLT\t" ++ // riscv64/rva22u64:"MIN\t" ++ return min(a, b) ++} ++ ++func Int64Max(a, b int64) int64 { ++ // amd64: "CMPQ","CMOVQGT" ++ // arm64: "CMP","CSEL" ++ // riscv64/rva20u64:"BLT\t" ++ // riscv64/rva22u64:"MAX\t" ++ return max(a, b) ++} ++ ++func Uint64Min(a, b uint64) uint64 { ++ // amd64: "CMPQ","CMOVQCS" ++ // arm64: "CMP","CSEL" ++ // riscv64/rva20u64:"BLTU" ++ // riscv64/rva22u64:"MINU" ++ return min(a, b) ++} ++ ++func Uint64Max(a, b uint64) uint64 { ++ // amd64: "CMPQ","CMOVQHI" ++ // arm64: "CMP","CSEL" ++ // riscv64/rva20u64:"BLTU" ++ // riscv64/rva22u64:"MAXU" ++ return max(a, b) ++} +-- +2.39.5 + diff --git a/2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch b/2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch new file mode 100644 index 0000000..5fce80d --- /dev/null +++ b/2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch @@ -0,0 +1,232 @@ +From 8afd6098ad3cbdbe64f7108459f3954791e1391f Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 049/119] cmd/compile/internal/ssa: combine shift and addition + for riscv64 rva22u64 + +When GORISCV64 enables rva22u64, combined shift and addition using the +SH1ADD, SH2ADD and SH3ADD instructions that are available via the Zba +extension. This results in more than 2000 instructions being removed +from the Go binary on riscv64. + +Change-Id: Ia62ae7dda3d8083cff315113421bee73f518eea8 +Reviewed-on: https://go-review.googlesource.com/c/go/+/606636 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Michael Pratt +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +--- + src/cmd/compile/internal/riscv64/ssa.go | 3 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 5 ++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 5 ++ + src/cmd/compile/internal/ssa/opGen.go | 45 +++++++++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 57 +++++++++++++++++++ + test/codegen/shift.go | 17 ++++++ + 6 files changed, 131 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 10fea07e60..e3a2889697 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -289,7 +289,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FEQS, ssa.OpRISCV64FNES, ssa.OpRISCV64FLTS, ssa.OpRISCV64FLES, + ssa.OpRISCV64FADDD, ssa.OpRISCV64FSUBD, ssa.OpRISCV64FMULD, ssa.OpRISCV64FDIVD, + ssa.OpRISCV64FEQD, ssa.OpRISCV64FNED, ssa.OpRISCV64FLTD, ssa.OpRISCV64FLED, ssa.OpRISCV64FSGNJD, +- ssa.OpRISCV64MIN, ssa.OpRISCV64MAX, ssa.OpRISCV64MINU, ssa.OpRISCV64MAXU: ++ ssa.OpRISCV64MIN, ssa.OpRISCV64MAX, ssa.OpRISCV64MINU, ssa.OpRISCV64MAXU, ++ ssa.OpRISCV64SH1ADD, ssa.OpRISCV64SH2ADD, ssa.OpRISCV64SH3ADD: + r := v.Reg() + r1 := v.Args[0].Reg() + r2 := v.Args[1].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 7d8fb79e17..f0afd6b345 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -839,6 +839,11 @@ + // Optimisations for rva22u64 and above. + // + ++// Combine left shift and addition. ++(ADD (SLLI [1] x) y) && buildcfg.GORISCV64 >= 22 => (SH1ADD x y) ++(ADD (SLLI [2] x) y) && buildcfg.GORISCV64 >= 22 => (SH2ADD x y) ++(ADD (SLLI [3] x) y) && buildcfg.GORISCV64 >= 22 => (SH3ADD x y) ++ + // Integer minimum and maximum. 
+ (Min64 x y) && buildcfg.GORISCV64 >= 22 => (MIN x y) + (Max64 x y) && buildcfg.GORISCV64 >= 22 => (MAX x y) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 7323cb119c..8badefa9ac 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -220,6 +220,11 @@ func init() { + {name: "SRLI", argLength: 1, reg: gp11, asm: "SRLI", aux: "Int64"}, // arg0 >> auxint, shift amount 0-63, logical right shift + {name: "SRLIW", argLength: 1, reg: gp11, asm: "SRLIW", aux: "Int64"}, // arg0 >> auxint, shift amount 0-31, logical right shift of 32 bit value, sign extended to 64 bits + ++ // Shift and add ++ {name: "SH1ADD", argLength: 2, reg: gp21, asm: "SH1ADD"}, // arg0 << 1 + arg1 ++ {name: "SH2ADD", argLength: 2, reg: gp21, asm: "SH2ADD"}, // arg0 << 2 + arg1 ++ {name: "SH3ADD", argLength: 2, reg: gp21, asm: "SH3ADD"}, // arg0 << 3 + arg1 ++ + // Bitwise ops + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 600b8d9e30..f651adf63e 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2381,6 +2381,9 @@ const ( + OpRISCV64SRAIW + OpRISCV64SRLI + OpRISCV64SRLIW ++ OpRISCV64SH1ADD ++ OpRISCV64SH2ADD ++ OpRISCV64SH3ADD + OpRISCV64AND + OpRISCV64ANDI + OpRISCV64NOT +@@ -31948,6 +31951,48 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SH1ADD", ++ argLen: 2, ++ asm: riscv.ASH1ADD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "SH2ADD", ++ argLen: 2, ++ asm: riscv.ASH2ADD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "SH3ADD", ++ argLen: 2, ++ asm: riscv.ASH3ADD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "AND", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 9a13955689..5e6ccab467 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -3317,6 +3317,63 @@ func rewriteValueRISCV64_OpRISCV64ADD(v *Value) bool { + } + break + } ++ // match: 
(ADD (SLLI [1] x) y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (SH1ADD x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ if v_0.Op != OpRISCV64SLLI || auxIntToInt64(v_0.AuxInt) != 1 { ++ continue ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ continue ++ } ++ v.reset(OpRISCV64SH1ADD) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } ++ // match: (ADD (SLLI [2] x) y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (SH2ADD x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ if v_0.Op != OpRISCV64SLLI || auxIntToInt64(v_0.AuxInt) != 2 { ++ continue ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ continue ++ } ++ v.reset(OpRISCV64SH2ADD) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } ++ // match: (ADD (SLLI [3] x) y) ++ // cond: buildcfg.GORISCV64 >= 22 ++ // result: (SH3ADD x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ if v_0.Op != OpRISCV64SLLI || auxIntToInt64(v_0.AuxInt) != 3 { ++ continue ++ } ++ x := v_0.Args[0] ++ y := v_1 ++ if !(buildcfg.GORISCV64 >= 22) { ++ continue ++ } ++ v.reset(OpRISCV64SH3ADD) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64ADDI(v *Value) bool { +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 51b9b2e39c..4b3b79f142 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -474,3 +474,20 @@ func checkShiftToMask(u []uint64, s []int64) { + // amd64:-"SHR",-"SHL","ANDQ" + u[1] = u[1] << 5 >> 5 + } ++ ++// ++// Left shift with addition. ++// ++ ++func checkLeftShiftWithAddition(a int64, b int64) int64 { ++ // riscv64/rva20u64: "SLLI","ADD" ++ // riscv64/rva22u64: "SH1ADD" ++ a = a + b<<1 ++ // riscv64/rva20u64: "SLLI","ADD" ++ // riscv64/rva22u64: "SH2ADD" ++ a = a + b<<2 ++ // riscv64/rva20u64: "SLLI","ADD" ++ // riscv64/rva22u64: "SH3ADD" ++ a = a + b<<3 ++ return a ++} +-- +2.39.5 + diff --git a/2050-math-add-round-assembly-implementations-on-riscv64.patch b/2050-math-add-round-assembly-implementations-on-riscv64.patch new file mode 100644 index 0000000..00cef9d --- /dev/null +++ b/2050-math-add-round-assembly-implementations-on-riscv64.patch @@ -0,0 +1,125 @@ +From 38a2f9c476c9b5fb3688605454eff7a198368211 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 050/119] math: add round assembly implementations on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This CL reapplies CL 504737 and adds integer precision +limitation check, since CL 504737 only checks whether +floating point number is +-Inf or NaN. + +This CL is also ~7% faster than CL 504737. 
+ +Updates #68322 + +goos: linux +goarch: riscv64 +pkg: math + │ math.old.bench │ math.new.bench │ + │ sec/op │ sec/op vs base │ +Ceil 54.09n ± 0% 18.72n ± 0% -65.39% (p=0.000 n=10) +Floor 40.72n ± 0% 18.72n ± 0% -54.03% (p=0.000 n=10) +Round 20.73n ± 0% 20.73n ± 0% ~ (p=1.000 n=10) +RoundToEven 24.07n ± 0% 24.07n ± 0% ~ (p=1.000 n=10) +Trunc 38.72n ± 0% 18.72n ± 0% -51.65% (p=0.000 n=10) +geomean 33.56n 20.09n -40.13% + +Change-Id: I06cfe2cb9e2535cd705d40b6650a7e71fedd906c +Reviewed-on: https://go-review.googlesource.com/c/go/+/600075 +Reviewed-by: Keith Randall +Reviewed-by: Joel Sing +Reviewed-by: Keith Randall +Reviewed-by: Michael Knyszek +LUCI-TryBot-Result: Go LUCI +--- + src/math/floor_asm.go | 2 +- + src/math/floor_noasm.go | 2 +- + src/math/floor_riscv64.s | 48 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 50 insertions(+), 2 deletions(-) + create mode 100644 src/math/floor_riscv64.s + +diff --git a/src/math/floor_asm.go b/src/math/floor_asm.go +index fb419d6da2..5cb45f5a7e 100644 +--- a/src/math/floor_asm.go ++++ b/src/math/floor_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || s390x || wasm ++//go:build 386 || amd64 || arm64 || ppc64 || ppc64le || riscv64 || s390x || wasm + + package math + +diff --git a/src/math/floor_noasm.go b/src/math/floor_noasm.go +index 5641c7ea0a..6754ca8fc8 100644 +--- a/src/math/floor_noasm.go ++++ b/src/math/floor_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !s390x && !wasm ++//go:build !386 && !amd64 && !arm64 && !ppc64 && !ppc64le && !riscv64 && !s390x && !wasm + + package math + +diff --git a/src/math/floor_riscv64.s b/src/math/floor_riscv64.s +new file mode 100644 +index 0000000000..d9fe0ed8e2 +--- /dev/null ++++ b/src/math/floor_riscv64.s +@@ -0,0 +1,48 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "textflag.h" ++ ++// RISC-V offered floating-point (FP) rounding by FP conversion instructions (FCVT) ++// with rounding mode field. ++// As Go spec expects FP rounding result in FP, we have to use FCVT integer ++// back to FP (fp -> int -> fp). ++// RISC-V only set Inexact flag during invalid FP-integer conversion without changing any data, ++// on the other hand, RISC-V sets out of integer represent range yet valid FP into NaN. ++// When it comes to integer-FP conversion, invalid FP like NaN, +-Inf will be ++// converted into the closest valid FP, for example: ++// ++// `Floor(-Inf) -> int64(0x7fffffffffffffff) -> float64(9.22e+18)` ++// `Floor(18446744073709549568.0) -> int64(0x7fffffffffffffff) -> float64(9.22e+18)` ++// ++// This ISA conversion limitation requires we skip all invalid or out of range FP ++// before any normal rounding operations. 
++ ++#define ROUNDFN(NAME, MODE) \ ++TEXT NAME(SB),NOSPLIT,$0; \ ++ MOVD x+0(FP), F10; \ ++ FMVXD F10, X10; \ ++ /* Drop all fraction bits */;\ ++ SRL $52, X10, X12; \ ++ /* Remove sign bit */; \ ++ AND $0x7FF, X12, X12;\ ++ /* Return either input is +-Inf, NaN(0x7FF) or out of precision limitation */;\ ++ /* 1023: bias of exponent, [-2^53, 2^53]: exactly integer represent range */;\ ++ MOV $1023+53, X11; \ ++ BLTU X11, X12, 4(PC);\ ++ FCVTLD.MODE F10, X11; \ ++ FCVTDL X11, F11; \ ++ /* RISC-V rounds negative values to +0, restore original sign */;\ ++ FSGNJD F10, F11, F10; \ ++ MOVD F10, ret+8(FP); \ ++ RET ++ ++// func archFloor(x float64) float64 ++ROUNDFN(·archFloor, RDN) ++ ++// func archCeil(x float64) float64 ++ROUNDFN(·archCeil, RUP) ++ ++// func archTrunc(x float64) float64 ++ROUNDFN(·archTrunc, RTZ) +-- +2.39.5 + diff --git a/2051-test-codegen-add-Rotate-test-for-riscv64.patch b/2051-test-codegen-add-Rotate-test-for-riscv64.patch new file mode 100644 index 0000000..020d744 --- /dev/null +++ b/2051-test-codegen-add-Rotate-test-for-riscv64.patch @@ -0,0 +1,62 @@ +From 359d01862161f76d5493d310de9b9a9ae46d75d5 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 051/119] test/codegen: add Rotate test for riscv64 + +Change-Id: I7d996b8d46fbeef933943f806052a30f1f8d50c3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/588836 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: Tim King +Reviewed-by: Dmitri Shuralyov +--- + test/codegen/mathbits.go | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index 80fe9d2e0c..caeecdf078 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -231,6 +231,7 @@ func RotateLeft64(n uint64) uint64 { + // amd64:"ROLQ" + // arm64:"ROR" + // ppc64x:"ROTL" ++ // riscv64:"RORI" + // s390x:"RISBGZ\t[$]0, [$]63, [$]37, " + // wasm:"I64Rotl" + return bits.RotateLeft64(n, 37) +@@ -241,6 +242,7 @@ func RotateLeft32(n uint32) uint32 { + // arm:`MOVW\tR[0-9]+@>23` + // arm64:"RORW" + // ppc64x:"ROTLW" ++ // riscv64:"RORIW" + // s390x:"RLL" + // wasm:"I32Rotl" + return bits.RotateLeft32(n, 9) +@@ -262,6 +264,7 @@ func RotateLeftVariable(n uint, m int) uint { + // amd64:"ROLQ" + // arm64:"ROR" + // ppc64x:"ROTL" ++ // riscv64:"ROL" + // s390x:"RLLG" + // wasm:"I64Rotl" + return bits.RotateLeft(n, m) +@@ -271,6 +274,7 @@ func RotateLeftVariable64(n uint64, m int) uint64 { + // amd64:"ROLQ" + // arm64:"ROR" + // ppc64x:"ROTL" ++ // riscv64:"ROL" + // s390x:"RLLG" + // wasm:"I64Rotl" + return bits.RotateLeft64(n, m) +@@ -281,6 +285,7 @@ func RotateLeftVariable32(n uint32, m int) uint32 { + // amd64:"ROLL" + // arm64:"RORW" + // ppc64x:"ROTLW" ++ // riscv64:"ROLW" + // s390x:"RLL" + // wasm:"I32Rotl" + return bits.RotateLeft32(n, m) +-- +2.39.5 + diff --git a/2052-runtime-add-asm_riscv64.h.patch b/2052-runtime-add-asm_riscv64.h.patch new file mode 100644 index 0000000..84cfce3 --- /dev/null +++ b/2052-runtime-add-asm_riscv64.h.patch @@ -0,0 +1,67 @@ +From d1f0994bb9c5f428f58638aa4ac69e78d4122a25 Mon Sep 17 00:00:00 2001 +From: Mark D Ryan +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 052/119] runtime: add asm_riscv64.h + +asm_riscv64.h will be used to define macros for each riscv64 +extension that is not part of the rva20u64 base profile but that the +_riscv64.s assembly files are allowed to use because the user has +specified a more capable profile in the GORISCV64 variable. 
This will +allow us, for example, to test for the hasZba macro in those assembly +files instead of the GORISCV64_rva22u64 macro before using a Zba +instruction. This is important as it means that in the future when +we add support for new profiles that support Zba, e.g., rva23u64, +we only need to update asm_riscv64.h to indicate rva23u64 supports +Zba. We will not need to update every assembly language file that +already uses Zba instructions. + +Updates #61476 + +Change-Id: I83abfeb20d08a87ac8ea88f4d8a93437f0631353 +Reviewed-on: https://go-review.googlesource.com/c/go/+/608255 +Auto-Submit: Tim King +Reviewed-by: Tim King +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +Reviewed-by: Joel Sing +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/dist/build.go | 2 ++ + src/runtime/asm_riscv64.h | 12 ++++++++++++ + 2 files changed, 14 insertions(+) + create mode 100644 src/runtime/asm_riscv64.h + +diff --git a/src/cmd/dist/build.go b/src/cmd/dist/build.go +index 7d720cc5e1..873d031fac 100644 +--- a/src/cmd/dist/build.go ++++ b/src/cmd/dist/build.go +@@ -821,6 +821,8 @@ func runInstall(pkg string, ch chan struct{}) { + pathf("%s/src/runtime/asm_ppc64x.h", goroot), 0) + copyfile(pathf("%s/pkg/include/asm_amd64.h", goroot), + pathf("%s/src/runtime/asm_amd64.h", goroot), 0) ++ copyfile(pathf("%s/pkg/include/asm_riscv64.h", goroot), ++ pathf("%s/src/runtime/asm_riscv64.h", goroot), 0) + } + + // Generate any missing files; regenerate existing ones. +diff --git a/src/runtime/asm_riscv64.h b/src/runtime/asm_riscv64.h +new file mode 100644 +index 0000000000..d4deb093a6 +--- /dev/null ++++ b/src/runtime/asm_riscv64.h +@@ -0,0 +1,12 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++// Define features that are guaranteed to be supported by setting the GORISCV64 variable. ++// If a feature is supported, there's no need to check it at runtime every time. ++ ++#ifdef GORISCV64_rva22u64 ++#define hasZba ++#define hasZbb ++#define hasZbs ++#endif +-- +2.39.5 + diff --git a/2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch b/2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch new file mode 100644 index 0000000..77c492f --- /dev/null +++ b/2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch @@ -0,0 +1,387 @@ +From 92dfc0ca8a0e09f34d4ab39b964cb532aa6ac73d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 053/119] cmd/compile,cmd/internal/obj/riscv: always provide + ANDN, ORN and XNOR for riscv64 + +The ANDN, ORN and XNOR RISC-V Zbb extension instructions are easily +synthesised. Make them always available by adding support to the +riscv64 assembler so that we either emit two instruction sequences, +or a single instruction, when permitted by the GORISCV64 profile. +This means that these instructions can be used unconditionally, +simplifying compiler rewrite rules, codegen tests and manually +written assembly. + +Around 180 instructions are removed from the Go binary on riscv64 +when built with rva22u64. 
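+
+For reference, the semantics involved are (a Go illustration only, not part of
+this CL; the two-instruction fallbacks are the ones synthesised by the
+assembler change below):
+
+    package sketch
+
+    // andn: one Zbb instruction, or XORI $-1 on one operand followed by AND.
+    func andn(x, y uint64) uint64 { return x &^ y } // x & ^y
+
+    // orn: one Zbb instruction, or XORI $-1 on one operand followed by OR.
+    func orn(x, y uint64) uint64 { return x | ^y }
+
+    // xnor: one Zbb instruction, or XOR followed by XORI $-1 on the result.
+    func xnor(x, y uint64) uint64 { return ^(x ^ y) }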
+ +Change-Id: Ib2d90f2593a306530dc0ed08a981acde4d01be20 +Reviewed-on: https://go-review.googlesource.com/c/go/+/611895 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Tim King +Reviewed-by: Dmitri Shuralyov +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 12 +-- + src/cmd/compile/internal/riscv64/ssa.go | 3 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 1 - + .../compile/internal/ssa/_gen/RISCV64Ops.go | 29 ++++--- + .../internal/ssa/_gen/RISCV64latelower.rules | 6 ++ + src/cmd/compile/internal/ssa/opGen.go | 46 ++++++++++ + .../internal/ssa/rewriteRISCV64latelower.go | 84 +++++++++++++++++++ + src/cmd/internal/obj/riscv/obj.go | 28 +++++++ + 8 files changed, 188 insertions(+), 21 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 5296a34d09..53b7b92faa 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -361,8 +361,8 @@ start: + SLLIUW $1, X18, X19 // 9b191908 + + // 1.2: Basic Bit Manipulation (Zbb) +- ANDN X19, X20, X21 // b37a3a41 +- ANDN X19, X20 // 337a3a41 ++ ANDN X19, X20, X21 // b37a3a41 or 93caf9ffb37a5a01 ++ ANDN X19, X20 // 337a3a41 or 93cff9ff337afa01 + CLZ X20, X21 // 931a0a60 + CLZW X21, X22 // 1b9b0a60 + CPOP X22, X23 // 931b2b60 +@@ -377,12 +377,12 @@ start: + MIN X29, X30 // 334fdf0b + MINU X30, X5, X6 // 33d3e20b + MINU X30, X5 // b3d2e20b +- ORN X6, X7, X8 // 33e46340 +- ORN X6, X7 // b3e36340 ++ ORN X6, X7, X8 // 33e46340 or 1344f3ff33e48300 ++ ORN X6, X7 // b3e36340 or 934ff3ffb3e3f301 + SEXTB X16, X17 // 93184860 + SEXTH X17, X18 // 13995860 +- XNOR X18, X19, X20 // 33ca2941 +- XNOR X18, X19 // b3c92941 ++ XNOR X18, X19, X20 // 33ca2941 or 33ca2901134afaff ++ XNOR X18, X19 // b3c92941 or b3c9290193c9f9ff + ZEXTH X19, X20 // 3bca0908 + + // 1.3: Bitwise Rotation (Zbb) +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index e3a2889697..759d8d7cf4 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -278,7 +278,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.From.Reg = rs + p.To.Type = obj.TYPE_REG + p.To.Reg = rd +- case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XOR, ssa.OpRISCV64OR, ssa.OpRISCV64AND, ++ case ssa.OpRISCV64ADD, ssa.OpRISCV64SUB, ssa.OpRISCV64SUBW, ssa.OpRISCV64XNOR, ssa.OpRISCV64XOR, ++ ssa.OpRISCV64OR, ssa.OpRISCV64ORN, ssa.OpRISCV64AND, ssa.OpRISCV64ANDN, + ssa.OpRISCV64SLL, ssa.OpRISCV64SLLW, ssa.OpRISCV64SRA, ssa.OpRISCV64SRAW, ssa.OpRISCV64SRL, ssa.OpRISCV64SRLW, + ssa.OpRISCV64SLT, ssa.OpRISCV64SLTU, ssa.OpRISCV64MUL, ssa.OpRISCV64MULW, ssa.OpRISCV64MULH, + ssa.OpRISCV64MULHU, ssa.OpRISCV64DIV, ssa.OpRISCV64DIVU, ssa.OpRISCV64DIVW, +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index f0afd6b345..9ae9604381 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -62,7 +62,6 @@ + + (Com(64|32|16|8) ...) => (NOT ...) + +- + (Sqrt ...) => (FSQRTD ...) + (Sqrt32 ...) => (FSQRTS ...) 
+ +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 8badefa9ac..7f3c4a2bf4 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -226,19 +226,22 @@ func init() { + {name: "SH3ADD", argLength: 2, reg: gp21, asm: "SH3ADD"}, // arg0 << 3 + arg1 + + // Bitwise ops +- {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 +- {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint +- {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 +- {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 +- {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint +- {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63) +- {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended +- {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63) +- {name: "RORI", argLength: 1, reg: gp11, asm: "RORI", aux: "Int64"}, // rotate right arg0 by auxint, shift amount 0-63 +- {name: "RORIW", argLength: 1, reg: gp11, asm: "RORIW", aux: "Int64"}, // rotate right least significant word of arg0 by auxint, shift amount 0-31, sign extended +- {name: "RORW", argLength: 2, reg: gp21, asm: "RORW"}, // rotate right least significant word of arg0 by (arg1 & 31), sign extended +- {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 +- {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint ++ {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 ++ {name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // ^arg0 & arg1 ++ {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint ++ {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 ++ {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 ++ {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // ^arg0 | arg1 ++ {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint ++ {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63) ++ {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended ++ {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63) ++ {name: "RORI", argLength: 1, reg: gp11, asm: "RORI", aux: "Int64"}, // rotate right arg0 by auxint, shift amount 0-63 ++ {name: "RORIW", argLength: 1, reg: gp11, asm: "RORIW", aux: "Int64"}, // rotate right least significant word of arg0 by auxint, shift amount 0-31, sign extended ++ {name: "RORW", argLength: 2, reg: gp21, asm: "RORW"}, // rotate right least significant word of arg0 by (arg1 & 31), sign extended ++ {name: "XNOR", argLength: 2, reg: gp21, asm: "XNOR", commutative: true}, // ^(arg0 ^ arg1) ++ {name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true}, // arg0 ^ arg1 ++ {name: "XORI", argLength: 1, reg: gp11, asm: "XORI", aux: "Int64"}, // arg0 ^ auxint + + // Minimum and maximum + {name: "MIN", argLength: 2, reg: gp21, asm: "MIN", commutative: true}, // min(arg0,arg1), signed +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules +index 
cd55331dfd..7acaa2f3fe 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64latelower.rules +@@ -2,6 +2,12 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + ++// Combine bitwise operation and bitwise inversion. ++(AND x (NOT y)) => (ANDN x y) ++(OR x (NOT y)) => (ORN x y) ++(XOR x (NOT y)) => (XNOR x y) ++(NOT (XOR x y)) => (XNOR x y) ++ + // Fold constant shift with extension. + (SRAI [c] (MOVBreg x)) && c < 8 => (SRAI [56+c] (SLLI [56] x)) + (SRAI [c] (MOVHreg x)) && c < 16 => (SRAI [48+c] (SLLI [48] x)) +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index f651adf63e..a02afc2da0 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2385,9 +2385,11 @@ const ( + OpRISCV64SH2ADD + OpRISCV64SH3ADD + OpRISCV64AND ++ OpRISCV64ANDN + OpRISCV64ANDI + OpRISCV64NOT + OpRISCV64OR ++ OpRISCV64ORN + OpRISCV64ORI + OpRISCV64ROL + OpRISCV64ROLW +@@ -2395,6 +2397,7 @@ const ( + OpRISCV64RORI + OpRISCV64RORIW + OpRISCV64RORW ++ OpRISCV64XNOR + OpRISCV64XOR + OpRISCV64XORI + OpRISCV64MIN +@@ -32008,6 +32011,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "ANDN", ++ argLen: 2, ++ asm: riscv.AANDN, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "ANDI", + auxType: auxInt64, +@@ -32050,6 +32067,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "ORN", ++ argLen: 2, ++ asm: riscv.AORN, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "ORI", + auxType: auxInt64, +@@ -32148,6 +32179,21 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "XNOR", ++ argLen: 2, ++ commutative: true, ++ asm: riscv.AXNOR, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ {1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "XOR", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64latelower.go b/src/cmd/compile/internal/ssa/rewriteRISCV64latelower.go +index 6dd97d65bd..d2c3a8f73d 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64latelower.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64latelower.go +@@ -4,12 +4,76 @@ package ssa + + func rewriteValueRISCV64latelower(v *Value) bool { + switch v.Op { ++ case OpRISCV64AND: ++ return rewriteValueRISCV64latelower_OpRISCV64AND(v) ++ case 
OpRISCV64NOT: ++ return rewriteValueRISCV64latelower_OpRISCV64NOT(v) ++ case OpRISCV64OR: ++ return rewriteValueRISCV64latelower_OpRISCV64OR(v) + case OpRISCV64SLLI: + return rewriteValueRISCV64latelower_OpRISCV64SLLI(v) + case OpRISCV64SRAI: + return rewriteValueRISCV64latelower_OpRISCV64SRAI(v) + case OpRISCV64SRLI: + return rewriteValueRISCV64latelower_OpRISCV64SRLI(v) ++ case OpRISCV64XOR: ++ return rewriteValueRISCV64latelower_OpRISCV64XOR(v) ++ } ++ return false ++} ++func rewriteValueRISCV64latelower_OpRISCV64AND(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (AND x (NOT y)) ++ // result: (ANDN x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ if v_1.Op != OpRISCV64NOT { ++ continue ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64ANDN) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } ++ return false ++} ++func rewriteValueRISCV64latelower_OpRISCV64NOT(v *Value) bool { ++ v_0 := v.Args[0] ++ // match: (NOT (XOR x y)) ++ // result: (XNOR x y) ++ for { ++ if v_0.Op != OpRISCV64XOR { ++ break ++ } ++ y := v_0.Args[1] ++ x := v_0.Args[0] ++ v.reset(OpRISCV64XNOR) ++ v.AddArg2(x, y) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64latelower_OpRISCV64OR(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (OR x (NOT y)) ++ // result: (ORN x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ if v_1.Op != OpRISCV64NOT { ++ continue ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64ORN) ++ v.AddArg2(x, y) ++ return true ++ } ++ break + } + return false + } +@@ -241,6 +305,26 @@ func rewriteValueRISCV64latelower_OpRISCV64SRLI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64latelower_OpRISCV64XOR(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (XOR x (NOT y)) ++ // result: (XNOR x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ if v_1.Op != OpRISCV64NOT { ++ continue ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64XNOR) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } ++ return false ++} + func rewriteBlockRISCV64latelower(b *Block) bool { + return false + } +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index d396264a05..088463aef8 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2534,6 +2534,34 @@ func instructionsForProg(p *obj.Prog) []*instruction { + + case AORCB, AREV8: + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AANDN, AORN: ++ if buildcfg.GORISCV64 >= 22 { ++ // ANDN and ORN instructions are supported natively. ++ break ++ } ++ // ANDN -> (AND (NOT x) y) ++ // ORN -> (OR (NOT x) y) ++ bitwiseOp, notReg := AAND, ins.rd ++ if ins.as == AORN { ++ bitwiseOp = AOR ++ } ++ if ins.rs1 == notReg { ++ notReg = REG_TMP ++ } ++ inss = []*instruction{ ++ &instruction{as: AXORI, rs1: ins.rs2, rs2: obj.REG_NONE, rd: notReg, imm: -1}, ++ &instruction{as: bitwiseOp, rs1: ins.rs1, rs2: notReg, rd: ins.rd}, ++ } ++ ++ case AXNOR: ++ if buildcfg.GORISCV64 >= 22 { ++ // XNOR instruction is supported natively. 
++ break ++ } ++ // XNOR -> (NOT (XOR x y)) ++ ins.as = AXOR ++ inss = append(inss, &instruction{as: AXORI, rs1: ins.rd, rs2: obj.REG_NONE, rd: ins.rd, imm: -1}) + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch b/2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch new file mode 100644 index 0000000..d636541 --- /dev/null +++ b/2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch @@ -0,0 +1,385 @@ +From d9a225d97913ddb16defe054eb661769b99df43d Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:39 +0800 +Subject: [PATCH 054/119] crypto/md5: provide optimised assembly for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide an optimised assembly implementation of MD5 for RISC-V. +There are significant performance improvements. The assembler takes +advantage of Zbb instructions when they are available. + +Results for the VisionFive 2 running Ubuntu 24.04 with +GORISCV64=rva20u64. + +goos: linux +goarch: riscv64 +pkg: crypto/md5 + │ md5_go.txt │ md5_ass.txt │ + │ sec/op │ sec/op vs base │ +Hash8Bytes 1.202µ ± 0% 1.220µ ± 0% +1.50% (p=0.000 n=10) +Hash64 1.665µ ± 0% 1.518µ ± 0% -8.83% (p=0.000 n=10) +Hash128 2.165µ ± 0% 1.885µ ± 0% -12.94% (p=0.000 n=10) +Hash256 3.162µ ± 0% 2.613µ ± 0% -17.38% (p=0.000 n=10) +Hash512 5.146µ ± 0% 4.063µ ± 0% -21.05% (p=0.000 n=10) +Hash1K 9.115µ ± 0% 6.959µ ± 0% -23.65% (p=0.000 n=10) +Hash8K 64.68µ ± 0% 47.52µ ± 0% -26.54% (p=0.000 n=10) +Hash1M 8.131m ± 0% 5.936m ± 0% -27.00% (p=0.000 n=10) +Hash8M 65.06m ± 0% 47.50m ± 0% -26.99% (p=0.000 n=10) +Hash8BytesUnaligned 1.210µ ± 0% 1.199µ ± 0% -0.91% (p=0.000 n=10) +Hash1KUnaligned 9.114µ ± 0% 8.266µ ± 0% -9.30% (p=0.000 n=10) +Hash8KUnaligned 64.68µ ± 0% 57.97µ ± 0% -10.38% (p=0.000 n=10) +geomean 22.37µ 18.83µ -15.82% + +Results for the VisionFive 2 running Ubuntu 24.04 with +GORISCV64=rva22u64. 
+ +goos: linux +goarch: riscv64 +pkg: crypto/md5 + │ md5_g22.txt │ md5_a22.txt │ + │ sec/op │ sec/op vs base │ +Hash8Bytes 1.175µ ± 0% 1.002µ ± 0% -14.72% (p=0.000 n=10) +Hash64 1.575µ ± 0% 1.274µ ± 0% -19.11% (p=0.000 n=10) +Hash128 2.033µ ± 0% 1.587µ ± 0% -21.92% (p=0.000 n=10) +Hash256 2.943µ ± 0% 2.209µ ± 0% -24.93% (p=0.000 n=10) +Hash512 4.755µ ± 0% 3.443µ ± 0% -27.58% (p=0.000 n=10) +Hash1K 8.378µ ± 0% 5.910µ ± 0% -29.46% (p=0.000 n=10) +Hash8K 59.12µ ± 0% 40.45µ ± 0% -31.58% (p=0.000 n=10) +Hash1M 7.426m ± 0% 5.056m ± 0% -31.92% (p=0.000 n=10) +Hash8M 59.41m ± 0% 40.45m ± 0% -31.91% (p=0.000 n=10) +Hash8BytesUnaligned 1.169µ ± 0% 1.012µ ± 0% -13.43% (p=0.000 n=10) +Hash1KUnaligned 8.379µ ± 0% 7.213µ ± 0% -13.91% (p=0.000 n=10) +Hash8KUnaligned 59.12µ ± 0% 50.90µ ± 0% -13.91% (p=0.000 n=10) +geomean 20.83µ 15.99µ -23.21% + +Change-Id: I61e3fa802c2cc50e0b5f71f151b4741691ccb481 +Reviewed-on: https://go-review.googlesource.com/c/go/+/527936 +Reviewed-by: Joel Sing +Auto-Submit: Tim King +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Tim King +--- + src/crypto/md5/md5block_decl.go | 2 +- + src/crypto/md5/md5block_generic.go | 2 +- + src/crypto/md5/md5block_riscv64.s | 279 +++++++++++++++++++++++++++++ + 3 files changed, 281 insertions(+), 2 deletions(-) + create mode 100644 src/crypto/md5/md5block_riscv64.s + +diff --git a/src/crypto/md5/md5block_decl.go b/src/crypto/md5/md5block_decl.go +index f1fb34c3d7..9c8e7271df 100644 +--- a/src/crypto/md5/md5block_decl.go ++++ b/src/crypto/md5/md5block_decl.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build amd64 || 386 || arm || ppc64le || ppc64 || s390x || arm64 ++//go:build amd64 || 386 || arm || ppc64le || ppc64 || s390x || arm64 || riscv64 + + package md5 + +diff --git a/src/crypto/md5/md5block_generic.go b/src/crypto/md5/md5block_generic.go +index c929c2b84a..de607e01a6 100644 +--- a/src/crypto/md5/md5block_generic.go ++++ b/src/crypto/md5/md5block_generic.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !386 && !arm && !ppc64le && !ppc64 && !s390x && !arm64 ++//go:build !amd64 && !386 && !arm && !ppc64le && !ppc64 && !s390x && !arm64 && !riscv64 + + package md5 + +diff --git a/src/crypto/md5/md5block_riscv64.s b/src/crypto/md5/md5block_riscv64.s +new file mode 100644 +index 0000000000..017c70b936 +--- /dev/null ++++ b/src/crypto/md5/md5block_riscv64.s +@@ -0,0 +1,279 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++// ++// RISCV64 version of md5block.go ++// derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go ++ ++//go:build !purego ++ ++#include "textflag.h" ++ ++#define LOAD32U(base, offset, tmp, dest) \ ++ MOVBU (offset+0*1)(base), dest; \ ++ MOVBU (offset+1*1)(base), tmp; \ ++ SLL $8, tmp; \ ++ OR tmp, dest; \ ++ MOVBU (offset+2*1)(base), tmp; \ ++ SLL $16, tmp; \ ++ OR tmp, dest; \ ++ MOVBU (offset+3*1)(base), tmp; \ ++ SLL $24, tmp; \ ++ OR tmp, dest ++ ++#define LOAD64U(base, offset, tmp1, tmp2, dst) \ ++ LOAD32U(base, offset, tmp1, dst); \ ++ LOAD32U(base, offset+4, tmp1, tmp2); \ ++ SLL $32, tmp2; \ ++ OR tmp2, dst ++ ++#define ROUND1EVN(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW x, a; \ ++ ADDW X23, a; \ ++ XOR c, d, X23; \ ++ AND b, X23; \ ++ XOR d, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND1ODD(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW X23, a; \ ++ SRL $32, x, X23; \ ++ ADDW X23, a; \ ++ XOR c, d, X23; \ ++ AND b, X23; \ ++ XOR d, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND2EVN(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW x, a; \ ++ ADDW X23, a; \ ++ XOR b, c, X23; \ ++ AND d, X23; \ ++ XOR c, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND2ODD(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW X23, a; \ ++ SRL $32, x, X23; \ ++ ADDW X23, a; \ ++ XOR b, c, X23; \ ++ AND d, X23; \ ++ XOR c, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND3EVN(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW x, a; \ ++ ADDW X23, a; \ ++ XOR c, d, X23; \ ++ XOR b, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND3ODD(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW X23, a; \ ++ SRL $32, x, X23; \ ++ ADDW X23, a; \ ++ XOR c, d, X23; \ ++ XOR b, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND4EVN(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW x, a; \ ++ ADDW X23, a; \ ++ ORN d, b, X23; \ ++ XOR c, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++#define ROUND4ODD(a, b, c, d, x, const, shift) \ ++ MOV $const, X23; \ ++ ADDW X23, a; \ ++ SRL $32, x, X23; \ ++ ADDW X23, a; \ ++ ORN d, b, X23; \ ++ XOR c, X23; \ ++ ADDW X23, a; \ ++ RORIW $(32-shift), a; \ ++ ADDW b, a ++ ++// Register use for the block function ++// ++// X5 - X12 : contain the 16 32 bit data items in the block we're ++// processing. Odd numbered values, e.g., x1, x3 are stored in ++// the upper 32 bits of the register. ++// X13 - X16 : a, b, c, d ++// X17 - X20 : used to store the old values of a, b, c, d, i.e., aa, bb, cc, ++// dd. X17 and X18 are also used as temporary registers when ++// loading unaligned data. ++// X22 : pointer to dig.s ++// X23 : temporary register ++// X28 : pointer to the first byte beyond the end of p ++// X29 : pointer to current 64 byte block of data, initially set to ++// &p[0] ++// X30 : temporary register ++ ++TEXT ·block(SB),NOSPLIT,$0-32 ++ MOV p+8(FP), X29 ++ MOV p_len+16(FP), X30 ++ SRL $6, X30 ++ SLL $6, X30 ++ BEQZ X30, zero ++ ++ ADD X29, X30, X28 ++ ++ MOV dig+0(FP), X22 ++ MOVWU (0*4)(X22), X13 // a = s[0] ++ MOVWU (1*4)(X22), X14 // b = s[1] ++ MOVWU (2*4)(X22), X15 // c = s[2] ++ MOVWU (3*4)(X22), X16 // d = s[3] ++ ++loop: ++ ++ // Load the 64 bytes of data in x0-15 into 8 64 bit registers, X5-X12. 
++ // Different paths are taken to load the values depending on whether the ++ // buffer is 8 byte aligned or not. We load all the values up front ++ // here at the start of the loop to avoid multiple alignment checks and ++ // to reduce code size. It takes 10 instructions to load an unaligned ++ // 32 bit value and this value will be used 4 times in the main body ++ // of the loop below. ++ ++ AND $7, X29, X30 ++ BEQZ X30, aligned ++ ++ LOAD64U(X29,0, X17, X18, X5) ++ LOAD64U(X29,8, X17, X18, X6) ++ LOAD64U(X29,16, X17, X18, X7) ++ LOAD64U(X29,24, X17, X18, X8) ++ LOAD64U(X29,32, X17, X18, X9) ++ LOAD64U(X29,40, X17, X18, X10) ++ LOAD64U(X29,48, X17, X18, X11) ++ LOAD64U(X29,56, X17, X18, X12) ++ JMP block_loaded ++ ++aligned: ++ MOV (0*8)(X29), X5 ++ MOV (1*8)(X29), X6 ++ MOV (2*8)(X29), X7 ++ MOV (3*8)(X29), X8 ++ MOV (4*8)(X29), X9 ++ MOV (5*8)(X29), X10 ++ MOV (6*8)(X29), X11 ++ MOV (7*8)(X29), X12 ++ ++block_loaded: ++ MOV X13, X17 ++ MOV X14, X18 ++ MOV X15, X19 ++ MOV X16, X20 ++ ++ // Some of the hex constants below are too large to fit into a ++ // signed 32 bit value. The assembler will handle these ++ // constants in a special way to ensure that they are ++ // zero extended. Our algorithm is only interested in the ++ // bottom 32 bits and doesn't care whether constants are ++ // sign or zero extended when moved into 64 bit registers. ++ // So we use signed constants instead of hex when bit 31 is ++ // set so all constants can be loaded by lui+addi. ++ ++ ROUND1EVN(X13,X14,X15,X16,X5, -680876936, 7); // 0xd76aa478 ++ ROUND1ODD(X16,X13,X14,X15,X5, -389564586,12); // 0xe8c7b756 ++ ROUND1EVN(X15,X16,X13,X14,X6, 0x242070db,17); // 0x242070db ++ ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee ++ ROUND1EVN(X13,X14,X15,X16,X7, -176418897, 7); // 0xf57c0faf ++ ROUND1ODD(X16,X13,X14,X15,X7, 0x4787c62a,12); // 0x4787c62a ++ ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613 ++ ROUND1ODD(X14,X15,X16,X13,X8, -45705983,22); // 0xfd469501 ++ ROUND1EVN(X13,X14,X15,X16,X9, 0x698098d8, 7); // 0x698098d8 ++ ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af ++ ROUND1EVN(X15,X16,X13,X14,X10, -42063,17); // 0xffff5bb1 ++ ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be ++ ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122 ++ ROUND1ODD(X16,X13,X14,X15,X11, -40341101,12); // 0xfd987193 ++ ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e ++ ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821 ++ ++ ROUND2ODD(X13,X14,X15,X16,X5, -165796510, 5); // f61e2562 ++ ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // c040b340 ++ ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 265e5a51 ++ ROUND2EVN(X14,X15,X16,X13,X5, -373897302,20); // e9b6c7aa ++ ROUND2ODD(X13,X14,X15,X16,X7, -701558691, 5); // d62f105d ++ ROUND2EVN(X16,X13,X14,X15,X10, 0x2441453, 9); // 2441453 ++ ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // d8a1e681 ++ ROUND2EVN(X14,X15,X16,X13,X7, -405537848,20); // e7d3fbc8 ++ ROUND2ODD(X13,X14,X15,X16,X9, 0x21e1cde6, 5); // 21e1cde6 ++ ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // c33707d6 ++ ROUND2ODD(X15,X16,X13,X14,X6, -187363961,14); // f4d50d87 ++ ROUND2EVN(X14,X15,X16,X13,X9, 0x455a14ed,20); // 455a14ed ++ ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // a9e3e905 ++ ROUND2EVN(X16,X13,X14,X15,X6, -51403784, 9); // fcefa3f8 ++ ROUND2ODD(X15,X16,X13,X14,X8, 0x676f02d9,14); // 676f02d9 ++ ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 8d2a4c8a ++ ++ ROUND3ODD(X13,X14,X15,X16,X7, -378558, 4); // fffa3942 ++ 
ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 8771f681 ++ ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 6d9d6122 ++ ROUND3EVN(X14,X15,X16,X13,X12, -35309556,23); // fde5380c ++ ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // a4beea44 ++ ROUND3EVN(X16,X13,X14,X15,X7, 0x4bdecfa9,11); // 4bdecfa9 ++ ROUND3ODD(X15,X16,X13,X14,X8, -155497632,16); // f6bb4b60 ++ ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // bebfbc70 ++ ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 289b7ec6 ++ ROUND3EVN(X16,X13,X14,X15,X5, -358537222,11); // eaa127fa ++ ROUND3ODD(X15,X16,X13,X14,X6, -722521979,16); // d4ef3085 ++ ROUND3EVN(X14,X15,X16,X13,X8, 0x4881d05,23); // 4881d05 ++ ROUND3ODD(X13,X14,X15,X16,X9, -640364487, 4); // d9d4d039 ++ ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // e6db99e5 ++ ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 1fa27cf8 ++ ROUND3EVN(X14,X15,X16,X13,X6, -995338651,23); // c4ac5665 ++ ++ ROUND4EVN(X13,X14,X15,X16,X5, -198630844, 6); // f4292244 ++ ROUND4ODD(X16,X13,X14,X15,X8, 0x432aff97,10); // 432aff97 ++ ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // ab9423a7 ++ ROUND4ODD(X14,X15,X16,X13,X7, -57434055,21); // fc93a039 ++ ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 655b59c3 ++ ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 8f0ccc92 ++ ROUND4EVN(X15,X16,X13,X14,X10 ,-1051523,15); // ffeff47d ++ ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 85845dd1 ++ ROUND4EVN(X13,X14,X15,X16,X9, 0x6fa87e4f, 6); // 6fa87e4f ++ ROUND4ODD(X16,X13,X14,X15,X12, -30611744,10); // fe2ce6e0 ++ ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // a3014314 ++ ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 4e0811a1 ++ ROUND4EVN(X13,X14,X15,X16,X7, -145523070, 6); // f7537e82 ++ ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // bd3af235 ++ ROUND4EVN(X15,X16,X13,X14,X6, 0x2ad7d2bb,15); // 2ad7d2bb ++ ROUND4ODD(X14,X15,X16,X13,X9, -343485551,21); // eb86d391 ++ ++ ADDW X17, X13 ++ ADDW X18, X14 ++ ADDW X19, X15 ++ ADDW X20, X16 ++ ++ ADD $64, X29 ++ BNE X28, X29, loop ++ ++ MOVW X13, (0*4)(X22) ++ MOVW X14, (1*4)(X22) ++ MOVW X15, (2*4)(X22) ++ MOVW X16, (3*4)(X22) ++ ++zero: ++ RET +-- +2.39.5 + diff --git a/2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch b/2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch new file mode 100644 index 0000000..b138006 --- /dev/null +++ b/2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch @@ -0,0 +1,200 @@ +From 01f4244453d5872e9c0dbb5057eace1b18fa65b6 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 055/119] cmd/internal/obj/riscv: rename the iIEncoding + +We rename it to iIIEncoding to reflect the fact that instructions +that use this encoding take two integer registers. This change +will allow us to add a new encoding for I-type instructions that +take a single integer register. This new encoding will be used for +instructions that modify CSRs. 
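+
+For context, the I-type layout that these encodings share packs a 12-bit
+immediate together with two integer registers, rd and rs1, which is what the
+new name reflects; the planned CSR encoding will name only a single integer
+register. A sketch of the layout (illustration only), mirroring encodeI in
+this file:
+
+    package sketch
+
+    // I-type: imm[11:0] | rs1 | funct3 | rd | opcode
+    func encodeITypeSketch(opcode, funct3, rd, rs1, imm uint32) uint32 {
+        return imm<<20 | rs1<<15 | funct3<<12 | rd<<7 | opcode
+    }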
+ +Change-Id: Ic507d0020e18f6aa72353f4d3ffcd0e868261e7a +Reviewed-on: https://go-review.googlesource.com/c/go/+/614355 +Reviewed-by: Carlos Amedee +Reviewed-by: Joel Sing +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +--- + src/cmd/internal/obj/riscv/obj.go | 80 +++++++++++++++---------------- + 1 file changed, 40 insertions(+), 40 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 088463aef8..6e9691bb4f 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1174,7 +1174,7 @@ func validateRFF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + +-func validateII(ctxt *obj.Link, ins *instruction) { ++func validateIII(ctxt *obj.Link, ins *instruction) { + wantImmI(ctxt, ins, ins.imm, 12) + wantIntReg(ctxt, ins, "rd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +@@ -1320,7 +1320,7 @@ func encodeI(as obj.As, rs1, rd, imm uint32) uint32 { + return imm<<20 | rs1<<15 | enc.funct3<<12 | rd<<7 | enc.opcode + } + +-func encodeII(ins *instruction) uint32 { ++func encodeIII(ins *instruction) uint32 { + return encodeI(ins.as, regI(ins.rs1), regI(ins.rd), uint32(ins.imm)) + } + +@@ -1521,8 +1521,8 @@ var ( + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} + rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} + +- iIEncoding = encoding{encode: encodeII, validate: validateII, length: 4} +- iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} ++ iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4} ++ iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} + + sIEncoding = encoding{encode: encodeSI, validate: validateSI, length: 4} + sFEncoding = encoding{encode: encodeSF, validate: validateSF, length: 4} +@@ -1549,15 +1549,15 @@ var encodings = [ALAST & obj.AMask]encoding{ + // Unprivileged ISA + + // 2.4: Integer Computational Instructions +- AADDI & obj.AMask: iIEncoding, +- ASLTI & obj.AMask: iIEncoding, +- ASLTIU & obj.AMask: iIEncoding, +- AANDI & obj.AMask: iIEncoding, +- AORI & obj.AMask: iIEncoding, +- AXORI & obj.AMask: iIEncoding, +- ASLLI & obj.AMask: iIEncoding, +- ASRLI & obj.AMask: iIEncoding, +- ASRAI & obj.AMask: iIEncoding, ++ AADDI & obj.AMask: iIIEncoding, ++ ASLTI & obj.AMask: iIIEncoding, ++ ASLTIU & obj.AMask: iIIEncoding, ++ AANDI & obj.AMask: iIIEncoding, ++ AORI & obj.AMask: iIIEncoding, ++ AXORI & obj.AMask: iIIEncoding, ++ ASLLI & obj.AMask: iIIEncoding, ++ ASRLI & obj.AMask: iIIEncoding, ++ ASRAI & obj.AMask: iIIEncoding, + ALUI & obj.AMask: uEncoding, + AAUIPC & obj.AMask: uEncoding, + AADD & obj.AMask: rIIIEncoding, +@@ -1573,7 +1573,7 @@ var encodings = [ALAST & obj.AMask]encoding{ + + // 2.5: Control Transfer Instructions + AJAL & obj.AMask: jEncoding, +- AJALR & obj.AMask: iIEncoding, ++ AJALR & obj.AMask: iIIEncoding, + ABEQ & obj.AMask: bEncoding, + ABNE & obj.AMask: bEncoding, + ABLT & obj.AMask: bEncoding, +@@ -1582,24 +1582,24 @@ var encodings = [ALAST & obj.AMask]encoding{ + ABGEU & obj.AMask: bEncoding, + + // 2.6: Load and Store Instructions +- ALW & obj.AMask: iIEncoding, +- ALWU & obj.AMask: iIEncoding, +- ALH & obj.AMask: iIEncoding, +- ALHU & obj.AMask: iIEncoding, +- ALB & obj.AMask: iIEncoding, +- ALBU & obj.AMask: iIEncoding, ++ ALW & obj.AMask: iIIEncoding, ++ ALWU & obj.AMask: iIIEncoding, ++ ALH & obj.AMask: iIIEncoding, ++ ALHU & obj.AMask: iIIEncoding, ++ ALB & obj.AMask: iIIEncoding, 
++ ALBU & obj.AMask: iIIEncoding, + ASW & obj.AMask: sIEncoding, + ASH & obj.AMask: sIEncoding, + ASB & obj.AMask: sIEncoding, + + // 2.7: Memory Ordering +- AFENCE & obj.AMask: iIEncoding, ++ AFENCE & obj.AMask: iIIEncoding, + + // 5.2: Integer Computational Instructions (RV64I) +- AADDIW & obj.AMask: iIEncoding, +- ASLLIW & obj.AMask: iIEncoding, +- ASRLIW & obj.AMask: iIEncoding, +- ASRAIW & obj.AMask: iIEncoding, ++ AADDIW & obj.AMask: iIIEncoding, ++ ASLLIW & obj.AMask: iIIEncoding, ++ ASRLIW & obj.AMask: iIIEncoding, ++ ASRAIW & obj.AMask: iIIEncoding, + AADDW & obj.AMask: rIIIEncoding, + ASLLW & obj.AMask: rIIIEncoding, + ASRLW & obj.AMask: rIIIEncoding, +@@ -1607,7 +1607,7 @@ var encodings = [ALAST & obj.AMask]encoding{ + ASRAW & obj.AMask: rIIIEncoding, + + // 5.3: Load and Store Instructions (RV64I) +- ALD & obj.AMask: iIEncoding, ++ ALD & obj.AMask: iIIEncoding, + ASD & obj.AMask: sIEncoding, + + // 7.1: Multiplication Operations +@@ -1652,9 +1652,9 @@ var encodings = [ALAST & obj.AMask]encoding{ + AAMOMINUD & obj.AMask: rIIIEncoding, + + // 10.1: Base Counters and Timers +- ARDCYCLE & obj.AMask: iIEncoding, +- ARDTIME & obj.AMask: iIEncoding, +- ARDINSTRET & obj.AMask: iIEncoding, ++ ARDCYCLE & obj.AMask: iIIEncoding, ++ ARDTIME & obj.AMask: iIIEncoding, ++ ARDINSTRET & obj.AMask: iIIEncoding, + + // 11.5: Single-Precision Load and Store Instructions + AFLW & obj.AMask: iFEncoding, +@@ -1743,8 +1743,8 @@ var encodings = [ALAST & obj.AMask]encoding{ + // Privileged ISA + + // 3.2.1: Environment Call and Breakpoint +- AECALL & obj.AMask: iIEncoding, +- AEBREAK & obj.AMask: iIEncoding, ++ AECALL & obj.AMask: iIIEncoding, ++ AEBREAK & obj.AMask: iIIEncoding, + + // + // RISC-V Bit-Manipulation ISA-extensions (1.0) +@@ -1758,7 +1758,7 @@ var encodings = [ALAST & obj.AMask]encoding{ + ASH2ADDUW & obj.AMask: rIIIEncoding, + ASH3ADD & obj.AMask: rIIIEncoding, + ASH3ADDUW & obj.AMask: rIIIEncoding, +- ASLLIUW & obj.AMask: iIEncoding, ++ ASLLIUW & obj.AMask: iIIEncoding, + + // 1.2: Basic Bit Manipulation (Zbb) + AANDN & obj.AMask: rIIIEncoding, +@@ -1782,21 +1782,21 @@ var encodings = [ALAST & obj.AMask]encoding{ + AROL & obj.AMask: rIIIEncoding, + AROLW & obj.AMask: rIIIEncoding, + AROR & obj.AMask: rIIIEncoding, +- ARORI & obj.AMask: iIEncoding, +- ARORIW & obj.AMask: iIEncoding, ++ ARORI & obj.AMask: iIIEncoding, ++ ARORIW & obj.AMask: iIIEncoding, + ARORW & obj.AMask: rIIIEncoding, +- AORCB & obj.AMask: iIEncoding, +- AREV8 & obj.AMask: iIEncoding, ++ AORCB & obj.AMask: iIIEncoding, ++ AREV8 & obj.AMask: iIIEncoding, + + // 1.5: Single-bit Instructions (Zbs) + ABCLR & obj.AMask: rIIIEncoding, +- ABCLRI & obj.AMask: iIEncoding, ++ ABCLRI & obj.AMask: iIIEncoding, + ABEXT & obj.AMask: rIIIEncoding, +- ABEXTI & obj.AMask: iIEncoding, ++ ABEXTI & obj.AMask: iIIEncoding, + ABINV & obj.AMask: rIIIEncoding, +- ABINVI & obj.AMask: iIEncoding, ++ ABINVI & obj.AMask: iIIEncoding, + ABSET & obj.AMask: rIIIEncoding, +- ABSETI & obj.AMask: iIEncoding, ++ ABSETI & obj.AMask: iIIEncoding, + + // Escape hatch + AWORD & obj.AMask: rawEncoding, +-- +2.39.5 + diff --git a/2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch b/2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch new file mode 100644 index 0000000..c63fb18 --- /dev/null +++ b/2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch @@ -0,0 +1,2444 @@ +From b8fe27a30aaf17fbb06a4ca6f43a2ac74086327e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 056/119] 
cmd/internal/obj/riscv: add vector instruction + encodings +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Regenerate the riscv instruction encoding table with the V extension +enabled. Add constants and names for the resulting 375 instructions. + +Change-Id: Icce688493aeb1e9880fb76a0618643f57e481273 +Reviewed-on: https://go-review.googlesource.com/c/go/+/595403 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: 鹏程汪 +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +Reviewed-by: Michael Pratt +--- + src/cmd/internal/obj/riscv/anames.go | 375 ++++++++ + src/cmd/internal/obj/riscv/cpu.go | 491 ++++++++++ + src/cmd/internal/obj/riscv/inst.go | 1255 ++++++++++++++++++++------ + 3 files changed, 1869 insertions(+), 252 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index 60c7b48620..53cf1c95dc 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -257,6 +257,381 @@ var Anames = []string{ + "BINVI", + "BSET", + "BSETI", ++ "VSETVLI", ++ "VSETIVLI", ++ "VSETVL", ++ "VLE8V", ++ "VLE16V", ++ "VLE32V", ++ "VLE64V", ++ "VSE8V", ++ "VSE16V", ++ "VSE32V", ++ "VSE64V", ++ "VLMV", ++ "VSMV", ++ "VLSE8V", ++ "VLSE16V", ++ "VLSE32V", ++ "VLSE64V", ++ "VSSE8V", ++ "VSSE16V", ++ "VSSE32V", ++ "VSSE64V", ++ "VLUXEI8V", ++ "VLUXEI16V", ++ "VLUXEI32V", ++ "VLUXEI64V", ++ "VLOXEI8V", ++ "VLOXEI16V", ++ "VLOXEI32V", ++ "VLOXEI64V", ++ "VSUXEI8V", ++ "VSUXEI16V", ++ "VSUXEI32V", ++ "VSUXEI64V", ++ "VSOXEI8V", ++ "VSOXEI16V", ++ "VSOXEI32V", ++ "VSOXEI64V", ++ "VLE8FFV", ++ "VLE16FFV", ++ "VLE32FFV", ++ "VLE64FFV", ++ "VL1RE8V", ++ "VL1RE16V", ++ "VL1RE32V", ++ "VL1RE64V", ++ "VL2RE8V", ++ "VL2RE16V", ++ "VL2RE32V", ++ "VL2RE64V", ++ "VL4RE8V", ++ "VL4RE16V", ++ "VL4RE32V", ++ "VL4RE64V", ++ "VL8RE8V", ++ "VL8RE16V", ++ "VL8RE32V", ++ "VL8RE64V", ++ "VS1RV", ++ "VS2RV", ++ "VS4RV", ++ "VS8RV", ++ "VADDVV", ++ "VADDVX", ++ "VADDVI", ++ "VSUBVV", ++ "VSUBVX", ++ "VRSUBVX", ++ "VRSUBVI", ++ "VWADDUVV", ++ "VWADDUVX", ++ "VWSUBUVV", ++ "VWSUBUVX", ++ "VWADDVV", ++ "VWADDVX", ++ "VWSUBVV", ++ "VWSUBVX", ++ "VWADDUWV", ++ "VWADDUWX", ++ "VWSUBUWV", ++ "VWSUBUWX", ++ "VWADDWV", ++ "VWADDWX", ++ "VWSUBWV", ++ "VWSUBWX", ++ "VZEXTVF2", ++ "VSEXTVF2", ++ "VZEXTVF4", ++ "VSEXTVF4", ++ "VZEXTVF8", ++ "VSEXTVF8", ++ "VADCVVM", ++ "VADCVXM", ++ "VADCVIM", ++ "VMADCVVM", ++ "VMADCVXM", ++ "VMADCVIM", ++ "VMADCVV", ++ "VMADCVX", ++ "VMADCVI", ++ "VSBCVVM", ++ "VSBCVXM", ++ "VMSBCVVM", ++ "VMSBCVXM", ++ "VMSBCVV", ++ "VMSBCVX", ++ "VANDVV", ++ "VANDVX", ++ "VANDVI", ++ "VORVV", ++ "VORVX", ++ "VORVI", ++ "VXORVV", ++ "VXORVX", ++ "VXORVI", ++ "VSLLVV", ++ "VSLLVX", ++ "VSLLVI", ++ "VSRLVV", ++ "VSRLVX", ++ "VSRLVI", ++ "VSRAVV", ++ "VSRAVX", ++ "VSRAVI", ++ "VNSRLWV", ++ "VNSRLWX", ++ "VNSRLWI", ++ "VNSRAWV", ++ "VNSRAWX", ++ "VNSRAWI", ++ "VMSEQVV", ++ "VMSEQVX", ++ "VMSEQVI", ++ "VMSNEVV", ++ "VMSNEVX", ++ "VMSNEVI", ++ "VMSLTUVV", ++ "VMSLTUVX", ++ "VMSLTVV", ++ "VMSLTVX", ++ "VMSLEUVV", ++ "VMSLEUVX", ++ "VMSLEUVI", ++ "VMSLEVV", ++ "VMSLEVX", ++ "VMSLEVI", ++ "VMSGTUVX", ++ "VMSGTUVI", ++ "VMSGTVX", ++ "VMSGTVI", ++ "VMINUVV", ++ "VMINUVX", ++ "VMINVV", ++ "VMINVX", ++ "VMAXUVV", ++ "VMAXUVX", ++ "VMAXVV", ++ "VMAXVX", ++ "VMULVV", ++ "VMULVX", ++ "VMULHVV", ++ "VMULHVX", ++ "VMULHUVV", ++ "VMULHUVX", ++ "VMULHSUVV", ++ "VMULHSUVX", ++ "VDIVUVV", ++ "VDIVUVX", ++ "VDIVVV", ++ "VDIVVX", ++ "VREMUVV", ++ "VREMUVX", ++ "VREMVV", ++ "VREMVX", ++ "VWMULVV", ++ 
"VWMULVX", ++ "VWMULUVV", ++ "VWMULUVX", ++ "VWMULSUVV", ++ "VWMULSUVX", ++ "VMACCVV", ++ "VMACCVX", ++ "VNMSACVV", ++ "VNMSACVX", ++ "VMADDVV", ++ "VMADDVX", ++ "VNMSUBVV", ++ "VNMSUBVX", ++ "VWMACCUVV", ++ "VWMACCUVX", ++ "VWMACCVV", ++ "VWMACCVX", ++ "VWMACCSUVV", ++ "VWMACCSUVX", ++ "VWMACCUSVX", ++ "VMERGEVVM", ++ "VMERGEVXM", ++ "VMERGEVIM", ++ "VMVVV", ++ "VMVVX", ++ "VMVVI", ++ "VSADDUVV", ++ "VSADDUVX", ++ "VSADDUVI", ++ "VSADDVV", ++ "VSADDVX", ++ "VSADDVI", ++ "VSSUBUVV", ++ "VSSUBUVX", ++ "VSSUBVV", ++ "VSSUBVX", ++ "VAADDUVV", ++ "VAADDUVX", ++ "VAADDVV", ++ "VAADDVX", ++ "VASUBUVV", ++ "VASUBUVX", ++ "VASUBVV", ++ "VASUBVX", ++ "VSMULVV", ++ "VSMULVX", ++ "VSSRLVV", ++ "VSSRLVX", ++ "VSSRLVI", ++ "VSSRAVV", ++ "VSSRAVX", ++ "VSSRAVI", ++ "VNCLIPUWV", ++ "VNCLIPUWX", ++ "VNCLIPUWI", ++ "VNCLIPWV", ++ "VNCLIPWX", ++ "VNCLIPWI", ++ "VFADDVV", ++ "VFADDVF", ++ "VFSUBVV", ++ "VFSUBVF", ++ "VFRSUBVF", ++ "VFWADDVV", ++ "VFWADDVF", ++ "VFWSUBVV", ++ "VFWSUBVF", ++ "VFWADDWV", ++ "VFWADDWF", ++ "VFWSUBWV", ++ "VFWSUBWF", ++ "VFMULVV", ++ "VFMULVF", ++ "VFDIVVV", ++ "VFDIVVF", ++ "VFRDIVVF", ++ "VFWMULVV", ++ "VFWMULVF", ++ "VFMACCVV", ++ "VFMACCVF", ++ "VFNMACCVV", ++ "VFNMACCVF", ++ "VFMSACVV", ++ "VFMSACVF", ++ "VFNMSACVV", ++ "VFNMSACVF", ++ "VFMADDVV", ++ "VFMADDVF", ++ "VFNMADDVV", ++ "VFNMADDVF", ++ "VFMSUBVV", ++ "VFMSUBVF", ++ "VFNMSUBVV", ++ "VFNMSUBVF", ++ "VFWMACCVV", ++ "VFWMACCVF", ++ "VFWNMACCVV", ++ "VFWNMACCVF", ++ "VFWMSACVV", ++ "VFWMSACVF", ++ "VFWNMSACVV", ++ "VFWNMSACVF", ++ "VFSQRTV", ++ "VFRSQRT7V", ++ "VFREC7V", ++ "VFMINVV", ++ "VFMINVF", ++ "VFMAXVV", ++ "VFMAXVF", ++ "VFSGNJVV", ++ "VFSGNJVF", ++ "VFSGNJNVV", ++ "VFSGNJNVF", ++ "VFSGNJXVV", ++ "VFSGNJXVF", ++ "VMFEQVV", ++ "VMFEQVF", ++ "VMFNEVV", ++ "VMFNEVF", ++ "VMFLTVV", ++ "VMFLTVF", ++ "VMFLEVV", ++ "VMFLEVF", ++ "VMFGTVF", ++ "VMFGEVF", ++ "VFCLASSV", ++ "VFMERGEVFM", ++ "VFMVVF", ++ "VFCVTXUFV", ++ "VFCVTXFV", ++ "VFCVTRTZXUFV", ++ "VFCVTRTZXFV", ++ "VFCVTFXUV", ++ "VFCVTFXV", ++ "VFWCVTXUFV", ++ "VFWCVTXFV", ++ "VFWCVTRTZXUFV", ++ "VFWCVTRTZXFV", ++ "VFWCVTFXUV", ++ "VFWCVTFXV", ++ "VFWCVTFFV", ++ "VFNCVTXUFW", ++ "VFNCVTXFW", ++ "VFNCVTRTZXUFW", ++ "VFNCVTRTZXFW", ++ "VFNCVTFXUW", ++ "VFNCVTFXW", ++ "VFNCVTFFW", ++ "VFNCVTRODFFW", ++ "VREDSUMVS", ++ "VREDMAXUVS", ++ "VREDMAXVS", ++ "VREDMINUVS", ++ "VREDMINVS", ++ "VREDANDVS", ++ "VREDORVS", ++ "VREDXORVS", ++ "VWREDSUMUVS", ++ "VWREDSUMVS", ++ "VFREDOSUMVS", ++ "VFREDUSUMVS", ++ "VFREDMAXVS", ++ "VFREDMINVS", ++ "VFWREDOSUMVS", ++ "VFWREDUSUMVS", ++ "VMANDMM", ++ "VMNANDMM", ++ "VMANDNMM", ++ "VMXORMM", ++ "VMORMM", ++ "VMNORMM", ++ "VMORNMM", ++ "VMXNORMM", ++ "VCPOPM", ++ "VFIRSTM", ++ "VMSBFM", ++ "VMSIFM", ++ "VMSOFM", ++ "VIOTAM", ++ "VIDV", ++ "VMVXS", ++ "VMVSX", ++ "VFMVFS", ++ "VFMVSF", ++ "VSLIDEUPVX", ++ "VSLIDEUPVI", ++ "VSLIDEDOWNVX", ++ "VSLIDEDOWNVI", ++ "VSLIDE1UPVX", ++ "VFSLIDE1UPVF", ++ "VSLIDE1DOWNVX", ++ "VFSLIDE1DOWNVF", ++ "VRGATHERVV", ++ "VRGATHEREI16VV", ++ "VRGATHERVX", ++ "VRGATHERVI", ++ "VCOMPRESSVM", ++ "VMV1RV", ++ "VMV2RV", ++ "VMV4RV", ++ "VMV8RV", + "WORD", + "BEQZ", + "BGEZ", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 07d5ccff87..8b620b8646 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -619,6 +619,497 @@ const ( + ABSET + ABSETI + ++ // ++ // RISC-V Vector ISA-extension (1.0) (Unprivileged 20240411) ++ // ++ ++ // 31.6. Configuration-Setting Instructions ++ AVSETVLI ++ AVSETIVLI ++ AVSETVL ++ ++ // 31.7.4. 
Vector Unit-Stride Instructions ++ AVLE8V ++ AVLE16V ++ AVLE32V ++ AVLE64V ++ AVSE8V ++ AVSE16V ++ AVSE32V ++ AVSE64V ++ AVLMV ++ AVSMV ++ ++ // 31.7.5. Vector Strided Instructions ++ AVLSE8V ++ AVLSE16V ++ AVLSE32V ++ AVLSE64V ++ AVSSE8V ++ AVSSE16V ++ AVSSE32V ++ AVSSE64V ++ ++ // 31.7.6. Vector Indexed Instructions ++ AVLUXEI8V ++ AVLUXEI16V ++ AVLUXEI32V ++ AVLUXEI64V ++ AVLOXEI8V ++ AVLOXEI16V ++ AVLOXEI32V ++ AVLOXEI64V ++ AVSUXEI8V ++ AVSUXEI16V ++ AVSUXEI32V ++ AVSUXEI64V ++ AVSOXEI8V ++ AVSOXEI16V ++ AVSOXEI32V ++ AVSOXEI64V ++ ++ // 31.7.7. Unit-stride Fault-Only-First Loads ++ AVLE8FFV ++ AVLE16FFV ++ AVLE32FFV ++ AVLE64FFV ++ ++ // 31.7.9. Vector Load/Store Whole Register Instructions ++ AVL1RE8V ++ AVL1RE16V ++ AVL1RE32V ++ AVL1RE64V ++ AVL2RE8V ++ AVL2RE16V ++ AVL2RE32V ++ AVL2RE64V ++ AVL4RE8V ++ AVL4RE16V ++ AVL4RE32V ++ AVL4RE64V ++ AVL8RE8V ++ AVL8RE16V ++ AVL8RE32V ++ AVL8RE64V ++ AVS1RV ++ AVS2RV ++ AVS4RV ++ AVS8RV ++ ++ // 31.11.1. Vector Single-Width Integer Add and Subtract ++ AVADDVV ++ AVADDVX ++ AVADDVI ++ AVSUBVV ++ AVSUBVX ++ AVRSUBVX ++ AVRSUBVI ++ ++ // 31.11.2. Vector Widening Integer Add/Subtract ++ AVWADDUVV ++ AVWADDUVX ++ AVWSUBUVV ++ AVWSUBUVX ++ AVWADDVV ++ AVWADDVX ++ AVWSUBVV ++ AVWSUBVX ++ AVWADDUWV ++ AVWADDUWX ++ AVWSUBUWV ++ AVWSUBUWX ++ AVWADDWV ++ AVWADDWX ++ AVWSUBWV ++ AVWSUBWX ++ ++ // 31.11.3. Vector Integer Extension ++ AVZEXTVF2 ++ AVSEXTVF2 ++ AVZEXTVF4 ++ AVSEXTVF4 ++ AVZEXTVF8 ++ AVSEXTVF8 ++ ++ // 31.11.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions ++ AVADCVVM ++ AVADCVXM ++ AVADCVIM ++ AVMADCVVM ++ AVMADCVXM ++ AVMADCVIM ++ AVMADCVV ++ AVMADCVX ++ AVMADCVI ++ AVSBCVVM ++ AVSBCVXM ++ AVMSBCVVM ++ AVMSBCVXM ++ AVMSBCVV ++ AVMSBCVX ++ ++ // 31.11.5. Vector Bitwise Logical Instructions ++ AVANDVV ++ AVANDVX ++ AVANDVI ++ AVORVV ++ AVORVX ++ AVORVI ++ AVXORVV ++ AVXORVX ++ AVXORVI ++ ++ // 31.11.6. Vector Single-Width Shift Instructions ++ AVSLLVV ++ AVSLLVX ++ AVSLLVI ++ AVSRLVV ++ AVSRLVX ++ AVSRLVI ++ AVSRAVV ++ AVSRAVX ++ AVSRAVI ++ ++ // 31.11.7. Vector Narrowing Integer Right Shift Instructions ++ AVNSRLWV ++ AVNSRLWX ++ AVNSRLWI ++ AVNSRAWV ++ AVNSRAWX ++ AVNSRAWI ++ ++ // 31.11.8. Vector Integer Compare Instructions ++ AVMSEQVV ++ AVMSEQVX ++ AVMSEQVI ++ AVMSNEVV ++ AVMSNEVX ++ AVMSNEVI ++ AVMSLTUVV ++ AVMSLTUVX ++ AVMSLTVV ++ AVMSLTVX ++ AVMSLEUVV ++ AVMSLEUVX ++ AVMSLEUVI ++ AVMSLEVV ++ AVMSLEVX ++ AVMSLEVI ++ AVMSGTUVX ++ AVMSGTUVI ++ AVMSGTVX ++ AVMSGTVI ++ ++ // 31.11.9. Vector Integer Min/Max Instructions ++ AVMINUVV ++ AVMINUVX ++ AVMINVV ++ AVMINVX ++ AVMAXUVV ++ AVMAXUVX ++ AVMAXVV ++ AVMAXVX ++ ++ // 31.11.10. Vector Single-Width Integer Multiply Instructions ++ AVMULVV ++ AVMULVX ++ AVMULHVV ++ AVMULHVX ++ AVMULHUVV ++ AVMULHUVX ++ AVMULHSUVV ++ AVMULHSUVX ++ ++ // 31.11.11. Vector Integer Divide Instructions ++ AVDIVUVV ++ AVDIVUVX ++ AVDIVVV ++ AVDIVVX ++ AVREMUVV ++ AVREMUVX ++ AVREMVV ++ AVREMVX ++ ++ // 31.11.12. Vector Widening Integer Multiply Instructions ++ AVWMULVV ++ AVWMULVX ++ AVWMULUVV ++ AVWMULUVX ++ AVWMULSUVV ++ AVWMULSUVX ++ ++ // 31.11.13. Vector Single-Width Integer Multiply-Add Instructions ++ AVMACCVV ++ AVMACCVX ++ AVNMSACVV ++ AVNMSACVX ++ AVMADDVV ++ AVMADDVX ++ AVNMSUBVV ++ AVNMSUBVX ++ ++ // 31.11.14. Vector Widening Integer Multiply-Add Instructions ++ AVWMACCUVV ++ AVWMACCUVX ++ AVWMACCVV ++ AVWMACCVX ++ AVWMACCSUVV ++ AVWMACCSUVX ++ AVWMACCUSVX ++ ++ // 31.11.15. Vector Integer Merge Instructions ++ AVMERGEVVM ++ AVMERGEVXM ++ AVMERGEVIM ++ ++ // 31.11.16. 
Vector Integer Move Instructions ++ AVMVVV ++ AVMVVX ++ AVMVVI ++ ++ // 31.12.1. Vector Single-Width Saturating Add and Subtract ++ AVSADDUVV ++ AVSADDUVX ++ AVSADDUVI ++ AVSADDVV ++ AVSADDVX ++ AVSADDVI ++ AVSSUBUVV ++ AVSSUBUVX ++ AVSSUBVV ++ AVSSUBVX ++ ++ // 31.12.2. Vector Single-Width Averaging Add and Subtract ++ AVAADDUVV ++ AVAADDUVX ++ AVAADDVV ++ AVAADDVX ++ AVASUBUVV ++ AVASUBUVX ++ AVASUBVV ++ AVASUBVX ++ ++ // 31.12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation ++ AVSMULVV ++ AVSMULVX ++ ++ // 31.12.4. Vector Single-Width Scaling Shift Instructions ++ AVSSRLVV ++ AVSSRLVX ++ AVSSRLVI ++ AVSSRAVV ++ AVSSRAVX ++ AVSSRAVI ++ ++ // 31.12.5. Vector Narrowing Fixed-Point Clip Instructions ++ AVNCLIPUWV ++ AVNCLIPUWX ++ AVNCLIPUWI ++ AVNCLIPWV ++ AVNCLIPWX ++ AVNCLIPWI ++ ++ // 31.13.2. Vector Single-Width Floating-Point Add/Subtract Instructions ++ AVFADDVV ++ AVFADDVF ++ AVFSUBVV ++ AVFSUBVF ++ AVFRSUBVF ++ ++ // 31.13.3. Vector Widening Floating-Point Add/Subtract Instructions ++ AVFWADDVV ++ AVFWADDVF ++ AVFWSUBVV ++ AVFWSUBVF ++ AVFWADDWV ++ AVFWADDWF ++ AVFWSUBWV ++ AVFWSUBWF ++ ++ // 31.13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions ++ AVFMULVV ++ AVFMULVF ++ AVFDIVVV ++ AVFDIVVF ++ AVFRDIVVF ++ ++ // 31.13.5. Vector Widening Floating-Point Multiply ++ AVFWMULVV ++ AVFWMULVF ++ ++ // 31.13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions ++ AVFMACCVV ++ AVFMACCVF ++ AVFNMACCVV ++ AVFNMACCVF ++ AVFMSACVV ++ AVFMSACVF ++ AVFNMSACVV ++ AVFNMSACVF ++ AVFMADDVV ++ AVFMADDVF ++ AVFNMADDVV ++ AVFNMADDVF ++ AVFMSUBVV ++ AVFMSUBVF ++ AVFNMSUBVV ++ AVFNMSUBVF ++ ++ // 31.13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions ++ AVFWMACCVV ++ AVFWMACCVF ++ AVFWNMACCVV ++ AVFWNMACCVF ++ AVFWMSACVV ++ AVFWMSACVF ++ AVFWNMSACVV ++ AVFWNMSACVF ++ ++ // 31.13.8. Vector Floating-Point Square-Root Instruction ++ AVFSQRTV ++ ++ // 31.13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction ++ AVFRSQRT7V ++ ++ // 31.13.10. Vector Floating-Point Reciprocal Estimate Instruction ++ AVFREC7V ++ ++ // 31.13.11. Vector Floating-Point MIN/MAX Instructions ++ AVFMINVV ++ AVFMINVF ++ AVFMAXVV ++ AVFMAXVF ++ ++ // 31.13.12. Vector Floating-Point Sign-Injection Instructions ++ AVFSGNJVV ++ AVFSGNJVF ++ AVFSGNJNVV ++ AVFSGNJNVF ++ AVFSGNJXVV ++ AVFSGNJXVF ++ ++ // 31.13.13. Vector Floating-Point Compare Instructions ++ AVMFEQVV ++ AVMFEQVF ++ AVMFNEVV ++ AVMFNEVF ++ AVMFLTVV ++ AVMFLTVF ++ AVMFLEVV ++ AVMFLEVF ++ AVMFGTVF ++ AVMFGEVF ++ ++ // 31.13.14. Vector Floating-Point Classify Instruction ++ AVFCLASSV ++ ++ // 31.13.15. Vector Floating-Point Merge Instruction ++ AVFMERGEVFM ++ ++ // 31.13.16. Vector Floating-Point Move Instruction ++ AVFMVVF ++ ++ // 31.13.17. Single-Width Floating-Point/Integer Type-Convert Instructions ++ AVFCVTXUFV ++ AVFCVTXFV ++ AVFCVTRTZXUFV ++ AVFCVTRTZXFV ++ AVFCVTFXUV ++ AVFCVTFXV ++ ++ // 31.13.18. Widening Floating-Point/Integer Type-Convert Instructions ++ AVFWCVTXUFV ++ AVFWCVTXFV ++ AVFWCVTRTZXUFV ++ AVFWCVTRTZXFV ++ AVFWCVTFXUV ++ AVFWCVTFXV ++ AVFWCVTFFV ++ ++ // 31.13.19. Narrowing Floating-Point/Integer Type-Convert Instructions ++ AVFNCVTXUFW ++ AVFNCVTXFW ++ AVFNCVTRTZXUFW ++ AVFNCVTRTZXFW ++ AVFNCVTFXUW ++ AVFNCVTFXW ++ AVFNCVTFFW ++ AVFNCVTRODFFW ++ ++ // 31.14.1. Vector Single-Width Integer Reduction Instructions ++ AVREDSUMVS ++ AVREDMAXUVS ++ AVREDMAXVS ++ AVREDMINUVS ++ AVREDMINVS ++ AVREDANDVS ++ AVREDORVS ++ AVREDXORVS ++ ++ // 31.14.2. 
Vector Widening Integer Reduction Instructions ++ AVWREDSUMUVS ++ AVWREDSUMVS ++ ++ // 31.14.3. Vector Single-Width Floating-Point Reduction Instructions ++ AVFREDOSUMVS ++ AVFREDUSUMVS ++ AVFREDMAXVS ++ AVFREDMINVS ++ ++ // 31.14.4. Vector Widening Floating-Point Reduction Instructions ++ AVFWREDOSUMVS ++ AVFWREDUSUMVS ++ ++ // 31.15. Vector Mask Instructions ++ AVMANDMM ++ AVMNANDMM ++ AVMANDNMM ++ AVMXORMM ++ AVMORMM ++ AVMNORMM ++ AVMORNMM ++ AVMXNORMM ++ AVCPOPM ++ AVFIRSTM ++ AVMSBFM ++ AVMSIFM ++ AVMSOFM ++ AVIOTAM ++ AVIDV ++ ++ // 31.16.1. Integer Scalar Move Instructions ++ AVMVXS ++ AVMVSX ++ ++ // 31.16.2. Floating-Point Scalar Move Instructions ++ AVFMVFS ++ AVFMVSF ++ ++ // 31.16.3. Vector Slide Instructions ++ AVSLIDEUPVX ++ AVSLIDEUPVI ++ AVSLIDEDOWNVX ++ AVSLIDEDOWNVI ++ AVSLIDE1UPVX ++ AVFSLIDE1UPVF ++ AVSLIDE1DOWNVX ++ AVFSLIDE1DOWNVF ++ ++ // 31.16.4. Vector Register Gather Instructions ++ AVRGATHERVV ++ AVRGATHEREI16VV ++ AVRGATHERVX ++ AVRGATHERVI ++ ++ // 31.16.5. Vector Compress Instruction ++ AVCOMPRESSVM ++ ++ // 31.16.6. Whole Vector Register Move ++ AVMV1RV ++ AVMV2RV ++ AVMV4RV ++ AVMV8RV ++ + // The escape hatch. Inserts a single 32-bit word. + AWORD + +diff --git a/src/cmd/internal/obj/riscv/inst.go b/src/cmd/internal/obj/riscv/inst.go +index 223ddd15b2..c264f6ae15 100644 +--- a/src/cmd/internal/obj/riscv/inst.go ++++ b/src/cmd/internal/obj/riscv/inst.go +@@ -1,4 +1,4 @@ +-// Code generated by ./parse.py -go rv64_a rv64_d rv64_f rv64_i rv64_m rv64_q rv64_zba rv64_zbb rv64_zbs rv_a rv_d rv_f rv_i rv_m rv_q rv_zba rv_zbb rv_zbs rv_s rv_system rv_zicsr; DO NOT EDIT. ++// Code generated by ./parse.py -go rv64_a rv64_d rv64_f rv64_i rv64_m rv64_q rv64_zba rv64_zbb rv64_zbs rv_a rv_d rv_f rv_i rv_m rv_q rv_s rv_system rv_v rv_zba rv_zbb rv_zbs rv_zicsr; DO NOT EDIT. 
+ package riscv + + import "cmd/internal/obj" +@@ -6,6 +6,7 @@ import "cmd/internal/obj" + type inst struct { + opcode uint32 + funct3 uint32 ++ rs1 uint32 + rs2 uint32 + csr int64 + funct7 uint32 +@@ -14,507 +15,1257 @@ type inst struct { + func encode(a obj.As) *inst { + switch a { + case AADD: +- return &inst{0x33, 0x0, 0x0, 0, 0x0} ++ return &inst{0x33, 0x0, 0x0, 0x0, 0, 0x0} + case AADDUW: +- return &inst{0x3b, 0x0, 0x0, 128, 0x4} ++ return &inst{0x3b, 0x0, 0x0, 0x0, 128, 0x4} + case AADDI: +- return &inst{0x13, 0x0, 0x0, 0, 0x0} ++ return &inst{0x13, 0x0, 0x0, 0x0, 0, 0x0} + case AADDIW: +- return &inst{0x1b, 0x0, 0x0, 0, 0x0} ++ return &inst{0x1b, 0x0, 0x0, 0x0, 0, 0x0} + case AADDW: +- return &inst{0x3b, 0x0, 0x0, 0, 0x0} ++ return &inst{0x3b, 0x0, 0x0, 0x0, 0, 0x0} + case AAMOADDD: +- return &inst{0x2f, 0x3, 0x0, 0, 0x0} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 0, 0x0} + case AAMOADDW: +- return &inst{0x2f, 0x2, 0x0, 0, 0x0} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 0, 0x0} + case AAMOANDD: +- return &inst{0x2f, 0x3, 0x0, 1536, 0x30} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 1536, 0x30} + case AAMOANDW: +- return &inst{0x2f, 0x2, 0x0, 1536, 0x30} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 1536, 0x30} + case AAMOMAXD: +- return &inst{0x2f, 0x3, 0x0, -1536, 0x50} ++ return &inst{0x2f, 0x3, 0x0, 0x0, -1536, 0x50} + case AAMOMAXW: +- return &inst{0x2f, 0x2, 0x0, -1536, 0x50} ++ return &inst{0x2f, 0x2, 0x0, 0x0, -1536, 0x50} + case AAMOMAXUD: +- return &inst{0x2f, 0x3, 0x0, -512, 0x70} ++ return &inst{0x2f, 0x3, 0x0, 0x0, -512, 0x70} + case AAMOMAXUW: +- return &inst{0x2f, 0x2, 0x0, -512, 0x70} ++ return &inst{0x2f, 0x2, 0x0, 0x0, -512, 0x70} + case AAMOMIND: +- return &inst{0x2f, 0x3, 0x0, -2048, 0x40} ++ return &inst{0x2f, 0x3, 0x0, 0x0, -2048, 0x40} + case AAMOMINW: +- return &inst{0x2f, 0x2, 0x0, -2048, 0x40} ++ return &inst{0x2f, 0x2, 0x0, 0x0, -2048, 0x40} + case AAMOMINUD: +- return &inst{0x2f, 0x3, 0x0, -1024, 0x60} ++ return &inst{0x2f, 0x3, 0x0, 0x0, -1024, 0x60} + case AAMOMINUW: +- return &inst{0x2f, 0x2, 0x0, -1024, 0x60} ++ return &inst{0x2f, 0x2, 0x0, 0x0, -1024, 0x60} + case AAMOORD: +- return &inst{0x2f, 0x3, 0x0, 1024, 0x20} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 1024, 0x20} + case AAMOORW: +- return &inst{0x2f, 0x2, 0x0, 1024, 0x20} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 1024, 0x20} + case AAMOSWAPD: +- return &inst{0x2f, 0x3, 0x0, 128, 0x4} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 128, 0x4} + case AAMOSWAPW: +- return &inst{0x2f, 0x2, 0x0, 128, 0x4} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 128, 0x4} + case AAMOXORD: +- return &inst{0x2f, 0x3, 0x0, 512, 0x10} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 512, 0x10} + case AAMOXORW: +- return &inst{0x2f, 0x2, 0x0, 512, 0x10} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 512, 0x10} + case AAND: +- return &inst{0x33, 0x7, 0x0, 0, 0x0} ++ return &inst{0x33, 0x7, 0x0, 0x0, 0, 0x0} + case AANDI: +- return &inst{0x13, 0x7, 0x0, 0, 0x0} ++ return &inst{0x13, 0x7, 0x0, 0x0, 0, 0x0} + case AANDN: +- return &inst{0x33, 0x7, 0x0, 1024, 0x20} ++ return &inst{0x33, 0x7, 0x0, 0x0, 1024, 0x20} + case AAUIPC: +- return &inst{0x17, 0x0, 0x0, 0, 0x0} ++ return &inst{0x17, 0x0, 0x0, 0x0, 0, 0x0} + case ABCLR: +- return &inst{0x33, 0x1, 0x0, 1152, 0x24} ++ return &inst{0x33, 0x1, 0x0, 0x0, 1152, 0x24} + case ABCLRI: +- return &inst{0x13, 0x1, 0x0, 1152, 0x24} ++ return &inst{0x13, 0x1, 0x0, 0x0, 1152, 0x24} + case ABEQ: +- return &inst{0x63, 0x0, 0x0, 0, 0x0} ++ return &inst{0x63, 0x0, 0x0, 0x0, 0, 0x0} + case ABEXT: +- return &inst{0x33, 0x5, 0x0, 1152, 0x24} ++ return &inst{0x33, 0x5, 
0x0, 0x0, 1152, 0x24} + case ABEXTI: +- return &inst{0x13, 0x5, 0x0, 1152, 0x24} ++ return &inst{0x13, 0x5, 0x0, 0x0, 1152, 0x24} + case ABGE: +- return &inst{0x63, 0x5, 0x0, 0, 0x0} ++ return &inst{0x63, 0x5, 0x0, 0x0, 0, 0x0} + case ABGEU: +- return &inst{0x63, 0x7, 0x0, 0, 0x0} ++ return &inst{0x63, 0x7, 0x0, 0x0, 0, 0x0} + case ABINV: +- return &inst{0x33, 0x1, 0x0, 1664, 0x34} ++ return &inst{0x33, 0x1, 0x0, 0x0, 1664, 0x34} + case ABINVI: +- return &inst{0x13, 0x1, 0x0, 1664, 0x34} ++ return &inst{0x13, 0x1, 0x0, 0x0, 1664, 0x34} + case ABLT: +- return &inst{0x63, 0x4, 0x0, 0, 0x0} ++ return &inst{0x63, 0x4, 0x0, 0x0, 0, 0x0} + case ABLTU: +- return &inst{0x63, 0x6, 0x0, 0, 0x0} ++ return &inst{0x63, 0x6, 0x0, 0x0, 0, 0x0} + case ABNE: +- return &inst{0x63, 0x1, 0x0, 0, 0x0} ++ return &inst{0x63, 0x1, 0x0, 0x0, 0, 0x0} + case ABSET: +- return &inst{0x33, 0x1, 0x0, 640, 0x14} ++ return &inst{0x33, 0x1, 0x0, 0x0, 640, 0x14} + case ABSETI: +- return &inst{0x13, 0x1, 0x0, 640, 0x14} ++ return &inst{0x13, 0x1, 0x0, 0x0, 640, 0x14} + case ACLZ: +- return &inst{0x13, 0x1, 0x0, 1536, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x0, 1536, 0x30} + case ACLZW: +- return &inst{0x1b, 0x1, 0x0, 1536, 0x30} ++ return &inst{0x1b, 0x1, 0x0, 0x0, 1536, 0x30} + case ACPOP: +- return &inst{0x13, 0x1, 0x2, 1538, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x2, 1538, 0x30} + case ACPOPW: +- return &inst{0x1b, 0x1, 0x2, 1538, 0x30} ++ return &inst{0x1b, 0x1, 0x0, 0x2, 1538, 0x30} + case ACSRRC: +- return &inst{0x73, 0x3, 0x0, 0, 0x0} ++ return &inst{0x73, 0x3, 0x0, 0x0, 0, 0x0} + case ACSRRCI: +- return &inst{0x73, 0x7, 0x0, 0, 0x0} ++ return &inst{0x73, 0x7, 0x0, 0x0, 0, 0x0} + case ACSRRS: +- return &inst{0x73, 0x2, 0x0, 0, 0x0} ++ return &inst{0x73, 0x2, 0x0, 0x0, 0, 0x0} + case ACSRRSI: +- return &inst{0x73, 0x6, 0x0, 0, 0x0} ++ return &inst{0x73, 0x6, 0x0, 0x0, 0, 0x0} + case ACSRRW: +- return &inst{0x73, 0x1, 0x0, 0, 0x0} ++ return &inst{0x73, 0x1, 0x0, 0x0, 0, 0x0} + case ACSRRWI: +- return &inst{0x73, 0x5, 0x0, 0, 0x0} ++ return &inst{0x73, 0x5, 0x0, 0x0, 0, 0x0} + case ACTZ: +- return &inst{0x13, 0x1, 0x1, 1537, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x1, 1537, 0x30} + case ACTZW: +- return &inst{0x1b, 0x1, 0x1, 1537, 0x30} ++ return &inst{0x1b, 0x1, 0x0, 0x1, 1537, 0x30} + case ADIV: +- return &inst{0x33, 0x4, 0x0, 32, 0x1} ++ return &inst{0x33, 0x4, 0x0, 0x0, 32, 0x1} + case ADIVU: +- return &inst{0x33, 0x5, 0x0, 32, 0x1} ++ return &inst{0x33, 0x5, 0x0, 0x0, 32, 0x1} + case ADIVUW: +- return &inst{0x3b, 0x5, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x5, 0x0, 0x0, 32, 0x1} + case ADIVW: +- return &inst{0x3b, 0x4, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x4, 0x0, 0x0, 32, 0x1} + case AEBREAK: +- return &inst{0x73, 0x0, 0x1, 1, 0x0} ++ return &inst{0x73, 0x0, 0x0, 0x1, 1, 0x0} + case AECALL: +- return &inst{0x73, 0x0, 0x0, 0, 0x0} ++ return &inst{0x73, 0x0, 0x0, 0x0, 0, 0x0} + case AFADDD: +- return &inst{0x53, 0x0, 0x0, 32, 0x1} ++ return &inst{0x53, 0x0, 0x0, 0x0, 32, 0x1} + case AFADDQ: +- return &inst{0x53, 0x0, 0x0, 96, 0x3} ++ return &inst{0x53, 0x0, 0x0, 0x0, 96, 0x3} + case AFADDS: +- return &inst{0x53, 0x0, 0x0, 0, 0x0} ++ return &inst{0x53, 0x0, 0x0, 0x0, 0, 0x0} + case AFCLASSD: +- return &inst{0x53, 0x1, 0x0, -480, 0x71} ++ return &inst{0x53, 0x1, 0x0, 0x0, -480, 0x71} + case AFCLASSQ: +- return &inst{0x53, 0x1, 0x0, -416, 0x73} ++ return &inst{0x53, 0x1, 0x0, 0x0, -416, 0x73} + case AFCLASSS: +- return &inst{0x53, 0x1, 0x0, -512, 0x70} ++ return &inst{0x53, 0x1, 0x0, 0x0, -512, 0x70} + case AFCVTDL: +- 
return &inst{0x53, 0x0, 0x2, -734, 0x69} ++ return &inst{0x53, 0x0, 0x0, 0x2, -734, 0x69} + case AFCVTDLU: +- return &inst{0x53, 0x0, 0x3, -733, 0x69} ++ return &inst{0x53, 0x0, 0x0, 0x3, -733, 0x69} + case AFCVTDQ: +- return &inst{0x53, 0x0, 0x3, 1059, 0x21} ++ return &inst{0x53, 0x0, 0x0, 0x3, 1059, 0x21} + case AFCVTDS: +- return &inst{0x53, 0x0, 0x0, 1056, 0x21} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1056, 0x21} + case AFCVTDW: +- return &inst{0x53, 0x0, 0x0, -736, 0x69} ++ return &inst{0x53, 0x0, 0x0, 0x0, -736, 0x69} + case AFCVTDWU: +- return &inst{0x53, 0x0, 0x1, -735, 0x69} ++ return &inst{0x53, 0x0, 0x0, 0x1, -735, 0x69} + case AFCVTLD: +- return &inst{0x53, 0x0, 0x2, -990, 0x61} ++ return &inst{0x53, 0x0, 0x0, 0x2, -990, 0x61} + case AFCVTLQ: +- return &inst{0x53, 0x0, 0x2, -926, 0x63} ++ return &inst{0x53, 0x0, 0x0, 0x2, -926, 0x63} + case AFCVTLS: +- return &inst{0x53, 0x0, 0x2, -1022, 0x60} ++ return &inst{0x53, 0x0, 0x0, 0x2, -1022, 0x60} + case AFCVTLUD: +- return &inst{0x53, 0x0, 0x3, -989, 0x61} ++ return &inst{0x53, 0x0, 0x0, 0x3, -989, 0x61} + case AFCVTLUQ: +- return &inst{0x53, 0x0, 0x3, -925, 0x63} ++ return &inst{0x53, 0x0, 0x0, 0x3, -925, 0x63} + case AFCVTLUS: +- return &inst{0x53, 0x0, 0x3, -1021, 0x60} ++ return &inst{0x53, 0x0, 0x0, 0x3, -1021, 0x60} + case AFCVTQD: +- return &inst{0x53, 0x0, 0x1, 1121, 0x23} ++ return &inst{0x53, 0x0, 0x0, 0x1, 1121, 0x23} + case AFCVTQL: +- return &inst{0x53, 0x0, 0x2, -670, 0x6b} ++ return &inst{0x53, 0x0, 0x0, 0x2, -670, 0x6b} + case AFCVTQLU: +- return &inst{0x53, 0x0, 0x3, -669, 0x6b} ++ return &inst{0x53, 0x0, 0x0, 0x3, -669, 0x6b} + case AFCVTQS: +- return &inst{0x53, 0x0, 0x0, 1120, 0x23} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1120, 0x23} + case AFCVTQW: +- return &inst{0x53, 0x0, 0x0, -672, 0x6b} ++ return &inst{0x53, 0x0, 0x0, 0x0, -672, 0x6b} + case AFCVTQWU: +- return &inst{0x53, 0x0, 0x1, -671, 0x6b} ++ return &inst{0x53, 0x0, 0x0, 0x1, -671, 0x6b} + case AFCVTSD: +- return &inst{0x53, 0x0, 0x1, 1025, 0x20} ++ return &inst{0x53, 0x0, 0x0, 0x1, 1025, 0x20} + case AFCVTSL: +- return &inst{0x53, 0x0, 0x2, -766, 0x68} ++ return &inst{0x53, 0x0, 0x0, 0x2, -766, 0x68} + case AFCVTSLU: +- return &inst{0x53, 0x0, 0x3, -765, 0x68} ++ return &inst{0x53, 0x0, 0x0, 0x3, -765, 0x68} + case AFCVTSQ: +- return &inst{0x53, 0x0, 0x3, 1027, 0x20} ++ return &inst{0x53, 0x0, 0x0, 0x3, 1027, 0x20} + case AFCVTSW: +- return &inst{0x53, 0x0, 0x0, -768, 0x68} ++ return &inst{0x53, 0x0, 0x0, 0x0, -768, 0x68} + case AFCVTSWU: +- return &inst{0x53, 0x0, 0x1, -767, 0x68} ++ return &inst{0x53, 0x0, 0x0, 0x1, -767, 0x68} + case AFCVTWD: +- return &inst{0x53, 0x0, 0x0, -992, 0x61} ++ return &inst{0x53, 0x0, 0x0, 0x0, -992, 0x61} + case AFCVTWQ: +- return &inst{0x53, 0x0, 0x0, -928, 0x63} ++ return &inst{0x53, 0x0, 0x0, 0x0, -928, 0x63} + case AFCVTWS: +- return &inst{0x53, 0x0, 0x0, -1024, 0x60} ++ return &inst{0x53, 0x0, 0x0, 0x0, -1024, 0x60} + case AFCVTWUD: +- return &inst{0x53, 0x0, 0x1, -991, 0x61} ++ return &inst{0x53, 0x0, 0x0, 0x1, -991, 0x61} + case AFCVTWUQ: +- return &inst{0x53, 0x0, 0x1, -927, 0x63} ++ return &inst{0x53, 0x0, 0x0, 0x1, -927, 0x63} + case AFCVTWUS: +- return &inst{0x53, 0x0, 0x1, -1023, 0x60} ++ return &inst{0x53, 0x0, 0x0, 0x1, -1023, 0x60} + case AFDIVD: +- return &inst{0x53, 0x0, 0x0, 416, 0xd} ++ return &inst{0x53, 0x0, 0x0, 0x0, 416, 0xd} + case AFDIVQ: +- return &inst{0x53, 0x0, 0x0, 480, 0xf} ++ return &inst{0x53, 0x0, 0x0, 0x0, 480, 0xf} + case AFDIVS: +- return &inst{0x53, 0x0, 0x0, 384, 0xc} ++ return 
&inst{0x53, 0x0, 0x0, 0x0, 384, 0xc} + case AFENCE: +- return &inst{0xf, 0x0, 0x0, 0, 0x0} ++ return &inst{0xf, 0x0, 0x0, 0x0, 0, 0x0} + case AFENCETSO: +- return &inst{0xf, 0x0, 0x13, -1997, 0x41} ++ return &inst{0xf, 0x0, 0x0, 0x13, -1997, 0x41} + case AFEQD: +- return &inst{0x53, 0x2, 0x0, -1504, 0x51} ++ return &inst{0x53, 0x2, 0x0, 0x0, -1504, 0x51} + case AFEQQ: +- return &inst{0x53, 0x2, 0x0, -1440, 0x53} ++ return &inst{0x53, 0x2, 0x0, 0x0, -1440, 0x53} + case AFEQS: +- return &inst{0x53, 0x2, 0x0, -1536, 0x50} ++ return &inst{0x53, 0x2, 0x0, 0x0, -1536, 0x50} + case AFLD: +- return &inst{0x7, 0x3, 0x0, 0, 0x0} ++ return &inst{0x7, 0x3, 0x0, 0x0, 0, 0x0} + case AFLED: +- return &inst{0x53, 0x0, 0x0, -1504, 0x51} ++ return &inst{0x53, 0x0, 0x0, 0x0, -1504, 0x51} + case AFLEQ: +- return &inst{0x53, 0x0, 0x0, -1440, 0x53} ++ return &inst{0x53, 0x0, 0x0, 0x0, -1440, 0x53} + case AFLES: +- return &inst{0x53, 0x0, 0x0, -1536, 0x50} ++ return &inst{0x53, 0x0, 0x0, 0x0, -1536, 0x50} + case AFLQ: +- return &inst{0x7, 0x4, 0x0, 0, 0x0} ++ return &inst{0x7, 0x4, 0x0, 0x0, 0, 0x0} + case AFLTD: +- return &inst{0x53, 0x1, 0x0, -1504, 0x51} ++ return &inst{0x53, 0x1, 0x0, 0x0, -1504, 0x51} + case AFLTQ: +- return &inst{0x53, 0x1, 0x0, -1440, 0x53} ++ return &inst{0x53, 0x1, 0x0, 0x0, -1440, 0x53} + case AFLTS: +- return &inst{0x53, 0x1, 0x0, -1536, 0x50} ++ return &inst{0x53, 0x1, 0x0, 0x0, -1536, 0x50} + case AFLW: +- return &inst{0x7, 0x2, 0x0, 0, 0x0} ++ return &inst{0x7, 0x2, 0x0, 0x0, 0, 0x0} + case AFMADDD: +- return &inst{0x43, 0x0, 0x0, 32, 0x1} ++ return &inst{0x43, 0x0, 0x0, 0x0, 32, 0x1} + case AFMADDQ: +- return &inst{0x43, 0x0, 0x0, 96, 0x3} ++ return &inst{0x43, 0x0, 0x0, 0x0, 96, 0x3} + case AFMADDS: +- return &inst{0x43, 0x0, 0x0, 0, 0x0} ++ return &inst{0x43, 0x0, 0x0, 0x0, 0, 0x0} + case AFMAXD: +- return &inst{0x53, 0x1, 0x0, 672, 0x15} ++ return &inst{0x53, 0x1, 0x0, 0x0, 672, 0x15} + case AFMAXQ: +- return &inst{0x53, 0x1, 0x0, 736, 0x17} ++ return &inst{0x53, 0x1, 0x0, 0x0, 736, 0x17} + case AFMAXS: +- return &inst{0x53, 0x1, 0x0, 640, 0x14} ++ return &inst{0x53, 0x1, 0x0, 0x0, 640, 0x14} + case AFMIND: +- return &inst{0x53, 0x0, 0x0, 672, 0x15} ++ return &inst{0x53, 0x0, 0x0, 0x0, 672, 0x15} + case AFMINQ: +- return &inst{0x53, 0x0, 0x0, 736, 0x17} ++ return &inst{0x53, 0x0, 0x0, 0x0, 736, 0x17} + case AFMINS: +- return &inst{0x53, 0x0, 0x0, 640, 0x14} ++ return &inst{0x53, 0x0, 0x0, 0x0, 640, 0x14} + case AFMSUBD: +- return &inst{0x47, 0x0, 0x0, 32, 0x1} ++ return &inst{0x47, 0x0, 0x0, 0x0, 32, 0x1} + case AFMSUBQ: +- return &inst{0x47, 0x0, 0x0, 96, 0x3} ++ return &inst{0x47, 0x0, 0x0, 0x0, 96, 0x3} + case AFMSUBS: +- return &inst{0x47, 0x0, 0x0, 0, 0x0} ++ return &inst{0x47, 0x0, 0x0, 0x0, 0, 0x0} + case AFMULD: +- return &inst{0x53, 0x0, 0x0, 288, 0x9} ++ return &inst{0x53, 0x0, 0x0, 0x0, 288, 0x9} + case AFMULQ: +- return &inst{0x53, 0x0, 0x0, 352, 0xb} ++ return &inst{0x53, 0x0, 0x0, 0x0, 352, 0xb} + case AFMULS: +- return &inst{0x53, 0x0, 0x0, 256, 0x8} ++ return &inst{0x53, 0x0, 0x0, 0x0, 256, 0x8} + case AFMVDX: +- return &inst{0x53, 0x0, 0x0, -224, 0x79} ++ return &inst{0x53, 0x0, 0x0, 0x0, -224, 0x79} + case AFMVSX: +- return &inst{0x53, 0x0, 0x0, -256, 0x78} ++ return &inst{0x53, 0x0, 0x0, 0x0, -256, 0x78} + case AFMVWX: +- return &inst{0x53, 0x0, 0x0, -256, 0x78} ++ return &inst{0x53, 0x0, 0x0, 0x0, -256, 0x78} + case AFMVXD: +- return &inst{0x53, 0x0, 0x0, -480, 0x71} ++ return &inst{0x53, 0x0, 0x0, 0x0, -480, 0x71} + case AFMVXS: +- return &inst{0x53, 0x0, 
0x0, -512, 0x70} ++ return &inst{0x53, 0x0, 0x0, 0x0, -512, 0x70} + case AFMVXW: +- return &inst{0x53, 0x0, 0x0, -512, 0x70} ++ return &inst{0x53, 0x0, 0x0, 0x0, -512, 0x70} + case AFNMADDD: +- return &inst{0x4f, 0x0, 0x0, 32, 0x1} ++ return &inst{0x4f, 0x0, 0x0, 0x0, 32, 0x1} + case AFNMADDQ: +- return &inst{0x4f, 0x0, 0x0, 96, 0x3} ++ return &inst{0x4f, 0x0, 0x0, 0x0, 96, 0x3} + case AFNMADDS: +- return &inst{0x4f, 0x0, 0x0, 0, 0x0} ++ return &inst{0x4f, 0x0, 0x0, 0x0, 0, 0x0} + case AFNMSUBD: +- return &inst{0x4b, 0x0, 0x0, 32, 0x1} ++ return &inst{0x4b, 0x0, 0x0, 0x0, 32, 0x1} + case AFNMSUBQ: +- return &inst{0x4b, 0x0, 0x0, 96, 0x3} ++ return &inst{0x4b, 0x0, 0x0, 0x0, 96, 0x3} + case AFNMSUBS: +- return &inst{0x4b, 0x0, 0x0, 0, 0x0} ++ return &inst{0x4b, 0x0, 0x0, 0x0, 0, 0x0} + case AFRCSR: +- return &inst{0x73, 0x2, 0x3, 3, 0x0} ++ return &inst{0x73, 0x2, 0x0, 0x3, 3, 0x0} + case AFRFLAGS: +- return &inst{0x73, 0x2, 0x1, 1, 0x0} ++ return &inst{0x73, 0x2, 0x0, 0x1, 1, 0x0} + case AFRRM: +- return &inst{0x73, 0x2, 0x2, 2, 0x0} ++ return &inst{0x73, 0x2, 0x0, 0x2, 2, 0x0} + case AFSCSR: +- return &inst{0x73, 0x1, 0x3, 3, 0x0} ++ return &inst{0x73, 0x1, 0x0, 0x3, 3, 0x0} + case AFSD: +- return &inst{0x27, 0x3, 0x0, 0, 0x0} ++ return &inst{0x27, 0x3, 0x0, 0x0, 0, 0x0} + case AFSFLAGS: +- return &inst{0x73, 0x1, 0x1, 1, 0x0} ++ return &inst{0x73, 0x1, 0x0, 0x1, 1, 0x0} + case AFSFLAGSI: +- return &inst{0x73, 0x5, 0x1, 1, 0x0} ++ return &inst{0x73, 0x5, 0x0, 0x1, 1, 0x0} + case AFSGNJD: +- return &inst{0x53, 0x0, 0x0, 544, 0x11} ++ return &inst{0x53, 0x0, 0x0, 0x0, 544, 0x11} + case AFSGNJQ: +- return &inst{0x53, 0x0, 0x0, 608, 0x13} ++ return &inst{0x53, 0x0, 0x0, 0x0, 608, 0x13} + case AFSGNJS: +- return &inst{0x53, 0x0, 0x0, 512, 0x10} ++ return &inst{0x53, 0x0, 0x0, 0x0, 512, 0x10} + case AFSGNJND: +- return &inst{0x53, 0x1, 0x0, 544, 0x11} ++ return &inst{0x53, 0x1, 0x0, 0x0, 544, 0x11} + case AFSGNJNQ: +- return &inst{0x53, 0x1, 0x0, 608, 0x13} ++ return &inst{0x53, 0x1, 0x0, 0x0, 608, 0x13} + case AFSGNJNS: +- return &inst{0x53, 0x1, 0x0, 512, 0x10} ++ return &inst{0x53, 0x1, 0x0, 0x0, 512, 0x10} + case AFSGNJXD: +- return &inst{0x53, 0x2, 0x0, 544, 0x11} ++ return &inst{0x53, 0x2, 0x0, 0x0, 544, 0x11} + case AFSGNJXQ: +- return &inst{0x53, 0x2, 0x0, 608, 0x13} ++ return &inst{0x53, 0x2, 0x0, 0x0, 608, 0x13} + case AFSGNJXS: +- return &inst{0x53, 0x2, 0x0, 512, 0x10} ++ return &inst{0x53, 0x2, 0x0, 0x0, 512, 0x10} + case AFSQ: +- return &inst{0x27, 0x4, 0x0, 0, 0x0} ++ return &inst{0x27, 0x4, 0x0, 0x0, 0, 0x0} + case AFSQRTD: +- return &inst{0x53, 0x0, 0x0, 1440, 0x2d} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1440, 0x2d} + case AFSQRTQ: +- return &inst{0x53, 0x0, 0x0, 1504, 0x2f} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1504, 0x2f} + case AFSQRTS: +- return &inst{0x53, 0x0, 0x0, 1408, 0x2c} ++ return &inst{0x53, 0x0, 0x0, 0x0, 1408, 0x2c} + case AFSRM: +- return &inst{0x73, 0x1, 0x2, 2, 0x0} ++ return &inst{0x73, 0x1, 0x0, 0x2, 2, 0x0} + case AFSRMI: +- return &inst{0x73, 0x5, 0x2, 2, 0x0} ++ return &inst{0x73, 0x5, 0x0, 0x2, 2, 0x0} + case AFSUBD: +- return &inst{0x53, 0x0, 0x0, 160, 0x5} ++ return &inst{0x53, 0x0, 0x0, 0x0, 160, 0x5} + case AFSUBQ: +- return &inst{0x53, 0x0, 0x0, 224, 0x7} ++ return &inst{0x53, 0x0, 0x0, 0x0, 224, 0x7} + case AFSUBS: +- return &inst{0x53, 0x0, 0x0, 128, 0x4} ++ return &inst{0x53, 0x0, 0x0, 0x0, 128, 0x4} + case AFSW: +- return &inst{0x27, 0x2, 0x0, 0, 0x0} ++ return &inst{0x27, 0x2, 0x0, 0x0, 0, 0x0} + case AJAL: +- return &inst{0x6f, 0x0, 0x0, 0, 0x0} 
++ return &inst{0x6f, 0x0, 0x0, 0x0, 0, 0x0} + case AJALR: +- return &inst{0x67, 0x0, 0x0, 0, 0x0} ++ return &inst{0x67, 0x0, 0x0, 0x0, 0, 0x0} + case ALB: +- return &inst{0x3, 0x0, 0x0, 0, 0x0} ++ return &inst{0x3, 0x0, 0x0, 0x0, 0, 0x0} + case ALBU: +- return &inst{0x3, 0x4, 0x0, 0, 0x0} ++ return &inst{0x3, 0x4, 0x0, 0x0, 0, 0x0} + case ALD: +- return &inst{0x3, 0x3, 0x0, 0, 0x0} ++ return &inst{0x3, 0x3, 0x0, 0x0, 0, 0x0} + case ALH: +- return &inst{0x3, 0x1, 0x0, 0, 0x0} ++ return &inst{0x3, 0x1, 0x0, 0x0, 0, 0x0} + case ALHU: +- return &inst{0x3, 0x5, 0x0, 0, 0x0} ++ return &inst{0x3, 0x5, 0x0, 0x0, 0, 0x0} + case ALRD: +- return &inst{0x2f, 0x3, 0x0, 256, 0x8} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 256, 0x8} + case ALRW: +- return &inst{0x2f, 0x2, 0x0, 256, 0x8} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 256, 0x8} + case ALUI: +- return &inst{0x37, 0x0, 0x0, 0, 0x0} ++ return &inst{0x37, 0x0, 0x0, 0x0, 0, 0x0} + case ALW: +- return &inst{0x3, 0x2, 0x0, 0, 0x0} ++ return &inst{0x3, 0x2, 0x0, 0x0, 0, 0x0} + case ALWU: +- return &inst{0x3, 0x6, 0x0, 0, 0x0} ++ return &inst{0x3, 0x6, 0x0, 0x0, 0, 0x0} + case AMAX: +- return &inst{0x33, 0x6, 0x0, 160, 0x5} ++ return &inst{0x33, 0x6, 0x0, 0x0, 160, 0x5} + case AMAXU: +- return &inst{0x33, 0x7, 0x0, 160, 0x5} ++ return &inst{0x33, 0x7, 0x0, 0x0, 160, 0x5} + case AMIN: +- return &inst{0x33, 0x4, 0x0, 160, 0x5} ++ return &inst{0x33, 0x4, 0x0, 0x0, 160, 0x5} + case AMINU: +- return &inst{0x33, 0x5, 0x0, 160, 0x5} ++ return &inst{0x33, 0x5, 0x0, 0x0, 160, 0x5} + case AMRET: +- return &inst{0x73, 0x0, 0x2, 770, 0x18} ++ return &inst{0x73, 0x0, 0x0, 0x2, 770, 0x18} + case AMUL: +- return &inst{0x33, 0x0, 0x0, 32, 0x1} ++ return &inst{0x33, 0x0, 0x0, 0x0, 32, 0x1} + case AMULH: +- return &inst{0x33, 0x1, 0x0, 32, 0x1} ++ return &inst{0x33, 0x1, 0x0, 0x0, 32, 0x1} + case AMULHSU: +- return &inst{0x33, 0x2, 0x0, 32, 0x1} ++ return &inst{0x33, 0x2, 0x0, 0x0, 32, 0x1} + case AMULHU: +- return &inst{0x33, 0x3, 0x0, 32, 0x1} ++ return &inst{0x33, 0x3, 0x0, 0x0, 32, 0x1} + case AMULW: +- return &inst{0x3b, 0x0, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x0, 0x0, 0x0, 32, 0x1} + case AOR: +- return &inst{0x33, 0x6, 0x0, 0, 0x0} ++ return &inst{0x33, 0x6, 0x0, 0x0, 0, 0x0} + case AORCB: +- return &inst{0x13, 0x5, 0x7, 647, 0x14} ++ return &inst{0x13, 0x5, 0x0, 0x7, 647, 0x14} + case AORI: +- return &inst{0x13, 0x6, 0x0, 0, 0x0} ++ return &inst{0x13, 0x6, 0x0, 0x0, 0, 0x0} + case AORN: +- return &inst{0x33, 0x6, 0x0, 1024, 0x20} ++ return &inst{0x33, 0x6, 0x0, 0x0, 1024, 0x20} + case APAUSE: +- return &inst{0xf, 0x0, 0x10, 16, 0x0} ++ return &inst{0xf, 0x0, 0x0, 0x10, 16, 0x0} + case ARDCYCLE: +- return &inst{0x73, 0x2, 0x0, -1024, 0x60} ++ return &inst{0x73, 0x2, 0x0, 0x0, -1024, 0x60} + case ARDCYCLEH: +- return &inst{0x73, 0x2, 0x0, -896, 0x64} ++ return &inst{0x73, 0x2, 0x0, 0x0, -896, 0x64} + case ARDINSTRET: +- return &inst{0x73, 0x2, 0x2, -1022, 0x60} ++ return &inst{0x73, 0x2, 0x0, 0x2, -1022, 0x60} + case ARDINSTRETH: +- return &inst{0x73, 0x2, 0x2, -894, 0x64} ++ return &inst{0x73, 0x2, 0x0, 0x2, -894, 0x64} + case ARDTIME: +- return &inst{0x73, 0x2, 0x1, -1023, 0x60} ++ return &inst{0x73, 0x2, 0x0, 0x1, -1023, 0x60} + case ARDTIMEH: +- return &inst{0x73, 0x2, 0x1, -895, 0x64} ++ return &inst{0x73, 0x2, 0x0, 0x1, -895, 0x64} + case AREM: +- return &inst{0x33, 0x6, 0x0, 32, 0x1} ++ return &inst{0x33, 0x6, 0x0, 0x0, 32, 0x1} + case AREMU: +- return &inst{0x33, 0x7, 0x0, 32, 0x1} ++ return &inst{0x33, 0x7, 0x0, 0x0, 32, 0x1} + case AREMUW: +- return &inst{0x3b, 
0x7, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x7, 0x0, 0x0, 32, 0x1} + case AREMW: +- return &inst{0x3b, 0x6, 0x0, 32, 0x1} ++ return &inst{0x3b, 0x6, 0x0, 0x0, 32, 0x1} + case AREV8: +- return &inst{0x13, 0x5, 0x18, 1720, 0x35} ++ return &inst{0x13, 0x5, 0x0, 0x18, 1720, 0x35} + case AROL: +- return &inst{0x33, 0x1, 0x0, 1536, 0x30} ++ return &inst{0x33, 0x1, 0x0, 0x0, 1536, 0x30} + case AROLW: +- return &inst{0x3b, 0x1, 0x0, 1536, 0x30} ++ return &inst{0x3b, 0x1, 0x0, 0x0, 1536, 0x30} + case AROR: +- return &inst{0x33, 0x5, 0x0, 1536, 0x30} ++ return &inst{0x33, 0x5, 0x0, 0x0, 1536, 0x30} + case ARORI: +- return &inst{0x13, 0x5, 0x0, 1536, 0x30} ++ return &inst{0x13, 0x5, 0x0, 0x0, 1536, 0x30} + case ARORIW: +- return &inst{0x1b, 0x5, 0x0, 1536, 0x30} ++ return &inst{0x1b, 0x5, 0x0, 0x0, 1536, 0x30} + case ARORW: +- return &inst{0x3b, 0x5, 0x0, 1536, 0x30} ++ return &inst{0x3b, 0x5, 0x0, 0x0, 1536, 0x30} + case ASB: +- return &inst{0x23, 0x0, 0x0, 0, 0x0} ++ return &inst{0x23, 0x0, 0x0, 0x0, 0, 0x0} + case ASBREAK: +- return &inst{0x73, 0x0, 0x1, 1, 0x0} ++ return &inst{0x73, 0x0, 0x0, 0x1, 1, 0x0} + case ASCD: +- return &inst{0x2f, 0x3, 0x0, 384, 0xc} ++ return &inst{0x2f, 0x3, 0x0, 0x0, 384, 0xc} + case ASCW: +- return &inst{0x2f, 0x2, 0x0, 384, 0xc} ++ return &inst{0x2f, 0x2, 0x0, 0x0, 384, 0xc} + case ASCALL: +- return &inst{0x73, 0x0, 0x0, 0, 0x0} ++ return &inst{0x73, 0x0, 0x0, 0x0, 0, 0x0} + case ASD: +- return &inst{0x23, 0x3, 0x0, 0, 0x0} ++ return &inst{0x23, 0x3, 0x0, 0x0, 0, 0x0} + case ASEXTB: +- return &inst{0x13, 0x1, 0x4, 1540, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x4, 1540, 0x30} + case ASEXTH: +- return &inst{0x13, 0x1, 0x5, 1541, 0x30} ++ return &inst{0x13, 0x1, 0x0, 0x5, 1541, 0x30} + case ASFENCEVMA: +- return &inst{0x73, 0x0, 0x0, 288, 0x9} ++ return &inst{0x73, 0x0, 0x0, 0x0, 288, 0x9} + case ASH: +- return &inst{0x23, 0x1, 0x0, 0, 0x0} ++ return &inst{0x23, 0x1, 0x0, 0x0, 0, 0x0} + case ASH1ADD: +- return &inst{0x33, 0x2, 0x0, 512, 0x10} ++ return &inst{0x33, 0x2, 0x0, 0x0, 512, 0x10} + case ASH1ADDUW: +- return &inst{0x3b, 0x2, 0x0, 512, 0x10} ++ return &inst{0x3b, 0x2, 0x0, 0x0, 512, 0x10} + case ASH2ADD: +- return &inst{0x33, 0x4, 0x0, 512, 0x10} ++ return &inst{0x33, 0x4, 0x0, 0x0, 512, 0x10} + case ASH2ADDUW: +- return &inst{0x3b, 0x4, 0x0, 512, 0x10} ++ return &inst{0x3b, 0x4, 0x0, 0x0, 512, 0x10} + case ASH3ADD: +- return &inst{0x33, 0x6, 0x0, 512, 0x10} ++ return &inst{0x33, 0x6, 0x0, 0x0, 512, 0x10} + case ASH3ADDUW: +- return &inst{0x3b, 0x6, 0x0, 512, 0x10} ++ return &inst{0x3b, 0x6, 0x0, 0x0, 512, 0x10} + case ASLL: +- return &inst{0x33, 0x1, 0x0, 0, 0x0} ++ return &inst{0x33, 0x1, 0x0, 0x0, 0, 0x0} + case ASLLI: +- return &inst{0x13, 0x1, 0x0, 0, 0x0} ++ return &inst{0x13, 0x1, 0x0, 0x0, 0, 0x0} + case ASLLIUW: +- return &inst{0x1b, 0x1, 0x0, 128, 0x4} ++ return &inst{0x1b, 0x1, 0x0, 0x0, 128, 0x4} + case ASLLIW: +- return &inst{0x1b, 0x1, 0x0, 0, 0x0} ++ return &inst{0x1b, 0x1, 0x0, 0x0, 0, 0x0} + case ASLLW: +- return &inst{0x3b, 0x1, 0x0, 0, 0x0} ++ return &inst{0x3b, 0x1, 0x0, 0x0, 0, 0x0} + case ASLT: +- return &inst{0x33, 0x2, 0x0, 0, 0x0} ++ return &inst{0x33, 0x2, 0x0, 0x0, 0, 0x0} + case ASLTI: +- return &inst{0x13, 0x2, 0x0, 0, 0x0} ++ return &inst{0x13, 0x2, 0x0, 0x0, 0, 0x0} + case ASLTIU: +- return &inst{0x13, 0x3, 0x0, 0, 0x0} ++ return &inst{0x13, 0x3, 0x0, 0x0, 0, 0x0} + case ASLTU: +- return &inst{0x33, 0x3, 0x0, 0, 0x0} ++ return &inst{0x33, 0x3, 0x0, 0x0, 0, 0x0} + case ASRA: +- return &inst{0x33, 0x5, 0x0, 1024, 0x20} ++ return 
&inst{0x33, 0x5, 0x0, 0x0, 1024, 0x20} + case ASRAI: +- return &inst{0x13, 0x5, 0x0, 1024, 0x20} ++ return &inst{0x13, 0x5, 0x0, 0x0, 1024, 0x20} + case ASRAIW: +- return &inst{0x1b, 0x5, 0x0, 1024, 0x20} ++ return &inst{0x1b, 0x5, 0x0, 0x0, 1024, 0x20} + case ASRAW: +- return &inst{0x3b, 0x5, 0x0, 1024, 0x20} ++ return &inst{0x3b, 0x5, 0x0, 0x0, 1024, 0x20} + case ASRET: +- return &inst{0x73, 0x0, 0x2, 258, 0x8} ++ return &inst{0x73, 0x0, 0x0, 0x2, 258, 0x8} + case ASRL: +- return &inst{0x33, 0x5, 0x0, 0, 0x0} ++ return &inst{0x33, 0x5, 0x0, 0x0, 0, 0x0} + case ASRLI: +- return &inst{0x13, 0x5, 0x0, 0, 0x0} ++ return &inst{0x13, 0x5, 0x0, 0x0, 0, 0x0} + case ASRLIW: +- return &inst{0x1b, 0x5, 0x0, 0, 0x0} ++ return &inst{0x1b, 0x5, 0x0, 0x0, 0, 0x0} + case ASRLW: +- return &inst{0x3b, 0x5, 0x0, 0, 0x0} ++ return &inst{0x3b, 0x5, 0x0, 0x0, 0, 0x0} + case ASUB: +- return &inst{0x33, 0x0, 0x0, 1024, 0x20} ++ return &inst{0x33, 0x0, 0x0, 0x0, 1024, 0x20} + case ASUBW: +- return &inst{0x3b, 0x0, 0x0, 1024, 0x20} ++ return &inst{0x3b, 0x0, 0x0, 0x0, 1024, 0x20} + case ASW: +- return &inst{0x23, 0x2, 0x0, 0, 0x0} ++ return &inst{0x23, 0x2, 0x0, 0x0, 0, 0x0} ++ case AVAADDVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, 576, 0x12} ++ case AVAADDVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 576, 0x12} ++ case AVAADDUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, 512, 0x10} ++ case AVAADDUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 512, 0x10} ++ case AVADCVIM: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1024, 0x20} ++ case AVADCVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1024, 0x20} ++ case AVADCVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1024, 0x20} ++ case AVADDVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 0, 0x0} ++ case AVADDVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 0, 0x0} ++ case AVADDVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 0, 0x0} ++ case AVANDVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 576, 0x12} ++ case AVANDVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 576, 0x12} ++ case AVANDVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 576, 0x12} ++ case AVASUBVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, 704, 0x16} ++ case AVASUBVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 704, 0x16} ++ case AVASUBUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, 640, 0x14} ++ case AVASUBUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 640, 0x14} ++ case AVCOMPRESSVM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1504, 0x2f} ++ case AVCPOPM: ++ return &inst{0x57, 0x2, 0x10, 0x0, 1024, 0x20} ++ case AVDIVVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1984, 0x42} ++ case AVDIVVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1984, 0x42} ++ case AVDIVUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -2048, 0x40} ++ case AVDIVUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -2048, 0x40} ++ case AVFADDVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 0, 0x0} ++ case AVFADDVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 0, 0x0} ++ case AVFCLASSV: ++ return &inst{0x57, 0x1, 0x10, 0x0, 1216, 0x26} ++ case AVFCVTFXV: ++ return &inst{0x57, 0x1, 0x3, 0x0, 1152, 0x24} ++ case AVFCVTFXUV: ++ return &inst{0x57, 0x1, 0x2, 0x0, 1152, 0x24} ++ case AVFCVTRTZXFV: ++ return &inst{0x57, 0x1, 0x7, 0x0, 1152, 0x24} ++ case AVFCVTRTZXUFV: ++ return &inst{0x57, 0x1, 0x6, 0x0, 1152, 0x24} ++ case AVFCVTXFV: ++ return &inst{0x57, 0x1, 0x1, 0x0, 1152, 0x24} ++ case AVFCVTXUFV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1152, 0x24} ++ case AVFDIVVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -2048, 0x40} ++ case AVFDIVVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -2048, 0x40} ++ case AVFIRSTM: ++ return &inst{0x57, 0x2, 0x11, 0x0, 1024, 0x20} ++ case AVFMACCVF: ++ return &inst{0x57, 0x5, 
0x0, 0x0, -1280, 0x58} ++ case AVFMACCVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1280, 0x58} ++ case AVFMADDVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1536, 0x50} ++ case AVFMADDVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1536, 0x50} ++ case AVFMAXVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 384, 0xc} ++ case AVFMAXVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 384, 0xc} ++ case AVFMERGEVFM: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1472, 0x2e} ++ case AVFMINVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 256, 0x8} ++ case AVFMINVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 256, 0x8} ++ case AVFMSACVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1152, 0x5c} ++ case AVFMSACVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1152, 0x5c} ++ case AVFMSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1408, 0x54} ++ case AVFMSUBVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1408, 0x54} ++ case AVFMULVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1792, 0x48} ++ case AVFMULVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1792, 0x48} ++ case AVFMVFS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1056, 0x21} ++ case AVFMVSF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1056, 0x21} ++ case AVFMVVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1504, 0x2f} ++ case AVFNCVTFFW: ++ return &inst{0x57, 0x1, 0x14, 0x0, 1152, 0x24} ++ case AVFNCVTFXW: ++ return &inst{0x57, 0x1, 0x13, 0x0, 1152, 0x24} ++ case AVFNCVTFXUW: ++ return &inst{0x57, 0x1, 0x12, 0x0, 1152, 0x24} ++ case AVFNCVTRODFFW: ++ return &inst{0x57, 0x1, 0x15, 0x0, 1152, 0x24} ++ case AVFNCVTRTZXFW: ++ return &inst{0x57, 0x1, 0x17, 0x0, 1152, 0x24} ++ case AVFNCVTRTZXUFW: ++ return &inst{0x57, 0x1, 0x16, 0x0, 1152, 0x24} ++ case AVFNCVTXFW: ++ return &inst{0x57, 0x1, 0x11, 0x0, 1152, 0x24} ++ case AVFNCVTXUFW: ++ return &inst{0x57, 0x1, 0x10, 0x0, 1152, 0x24} ++ case AVFNMACCVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1216, 0x5a} ++ case AVFNMACCVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1216, 0x5a} ++ case AVFNMADDVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1472, 0x52} ++ case AVFNMADDVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1472, 0x52} ++ case AVFNMSACVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1088, 0x5e} ++ case AVFNMSACVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1088, 0x5e} ++ case AVFNMSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1344, 0x56} ++ case AVFNMSUBVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1344, 0x56} ++ case AVFRDIVVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1984, 0x42} ++ case AVFREC7V: ++ return &inst{0x57, 0x1, 0x5, 0x0, 1216, 0x26} ++ case AVFREDMAXVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 448, 0xe} ++ case AVFREDMINVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 320, 0xa} ++ case AVFREDOSUMVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 192, 0x6} ++ case AVFREDUSUMVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, 64, 0x2} ++ case AVFRSQRT7V: ++ return &inst{0x57, 0x1, 0x4, 0x0, 1216, 0x26} ++ case AVFRSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1600, 0x4e} ++ case AVFSGNJVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 512, 0x10} ++ case AVFSGNJVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 512, 0x10} ++ case AVFSGNJNVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 576, 0x12} ++ case AVFSGNJNVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 576, 0x12} ++ case AVFSGNJXVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 640, 0x14} ++ case AVFSGNJXVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 640, 0x14} ++ case AVFSLIDE1DOWNVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 960, 0x1e} ++ case AVFSLIDE1UPVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 896, 0x1c} ++ case AVFSQRTV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1216, 0x26} ++ case AVFSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 128, 0x4} ++ case AVFSUBVV: ++ 
return &inst{0x57, 0x1, 0x0, 0x0, 128, 0x4} ++ case AVFWADDVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -1024, 0x60} ++ case AVFWADDVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -1024, 0x60} ++ case AVFWADDWF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -768, 0x68} ++ case AVFWADDWV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -768, 0x68} ++ case AVFWCVTFFV: ++ return &inst{0x57, 0x1, 0xc, 0x0, 1152, 0x24} ++ case AVFWCVTFXV: ++ return &inst{0x57, 0x1, 0xb, 0x0, 1152, 0x24} ++ case AVFWCVTFXUV: ++ return &inst{0x57, 0x1, 0xa, 0x0, 1152, 0x24} ++ case AVFWCVTRTZXFV: ++ return &inst{0x57, 0x1, 0xf, 0x0, 1152, 0x24} ++ case AVFWCVTRTZXUFV: ++ return &inst{0x57, 0x1, 0xe, 0x0, 1152, 0x24} ++ case AVFWCVTXFV: ++ return &inst{0x57, 0x1, 0x9, 0x0, 1152, 0x24} ++ case AVFWCVTXUFV: ++ return &inst{0x57, 0x1, 0x8, 0x0, 1152, 0x24} ++ case AVFWMACCVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -256, 0x78} ++ case AVFWMACCVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -256, 0x78} ++ case AVFWMSACVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -128, 0x7c} ++ case AVFWMSACVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -128, 0x7c} ++ case AVFWMULVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -512, 0x70} ++ case AVFWMULVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -512, 0x70} ++ case AVFWNMACCVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -192, 0x7a} ++ case AVFWNMACCVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -192, 0x7a} ++ case AVFWNMSACVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -64, 0x7e} ++ case AVFWNMSACVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -64, 0x7e} ++ case AVFWREDOSUMVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, -832, 0x66} ++ case AVFWREDUSUMVS: ++ return &inst{0x57, 0x1, 0x0, 0x0, -960, 0x62} ++ case AVFWSUBVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -896, 0x64} ++ case AVFWSUBVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -896, 0x64} ++ case AVFWSUBWF: ++ return &inst{0x57, 0x5, 0x0, 0x0, -640, 0x6c} ++ case AVFWSUBWV: ++ return &inst{0x57, 0x1, 0x0, 0x0, -640, 0x6c} ++ case AVIDV: ++ return &inst{0x57, 0x2, 0x11, 0x0, 1280, 0x28} ++ case AVIOTAM: ++ return &inst{0x57, 0x2, 0x10, 0x0, 1280, 0x28} ++ case AVL1RE16V: ++ return &inst{0x7, 0x5, 0x0, 0x8, 40, 0x1} ++ case AVL1RE32V: ++ return &inst{0x7, 0x6, 0x0, 0x8, 40, 0x1} ++ case AVL1RE64V: ++ return &inst{0x7, 0x7, 0x0, 0x8, 40, 0x1} ++ case AVL1RE8V: ++ return &inst{0x7, 0x0, 0x0, 0x8, 40, 0x1} ++ case AVL2RE16V: ++ return &inst{0x7, 0x5, 0x0, 0x8, 552, 0x11} ++ case AVL2RE32V: ++ return &inst{0x7, 0x6, 0x0, 0x8, 552, 0x11} ++ case AVL2RE64V: ++ return &inst{0x7, 0x7, 0x0, 0x8, 552, 0x11} ++ case AVL2RE8V: ++ return &inst{0x7, 0x0, 0x0, 0x8, 552, 0x11} ++ case AVL4RE16V: ++ return &inst{0x7, 0x5, 0x0, 0x8, 1576, 0x31} ++ case AVL4RE32V: ++ return &inst{0x7, 0x6, 0x0, 0x8, 1576, 0x31} ++ case AVL4RE64V: ++ return &inst{0x7, 0x7, 0x0, 0x8, 1576, 0x31} ++ case AVL4RE8V: ++ return &inst{0x7, 0x0, 0x0, 0x8, 1576, 0x31} ++ case AVL8RE16V: ++ return &inst{0x7, 0x5, 0x0, 0x8, -472, 0x71} ++ case AVL8RE32V: ++ return &inst{0x7, 0x6, 0x0, 0x8, -472, 0x71} ++ case AVL8RE64V: ++ return &inst{0x7, 0x7, 0x0, 0x8, -472, 0x71} ++ case AVL8RE8V: ++ return &inst{0x7, 0x0, 0x0, 0x8, -472, 0x71} ++ case AVLE16V: ++ return &inst{0x7, 0x5, 0x0, 0x0, 0, 0x0} ++ case AVLE16FFV: ++ return &inst{0x7, 0x5, 0x0, 0x10, 16, 0x0} ++ case AVLE32V: ++ return &inst{0x7, 0x6, 0x0, 0x0, 0, 0x0} ++ case AVLE32FFV: ++ return &inst{0x7, 0x6, 0x0, 0x10, 16, 0x0} ++ case AVLE64V: ++ return &inst{0x7, 0x7, 0x0, 0x0, 0, 0x0} ++ case AVLE64FFV: ++ return &inst{0x7, 0x7, 0x0, 0x10, 16, 0x0} ++ case AVLE8V: ++ return &inst{0x7, 0x0, 0x0, 0x0, 0, 0x0} ++ case 
AVLE8FFV: ++ return &inst{0x7, 0x0, 0x0, 0x10, 16, 0x0} ++ case AVLMV: ++ return &inst{0x7, 0x0, 0x0, 0xb, 43, 0x1} ++ case AVLOXEI16V: ++ return &inst{0x7, 0x5, 0x0, 0x0, 192, 0x6} ++ case AVLOXEI32V: ++ return &inst{0x7, 0x6, 0x0, 0x0, 192, 0x6} ++ case AVLOXEI64V: ++ return &inst{0x7, 0x7, 0x0, 0x0, 192, 0x6} ++ case AVLOXEI8V: ++ return &inst{0x7, 0x0, 0x0, 0x0, 192, 0x6} ++ case AVLSE16V: ++ return &inst{0x7, 0x5, 0x0, 0x0, 128, 0x4} ++ case AVLSE32V: ++ return &inst{0x7, 0x6, 0x0, 0x0, 128, 0x4} ++ case AVLSE64V: ++ return &inst{0x7, 0x7, 0x0, 0x0, 128, 0x4} ++ case AVLSE8V: ++ return &inst{0x7, 0x0, 0x0, 0x0, 128, 0x4} ++ case AVLUXEI16V: ++ return &inst{0x7, 0x5, 0x0, 0x0, 64, 0x2} ++ case AVLUXEI32V: ++ return &inst{0x7, 0x6, 0x0, 0x0, 64, 0x2} ++ case AVLUXEI64V: ++ return &inst{0x7, 0x7, 0x0, 0x0, 64, 0x2} ++ case AVLUXEI8V: ++ return &inst{0x7, 0x0, 0x0, 0x0, 64, 0x2} ++ case AVMACCVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1216, 0x5a} ++ case AVMACCVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1216, 0x5a} ++ case AVMADCVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1120, 0x23} ++ case AVMADCVIM: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1088, 0x22} ++ case AVMADCVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1120, 0x23} ++ case AVMADCVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1088, 0x22} ++ case AVMADCVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1120, 0x23} ++ case AVMADCVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1088, 0x22} ++ case AVMADDVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1472, 0x52} ++ case AVMADDVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1472, 0x52} ++ case AVMANDMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1632, 0x33} ++ case AVMANDNMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1568, 0x31} ++ case AVMAXVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 448, 0xe} ++ case AVMAXVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 448, 0xe} ++ case AVMAXUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 384, 0xc} ++ case AVMAXUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 384, 0xc} ++ case AVMERGEVIM: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1472, 0x2e} ++ case AVMERGEVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1472, 0x2e} ++ case AVMERGEVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1472, 0x2e} ++ case AVMFEQVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1536, 0x30} ++ case AVMFEQVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1536, 0x30} ++ case AVMFGEVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1984, 0x3e} ++ case AVMFGTVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1856, 0x3a} ++ case AVMFLEVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1600, 0x32} ++ case AVMFLEVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1600, 0x32} ++ case AVMFLTVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1728, 0x36} ++ case AVMFLTVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1728, 0x36} ++ case AVMFNEVF: ++ return &inst{0x57, 0x5, 0x0, 0x0, 1792, 0x38} ++ case AVMFNEVV: ++ return &inst{0x57, 0x1, 0x0, 0x0, 1792, 0x38} ++ case AVMINVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 320, 0xa} ++ case AVMINVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 320, 0xa} ++ case AVMINUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 256, 0x8} ++ case AVMINUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 256, 0x8} ++ case AVMNANDMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1888, 0x3b} ++ case AVMNORMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1952, 0x3d} ++ case AVMORMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1696, 0x35} ++ case AVMORNMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1824, 0x39} ++ case AVMSBCVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1248, 0x27} ++ case AVMSBCVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1216, 0x26} ++ case AVMSBCVX: ++ return &inst{0x57, 0x4, 0x0, 
0x0, 1248, 0x27} ++ case AVMSBCVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1216, 0x26} ++ case AVMSBFM: ++ return &inst{0x57, 0x2, 0x1, 0x0, 1280, 0x28} ++ case AVMSEQVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1536, 0x30} ++ case AVMSEQVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1536, 0x30} ++ case AVMSEQVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1536, 0x30} ++ case AVMSGTVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1984, 0x3e} ++ case AVMSGTVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1984, 0x3e} ++ case AVMSGTUVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1920, 0x3c} ++ case AVMSGTUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1920, 0x3c} ++ case AVMSIFM: ++ return &inst{0x57, 0x2, 0x3, 0x0, 1280, 0x28} ++ case AVMSLEVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1856, 0x3a} ++ case AVMSLEVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1856, 0x3a} ++ case AVMSLEVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1856, 0x3a} ++ case AVMSLEUVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1792, 0x38} ++ case AVMSLEUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1792, 0x38} ++ case AVMSLEUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1792, 0x38} ++ case AVMSLTVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1728, 0x36} ++ case AVMSLTVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1728, 0x36} ++ case AVMSLTUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1664, 0x34} ++ case AVMSLTUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1664, 0x34} ++ case AVMSNEVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1600, 0x32} ++ case AVMSNEVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1600, 0x32} ++ case AVMSNEVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1600, 0x32} ++ case AVMSOFM: ++ return &inst{0x57, 0x2, 0x2, 0x0, 1280, 0x28} ++ case AVMULVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1728, 0x4a} ++ case AVMULVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1728, 0x4a} ++ case AVMULHVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1600, 0x4e} ++ case AVMULHVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1600, 0x4e} ++ case AVMULHSUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1664, 0x4c} ++ case AVMULHSUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1664, 0x4c} ++ case AVMULHUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1792, 0x48} ++ case AVMULHUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1792, 0x48} ++ case AVMV1RV: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1568, 0x4f} ++ case AVMV2RV: ++ return &inst{0x57, 0x3, 0x1, 0x0, -1568, 0x4f} ++ case AVMV4RV: ++ return &inst{0x57, 0x3, 0x3, 0x0, -1568, 0x4f} ++ case AVMV8RV: ++ return &inst{0x57, 0x3, 0x7, 0x0, -1568, 0x4f} ++ case AVMVSX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 1056, 0x21} ++ case AVMVVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 1504, 0x2f} ++ case AVMVVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1504, 0x2f} ++ case AVMVVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1504, 0x2f} ++ case AVMVXS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1056, 0x21} ++ case AVMXNORMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 2016, 0x3f} ++ case AVMXORMM: ++ return &inst{0x57, 0x2, 0x0, 0x0, 1760, 0x37} ++ case AVNCLIPWI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1088, 0x5e} ++ case AVNCLIPWV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1088, 0x5e} ++ case AVNCLIPWX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1088, 0x5e} ++ case AVNCLIPUWI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1152, 0x5c} ++ case AVNCLIPUWV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1152, 0x5c} ++ case AVNCLIPUWX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1152, 0x5c} ++ case AVNMSACVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1088, 0x5e} ++ case AVNMSACVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1088, 0x5e} ++ case AVNMSUBVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1344, 0x56} ++ case AVNMSUBVX: ++ 
return &inst{0x57, 0x6, 0x0, 0x0, -1344, 0x56} ++ case AVNSRAWI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1216, 0x5a} ++ case AVNSRAWV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1216, 0x5a} ++ case AVNSRAWX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1216, 0x5a} ++ case AVNSRLWI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1280, 0x58} ++ case AVNSRLWV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1280, 0x58} ++ case AVNSRLWX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1280, 0x58} ++ case AVORVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 640, 0x14} ++ case AVORVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 640, 0x14} ++ case AVORVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 640, 0x14} ++ case AVREDANDVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 64, 0x2} ++ case AVREDMAXVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 448, 0xe} ++ case AVREDMAXUVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 384, 0xc} ++ case AVREDMINVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 320, 0xa} ++ case AVREDMINUVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 256, 0x8} ++ case AVREDORVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 128, 0x4} ++ case AVREDSUMVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 0, 0x0} ++ case AVREDXORVS: ++ return &inst{0x57, 0x2, 0x0, 0x0, 192, 0x6} ++ case AVREMVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1856, 0x46} ++ case AVREMVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1856, 0x46} ++ case AVREMUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1920, 0x44} ++ case AVREMUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1920, 0x44} ++ case AVRGATHERVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 768, 0x18} ++ case AVRGATHERVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 768, 0x18} ++ case AVRGATHERVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 768, 0x18} ++ case AVRGATHEREI16VV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 896, 0x1c} ++ case AVRSUBVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 192, 0x6} ++ case AVRSUBVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 192, 0x6} ++ case AVS1RV: ++ return &inst{0x27, 0x0, 0x0, 0x8, 40, 0x1} ++ case AVS2RV: ++ return &inst{0x27, 0x0, 0x0, 0x8, 552, 0x11} ++ case AVS4RV: ++ return &inst{0x27, 0x0, 0x0, 0x8, 1576, 0x31} ++ case AVS8RV: ++ return &inst{0x27, 0x0, 0x0, 0x8, -472, 0x71} ++ case AVSADDVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1984, 0x42} ++ case AVSADDVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1984, 0x42} ++ case AVSADDVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1984, 0x42} ++ case AVSADDUVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -2048, 0x40} ++ case AVSADDUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -2048, 0x40} ++ case AVSADDUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -2048, 0x40} ++ case AVSBCVVM: ++ return &inst{0x57, 0x0, 0x0, 0x0, 1152, 0x24} ++ case AVSBCVXM: ++ return &inst{0x57, 0x4, 0x0, 0x0, 1152, 0x24} ++ case AVSE16V: ++ return &inst{0x27, 0x5, 0x0, 0x0, 0, 0x0} ++ case AVSE32V: ++ return &inst{0x27, 0x6, 0x0, 0x0, 0, 0x0} ++ case AVSE64V: ++ return &inst{0x27, 0x7, 0x0, 0x0, 0, 0x0} ++ case AVSE8V: ++ return &inst{0x27, 0x0, 0x0, 0x0, 0, 0x0} ++ case AVSETIVLI: ++ return &inst{0x57, 0x7, 0x0, 0x0, -1024, 0x60} ++ case AVSETVL: ++ return &inst{0x57, 0x7, 0x0, 0x0, -2048, 0x40} ++ case AVSETVLI: ++ return &inst{0x57, 0x7, 0x0, 0x0, 0, 0x0} ++ case AVSEXTVF2: ++ return &inst{0x57, 0x2, 0x7, 0x0, 1152, 0x24} ++ case AVSEXTVF4: ++ return &inst{0x57, 0x2, 0x5, 0x0, 1152, 0x24} ++ case AVSEXTVF8: ++ return &inst{0x57, 0x2, 0x3, 0x0, 1152, 0x24} ++ case AVSLIDE1DOWNVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 960, 0x1e} ++ case AVSLIDE1UPVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, 896, 0x1c} ++ case AVSLIDEDOWNVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 960, 0x1e} ++ case AVSLIDEDOWNVX: ++ 
return &inst{0x57, 0x4, 0x0, 0x0, 960, 0x1e} ++ case AVSLIDEUPVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 896, 0x1c} ++ case AVSLIDEUPVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 896, 0x1c} ++ case AVSLLVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1728, 0x4a} ++ case AVSLLVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1728, 0x4a} ++ case AVSLLVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1728, 0x4a} ++ case AVSMV: ++ return &inst{0x27, 0x0, 0x0, 0xb, 43, 0x1} ++ case AVSMULVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1600, 0x4e} ++ case AVSMULVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1600, 0x4e} ++ case AVSOXEI16V: ++ return &inst{0x27, 0x5, 0x0, 0x0, 192, 0x6} ++ case AVSOXEI32V: ++ return &inst{0x27, 0x6, 0x0, 0x0, 192, 0x6} ++ case AVSOXEI64V: ++ return &inst{0x27, 0x7, 0x0, 0x0, 192, 0x6} ++ case AVSOXEI8V: ++ return &inst{0x27, 0x0, 0x0, 0x0, 192, 0x6} ++ case AVSRAVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1472, 0x52} ++ case AVSRAVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1472, 0x52} ++ case AVSRAVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1472, 0x52} ++ case AVSRLVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1536, 0x50} ++ case AVSRLVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1536, 0x50} ++ case AVSRLVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1536, 0x50} ++ case AVSSE16V: ++ return &inst{0x27, 0x5, 0x0, 0x0, 128, 0x4} ++ case AVSSE32V: ++ return &inst{0x27, 0x6, 0x0, 0x0, 128, 0x4} ++ case AVSSE64V: ++ return &inst{0x27, 0x7, 0x0, 0x0, 128, 0x4} ++ case AVSSE8V: ++ return &inst{0x27, 0x0, 0x0, 0x0, 128, 0x4} ++ case AVSSRAVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1344, 0x56} ++ case AVSSRAVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1344, 0x56} ++ case AVSSRAVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1344, 0x56} ++ case AVSSRLVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, -1408, 0x54} ++ case AVSSRLVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1408, 0x54} ++ case AVSSRLVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1408, 0x54} ++ case AVSSUBVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1856, 0x46} ++ case AVSSUBVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1856, 0x46} ++ case AVSSUBUVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1920, 0x44} ++ case AVSSUBUVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, -1920, 0x44} ++ case AVSUBVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 128, 0x4} ++ case AVSUBVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 128, 0x4} ++ case AVSUXEI16V: ++ return &inst{0x27, 0x5, 0x0, 0x0, 64, 0x2} ++ case AVSUXEI32V: ++ return &inst{0x27, 0x6, 0x0, 0x0, 64, 0x2} ++ case AVSUXEI64V: ++ return &inst{0x27, 0x7, 0x0, 0x0, 64, 0x2} ++ case AVSUXEI8V: ++ return &inst{0x27, 0x0, 0x0, 0x0, 64, 0x2} ++ case AVWADDVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -960, 0x62} ++ case AVWADDVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -960, 0x62} ++ case AVWADDWV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -704, 0x6a} ++ case AVWADDWX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -704, 0x6a} ++ case AVWADDUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -1024, 0x60} ++ case AVWADDUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -1024, 0x60} ++ case AVWADDUWV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -768, 0x68} ++ case AVWADDUWX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -768, 0x68} ++ case AVWMACCVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -192, 0x7a} ++ case AVWMACCVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -192, 0x7a} ++ case AVWMACCSUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -64, 0x7e} ++ case AVWMACCSUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -64, 0x7e} ++ case AVWMACCUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -256, 0x78} ++ case AVWMACCUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -256, 0x78} ++ case 
AVWMACCUSVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -128, 0x7c} ++ case AVWMULVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -320, 0x76} ++ case AVWMULVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -320, 0x76} ++ case AVWMULSUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -384, 0x74} ++ case AVWMULSUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -384, 0x74} ++ case AVWMULUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -512, 0x70} ++ case AVWMULUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -512, 0x70} ++ case AVWREDSUMVS: ++ return &inst{0x57, 0x0, 0x0, 0x0, -960, 0x62} ++ case AVWREDSUMUVS: ++ return &inst{0x57, 0x0, 0x0, 0x0, -1024, 0x60} ++ case AVWSUBVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -832, 0x66} ++ case AVWSUBVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -832, 0x66} ++ case AVWSUBWV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -576, 0x6e} ++ case AVWSUBWX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -576, 0x6e} ++ case AVWSUBUVV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -896, 0x64} ++ case AVWSUBUVX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -896, 0x64} ++ case AVWSUBUWV: ++ return &inst{0x57, 0x2, 0x0, 0x0, -640, 0x6c} ++ case AVWSUBUWX: ++ return &inst{0x57, 0x6, 0x0, 0x0, -640, 0x6c} ++ case AVXORVI: ++ return &inst{0x57, 0x3, 0x0, 0x0, 704, 0x16} ++ case AVXORVV: ++ return &inst{0x57, 0x0, 0x0, 0x0, 704, 0x16} ++ case AVXORVX: ++ return &inst{0x57, 0x4, 0x0, 0x0, 704, 0x16} ++ case AVZEXTVF2: ++ return &inst{0x57, 0x2, 0x6, 0x0, 1152, 0x24} ++ case AVZEXTVF4: ++ return &inst{0x57, 0x2, 0x4, 0x0, 1152, 0x24} ++ case AVZEXTVF8: ++ return &inst{0x57, 0x2, 0x2, 0x0, 1152, 0x24} + case AWFI: +- return &inst{0x73, 0x0, 0x5, 261, 0x8} ++ return &inst{0x73, 0x0, 0x0, 0x5, 261, 0x8} + case AXNOR: +- return &inst{0x33, 0x4, 0x0, 1024, 0x20} ++ return &inst{0x33, 0x4, 0x0, 0x0, 1024, 0x20} + case AXOR: +- return &inst{0x33, 0x4, 0x0, 0, 0x0} ++ return &inst{0x33, 0x4, 0x0, 0x0, 0, 0x0} + case AXORI: +- return &inst{0x13, 0x4, 0x0, 0, 0x0} ++ return &inst{0x13, 0x4, 0x0, 0x0, 0, 0x0} + case AZEXTH: +- return &inst{0x3b, 0x4, 0x0, 128, 0x4} ++ return &inst{0x3b, 0x4, 0x0, 0x0, 128, 0x4} + } + return nil + } +-- +2.39.5 + diff --git a/2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch b/2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch new file mode 100644 index 0000000..f767f21 --- /dev/null +++ b/2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch @@ -0,0 +1,139 @@ +From 39d4bbc0b357cd81948a05ed60f513665d0bc1d0 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 057/119] cmd/internal/obj,cmd/asm: add vector registers to + riscv64 assembler +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This adds V0 through V31 as vector registers, which are available on CPUs +that support the V extension. 
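
An illustrative aside, not part of either upstream patch: each inst row in the regenerated table above stores only an instruction's fixed fields, and the new rs1 column is used by rows that place a function code in the rs1/vs1 bit positions rather than a register operand (for example AVFCVTXFV carries 0x1 there and AVZEXTVF2 carries 0x6), which is presumably why the generated struct gained that field. Register operands are OR-ed in separately when the instruction word is built. The sketch below shows that packing for a plain R-type instruction; the helper name packRType and the standalone program are assumptions made for this example, not the assembler's actual code path.

package main

import "fmt"

// inst mirrors the generated table above: only the fixed fields of an
// instruction are stored; register operands are merged in separately.
type inst struct {
	opcode uint32
	funct3 uint32
	rs1    uint32
	rs2    uint32
	csr    int64
	funct7 uint32
}

// packRType merges register operands into the fixed fields of an R-type
// instruction word: funct7 | rs2 | rs1 | funct3 | rd | opcode.
func packRType(i *inst, rd, rs1, rs2 uint32) uint32 {
	return i.funct7<<25 | rs2<<20 | rs1<<15 | i.funct3<<12 | rd<<7 | i.opcode
}

func main() {
	// The AADD row from the table above: {0x33, 0x0, 0x0, 0x0, 0, 0x0}.
	add := &inst{opcode: 0x33}
	// Go asm ADD X5, X6, X7 (X7 = X6 + X5): rd=7, rs1=6, rs2=5.
	fmt.Printf("%08x\n", packRType(add, 7, 6, 5)) // 005303b3
}
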
+
+Change-Id: Ibffee3f9a2cf1d062638715b3744431d72d451ce
+Reviewed-on: https://go-review.googlesource.com/c/go/+/595404
+LUCI-TryBot-Result: Go LUCI
+Reviewed-by: Cherry Mui
+Reviewed-by: Mark Ryan
+Reviewed-by: Michael Pratt
+Reviewed-by: 鹏程汪
+---
+ src/cmd/asm/internal/arch/arch.go  |  4 ++++
+ src/cmd/internal/obj/riscv/cpu.go  | 36 +++++++++++++++++++++++++++++-
+ src/cmd/internal/obj/riscv/list.go |  2 ++
+ src/cmd/internal/obj/riscv/obj.go  | 10 +++++++++
+ 4 files changed, 51 insertions(+), 1 deletion(-)
+
+diff --git a/src/cmd/asm/internal/arch/arch.go b/src/cmd/asm/internal/arch/arch.go
+index 11bb7af899..429dff7be5 100644
+--- a/src/cmd/asm/internal/arch/arch.go
++++ b/src/cmd/asm/internal/arch/arch.go
+@@ -586,6 +586,10 @@ func archRISCV64(shared bool) *Arch {
+ 		name := fmt.Sprintf("F%d", i-riscv.REG_F0)
+ 		register[name] = int16(i)
+ 	}
++	for i := riscv.REG_V0; i <= riscv.REG_V31; i++ {
++		name := fmt.Sprintf("V%d", i-riscv.REG_V0)
++		register[name] = int16(i)
++	}
+ 
+ 	// General registers with ABI names.
+ 	register["ZERO"] = riscv.REG_ZERO
+diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go
+index 8b620b8646..d2154593de 100644
+--- a/src/cmd/internal/obj/riscv/cpu.go
++++ b/src/cmd/internal/obj/riscv/cpu.go
+@@ -72,7 +72,7 @@ const (
+ 	REG_X30
+ 	REG_X31
+ 
+-	// FP register numberings.
++	// Floating Point register numberings.
+ 	REG_F0
+ 	REG_F1
+ 	REG_F2
+@@ -106,6 +106,40 @@ const (
+ 	REG_F30
+ 	REG_F31
+ 
++	// Vector register numberings.
++	REG_V0
++	REG_V1
++	REG_V2
++	REG_V3
++	REG_V4
++	REG_V5
++	REG_V6
++	REG_V7
++	REG_V8
++	REG_V9
++	REG_V10
++	REG_V11
++	REG_V12
++	REG_V13
++	REG_V14
++	REG_V15
++	REG_V16
++	REG_V17
++	REG_V18
++	REG_V19
++	REG_V20
++	REG_V21
++	REG_V22
++	REG_V23
++	REG_V24
++	REG_V25
++	REG_V26
++	REG_V27
++	REG_V28
++	REG_V29
++	REG_V30
++	REG_V31
++
+ 	// This marks the end of the register numbering.
+ 	REG_END
+ 
+diff --git a/src/cmd/internal/obj/riscv/list.go b/src/cmd/internal/obj/riscv/list.go
+index bc87539f27..c5b7e80719 100644
+--- a/src/cmd/internal/obj/riscv/list.go
++++ b/src/cmd/internal/obj/riscv/list.go
+@@ -28,6 +28,8 @@ func RegName(r int) string {
+ 		return fmt.Sprintf("X%d", r-REG_X0)
+ 	case REG_F0 <= r && r <= REG_F31:
+ 		return fmt.Sprintf("F%d", r-REG_F0)
++	case REG_V0 <= r && r <= REG_V31:
++		return fmt.Sprintf("V%d", r-REG_V0)
+ 	default:
+ 		return fmt.Sprintf("Rgok(%d)", r-obj.RBaseRISCV)
+ 	}
+diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go
+index 6e9691bb4f..6f74f38543 100644
+--- a/src/cmd/internal/obj/riscv/obj.go
++++ b/src/cmd/internal/obj/riscv/obj.go
+@@ -1029,6 +1029,11 @@ func regF(r uint32) uint32 {
+ 	return regVal(r, REG_F0, REG_F31)
+ }
+ 
++// regV returns a vector register.
++func regV(r uint32) uint32 {
++	return regVal(r, REG_V0, REG_V31)
++}
++
+ // regAddr extracts a register from an Addr.
+ func regAddr(a obj.Addr, min, max uint32) uint32 {
+ 	if a.Type != obj.TYPE_REG {
+@@ -1111,6 +1116,11 @@ func wantFloatReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
+ 	wantReg(ctxt, ins, pos, "float", r, REG_F0, REG_F31)
+ }
+ 
++// wantVectorReg checks that r is a vector register.
++func wantVectorReg(ctxt *obj.Link, ins *instruction, pos string, r uint32) {
++	wantReg(ctxt, ins, pos, "vector", r, REG_V0, REG_V31)
++}
++
+ // wantEvenOffset checks that the offset is a multiple of two.
+ func wantEvenOffset(ctxt *obj.Link, ins *instruction, offset int64) { + if err := immEven(offset); err != nil { +-- +2.39.5 + diff --git a/2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch b/2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch new file mode 100644 index 0000000..8babafc --- /dev/null +++ b/2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch @@ -0,0 +1,578 @@ +From 14b23a9bebcd9c6d482db37e76bff8dcde86c7ee Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 058/119] cmd/internal/obj/riscv: update references to RISC-V + specification + +Update references to version 20240411 of the RISC-V specifications. +Reorder and regroup instructions to maintain ordering. + +Change-Id: Iea2a5d22ad677e04948e9a9325986ad301c03f35 +Reviewed-on: https://go-review.googlesource.com/c/go/+/616115 +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 55 ++++---- + src/cmd/internal/obj/riscv/anames.go | 42 +++--- + src/cmd/internal/obj/riscv/cpu.go | 134 ++++++++++---------- + 3 files changed, 119 insertions(+), 112 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 53b7b92faa..517930aa60 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -6,7 +6,9 @@ + + TEXT asmtest(SB),DUPOK|NOSPLIT,$0 + start: ++ // + // Unprivileged ISA ++ // + + // 2.4: Integer Computational Instructions + +@@ -139,7 +141,7 @@ start: + // 2.7: Memory Ordering Instructions + FENCE // 0f00f00f + +- // 5.2: Integer Computational Instructions (RV64I) ++ // 4.2: Integer Computational Instructions (RV64I) + ADDIW $1, X5, X6 // 1b831200 + SLLIW $1, X5, X6 // 1b931200 + SRLIW $1, X5, X6 // 1bd31200 +@@ -164,18 +166,25 @@ start: + SUBW $1, X6 // 1b03f3ff + SRAW $1, X6 // 1b531340 + +- // 5.3: Load and Store Instructions (RV64I) ++ // 4.3: Load and Store Instructions (RV64I) + LD (X5), X6 // 03b30200 + LD 4(X5), X6 // 03b34200 + SD X5, (X6) // 23305300 + SD X5, 4(X6) // 23325300 + +- // 7.1: Multiplication Operations ++ // 8.1: Base Counters and Timers (Zicntr) ++ RDCYCLE X5 // f32200c0 ++ RDTIME X5 // f32210c0 ++ RDINSTRET X5 // f32220c0 ++ ++ // 13.1: Multiplication Operations + MUL X5, X6, X7 // b3035302 + MULH X5, X6, X7 // b3135302 + MULHU X5, X6, X7 // b3335302 + MULHSU X5, X6, X7 // b3235302 + MULW X5, X6, X7 // bb035302 ++ ++ // 13.2: Division Operations + DIV X5, X6, X7 // b3435302 + DIVU X5, X6, X7 // b3535302 + REM X5, X6, X7 // b3635302 +@@ -185,13 +194,13 @@ start: + REMW X5, X6, X7 // bb635302 + REMUW X5, X6, X7 // bb735302 + +- // 8.2: Load-Reserved/Store-Conditional ++ // 14.2: Load-Reserved/Store-Conditional (Zalrsc) + LRW (X5), X6 // 2fa30214 + LRD (X5), X6 // 2fb30214 + SCW X5, (X6), X7 // af23531a + SCD X5, (X6), X7 // af33531a + +- // 8.3: Atomic Memory Operations ++ // 14.4: Atomic Memory Operations (Zaamo) + AMOSWAPW X5, (X6), X7 // af23530e + AMOSWAPD X5, (X6), X7 // af33530e + AMOADDW X5, (X6), X7 // af235306 +@@ -211,18 +220,13 @@ start: + AMOMINUW X5, (X6), X7 // af2353c6 + AMOMINUD X5, (X6), X7 // af3353c6 + +- // 10.1: Base Counters and Timers +- RDCYCLE X5 // f32200c0 +- RDTIME X5 // f32210c0 +- RDINSTRET X5 // f32220c0 +- +- // 11.5: Single-Precision Load and Store Instructions ++ // 20.5: Single-Precision Load and Store Instructions + FLW (X5), F0 // 07a00200 + 
FLW 4(X5), F0 // 07a04200 + FSW F0, (X5) // 27a00200 + FSW F0, 4(X5) // 27a20200 + +- // 11.6: Single-Precision Floating-Point Computational Instructions ++ // 20.6: Single-Precision Floating-Point Computational Instructions + FADDS F1, F0, F2 // 53011000 + FSUBS F1, F0, F2 // 53011008 + FMULS F1, F0, F2 // 53011010 +@@ -231,7 +235,7 @@ start: + FMAXS F1, F0, F2 // 53111028 + FSQRTS F0, F1 // d3000058 + +- // 11.7: Single-Precision Floating-Point Conversion and Move Instructions ++ // 20.7: Single-Precision Floating-Point Conversion and Move Instructions + FCVTWS F0, X5 // d31200c0 + FCVTWS.RNE F0, X5 // d30200c0 + FCVTWS.RTZ F0, X5 // d31200c0 +@@ -272,21 +276,21 @@ start: + FNMSUBS F1, F2, F3, F4 // 4b822018 + FNMADDS F1, F2, F3, F4 // 4f822018 + +- // 11.8: Single-Precision Floating-Point Compare Instructions ++ // 20.8: Single-Precision Floating-Point Compare Instructions + FEQS F0, F1, X7 // d3a300a0 + FLTS F0, F1, X7 // d39300a0 + FLES F0, F1, X7 // d38300a0 + +- // 11.9: Single-Precision Floating-Point Classify Instruction ++ // 20.9: Single-Precision Floating-Point Classify Instruction + FCLASSS F0, X5 // d31200e0 + +- // 12.3: Double-Precision Load and Store Instructions ++ // 21.3: Double-Precision Load and Store Instructions + FLD (X5), F0 // 07b00200 + FLD 4(X5), F0 // 07b04200 + FSD F0, (X5) // 27b00200 + FSD F0, 4(X5) // 27b20200 + +- // 12.4: Double-Precision Floating-Point Computational Instructions ++ // 21.4: Double-Precision Floating-Point Computational Instructions + FADDD F1, F0, F2 // 53011002 + FSUBD F1, F0, F2 // 5301100a + FMULD F1, F0, F2 // 53011012 +@@ -295,7 +299,7 @@ start: + FMAXD F1, F0, F2 // 5311102a + FSQRTD F0, F1 // d300005a + +- // 12.5: Double-Precision Floating-Point Conversion and Move Instructions ++ // 21.5: Double-Precision Floating-Point Conversion and Move Instructions + FCVTWD F0, X5 // d31200c2 + FCVTWD.RNE F0, X5 // d30200c2 + FCVTWD.RTZ F0, X5 // d31200c2 +@@ -336,11 +340,10 @@ start: + FNMSUBD F1, F2, F3, F4 // 4b82201a + FNMADDD F1, F2, F3, F4 // 4f82201a + +- // 12.6: Double-Precision Floating-Point Classify Instruction ++ // 21.7: Double-Precision Floating-Point Classify Instruction + FCLASSD F0, X5 // d31200e2 + +- // RISC-V Bit-Manipulation ISA-extensions (1.0) +- // 1.1: Address Generation Instructions (Zba) ++ // 28.4.1: Address Generation Instructions (Zba) + ADDUW X10, X11, X12 // 3b86a508 + ADDUW X10, X11 // bb85a508 + SH1ADD X11, X12, X13 // b326b620 +@@ -360,7 +363,7 @@ start: + SLLIUW $63, X17, X18 // 1b99f80b + SLLIUW $1, X18, X19 // 9b191908 + +- // 1.2: Basic Bit Manipulation (Zbb) ++ // 28.4.2: Basic Bit Manipulation (Zbb) + ANDN X19, X20, X21 // b37a3a41 or 93caf9ffb37a5a01 + ANDN X19, X20 // 337a3a41 or 93cff9ff337afa01 + CLZ X20, X21 // 931a0a60 +@@ -385,7 +388,7 @@ start: + XNOR X18, X19 // b3c92941 or b3c9290193c9f9ff + ZEXTH X19, X20 // 3bca0908 + +- // 1.3: Bitwise Rotation (Zbb) ++ // 28.4.2: Bitwise Rotation (Zbb) + ROL X8, X9, X10 // 33958460 or b30f8040b3dff4013395840033e5af00 + ROL X8, X9 // b3948460 or b30f8040b3dff401b3948400b3e49f00 + ROLW X9, X10, X11 // bb159560 or b30f9040bb5ff501bb159500b3e5bf00 +@@ -403,7 +406,7 @@ start: + ORCB X5, X6 // 13d37228 + REV8 X7, X8 // 13d4836b + +- // 1.5: Single-bit Instructions (Zbs) ++ // 28.4.4: Single-bit Instructions (Zbs) + BCLR X23, X24, X25 // b31c7c49 + BCLR $63, X24 // 131cfc4b + BCLRI $1, X25, X26 // 139d1c48 +@@ -417,9 +420,11 @@ start: + BSET $63, X9 // 9394f42b + BSETI $1, X10, X11 // 93151528 + ++ // + // Privileged ISA ++ // + +- // 3.2.1: Environment Call 
and Breakpoint ++ // 3.3.1: Environment Call and Breakpoint + ECALL // 73000000 + SCALL // 73000000 + EBREAK // 73001000 +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index 53cf1c95dc..dbdce22687 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -57,6 +57,18 @@ var Anames = []string{ + "SRAW", + "LD", + "SD", ++ "CSRRW", ++ "CSRRS", ++ "CSRRC", ++ "CSRRWI", ++ "CSRRSI", ++ "CSRRCI", ++ "RDCYCLE", ++ "RDCYCLEH", ++ "RDTIME", ++ "RDTIMEH", ++ "RDINSTRET", ++ "RDINSTRETH", + "MUL", + "MULH", + "MULHU", +@@ -92,12 +104,6 @@ var Anames = []string{ + "AMOMAXUW", + "AMOMINW", + "AMOMINUW", +- "RDCYCLE", +- "RDCYCLEH", +- "RDTIME", +- "RDTIMEH", +- "RDINSTRET", +- "RDINSTRETH", + "FRCSR", + "FSCSR", + "FRRM", +@@ -202,21 +208,6 @@ var Anames = []string{ + "FLEQ", + "FLTQ", + "FCLASSQ", +- "CSRRW", +- "CSRRS", +- "CSRRC", +- "CSRRWI", +- "CSRRSI", +- "CSRRCI", +- "ECALL", +- "SCALL", +- "EBREAK", +- "SBREAK", +- "MRET", +- "SRET", +- "DRET", +- "WFI", +- "SFENCEVMA", + "ADDUW", + "SH1ADD", + "SH1ADDUW", +@@ -632,6 +623,15 @@ var Anames = []string{ + "VMV2RV", + "VMV4RV", + "VMV8RV", ++ "ECALL", ++ "SCALL", ++ "EBREAK", ++ "SBREAK", ++ "MRET", ++ "SRET", ++ "DRET", ++ "WFI", ++ "SFENCEVMA", + "WORD", + "BEQZ", + "BGEZ", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index d2154593de..1c3a13f6c3 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -319,11 +319,13 @@ const ( + // + // As well as some pseudo-mnemonics (e.g. MOV) used only in the assembler. + // +-// See also "The RISC-V Instruction Set Manual" at https://riscv.org/specifications/. ++// See also "The RISC-V Instruction Set Manual" at https://riscv.org/technical/specifications/. + // + // If you modify this table, you MUST run 'go generate' to regenerate anames.go! 
+ const ( +- // Unprivileged ISA (Document Version 20190608-Base-Ratified) ++ // ++ // Unprivileged ISA (version 20240411) ++ // + + // 2.4: Integer Computational Instructions + AADDI = obj.ABaseRISCV + obj.A_ARCHSPECIFIC + iota +@@ -374,7 +376,7 @@ const ( + AFENCETSO + APAUSE + +- // 5.2: Integer Computational Instructions (RV64I) ++ // 4.2: Integer Computational Instructions (RV64I) + AADDIW + ASLLIW + ASRLIW +@@ -385,16 +387,34 @@ const ( + ASUBW + ASRAW + +- // 5.3: Load and Store Instructions (RV64I) ++ // 4.3: Load and Store Instructions (RV64I) + ALD + ASD + +- // 7.1: Multiplication Operations ++ // 7.1: CSR Instructions (Zicsr) ++ ACSRRW ++ ACSRRS ++ ACSRRC ++ ACSRRWI ++ ACSRRSI ++ ACSRRCI ++ ++ // 8.1: Base Counters and Timers (Zicntr) ++ ARDCYCLE ++ ARDCYCLEH ++ ARDTIME ++ ARDTIMEH ++ ARDINSTRET ++ ARDINSTRETH ++ ++ // 13.1: Multiplication Operations + AMUL + AMULH + AMULHU + AMULHSU + AMULW ++ ++ // 13.2: Division Operations + ADIV + ADIVU + AREM +@@ -404,13 +424,13 @@ const ( + AREMW + AREMUW + +- // 8.2: Load-Reserved/Store-Conditional Instructions ++ // 14.2: Load-Reserved/Store-Conditional Instructions (Zalrsc) + ALRD + ASCD + ALRW + ASCW + +- // 8.3: Atomic Memory Operations ++ // 14.4: Atomic Memory Operations (Zaamo) + AAMOSWAPD + AAMOADDD + AAMOANDD +@@ -430,15 +450,7 @@ const ( + AAMOMINW + AAMOMINUW + +- // 10.1: Base Counters and Timers +- ARDCYCLE +- ARDCYCLEH +- ARDTIME +- ARDTIMEH +- ARDINSTRET +- ARDINSTRETH +- +- // 11.2: Floating-Point Control and Status Register ++ // 20.2: Floating-Point Control and Status Register + AFRCSR + AFSCSR + AFRRM +@@ -448,11 +460,11 @@ const ( + AFSRMI + AFSFLAGSI + +- // 11.5: Single-Precision Load and Store Instructions ++ // 20.5: Single-Precision Load and Store Instructions + AFLW + AFSW + +- // 11.6: Single-Precision Floating-Point Computational Instructions ++ // 20.6: Single-Precision Floating-Point Computational Instructions + AFADDS + AFSUBS + AFMULS +@@ -465,7 +477,7 @@ const ( + AFNMADDS + AFNMSUBS + +- // 11.7: Single-Precision Floating-Point Conversion and Move Instructions ++ // 20.7: Single-Precision Floating-Point Conversion and Move Instructions + AFCVTWS + AFCVTLS + AFCVTSW +@@ -482,19 +494,19 @@ const ( + AFMVXW + AFMVWX + +- // 11.8: Single-Precision Floating-Point Compare Instructions ++ // 20.8: Single-Precision Floating-Point Compare Instructions + AFEQS + AFLTS + AFLES + +- // 11.9: Single-Precision Floating-Point Classify Instruction ++ // 20.9: Single-Precision Floating-Point Classify Instruction + AFCLASSS + +- // 12.3: Double-Precision Load and Store Instructions ++ // 21.3: Double-Precision Load and Store Instructions + AFLD + AFSD + +- // 12.4: Double-Precision Floating-Point Computational Instructions ++ // 21.4: Double-Precision Floating-Point Computational Instructions + AFADDD + AFSUBD + AFMULD +@@ -507,7 +519,7 @@ const ( + AFNMADDD + AFNMSUBD + +- // 12.5: Double-Precision Floating-Point Conversion and Move Instructions ++ // 21.5: Double-Precision Floating-Point Conversion and Move Instructions + AFCVTWD + AFCVTLD + AFCVTDW +@@ -524,19 +536,19 @@ const ( + AFMVXD + AFMVDX + +- // 12.6: Double-Precision Floating-Point Compare Instructions ++ // 21.6: Double-Precision Floating-Point Compare Instructions + AFEQD + AFLTD + AFLED + +- // 12.7: Double-Precision Floating-Point Classify Instruction ++ // 21.7: Double-Precision Floating-Point Classify Instruction + AFCLASSD + +- // 13.1 Quad-Precision Load and Store Instructions ++ // 22.1 Quad-Precision Load and Store Instructions + AFLQ + AFSQ + +- // 
13.2: Quad-Precision Computational Instructions ++ // 22.2: Quad-Precision Computational Instructions + AFADDQ + AFSUBQ + AFMULQ +@@ -549,7 +561,7 @@ const ( + AFNMADDQ + AFNMSUBQ + +- // 13.3 Quad-Precision Convert and Move Instructions ++ // 22.3 Quad-Precision Convert and Move Instructions + AFCVTWQ + AFCVTLQ + AFCVTSQ +@@ -566,46 +578,15 @@ const ( + AFSGNJNQ + AFSGNJXQ + +- // 13.4 Quad-Precision Floating-Point Compare Instructions ++ // 22.4 Quad-Precision Floating-Point Compare Instructions + AFEQQ + AFLEQ + AFLTQ + +- // 13.5 Quad-Precision Floating-Point Classify Instruction ++ // 22.5 Quad-Precision Floating-Point Classify Instruction + AFCLASSQ + +- // Privileged ISA (Version 20190608-Priv-MSU-Ratified) +- +- // 3.1.9: Instructions to Access CSRs +- ACSRRW +- ACSRRS +- ACSRRC +- ACSRRWI +- ACSRRSI +- ACSRRCI +- +- // 3.2.1: Environment Call and Breakpoint +- AECALL +- ASCALL +- AEBREAK +- ASBREAK +- +- // 3.2.2: Trap-Return Instructions +- AMRET +- ASRET +- ADRET +- +- // 3.2.3: Wait for Interrupt +- AWFI +- +- // 4.2.1: Supervisor Memory-Management Fence Instruction +- ASFENCEVMA +- +- // +- // RISC-V Bit-Manipulation ISA-extensions (1.0) +- // +- +- // 1.1: Address Generation Instructions (Zba) ++ // 28.4.1: Address Generation Instructions (Zba) + AADDUW + ASH1ADD + ASH1ADDUW +@@ -615,7 +596,7 @@ const ( + ASH3ADDUW + ASLLIUW + +- // 1.2: Basic Bit Manipulation (Zbb) ++ // 28.4.2: Basic Bit Manipulation (Zbb) + AANDN + AORN + AXNOR +@@ -633,7 +614,7 @@ const ( + ASEXTH + AZEXTH + +- // 1.3: Bitwise Rotation (Zbb) ++ // 28.4.3: Bitwise Rotation (Zbb) + AROL + AROLW + AROR +@@ -643,7 +624,7 @@ const ( + AORCB + AREV8 + +- // 1.5: Single-bit Instructions (Zbs) ++ // 28.4.4: Single-bit Instructions (Zbs) + ABCLR + ABCLRI + ABEXT +@@ -1144,6 +1125,27 @@ const ( + AVMV4RV + AVMV8RV + ++ // ++ // Privileged ISA (version 20240411) ++ // ++ ++ // 3.3.1: Environment Call and Breakpoint ++ AECALL ++ ASCALL ++ AEBREAK ++ ASBREAK ++ ++ // 3.3.2: Trap-Return Instructions ++ AMRET ++ ASRET ++ ADRET ++ ++ // 3.3.3: Wait for Interrupt ++ AWFI ++ ++ // 10.2: Supervisor Memory-Management Fence Instruction ++ ASFENCEVMA ++ + // The escape hatch. Inserts a single 32-bit word. + AWORD + +-- +2.39.5 + diff --git a/2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch b/2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch new file mode 100644 index 0000000..1fe9120 --- /dev/null +++ b/2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch @@ -0,0 +1,58 @@ +From 9f63043c4866fc204823e3fd62bacb1cfe0bbeb1 Mon Sep 17 00:00:00 2001 +From: Lin Runze +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 059/119] cmd/internal/obj: add prologue_end DWARF stmt for + riscv64 + +This patch adds prologue_end statement to the DWARF info for riscv64, +which delve debugger uses for skip stacksplit prologue. 
+ +Change-Id: I4e5d9c26202385f65b3118b16f53f66de9d327f0 +Reviewed-on: https://go-review.googlesource.com/c/go/+/620295 +Reviewed-by: Hyang-Ah Hana Kim +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/obj.go | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 6f74f38543..76aec7df8f 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -23,6 +23,7 @@ package riscv + import ( + "cmd/internal/obj" + "cmd/internal/objabi" ++ "cmd/internal/src" + "cmd/internal/sys" + "fmt" + "internal/abi" +@@ -427,18 +428,23 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + prologue = stacksplit(ctxt, prologue, cursym, newprog, stacksize) // emit split check + } + ++ q := prologue ++ + if stacksize != 0 { + prologue = ctxt.StartUnsafePoint(prologue, newprog) + + // Actually save LR. + prologue = obj.Appendp(prologue, newprog) + prologue.As = AMOV ++ prologue.Pos = q.Pos + prologue.From = obj.Addr{Type: obj.TYPE_REG, Reg: REG_LR} + prologue.To = obj.Addr{Type: obj.TYPE_MEM, Reg: REG_SP, Offset: -stacksize} + + // Insert stack adjustment. + prologue = obj.Appendp(prologue, newprog) + prologue.As = AADDI ++ prologue.Pos = q.Pos ++ prologue.Pos = prologue.Pos.WithXlogue(src.PosPrologueEnd) + prologue.From = obj.Addr{Type: obj.TYPE_CONST, Offset: -stacksize} + prologue.Reg = REG_SP + prologue.To = obj.Addr{Type: obj.TYPE_REG, Reg: REG_SP} +-- +2.39.5 + diff --git a/2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch b/2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch new file mode 100644 index 0000000..bb06b51 --- /dev/null +++ b/2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch @@ -0,0 +1,371 @@ +From b5e96f2d3a7ea8eb3f2114eac8f564197f64af29 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 060/119] cmd/internal/obj/riscv: update RISC-V instruction + table + +Regenerate RISC-V instruction table from the riscv-opcodes repository, +due to various changes and shuffling upstream. + +This has been changed to remove pseudo-instructions, since Go only +needs the instruction encodings and including the pseudo-instructions +is creating unnecessary complications (for example, the inclusion +of ANOP and ARET, as well as strangely named aliases such as +AJALPSEUDO/AJALRPSEUDO). Remove pseudo-instructions that are not +currently supported by the assembler and add specific handling for +RDCYCLE, RDTIME and RDINSTRET, which were previously implemented +via the instruction encodings. 
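For reference, RDCYCLE, RDTIME and RDINSTRET are defined in the unprivileged spec as CSRRS reads (with rs1 = x0) of the cycle, time and instret counters at CSR addresses 0xC00, 0xC01 and 0xC02. Stored in a sign-extended 12-bit immediate field, those addresses become -1024, -1023 and -1022, which is where the imm values in the new obj.go handling below come from. A small Go check of that arithmetic (illustrative only, not part of the change):

package main

import "fmt"

// signExtend12 interprets the low 12 bits of a CSR address as the signed
// immediate that an I-type instruction encoding carries.
func signExtend12(csr int64) int64 {
        return csr << 52 >> 52
}

func main() {
        for _, c := range []struct {
                name string
                csr  int64
        }{
                {"cycle", 0xC00},
                {"time", 0xC01},
                {"instret", 0xC02},
        } {
                fmt.Printf("%-8s %#x -> %d\n", c.name, c.csr, signExtend12(c.csr))
        }
        // Prints:
        // cycle    0xc00 -> -1024
        // time     0xc01 -> -1023
        // instret  0xc02 -> -1022
}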
+ +Change-Id: I78be4506ba6b627eba1f321406081a63bab5b2e6 +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Reviewed-on: https://go-review.googlesource.com/c/go/+/616116 +Reviewed-by: Michael Pratt +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Carlos Amedee +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/anames.go | 23 ++++------------ + src/cmd/internal/obj/riscv/cpu.go | 36 ++++++------------------- + src/cmd/internal/obj/riscv/inst.go | 40 ---------------------------- + src/cmd/internal/obj/riscv/obj.go | 35 ++++++++++++++++++------ + 4 files changed, 40 insertions(+), 94 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index dbdce22687..c49569c943 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -44,8 +44,6 @@ var Anames = []string{ + "SH", + "SB", + "FENCE", +- "FENCETSO", +- "PAUSE", + "ADDIW", + "SLLIW", + "SRLIW", +@@ -63,12 +61,6 @@ var Anames = []string{ + "CSRRWI", + "CSRRSI", + "CSRRCI", +- "RDCYCLE", +- "RDCYCLEH", +- "RDTIME", +- "RDTIMEH", +- "RDINSTRET", +- "RDINSTRETH", + "MUL", + "MULH", + "MULHU", +@@ -104,14 +96,6 @@ var Anames = []string{ + "AMOMAXUW", + "AMOMINW", + "AMOMINUW", +- "FRCSR", +- "FSCSR", +- "FRRM", +- "FSRM", +- "FRFLAGS", +- "FSFLAGS", +- "FSRMI", +- "FSFLAGSI", + "FLW", + "FSW", + "FADDS", +@@ -645,15 +629,15 @@ var Anames = []string{ + "BNEZ", + "FABSD", + "FABSS", ++ "FNED", + "FNEGD", + "FNEGS", +- "FNED", + "FNES", + "MOV", + "MOVB", + "MOVBU", +- "MOVF", + "MOVD", ++ "MOVF", + "MOVH", + "MOVHU", + "MOVW", +@@ -661,6 +645,9 @@ var Anames = []string{ + "NEG", + "NEGW", + "NOT", ++ "RDCYCLE", ++ "RDINSTRET", ++ "RDTIME", + "SEQZ", + "SNEZ", + "LAST", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 1c3a13f6c3..a36b95e6d2 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -373,8 +373,6 @@ const ( + + // 2.7: Memory Ordering Instructions + AFENCE +- AFENCETSO +- APAUSE + + // 4.2: Integer Computational Instructions (RV64I) + AADDIW +@@ -399,14 +397,6 @@ const ( + ACSRRSI + ACSRRCI + +- // 8.1: Base Counters and Timers (Zicntr) +- ARDCYCLE +- ARDCYCLEH +- ARDTIME +- ARDTIMEH +- ARDINSTRET +- ARDINSTRETH +- + // 13.1: Multiplication Operations + AMUL + AMULH +@@ -450,16 +440,6 @@ const ( + AAMOMINW + AAMOMINUW + +- // 20.2: Floating-Point Control and Status Register +- AFRCSR +- AFSCSR +- AFRRM +- AFSRM +- AFRFLAGS +- AFSFLAGS +- AFSRMI +- AFSFLAGSI +- + // 20.5: Single-Precision Load and Store Instructions + AFLW + AFSW +@@ -1163,15 +1143,15 @@ const ( + ABNEZ + AFABSD + AFABSS ++ AFNED + AFNEGD + AFNEGS +- AFNED + AFNES + AMOV + AMOVB + AMOVBU +- AMOVF + AMOVD ++ AMOVF + AMOVH + AMOVHU + AMOVW +@@ -1179,6 +1159,9 @@ const ( + ANEG + ANEGW + ANOT ++ ARDCYCLE ++ ARDINSTRET ++ ARDTIME + ASEQZ + ASNEZ + +@@ -1237,12 +1220,9 @@ const ( + // Any instructions not listed here are assumed to either be non-unary or to read + // from its argument. + var unaryDst = map[obj.As]bool{ +- ARDCYCLE: true, +- ARDCYCLEH: true, +- ARDTIME: true, +- ARDTIMEH: true, +- ARDINSTRET: true, +- ARDINSTRETH: true, ++ ARDCYCLE: true, ++ ARDTIME: true, ++ ARDINSTRET: true, + } + + // Instruction encoding masks. 
+diff --git a/src/cmd/internal/obj/riscv/inst.go b/src/cmd/internal/obj/riscv/inst.go +index c264f6ae15..2d9132e532 100644 +--- a/src/cmd/internal/obj/riscv/inst.go ++++ b/src/cmd/internal/obj/riscv/inst.go +@@ -212,8 +212,6 @@ func encode(a obj.As) *inst { + return &inst{0x53, 0x0, 0x0, 0x0, 384, 0xc} + case AFENCE: + return &inst{0xf, 0x0, 0x0, 0x0, 0, 0x0} +- case AFENCETSO: +- return &inst{0xf, 0x0, 0x0, 0x13, -1997, 0x41} + case AFEQD: + return &inst{0x53, 0x2, 0x0, 0x0, -1504, 0x51} + case AFEQQ: +@@ -270,14 +268,10 @@ func encode(a obj.As) *inst { + return &inst{0x53, 0x0, 0x0, 0x0, 256, 0x8} + case AFMVDX: + return &inst{0x53, 0x0, 0x0, 0x0, -224, 0x79} +- case AFMVSX: +- return &inst{0x53, 0x0, 0x0, 0x0, -256, 0x78} + case AFMVWX: + return &inst{0x53, 0x0, 0x0, 0x0, -256, 0x78} + case AFMVXD: + return &inst{0x53, 0x0, 0x0, 0x0, -480, 0x71} +- case AFMVXS: +- return &inst{0x53, 0x0, 0x0, 0x0, -512, 0x70} + case AFMVXW: + return &inst{0x53, 0x0, 0x0, 0x0, -512, 0x70} + case AFNMADDD: +@@ -292,20 +286,8 @@ func encode(a obj.As) *inst { + return &inst{0x4b, 0x0, 0x0, 0x0, 96, 0x3} + case AFNMSUBS: + return &inst{0x4b, 0x0, 0x0, 0x0, 0, 0x0} +- case AFRCSR: +- return &inst{0x73, 0x2, 0x0, 0x3, 3, 0x0} +- case AFRFLAGS: +- return &inst{0x73, 0x2, 0x0, 0x1, 1, 0x0} +- case AFRRM: +- return &inst{0x73, 0x2, 0x0, 0x2, 2, 0x0} +- case AFSCSR: +- return &inst{0x73, 0x1, 0x0, 0x3, 3, 0x0} + case AFSD: + return &inst{0x27, 0x3, 0x0, 0x0, 0, 0x0} +- case AFSFLAGS: +- return &inst{0x73, 0x1, 0x0, 0x1, 1, 0x0} +- case AFSFLAGSI: +- return &inst{0x73, 0x5, 0x0, 0x1, 1, 0x0} + case AFSGNJD: + return &inst{0x53, 0x0, 0x0, 0x0, 544, 0x11} + case AFSGNJQ: +@@ -332,10 +314,6 @@ func encode(a obj.As) *inst { + return &inst{0x53, 0x0, 0x0, 0x0, 1504, 0x2f} + case AFSQRTS: + return &inst{0x53, 0x0, 0x0, 0x0, 1408, 0x2c} +- case AFSRM: +- return &inst{0x73, 0x1, 0x0, 0x2, 2, 0x0} +- case AFSRMI: +- return &inst{0x73, 0x5, 0x0, 0x2, 2, 0x0} + case AFSUBD: + return &inst{0x53, 0x0, 0x0, 0x0, 160, 0x5} + case AFSUBQ: +@@ -396,20 +374,6 @@ func encode(a obj.As) *inst { + return &inst{0x13, 0x6, 0x0, 0x0, 0, 0x0} + case AORN: + return &inst{0x33, 0x6, 0x0, 0x0, 1024, 0x20} +- case APAUSE: +- return &inst{0xf, 0x0, 0x0, 0x10, 16, 0x0} +- case ARDCYCLE: +- return &inst{0x73, 0x2, 0x0, 0x0, -1024, 0x60} +- case ARDCYCLEH: +- return &inst{0x73, 0x2, 0x0, 0x0, -896, 0x64} +- case ARDINSTRET: +- return &inst{0x73, 0x2, 0x0, 0x2, -1022, 0x60} +- case ARDINSTRETH: +- return &inst{0x73, 0x2, 0x0, 0x2, -894, 0x64} +- case ARDTIME: +- return &inst{0x73, 0x2, 0x0, 0x1, -1023, 0x60} +- case ARDTIMEH: +- return &inst{0x73, 0x2, 0x0, 0x1, -895, 0x64} + case AREM: + return &inst{0x33, 0x6, 0x0, 0x0, 32, 0x1} + case AREMU: +@@ -434,14 +398,10 @@ func encode(a obj.As) *inst { + return &inst{0x3b, 0x5, 0x0, 0x0, 1536, 0x30} + case ASB: + return &inst{0x23, 0x0, 0x0, 0x0, 0, 0x0} +- case ASBREAK: +- return &inst{0x73, 0x0, 0x0, 0x1, 1, 0x0} + case ASCD: + return &inst{0x2f, 0x3, 0x0, 0x0, 384, 0xc} + case ASCW: + return &inst{0x2f, 0x2, 0x0, 0x0, 384, 0xc} +- case ASCALL: +- return &inst{0x73, 0x0, 0x0, 0x0, 0, 0x0} + case ASD: + return &inst{0x23, 0x3, 0x0, 0x0, 0, 0x0} + case ASEXTB: +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 76aec7df8f..6b490a8967 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -155,6 +155,14 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + case obj.AUNDEF: + p.As = AEBREAK + ++ case AFMVXS: ++ 
// FMVXS is the old name for FMVXW. ++ p.As = AFMVXW ++ ++ case AFMVSX: ++ // FMVSX is the old name for FMVWX. ++ p.As = AFMVWX ++ + case ASCALL: + // SCALL is the old name for ECALL. + p.As = AECALL +@@ -1626,6 +1634,9 @@ var encodings = [ALAST & obj.AMask]encoding{ + ALD & obj.AMask: iIIEncoding, + ASD & obj.AMask: sIEncoding, + ++ // 7.1: CSR Instructions ++ ACSRRS & obj.AMask: iIIEncoding, ++ + // 7.1: Multiplication Operations + AMUL & obj.AMask: rIIIEncoding, + AMULH & obj.AMask: rIIIEncoding, +@@ -1667,11 +1678,6 @@ var encodings = [ALAST & obj.AMask]encoding{ + AAMOMINUW & obj.AMask: rIIIEncoding, + AAMOMINUD & obj.AMask: rIIIEncoding, + +- // 10.1: Base Counters and Timers +- ARDCYCLE & obj.AMask: iIIEncoding, +- ARDTIME & obj.AMask: iIIEncoding, +- ARDINSTRET & obj.AMask: iIIEncoding, +- + // 11.5: Single-Precision Load and Store Instructions + AFLW & obj.AMask: iFEncoding, + AFSW & obj.AMask: sFEncoding, +@@ -1701,8 +1707,6 @@ var encodings = [ALAST & obj.AMask]encoding{ + AFSGNJS & obj.AMask: rFFFEncoding, + AFSGNJNS & obj.AMask: rFFFEncoding, + AFSGNJXS & obj.AMask: rFFFEncoding, +- AFMVXS & obj.AMask: rFIEncoding, +- AFMVSX & obj.AMask: rIFEncoding, + AFMVXW & obj.AMask: rFIEncoding, + AFMVWX & obj.AMask: rIFEncoding, + +@@ -2418,7 +2422,7 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.funct7 = 3 + ins.rd, ins.rs1, ins.rs2 = uint32(p.RegTo2), uint32(p.To.Reg), uint32(p.From.Reg) + +- case AECALL, AEBREAK, ARDCYCLE, ARDTIME, ARDINSTRET: ++ case AECALL, AEBREAK: + insEnc := encode(p.As) + if p.To.Type == obj.TYPE_NONE { + ins.rd = REG_ZERO +@@ -2426,6 +2430,21 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.rs1 = REG_ZERO + ins.imm = insEnc.csr + ++ case ARDCYCLE, ARDTIME, ARDINSTRET: ++ ins.as = ACSRRS ++ if p.To.Type == obj.TYPE_NONE { ++ ins.rd = REG_ZERO ++ } ++ ins.rs1 = REG_ZERO ++ switch p.As { ++ case ARDCYCLE: ++ ins.imm = -1024 ++ case ARDTIME: ++ ins.imm = -1023 ++ case ARDINSTRET: ++ ins.imm = -1022 ++ } ++ + case AFENCE: + ins.rd, ins.rs1, ins.rs2 = REG_ZERO, REG_ZERO, obj.REG_NONE + ins.imm = 0x0ff +-- +2.39.5 + diff --git a/2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch b/2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch new file mode 100644 index 0000000..3c92e2f --- /dev/null +++ b/2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch @@ -0,0 +1,110 @@ +From 07fe1dae08b8a60a417bc3c9b65e01ea19e3e3cd Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 061/119] crypto/sha512: improve performance of riscv64 + assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Implement optimised versions of Maj and Ch, which reduce the number of +instructions required per round. Reorder instructions for better +interleaving. 
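Both rewrites are standard Boolean identities, also spelled out in the updated comments in the diff below: Ch(x, y, z) = (x AND y) XOR (NOT x AND z) = ((y XOR z) AND x) XOR z, and Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) = ((y XOR z) AND x) XOR (y AND z). Each rewritten form saves one bitwise operation per evaluation, and the new Ch avoids the NOT entirely. A self-contained Go check of the identities (illustrative only, not part of the patch):

package main

import (
        "fmt"
        "math/rand"
)

func main() {
        ok := true
        for i := 0; i < 1000; i++ {
                x, y, z := rand.Uint64(), rand.Uint64(), rand.Uint64()
                // Ch: (x AND y) XOR (NOT x AND z) == ((y XOR z) AND x) XOR z
                ok = ok && (x&y)^(^x&z) == ((y^z)&x)^z
                // Maj: (x AND y) XOR (x AND z) XOR (y AND z) == ((y XOR z) AND x) XOR (y AND z)
                ok = ok && (x&y)^(x&z)^(y&z) == ((y^z)&x)^(y&z)
        }
        fmt.Println("identities hold:", ok) // identities hold: true
}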
+ +This gives around a 10% gain on a StarFive VisionFive 2: + + │ sha512.1 │ sha512.2 │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-4 9.310µ ± 0% 8.564µ ± 0% -8.01% (p=0.000 n=10) +Hash8Bytes/Sum384-4 8.833µ ± 0% 7.980µ ± 0% -9.66% (p=0.000 n=10) +Hash8Bytes/Sum512-4 9.293µ ± 0% 8.162µ ± 0% -12.17% (p=0.000 n=10) +Hash1K/New-4 49.60µ ± 0% 44.33µ ± 0% -10.63% (p=0.000 n=10) +Hash1K/Sum384-4 48.93µ ± 0% 43.78µ ± 0% -10.53% (p=0.000 n=10) +Hash1K/Sum512-4 49.48µ ± 0% 43.96µ ± 0% -11.15% (p=0.000 n=10) +Hash8K/New-4 327.9µ ± 0% 292.6µ ± 0% -10.78% (p=0.000 n=10) +Hash8K/Sum384-4 327.3µ ± 0% 292.0µ ± 0% -10.77% (p=0.000 n=10) +Hash8K/Sum512-4 327.8µ ± 0% 292.2µ ± 0% -10.85% (p=0.000 n=10) +geomean 52.87µ 47.31µ -10.51% + + │ sha512.1 │ sha512.2 │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 839.8Ki ± 0% 908.2Ki ± 0% +8.14% (p=0.000 n=10) +Hash8Bytes/Sum384-4 888.7Ki ± 1% 976.6Ki ± 0% +9.89% (p=0.000 n=10) +Hash8Bytes/Sum512-4 839.8Ki ± 0% 957.0Ki ± 0% +13.95% (p=0.000 n=10) +Hash1K/New-4 19.69Mi ± 0% 22.03Mi ± 0% +11.86% (p=0.000 n=10) +Hash1K/Sum384-4 19.96Mi ± 0% 22.31Mi ± 0% +11.75% (p=0.000 n=10) +Hash1K/Sum512-4 19.74Mi ± 0% 22.21Mi ± 0% +12.51% (p=0.000 n=10) +Hash8K/New-4 23.82Mi ± 0% 26.70Mi ± 0% +12.09% (p=0.000 n=10) +Hash8K/Sum384-4 23.87Mi ± 0% 26.75Mi ± 0% +12.07% (p=0.000 n=10) +Hash8K/Sum512-4 23.83Mi ± 0% 26.73Mi ± 0% +12.16% (p=0.000 n=10) +geomean 7.334Mi 8.184Mi +11.59% + +Change-Id: I66e359e96b25b38efbc4d840e6b2d6a1e5d417ec +Reviewed-on: https://go-review.googlesource.com/c/go/+/605495 +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +--- + src/crypto/sha512/sha512block_riscv64.s | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/src/crypto/sha512/sha512block_riscv64.s b/src/crypto/sha512/sha512block_riscv64.s +index e3a240f70e..0281464e4d 100644 +--- a/src/crypto/sha512/sha512block_riscv64.s ++++ b/src/crypto/sha512/sha512block_riscv64.s +@@ -98,38 +98,38 @@ + // T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt + // BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x) + // Ch(x, y, z) = (x AND y) XOR (NOT x AND z) ++// = ((y XOR z) AND x) XOR z + #define SHA512T1(index, e, f, g, h) \ + MOV (index*8)(X18), X8; \ + ADD X5, h; \ + ROR $14, e, X6; \ + ADD X8, h; \ + ROR $18, e, X7; \ +- XOR X7, X6; \ + ROR $41, e, X8; \ ++ XOR X7, X6; \ ++ XOR f, g, X5; \ + XOR X8, X6; \ ++ AND e, X5; \ + ADD X6, h; \ +- AND e, f, X5; \ +- NOT e, X7; \ +- AND g, X7; \ +- XOR X7, X5; \ ++ XOR g, X5; \ + ADD h, X5 + + // Calculate T2 in X6. + // T2 = BIGSIGMA0(a) + Maj(a, b, c) + // BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x) + // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) ++// = ((y XOR z) AND x) XOR (y AND z) + #define SHA512T2(a, b, c) \ + ROR $28, a, X6; \ + ROR $34, a, X7; \ +- XOR X7, X6; \ + ROR $39, a, X8; \ ++ XOR X7, X6; \ ++ XOR b, c, X9; \ ++ AND b, c, X7; \ ++ AND a, X9; \ + XOR X8, X6; \ +- AND a, b, X7; \ +- AND a, c, X8; \ +- XOR X8, X7; \ +- AND b, c, X9; \ +- XOR X9, X7; \ +- ADD X7, X6 ++ XOR X7, X9; \ ++ ADD X9, X6 + + // Calculate T1 and T2, then e = d + T1 and a = T1 + T2. + // The values for e and a are stored in d and h, ready for rotation. 
+-- +2.39.5 + diff --git a/2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch b/2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch new file mode 100644 index 0000000..0236441 --- /dev/null +++ b/2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch @@ -0,0 +1,466 @@ +From b8ad4498aa879366c7eaa60a3999a48c67195249 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 062/119] internal/bytealg: optimize IndexByte for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The existing implementations of IndexByte and IndexByteString for +riscv64 are very simplistic. They load and compare a single byte at +a time in a tight loop. It's possible to improve performance in the +general case by loading and checking 8 bytes at a time. This is +achieved using the 'Determine if a word has a byte equal to n' bit +hack from https://graphics.stanford.edu/~seander/bithacks.html. + +We broadcast the byte we're looking for across a 64 bit register, +let v be the result of xoring that register with 8 bytes loaded +from the buffer and then use the formula, + +(((v) - 0x0101010101010101UL) & ~(v) & 0x8080808080808080UL) + +which evaluates to true if any one of the bytes in v is 0, i.e, +matches the byte we're looking for. We then just need to figure +out which byte out of the 8 it is to return the correct index. + +This change generally improves performance when the byte we're +looking for is not in the first 24 bytes of the buffer and degrades +performance slightly when it is. + +Some example benchmarks results from the bytes and strings package +are presented below. These were generated on a VisionFive2 running +Ubuntu 24.04. + +Subset of bytes Index benchmarks + +IndexByte/10 46.49n ± 0% 44.08n ± 0% -5.19% (p=0.000 n=10) +IndexByte/32 75.98n ± 0% 67.90n ± 0% -10.63% (p=0.000 n=10) +IndexByte/4K 5.512µ ± 0% 2.113µ ± 0% -61.67% (p=0.000 n=10) +IndexByte/4M 7.354m ± 0% 3.218m ± 0% -56.24% (p=0.000 n=10) +IndexByte/64M 90.15m ± 0% 33.86m ± 0% -62.44% (p=0.000 n=10) +IndexBytePortable/10 50.41n ± 0% 54.92n ± 1% +8.94% (p=0.000 n=10) +IndexBytePortable/32 111.9n ± 0% 115.5n ± 0% +3.22% (p=0.000 n=10) +IndexBytePortable/4K 10.99µ ± 0% 10.99µ ± 0% +0.04% (p=0.000 n=10) +IndexBytePortable/4M 11.24m ± 0% 11.24m ± 0% ~ (p=0.218 n=10) +IndexBytePortable/64M 179.8m ± 0% 179.8m ± 0% +0.01% (p=0.001 n=10) +IndexRune/10 104.2n ± 0% 104.4n ± 0% +0.19% (p=0.000 n=10) +IndexRune/32 133.7n ± 0% 139.3n ± 0% +4.23% (p=0.000 n=10) +IndexRune/4K 5.573µ ± 0% 2.184µ ± 0% -60.81% (p=0.000 n=10) +IndexRune/4M 5.634m ± 0% 2.112m ± 0% -62.51% (p=0.000 n=10) +IndexRune/64M 90.19m ± 0% 33.87m ± 0% -62.45% (p=0.000 n=10) +IndexRuneASCII/10 50.42n ± 2% 47.14n ± 0% -6.52% (p=0.000 n=10) +IndexRuneASCII/32 79.64n ± 1% 70.39n ± 0% -11.61% (p=0.000 n=10) +IndexRuneASCII/4K 5.516µ ± 0% 2.115µ ± 0% -61.66% (p=0.000 n=10) +IndexRuneASCII/4M 5.634m ± 0% 2.112m ± 0% -62.51% (p=0.000 n=10) +IndexRuneASCII/64M 90.16m ± 0% 33.86m ± 0% -62.44% (p=0.000 n=10) +IndexRuneUnicode/Latin/10 82.14n ± 0% 82.07n ± 0% -0.09% (p=0.000 n=10) +IndexRuneUnicode/Latin/32 111.6n ± 0% 117.1n ± 0% +4.93% (p=0.000 n=10) +IndexRuneUnicode/Latin/4K 6.222µ ± 0% 3.429µ ± 0% -44.89% (p=0.000 n=10) +IndexRuneUnicode/Latin/4M 8.189m ± 0% 4.706m ± 0% -42.53% (p=0.000 n=10) +IndexRuneUnicode/Latin/64M 171.8m ± 2% 105.8m ± 0% -38.44% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/10 89.69n ± 0% 89.67n ± 0% -0.02% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/32 119.1n ± 0% 124.1n 
± 0% +4.20% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/4K 8.002µ ± 0% 6.232µ ± 0% -22.12% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/4M 9.501m ± 0% 7.510m ± 0% -20.95% (p=0.000 n=10) +IndexRuneUnicode/Cyrillic/64M 186.5m ± 0% 150.3m ± 0% -19.41% (p=0.000 n=10) +IndexRuneUnicode/Han/10 117.8n ± 0% 118.1n ± 0% +0.25% (p=0.000 n=10) +IndexRuneUnicode/Han/32 151.5n ± 0% 154.0n ± 0% +1.65% (p=0.000 n=10) +IndexRuneUnicode/Han/4K 6.664µ ± 0% 4.125µ ± 0% -38.11% (p=0.000 n=10) +IndexRuneUnicode/Han/4M 8.526m ± 0% 5.502m ± 0% -35.46% (p=0.000 n=10) +IndexRuneUnicode/Han/64M 171.8m ± 1% 112.2m ± 0% -34.68% (p=0.000 n=10) +Index/10 199.3n ± 1% 199.4n ± 0% ~ (p=1.000 n=10) +Index/32 547.7n ± 0% 547.3n ± 0% -0.08% (p=0.001 n=10) +Index/4K 38.62µ ± 0% 38.62µ ± 0% -0.01% (p=0.023 n=10) +Index/4M 40.46m ± 0% 40.45m ± 0% ~ (p=0.105 n=10) +Index/64M 648.5m ± 0% 648.4m ± 0% ~ (p=1.000 n=10) +IndexEasy/10 70.25n ± 0% 70.92n ± 0% +0.95% (p=0.000 n=10) +IndexEasy/32 104.60n ± 0% 95.67n ± 0% -8.54% (p=0.000 n=10) +IndexEasy/4K 5.544µ ± 0% 2.142µ ± 0% -61.36% (p=0.000 n=10) +IndexEasy/4M 7.354m ± 0% 3.213m ± 0% -56.32% (p=0.000 n=10) +IndexEasy/64M 114.93m ± 2% 52.61m ± 0% -54.22% (p=0.000 n=10) +IndexHard1 10.09m ± 0% 10.09m ± 0% ~ (p=0.393 n=10) +IndexHard2 10.09m ± 0% 10.09m ± 0% ~ (p=0.481 n=10) +IndexHard3 10.09m ± 0% 10.09m ± 0% ~ (p=1.000 n=10) +IndexHard4 10.09m ± 0% 10.09m ± 0% ~ (p=0.739 n=10) +LastIndexHard1 10.71m ± 0% 10.71m ± 0% ~ (p=0.052 n=10) +LastIndexHard2 10.71m ± 0% 10.71m ± 0% ~ (p=0.218 n=10) +LastIndexHard3 10.71m ± 0% 10.71m ± 0% ~ (p=0.739 n=10) +IndexAnyASCII/1:1 30.13n ± 0% 30.79n ± 0% +2.19% (p=0.000 n=10) +IndexAnyASCII/1:2 31.49n ± 0% 32.16n ± 0% +2.13% (p=0.000 n=10) +IndexAnyASCII/1:4 34.16n ± 0% 34.82n ± 0% +1.93% (p=0.000 n=10) +IndexAnyASCII/1:8 39.50n ± 0% 40.16n ± 0% +1.67% (p=0.000 n=10) +IndexAnyASCII/1:16 50.20n ± 0% 50.87n ± 0% +1.33% (p=0.000 n=10) +IndexAnyASCII/1:32 81.04n ± 0% 50.29n ± 0% -37.94% (p=0.000 n=10) +IndexAnyASCII/1:64 119.80n ± 0% 66.94n ± 0% -44.13% (p=0.000 n=10) +IndexAnyASCII/16:1 54.86n ± 0% 55.53n ± 0% +1.22% (p=0.000 n=10) +IndexAnyASCII/16:2 268.2n ± 0% 268.2n ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/16:4 288.1n ± 0% 288.1n ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyASCII/16:8 328.3n ± 0% 328.2n ± 0% ~ (p=0.370 n=10) +IndexAnyASCII/16:16 413.4n ± 0% 413.4n ± 0% ~ (p=0.628 n=10) +IndexAnyASCII/16:32 574.0n ± 0% 573.9n ± 0% ~ (p=0.141 n=10) +IndexAnyASCII/16:64 895.1n ± 0% 895.1n ± 0% ~ (p=0.548 n=10) +IndexAnyASCII/256:1 381.4n ± 0% 175.4n ± 0% -53.99% (p=0.000 n=10) +IndexAnyASCII/256:2 2.998µ ± 0% 2.998µ ± 0% ~ (p=0.365 n=10) +IndexAnyASCII/256:4 3.018µ ± 0% 3.018µ ± 0% ~ (p=0.650 n=10) +IndexAnyASCII/256:8 3.058µ ± 0% 3.064µ ± 0% +0.20% (p=0.011 n=10) +IndexAnyASCII/256:16 3.143µ ± 0% 3.150µ ± 0% +0.21% (p=0.000 n=10) +IndexAnyASCII/256:32 3.303µ ± 0% 3.307µ ± 0% +0.12% (p=0.000 n=10) +IndexAnyASCII/256:64 3.625µ ± 0% 3.638µ ± 0% +0.36% (p=0.000 n=10) +IndexAnyUTF8/1:1 30.13n ± 0% 30.94n ± 0% +2.69% (p=0.000 n=10) +IndexAnyUTF8/1:2 31.49n ± 0% 32.30n ± 0% +2.59% (p=0.000 n=10) +IndexAnyUTF8/1:4 34.16n ± 0% 35.03n ± 0% +2.55% (p=0.000 n=10) +IndexAnyUTF8/1:8 39.50n ± 0% 40.16n ± 0% +1.67% (p=0.000 n=10) +IndexAnyUTF8/1:16 50.20n ± 0% 50.84n ± 0% +1.27% (p=0.000 n=10) +IndexAnyUTF8/1:32 81.02n ± 0% 61.55n ± 0% -24.03% (p=0.000 n=10) +IndexAnyUTF8/1:64 119.80n ± 0% 80.04n ± 0% -33.19% (p=0.000 n=10) +IndexAnyUTF8/16:1 489.0n ± 0% 489.0n ± 0% ~ (p=1.000 n=10) +IndexAnyUTF8/16:2 361.9n ± 0% 372.6n ± 0% +2.96% (p=0.000 n=10) +IndexAnyUTF8/16:4 404.7n ± 0% 415.4n ± 
0% +2.64% (p=0.000 n=10) +IndexAnyUTF8/16:8 489.9n ± 0% 500.7n ± 0% +2.20% (p=0.000 n=10) +IndexAnyUTF8/16:16 661.2n ± 0% 671.9n ± 0% +1.62% (p=0.000 n=10) +IndexAnyUTF8/16:32 1004.0n ± 0% 881.6n ± 0% -12.19% (p=0.000 n=10) +IndexAnyUTF8/16:64 1.767µ ± 0% 1.129µ ± 0% -36.11% (p=0.000 n=10) +IndexAnyUTF8/256:1 7.072µ ± 0% 7.072µ ± 0% ~ (p=0.387 n=10) +IndexAnyUTF8/256:2 4.700µ ± 0% 4.872µ ± 0% +3.66% (p=0.000 n=10) +IndexAnyUTF8/256:4 5.386µ ± 0% 5.557µ ± 0% +3.18% (p=0.000 n=10) +IndexAnyUTF8/256:8 6.752µ ± 0% 6.923µ ± 0% +2.53% (p=0.000 n=10) +IndexAnyUTF8/256:16 9.493µ ± 0% 9.664µ ± 0% +1.80% (p=0.000 n=10) +IndexAnyUTF8/256:32 14.97µ ± 0% 12.93µ ± 0% -13.64% (p=0.000 n=10) +IndexAnyUTF8/256:64 27.15µ ± 0% 16.89µ ± 0% -37.80% (p=0.000 n=10) +LastIndexAnyASCII/1:1 30.78n ± 0% 31.45n ± 0% +2.18% (p=0.000 n=10) +LastIndexAnyASCII/1:2 32.13n ± 0% 32.80n ± 0% +2.07% (p=0.000 n=10) +LastIndexAnyASCII/1:4 34.81n ± 0% 35.48n ± 0% +1.92% (p=0.000 n=10) +LastIndexAnyASCII/1:8 40.14n ± 0% 40.81n ± 0% +1.67% (p=0.000 n=10) +LastIndexAnyASCII/1:16 50.85n ± 0% 51.51n ± 0% +1.30% (p=0.000 n=10) +LastIndexAnyASCII/1:32 84.03n ± 0% 50.85n ± 0% -39.49% (p=0.000 n=10) +LastIndexAnyASCII/1:64 121.50n ± 0% 68.16n ± 0% -43.90% (p=0.000 n=10) +LastIndexAnyASCII/16:1 249.7n ± 0% 249.7n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:2 255.2n ± 0% 255.2n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:4 274.0n ± 0% 274.0n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:8 314.1n ± 0% 314.1n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:16 403.8n ± 0% 403.8n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:32 564.4n ± 0% 564.4n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:64 885.5n ± 0% 885.5n ± 0% ~ (p=0.474 n=10) +LastIndexAnyASCII/256:1 2.819µ ± 0% 2.819µ ± 0% ~ (p=0.211 n=10) +LastIndexAnyASCII/256:2 2.824µ ± 0% 2.824µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:4 2.843µ ± 0% 2.843µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:8 2.883µ ± 0% 2.883µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:16 2.973µ ± 0% 2.973µ ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/256:32 3.133µ ± 0% 3.133µ ± 0% ~ (p=0.628 n=10) +LastIndexAnyASCII/256:64 3.454µ ± 0% 3.454µ ± 0% ~ (p=1.000 n=10) +LastIndexAnyUTF8/1:1 30.78n ± 0% 31.45n ± 0% +2.18% (p=0.000 n=10) +LastIndexAnyUTF8/1:2 32.13n ± 0% 32.80n ± 0% +2.07% (p=0.000 n=10) +LastIndexAnyUTF8/1:4 34.81n ± 0% 35.48n ± 0% +1.92% (p=0.000 n=10) +LastIndexAnyUTF8/1:8 40.14n ± 0% 40.81n ± 0% +1.67% (p=0.000 n=10) +LastIndexAnyUTF8/1:16 50.84n ± 0% 51.52n ± 0% +1.33% (p=0.000 n=10) +LastIndexAnyUTF8/1:32 83.87n ± 0% 62.90n ± 0% -25.00% (p=0.000 n=10) +LastIndexAnyUTF8/1:64 121.50n ± 0% 81.67n ± 0% -32.78% (p=0.000 n=10) +LastIndexAnyUTF8/16:1 330.0n ± 0% 330.0n ± 0% ~ (p=1.000 n=10) +LastIndexAnyUTF8/16:2 365.4n ± 1% 376.1n ± 0% +2.93% (p=0.000 n=10) +LastIndexAnyUTF8/16:4 399.9n ± 0% 410.6n ± 0% +2.68% (p=0.000 n=10) +LastIndexAnyUTF8/16:8 485.5n ± 0% 496.2n ± 0% +2.20% (p=0.000 n=10) +LastIndexAnyUTF8/16:16 656.8n ± 0% 667.5n ± 0% +1.63% (p=0.000 n=10) +LastIndexAnyUTF8/16:32 999.3n ± 0% 882.6n ± 0% -11.68% (p=0.000 n=10) +LastIndexAnyUTF8/16:64 1.744µ ± 0% 1.129µ ± 0% -35.26% (p=0.000 n=10) +LastIndexAnyUTF8/256:1 4.023µ ± 0% 4.023µ ± 0% 0.00% (p=0.033 n=10) +LastIndexAnyUTF8/256:2 4.645µ ± 0% 4.816µ ± 0% +3.68% (p=0.000 n=10) +LastIndexAnyUTF8/256:4 5.217µ ± 0% 5.388µ ± 0% +3.28% (p=0.000 n=10) +LastIndexAnyUTF8/256:8 6.587µ ± 0% 6.758µ ± 0% +2.60% (p=0.000 n=10) +LastIndexAnyUTF8/256:16 9.327µ ± 0% 9.498µ ± 0% +1.83% (p=0.000 n=10) +LastIndexAnyUTF8/256:32 14.81µ ± 0% 12.92µ ± 0% -12.73% 
(p=0.000 n=10) +LastIndexAnyUTF8/256:64 26.69µ ± 0% 16.84µ ± 0% -36.92% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic2 625.6µ ± 0% 625.6µ ± 0% ~ (p=0.529 n=10) +IndexPeriodic/IndexPeriodic4 625.5µ ± 0% 625.6µ ± 0% +0.01% (p=0.002 n=10) +IndexPeriodic/IndexPeriodic8 625.4µ ± 0% 625.4µ ± 0% +0.01% (p=0.001 n=10) +IndexPeriodic/IndexPeriodic16 236.5µ ± 0% 225.4µ ± 0% -4.69% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic32 171.1µ ± 3% 133.4µ ± 0% -22.05% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic64 139.10µ ± 3% 89.28µ ± 0% -35.82% (p=0.000 n=10) +geomean 4.222µ 3.628µ -14.0 + +Subset of strings Index benchmarks + +IndexRune 110.7n ± 0% 117.7n ± 0% +6.32% (p=0.000 n=10) +IndexRuneLongString 246.6n ± 0% 187.4n ± 3% -24.01% (p=0.000 n=10) +IndexRuneFastPath 46.82n ± 0% 46.06n ± 0% -1.62% (p=0.000 n=10) +Index 48.28n ± 0% 47.61n ± 0% -1.39% (p=0.000 n=10) +LastIndex 34.50n ± 0% 34.50n ± 0% ~ (p=1.000 n=10) ¹ +IndexByte 41.72n ± 0% 40.83n ± 0% -2.13% (p=0.000 n=10) +IndexHard1 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10) +IndexHard2 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10) +IndexHard3 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10) +IndexHard4 10.01m ± 0% 10.01m ± 0% +0.02% (p=0.000 n=10) +LastIndexHard1 10.71m ± 0% 10.71m ± 0% +0.03% (p=0.000 n=10) +LastIndexHard2 10.71m ± 0% 10.71m ± 0% +0.03% (p=0.000 n=10) +LastIndexHard3 10.71m ± 0% 10.71m ± 0% +0.03% (p=0.000 n=10) +IndexTorture 71.33µ ± 0% 71.37µ ± 0% +0.05% (p=0.000 n=10) +IndexAnyASCII/1:1 34.40n ± 0% 35.07n ± 0% +1.95% (p=0.000 n=10) +IndexAnyASCII/1:2 46.87n ± 0% 47.54n ± 0% +1.43% (p=0.000 n=10) +IndexAnyASCII/1:4 49.53n ± 0% 50.20n ± 0% +1.35% (p=0.000 n=10) +IndexAnyASCII/1:8 54.86n ± 0% 55.53n ± 0% +1.22% (p=0.000 n=10) +IndexAnyASCII/1:16 65.56n ± 0% 66.24n ± 0% +1.04% (p=0.000 n=10) +IndexAnyASCII/1:32 86.97n ± 0% 77.82n ± 0% -10.52% (p=0.000 n=10) +IndexAnyASCII/1:64 134.50n ± 0% 98.57n ± 0% -26.71% (p=0.000 n=10) +IndexAnyASCII/16:1 54.19n ± 0% 54.86n ± 0% +1.24% (p=0.000 n=10) +IndexAnyASCII/16:2 257.4n ± 0% 256.7n ± 0% -0.27% (p=0.000 n=10) +IndexAnyASCII/16:4 275.3n ± 0% 275.3n ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/16:8 315.4n ± 0% 315.5n ± 0% +0.03% (p=0.001 n=10) +IndexAnyASCII/16:16 405.4n ± 0% 405.4n ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/16:32 566.0n ± 0% 566.0n ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/16:64 887.0n ± 0% 887.1n ± 0% ~ (p=0.181 n=10) +IndexAnyASCII/256:1 380.0n ± 0% 174.7n ± 0% -54.03% (p=0.000 n=10) +IndexAnyASCII/256:2 2.826µ ± 0% 2.826µ ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyASCII/256:4 2.844µ ± 0% 2.844µ ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyASCII/256:8 2.884µ ± 0% 2.884µ ± 0% ~ (p=0.087 n=10) +IndexAnyASCII/256:16 2.974µ ± 0% 2.974µ ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/256:32 3.135µ ± 0% 3.135µ ± 0% ~ (p=1.000 n=10) +IndexAnyASCII/256:64 3.456µ ± 0% 3.456µ ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyUTF8/1:1 38.13n ± 0% 38.13n ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyUTF8/1:2 46.87n ± 0% 47.54n ± 0% +1.43% (p=0.000 n=10) +IndexAnyUTF8/1:4 49.53n ± 0% 50.19n ± 0% +1.33% (p=0.000 n=10) +IndexAnyUTF8/1:8 54.86n ± 0% 55.52n ± 0% +1.20% (p=0.000 n=10) +IndexAnyUTF8/1:16 65.56n ± 0% 66.23n ± 0% +1.02% (p=0.000 n=10) +IndexAnyUTF8/1:32 86.97n ± 0% 82.25n ± 0% -5.42% (p=0.000 n=10) +IndexAnyUTF8/1:64 134.50n ± 0% 99.96n ± 0% -25.68% (p=0.000 n=10) +IndexAnyUTF8/16:1 98.34n ± 0% 98.34n ± 0% ~ (p=1.000 n=10) +IndexAnyUTF8/16:2 462.7n ± 0% 473.7n ± 0% +2.38% (p=0.000 n=10) +IndexAnyUTF8/16:4 504.6n ± 0% 515.3n ± 0% +2.11% (p=0.000 n=10) +IndexAnyUTF8/16:8 589.1n ± 0% 599.7n ± 0% +1.80% (p=0.000 n=10) +IndexAnyUTF8/16:16 760.4n ± 0% 770.9n ± 0% 
+1.38% (p=0.000 n=10) +IndexAnyUTF8/16:32 1.103µ ± 0% 1.023µ ± 0% -7.25% (p=0.000 n=10) +IndexAnyUTF8/16:64 1.857µ ± 0% 1.294µ ± 0% -30.32% (p=0.000 n=10) +IndexAnyUTF8/256:1 1.066µ ± 0% 1.066µ ± 0% ~ (p=1.000 n=10) ¹ +IndexAnyUTF8/256:2 6.106µ ± 0% 6.277µ ± 0% +2.81% (p=0.000 n=10) +IndexAnyUTF8/256:4 6.787µ ± 0% 6.958µ ± 0% +2.52% (p=0.000 n=10) +IndexAnyUTF8/256:8 8.136µ ± 0% 8.308µ ± 0% +2.11% (p=0.000 n=10) +IndexAnyUTF8/256:16 10.88µ ± 0% 11.05µ ± 0% +1.57% (p=0.000 n=10) +IndexAnyUTF8/256:32 16.36µ ± 0% 14.90µ ± 0% -8.93% (p=0.000 n=10) +IndexAnyUTF8/256:64 28.51µ ± 0% 19.41µ ± 0% -31.92% (p=0.000 n=10) +LastIndexAnyASCII/1:1 35.79n ± 0% 38.52n ± 0% +7.63% (p=0.000 n=10) +LastIndexAnyASCII/1:2 37.12n ± 0% 39.85n ± 0% +7.35% (p=0.000 n=10) +LastIndexAnyASCII/1:4 39.76n ± 0% 42.08n ± 0% +5.84% (p=0.000 n=10) +LastIndexAnyASCII/1:8 44.82n ± 0% 47.22n ± 0% +5.34% (p=0.000 n=10) +LastIndexAnyASCII/1:16 55.53n ± 0% 57.92n ± 3% +4.30% (p=0.000 n=10) +LastIndexAnyASCII/1:32 76.94n ± 0% 70.16n ± 0% -8.81% (p=0.000 n=10) +LastIndexAnyASCII/1:64 124.40n ± 0% 89.67n ± 0% -27.92% (p=0.000 n=10) +LastIndexAnyASCII/16:1 245.9n ± 0% 245.9n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:2 255.2n ± 0% 255.2n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:4 275.1n ± 0% 275.1n ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/16:8 315.2n ± 0% 315.2n ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/16:16 400.4n ± 0% 400.4n ± 0% ~ (p=0.087 n=10) +LastIndexAnyASCII/16:32 560.9n ± 0% 560.9n ± 0% ~ (p=0.124 n=10) +LastIndexAnyASCII/16:64 882.1n ± 0% 882.0n ± 0% -0.01% (p=0.003 n=10) +LastIndexAnyASCII/256:1 2.815µ ± 0% 2.815µ ± 0% ~ (p=0.211 n=10) +LastIndexAnyASCII/256:2 2.824µ ± 0% 2.824µ ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/256:4 2.844µ ± 0% 2.844µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:8 2.884µ ± 0% 2.884µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:16 2.969µ ± 0% 2.969µ ± 0% ~ (p=1.000 n=10) +LastIndexAnyASCII/256:32 3.130µ ± 0% 3.130µ ± 0% ~ (p=1.000 n=10) ¹ +LastIndexAnyASCII/256:64 3.451µ ± 0% 3.451µ ± 0% ~ (p=0.474 n=10) +LastIndexAnyUTF8/1:1 35.79n ± 0% 36.13n ± 0% +0.95% (p=0.000 n=10) +LastIndexAnyUTF8/1:2 37.11n ± 0% 37.47n ± 0% +0.97% (p=0.000 n=10) +LastIndexAnyUTF8/1:4 39.75n ± 0% 40.14n ± 0% +0.97% (p=0.000 n=10) +LastIndexAnyUTF8/1:8 44.82n ± 0% 45.49n ± 0% +1.49% (p=0.000 n=10) +LastIndexAnyUTF8/1:16 55.52n ± 0% 56.20n ± 0% +1.22% (p=0.000 n=10) +LastIndexAnyUTF8/1:32 76.93n ± 0% 74.25n ± 0% -3.48% (p=0.000 n=10) +LastIndexAnyUTF8/1:64 124.40n ± 0% 91.15n ± 0% -26.73% (p=0.000 n=10) +LastIndexAnyUTF8/16:1 322.5n ± 0% 322.5n ± 0% ~ (p=0.087 n=10) +LastIndexAnyUTF8/16:2 634.2n ± 0% 616.4n ± 0% -2.81% (p=0.000 n=10) +LastIndexAnyUTF8/16:4 674.5n ± 0% 657.9n ± 0% -2.46% (p=0.000 n=10) +LastIndexAnyUTF8/16:8 758.3n ± 0% 741.0n ± 0% -2.28% (p=0.000 n=10) +LastIndexAnyUTF8/16:16 929.6n ± 0% 912.3n ± 0% -1.86% (p=0.000 n=10) +LastIndexAnyUTF8/16:32 1.272µ ± 0% 1.176µ ± 0% -7.55% (p=0.000 n=10) +LastIndexAnyUTF8/16:64 2.018µ ± 0% 1.453µ ± 0% -28.00% (p=0.000 n=10) +LastIndexAnyUTF8/256:1 4.015µ ± 0% 4.016µ ± 0% +0.02% (p=0.000 n=10) +LastIndexAnyUTF8/256:2 8.896µ ± 0% 8.537µ ± 0% -4.04% (p=0.000 n=10) +LastIndexAnyUTF8/256:4 9.553µ ± 0% 9.217µ ± 0% -3.52% (p=0.000 n=10) +LastIndexAnyUTF8/256:8 10.90µ ± 0% 10.54µ ± 0% -3.29% (p=0.000 n=10) +LastIndexAnyUTF8/256:16 13.64µ ± 0% 13.28µ ± 0% -2.63% (p=0.000 n=10) +LastIndexAnyUTF8/256:32 19.12µ ± 0% 17.16µ ± 1% -10.23% (p=0.000 n=10) +LastIndexAnyUTF8/256:64 31.11µ ± 0% 21.98µ ± 0% -29.36% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic2 625.5µ ± 0% 625.5µ ± 
0% ~ (p=0.955 n=10) +IndexPeriodic/IndexPeriodic4 625.4µ ± 0% 625.4µ ± 0% ~ (p=0.838 n=10) +IndexPeriodic/IndexPeriodic8 625.3µ ± 0% 625.3µ ± 0% +0.01% (p=0.009 n=10) +IndexPeriodic/IndexPeriodic16 229.8µ ± 0% 227.0µ ± 0% -1.22% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic32 168.9µ ± 3% 131.8µ ± 0% -22.00% (p=0.000 n=10) +IndexPeriodic/IndexPeriodic64 126.36µ ± 0% 86.66µ ± 0% -31.42% (p=0.000 n=10) +geomean 1.361µ 1.302µ -4.31% + +As these functions are so heavily used this change impacts other +benchmarks. I include the improvements in geomean for the all the +benchmarks in the strings and bytes packages, along with some +selected benchmarks to illustrate the impact of the change. + +geomean for bytes 13.81µ 12.92µ -6.44% +geomean for string 9.385µ 9.224µ -1.72% + +Note that when building for rva22u64 a single Zbb instruction is used +in the main loop. This also helps to improve performance slightly. +The geomean for all the bytes benchmarks when building with +GORISCV64=rva22u64 with and without the patch is shown below. + +geomean for bytes (rva22u64) 13.46µ 12.49µ -7.21% + +Examples of non-Index benchmarks affected by this commit. + +ReadString uses IndexByte to search for a byte stored at the end of +32KB buffer, so we see a speed up. SplitSingleByteSeparator searches +large buffers, but the byte being sought occurs within the first 15 +bytes of the buffer, 76% of the time, hence the slowdown. In +SplitMultiByteSeparator the first byte of the separator only occurs +in the first 15 bytes 33% of the time so we see a speed up. + +ReadString 05.13µ ± 2% 74.67µ ± 0% -28.97% (p=0.000 n=10) +SplitSingleByteSeparator 11.31m ± 2% 12.43m ± 1% +9.83% (p=0.000 n=10) +SplitMultiByteSeparator 8.070m ± 1% 7.707m ± 1% -4.49% (p=0.000 n=10) + +Change-Id: I6210ea2f3decdc6d2e0609df72b1b66e6d6f5395 +Reviewed-on: https://go-review.googlesource.com/c/go/+/561275 +Reviewed-by: Joel Sing +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +--- + src/internal/bytealg/indexbyte_riscv64.s | 100 +++++++++++++++++++++-- + 1 file changed, 94 insertions(+), 6 deletions(-) + +diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s +index de00983c7b..fde00da0ea 100644 +--- a/src/internal/bytealg/indexbyte_riscv64.s ++++ b/src/internal/bytealg/indexbyte_riscv64.s +@@ -10,31 +10,118 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + // X11 = b_len + // X12 = b_cap (unused) + // X13 = byte to find +- AND $0xff, X13 +- MOV X10, X12 // store base for later ++ AND $0xff, X13, X12 // x12 byte to look for ++ MOV X10, X13 // store base for later ++ ++ SLTI $24, X11, X14 + ADD X10, X11 // end +- SUB $1, X10 ++ BEQZ X14, bigBody + ++ SUB $1, X10 + loop: + ADD $1, X10 + BEQ X10, X11, notfound + MOVBU (X10), X14 +- BNE X13, X14, loop ++ BNE X12, X14, loop + +- SUB X12, X10 // remove base ++ SUB X13, X10 // remove base + RET + + notfound: + MOV $-1, X10 + RET + ++bigBody: ++ JMP indexByteBig<>(SB) ++ + TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + // X10 = b_base + // X11 = b_len + // X12 = byte to find +- AND $0xff, X12 ++ ++ AND $0xff, X12 // x12 byte to look for + MOV X10, X13 // store base for later ++ ++ SLTI $24, X11, X14 + ADD X10, X11 // end ++ BEQZ X14, bigBody ++ ++ SUB $1, X10 ++loop: ++ ADD $1, X10 ++ BEQ X10, X11, notfound ++ MOVBU (X10), X14 ++ BNE X12, X14, loop ++ ++ SUB X13, X10 // remove base ++ RET ++ ++notfound: ++ MOV $-1, X10 ++ RET ++ ++bigBody: ++ JMP indexByteBig<>(SB) ++ ++TEXT indexByteBig<>(SB),NOSPLIT|NOFRAME,$0 ++ // On 
entry ++ // X10 = b_base ++ // X11 = end ++ // X12 = byte to find ++ // X13 = b_base ++ // X11 is at least 16 bytes > X10 ++ ++ // On exit ++ // X10 = index of first instance of sought byte, if found, or -1 otherwise ++ ++ // Process the first few bytes until we get to an 8 byte boundary ++ // No need to check for end here as we have at least 16 bytes in ++ // the buffer. ++ ++unalignedloop: ++ AND $7, X10, X14 ++ BEQZ X14, aligned ++ MOVBU (X10), X14 ++ BEQ X12, X14, found ++ ADD $1, X10 ++ JMP unalignedloop ++ ++aligned: ++ AND $~7, X11, X15 // X15 = end of aligned data ++ ++ // We have at least 9 bytes left ++ ++ // Use 'Determine if a word has a byte equal to n' bit hack from ++ // https://graphics.stanford.edu/~seander/bithacks.html to determine ++ // whether the byte is present somewhere in the next 8 bytes of the ++ // array. ++ ++ MOV $0x0101010101010101, X16 ++ SLLI $7, X16, X17 // X17 = 0x8080808080808080 ++ ++ MUL X12, X16, X18 // broadcast X12 to every byte in X18 ++ ++alignedloop: ++ MOV (X10), X14 ++ XOR X14, X18, X19 ++ ++ // If the LSB in X12 is present somewhere in the 8 bytes we've just ++ // loaded into X14 then at least one of the bytes in X19 will be 0 ++ // after the XOR. If any of the bytes in X19 are zero then ++ // ++ // ((X19 - X16) & (~X19) & X17) ++ // ++ // will be non-zero. The expression will evaluate to zero if none of ++ // the bytes in X19 are zero, i.e., X12 is not present in X14. ++ ++ SUB X16, X19, X20 ++ ANDN X19, X17, X21 ++ AND X20, X21 ++ BNEZ X21, tailloop // If X21 != 0 X12 is present in X14 ++ ADD $8, X10 ++ BNE X10, X15, alignedloop ++ ++tailloop: + SUB $1, X10 + + loop: +@@ -43,6 +130,7 @@ loop: + MOVBU (X10), X14 + BNE X12, X14, loop + ++found: + SUB X13, X10 // remove base + RET + +-- +2.39.5 + diff --git a/2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch b/2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch new file mode 100644 index 0000000..f630789 --- /dev/null +++ b/2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch @@ -0,0 +1,624 @@ +From 28c6d3f567efbad8c616f1964fbb4f88848991ba Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 063/119] cmd/internal/obj/riscv: rework instruction encoding + information + +Currently, instruction encoding is a slice of encoding types, which +is indexed by a masked version of the riscv64 opcode. Additional +information about some instructions (for example, if an instruction +has a ternary form and if there is an immediate form for an instruction) +is manually specified in other parts of the assembler code. + +Rework the instruction encoding information so that we use a table +driven form, providing additional data for each instruction where +relevant. This means that we can simplify other parts of the code +by simply looking up the instruction data and reusing minimal logic. 
+ +Change-Id: I7b3b6c61a4868647edf28bd7dbae2150e043ae00 +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Reviewed-on: https://go-review.googlesource.com/c/go/+/622535 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +--- + src/cmd/internal/obj/riscv/obj.go | 491 ++++++++++++++---------------- + 1 file changed, 234 insertions(+), 257 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 6b490a8967..5e7092ab36 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -53,18 +53,14 @@ func jalToSym(ctxt *obj.Link, p *obj.Prog, lr int16) { + // progedit is called individually for each *obj.Prog. It normalizes instruction + // formats and eliminates as many pseudo-instructions as possible. + func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { ++ insData, err := instructionDataForAs(p.As) ++ if err != nil { ++ panic(fmt.Sprintf("failed to lookup instruction data for %v: %v", p.As, err)) ++ } + + // Expand binary instructions to ternary ones. + if p.Reg == obj.REG_NONE { +- switch p.As { +- case AADDI, ASLTI, ASLTIU, AANDI, AORI, AXORI, ASLLI, ASRLI, ASRAI, +- AADDIW, ASLLIW, ASRLIW, ASRAIW, AADDW, ASUBW, ASLLW, ASRLW, ASRAW, +- AADD, AAND, AOR, AXOR, ASLL, ASRL, ASUB, ASRA, +- AMUL, AMULH, AMULHU, AMULHSU, AMULW, ADIV, ADIVU, ADIVW, ADIVUW, +- AREM, AREMU, AREMW, AREMUW, +- AADDUW, ASH1ADD, ASH1ADDUW, ASH2ADD, ASH2ADDUW, ASH3ADD, ASH3ADDUW, ASLLIUW, +- AANDN, AORN, AXNOR, AMAX, AMAXU, AMIN, AMINU, AROL, AROLW, AROR, ARORW, ARORI, ARORIW, +- ABCLR, ABCLRI, ABEXT, ABEXTI, ABINV, ABINVI, ABSET, ABSETI: ++ if insData.ternary { + p.Reg = p.To.Reg + } + } +@@ -73,48 +69,14 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + // form of the instruction. + if p.From.Type == obj.TYPE_CONST { + switch p.As { +- case AADD: +- p.As = AADDI + case ASUB: + p.As, p.From.Offset = AADDI, -p.From.Offset +- case ASLT: +- p.As = ASLTI +- case ASLTU: +- p.As = ASLTIU +- case AAND: +- p.As = AANDI +- case AOR: +- p.As = AORI +- case AXOR: +- p.As = AXORI +- case ASLL: +- p.As = ASLLI +- case ASRL: +- p.As = ASRLI +- case ASRA: +- p.As = ASRAI +- case AADDW: +- p.As = AADDIW + case ASUBW: + p.As, p.From.Offset = AADDIW, -p.From.Offset +- case ASLLW: +- p.As = ASLLIW +- case ASRLW: +- p.As = ASRLIW +- case ASRAW: +- p.As = ASRAIW +- case AROR: +- p.As = ARORI +- case ARORW: +- p.As = ARORIW +- case ABCLR: +- p.As = ABCLRI +- case ABEXT: +- p.As = ABEXTI +- case ABINV: +- p.As = ABINVI +- case ABSET: +- p.As = ABSETI ++ default: ++ if insData.immForm != obj.AXXX { ++ p.As = insData.immForm ++ } + } + } + +@@ -1566,285 +1528,300 @@ var ( + badEncoding = encoding{encode: func(*instruction) uint32 { return 0 }, validate: func(*obj.Link, *instruction) {}, length: 0} + ) + +-// encodings contains the encodings for RISC-V instructions. +-// Instructions are masked with obj.AMask to keep indices small. +-var encodings = [ALAST & obj.AMask]encoding{ ++// instructionData specifies details relating to a RISC-V instruction. ++type instructionData struct { ++ enc encoding ++ immForm obj.As // immediate form of this instruction ++ ternary bool ++} + ++// instructions contains details of RISC-V instructions, including ++// their encoding type. Entries are masked with obj.AMask to keep ++// indices small. 
++var instructions = [ALAST & obj.AMask]instructionData{ + // Unprivileged ISA + + // 2.4: Integer Computational Instructions +- AADDI & obj.AMask: iIIEncoding, +- ASLTI & obj.AMask: iIIEncoding, +- ASLTIU & obj.AMask: iIIEncoding, +- AANDI & obj.AMask: iIIEncoding, +- AORI & obj.AMask: iIIEncoding, +- AXORI & obj.AMask: iIIEncoding, +- ASLLI & obj.AMask: iIIEncoding, +- ASRLI & obj.AMask: iIIEncoding, +- ASRAI & obj.AMask: iIIEncoding, +- ALUI & obj.AMask: uEncoding, +- AAUIPC & obj.AMask: uEncoding, +- AADD & obj.AMask: rIIIEncoding, +- ASLT & obj.AMask: rIIIEncoding, +- ASLTU & obj.AMask: rIIIEncoding, +- AAND & obj.AMask: rIIIEncoding, +- AOR & obj.AMask: rIIIEncoding, +- AXOR & obj.AMask: rIIIEncoding, +- ASLL & obj.AMask: rIIIEncoding, +- ASRL & obj.AMask: rIIIEncoding, +- ASUB & obj.AMask: rIIIEncoding, +- ASRA & obj.AMask: rIIIEncoding, ++ AADDI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASLTI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASLTIU & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ AANDI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ AORI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ AXORI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASLLI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASRLI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASRAI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ALUI & obj.AMask: {enc: uEncoding}, ++ AAUIPC & obj.AMask: {enc: uEncoding}, ++ AADD & obj.AMask: {enc: rIIIEncoding, immForm: AADDI, ternary: true}, ++ ASLT & obj.AMask: {enc: rIIIEncoding, immForm: ASLTI, ternary: true}, ++ ASLTU & obj.AMask: {enc: rIIIEncoding, immForm: ASLTIU, ternary: true}, ++ AAND & obj.AMask: {enc: rIIIEncoding, immForm: AANDI, ternary: true}, ++ AOR & obj.AMask: {enc: rIIIEncoding, immForm: AORI, ternary: true}, ++ AXOR & obj.AMask: {enc: rIIIEncoding, immForm: AXORI, ternary: true}, ++ ASLL & obj.AMask: {enc: rIIIEncoding, immForm: ASLLI, ternary: true}, ++ ASRL & obj.AMask: {enc: rIIIEncoding, immForm: ASRLI, ternary: true}, ++ ASUB & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASRA & obj.AMask: {enc: rIIIEncoding, immForm: ASRAI, ternary: true}, + + // 2.5: Control Transfer Instructions +- AJAL & obj.AMask: jEncoding, +- AJALR & obj.AMask: iIIEncoding, +- ABEQ & obj.AMask: bEncoding, +- ABNE & obj.AMask: bEncoding, +- ABLT & obj.AMask: bEncoding, +- ABLTU & obj.AMask: bEncoding, +- ABGE & obj.AMask: bEncoding, +- ABGEU & obj.AMask: bEncoding, ++ AJAL & obj.AMask: {enc: jEncoding}, ++ AJALR & obj.AMask: {enc: iIIEncoding}, ++ ABEQ & obj.AMask: {enc: bEncoding}, ++ ABNE & obj.AMask: {enc: bEncoding}, ++ ABLT & obj.AMask: {enc: bEncoding}, ++ ABLTU & obj.AMask: {enc: bEncoding}, ++ ABGE & obj.AMask: {enc: bEncoding}, ++ ABGEU & obj.AMask: {enc: bEncoding}, + + // 2.6: Load and Store Instructions +- ALW & obj.AMask: iIIEncoding, +- ALWU & obj.AMask: iIIEncoding, +- ALH & obj.AMask: iIIEncoding, +- ALHU & obj.AMask: iIIEncoding, +- ALB & obj.AMask: iIIEncoding, +- ALBU & obj.AMask: iIIEncoding, +- ASW & obj.AMask: sIEncoding, +- ASH & obj.AMask: sIEncoding, +- ASB & obj.AMask: sIEncoding, ++ ALW & obj.AMask: {enc: iIIEncoding}, ++ ALWU & obj.AMask: {enc: iIIEncoding}, ++ ALH & obj.AMask: {enc: iIIEncoding}, ++ ALHU & obj.AMask: {enc: iIIEncoding}, ++ ALB & obj.AMask: {enc: iIIEncoding}, ++ ALBU & obj.AMask: {enc: iIIEncoding}, ++ ASW & obj.AMask: {enc: sIEncoding}, ++ ASH & obj.AMask: {enc: sIEncoding}, ++ ASB & obj.AMask: {enc: sIEncoding}, + + // 2.7: Memory Ordering +- AFENCE & obj.AMask: iIIEncoding, ++ AFENCE 
& obj.AMask: {enc: iIIEncoding}, + + // 5.2: Integer Computational Instructions (RV64I) +- AADDIW & obj.AMask: iIIEncoding, +- ASLLIW & obj.AMask: iIIEncoding, +- ASRLIW & obj.AMask: iIIEncoding, +- ASRAIW & obj.AMask: iIIEncoding, +- AADDW & obj.AMask: rIIIEncoding, +- ASLLW & obj.AMask: rIIIEncoding, +- ASRLW & obj.AMask: rIIIEncoding, +- ASUBW & obj.AMask: rIIIEncoding, +- ASRAW & obj.AMask: rIIIEncoding, ++ AADDIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASLLIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASRLIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ASRAIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ AADDW & obj.AMask: {enc: rIIIEncoding, immForm: AADDIW, ternary: true}, ++ ASLLW & obj.AMask: {enc: rIIIEncoding, immForm: ASLLIW, ternary: true}, ++ ASRLW & obj.AMask: {enc: rIIIEncoding, immForm: ASRLIW, ternary: true}, ++ ASUBW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASRAW & obj.AMask: {enc: rIIIEncoding, immForm: ASRAIW, ternary: true}, + + // 5.3: Load and Store Instructions (RV64I) +- ALD & obj.AMask: iIIEncoding, +- ASD & obj.AMask: sIEncoding, ++ ALD & obj.AMask: {enc: iIIEncoding}, ++ ASD & obj.AMask: {enc: sIEncoding}, + + // 7.1: CSR Instructions +- ACSRRS & obj.AMask: iIIEncoding, ++ ACSRRS & obj.AMask: {enc: iIIEncoding}, + + // 7.1: Multiplication Operations +- AMUL & obj.AMask: rIIIEncoding, +- AMULH & obj.AMask: rIIIEncoding, +- AMULHU & obj.AMask: rIIIEncoding, +- AMULHSU & obj.AMask: rIIIEncoding, +- AMULW & obj.AMask: rIIIEncoding, +- ADIV & obj.AMask: rIIIEncoding, +- ADIVU & obj.AMask: rIIIEncoding, +- AREM & obj.AMask: rIIIEncoding, +- AREMU & obj.AMask: rIIIEncoding, +- ADIVW & obj.AMask: rIIIEncoding, +- ADIVUW & obj.AMask: rIIIEncoding, +- AREMW & obj.AMask: rIIIEncoding, +- AREMUW & obj.AMask: rIIIEncoding, ++ AMUL & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMULH & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMULHU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMULHSU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMULW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ADIV & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ADIVU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AREM & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AREMU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ADIVW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ADIVUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AREMW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AREMUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + + // 8.2: Load-Reserved/Store-Conditional +- ALRW & obj.AMask: rIIIEncoding, +- ALRD & obj.AMask: rIIIEncoding, +- ASCW & obj.AMask: rIIIEncoding, +- ASCD & obj.AMask: rIIIEncoding, ++ ALRW & obj.AMask: {enc: rIIIEncoding}, ++ ALRD & obj.AMask: {enc: rIIIEncoding}, ++ ASCW & obj.AMask: {enc: rIIIEncoding}, ++ ASCD & obj.AMask: {enc: rIIIEncoding}, + + // 8.3: Atomic Memory Operations +- AAMOSWAPW & obj.AMask: rIIIEncoding, +- AAMOSWAPD & obj.AMask: rIIIEncoding, +- AAMOADDW & obj.AMask: rIIIEncoding, +- AAMOADDD & obj.AMask: rIIIEncoding, +- AAMOANDW & obj.AMask: rIIIEncoding, +- AAMOANDD & obj.AMask: rIIIEncoding, +- AAMOORW & obj.AMask: rIIIEncoding, +- AAMOORD & obj.AMask: rIIIEncoding, +- AAMOXORW & obj.AMask: rIIIEncoding, +- AAMOXORD & obj.AMask: rIIIEncoding, +- AAMOMAXW & obj.AMask: rIIIEncoding, +- AAMOMAXD & obj.AMask: rIIIEncoding, +- AAMOMAXUW & obj.AMask: rIIIEncoding, +- AAMOMAXUD & obj.AMask: rIIIEncoding, +- AAMOMINW & obj.AMask: rIIIEncoding, +- AAMOMIND 
& obj.AMask: rIIIEncoding, +- AAMOMINUW & obj.AMask: rIIIEncoding, +- AAMOMINUD & obj.AMask: rIIIEncoding, ++ AAMOSWAPW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOSWAPD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOADDW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOADDD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOANDW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOANDD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOORW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOORD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOXORW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOXORD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMAXW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMAXD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMAXUW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMAXUD & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMINW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMIND & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMINUW & obj.AMask: {enc: rIIIEncoding}, ++ AAMOMINUD & obj.AMask: {enc: rIIIEncoding}, + + // 11.5: Single-Precision Load and Store Instructions +- AFLW & obj.AMask: iFEncoding, +- AFSW & obj.AMask: sFEncoding, ++ AFLW & obj.AMask: {enc: iFEncoding}, ++ AFSW & obj.AMask: {enc: sFEncoding}, + + // 11.6: Single-Precision Floating-Point Computational Instructions +- AFADDS & obj.AMask: rFFFEncoding, +- AFSUBS & obj.AMask: rFFFEncoding, +- AFMULS & obj.AMask: rFFFEncoding, +- AFDIVS & obj.AMask: rFFFEncoding, +- AFMINS & obj.AMask: rFFFEncoding, +- AFMAXS & obj.AMask: rFFFEncoding, +- AFSQRTS & obj.AMask: rFFFEncoding, +- AFMADDS & obj.AMask: rFFFFEncoding, +- AFMSUBS & obj.AMask: rFFFFEncoding, +- AFNMSUBS & obj.AMask: rFFFFEncoding, +- AFNMADDS & obj.AMask: rFFFFEncoding, ++ AFADDS & obj.AMask: {enc: rFFFEncoding}, ++ AFSUBS & obj.AMask: {enc: rFFFEncoding}, ++ AFMULS & obj.AMask: {enc: rFFFEncoding}, ++ AFDIVS & obj.AMask: {enc: rFFFEncoding}, ++ AFMINS & obj.AMask: {enc: rFFFEncoding}, ++ AFMAXS & obj.AMask: {enc: rFFFEncoding}, ++ AFSQRTS & obj.AMask: {enc: rFFFEncoding}, ++ AFMADDS & obj.AMask: {enc: rFFFFEncoding}, ++ AFMSUBS & obj.AMask: {enc: rFFFFEncoding}, ++ AFNMSUBS & obj.AMask: {enc: rFFFFEncoding}, ++ AFNMADDS & obj.AMask: {enc: rFFFFEncoding}, + + // 11.7: Single-Precision Floating-Point Conversion and Move Instructions +- AFCVTWS & obj.AMask: rFIEncoding, +- AFCVTLS & obj.AMask: rFIEncoding, +- AFCVTSW & obj.AMask: rIFEncoding, +- AFCVTSL & obj.AMask: rIFEncoding, +- AFCVTWUS & obj.AMask: rFIEncoding, +- AFCVTLUS & obj.AMask: rFIEncoding, +- AFCVTSWU & obj.AMask: rIFEncoding, +- AFCVTSLU & obj.AMask: rIFEncoding, +- AFSGNJS & obj.AMask: rFFFEncoding, +- AFSGNJNS & obj.AMask: rFFFEncoding, +- AFSGNJXS & obj.AMask: rFFFEncoding, +- AFMVXW & obj.AMask: rFIEncoding, +- AFMVWX & obj.AMask: rIFEncoding, ++ AFCVTWS & obj.AMask: {enc: rFIEncoding}, ++ AFCVTLS & obj.AMask: {enc: rFIEncoding}, ++ AFCVTSW & obj.AMask: {enc: rIFEncoding}, ++ AFCVTSL & obj.AMask: {enc: rIFEncoding}, ++ AFCVTWUS & obj.AMask: {enc: rFIEncoding}, ++ AFCVTLUS & obj.AMask: {enc: rFIEncoding}, ++ AFCVTSWU & obj.AMask: {enc: rIFEncoding}, ++ AFCVTSLU & obj.AMask: {enc: rIFEncoding}, ++ AFSGNJS & obj.AMask: {enc: rFFFEncoding}, ++ AFSGNJNS & obj.AMask: {enc: rFFFEncoding}, ++ AFSGNJXS & obj.AMask: {enc: rFFFEncoding}, ++ AFMVXW & obj.AMask: {enc: rFIEncoding}, ++ AFMVWX & obj.AMask: {enc: rIFEncoding}, + + // 11.8: Single-Precision Floating-Point Compare Instructions +- AFEQS & obj.AMask: rFFIEncoding, +- AFLTS & obj.AMask: rFFIEncoding, +- AFLES & obj.AMask: rFFIEncoding, ++ AFEQS & obj.AMask: {enc: rFFIEncoding}, ++ AFLTS & obj.AMask: {enc: rFFIEncoding}, ++ AFLES & obj.AMask: 
{enc: rFFIEncoding}, + + // 11.9: Single-Precision Floating-Point Classify Instruction +- AFCLASSS & obj.AMask: rFIEncoding, ++ AFCLASSS & obj.AMask: {enc: rFIEncoding}, + + // 12.3: Double-Precision Load and Store Instructions +- AFLD & obj.AMask: iFEncoding, +- AFSD & obj.AMask: sFEncoding, ++ AFLD & obj.AMask: {enc: iFEncoding}, ++ AFSD & obj.AMask: {enc: sFEncoding}, + + // 12.4: Double-Precision Floating-Point Computational Instructions +- AFADDD & obj.AMask: rFFFEncoding, +- AFSUBD & obj.AMask: rFFFEncoding, +- AFMULD & obj.AMask: rFFFEncoding, +- AFDIVD & obj.AMask: rFFFEncoding, +- AFMIND & obj.AMask: rFFFEncoding, +- AFMAXD & obj.AMask: rFFFEncoding, +- AFSQRTD & obj.AMask: rFFFEncoding, +- AFMADDD & obj.AMask: rFFFFEncoding, +- AFMSUBD & obj.AMask: rFFFFEncoding, +- AFNMSUBD & obj.AMask: rFFFFEncoding, +- AFNMADDD & obj.AMask: rFFFFEncoding, ++ AFADDD & obj.AMask: {enc: rFFFEncoding}, ++ AFSUBD & obj.AMask: {enc: rFFFEncoding}, ++ AFMULD & obj.AMask: {enc: rFFFEncoding}, ++ AFDIVD & obj.AMask: {enc: rFFFEncoding}, ++ AFMIND & obj.AMask: {enc: rFFFEncoding}, ++ AFMAXD & obj.AMask: {enc: rFFFEncoding}, ++ AFSQRTD & obj.AMask: {enc: rFFFEncoding}, ++ AFMADDD & obj.AMask: {enc: rFFFFEncoding}, ++ AFMSUBD & obj.AMask: {enc: rFFFFEncoding}, ++ AFNMSUBD & obj.AMask: {enc: rFFFFEncoding}, ++ AFNMADDD & obj.AMask: {enc: rFFFFEncoding}, + + // 12.5: Double-Precision Floating-Point Conversion and Move Instructions +- AFCVTWD & obj.AMask: rFIEncoding, +- AFCVTLD & obj.AMask: rFIEncoding, +- AFCVTDW & obj.AMask: rIFEncoding, +- AFCVTDL & obj.AMask: rIFEncoding, +- AFCVTWUD & obj.AMask: rFIEncoding, +- AFCVTLUD & obj.AMask: rFIEncoding, +- AFCVTDWU & obj.AMask: rIFEncoding, +- AFCVTDLU & obj.AMask: rIFEncoding, +- AFCVTSD & obj.AMask: rFFEncoding, +- AFCVTDS & obj.AMask: rFFEncoding, +- AFSGNJD & obj.AMask: rFFFEncoding, +- AFSGNJND & obj.AMask: rFFFEncoding, +- AFSGNJXD & obj.AMask: rFFFEncoding, +- AFMVXD & obj.AMask: rFIEncoding, +- AFMVDX & obj.AMask: rIFEncoding, ++ AFCVTWD & obj.AMask: {enc: rFIEncoding}, ++ AFCVTLD & obj.AMask: {enc: rFIEncoding}, ++ AFCVTDW & obj.AMask: {enc: rIFEncoding}, ++ AFCVTDL & obj.AMask: {enc: rIFEncoding}, ++ AFCVTWUD & obj.AMask: {enc: rFIEncoding}, ++ AFCVTLUD & obj.AMask: {enc: rFIEncoding}, ++ AFCVTDWU & obj.AMask: {enc: rIFEncoding}, ++ AFCVTDLU & obj.AMask: {enc: rIFEncoding}, ++ AFCVTSD & obj.AMask: {enc: rFFEncoding}, ++ AFCVTDS & obj.AMask: {enc: rFFEncoding}, ++ AFSGNJD & obj.AMask: {enc: rFFFEncoding}, ++ AFSGNJND & obj.AMask: {enc: rFFFEncoding}, ++ AFSGNJXD & obj.AMask: {enc: rFFFEncoding}, ++ AFMVXD & obj.AMask: {enc: rFIEncoding}, ++ AFMVDX & obj.AMask: {enc: rIFEncoding}, + + // 12.6: Double-Precision Floating-Point Compare Instructions +- AFEQD & obj.AMask: rFFIEncoding, +- AFLTD & obj.AMask: rFFIEncoding, +- AFLED & obj.AMask: rFFIEncoding, ++ AFEQD & obj.AMask: {enc: rFFIEncoding}, ++ AFLTD & obj.AMask: {enc: rFFIEncoding}, ++ AFLED & obj.AMask: {enc: rFFIEncoding}, + + // 12.7: Double-Precision Floating-Point Classify Instruction +- AFCLASSD & obj.AMask: rFIEncoding, ++ AFCLASSD & obj.AMask: {enc: rFIEncoding}, + + // Privileged ISA + + // 3.2.1: Environment Call and Breakpoint +- AECALL & obj.AMask: iIIEncoding, +- AEBREAK & obj.AMask: iIIEncoding, ++ AECALL & obj.AMask: {enc: iIIEncoding}, ++ AEBREAK & obj.AMask: {enc: iIIEncoding}, + + // + // RISC-V Bit-Manipulation ISA-extensions (1.0) + // + + // 1.1: Address Generation Instructions (Zba) +- AADDUW & obj.AMask: rIIIEncoding, +- ASH1ADD & obj.AMask: rIIIEncoding, +- ASH1ADDUW & 
obj.AMask: rIIIEncoding, +- ASH2ADD & obj.AMask: rIIIEncoding, +- ASH2ADDUW & obj.AMask: rIIIEncoding, +- ASH3ADD & obj.AMask: rIIIEncoding, +- ASH3ADDUW & obj.AMask: rIIIEncoding, +- ASLLIUW & obj.AMask: iIIEncoding, ++ AADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH1ADD & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH1ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH2ADD & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH2ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH3ADD & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASH3ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASLLIUW & obj.AMask: {enc: iIIEncoding, ternary: true}, + + // 1.2: Basic Bit Manipulation (Zbb) +- AANDN & obj.AMask: rIIIEncoding, +- ACLZ & obj.AMask: rIIEncoding, +- ACLZW & obj.AMask: rIIEncoding, +- ACPOP & obj.AMask: rIIEncoding, +- ACPOPW & obj.AMask: rIIEncoding, +- ACTZ & obj.AMask: rIIEncoding, +- ACTZW & obj.AMask: rIIEncoding, +- AMAX & obj.AMask: rIIIEncoding, +- AMAXU & obj.AMask: rIIIEncoding, +- AMIN & obj.AMask: rIIIEncoding, +- AMINU & obj.AMask: rIIIEncoding, +- AORN & obj.AMask: rIIIEncoding, +- ASEXTB & obj.AMask: rIIEncoding, +- ASEXTH & obj.AMask: rIIEncoding, +- AXNOR & obj.AMask: rIIIEncoding, +- AZEXTH & obj.AMask: rIIEncoding, ++ AANDN & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ACLZ & obj.AMask: {enc: rIIEncoding}, ++ ACLZW & obj.AMask: {enc: rIIEncoding}, ++ ACPOP & obj.AMask: {enc: rIIEncoding}, ++ ACPOPW & obj.AMask: {enc: rIIEncoding}, ++ ACTZ & obj.AMask: {enc: rIIEncoding}, ++ ACTZW & obj.AMask: {enc: rIIEncoding}, ++ AMAX & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMAXU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMIN & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AMINU & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AORN & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ ASEXTB & obj.AMask: {enc: rIIEncoding}, ++ ASEXTH & obj.AMask: {enc: rIIEncoding}, ++ AXNOR & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AZEXTH & obj.AMask: {enc: rIIEncoding}, + + // 1.3: Bitwise Rotation (Zbb) +- AROL & obj.AMask: rIIIEncoding, +- AROLW & obj.AMask: rIIIEncoding, +- AROR & obj.AMask: rIIIEncoding, +- ARORI & obj.AMask: iIIEncoding, +- ARORIW & obj.AMask: iIIEncoding, +- ARORW & obj.AMask: rIIIEncoding, +- AORCB & obj.AMask: iIIEncoding, +- AREV8 & obj.AMask: iIIEncoding, ++ AROL & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AROLW & obj.AMask: {enc: rIIIEncoding, ternary: true}, ++ AROR & obj.AMask: {enc: rIIIEncoding, immForm: ARORI, ternary: true}, ++ ARORI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ARORIW & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ARORW & obj.AMask: {enc: rIIIEncoding, immForm: ARORIW, ternary: true}, ++ AORCB & obj.AMask: {enc: iIIEncoding}, ++ AREV8 & obj.AMask: {enc: iIIEncoding}, + + // 1.5: Single-bit Instructions (Zbs) +- ABCLR & obj.AMask: rIIIEncoding, +- ABCLRI & obj.AMask: iIIEncoding, +- ABEXT & obj.AMask: rIIIEncoding, +- ABEXTI & obj.AMask: iIIEncoding, +- ABINV & obj.AMask: rIIIEncoding, +- ABINVI & obj.AMask: iIIEncoding, +- ABSET & obj.AMask: rIIIEncoding, +- ABSETI & obj.AMask: iIIEncoding, ++ ABCLR & obj.AMask: {enc: rIIIEncoding, immForm: ABCLRI, ternary: true}, ++ ABCLRI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ABEXT & obj.AMask: {enc: rIIIEncoding, immForm: ABEXTI, ternary: true}, ++ ABEXTI & obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ABINV & obj.AMask: {enc: rIIIEncoding, immForm: ABINVI, ternary: true}, ++ ABINVI & 
obj.AMask: {enc: iIIEncoding, ternary: true}, ++ ABSET & obj.AMask: {enc: rIIIEncoding, immForm: ABSETI, ternary: true}, ++ ABSETI & obj.AMask: {enc: iIIEncoding, ternary: true}, + + // Escape hatch +- AWORD & obj.AMask: rawEncoding, ++ AWORD & obj.AMask: {enc: rawEncoding}, + + // Pseudo-operations +- obj.AFUNCDATA: pseudoOpEncoding, +- obj.APCDATA: pseudoOpEncoding, +- obj.ATEXT: pseudoOpEncoding, +- obj.ANOP: pseudoOpEncoding, +- obj.ADUFFZERO: pseudoOpEncoding, +- obj.ADUFFCOPY: pseudoOpEncoding, +- obj.APCALIGN: pseudoOpEncoding, ++ obj.AFUNCDATA: {enc: pseudoOpEncoding}, ++ obj.APCDATA: {enc: pseudoOpEncoding}, ++ obj.ATEXT: {enc: pseudoOpEncoding}, ++ obj.ANOP: {enc: pseudoOpEncoding}, ++ obj.ADUFFZERO: {enc: pseudoOpEncoding}, ++ obj.ADUFFCOPY: {enc: pseudoOpEncoding}, ++ obj.APCALIGN: {enc: pseudoOpEncoding}, + } + +-// encodingForAs returns the encoding for an obj.As. +-func encodingForAs(as obj.As) (encoding, error) { ++// instructionDataForAs returns the instruction data for an obj.As. ++func instructionDataForAs(as obj.As) (*instructionData, error) { + if base := as &^ obj.AMask; base != obj.ABaseRISCV && base != 0 { +- return badEncoding, fmt.Errorf("encodingForAs: not a RISC-V instruction %s", as) ++ return nil, fmt.Errorf("%v is not a RISC-V instruction", as) + } + asi := as & obj.AMask +- if int(asi) >= len(encodings) { +- return badEncoding, fmt.Errorf("encodingForAs: bad RISC-V instruction %s", as) ++ if int(asi) >= len(instructions) { ++ return nil, fmt.Errorf("bad RISC-V instruction %v", as) ++ } ++ return &instructions[asi], nil ++} ++ ++// encodingForAs returns the encoding for an obj.As. ++func encodingForAs(as obj.As) (*encoding, error) { ++ insData, err := instructionDataForAs(as) ++ if err != nil { ++ return &badEncoding, err + } +- enc := encodings[asi] +- if enc.validate == nil { +- return badEncoding, fmt.Errorf("encodingForAs: no encoding for instruction %s", as) ++ if insData.enc.validate == nil { ++ return &badEncoding, fmt.Errorf("no encoding for instruction %s", as) + } +- return enc, nil ++ return &insData.enc, nil + } + + type instruction struct { +-- +2.39.5 + diff --git a/2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch b/2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch new file mode 100644 index 0000000..e144b15 --- /dev/null +++ b/2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch @@ -0,0 +1,255 @@ +From 0f385c824d3218473ca71a98c71050671078f6ed Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 064/119] cpu/internal: provide runtime detection of RISC-V + extensions on Linux + +Add a RISCV64 variable to cpu/internal that indicates both the presence +of RISC-V extensions and performance information about the underlying +RISC-V cores. The variable is only populated with non false values on +Linux. The detection code relies on the riscv_hwprobe syscall +introduced in Linux 6.4. The patch can detect RVV 1.0 and whether +the CPU supports fast misaligned accesses. It can only detect RVV 1.0 +on a 6.5 kernel or later (without backports). 
+ +Updates #61416 + +Change-Id: I2d8289345c885b699afff441d417cae38f6bdc54 +Reviewed-on: https://go-review.googlesource.com/c/go/+/522995 +Reviewed-by: Joel Sing +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Michael Knyszek +Reviewed-by: David Chase +--- + src/go/build/deps_test.go | 4 +- + src/internal/cpu/cpu.go | 11 ++++ + src/internal/cpu/cpu_riscv64.go | 11 ++++ + src/internal/cpu/cpu_riscv64_linux.go | 91 +++++++++++++++++++++++++++ + src/internal/cpu/cpu_riscv64_other.go | 11 ++++ + src/runtime/os_linux_riscv64.go | 30 +++++++++ + 6 files changed, 156 insertions(+), 2 deletions(-) + create mode 100644 src/internal/cpu/cpu_riscv64_linux.go + create mode 100644 src/internal/cpu/cpu_riscv64_other.go + +diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go +index 592f2fd72a..babce57e42 100644 +--- a/src/go/build/deps_test.go ++++ b/src/go/build/deps_test.go +@@ -39,6 +39,7 @@ import ( + var depsRules = ` + # No dependencies allowed for any of these packages. + NONE ++ < unsafe + < cmp, container/list, container/ring, + internal/cfg, internal/coverage, internal/coverage/rtcov, + internal/coverage/uleb128, internal/coverage/calloc, +@@ -46,8 +47,7 @@ var depsRules = ` + internal/goexperiment, internal/goos, + internal/goversion, internal/nettrace, internal/platform, + log/internal, +- unicode/utf8, unicode/utf16, unicode, +- unsafe; ++ unicode/utf8, unicode/utf16, unicode; + + # These packages depend only on internal/goarch and unsafe. + internal/goarch, unsafe +diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go +index 1352810f42..ddad7198a8 100644 +--- a/src/internal/cpu/cpu.go ++++ b/src/internal/cpu/cpu.go +@@ -117,6 +117,17 @@ var S390X struct { + _ CacheLinePad + } + ++// RISCV64 contains the supported CPU features and performance characteristics for riscv64 ++// platforms. The booleans in RISCV64, with the exception of HasFastMisaligned, indicate ++// the presence of RISC-V extensions. ++// The struct is padded to avoid false sharing. ++var RISCV64 struct { ++ _ CacheLinePad ++ HasFastMisaligned bool // Fast misaligned accesses ++ HasV bool // Vector extension compatible with RVV 1.0 ++ _ CacheLinePad ++} ++ + // Initialize examines the processor and sets the relevant variables above. + // This is called by the runtime package early in program initialization, + // before normal init functions are run. env is set by runtime if the OS supports +diff --git a/src/internal/cpu/cpu_riscv64.go b/src/internal/cpu/cpu_riscv64.go +index 2173fe8886..e6e532c7e7 100644 +--- a/src/internal/cpu/cpu_riscv64.go ++++ b/src/internal/cpu/cpu_riscv64.go +@@ -6,5 +6,16 @@ package cpu + + const CacheLinePadSize = 64 + ++// RISC-V doesn't have a 'cpuid' equivalent. On Linux we rely on the riscv_hwprobe syscall. ++ + func doinit() { ++ options = []option{ ++ {Name: "fastmisaligned", Feature: &RISCV64.HasFastMisaligned}, ++ {Name: "v", Feature: &RISCV64.HasV}, ++ } ++ osInit() ++} ++ ++func isSet(hwc uint, value uint) bool { ++ return hwc&value != 0 + } +diff --git a/src/internal/cpu/cpu_riscv64_linux.go b/src/internal/cpu/cpu_riscv64_linux.go +new file mode 100644 +index 0000000000..a076d3e33c +--- /dev/null ++++ b/src/internal/cpu/cpu_riscv64_linux.go +@@ -0,0 +1,91 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build riscv64 && linux ++ ++package cpu ++ ++import _ "unsafe" ++ ++// RISC-V extension discovery code for Linux. 
++// ++// A note on detection of the Vector extension using HWCAP. ++// ++// Support for the Vector extension version 1.0 was added to the Linux kernel in release 6.5. ++// Support for the riscv_hwprobe syscall was added in 6.4. It follows that if the riscv_hwprobe ++// syscall is not available then neither is the Vector extension (which needs kernel support). ++// The riscv_hwprobe syscall should then be all we need to detect the Vector extension. ++// However, some RISC-V board manufacturers ship boards with an older kernel on top of which ++// they have back-ported various versions of the Vector extension patches but not the riscv_hwprobe ++// patches. These kernels advertise support for the Vector extension using HWCAP. Falling ++// back to HWCAP to detect the Vector extension, if riscv_hwprobe is not available, or simply not ++// bothering with riscv_hwprobe at all and just using HWCAP may then seem like an attractive option. ++// ++// Unfortunately, simply checking the 'V' bit in AT_HWCAP will not work as this bit is used by ++// RISC-V board and cloud instance providers to mean different things. The Lichee Pi 4A board ++// and the Scaleway RV1 cloud instances use the 'V' bit to advertise their support for the unratified ++// 0.7.1 version of the Vector Specification. The Banana Pi BPI-F3 and the CanMV-K230 board use ++// it to advertise support for 1.0 of the Vector extension. Versions 0.7.1 and 1.0 of the Vector ++// extension are binary incompatible. HWCAP can then not be used in isolation to populate the ++// HasV field as this field indicates that the underlying CPU is compatible with RVV 1.0. ++// Go will only support the ratified versions >= 1.0 and so any vector code it might generate ++// would crash on a Scaleway RV1 instance or a Lichee Pi 4a, if allowed to run. ++// ++// There is a way at runtime to distinguish between versions 0.7.1 and 1.0 of the Vector ++// specification by issuing a RVV 1.0 vsetvli instruction and checking the vill bit of the vtype ++// register. This check would allow us to safely detect version 1.0 of the Vector extension ++// with HWCAP, if riscv_hwprobe were not available. However, the check cannot ++// be added until the assembler supports the Vector instructions. ++// ++// Note the riscv_hwprobe syscall does not suffer from these ambiguities by design as all of the ++// extensions it advertises support for are explicitly versioned. It's also worth noting that ++// the riscv_hwprobe syscall is the only way to detect multi-letter RISC-V extensions, e.g., Zvbb. ++// These cannot be detected using HWCAP and so riscv_hwprobe must be used to detect the majority ++// of RISC-V extensions. ++// ++// Please see https://docs.kernel.org/arch/riscv/hwprobe.html for more information. ++ ++const ( ++ // Copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go. ++ riscv_HWPROBE_KEY_IMA_EXT_0 = 0x4 ++ riscv_HWPROBE_IMA_V = 0x4 ++ riscv_HWPROBE_KEY_CPUPERF_0 = 0x5 ++ riscv_HWPROBE_MISALIGNED_FAST = 0x3 ++ riscv_HWPROBE_MISALIGNED_MASK = 0x7 ++) ++ ++// riscvHWProbePairs is copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go. ++type riscvHWProbePairs struct { ++ key int64 ++ value uint64 ++} ++ ++//go:linkname riscvHWProbe ++func riscvHWProbe(pairs []riscvHWProbePairs, flags uint) bool ++ ++func osInit() { ++ // A slice of key/value pair structures is passed to the RISCVHWProbe syscall. The key ++ // field should be initialised with one of the key constants defined above, e.g., ++ // RISCV_HWPROBE_KEY_IMA_EXT_0. 
The syscall will set the value field to the appropriate value. ++ // If the kernel does not recognise a key it will set the key field to -1 and the value field to 0. ++ ++ pairs := []riscvHWProbePairs{ ++ {riscv_HWPROBE_KEY_IMA_EXT_0, 0}, ++ {riscv_HWPROBE_KEY_CPUPERF_0, 0}, ++ } ++ ++ // This call only indicates that extensions are supported if they are implemented on all cores. ++ if !riscvHWProbe(pairs, 0) { ++ return ++ } ++ ++ if pairs[0].key != -1 { ++ v := uint(pairs[0].value) ++ RISCV64.HasV = isSet(v, riscv_HWPROBE_IMA_V) ++ } ++ if pairs[1].key != -1 { ++ v := pairs[1].value & riscv_HWPROBE_MISALIGNED_MASK ++ RISCV64.HasFastMisaligned = v == riscv_HWPROBE_MISALIGNED_FAST ++ } ++} +diff --git a/src/internal/cpu/cpu_riscv64_other.go b/src/internal/cpu/cpu_riscv64_other.go +new file mode 100644 +index 0000000000..1307d822b3 +--- /dev/null ++++ b/src/internal/cpu/cpu_riscv64_other.go +@@ -0,0 +1,11 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build riscv64 && !linux ++ ++package cpu ++ ++func osInit() { ++ // Other operating systems do not support the riscv_hwprobe syscall. ++} +diff --git a/src/runtime/os_linux_riscv64.go b/src/runtime/os_linux_riscv64.go +index 9be88a5ad2..bd275707eb 100644 +--- a/src/runtime/os_linux_riscv64.go ++++ b/src/runtime/os_linux_riscv64.go +@@ -4,4 +4,34 @@ + + package runtime + ++import ( ++ "runtime/internal/syscall" ++ "unsafe" ++) ++ + func osArchInit() {} ++ ++type riscvHWProbePairs = struct { ++ key int64 ++ value uint64 ++} ++ ++// TODO: Consider whether to use the VDSO entry for riscv_hwprobe. ++// There is a VDSO entry for riscv_hwprobe that should allow us to avoid the syscall ++// entirely as it can handle the case where the caller only requests extensions that are ++// supported on all cores, which is what we're doing here. However, as we're only calling ++// this syscall once, it may not be worth the added effort to implement the VDSO call. ++ ++//go:linkname internal_cpu_riscvHWProbe internal/cpu.riscvHWProbe ++func internal_cpu_riscvHWProbe(pairs []riscvHWProbePairs, flags uint) bool { ++ // sys_RISCV_HWPROBE is copied from golang.org/x/sys/unix/zsysnum_linux_riscv64.go. ++ const sys_RISCV_HWPROBE uintptr = 258 ++ ++ if len(pairs) == 0 { ++ return false ++ } ++ // Passing in a cpuCount of 0 and a cpu of nil ensures that only extensions supported by all the ++ // cores are returned, which is the behaviour we want in internal/cpu. ++ _, _, e1 := syscall.Syscall6(sys_RISCV_HWPROBE, uintptr(unsafe.Pointer(&pairs[0])), uintptr(len(pairs)), uintptr(0), uintptr(unsafe.Pointer(nil)), uintptr(flags), 0) ++ return e1 == 0 ++} +-- +2.39.5 + diff --git a/2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch b/2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch new file mode 100644 index 0000000..7edb409 --- /dev/null +++ b/2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch @@ -0,0 +1,190 @@ +From 344295f05b52aa3fa812ad039625033b76cc3fcd Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 065/119] cmd/go: add rva23u64 as a valid value for GORISCV64 + +The RVA23 profile was ratified on the 21st of October 2024. + +https://riscv.org/announcements/2024/10/risc-v-announces-ratification-of-the-rva23-profile-standard/ + +Now that it's ratified we can add rva23u64 as a valid value for the +GORISCV64 environment variable. 
This will allow the compiler and +assembler to generate instructions made mandatory by the new profile +without a runtime check. Examples of such instructions include those +introduced by the Vector and Zicond extensions. + +Setting GORISCV64=rva23u64 defines the riscv64.rva20u64, +riscv64.rva22u64 and riscv64.rva23u64 build tags, sets the internal +variable buildcfg.GORISCV64 to 23 and defines the macros +GORISCV64_rva23u64, hasV, hasZba, hasZbb, hasZbs, hasZfa, and +hasZicond for use in assembly language code. + +Updates #61476 + +Change-Id: I7641c23084fa52891c9a18df58f4013cb6597d88 +Reviewed-on: https://go-review.googlesource.com/c/go/+/633417 +Reviewed-by: Carlos Amedee +Reviewed-by: Jorropo +Reviewed-by: Joel Sing +Reviewed-by: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +--- + src/cmd/go/alldocs.go | 9 +++++---- + src/cmd/go/internal/help/helpdoc.go | 9 +++++---- + src/cmd/go/testdata/script/tooltags.txt | 7 ++++++- + src/cmd/internal/testdir/testdir_test.go | 2 +- + src/internal/buildcfg/cfg.go | 7 ++++++- + src/internal/buildcfg/cfg_test.go | 4 ++++ + src/runtime/asm_riscv64.h | 9 +++++++++ + 7 files changed, 36 insertions(+), 11 deletions(-) + +diff --git a/src/cmd/go/alldocs.go b/src/cmd/go/alldocs.go +index db737b062e..32e2ba15e9 100644 +--- a/src/cmd/go/alldocs.go ++++ b/src/cmd/go/alldocs.go +@@ -1979,8 +1979,8 @@ + // (or ppc64le.power8, ppc64le.power9, and ppc64le.power10) + // feature build tags. + // - For GOARCH=riscv64, +-// GORISCV64=rva20u64 and rva22u64 correspond to the riscv64.rva20u64 +-// and riscv64.rva22u64 build tags. ++// GORISCV64=rva20u64, rva22u64 and rva23u64 correspond to the riscv64.rva20u64, ++// riscv64.rva22u64 and riscv64.rva23u64 build tags. + // - For GOARCH=wasm, GOWASM=satconv and signext + // correspond to the wasm.satconv and wasm.signext feature build tags. + // +@@ -2280,8 +2280,9 @@ + // Valid values are power8 (default), power9, power10. + // GORISCV64 + // For GOARCH=riscv64, the RISC-V user-mode application profile for which +-// to compile. Valid values are rva20u64 (default), rva22u64. +-// See https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc ++// to compile. Valid values are rva20u64 (default), rva22u64, rva23u64. ++// See https://github.com/riscv/riscv-profiles/blob/main/src/profiles.adoc ++// and https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc + // GOWASM + // For GOARCH=wasm, comma-separated list of experimental WebAssembly features to use. + // Valid values are satconv, signext. +diff --git a/src/cmd/go/internal/help/helpdoc.go b/src/cmd/go/internal/help/helpdoc.go +index 55701bac46..12b667e9be 100644 +--- a/src/cmd/go/internal/help/helpdoc.go ++++ b/src/cmd/go/internal/help/helpdoc.go +@@ -619,8 +619,9 @@ Architecture-specific environment variables: + Valid values are power8 (default), power9, power10. + GORISCV64 + For GOARCH=riscv64, the RISC-V user-mode application profile for which +- to compile. Valid values are rva20u64 (default), rva22u64. +- See https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc ++ to compile. Valid values are rva20u64 (default), rva22u64, rva23u64. ++ See https://github.com/riscv/riscv-profiles/blob/main/src/profiles.adoc ++ and https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc + GOWASM + For GOARCH=wasm, comma-separated list of experimental WebAssembly features to use. + Valid values are satconv, signext. 
+@@ -910,8 +911,8 @@ The defined architecture feature build tags are: + (or ppc64le.power8, ppc64le.power9, and ppc64le.power10) + feature build tags. + - For GOARCH=riscv64, +- GORISCV64=rva20u64 and rva22u64 correspond to the riscv64.rva20u64 +- and riscv64.rva22u64 build tags. ++ GORISCV64=rva20u64, rva22u64 and rva23u64 correspond to the riscv64.rva20u64, ++ riscv64.rva22u64 and riscv64.rva23u64 build tags. + - For GOARCH=wasm, GOWASM=satconv and signext + correspond to the wasm.satconv and wasm.signext feature build tags. + +diff --git a/src/cmd/go/testdata/script/tooltags.txt b/src/cmd/go/testdata/script/tooltags.txt +index 1f6f54563c..a69b7a5c37 100644 +--- a/src/cmd/go/testdata/script/tooltags.txt ++++ b/src/cmd/go/testdata/script/tooltags.txt +@@ -50,10 +50,15 @@ env GORISCV64=rva22u64 + go list -f '{{context.ToolTags}}' + stdout 'riscv64.rva20u64 riscv64.rva22u64' + ++env GOARCH=riscv64 ++env GORISCV64=rva23u64 ++go list -f '{{context.ToolTags}}' ++stdout 'riscv64.rva20u64 riscv64.rva22u64 riscv64.rva23u64' ++ + env GOARCH=riscv64 + env GORISCV64=rva22 + ! go list -f '{{context.ToolTags}}' +-stderr 'go: invalid GORISCV64: must be rva20u64, rva22u64' ++stderr 'go: invalid GORISCV64: must be rva20u64, rva22u64, rva23u64' + + env GOARCH=riscv64 + env GORISCV64= +diff --git a/src/cmd/internal/testdir/testdir_test.go b/src/cmd/internal/testdir/testdir_test.go +index 1677191d96..90d967f47d 100644 +--- a/src/cmd/internal/testdir/testdir_test.go ++++ b/src/cmd/internal/testdir/testdir_test.go +@@ -1464,7 +1464,7 @@ var ( + "ppc64x": {}, // A pseudo-arch representing both ppc64 and ppc64le + "s390x": {}, + "wasm": {}, +- "riscv64": {"GORISCV64", "rva20u64", "rva22u64"}, ++ "riscv64": {"GORISCV64", "rva20u64", "rva22u64", "rva23u64"}, + } + ) + +diff --git a/src/internal/buildcfg/cfg.go b/src/internal/buildcfg/cfg.go +index 599e782c7a..f6fb2d232f 100644 +--- a/src/internal/buildcfg/cfg.go ++++ b/src/internal/buildcfg/cfg.go +@@ -220,8 +220,10 @@ func goriscv64() int { + return 20 + case "rva22u64": + return 22 ++ case "rva23u64": ++ return 23 + } +- Error = fmt.Errorf("invalid GORISCV64: must be rva20u64, rva22u64") ++ Error = fmt.Errorf("invalid GORISCV64: must be rva20u64, rva22u64, rva23u64") + v := defaultGORISCV64[len("rva"):] + i := strings.IndexFunc(v, func(r rune) bool { + return r < '0' || r > '9' +@@ -353,6 +355,9 @@ func gogoarchTags() []string { + if GORISCV64 >= 22 { + list = append(list, GOARCH+"."+"rva22u64") + } ++ if GORISCV64 >= 23 { ++ list = append(list, GOARCH+"."+"rva23u64") ++ } + return list + case "wasm": + var list []string +diff --git a/src/internal/buildcfg/cfg_test.go b/src/internal/buildcfg/cfg_test.go +index 69eeef2422..1513cdc9b0 100644 +--- a/src/internal/buildcfg/cfg_test.go ++++ b/src/internal/buildcfg/cfg_test.go +@@ -32,6 +32,10 @@ func TestConfigFlags(t *testing.T) { + if goriscv64() != 22 { + t.Errorf("Wrong parsing of RISCV64=rva22u64") + } ++ os.Setenv("GORISCV64", "rva23u64") ++ if goriscv64() != 23 { ++ t.Errorf("Wrong parsing of RISCV64=rva23u64") ++ } + Error = nil + os.Setenv("GORISCV64", "rva22") + if _ = goriscv64(); Error == nil { +diff --git a/src/runtime/asm_riscv64.h b/src/runtime/asm_riscv64.h +index d4deb093a6..2414b9f067 100644 +--- a/src/runtime/asm_riscv64.h ++++ b/src/runtime/asm_riscv64.h +@@ -10,3 +10,12 @@ + #define hasZbb + #define hasZbs + #endif ++ ++#ifdef GORISCV64_rva23u64 ++#define hasV ++#define hasZba ++#define hasZbb ++#define hasZbs ++#define hasZfa ++#define hasZicond ++#endif +-- +2.39.5 + diff --git 
a/2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch b/2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch new file mode 100644 index 0000000..32935e5 --- /dev/null +++ b/2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch @@ -0,0 +1,671 @@ +From 39ff3dc09208547d679805802d845e64792e686c Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 066/119] cmd/internal/obj/riscv: update references to RISC-V + specification + +Update references to version 20240411 of the RISC-V specifications. +Reorder and regroup instructions to maintain ordering. Also be +consistent with formatting. + +The instruction encodings table was seemingly missed in CL 616115. + +Change-Id: I47b7c8538383ff3b0503ba59db570c3d4f0d5653 +Reviewed-on: https://go-review.googlesource.com/c/go/+/631935 +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Ian Lance Taylor +Reviewed-by: Meng Zhuo +Reviewed-by: Pengcheng Wang +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 4 + + src/cmd/internal/obj/riscv/cpu.go | 118 ++++++++++---------- + src/cmd/internal/obj/riscv/obj.go | 54 ++++----- + 3 files changed, 94 insertions(+), 82 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 517930aa60..ad468574a9 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -363,6 +363,10 @@ start: + SLLIUW $63, X17, X18 // 1b99f80b + SLLIUW $1, X18, X19 // 9b191908 + ++ // ++ // "B" Extension for Bit Manipulation, Version 1.0.0 ++ // ++ + // 28.4.2: Basic Bit Manipulation (Zbb) + ANDN X19, X20, X21 // b37a3a41 or 93caf9ffb37a5a01 + ANDN X19, X20 // 337a3a41 or 93cff9ff337afa01 +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index a36b95e6d2..29f7e913ed 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -566,6 +566,10 @@ const ( + // 22.5 Quad-Precision Floating-Point Classify Instruction + AFCLASSQ + ++ // ++ // "B" Extension for Bit Manipulation, Version 1.0.0 ++ // ++ + // 28.4.1: Address Generation Instructions (Zba) + AADDUW + ASH1ADD +@@ -615,15 +619,15 @@ const ( + ABSETI + + // +- // RISC-V Vector ISA-extension (1.0) (Unprivileged 20240411) ++ // "V" Standard Extension for Vector Operations, Version 1.0 + // + +- // 31.6. Configuration-Setting Instructions ++ // 31.6: Configuration-Setting Instructions + AVSETVLI + AVSETIVLI + AVSETVL + +- // 31.7.4. Vector Unit-Stride Instructions ++ // 31.7.4: Vector Unit-Stride Instructions + AVLE8V + AVLE16V + AVLE32V +@@ -635,7 +639,7 @@ const ( + AVLMV + AVSMV + +- // 31.7.5. Vector Strided Instructions ++ // 31.7.5: Vector Strided Instructions + AVLSE8V + AVLSE16V + AVLSE32V +@@ -645,7 +649,7 @@ const ( + AVSSE32V + AVSSE64V + +- // 31.7.6. Vector Indexed Instructions ++ // 31.7.6: Vector Indexed Instructions + AVLUXEI8V + AVLUXEI16V + AVLUXEI32V +@@ -663,13 +667,13 @@ const ( + AVSOXEI32V + AVSOXEI64V + +- // 31.7.7. Unit-stride Fault-Only-First Loads ++ // 31.7.7: Unit-stride Fault-Only-First Loads + AVLE8FFV + AVLE16FFV + AVLE32FFV + AVLE64FFV + +- // 31.7.9. Vector Load/Store Whole Register Instructions ++ // 31.7.9: Vector Load/Store Whole Register Instructions + AVL1RE8V + AVL1RE16V + AVL1RE32V +@@ -691,7 +695,7 @@ const ( + AVS4RV + AVS8RV + +- // 31.11.1. 
Vector Single-Width Integer Add and Subtract ++ // 31.11.1: Vector Single-Width Integer Add and Subtract + AVADDVV + AVADDVX + AVADDVI +@@ -700,7 +704,7 @@ const ( + AVRSUBVX + AVRSUBVI + +- // 31.11.2. Vector Widening Integer Add/Subtract ++ // 31.11.2: Vector Widening Integer Add/Subtract + AVWADDUVV + AVWADDUVX + AVWSUBUVV +@@ -718,7 +722,7 @@ const ( + AVWSUBWV + AVWSUBWX + +- // 31.11.3. Vector Integer Extension ++ // 31.11.3: Vector Integer Extension + AVZEXTVF2 + AVSEXTVF2 + AVZEXTVF4 +@@ -726,7 +730,7 @@ const ( + AVZEXTVF8 + AVSEXTVF8 + +- // 31.11.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions ++ // 31.11.4: Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions + AVADCVVM + AVADCVXM + AVADCVIM +@@ -743,7 +747,7 @@ const ( + AVMSBCVV + AVMSBCVX + +- // 31.11.5. Vector Bitwise Logical Instructions ++ // 31.11.5: Vector Bitwise Logical Instructions + AVANDVV + AVANDVX + AVANDVI +@@ -754,7 +758,7 @@ const ( + AVXORVX + AVXORVI + +- // 31.11.6. Vector Single-Width Shift Instructions ++ // 31.11.6: Vector Single-Width Shift Instructions + AVSLLVV + AVSLLVX + AVSLLVI +@@ -765,7 +769,7 @@ const ( + AVSRAVX + AVSRAVI + +- // 31.11.7. Vector Narrowing Integer Right Shift Instructions ++ // 31.11.7: Vector Narrowing Integer Right Shift Instructions + AVNSRLWV + AVNSRLWX + AVNSRLWI +@@ -773,7 +777,7 @@ const ( + AVNSRAWX + AVNSRAWI + +- // 31.11.8. Vector Integer Compare Instructions ++ // 31.11.8: Vector Integer Compare Instructions + AVMSEQVV + AVMSEQVX + AVMSEQVI +@@ -795,7 +799,7 @@ const ( + AVMSGTVX + AVMSGTVI + +- // 31.11.9. Vector Integer Min/Max Instructions ++ // 31.11.9: Vector Integer Min/Max Instructions + AVMINUVV + AVMINUVX + AVMINVV +@@ -805,7 +809,7 @@ const ( + AVMAXVV + AVMAXVX + +- // 31.11.10. Vector Single-Width Integer Multiply Instructions ++ // 31.11.10: Vector Single-Width Integer Multiply Instructions + AVMULVV + AVMULVX + AVMULHVV +@@ -815,7 +819,7 @@ const ( + AVMULHSUVV + AVMULHSUVX + +- // 31.11.11. Vector Integer Divide Instructions ++ // 31.11.11: Vector Integer Divide Instructions + AVDIVUVV + AVDIVUVX + AVDIVVV +@@ -825,7 +829,7 @@ const ( + AVREMVV + AVREMVX + +- // 31.11.12. Vector Widening Integer Multiply Instructions ++ // 31.11.12: Vector Widening Integer Multiply Instructions + AVWMULVV + AVWMULVX + AVWMULUVV +@@ -833,7 +837,7 @@ const ( + AVWMULSUVV + AVWMULSUVX + +- // 31.11.13. Vector Single-Width Integer Multiply-Add Instructions ++ // 31.11.13: Vector Single-Width Integer Multiply-Add Instructions + AVMACCVV + AVMACCVX + AVNMSACVV +@@ -843,7 +847,7 @@ const ( + AVNMSUBVV + AVNMSUBVX + +- // 31.11.14. Vector Widening Integer Multiply-Add Instructions ++ // 31.11.14: Vector Widening Integer Multiply-Add Instructions + AVWMACCUVV + AVWMACCUVX + AVWMACCVV +@@ -852,17 +856,17 @@ const ( + AVWMACCSUVX + AVWMACCUSVX + +- // 31.11.15. Vector Integer Merge Instructions ++ // 31.11.15: Vector Integer Merge Instructions + AVMERGEVVM + AVMERGEVXM + AVMERGEVIM + +- // 31.11.16. Vector Integer Move Instructions ++ // 31.11.16: Vector Integer Move Instructions + AVMVVV + AVMVVX + AVMVVI + +- // 31.12.1. Vector Single-Width Saturating Add and Subtract ++ // 31.12.1: Vector Single-Width Saturating Add and Subtract + AVSADDUVV + AVSADDUVX + AVSADDUVI +@@ -874,7 +878,7 @@ const ( + AVSSUBVV + AVSSUBVX + +- // 31.12.2. Vector Single-Width Averaging Add and Subtract ++ // 31.12.2: Vector Single-Width Averaging Add and Subtract + AVAADDUVV + AVAADDUVX + AVAADDVV +@@ -884,11 +888,11 @@ const ( + AVASUBVV + AVASUBVX + +- // 31.12.3. 
Vector Single-Width Fractional Multiply with Rounding and Saturation ++ // 31.12.3: Vector Single-Width Fractional Multiply with Rounding and Saturation + AVSMULVV + AVSMULVX + +- // 31.12.4. Vector Single-Width Scaling Shift Instructions ++ // 31.12.4: Vector Single-Width Scaling Shift Instructions + AVSSRLVV + AVSSRLVX + AVSSRLVI +@@ -896,7 +900,7 @@ const ( + AVSSRAVX + AVSSRAVI + +- // 31.12.5. Vector Narrowing Fixed-Point Clip Instructions ++ // 31.12.5: Vector Narrowing Fixed-Point Clip Instructions + AVNCLIPUWV + AVNCLIPUWX + AVNCLIPUWI +@@ -904,14 +908,14 @@ const ( + AVNCLIPWX + AVNCLIPWI + +- // 31.13.2. Vector Single-Width Floating-Point Add/Subtract Instructions ++ // 31.13.2: Vector Single-Width Floating-Point Add/Subtract Instructions + AVFADDVV + AVFADDVF + AVFSUBVV + AVFSUBVF + AVFRSUBVF + +- // 31.13.3. Vector Widening Floating-Point Add/Subtract Instructions ++ // 31.13.3: Vector Widening Floating-Point Add/Subtract Instructions + AVFWADDVV + AVFWADDVF + AVFWSUBVV +@@ -921,18 +925,18 @@ const ( + AVFWSUBWV + AVFWSUBWF + +- // 31.13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions ++ // 31.13.4: Vector Single-Width Floating-Point Multiply/Divide Instructions + AVFMULVV + AVFMULVF + AVFDIVVV + AVFDIVVF + AVFRDIVVF + +- // 31.13.5. Vector Widening Floating-Point Multiply ++ // 31.13.5: Vector Widening Floating-Point Multiply + AVFWMULVV + AVFWMULVF + +- // 31.13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions ++ // 31.13.6: Vector Single-Width Floating-Point Fused Multiply-Add Instructions + AVFMACCVV + AVFMACCVF + AVFNMACCVV +@@ -950,7 +954,7 @@ const ( + AVFNMSUBVV + AVFNMSUBVF + +- // 31.13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions ++ // 31.13.7: Vector Widening Floating-Point Fused Multiply-Add Instructions + AVFWMACCVV + AVFWMACCVF + AVFWNMACCVV +@@ -960,22 +964,22 @@ const ( + AVFWNMSACVV + AVFWNMSACVF + +- // 31.13.8. Vector Floating-Point Square-Root Instruction ++ // 31.13.8: Vector Floating-Point Square-Root Instruction + AVFSQRTV + +- // 31.13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction ++ // 31.13.9: Vector Floating-Point Reciprocal Square-Root Estimate Instruction + AVFRSQRT7V + +- // 31.13.10. Vector Floating-Point Reciprocal Estimate Instruction ++ // 31.13.10: Vector Floating-Point Reciprocal Estimate Instruction + AVFREC7V + +- // 31.13.11. Vector Floating-Point MIN/MAX Instructions ++ // 31.13.11: Vector Floating-Point MIN/MAX Instructions + AVFMINVV + AVFMINVF + AVFMAXVV + AVFMAXVF + +- // 31.13.12. Vector Floating-Point Sign-Injection Instructions ++ // 31.13.12: Vector Floating-Point Sign-Injection Instructions + AVFSGNJVV + AVFSGNJVF + AVFSGNJNVV +@@ -983,7 +987,7 @@ const ( + AVFSGNJXVV + AVFSGNJXVF + +- // 31.13.13. Vector Floating-Point Compare Instructions ++ // 31.13.13: Vector Floating-Point Compare Instructions + AVMFEQVV + AVMFEQVF + AVMFNEVV +@@ -995,16 +999,16 @@ const ( + AVMFGTVF + AVMFGEVF + +- // 31.13.14. Vector Floating-Point Classify Instruction ++ // 31.13.14: Vector Floating-Point Classify Instruction + AVFCLASSV + +- // 31.13.15. Vector Floating-Point Merge Instruction ++ // 31.13.15: Vector Floating-Point Merge Instruction + AVFMERGEVFM + +- // 31.13.16. Vector Floating-Point Move Instruction ++ // 31.13.16: Vector Floating-Point Move Instruction + AVFMVVF + +- // 31.13.17. 
Single-Width Floating-Point/Integer Type-Convert Instructions ++ // 31.13.17: Single-Width Floating-Point/Integer Type-Convert Instructions + AVFCVTXUFV + AVFCVTXFV + AVFCVTRTZXUFV +@@ -1012,7 +1016,7 @@ const ( + AVFCVTFXUV + AVFCVTFXV + +- // 31.13.18. Widening Floating-Point/Integer Type-Convert Instructions ++ // 31.13.18: Widening Floating-Point/Integer Type-Convert Instructions + AVFWCVTXUFV + AVFWCVTXFV + AVFWCVTRTZXUFV +@@ -1021,7 +1025,7 @@ const ( + AVFWCVTFXV + AVFWCVTFFV + +- // 31.13.19. Narrowing Floating-Point/Integer Type-Convert Instructions ++ // 31.13.19: Narrowing Floating-Point/Integer Type-Convert Instructions + AVFNCVTXUFW + AVFNCVTXFW + AVFNCVTRTZXUFW +@@ -1031,7 +1035,7 @@ const ( + AVFNCVTFFW + AVFNCVTRODFFW + +- // 31.14.1. Vector Single-Width Integer Reduction Instructions ++ // 31.14.1: Vector Single-Width Integer Reduction Instructions + AVREDSUMVS + AVREDMAXUVS + AVREDMAXVS +@@ -1041,21 +1045,21 @@ const ( + AVREDORVS + AVREDXORVS + +- // 31.14.2. Vector Widening Integer Reduction Instructions ++ // 31.14.2: Vector Widening Integer Reduction Instructions + AVWREDSUMUVS + AVWREDSUMVS + +- // 31.14.3. Vector Single-Width Floating-Point Reduction Instructions ++ // 31.14.3: Vector Single-Width Floating-Point Reduction Instructions + AVFREDOSUMVS + AVFREDUSUMVS + AVFREDMAXVS + AVFREDMINVS + +- // 31.14.4. Vector Widening Floating-Point Reduction Instructions ++ // 31.14.4: Vector Widening Floating-Point Reduction Instructions + AVFWREDOSUMVS + AVFWREDUSUMVS + +- // 31.15. Vector Mask Instructions ++ // 31.15: Vector Mask Instructions + AVMANDMM + AVMNANDMM + AVMANDNMM +@@ -1072,15 +1076,15 @@ const ( + AVIOTAM + AVIDV + +- // 31.16.1. Integer Scalar Move Instructions ++ // 31.16.1: Integer Scalar Move Instructions + AVMVXS + AVMVSX + +- // 31.16.2. Floating-Point Scalar Move Instructions ++ // 31.16.2: Floating-Point Scalar Move Instructions + AVFMVFS + AVFMVSF + +- // 31.16.3. Vector Slide Instructions ++ // 31.16.3: Vector Slide Instructions + AVSLIDEUPVX + AVSLIDEUPVI + AVSLIDEDOWNVX +@@ -1090,16 +1094,16 @@ const ( + AVSLIDE1DOWNVX + AVFSLIDE1DOWNVF + +- // 31.16.4. Vector Register Gather Instructions ++ // 31.16.4: Vector Register Gather Instructions + AVRGATHERVV + AVRGATHEREI16VV + AVRGATHERVX + AVRGATHERVI + +- // 31.16.5. Vector Compress Instruction ++ // 31.16.5: Vector Compress Instruction + AVCOMPRESSVM + +- // 31.16.6. Whole Vector Register Move ++ // 31.16.6: Whole Vector Register Move + AVMV1RV + AVMV2RV + AVMV4RV +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 5e7092ab36..6fac9159e5 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1539,7 +1539,9 @@ type instructionData struct { + // their encoding type. Entries are masked with obj.AMask to keep + // indices small. 
+ var instructions = [ALAST & obj.AMask]instructionData{ ++ // + // Unprivileged ISA ++ // + + // 2.4: Integer Computational Instructions + AADDI & obj.AMask: {enc: iIIEncoding, ternary: true}, +@@ -1588,7 +1590,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + // 2.7: Memory Ordering + AFENCE & obj.AMask: {enc: iIIEncoding}, + +- // 5.2: Integer Computational Instructions (RV64I) ++ // 4.2: Integer Computational Instructions (RV64I) + AADDIW & obj.AMask: {enc: iIIEncoding, ternary: true}, + ASLLIW & obj.AMask: {enc: iIIEncoding, ternary: true}, + ASRLIW & obj.AMask: {enc: iIIEncoding, ternary: true}, +@@ -1599,14 +1601,14 @@ var instructions = [ALAST & obj.AMask]instructionData{ + ASUBW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ASRAW & obj.AMask: {enc: rIIIEncoding, immForm: ASRAIW, ternary: true}, + +- // 5.3: Load and Store Instructions (RV64I) ++ // 4.3: Load and Store Instructions (RV64I) + ALD & obj.AMask: {enc: iIIEncoding}, + ASD & obj.AMask: {enc: sIEncoding}, + + // 7.1: CSR Instructions + ACSRRS & obj.AMask: {enc: iIIEncoding}, + +- // 7.1: Multiplication Operations ++ // 13.1: Multiplication Operations + AMUL & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AMULH & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AMULHU & obj.AMask: {enc: rIIIEncoding, ternary: true}, +@@ -1621,13 +1623,13 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AREMW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AREMUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + +- // 8.2: Load-Reserved/Store-Conditional ++ // 14.2: Load-Reserved/Store-Conditional Instructions (Zalrsc) + ALRW & obj.AMask: {enc: rIIIEncoding}, + ALRD & obj.AMask: {enc: rIIIEncoding}, + ASCW & obj.AMask: {enc: rIIIEncoding}, + ASCD & obj.AMask: {enc: rIIIEncoding}, + +- // 8.3: Atomic Memory Operations ++ // 14.4: Atomic Memory Operations (Zaamo) + AAMOSWAPW & obj.AMask: {enc: rIIIEncoding}, + AAMOSWAPD & obj.AMask: {enc: rIIIEncoding}, + AAMOADDW & obj.AMask: {enc: rIIIEncoding}, +@@ -1647,11 +1649,11 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AAMOMINUW & obj.AMask: {enc: rIIIEncoding}, + AAMOMINUD & obj.AMask: {enc: rIIIEncoding}, + +- // 11.5: Single-Precision Load and Store Instructions ++ // 20.5: Single-Precision Load and Store Instructions + AFLW & obj.AMask: {enc: iFEncoding}, + AFSW & obj.AMask: {enc: sFEncoding}, + +- // 11.6: Single-Precision Floating-Point Computational Instructions ++ // 20.6: Single-Precision Floating-Point Computational Instructions + AFADDS & obj.AMask: {enc: rFFFEncoding}, + AFSUBS & obj.AMask: {enc: rFFFEncoding}, + AFMULS & obj.AMask: {enc: rFFFEncoding}, +@@ -1664,7 +1666,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AFNMSUBS & obj.AMask: {enc: rFFFFEncoding}, + AFNMADDS & obj.AMask: {enc: rFFFFEncoding}, + +- // 11.7: Single-Precision Floating-Point Conversion and Move Instructions ++ // 20.7: Single-Precision Floating-Point Conversion and Move Instructions + AFCVTWS & obj.AMask: {enc: rFIEncoding}, + AFCVTLS & obj.AMask: {enc: rFIEncoding}, + AFCVTSW & obj.AMask: {enc: rIFEncoding}, +@@ -1679,19 +1681,19 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AFMVXW & obj.AMask: {enc: rFIEncoding}, + AFMVWX & obj.AMask: {enc: rIFEncoding}, + +- // 11.8: Single-Precision Floating-Point Compare Instructions ++ // 20.8: Single-Precision Floating-Point Compare Instructions + AFEQS & obj.AMask: {enc: rFFIEncoding}, + AFLTS & obj.AMask: {enc: rFFIEncoding}, + AFLES & obj.AMask: {enc: rFFIEncoding}, + +- // 11.9: 
Single-Precision Floating-Point Classify Instruction ++ // 20.9: Single-Precision Floating-Point Classify Instruction + AFCLASSS & obj.AMask: {enc: rFIEncoding}, + + // 12.3: Double-Precision Load and Store Instructions + AFLD & obj.AMask: {enc: iFEncoding}, + AFSD & obj.AMask: {enc: sFEncoding}, + +- // 12.4: Double-Precision Floating-Point Computational Instructions ++ // 21.4: Double-Precision Floating-Point Computational Instructions + AFADDD & obj.AMask: {enc: rFFFEncoding}, + AFSUBD & obj.AMask: {enc: rFFFEncoding}, + AFMULD & obj.AMask: {enc: rFFFEncoding}, +@@ -1704,7 +1706,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AFNMSUBD & obj.AMask: {enc: rFFFFEncoding}, + AFNMADDD & obj.AMask: {enc: rFFFFEncoding}, + +- // 12.5: Double-Precision Floating-Point Conversion and Move Instructions ++ // 21.5: Double-Precision Floating-Point Conversion and Move Instructions + AFCVTWD & obj.AMask: {enc: rFIEncoding}, + AFCVTLD & obj.AMask: {enc: rFIEncoding}, + AFCVTDW & obj.AMask: {enc: rIFEncoding}, +@@ -1721,25 +1723,19 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AFMVXD & obj.AMask: {enc: rFIEncoding}, + AFMVDX & obj.AMask: {enc: rIFEncoding}, + +- // 12.6: Double-Precision Floating-Point Compare Instructions ++ // 21.6: Double-Precision Floating-Point Compare Instructions + AFEQD & obj.AMask: {enc: rFFIEncoding}, + AFLTD & obj.AMask: {enc: rFFIEncoding}, + AFLED & obj.AMask: {enc: rFFIEncoding}, + +- // 12.7: Double-Precision Floating-Point Classify Instruction ++ // 21.7: Double-Precision Floating-Point Classify Instruction + AFCLASSD & obj.AMask: {enc: rFIEncoding}, + +- // Privileged ISA +- +- // 3.2.1: Environment Call and Breakpoint +- AECALL & obj.AMask: {enc: iIIEncoding}, +- AEBREAK & obj.AMask: {enc: iIIEncoding}, +- + // +- // RISC-V Bit-Manipulation ISA-extensions (1.0) ++ // "B" Extension for Bit Manipulation, Version 1.0.0 + // + +- // 1.1: Address Generation Instructions (Zba) ++ // 28.4.1: Address Generation Instructions (Zba) + AADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ASH1ADD & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ASH1ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, +@@ -1749,7 +1745,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + ASH3ADDUW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ASLLIUW & obj.AMask: {enc: iIIEncoding, ternary: true}, + +- // 1.2: Basic Bit Manipulation (Zbb) ++ // 28.4.2: Basic Bit Manipulation (Zbb) + AANDN & obj.AMask: {enc: rIIIEncoding, ternary: true}, + ACLZ & obj.AMask: {enc: rIIEncoding}, + ACLZW & obj.AMask: {enc: rIIEncoding}, +@@ -1767,7 +1763,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AXNOR & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AZEXTH & obj.AMask: {enc: rIIEncoding}, + +- // 1.3: Bitwise Rotation (Zbb) ++ // 28.4.3: Bitwise Rotation (Zbb) + AROL & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AROLW & obj.AMask: {enc: rIIIEncoding, ternary: true}, + AROR & obj.AMask: {enc: rIIIEncoding, immForm: ARORI, ternary: true}, +@@ -1777,7 +1773,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AORCB & obj.AMask: {enc: iIIEncoding}, + AREV8 & obj.AMask: {enc: iIIEncoding}, + +- // 1.5: Single-bit Instructions (Zbs) ++ // 28.4.4: Single-bit Instructions (Zbs) + ABCLR & obj.AMask: {enc: rIIIEncoding, immForm: ABCLRI, ternary: true}, + ABCLRI & obj.AMask: {enc: iIIEncoding, ternary: true}, + ABEXT & obj.AMask: {enc: rIIIEncoding, immForm: ABEXTI, ternary: true}, +@@ -1787,6 +1783,14 @@ var instructions = [ALAST & 
obj.AMask]instructionData{ + ABSET & obj.AMask: {enc: rIIIEncoding, immForm: ABSETI, ternary: true}, + ABSETI & obj.AMask: {enc: iIIEncoding, ternary: true}, + ++ // ++ // Privileged ISA ++ // ++ ++ // 3.3.1: Environment Call and Breakpoint ++ AECALL & obj.AMask: {enc: iIIEncoding}, ++ AEBREAK & obj.AMask: {enc: iIIEncoding}, ++ + // Escape hatch + AWORD & obj.AMask: {enc: rawEncoding}, + +-- +2.39.5 + diff --git a/2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch b/2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch new file mode 100644 index 0000000..afd6b8d --- /dev/null +++ b/2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch @@ -0,0 +1,589 @@ +From 20ee5b16747e9841cb1923ad3590806047e4b235 Mon Sep 17 00:00:00 2001 +From: Meng Zhuo +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 067/119] cmd/compile: don't merge symbols on riscv64 when + dynamic linking + +Each plugin is compiled as a separate shared object, +with its own symbol table. When dynamic linking plugin symbols +are resolved within the plugin's scope, not globally merged to +avoid conflicts. + +Change-Id: I9e6986085855c17fbd6c39b937cb6129d216f5e9 +Reviewed-on: https://go-review.googlesource.com/c/go/+/435015 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: Michael Pratt +Reviewed-by: Cherry Mui +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 82 +++------- + .../compile/internal/ssa/rewriteRISCV64.go | 154 +++++++++++------- + 2 files changed, 115 insertions(+), 121 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 9ae9604381..a69df619a5 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -270,65 +270,29 @@ + + // We need to fold MOVaddr into the LD/MOVDstore ops so that the live variable analysis + // knows what variables are being read/written by the ops. 
+-(MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVBload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVHload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem) +-(MOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem) +- +-(MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +-(MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +-(MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +-(MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => +- (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) +-(MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => +- (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +-(MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => +- (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +-(MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => +- (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +-(MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) && canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) => +- (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) +- +-(MOVBUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVBUload [off1+int32(off2)] {sym} base mem) +-(MOVBload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVBload [off1+int32(off2)] {sym} base mem) +-(MOVHUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVHUload [off1+int32(off2)] {sym} base mem) +-(MOVHload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVHload [off1+int32(off2)] {sym} base mem) +-(MOVWUload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVWUload [off1+int32(off2)] {sym} base mem) +-(MOVWload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) 
=> +- (MOVWload [off1+int32(off2)] {sym} base mem) +-(MOVDload [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => +- (MOVDload [off1+int32(off2)] {sym} base mem) +- +-(MOVBstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => +- (MOVBstore [off1+int32(off2)] {sym} base val mem) +-(MOVHstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => +- (MOVHstore [off1+int32(off2)] {sym} base val mem) +-(MOVWstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => +- (MOVWstore [off1+int32(off2)] {sym} base val mem) +-(MOVDstore [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => +- (MOVDstore [off1+int32(off2)] {sym} base val mem) +-(MOVBstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVBstorezero [off1+int32(off2)] {sym} ptr mem) +-(MOVHstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVHstorezero [off1+int32(off2)] {sym} ptr mem) +-(MOVWstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVWstorezero [off1+int32(off2)] {sym} ptr mem) +-(MOVDstorezero [off1] {sym} (ADDI [off2] ptr) mem) && is32Bit(int64(off1)+off2) => (MOVDstorezero [off1+int32(off2)] {sym} ptr mem) ++(MOV(B|BU|H|HU|W|WU|D)load [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && ++ is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ++ (base.Op != OpSB || !config.ctxt.Flag_dynlink) => ++ (MOV(B|BU|H|HU|W|WU|D)load [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ ++(MOV(B|H|W|D)store [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && ++ is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ++ (base.Op != OpSB || !config.ctxt.Flag_dynlink) => ++ (MOV(B|H|W|D)store [off1+off2] {mergeSym(sym1,sym2)} base val mem) ++ ++(MOV(B|H|W|D)storezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && ++ canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && ++ (base.Op != OpSB || !config.ctxt.Flag_dynlink) => ++ (MOV(B|H|W|D)storezero [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ ++(MOV(B|BU|H|HU|W|WU|D)load [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => ++ (MOV(B|BU|H|HU|W|WU|D)load [off1+int32(off2)] {sym} base mem) ++ ++(MOV(B|H|W|D)store [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => ++ (MOV(B|H|W|D)store [off1+int32(off2)] {sym} base val mem) ++ ++(MOV(B|H|W|D)storezero [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => ++ (MOV(B|H|W|D)storezero [off1+int32(off2)] {sym} base mem) + + // Similarly, fold ADDI into MOVaddr to avoid confusing live variable analysis + // with OffPtr -> ADDI. 
+diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 5e6ccab467..1c226a1660 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -4010,8 +4010,10 @@ func rewriteValueRISCV64_OpRISCV64FSUBS(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVBUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4023,7 +4025,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBUload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVBUload) +@@ -4317,8 +4319,10 @@ func rewriteValueRISCV64_OpRISCV64MOVBUreg(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVBload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVBload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4330,7 +4334,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVBload) +@@ -4443,8 +4447,10 @@ func rewriteValueRISCV64_OpRISCV64MOVBstore(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4457,7 +4463,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBstore(v *Value) bool { + base := v_0.Args[0] + val := v_1 + mem := v_2 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVBstore) +@@ -4611,9 +4617,11 @@ func rewriteValueRISCV64_OpRISCV64MOVBstore(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) +- // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) +- // result: (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr 
mem) ++ b := v.Block ++ config := b.Func.Config ++ // match: (MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) +@@ -4622,20 +4630,20 @@ func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + } + off2 := auxIntToInt32(v_0.AuxInt) + sym2 := auxToSym(v_0.Aux) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 +- if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) { ++ if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVBstorezero) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } +- // match: (MOVBstorezero [off1] {sym} (ADDI [off2] ptr) mem) ++ // match: (MOVBstorezero [off1] {sym} (ADDI [off2] base) mem) + // cond: is32Bit(int64(off1)+off2) +- // result: (MOVBstorezero [off1+int32(off2)] {sym} ptr mem) ++ // result: (MOVBstorezero [off1+int32(off2)] {sym} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) +@@ -4643,7 +4651,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + break + } + off2 := auxIntToInt64(v_0.AuxInt) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 + if !(is32Bit(int64(off1) + off2)) { + break +@@ -4651,7 +4659,7 @@ func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + v.reset(OpRISCV64MOVBstorezero) + v.AuxInt = int32ToAuxInt(off1 + int32(off2)) + v.Aux = symToAux(sym) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } + return false +@@ -4659,8 +4667,10 @@ func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVDload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4672,7 +4682,7 @@ func rewriteValueRISCV64_OpRISCV64MOVDload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVDload) +@@ -4739,8 +4749,10 @@ func rewriteValueRISCV64_OpRISCV64MOVDstore(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4753,7 +4765,7 @@ func rewriteValueRISCV64_OpRISCV64MOVDstore(v *Value) bool { + base := v_0.Args[0] + val := v_1 + mem := v_2 +- if 
!(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVDstore) +@@ -4805,9 +4817,11 @@ func rewriteValueRISCV64_OpRISCV64MOVDstore(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) +- // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) +- // result: (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) ++ b := v.Block ++ config := b.Func.Config ++ // match: (MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) +@@ -4816,20 +4830,20 @@ func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + } + off2 := auxIntToInt32(v_0.AuxInt) + sym2 := auxToSym(v_0.Aux) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 +- if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) { ++ if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVDstorezero) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } +- // match: (MOVDstorezero [off1] {sym} (ADDI [off2] ptr) mem) ++ // match: (MOVDstorezero [off1] {sym} (ADDI [off2] base) mem) + // cond: is32Bit(int64(off1)+off2) +- // result: (MOVDstorezero [off1+int32(off2)] {sym} ptr mem) ++ // result: (MOVDstorezero [off1+int32(off2)] {sym} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) +@@ -4837,7 +4851,7 @@ func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + break + } + off2 := auxIntToInt64(v_0.AuxInt) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 + if !(is32Bit(int64(off1) + off2)) { + break +@@ -4845,7 +4859,7 @@ func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + v.reset(OpRISCV64MOVDstorezero) + v.AuxInt = int32ToAuxInt(off1 + int32(off2)) + v.Aux = symToAux(sym) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } + return false +@@ -4853,8 +4867,10 @@ func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVHUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -4866,7 +4882,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHUload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVHUload) +@@ -5017,8 +5033,10 @@ func rewriteValueRISCV64_OpRISCV64MOVHUreg(v 
*Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVHload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVHload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5030,7 +5048,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVHload) +@@ -5187,8 +5205,10 @@ func rewriteValueRISCV64_OpRISCV64MOVHstore(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5201,7 +5221,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHstore(v *Value) bool { + base := v_0.Args[0] + val := v_1 + mem := v_2 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVHstore) +@@ -5321,9 +5341,11 @@ func rewriteValueRISCV64_OpRISCV64MOVHstore(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) +- // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) +- // result: (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) ++ b := v.Block ++ config := b.Func.Config ++ // match: (MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) +@@ -5332,20 +5354,20 @@ func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + } + off2 := auxIntToInt32(v_0.AuxInt) + sym2 := auxToSym(v_0.Aux) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 +- if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) { ++ if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVHstorezero) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } +- // match: (MOVHstorezero [off1] {sym} (ADDI [off2] ptr) mem) ++ // match: (MOVHstorezero [off1] {sym} (ADDI [off2] base) mem) + // cond: is32Bit(int64(off1)+off2) +- // result: (MOVHstorezero [off1+int32(off2)] {sym} ptr mem) ++ // result: (MOVHstorezero [off1+int32(off2)] {sym} base mem) + for { + off1 := 
auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) +@@ -5353,7 +5375,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + break + } + off2 := auxIntToInt64(v_0.AuxInt) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 + if !(is32Bit(int64(off1) + off2)) { + break +@@ -5361,7 +5383,7 @@ func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + v.reset(OpRISCV64MOVHstorezero) + v.AuxInt = int32ToAuxInt(off1 + int32(off2)) + v.Aux = symToAux(sym) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } + return false +@@ -5369,8 +5391,10 @@ func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVWUload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5382,7 +5406,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWUload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVWUload) +@@ -5557,8 +5581,10 @@ func rewriteValueRISCV64_OpRISCV64MOVWUreg(v *Value) bool { + func rewriteValueRISCV64_OpRISCV64MOVWload(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5570,7 +5596,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWload(v *Value) bool { + sym2 := auxToSym(v_0.Aux) + base := v_0.Args[0] + mem := v_1 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVWload) +@@ -5881,8 +5907,10 @@ func rewriteValueRISCV64_OpRISCV64MOVWstore(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config + // match: (MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) +- // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) +@@ -5895,7 +5923,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWstore(v *Value) bool { + base := v_0.Args[0] + val := v_1 + mem := v_2 +- if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVWstore) +@@ -5981,9 +6009,11 @@ func rewriteValueRISCV64_OpRISCV64MOVWstore(v *Value) bool { + 
func rewriteValueRISCV64_OpRISCV64MOVWstorezero(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +- // match: (MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem) +- // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) +- // result: (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem) ++ b := v.Block ++ config := b.Func.Config ++ // match: (MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) +@@ -5992,20 +6022,20 @@ func rewriteValueRISCV64_OpRISCV64MOVWstorezero(v *Value) bool { + } + off2 := auxIntToInt32(v_0.AuxInt) + sym2 := auxToSym(v_0.Aux) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 +- if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) { ++ if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2)) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { + break + } + v.reset(OpRISCV64MOVWstorezero) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } +- // match: (MOVWstorezero [off1] {sym} (ADDI [off2] ptr) mem) ++ // match: (MOVWstorezero [off1] {sym} (ADDI [off2] base) mem) + // cond: is32Bit(int64(off1)+off2) +- // result: (MOVWstorezero [off1+int32(off2)] {sym} ptr mem) ++ // result: (MOVWstorezero [off1+int32(off2)] {sym} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) +@@ -6013,7 +6043,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWstorezero(v *Value) bool { + break + } + off2 := auxIntToInt64(v_0.AuxInt) +- ptr := v_0.Args[0] ++ base := v_0.Args[0] + mem := v_1 + if !(is32Bit(int64(off1) + off2)) { + break +@@ -6021,7 +6051,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWstorezero(v *Value) bool { + v.reset(OpRISCV64MOVWstorezero) + v.AuxInt = int32ToAuxInt(off1 + int32(off2)) + v.Aux = symToAux(sym) +- v.AddArg2(ptr, mem) ++ v.AddArg2(base, mem) + return true + } + return false +-- +2.39.5 + diff --git a/2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch b/2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch new file mode 100644 index 0000000..14ffd0b --- /dev/null +++ b/2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch @@ -0,0 +1,83 @@ +From 9a3920342f79a02921089bbafe030e5a74e67530 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 068/119] cmd/internal/obj/riscv: support MOVD with floating + point constants + +Currently, we only support loading of values from memory (or other +registers). Add floating point constant support to MOVD. This is +implemented by storing the floating point constant to a symbol, +which is then loaded into the floating point register. 
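+
+For illustration, using the test case added below:
+
+	MOVD $(709.78271289338397), F3
+
+is rewritten into a load of a float64 symbol which, in the notation of
+the code comment below, expands to an AUIPC into the temporary register
+followed by the load, with $off_hi/$off_lo assigned during relocation:
+
+	AUIPC $off_hi, TMP
+	FLD $off_lo, TMP, F3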
+ +Change-Id: I6db242d27f606f0d5d084a3ab93538698d3a4f8c +Reviewed-on: https://go-review.googlesource.com/c/go/+/631876 +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 3 +++ + src/cmd/internal/obj/riscv/obj.go | 22 ++++++++++++++++++--- + 2 files changed, 22 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index ad468574a9..588ad0f067 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -486,6 +486,9 @@ start: + MOVD F0, 4(X5) // 27b20200 + MOVD F0, F1 // d3000022 + ++ // Convert to load of symbol (AUIPC + FLD) ++ MOVD $(709.78271289338397), F3 // 970f000087b10f00 ++ + // TLS load with local-exec (LUI + ADDIW + ADD of TP + load) + MOV tls(SB), X5 // b70f00009b8f0f00b38f4f0083b20f00 + MOVB tls(SB), X5 // b70f00009b8f0f00b38f4f0083820f00 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 6fac9159e5..2e582eb9cb 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -147,6 +147,15 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + p.From.Name = obj.NAME_EXTERN + p.From.Offset = 0 + } ++ ++ case AMOVD: ++ if p.From.Type == obj.TYPE_FCONST && p.From.Name == obj.NAME_NONE && p.From.Reg == obj.REG_NONE { ++ f64 := p.From.Val.(float64) ++ p.From.Type = obj.TYPE_MEM ++ p.From.Sym = ctxt.Float64Sym(f64) ++ p.From.Name = obj.NAME_EXTERN ++ p.From.Offset = 0 ++ } + } + } + +@@ -2208,12 +2217,19 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + } + + // Note that the values for $off_hi and $off_lo are currently +- // zero and will be assigned during relocation. ++ // zero and will be assigned during relocation. If the destination ++ // is an integer register then we can use the same register for the ++ // address computation, otherwise we need to use the temporary register. + // + // AUIPC $off_hi, Rd + // L $off_lo, Rd, Rd +- insAUIPC := &instruction{as: AAUIPC, rd: ins.rd} +- ins.as, ins.rs1, ins.rs2, ins.imm = movToLoad(p.As), ins.rd, obj.REG_NONE, 0 ++ // ++ addrReg := ins.rd ++ if addrReg < REG_X0 || addrReg > REG_X31 { ++ addrReg = REG_TMP ++ } ++ insAUIPC := &instruction{as: AAUIPC, rd: addrReg} ++ ins.as, ins.rs1, ins.rs2, ins.imm = movToLoad(p.As), addrReg, obj.REG_NONE, 0 + inss = []*instruction{insAUIPC, ins} + + default: +-- +2.39.5 + diff --git a/2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch b/2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch new file mode 100644 index 0000000..77f8540 --- /dev/null +++ b/2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch @@ -0,0 +1,618 @@ +From c6be78d841d1c3fcf2b798598655bb9c8e4c1663 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 069/119] cmd/asm,cmd/internal/obj/riscv: implement vector + configuration setting instructions + +Implement vector configuration setting instructions (VSETVLI, +VSETIVLI, VSETL). These allow the vector length (vl) and vector +type (vtype) CSRs to be configured via a single instruction. +Unfortunately each instruction has its own dedicated encoding. 
+ +In the case of VSETVLI/VSETIVLI, the vector type is specified via +a series of special operands, which specify the selected element +width (E8, E16, E32, E64), the vector register group multiplier +(M1, M2, M4, M8, MF2, MF4, MF8), the vector tail policy (TU, TA) +and vector mask policy (MU, MA). Note that the order of these +special operands matches non-Go assemblers. + +Partially based on work by Pengcheng Wang . + +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Change-Id: I431f59c1e048a3e84754f0643a963da473a741fe +Reviewed-on: https://go-review.googlesource.com/c/go/+/631936 +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +--- + src/cmd/asm/internal/arch/arm64.go | 6 +- + src/cmd/asm/internal/arch/riscv64.go | 35 +++- + src/cmd/asm/internal/asm/asm.go | 21 +++ + src/cmd/asm/internal/asm/parse.go | 16 +- + src/cmd/asm/internal/asm/testdata/riscv64.s | 24 +++ + .../asm/internal/asm/testdata/riscv64error.s | 4 + + src/cmd/internal/obj/arm64/a.out.go | 4 +- + src/cmd/internal/obj/link.go | 3 +- + src/cmd/internal/obj/riscv/cpu.go | 71 +++++++++ + src/cmd/internal/obj/riscv/list.go | 9 ++ + src/cmd/internal/obj/riscv/obj.go | 149 ++++++++++++++++-- + src/cmd/internal/obj/util.go | 7 + + 12 files changed, 325 insertions(+), 24 deletions(-) + +diff --git a/src/cmd/asm/internal/arch/arm64.go b/src/cmd/asm/internal/arch/arm64.go +index e63601de64..87ccb8c040 100644 +--- a/src/cmd/asm/internal/arch/arm64.go ++++ b/src/cmd/asm/internal/arch/arm64.go +@@ -59,10 +59,10 @@ func jumpArm64(word string) bool { + + var arm64SpecialOperand map[string]arm64.SpecialOperand + +-// GetARM64SpecialOperand returns the internal representation of a special operand. +-func GetARM64SpecialOperand(name string) arm64.SpecialOperand { ++// ARM64SpecialOperand returns the internal representation of a special operand. ++func ARM64SpecialOperand(name string) arm64.SpecialOperand { + if arm64SpecialOperand == nil { +- // Generate the mapping automatically when the first time the function is called. ++ // Generate mapping when function is first called. + arm64SpecialOperand = map[string]arm64.SpecialOperand{} + for opd := arm64.SPOP_BEGIN; opd < arm64.SPOP_END; opd++ { + arm64SpecialOperand[opd.String()] = opd +diff --git a/src/cmd/asm/internal/arch/riscv64.go b/src/cmd/asm/internal/arch/riscv64.go +index 27a66c5e63..69e060a865 100644 +--- a/src/cmd/asm/internal/arch/riscv64.go ++++ b/src/cmd/asm/internal/arch/riscv64.go +@@ -13,9 +13,8 @@ import ( + "cmd/internal/obj/riscv" + ) + +-// IsRISCV64AMO reports whether the op (as defined by a riscv.A* +-// constant) is one of the AMO instructions that requires special +-// handling. ++// IsRISCV64AMO reports whether op is an AMO instruction that requires ++// special handling. + func IsRISCV64AMO(op obj.As) bool { + switch op { + case riscv.ASCW, riscv.ASCD, riscv.AAMOSWAPW, riscv.AAMOSWAPD, riscv.AAMOADDW, riscv.AAMOADDD, +@@ -26,3 +25,33 @@ func IsRISCV64AMO(op obj.As) bool { + } + return false + } ++ ++// IsRISCV64VTypeI reports whether op is a vtype immediate instruction that ++// requires special handling. ++func IsRISCV64VTypeI(op obj.As) bool { ++ return op == riscv.AVSETVLI || op == riscv.AVSETIVLI ++} ++ ++var riscv64SpecialOperand map[string]riscv.SpecialOperand ++ ++// RISCV64SpecialOperand returns the internal representation of a special operand. 
++func RISCV64SpecialOperand(name string) riscv.SpecialOperand { ++ if riscv64SpecialOperand == nil { ++ // Generate mapping when function is first called. ++ riscv64SpecialOperand = map[string]riscv.SpecialOperand{} ++ for opd := riscv.SPOP_BEGIN; opd < riscv.SPOP_END; opd++ { ++ riscv64SpecialOperand[opd.String()] = opd ++ } ++ } ++ if opd, ok := riscv64SpecialOperand[name]; ok { ++ return opd ++ } ++ return riscv.SPOP_END ++} ++ ++// RISCV64ValidateVectorType reports whether the given configuration is a ++// valid vector type. ++func RISCV64ValidateVectorType(vsew, vlmul, vtail, vmask int64) error { ++ _, err := riscv.EncodeVectorType(vsew, vlmul, vtail, vmask) ++ return err ++} +diff --git a/src/cmd/asm/internal/asm/asm.go b/src/cmd/asm/internal/asm/asm.go +index 223c613bd9..6a87813549 100644 +--- a/src/cmd/asm/internal/asm/asm.go ++++ b/src/cmd/asm/internal/asm/asm.go +@@ -905,6 +905,19 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) { + prog.To = a[5] + break + } ++ if p.arch.Family == sys.RISCV64 && arch.IsRISCV64VTypeI(op) { ++ prog.From = a[0] ++ vsew := p.getSpecial(prog, op, &a[1]) ++ vlmul := p.getSpecial(prog, op, &a[2]) ++ vtail := p.getSpecial(prog, op, &a[3]) ++ vmask := p.getSpecial(prog, op, &a[4]) ++ if err := arch.RISCV64ValidateVectorType(vsew, vlmul, vtail, vmask); err != nil { ++ p.errorf("invalid vtype: %v", err) ++ } ++ prog.AddRestSourceArgs([]obj.Addr{a[1], a[2], a[3], a[4]}) ++ prog.To = a[5] ++ break ++ } + fallthrough + default: + p.errorf("can't handle %s instruction with %d operands", op, len(a)) +@@ -955,3 +968,11 @@ func (p *Parser) getRegister(prog *obj.Prog, op obj.As, addr *obj.Addr) int16 { + } + return addr.Reg + } ++ ++// getSpecial checks that addr represents a special operand and returns its value. ++func (p *Parser) getSpecial(prog *obj.Prog, op obj.As, addr *obj.Addr) int64 { ++ if addr.Type != obj.TYPE_SPECIAL || addr.Name != 0 || addr.Reg != 0 || addr.Index != 0 { ++ p.errorf("%s: expected special operand; found %s", op, obj.Dconv(prog, addr)) ++ } ++ return addr.Offset ++} +diff --git a/src/cmd/asm/internal/asm/parse.go b/src/cmd/asm/internal/asm/parse.go +index ecee98593d..0d78a242c7 100644 +--- a/src/cmd/asm/internal/asm/parse.go ++++ b/src/cmd/asm/internal/asm/parse.go +@@ -20,6 +20,7 @@ import ( + "cmd/asm/internal/lex" + "cmd/internal/obj" + "cmd/internal/obj/arm64" ++ "cmd/internal/obj/riscv" + "cmd/internal/obj/x86" + "cmd/internal/src" + "cmd/internal/sys" +@@ -390,16 +391,21 @@ func (p *Parser) operand(a *obj.Addr) { + tok := p.next() + name := tok.String() + if tok.ScanToken == scanner.Ident && !p.atStartOfRegister(name) { ++ // See if this is an architecture specific special operand. + switch p.arch.Family { + case sys.ARM64: +- // arm64 special operands. +- if opd := arch.GetARM64SpecialOperand(name); opd != arm64.SPOP_END { ++ if opd := arch.ARM64SpecialOperand(name); opd != arm64.SPOP_END { + a.Type = obj.TYPE_SPECIAL + a.Offset = int64(opd) +- break + } +- fallthrough +- default: ++ case sys.RISCV64: ++ if opd := arch.RISCV64SpecialOperand(name); opd != riscv.SPOP_END { ++ a.Type = obj.TYPE_SPECIAL ++ a.Offset = int64(opd) ++ } ++ } ++ ++ if a.Type != obj.TYPE_SPECIAL { + // We have a symbol. 
Parse $sym±offset(symkind) + p.symbolReference(a, name, prefix) + } +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 588ad0f067..aba7a80007 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -424,6 +424,30 @@ start: + BSET $63, X9 // 9394f42b + BSETI $1, X10, X11 // 93151528 + ++ // ++ // "V" Standard Extension for Vector Operations, Version 1.0 ++ // ++ ++ // 31.6: Configuration Setting Instructions ++ VSETVLI X10, E8, M1, TU, MU, X12 // 57760500 ++ VSETVLI X10, E16, M1, TU, MU, X12 // 57768500 ++ VSETVLI X10, E32, M1, TU, MU, X12 // 57760501 ++ VSETVLI X10, E64, M1, TU, MU, X12 // 57768501 ++ VSETVLI X10, E32, M1, TU, MA, X12 // 57760509 ++ VSETVLI X10, E32, M1, TA, MA, X12 // 5776050d ++ VSETVLI X10, E32, M2, TA, MA, X12 // 5776150d ++ VSETVLI X10, E32, M4, TA, MA, X12 // 5776250d ++ VSETVLI X10, E32, M8, TA, MA, X12 // 5776350d ++ VSETVLI X10, E32, MF2, TA, MA, X12 // 5776550d ++ VSETVLI X10, E32, MF4, TA, MA, X12 // 5776650d ++ VSETVLI X10, E32, MF8, TA, MA, X12 // 5776750d ++ VSETVLI X10, E32, M1, TA, MA, X12 // 5776050d ++ VSETVLI $15, E32, M1, TA, MA, X12 // 57f607cd ++ VSETIVLI $0, E32, M1, TA, MA, X12 // 577600cd ++ VSETIVLI $15, E32, M1, TA, MA, X12 // 57f607cd ++ VSETIVLI $31, E32, M1, TA, MA, X12 // 57f60fcd ++ VSETVL X10, X11, X12 // 57f6a580 ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 0b0184aaa7..a90f22af9f 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -46,4 +46,8 @@ TEXT errors(SB),$0 + SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" + SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" + FNES F1, (X5) // ERROR "needs an integer register output" ++ VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" ++ VSETVL X10, X11 // ERROR "expected integer register in rs1 position" + RET +diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go +index fc170e737d..3a3d976639 100644 +--- a/src/cmd/internal/obj/arm64/a.out.go ++++ b/src/cmd/internal/obj/arm64/a.out.go +@@ -1052,8 +1052,8 @@ type SpecialOperand int + + const ( + // PRFM +- SPOP_PLDL1KEEP SpecialOperand = iota // must be the first one +- SPOP_BEGIN SpecialOperand = iota - 1 // set as the lower bound ++ SPOP_PLDL1KEEP SpecialOperand = obj.SpecialOperandARM64Base + iota // must be the first one ++ SPOP_BEGIN SpecialOperand = obj.SpecialOperandARM64Base + iota - 1 // set as the lower bound + SPOP_PLDL1STRM + SPOP_PLDL2KEEP + SPOP_PLDL2STRM +diff --git a/src/cmd/internal/obj/link.go b/src/cmd/internal/obj/link.go +index b12bf2399a..2b35554cdc 100644 +--- a/src/cmd/internal/obj/link.go ++++ b/src/cmd/internal/obj/link.go +@@ -97,7 +97,8 @@ import ( + // val = string + // + // +-// Special symbolic constants for ARM64, such as conditional flags, tlbi_op and so on. ++// Special symbolic constants for ARM64 (such as conditional flags, tlbi_op and so on) ++// and RISCV64 (such as names for vector configuration instruction arguments). 
+ // Encoding: + // type = TYPE_SPECIAL + // offset = The constant value corresponding to this symbol +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 29f7e913ed..9b88ff2ccd 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1217,6 +1217,77 @@ const ( + RM_RMM // Round to Nearest, ties to Max Magnitude + ) + ++type SpecialOperand int ++ ++const ( ++ SPOP_BEGIN SpecialOperand = obj.SpecialOperandRISCVBase ++ ++ // Vector mask policy. ++ SPOP_MA SpecialOperand = obj.SpecialOperandRISCVBase + iota - 1 ++ SPOP_MU ++ ++ // Vector tail policy. ++ SPOP_TA ++ SPOP_TU ++ ++ // Vector register group multiplier (VLMUL). ++ SPOP_M1 ++ SPOP_M2 ++ SPOP_M4 ++ SPOP_M8 ++ SPOP_MF2 ++ SPOP_MF4 ++ SPOP_MF8 ++ ++ // Vector selected element width (VSEW). ++ SPOP_E8 ++ SPOP_E16 ++ SPOP_E32 ++ SPOP_E64 ++ ++ SPOP_END ++) ++ ++var specialOperands = map[SpecialOperand]struct { ++ encoding uint32 ++ name string ++}{ ++ SPOP_MA: {encoding: 1, name: "MA"}, ++ SPOP_MU: {encoding: 0, name: "MU"}, ++ ++ SPOP_TA: {encoding: 1, name: "TA"}, ++ SPOP_TU: {encoding: 0, name: "TU"}, ++ ++ SPOP_M1: {encoding: 0, name: "M1"}, ++ SPOP_M2: {encoding: 1, name: "M2"}, ++ SPOP_M4: {encoding: 2, name: "M4"}, ++ SPOP_M8: {encoding: 3, name: "M8"}, ++ SPOP_MF2: {encoding: 5, name: "MF2"}, ++ SPOP_MF4: {encoding: 6, name: "MF4"}, ++ SPOP_MF8: {encoding: 7, name: "MF8"}, ++ ++ SPOP_E8: {encoding: 0, name: "E8"}, ++ SPOP_E16: {encoding: 1, name: "E16"}, ++ SPOP_E32: {encoding: 2, name: "E32"}, ++ SPOP_E64: {encoding: 3, name: "E64"}, ++} ++ ++func (so SpecialOperand) encode() uint32 { ++ op, ok := specialOperands[so] ++ if ok { ++ return op.encoding ++ } ++ return 0 ++} ++ ++func (so SpecialOperand) String() string { ++ op, ok := specialOperands[so] ++ if ok { ++ return op.name ++ } ++ return "" ++} ++ + // All unary instructions which write to their arguments (as opposed to reading + // from them) go here. The assembly parser uses this information to populate + // its AST in a semantically reasonable way. +diff --git a/src/cmd/internal/obj/riscv/list.go b/src/cmd/internal/obj/riscv/list.go +index c5b7e80719..8eb97a476d 100644 +--- a/src/cmd/internal/obj/riscv/list.go ++++ b/src/cmd/internal/obj/riscv/list.go +@@ -14,6 +14,7 @@ func init() { + obj.RegisterRegister(obj.RBaseRISCV, REG_END, RegName) + obj.RegisterOpcode(obj.ABaseRISCV, Anames) + obj.RegisterOpSuffix("riscv64", opSuffixString) ++ obj.RegisterSpecialOperands(int64(SPOP_BEGIN), int64(SPOP_END), specialOperandConv) + } + + func RegName(r int) string { +@@ -49,3 +50,11 @@ func opSuffixString(s uint8) string { + } + return fmt.Sprintf(".%s", ss) + } ++ ++func specialOperandConv(a int64) string { ++ spc := SpecialOperand(a) ++ if spc >= SPOP_BEGIN && spc < SPOP_END { ++ return spc.String() ++ } ++ return "SPC_??" ++} +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 2e582eb9cb..3d1c120baa 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1046,27 +1046,35 @@ func immEven(x int64) error { + return nil + } + +-// immIFits checks whether the immediate value x fits in nbits bits +-// as a signed integer. If it does not, an error is returned. 
+-func immIFits(x int64, nbits uint) error { +- nbits-- +- min := int64(-1) << nbits +- max := int64(1)< max { + if nbits <= 16 { +- return fmt.Errorf("signed immediate %d must be in range [%d, %d] (%d bits)", x, min, max, nbits) ++ return fmt.Errorf("%s immediate %d must be in range [%d, %d] (%d bits)", label, x, min, max, nbits) + } +- return fmt.Errorf("signed immediate %#x must be in range [%#x, %#x] (%d bits)", x, min, max, nbits) ++ return fmt.Errorf("%s immediate %#x must be in range [%#x, %#x] (%d bits)", label, x, min, max, nbits) + } + return nil + } + ++// immIFits checks whether the immediate value x fits in nbits bits ++// as a signed integer. If it does not, an error is returned. ++func immIFits(x int64, nbits uint) error { ++ return immFits(x, nbits, true) ++} ++ + // immI extracts the signed integer of the specified size from an immediate. + func immI(as obj.As, imm int64, nbits uint) uint32 { + if err := immIFits(imm, nbits); err != nil { + panic(fmt.Sprintf("%v: %v", as, err)) + } +- return uint32(imm) ++ return uint32(imm) & ((1 << nbits) - 1) + } + + func wantImmI(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) { +@@ -1075,6 +1083,26 @@ func wantImmI(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) { + } + } + ++// immUFits checks whether the immediate value x fits in nbits bits ++// as an unsigned integer. If it does not, an error is returned. ++func immUFits(x int64, nbits uint) error { ++ return immFits(x, nbits, false) ++} ++ ++// immU extracts the unsigned integer of the specified size from an immediate. ++func immU(as obj.As, imm int64, nbits uint) uint32 { ++ if err := immUFits(imm, nbits); err != nil { ++ panic(fmt.Sprintf("%v: %v", as, err)) ++ } ++ return uint32(imm) & ((1 << nbits) - 1) ++} ++ ++func wantImmU(ctxt *obj.Link, ins *instruction, imm int64, nbits uint) { ++ if err := immUFits(imm, nbits); err != nil { ++ ctxt.Diag("%v: %v", ins, err) ++ } ++} ++ + func wantReg(ctxt *obj.Link, ins *instruction, pos string, descr string, r, min, max uint32) { + if r < min || r > max { + var suffix string +@@ -1231,6 +1259,29 @@ func validateJ(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateVsetvli(ctxt *obj.Link, ins *instruction) { ++ wantImmU(ctxt, ins, ins.imm, 11) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateVsetivli(ctxt *obj.Link, ins *instruction) { ++ wantImmU(ctxt, ins, ins.imm, 10) ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantImmU(ctxt, ins, int64(ins.rs1), 5) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateVsetvl(ctxt *obj.Link, ins *instruction) { ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRaw(ctxt *obj.Link, ins *instruction) { + // Treat the raw value specially as a 32-bit unsigned integer. + // Nobody wants to enter negative machine code. 
+@@ -1419,6 +1470,29 @@ func encodeCJImmediate(imm uint32) uint32 { + return bits << 2 + } + ++func encodeVset(as obj.As, rs1, rs2, rd uint32) uint32 { ++ enc := encode(as) ++ if enc == nil { ++ panic("encodeVset: could not encode instruction") ++ } ++ return enc.funct7<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | rd<<7 | enc.opcode ++} ++ ++func encodeVsetvli(ins *instruction) uint32 { ++ vtype := immU(ins.as, ins.imm, 11) ++ return encodeVset(ins.as, regI(ins.rs1), vtype, regI(ins.rd)) ++} ++ ++func encodeVsetivli(ins *instruction) uint32 { ++ vtype := immU(ins.as, ins.imm, 10) ++ avl := immU(ins.as, int64(ins.rs1), 5) ++ return encodeVset(ins.as, avl, vtype, regI(ins.rd)) ++} ++ ++func encodeVsetvl(ins *instruction) uint32 { ++ return encodeVset(ins.as, regI(ins.rs1), regI(ins.rs2), regI(ins.rd)) ++} ++ + func encodeRawIns(ins *instruction) uint32 { + // Treat the raw value specially as a 32-bit unsigned integer. + // Nobody wants to enter negative machine code. +@@ -1489,6 +1563,27 @@ func EncodeUImmediate(imm int64) (int64, error) { + return imm << 12, nil + } + ++func EncodeVectorType(vsew, vlmul, vtail, vmask int64) (int64, error) { ++ vsewSO := SpecialOperand(vsew) ++ if vsewSO < SPOP_E8 || vsewSO > SPOP_E64 { ++ return -1, fmt.Errorf("invalid vector selected element width %q", vsewSO) ++ } ++ vlmulSO := SpecialOperand(vlmul) ++ if vlmulSO < SPOP_M1 || vlmulSO > SPOP_MF8 { ++ return -1, fmt.Errorf("invalid vector register group multiplier %q", vlmulSO) ++ } ++ vtailSO := SpecialOperand(vtail) ++ if vtailSO != SPOP_TA && vtailSO != SPOP_TU { ++ return -1, fmt.Errorf("invalid vector tail policy %q", vtailSO) ++ } ++ vmaskSO := SpecialOperand(vmask) ++ if vmaskSO != SPOP_MA && vmaskSO != SPOP_MU { ++ return -1, fmt.Errorf("invalid vector mask policy %q", vmaskSO) ++ } ++ vtype := vmaskSO.encode()<<7 | vtailSO.encode()<<6 | vsewSO.encode()<<3 | vlmulSO.encode() ++ return int64(vtype), nil ++} ++ + type encoding struct { + encode func(*instruction) uint32 // encode returns the machine code for an instruction + validate func(*obj.Link, *instruction) // validate validates an instruction +@@ -1526,6 +1621,11 @@ var ( + uEncoding = encoding{encode: encodeU, validate: validateU, length: 4} + jEncoding = encoding{encode: encodeJ, validate: validateJ, length: 4} + ++ // Encodings for vector configuration setting instruction. ++ vsetvliEncoding = encoding{encode: encodeVsetvli, validate: validateVsetvli, length: 4} ++ vsetivliEncoding = encoding{encode: encodeVsetivli, validate: validateVsetivli, length: 4} ++ vsetvlEncoding = encoding{encode: encodeVsetvl, validate: validateVsetvl, length: 4} ++ + // rawEncoding encodes a raw instruction byte sequence. + rawEncoding = encoding{encode: encodeRawIns, validate: validateRaw, length: 4} + +@@ -1792,6 +1892,15 @@ var instructions = [ALAST & obj.AMask]instructionData{ + ABSET & obj.AMask: {enc: rIIIEncoding, immForm: ABSETI, ternary: true}, + ABSETI & obj.AMask: {enc: iIIEncoding, ternary: true}, + ++ // ++ // "V" Standard Extension for Vector Operations, Version 1.0 ++ // ++ ++ // 31.6. 
Vector Configuration-Setting Instructions ++ AVSETVLI & obj.AMask: {enc: vsetvliEncoding, immForm: AVSETIVLI}, ++ AVSETIVLI & obj.AMask: {enc: vsetivliEncoding}, ++ AVSETVL & obj.AMask: {enc: vsetvlEncoding}, ++ + // + // Privileged ISA + // +@@ -2356,7 +2465,12 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins := instructionForProg(p) + inss := []*instruction{ins} + +- if len(p.RestArgs) > 1 { ++ if ins.as == AVSETVLI || ins.as == AVSETIVLI { ++ if len(p.RestArgs) != 4 { ++ p.Ctxt.Diag("incorrect number of arguments for instruction") ++ return nil ++ } ++ } else if len(p.RestArgs) > 1 { + p.Ctxt.Diag("too many source registers") + return nil + } +@@ -2594,6 +2708,21 @@ func instructionsForProg(p *obj.Prog) []*instruction { + // XNOR -> (NOT (XOR x y)) + ins.as = AXOR + inss = append(inss, &instruction{as: AXORI, rs1: ins.rd, rs2: obj.REG_NONE, rd: ins.rd, imm: -1}) ++ ++ case AVSETVLI, AVSETIVLI: ++ ins.rs1, ins.rs2 = ins.rs2, obj.REG_NONE ++ vtype, err := EncodeVectorType(p.RestArgs[0].Offset, p.RestArgs[1].Offset, p.RestArgs[2].Offset, p.RestArgs[3].Offset) ++ if err != nil { ++ p.Ctxt.Diag("%v: %v", p, err) ++ } ++ ins.imm = int64(vtype) ++ if ins.as == AVSETIVLI { ++ if p.From.Type != obj.TYPE_CONST { ++ p.Ctxt.Diag("%v: expected immediate value", p) ++ } ++ ins.rs1 = uint32(p.From.Offset) ++ } ++ + } + + for _, ins := range inss { +diff --git a/src/cmd/internal/obj/util.go b/src/cmd/internal/obj/util.go +index 3a071c21d4..367535f863 100644 +--- a/src/cmd/internal/obj/util.go ++++ b/src/cmd/internal/obj/util.go +@@ -584,6 +584,13 @@ type spcSet struct { + + var spcSpace []spcSet + ++// Each architecture is allotted a distinct subspace: [Lo, Hi) for declaring its ++// arch-specific special operands. ++const ( ++ SpecialOperandARM64Base = 0 << 16 ++ SpecialOperandRISCVBase = 1 << 16 ++) ++ + // RegisterSpecialOperands binds a pretty-printer (SPCconv) for special + // operand numbers to a given special operand number range. Lo is inclusive, + // hi is exclusive (valid special operands are lo through hi-1). +-- +2.39.5 + diff --git a/2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch b/2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch new file mode 100644 index 0000000..1456ee5 --- /dev/null +++ b/2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch @@ -0,0 +1,160 @@ +From e4c46374f5cbd543dccfa0fb346503a7a46a34ef Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 070/119] internal/bytealg: clean up and simplify the riscv64 + equal implementation + +Now that riscv64 is only regabi, remove the entrypoint separation and +have runtime.memequal_varlen call runtime.memequal. Add a zero byte +length check and replace the equal and not equal exit paths with a +single exit path that conditions on length reaching zero. 
+ +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Change-Id: Ida4e54378daa7fd423f759753eba04ce513a27cb +Reviewed-on: https://go-review.googlesource.com/c/go/+/648855 +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Michael Knyszek +Reviewed-by: Cherry Mui +--- + src/internal/bytealg/equal_riscv64.s | 62 +++++++++++++--------------- + 1 file changed, 29 insertions(+), 33 deletions(-) + +diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s +index 7f470ce0a0..87b2d79302 100644 +--- a/src/internal/bytealg/equal_riscv64.s ++++ b/src/internal/bytealg/equal_riscv64.s +@@ -7,25 +7,23 @@ + + #define CTXT S10 + +-// func memequal(a, b unsafe.Pointer, size uintptr) bool +-TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 +- // X10 = a_base +- // X11 = b_base +- // X12 = size +- JMP memequal<>(SB) +- + // func memequal_varlen(a, b unsafe.Pointer) bool + TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17 ++ // X10 = a_base ++ // X11 = b_base + MOV 8(CTXT), X12 // compiler stores size at offset 8 in the closure ++ JMP runtime·memequal(SB) ++ ++// func memequal(a, b unsafe.Pointer, size uintptr) bool ++TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 + // X10 = a_base + // X11 = b_base +- JMP memequal<>(SB) ++ // X12 = size ++ BNE X10, X11, length_check ++ MOV $0, X12 + +-// On entry X10 and X11 contain pointers, X12 contains length. +-// For non-regabi X13 contains address for return value. +-// For regabi return value in X10. +-TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0 +- BEQ X10, X11, eq ++length_check: ++ BEQZ X12, done + + MOV $32, X23 + BLT X12, X23, loop4_check +@@ -44,7 +42,7 @@ align: + SUB $1, X9 + MOVBU 0(X10), X19 + MOVBU 0(X11), X20 +- BNE X19, X20, not_eq ++ BNE X19, X20, done + ADD $1, X10 + ADD $1, X11 + BNEZ X9, align +@@ -57,19 +55,19 @@ loop32: + MOV 0(X11), X20 + MOV 8(X10), X21 + MOV 8(X11), X22 +- BNE X19, X20, not_eq +- BNE X21, X22, not_eq ++ BNE X19, X20, done ++ BNE X21, X22, done + MOV 16(X10), X14 + MOV 16(X11), X15 + MOV 24(X10), X16 + MOV 24(X11), X17 +- BNE X14, X15, not_eq +- BNE X16, X17, not_eq ++ BNE X14, X15, done ++ BNE X16, X17, done + ADD $32, X10 + ADD $32, X11 + SUB $32, X12 + BGE X12, X9, loop32 +- BEQZ X12, eq ++ BEQZ X12, done + + loop16_check: + MOV $16, X23 +@@ -79,13 +77,13 @@ loop16: + MOV 0(X11), X20 + MOV 8(X10), X21 + MOV 8(X11), X22 +- BNE X19, X20, not_eq +- BNE X21, X22, not_eq ++ BNE X19, X20, done ++ BNE X21, X22, done + ADD $16, X10 + ADD $16, X11 + SUB $16, X12 + BGE X12, X23, loop16 +- BEQZ X12, eq ++ BEQZ X12, done + + loop4_check: + MOV $4, X23 +@@ -95,32 +93,30 @@ loop4: + MOVBU 0(X11), X20 + MOVBU 1(X10), X21 + MOVBU 1(X11), X22 +- BNE X19, X20, not_eq +- BNE X21, X22, not_eq ++ BNE X19, X20, done ++ BNE X21, X22, done + MOVBU 2(X10), X14 + MOVBU 2(X11), X15 + MOVBU 3(X10), X16 + MOVBU 3(X11), X17 +- BNE X14, X15, not_eq +- BNE X16, X17, not_eq ++ BNE X14, X15, done ++ BNE X16, X17, done + ADD $4, X10 + ADD $4, X11 + SUB $4, X12 + BGE X12, X23, loop4 + + loop1: +- BEQZ X12, eq ++ BEQZ X12, done + MOVBU 0(X10), X19 + MOVBU 0(X11), X20 +- BNE X19, X20, not_eq ++ BNE X19, X20, done + ADD $1, X10 + ADD $1, X11 + SUB $1, X12 + JMP loop1 + +-not_eq: +- MOVB ZERO, X10 +- RET +-eq: +- MOV $1, X10 ++done: ++ // If X12 is zero then memory is equivalent. 
++ SEQZ X12, X10 + RET +-- +2.39.5 + diff --git a/2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch b/2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch new file mode 100644 index 0000000..31903aa --- /dev/null +++ b/2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch @@ -0,0 +1,126 @@ +From fd8c0c4bd2cbc86ae57f517398792416f7a497c3 Mon Sep 17 00:00:00 2001 +From: Jes Cok +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 071/119] =?UTF-8?q?bytes,internal/bytealg:=20eliminate=20H?= + =?UTF-8?q?ashStrBytes,HashStrRevBytes=20using=20=E2=80=A6?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +…generics + +The logic of HashStrBytes, HashStrRevBytes and HashStr, HashStrRev, +are exactly the same, except that the types are different. + +Since the bootstrap toolchain is bumped to 1.20, we can eliminate them +by using generics. + +Change-Id: I4336b1cab494ba963f09646c169b45f6b1ee62e3 +GitHub-Last-Rev: b11a2bf9476d54bed4bd18a3f9269b5c95a66d67 +GitHub-Pull-Request: golang/go#63766 +Reviewed-on: https://go-review.googlesource.com/c/go/+/538175 +Reviewed-by: Keith Randall +Reviewed-by: David Chase +Reviewed-by: Keith Randall +Auto-Submit: Keith Randall +LUCI-TryBot-Result: Go LUCI +--- + src/bytes/bytes.go | 2 +- + src/internal/bytealg/bytealg.go | 46 +++++---------------------------- + 2 files changed, 7 insertions(+), 41 deletions(-) + +diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go +index c54e52e4fc..c662b1cae6 100644 +--- a/src/bytes/bytes.go ++++ b/src/bytes/bytes.go +@@ -122,7 +122,7 @@ func LastIndex(s, sep []byte) int { + return -1 + } + // Rabin-Karp search from the end of the string +- hashss, pow := bytealg.HashStrRevBytes(sep) ++ hashss, pow := bytealg.HashStrRev(sep) + last := len(s) - n + var h uint32 + for i := len(s) - 1; i >= last; i-- { +diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go +index 28f2742c0e..ae4b8b48d2 100644 +--- a/src/internal/bytealg/bytealg.go ++++ b/src/internal/bytealg/bytealg.go +@@ -24,33 +24,16 @@ const ( + // If MaxLen is not 0, make sure MaxLen >= 4. + var MaxLen int + +-// FIXME: the logic of HashStrBytes, HashStrRevBytes, IndexRabinKarpBytes and HashStr, HashStrRev, +-// IndexRabinKarp are exactly the same, except that the types are different. Can we eliminate +-// three of them without causing allocation? ++// FIXME: the logic of IndexRabinKarpBytes and IndexRabinKarp are exactly the same, ++// except that the types are different. ++// Can we eliminate one of them without causing allocation? + + // PrimeRK is the prime base used in Rabin-Karp algorithm. + const PrimeRK = 16777619 + +-// HashStrBytes returns the hash and the appropriate multiplicative +-// factor for use in Rabin-Karp algorithm. +-func HashStrBytes(sep []byte) (uint32, uint32) { +- hash := uint32(0) +- for i := 0; i < len(sep); i++ { +- hash = hash*PrimeRK + uint32(sep[i]) +- } +- var pow, sq uint32 = 1, PrimeRK +- for i := len(sep); i > 0; i >>= 1 { +- if i&1 != 0 { +- pow *= sq +- } +- sq *= sq +- } +- return hash, pow +-} +- + // HashStr returns the hash and the appropriate multiplicative + // factor for use in Rabin-Karp algorithm. 
+-func HashStr(sep string) (uint32, uint32) { ++func HashStr[T string | []byte](sep T) (uint32, uint32) { + hash := uint32(0) + for i := 0; i < len(sep); i++ { + hash = hash*PrimeRK + uint32(sep[i]) +@@ -65,26 +48,9 @@ func HashStr(sep string) (uint32, uint32) { + return hash, pow + } + +-// HashStrRevBytes returns the hash of the reverse of sep and the +-// appropriate multiplicative factor for use in Rabin-Karp algorithm. +-func HashStrRevBytes(sep []byte) (uint32, uint32) { +- hash := uint32(0) +- for i := len(sep) - 1; i >= 0; i-- { +- hash = hash*PrimeRK + uint32(sep[i]) +- } +- var pow, sq uint32 = 1, PrimeRK +- for i := len(sep); i > 0; i >>= 1 { +- if i&1 != 0 { +- pow *= sq +- } +- sq *= sq +- } +- return hash, pow +-} +- + // HashStrRev returns the hash of the reverse of sep and the + // appropriate multiplicative factor for use in Rabin-Karp algorithm. +-func HashStrRev(sep string) (uint32, uint32) { ++func HashStrRev[T string | []byte](sep T) (uint32, uint32) { + hash := uint32(0) + for i := len(sep) - 1; i >= 0; i-- { + hash = hash*PrimeRK + uint32(sep[i]) +@@ -103,7 +69,7 @@ func HashStrRev(sep string) (uint32, uint32) { + // first occurrence of substr in s, or -1 if not present. + func IndexRabinKarpBytes(s, sep []byte) int { + // Rabin-Karp search +- hashsep, pow := HashStrBytes(sep) ++ hashsep, pow := HashStr(sep) + n := len(sep) + var h uint32 + for i := 0; i < n; i++ { +-- +2.39.5 + diff --git a/2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch b/2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch new file mode 100644 index 0000000..6b0afd7 --- /dev/null +++ b/2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch @@ -0,0 +1,539 @@ +From d2e44d94537e9ee96d9b5909f575ae9eb15422d7 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 072/119] cmd/internal/obj/riscv: implement vector load/store + instructions + +Implement vector unit stride, vector strided, vector indexed and +vector whole register load and store instructions. + +The vector unit stride instructions take an optional vector mask +register, which if specified must be register V0. If only two +operands are given, the instruction is encoded as unmasked. + +The vector strided and vector indexed instructions also take an +optional vector mask register, which if specified must be register +V0. If only three operands are given, the instruction is encoded as +unmasked. 
+ +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Change-Id: I35e43bb8f1cf6ae8826fbeec384b95ac945da50f +Reviewed-on: https://go-review.googlesource.com/c/go/+/631937 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Michael Knyszek +Reviewed-by: Meng Zhuo +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Pengcheng Wang +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 98 ++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 39 +++ + src/cmd/internal/obj/riscv/anames.go | 4 + + src/cmd/internal/obj/riscv/cpu.go | 4 + + src/cmd/internal/obj/riscv/obj.go | 226 ++++++++++++++++-- + 5 files changed, 356 insertions(+), 15 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index aba7a80007..49f3ac00f3 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -448,6 +448,104 @@ start: + VSETIVLI $31, E32, M1, TA, MA, X12 // 57f60fcd + VSETVL X10, X11, X12 // 57f6a580 + ++ // 31.7.4: Vector Unit-Stride Instructions ++ VLE8V (X10), V3 // 87010502 ++ VLE8V (X10), V0, V3 // 87010500 ++ VLE16V (X10), V3 // 87510502 ++ VLE16V (X10), V0, V3 // 87510500 ++ VLE32V (X10), V3 // 87610502 ++ VLE32V (X10), V0, V3 // 87610500 ++ VLE64V (X10), V3 // 87710502 ++ VLE64V (X10), V0, V3 // 87710500 ++ VSE8V V3, (X10) // a7010502 ++ VSE8V V3, V0, (X10) // a7010500 ++ VSE16V V3, (X10) // a7510502 ++ VSE16V V3, V0, (X10) // a7510500 ++ VSE32V V3, (X10) // a7610502 ++ VSE32V V3, V0, (X10) // a7610500 ++ VSE64V V3, (X10) // a7710502 ++ VSE64V V3, V0, (X10) // a7710500 ++ VLMV (X10), V3 // 8701b502 ++ VSMV V3, (X10) // a701b502 ++ ++ // 31.7.5: Vector Strided Instructions ++ VLSE8V (X10), X11, V3 // 8701b50a ++ VLSE8V (X10), X11, V0, V3 // 8701b508 ++ VLSE16V (X10), X11, V3 // 8751b50a ++ VLSE16V (X10), X11, V0, V3 // 8751b508 ++ VLSE32V (X10), X11, V3 // 8761b50a ++ VLSE32V (X10), X11, V0, V3 // 8761b508 ++ VLSE64V (X10), X11, V3 // 8771b50a ++ VLSE64V (X10), X11, V0, V3 // 8771b508 ++ VSSE8V V3, X11, (X10) // a701b50a ++ VSSE8V V3, X11, V0, (X10) // a701b508 ++ VSSE16V V3, X11, (X10) // a751b50a ++ VSSE16V V3, X11, V0, (X10) // a751b508 ++ VSSE32V V3, X11, (X10) // a761b50a ++ VSSE32V V3, X11, V0, (X10) // a761b508 ++ VSSE64V V3, X11, (X10) // a771b50a ++ VSSE64V V3, X11, V0, (X10) // a771b508 ++ ++ // 31.7.6: Vector Indexed Instructions ++ VLUXEI8V (X10), V2, V3 // 87012506 ++ VLUXEI8V (X10), V2, V0, V3 // 87012504 ++ VLUXEI16V (X10), V2, V3 // 87512506 ++ VLUXEI16V (X10), V2, V0, V3 // 87512504 ++ VLUXEI32V (X10), V2, V3 // 87612506 ++ VLUXEI32V (X10), V2, V0, V3 // 87612504 ++ VLUXEI64V (X10), V2, V3 // 87712506 ++ VLUXEI64V (X10), V2, V0, V3 // 87712504 ++ VLOXEI8V (X10), V2, V3 // 8701250e ++ VLOXEI8V (X10), V2, V0, V3 // 8701250c ++ VLOXEI16V (X10), V2, V3 // 8751250e ++ VLOXEI16V (X10), V2, V0, V3 // 8751250c ++ VLOXEI32V (X10), V2, V3 // 8761250e ++ VLOXEI32V (X10), V2, V0, V3 // 8761250c ++ VLOXEI64V (X10), V2, V3 // 8771250e ++ VLOXEI64V (X10), V2, V0, V3 // 8771250c ++ VSUXEI8V V3, V2, (X10) // a7012506 ++ VSUXEI8V V3, V2, V0, (X10) // a7012504 ++ VSUXEI16V V3, V2, (X10) // a7512506 ++ VSUXEI16V V3, V2, V0, (X10) // a7512504 ++ VSUXEI32V V3, V2, (X10) // a7612506 ++ VSUXEI32V V3, V2, V0, (X10) // a7612504 ++ VSUXEI64V V3, V2, (X10) // a7712506 ++ VSUXEI64V V3, V2, V0, (X10) // a7712504 ++ VSOXEI8V V3, V2, (X10) // a701250e ++ VSOXEI8V V3, V2, V0, (X10) // a701250c ++ VSOXEI16V V3, V2, (X10) // a751250e ++ VSOXEI16V V3, V2, V0, (X10) // a751250c ++ VSOXEI32V 
V3, V2, (X10) // a761250e ++ VSOXEI32V V3, V2, V0, (X10) // a761250c ++ VSOXEI64V V3, V2, (X10) // a771250e ++ VSOXEI64V V3, V2, V0, (X10) // a771250c ++ ++ // 31.7.9: Vector Load/Store Whole Register Instructions ++ VL1RV (X10), V3 // 87018502 ++ VL1RE8V (X10), V3 // 87018502 ++ VL1RE16V (X10), V3 // 87518502 ++ VL1RE32V (X10), V3 // 87618502 ++ VL1RE64V (X10), V3 // 87718502 ++ VL2RV (X10), V2 // 07018522 ++ VL2RE8V (X10), V2 // 07018522 ++ VL2RE16V (X10), V2 // 07518522 ++ VL2RE32V (X10), V2 // 07618522 ++ VL2RE64V (X10), V2 // 07718522 ++ VL4RV (X10), V4 // 07028562 ++ VL4RE8V (X10), V4 // 07028562 ++ VL4RE16V (X10), V4 // 07528562 ++ VL4RE32V (X10), V4 // 07628562 ++ VL4RE64V (X10), V4 // 07728562 ++ VL8RV (X10), V8 // 070485e2 ++ VL8RE8V (X10), V8 // 070485e2 ++ VL8RE16V (X10), V8 // 075485e2 ++ VL8RE32V (X10), V8 // 076485e2 ++ VL8RE64V (X10), V8 // 077485e2 ++ VS1RV V3, (X11) // a7818502 ++ VS2RV V2, (X11) // 27818522 ++ VS4RV V4, (X11) // 27828562 ++ VS8RV V8, (X11) // 278485e2 ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index a90f22af9f..82a2348894 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -50,4 +50,43 @@ TEXT errors(SB),$0 + VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" + VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" + VSETVL X10, X11 // ERROR "expected integer register in rs1 position" ++ VLE8V (X10), X10 // ERROR "expected vector register in rd position" ++ VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" ++ VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" ++ VSE8V X10, (X10) // ERROR "expected vector register in rs1 position" ++ VSE8V V3, (V1) // ERROR "expected integer register in rd position" ++ VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" ++ VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" ++ VLSE8V (X10), X10, X11 // ERROR "expected vector register in rd position" ++ VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" ++ VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" ++ VLSE8V (X10), X10, V1, V3 // ERROR "invalid vector mask register" ++ VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" ++ VSSE8V X10, X11, (X10) // ERROR "expected vector register in rd position" ++ VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" ++ VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" ++ VSSE8V V3, X11, V1, (X10) // ERROR "invalid vector mask register" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" ++ VLUXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" ++ VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" ++ VSUXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" ++ VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLOXEI8V (V1), V2, V3 // ERROR "expected integer 
register in rs1 position" ++ VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" ++ VLOXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" ++ VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" ++ VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" ++ VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" ++ VL1RV (X10), X10 // ERROR "expected vector register in rd position" ++ VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" ++ VS1RV V3, V0, (X11) // ERROR "too many operands for instruction" ++ VS1RV X11, (X11) // ERROR "expected vector register in rs1 position" ++ VS1RV V3, (V1) // ERROR "expected integer register in rd position" + RET +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index c49569c943..6df5f0a173 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -650,5 +650,9 @@ var Anames = []string{ + "RDTIME", + "SEQZ", + "SNEZ", ++ "VL1RV", ++ "VL2RV", ++ "VL4RV", ++ "VL8RV", + "LAST", + } +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 9b88ff2ccd..8999ef149b 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1168,6 +1168,10 @@ const ( + ARDTIME + ASEQZ + ASNEZ ++ AVL1RV ++ AVL2RV ++ AVL4RV ++ AVL8RV + + // End marker + ALAST +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 3d1c120baa..a558dc3596 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1213,6 +1213,27 @@ func validateIF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateIIIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateIVIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateSI(ctxt *obj.Link, ins *instruction) { + wantImmI(ctxt, ins, ins.imm, 12) + wantIntReg(ctxt, ins, "rd", ins.rd) +@@ -1229,6 +1250,27 @@ func validateSF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateSV(ctxt *obj.Link, ins *instruction) { ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "rs1", ins.rs1) ++ wantNoneReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateSVII(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateSVIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func 
validateB(ctxt *obj.Link, ins *instruction) { + // Offsets are multiples of two, so accept 13 bit immediates for the + // 12 bit slot. We implicitly drop the least significant bit in encodeB. +@@ -1305,7 +1347,10 @@ func encodeR(as obj.As, rs1, rs2, rd, funct3, funct7 uint32) uint32 { + if enc.rs2 != 0 && rs2 != 0 { + panic("encodeR: instruction uses rs2, but rs2 was nonzero") + } +- return funct7<<25 | enc.funct7<<25 | enc.rs2<<20 | rs2<<20 | rs1<<15 | enc.funct3<<12 | funct3<<12 | rd<<7 | enc.opcode ++ funct3 |= enc.funct3 ++ funct7 |= enc.funct7 ++ rs2 |= enc.rs2 ++ return funct7<<25 | rs2<<20 | rs1<<15 | funct3<<12 | rd<<7 | enc.opcode + } + + // encodeR4 encodes an R4-type RISC-V instruction. +@@ -1357,38 +1402,67 @@ func encodeRFF(ins *instruction) uint32 { + } + + // encodeI encodes an I-type RISC-V instruction. +-func encodeI(as obj.As, rs1, rd, imm uint32) uint32 { ++func encodeI(as obj.As, rs1, rd, imm, funct7 uint32) uint32 { + enc := encode(as) + if enc == nil { + panic("encodeI: could not encode instruction") + } + imm |= uint32(enc.csr) +- return imm<<20 | rs1<<15 | enc.funct3<<12 | rd<<7 | enc.opcode ++ return funct7<<25 | imm<<20 | rs1<<15 | enc.funct3<<12 | rd<<7 | enc.opcode + } + + func encodeIII(ins *instruction) uint32 { +- return encodeI(ins.as, regI(ins.rs1), regI(ins.rd), uint32(ins.imm)) ++ return encodeI(ins.as, regI(ins.rs1), regI(ins.rd), uint32(ins.imm), 0) + } + + func encodeIF(ins *instruction) uint32 { +- return encodeI(ins.as, regI(ins.rs1), regF(ins.rd), uint32(ins.imm)) ++ return encodeI(ins.as, regI(ins.rs1), regF(ins.rd), uint32(ins.imm), 0) ++} ++ ++func encodeIV(ins *instruction) uint32 { ++ return encodeI(ins.as, regI(ins.rs1), regV(ins.rd), uint32(ins.imm), ins.funct7) ++} ++ ++func encodeIIIV(ins *instruction) uint32 { ++ return encodeI(ins.as, regI(ins.rs1), regV(ins.rd), regI(ins.rs2), ins.funct7) ++} ++ ++func encodeIVIV(ins *instruction) uint32 { ++ return encodeI(ins.as, regI(ins.rs1), regV(ins.rd), regV(ins.rs2), ins.funct7) + } + + // encodeS encodes an S-type RISC-V instruction. +-func encodeS(as obj.As, rs1, rs2, imm uint32) uint32 { ++func encodeS(as obj.As, rs1, rs2, imm, funct7 uint32) uint32 { + enc := encode(as) + if enc == nil { + panic("encodeS: could not encode instruction") + } +- return (imm>>5)<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | (imm&0x1f)<<7 | enc.opcode ++ if enc.rs2 != 0 && rs2 != 0 { ++ panic("encodeS: instruction uses rs2, but rs2 was nonzero") ++ } ++ rs2 |= enc.rs2 ++ imm |= uint32(enc.csr) &^ 0x1f ++ return funct7<<25 | (imm>>5)<<25 | rs2<<20 | rs1<<15 | enc.funct3<<12 | (imm&0x1f)<<7 | enc.opcode + } + + func encodeSI(ins *instruction) uint32 { +- return encodeS(ins.as, regI(ins.rd), regI(ins.rs1), uint32(ins.imm)) ++ return encodeS(ins.as, regI(ins.rd), regI(ins.rs1), uint32(ins.imm), 0) + } + + func encodeSF(ins *instruction) uint32 { +- return encodeS(ins.as, regI(ins.rd), regF(ins.rs1), uint32(ins.imm)) ++ return encodeS(ins.as, regI(ins.rd), regF(ins.rs1), uint32(ins.imm), 0) ++} ++ ++func encodeSV(ins *instruction) uint32 { ++ return encodeS(ins.as, regI(ins.rd), 0, regV(ins.rs1), ins.funct7) ++} ++ ++func encodeSVII(ins *instruction) uint32 { ++ return encodeS(ins.as, regI(ins.rs1), regI(ins.rs2), regV(ins.rd), ins.funct7) ++} ++ ++func encodeSVIV(ins *instruction) uint32 { ++ return encodeS(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct7) + } + + // encodeBImmediate encodes an immediate for a B-type RISC-V instruction. +@@ -1595,7 +1669,7 @@ var ( + // + // 1. 
the instruction encoding (R/I/S/B/U/J), in lowercase + // 2. zero or more register operand identifiers (I = integer +- // register, F = float register), in uppercase ++ // register, F = float register, V = vector register), in uppercase + // 3. the word "Encoding" + // + // For example, rIIIEncoding indicates an R-type instruction with two +@@ -1611,11 +1685,17 @@ var ( + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} + rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} + +- iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4} +- iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} ++ iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4} ++ iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} ++ iVEncoding = encoding{encode: encodeIV, validate: validateIV, length: 4} ++ iIIVEncoding = encoding{encode: encodeIIIV, validate: validateIIIV, length: 4} ++ iVIVEncoding = encoding{encode: encodeIVIV, validate: validateIVIV, length: 4} + +- sIEncoding = encoding{encode: encodeSI, validate: validateSI, length: 4} +- sFEncoding = encoding{encode: encodeSF, validate: validateSF, length: 4} ++ sIEncoding = encoding{encode: encodeSI, validate: validateSI, length: 4} ++ sFEncoding = encoding{encode: encodeSF, validate: validateSF, length: 4} ++ sVEncoding = encoding{encode: encodeSV, validate: validateSV, length: 4} ++ sVIIEncoding = encoding{encode: encodeSVII, validate: validateSVII, length: 4} ++ sVIVEncoding = encoding{encode: encodeSVIV, validate: validateSVIV, length: 4} + + bEncoding = encoding{encode: encodeB, validate: validateB, length: 4} + uEncoding = encoding{encode: encodeU, validate: validateU, length: 4} +@@ -1896,11 +1976,73 @@ var instructions = [ALAST & obj.AMask]instructionData{ + // "V" Standard Extension for Vector Operations, Version 1.0 + // + +- // 31.6. 
Vector Configuration-Setting Instructions ++ // 31.6: Vector Configuration-Setting Instructions + AVSETVLI & obj.AMask: {enc: vsetvliEncoding, immForm: AVSETIVLI}, + AVSETIVLI & obj.AMask: {enc: vsetivliEncoding}, + AVSETVL & obj.AMask: {enc: vsetvlEncoding}, + ++ // 31.7.4: Vector Unit-Stride Instructions ++ AVLE8V & obj.AMask: {enc: iVEncoding}, ++ AVLE16V & obj.AMask: {enc: iVEncoding}, ++ AVLE32V & obj.AMask: {enc: iVEncoding}, ++ AVLE64V & obj.AMask: {enc: iVEncoding}, ++ AVSE8V & obj.AMask: {enc: sVEncoding}, ++ AVSE16V & obj.AMask: {enc: sVEncoding}, ++ AVSE32V & obj.AMask: {enc: sVEncoding}, ++ AVSE64V & obj.AMask: {enc: sVEncoding}, ++ AVLMV & obj.AMask: {enc: iVEncoding}, ++ AVSMV & obj.AMask: {enc: sVEncoding}, ++ ++ // 31.7.5: Vector Strided Instructions ++ AVLSE8V & obj.AMask: {enc: iIIVEncoding}, ++ AVLSE16V & obj.AMask: {enc: iIIVEncoding}, ++ AVLSE32V & obj.AMask: {enc: iIIVEncoding}, ++ AVLSE64V & obj.AMask: {enc: iIIVEncoding}, ++ AVSSE8V & obj.AMask: {enc: sVIIEncoding}, ++ AVSSE16V & obj.AMask: {enc: sVIIEncoding}, ++ AVSSE32V & obj.AMask: {enc: sVIIEncoding}, ++ AVSSE64V & obj.AMask: {enc: sVIIEncoding}, ++ ++ // 31.7.6: Vector Indexed Instructions ++ AVLUXEI8V & obj.AMask: {enc: iVIVEncoding}, ++ AVLUXEI16V & obj.AMask: {enc: iVIVEncoding}, ++ AVLUXEI32V & obj.AMask: {enc: iVIVEncoding}, ++ AVLUXEI64V & obj.AMask: {enc: iVIVEncoding}, ++ AVLOXEI8V & obj.AMask: {enc: iVIVEncoding}, ++ AVLOXEI16V & obj.AMask: {enc: iVIVEncoding}, ++ AVLOXEI32V & obj.AMask: {enc: iVIVEncoding}, ++ AVLOXEI64V & obj.AMask: {enc: iVIVEncoding}, ++ AVSUXEI8V & obj.AMask: {enc: sVIVEncoding}, ++ AVSUXEI16V & obj.AMask: {enc: sVIVEncoding}, ++ AVSUXEI32V & obj.AMask: {enc: sVIVEncoding}, ++ AVSUXEI64V & obj.AMask: {enc: sVIVEncoding}, ++ AVSOXEI8V & obj.AMask: {enc: sVIVEncoding}, ++ AVSOXEI16V & obj.AMask: {enc: sVIVEncoding}, ++ AVSOXEI32V & obj.AMask: {enc: sVIVEncoding}, ++ AVSOXEI64V & obj.AMask: {enc: sVIVEncoding}, ++ ++ // 31.7.9. 
Vector Load/Store Whole Register Instructions ++ AVL1RE8V & obj.AMask: {enc: iVEncoding}, ++ AVL1RE16V & obj.AMask: {enc: iVEncoding}, ++ AVL1RE32V & obj.AMask: {enc: iVEncoding}, ++ AVL1RE64V & obj.AMask: {enc: iVEncoding}, ++ AVL2RE8V & obj.AMask: {enc: iVEncoding}, ++ AVL2RE16V & obj.AMask: {enc: iVEncoding}, ++ AVL2RE32V & obj.AMask: {enc: iVEncoding}, ++ AVL2RE64V & obj.AMask: {enc: iVEncoding}, ++ AVL4RE8V & obj.AMask: {enc: iVEncoding}, ++ AVL4RE16V & obj.AMask: {enc: iVEncoding}, ++ AVL4RE32V & obj.AMask: {enc: iVEncoding}, ++ AVL4RE64V & obj.AMask: {enc: iVEncoding}, ++ AVL8RE8V & obj.AMask: {enc: iVEncoding}, ++ AVL8RE16V & obj.AMask: {enc: iVEncoding}, ++ AVL8RE32V & obj.AMask: {enc: iVEncoding}, ++ AVL8RE64V & obj.AMask: {enc: iVEncoding}, ++ AVS1RV & obj.AMask: {enc: sVEncoding}, ++ AVS2RV & obj.AMask: {enc: sVEncoding}, ++ AVS4RV & obj.AMask: {enc: sVEncoding}, ++ AVS8RV & obj.AMask: {enc: sVEncoding}, ++ + // + // Privileged ISA + // +@@ -2723,6 +2865,60 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.rs1 = uint32(p.From.Offset) + } + ++ case AVLE8V, AVLE16V, AVLE32V, AVLE64V, AVSE8V, AVSE16V, AVSE32V, AVSE64V, AVLMV, AVSMV: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AVLSE8V, AVLSE16V, AVLSE32V, AVLSE64V, ++ AVLUXEI8V, AVLUXEI16V, AVLUXEI32V, AVLUXEI64V, AVLOXEI8V, AVLOXEI16V, AVLOXEI32V, AVLOXEI64V: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rs1, ins.rs2, ins.rs3 = ins.rs2, ins.rs1, obj.REG_NONE ++ ++ case AVSSE8V, AVSSE16V, AVSSE32V, AVSSE64V, ++ AVSUXEI8V, AVSUXEI16V, AVSUXEI32V, AVSUXEI64V, AVSOXEI8V, AVSOXEI16V, AVSOXEI32V, AVSOXEI64V: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = ins.rs2, ins.rd, ins.rs1, obj.REG_NONE ++ ++ case AVL1RV, AVL1RE8V, AVL1RE16V, AVL1RE32V, AVL1RE64V, AVL2RV, AVL2RE8V, AVL2RE16V, AVL2RE32V, AVL2RE64V, ++ AVL4RV, AVL4RE8V, AVL4RE16V, AVL4RE32V, AVL4RE64V, AVL8RV, AVL8RE8V, AVL8RE16V, AVL8RE32V, AVL8RE64V: ++ switch ins.as { ++ case AVL1RV: ++ ins.as = AVL1RE8V ++ case AVL2RV: ++ ins.as = AVL2RE8V ++ case AVL4RV: ++ ins.as = AVL4RE8V ++ case AVL8RV: ++ ins.as = AVL8RE8V ++ } ++ if ins.rs1 != obj.REG_NONE { ++ p.Ctxt.Diag("%v: too many operands for instruction", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AVS1RV, AVS2RV, AVS4RV, AVS8RV: ++ if ins.rs1 != obj.REG_NONE { ++ p.Ctxt.Diag("%v: too many operands for instruction", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch b/2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch new file mode 100644 index 0000000..c854f00 --- /dev/null +++ b/2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch @@ -0,0 +1,363 @@ +From 5fa2cbd247ff2acebe9a8655ad19814b2c40af4d Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 073/119] cmd/internal/obj/riscv: add riscv64 CSR map + +The map is automatically 
generated by running the latest version of +parse.py from github.com/riscv/riscv-opcodes. + +Change-Id: I05e00ab27ec583750752c25e1835c2578b339fbf +Reviewed-on: https://go-review.googlesource.com/c/go/+/630518 +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Pengcheng Wang +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: Michael Pratt +--- + src/cmd/internal/obj/riscv/inst.go | 332 +++++++++++++++++++++++++++++ + 1 file changed, 332 insertions(+) + +diff --git a/src/cmd/internal/obj/riscv/inst.go b/src/cmd/internal/obj/riscv/inst.go +index 2d9132e532..5ee5bda361 100644 +--- a/src/cmd/internal/obj/riscv/inst.go ++++ b/src/cmd/internal/obj/riscv/inst.go +@@ -1229,3 +1229,335 @@ func encode(a obj.As) *inst { + } + return nil + } ++ ++var csrs = map[uint16]string{ ++ 0x1: "FFLAGS", ++ 0x2: "FRM", ++ 0x3: "FCSR", ++ 0x7: "UTVT", ++ 0x8: "VSTART", ++ 0x9: "VXSAT", ++ 0xa: "VXRM", ++ 0xf: "VCSR", ++ 0x11: "SSP", ++ 0x15: "SEED", ++ 0x17: "JVT", ++ 0x45: "UNXTI", ++ 0x46: "UINTSTATUS", ++ 0x48: "USCRATCHCSW", ++ 0x49: "USCRATCHCSWL", ++ 0x100: "SSTATUS", ++ 0x102: "SEDELEG", ++ 0x103: "SIDELEG", ++ 0x104: "SIE", ++ 0x105: "STVEC", ++ 0x106: "SCOUNTEREN", ++ 0x107: "STVT", ++ 0x10a: "SENVCFG", ++ 0x10c: "SSTATEEN0", ++ 0x10d: "SSTATEEN1", ++ 0x10e: "SSTATEEN2", ++ 0x10f: "SSTATEEN3", ++ 0x120: "SCOUNTINHIBIT", ++ 0x140: "SSCRATCH", ++ 0x141: "SEPC", ++ 0x142: "SCAUSE", ++ 0x143: "STVAL", ++ 0x144: "SIP", ++ 0x145: "SNXTI", ++ 0x146: "SINTSTATUS", ++ 0x148: "SSCRATCHCSW", ++ 0x149: "SSCRATCHCSWL", ++ 0x14d: "STIMECMP", ++ 0x14e: "SCTRCTL", ++ 0x14f: "SCTRSTATUS", ++ 0x150: "SISELECT", ++ 0x151: "SIREG", ++ 0x152: "SIREG2", ++ 0x153: "SIREG3", ++ 0x155: "SIREG4", ++ 0x156: "SIREG5", ++ 0x157: "SIREG6", ++ 0x15c: "STOPEI", ++ 0x15f: "SCTRDEPTH", ++ 0x180: "SATP", ++ 0x181: "SRMCFG", ++ 0x200: "VSSTATUS", ++ 0x204: "VSIE", ++ 0x205: "VSTVEC", ++ 0x240: "VSSCRATCH", ++ 0x241: "VSEPC", ++ 0x242: "VSCAUSE", ++ 0x243: "VSTVAL", ++ 0x244: "VSIP", ++ 0x24d: "VSTIMECMP", ++ 0x24e: "VSCTRCTL", ++ 0x250: "VSISELECT", ++ 0x251: "VSIREG", ++ 0x252: "VSIREG2", ++ 0x253: "VSIREG3", ++ 0x255: "VSIREG4", ++ 0x256: "VSIREG5", ++ 0x257: "VSIREG6", ++ 0x25c: "VSTOPEI", ++ 0x280: "VSATP", ++ 0x300: "MSTATUS", ++ 0x301: "MISA", ++ 0x302: "MEDELEG", ++ 0x303: "MIDELEG", ++ 0x304: "MIE", ++ 0x305: "MTVEC", ++ 0x306: "MCOUNTEREN", ++ 0x307: "MTVT", ++ 0x308: "MVIEN", ++ 0x309: "MVIP", ++ 0x30a: "MENVCFG", ++ 0x30c: "MSTATEEN0", ++ 0x30d: "MSTATEEN1", ++ 0x30e: "MSTATEEN2", ++ 0x30f: "MSTATEEN3", ++ 0x320: "MCOUNTINHIBIT", ++ 0x321: "MCYCLECFG", ++ 0x322: "MINSTRETCFG", ++ 0x323: "MHPMEVENT3", ++ 0x324: "MHPMEVENT4", ++ 0x325: "MHPMEVENT5", ++ 0x326: "MHPMEVENT6", ++ 0x327: "MHPMEVENT7", ++ 0x328: "MHPMEVENT8", ++ 0x329: "MHPMEVENT9", ++ 0x32a: "MHPMEVENT10", ++ 0x32b: "MHPMEVENT11", ++ 0x32c: "MHPMEVENT12", ++ 0x32d: "MHPMEVENT13", ++ 0x32e: "MHPMEVENT14", ++ 0x32f: "MHPMEVENT15", ++ 0x330: "MHPMEVENT16", ++ 0x331: "MHPMEVENT17", ++ 0x332: "MHPMEVENT18", ++ 0x333: "MHPMEVENT19", ++ 0x334: "MHPMEVENT20", ++ 0x335: "MHPMEVENT21", ++ 0x336: "MHPMEVENT22", ++ 0x337: "MHPMEVENT23", ++ 0x338: "MHPMEVENT24", ++ 0x339: "MHPMEVENT25", ++ 0x33a: "MHPMEVENT26", ++ 0x33b: "MHPMEVENT27", ++ 0x33c: "MHPMEVENT28", ++ 0x33d: "MHPMEVENT29", ++ 0x33e: "MHPMEVENT30", ++ 0x33f: "MHPMEVENT31", ++ 0x340: "MSCRATCH", ++ 0x341: "MEPC", ++ 0x342: "MCAUSE", ++ 0x343: "MTVAL", ++ 0x344: "MIP", ++ 0x345: "MNXTI", ++ 0x346: "MINTSTATUS", ++ 0x348: "MSCRATCHCSW", ++ 0x349: "MSCRATCHCSWL", ++ 0x34a: "MTINST", ++ 0x34b: 
"MTVAL2", ++ 0x34e: "MCTRCTL", ++ 0x350: "MISELECT", ++ 0x351: "MIREG", ++ 0x352: "MIREG2", ++ 0x353: "MIREG3", ++ 0x355: "MIREG4", ++ 0x356: "MIREG5", ++ 0x357: "MIREG6", ++ 0x35c: "MTOPEI", ++ 0x3a0: "PMPCFG0", ++ 0x3a1: "PMPCFG1", ++ 0x3a2: "PMPCFG2", ++ 0x3a3: "PMPCFG3", ++ 0x3a4: "PMPCFG4", ++ 0x3a5: "PMPCFG5", ++ 0x3a6: "PMPCFG6", ++ 0x3a7: "PMPCFG7", ++ 0x3a8: "PMPCFG8", ++ 0x3a9: "PMPCFG9", ++ 0x3aa: "PMPCFG10", ++ 0x3ab: "PMPCFG11", ++ 0x3ac: "PMPCFG12", ++ 0x3ad: "PMPCFG13", ++ 0x3ae: "PMPCFG14", ++ 0x3af: "PMPCFG15", ++ 0x3b0: "PMPADDR0", ++ 0x3b1: "PMPADDR1", ++ 0x3b2: "PMPADDR2", ++ 0x3b3: "PMPADDR3", ++ 0x3b4: "PMPADDR4", ++ 0x3b5: "PMPADDR5", ++ 0x3b6: "PMPADDR6", ++ 0x3b7: "PMPADDR7", ++ 0x3b8: "PMPADDR8", ++ 0x3b9: "PMPADDR9", ++ 0x3ba: "PMPADDR10", ++ 0x3bb: "PMPADDR11", ++ 0x3bc: "PMPADDR12", ++ 0x3bd: "PMPADDR13", ++ 0x3be: "PMPADDR14", ++ 0x3bf: "PMPADDR15", ++ 0x3c0: "PMPADDR16", ++ 0x3c1: "PMPADDR17", ++ 0x3c2: "PMPADDR18", ++ 0x3c3: "PMPADDR19", ++ 0x3c4: "PMPADDR20", ++ 0x3c5: "PMPADDR21", ++ 0x3c6: "PMPADDR22", ++ 0x3c7: "PMPADDR23", ++ 0x3c8: "PMPADDR24", ++ 0x3c9: "PMPADDR25", ++ 0x3ca: "PMPADDR26", ++ 0x3cb: "PMPADDR27", ++ 0x3cc: "PMPADDR28", ++ 0x3cd: "PMPADDR29", ++ 0x3ce: "PMPADDR30", ++ 0x3cf: "PMPADDR31", ++ 0x3d0: "PMPADDR32", ++ 0x3d1: "PMPADDR33", ++ 0x3d2: "PMPADDR34", ++ 0x3d3: "PMPADDR35", ++ 0x3d4: "PMPADDR36", ++ 0x3d5: "PMPADDR37", ++ 0x3d6: "PMPADDR38", ++ 0x3d7: "PMPADDR39", ++ 0x3d8: "PMPADDR40", ++ 0x3d9: "PMPADDR41", ++ 0x3da: "PMPADDR42", ++ 0x3db: "PMPADDR43", ++ 0x3dc: "PMPADDR44", ++ 0x3dd: "PMPADDR45", ++ 0x3de: "PMPADDR46", ++ 0x3df: "PMPADDR47", ++ 0x3e0: "PMPADDR48", ++ 0x3e1: "PMPADDR49", ++ 0x3e2: "PMPADDR50", ++ 0x3e3: "PMPADDR51", ++ 0x3e4: "PMPADDR52", ++ 0x3e5: "PMPADDR53", ++ 0x3e6: "PMPADDR54", ++ 0x3e7: "PMPADDR55", ++ 0x3e8: "PMPADDR56", ++ 0x3e9: "PMPADDR57", ++ 0x3ea: "PMPADDR58", ++ 0x3eb: "PMPADDR59", ++ 0x3ec: "PMPADDR60", ++ 0x3ed: "PMPADDR61", ++ 0x3ee: "PMPADDR62", ++ 0x3ef: "PMPADDR63", ++ 0x5a8: "SCONTEXT", ++ 0x600: "HSTATUS", ++ 0x602: "HEDELEG", ++ 0x603: "HIDELEG", ++ 0x604: "HIE", ++ 0x605: "HTIMEDELTA", ++ 0x606: "HCOUNTEREN", ++ 0x607: "HGEIE", ++ 0x608: "HVIEN", ++ 0x609: "HVICTL", ++ 0x60a: "HENVCFG", ++ 0x60c: "HSTATEEN0", ++ 0x60d: "HSTATEEN1", ++ 0x60e: "HSTATEEN2", ++ 0x60f: "HSTATEEN3", ++ 0x643: "HTVAL", ++ 0x644: "HIP", ++ 0x645: "HVIP", ++ 0x646: "HVIPRIO1", ++ 0x647: "HVIPRIO2", ++ 0x64a: "HTINST", ++ 0x680: "HGATP", ++ 0x6a8: "HCONTEXT", ++ 0x747: "MSECCFG", ++ 0x7a0: "TSELECT", ++ 0x7a1: "TDATA1", ++ 0x7a2: "TDATA2", ++ 0x7a3: "TDATA3", ++ 0x7a4: "TINFO", ++ 0x7a5: "TCONTROL", ++ 0x7a8: "MCONTEXT", ++ 0x7aa: "MSCONTEXT", ++ 0x7b0: "DCSR", ++ 0x7b1: "DPC", ++ 0x7b2: "DSCRATCH0", ++ 0x7b3: "DSCRATCH1", ++ 0xb00: "MCYCLE", ++ 0xb02: "MINSTRET", ++ 0xb03: "MHPMCOUNTER3", ++ 0xb04: "MHPMCOUNTER4", ++ 0xb05: "MHPMCOUNTER5", ++ 0xb06: "MHPMCOUNTER6", ++ 0xb07: "MHPMCOUNTER7", ++ 0xb08: "MHPMCOUNTER8", ++ 0xb09: "MHPMCOUNTER9", ++ 0xb0a: "MHPMCOUNTER10", ++ 0xb0b: "MHPMCOUNTER11", ++ 0xb0c: "MHPMCOUNTER12", ++ 0xb0d: "MHPMCOUNTER13", ++ 0xb0e: "MHPMCOUNTER14", ++ 0xb0f: "MHPMCOUNTER15", ++ 0xb10: "MHPMCOUNTER16", ++ 0xb11: "MHPMCOUNTER17", ++ 0xb12: "MHPMCOUNTER18", ++ 0xb13: "MHPMCOUNTER19", ++ 0xb14: "MHPMCOUNTER20", ++ 0xb15: "MHPMCOUNTER21", ++ 0xb16: "MHPMCOUNTER22", ++ 0xb17: "MHPMCOUNTER23", ++ 0xb18: "MHPMCOUNTER24", ++ 0xb19: "MHPMCOUNTER25", ++ 0xb1a: "MHPMCOUNTER26", ++ 0xb1b: "MHPMCOUNTER27", ++ 0xb1c: "MHPMCOUNTER28", ++ 0xb1d: "MHPMCOUNTER29", ++ 0xb1e: "MHPMCOUNTER30", ++ 0xb1f: 
"MHPMCOUNTER31", ++ 0xc00: "CYCLE", ++ 0xc01: "TIME", ++ 0xc02: "INSTRET", ++ 0xc03: "HPMCOUNTER3", ++ 0xc04: "HPMCOUNTER4", ++ 0xc05: "HPMCOUNTER5", ++ 0xc06: "HPMCOUNTER6", ++ 0xc07: "HPMCOUNTER7", ++ 0xc08: "HPMCOUNTER8", ++ 0xc09: "HPMCOUNTER9", ++ 0xc0a: "HPMCOUNTER10", ++ 0xc0b: "HPMCOUNTER11", ++ 0xc0c: "HPMCOUNTER12", ++ 0xc0d: "HPMCOUNTER13", ++ 0xc0e: "HPMCOUNTER14", ++ 0xc0f: "HPMCOUNTER15", ++ 0xc10: "HPMCOUNTER16", ++ 0xc11: "HPMCOUNTER17", ++ 0xc12: "HPMCOUNTER18", ++ 0xc13: "HPMCOUNTER19", ++ 0xc14: "HPMCOUNTER20", ++ 0xc15: "HPMCOUNTER21", ++ 0xc16: "HPMCOUNTER22", ++ 0xc17: "HPMCOUNTER23", ++ 0xc18: "HPMCOUNTER24", ++ 0xc19: "HPMCOUNTER25", ++ 0xc1a: "HPMCOUNTER26", ++ 0xc1b: "HPMCOUNTER27", ++ 0xc1c: "HPMCOUNTER28", ++ 0xc1d: "HPMCOUNTER29", ++ 0xc1e: "HPMCOUNTER30", ++ 0xc1f: "HPMCOUNTER31", ++ 0xc20: "VL", ++ 0xc21: "VTYPE", ++ 0xc22: "VLENB", ++ 0xda0: "SCOUNTOVF", ++ 0xdb0: "STOPI", ++ 0xe12: "HGEIP", ++ 0xeb0: "VSTOPI", ++ 0xf11: "MVENDORID", ++ 0xf12: "MARCHID", ++ 0xf13: "MIMPID", ++ 0xf14: "MHARTID", ++ 0xf15: "MCONFIGPTR", ++ 0xfb0: "MTOPI", ++} +-- +2.39.5 + diff --git a/2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch b/2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch new file mode 100644 index 0000000..a57ede6 --- /dev/null +++ b/2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch @@ -0,0 +1,36 @@ +From 46c2b95b27862604e6ffe206ac68e92b1983fd29 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 074/119] test/codegen: tighten the TrailingZeros64 test on 386 + +Make the TrailingZeros64 code generation check more specific for 386. +Just checking for BSFL will match both the generic 64 bit decomposition +and the custom 386 lowering. + +Change-Id: I62076f1889af0ef1f29704cba01ab419cae0c6e3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/656996 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Keith Randall +Auto-Submit: Keith Randall +Reviewed-by: Keith Randall +--- + test/codegen/mathbits.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index caeecdf078..bf2e8130c4 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -311,7 +311,7 @@ func TrailingZeros(n uint) int { + func TrailingZeros64(n uint64) int { + // amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ" + // amd64/v3:"TZCNTQ" +- // 386:"BSFL" ++ // 386:"BSFL","JNE" + // arm64:"RBIT","CLZ" + // s390x:"FLOGR" + // ppc64x/power8:"ANDN","POPCNTD" +-- +2.39.5 + diff --git a/2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch b/2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch new file mode 100644 index 0000000..13dffe9 --- /dev/null +++ b/2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch @@ -0,0 +1,102 @@ +From 60dd9fcdc906997df39a06b95100f0bf28fd0312 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 075/119] test/codegen: add riscv64 codegen for arithmetic + tests + +Codify the current riscv64 code generation for various subtract from +constant and addition/subtraction tests. 
+ +Change-Id: I54ad923280a0578a338bc4431fa5bdc0644c4729 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652316 +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +--- + test/codegen/arithmetic.go | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index 5f4ce9c76f..ec5cb491fa 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -44,36 +44,42 @@ func SubMem(arr []int, b, c, d int) int { + + func SubFromConst(a int) int { + // ppc64x: `SUBC\tR[0-9]+,\s[$]40,\sR` ++ // riscv64: "ADDI\t\\$-40","NEG" + b := 40 - a + return b + } + + func SubFromConstNeg(a int) int { + // ppc64x: `ADD\t[$]40,\sR[0-9]+,\sR` ++ // riscv64: "NEG","ADDI\t\\$-40","NEG" + c := 40 - (-a) + return c + } + + func SubSubFromConst(a int) int { + // ppc64x: `ADD\t[$]20,\sR[0-9]+,\sR` ++ // riscv64: "ADDI\t\\$20",-"NEG" + c := 40 - (20 - a) + return c + } + + func AddSubFromConst(a int) int { + // ppc64x: `SUBC\tR[0-9]+,\s[$]60,\sR` ++ // riscv64: "ADDI\t\\$-60","NEG" + c := 40 + (20 - a) + return c + } + + func NegSubFromConst(a int) int { + // ppc64x: `ADD\t[$]-20,\sR[0-9]+,\sR` ++ // riscv64: "ADDI\t\\$-20" + c := -(20 - a) + return c + } + + func NegAddFromConstNeg(a int) int { + // ppc64x: `SUBC\tR[0-9]+,\s[$]40,\sR` ++ // riscv64: "ADDI\t\\$-40","NEG" + c := -(-40 + a) + return c + } +@@ -81,6 +87,7 @@ func NegAddFromConstNeg(a int) int { + func SubSubNegSimplify(a, b int) int { + // amd64:"NEGQ" + // ppc64x:"NEG" ++ // riscv64:"NEG",-"SUB" + r := (a - b) - a + return r + } +@@ -88,6 +95,7 @@ func SubSubNegSimplify(a, b int) int { + func SubAddSimplify(a, b int) int { + // amd64:-"SUBQ",-"ADDQ" + // ppc64x:-"SUB",-"ADD" ++ // riscv64:-"SUB",-"ADD" + r := a + (b - a) + return r + } +@@ -111,6 +119,7 @@ func SubAddSimplify2(a, b, c int) (int, int, int, int, int, int) { + func SubAddNegSimplify(a, b int) int { + // amd64:"NEGQ",-"ADDQ",-"SUBQ" + // ppc64x:"NEG",-"ADD",-"SUB" ++ // riscv64:"NEG",-"ADD",-"SUB" + r := a - (b + a) + return r + } +@@ -118,6 +127,7 @@ func SubAddNegSimplify(a, b int) int { + func AddAddSubSimplify(a, b, c int) int { + // amd64:-"SUBQ" + // ppc64x:-"SUB" ++ // riscv64:"ADD","ADD",-"SUB" + r := a + (b + (c - a)) + return r + } +-- +2.39.5 + diff --git a/2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch b/2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch new file mode 100644 index 0000000..96396b5 --- /dev/null +++ b/2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch @@ -0,0 +1,84 @@ +From 7623a2c8fe4db9e157cd551ca549dd548d072a9f Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 076/119] test/codegen: add riscv64/rva23u64 specifiers to + existing tests + +Tests that exist for riscv64/rva22u64 should also be applied to +riscv64/rva23u64. 
+ +Change-Id: Ia529fdf0ac55b8bcb3dcd24fa80efef2351f3842 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652315 +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +--- + test/codegen/arithmetic.go | 8 ++++---- + test/codegen/shift.go | 6 +++--- + 2 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index ec5cb491fa..1a27227aef 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -607,7 +607,7 @@ func Int64Min(a, b int64) int64 { + // amd64: "CMPQ","CMOVQLT" + // arm64: "CMP","CSEL" + // riscv64/rva20u64:"BLT\t" +- // riscv64/rva22u64:"MIN\t" ++ // riscv64/rva22u64,riscv64/rva23u64:"MIN\t" + return min(a, b) + } + +@@ -615,7 +615,7 @@ func Int64Max(a, b int64) int64 { + // amd64: "CMPQ","CMOVQGT" + // arm64: "CMP","CSEL" + // riscv64/rva20u64:"BLT\t" +- // riscv64/rva22u64:"MAX\t" ++ // riscv64/rva22u64,riscv64/rva23u64:"MAX\t" + return max(a, b) + } + +@@ -623,7 +623,7 @@ func Uint64Min(a, b uint64) uint64 { + // amd64: "CMPQ","CMOVQCS" + // arm64: "CMP","CSEL" + // riscv64/rva20u64:"BLTU" +- // riscv64/rva22u64:"MINU" ++ // riscv64/rva22u64,riscv64/rva23u64:"MINU" + return min(a, b) + } + +@@ -631,6 +631,6 @@ func Uint64Max(a, b uint64) uint64 { + // amd64: "CMPQ","CMOVQHI" + // arm64: "CMP","CSEL" + // riscv64/rva20u64:"BLTU" +- // riscv64/rva22u64:"MAXU" ++ // riscv64/rva22u64,riscv64/rva23u64:"MAXU" + return max(a, b) + } +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 4b3b79f142..6b1157d3fd 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -481,13 +481,13 @@ func checkShiftToMask(u []uint64, s []int64) { + + func checkLeftShiftWithAddition(a int64, b int64) int64 { + // riscv64/rva20u64: "SLLI","ADD" +- // riscv64/rva22u64: "SH1ADD" ++ // riscv64/rva22u64,riscv64/rva23u64: "SH1ADD" + a = a + b<<1 + // riscv64/rva20u64: "SLLI","ADD" +- // riscv64/rva22u64: "SH2ADD" ++ // riscv64/rva22u64,riscv64/rva23u64: "SH2ADD" + a = a + b<<2 + // riscv64/rva20u64: "SLLI","ADD" +- // riscv64/rva22u64: "SH3ADD" ++ // riscv64/rva22u64,riscv64/rva23u64: "SH3ADD" + a = a + b<<3 + return a + } +-- +2.39.5 + diff --git a/2077-test-codegen-add-a-test-for-negation-and-conversion-.patch b/2077-test-codegen-add-a-test-for-negation-and-conversion-.patch new file mode 100644 index 0000000..b216ea7 --- /dev/null +++ b/2077-test-codegen-add-a-test-for-negation-and-conversion-.patch @@ -0,0 +1,39 @@ +From f57c6ba1f6683a19d0c39ae08deb75dc9bd4ecc5 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 077/119] test/codegen: add a test for negation and conversion + to int32 + +Codify the current code generation used on riscv64 in this case. 
+ +Change-Id: If4152e3652fc19d0aa28b79dba08abee2486d5ae +Reviewed-on: https://go-review.googlesource.com/c/go/+/652317 +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +--- + test/codegen/arithmetic.go | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index 1a27227aef..0d303b9f24 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -132,6 +132,12 @@ func AddAddSubSimplify(a, b, c int) int { + return r + } + ++func NegToInt32(a int) int { ++ // riscv64: "NEG","MOVW" ++ r := int(int32(-a)) ++ return r ++} ++ + // -------------------- // + // Multiplication // + // -------------------- // +-- +2.39.5 + diff --git a/2078-cmd-compile-combine-negation-and-word-sign-extension.patch b/2078-cmd-compile-combine-negation-and-word-sign-extension.patch new file mode 100644 index 0000000..19c49ab --- /dev/null +++ b/2078-cmd-compile-combine-negation-and-word-sign-extension.patch @@ -0,0 +1,80 @@ +From 1a58541e145e0e54539b5a71ca08d00bfe1a4bf6 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 078/119] cmd/compile: combine negation and word sign extension + on riscv64 + +Use NEGW to produce a negated and sign extended word, rather than doing +the same via two instructions: + + neg t0, t0 + sext.w a0, t0 + +Becomes: + + negw t0, t0 + +Change-Id: I824ab25001bd3304bdbd435e7b244fcc036ef212 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652319 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +Reviewed-by: Cherry Mui +--- + src/cmd/compile/internal/ssa/_gen/RISCV64.rules | 3 +++ + src/cmd/compile/internal/ssa/rewriteRISCV64.go | 11 +++++++++++ + test/codegen/arithmetic.go | 2 +- + 3 files changed, 15 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index a69df619a5..bc5a49be0b 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -589,6 +589,9 @@ + (MOVHUreg (ANDI [c] x)) && c < 0 => (ANDI [int64(uint16(c))] x) + (MOVWUreg (ANDI [c] x)) && c < 0 => (AND (MOVDconst [int64(uint32(c))]) x) + ++// Combine negation and sign extension. ++(MOVWreg (NEG x)) => (NEGW x) ++ + // Avoid sign/zero extension for consts. 
+ (MOVBreg (MOVDconst [c])) => (MOVDconst [int64(int8(c))]) + (MOVHreg (MOVDconst [c])) => (MOVDconst [int64(int16(c))]) +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 1c226a1660..1675d61fe5 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -5646,6 +5646,17 @@ func rewriteValueRISCV64_OpRISCV64MOVWreg(v *Value) bool { + v.copyOf(x) + return true + } ++ // match: (MOVWreg (NEG x)) ++ // result: (NEGW x) ++ for { ++ if v_0.Op != OpRISCV64NEG { ++ break ++ } ++ x := v_0.Args[0] ++ v.reset(OpRISCV64NEGW) ++ v.AddArg(x) ++ return true ++ } + // match: (MOVWreg (MOVDconst [c])) + // result: (MOVDconst [int64(int32(c))]) + for { +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index 0d303b9f24..e4e3a90cd1 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -133,7 +133,7 @@ func AddAddSubSimplify(a, b, c int) int { + } + + func NegToInt32(a int) int { +- // riscv64: "NEG","MOVW" ++ // riscv64: "NEGW",-"MOVW" + r := int(int32(-a)) + return r + } +-- +2.39.5 + diff --git a/2079-cmd-compile-internal-ssa-remove-double-negation-with.patch b/2079-cmd-compile-internal-ssa-remove-double-negation-with.patch new file mode 100644 index 0000000..3e2689d --- /dev/null +++ b/2079-cmd-compile-internal-ssa-remove-double-negation-with.patch @@ -0,0 +1,97 @@ +From 752985cd45306ed0c62eae5090507c47de9834d5 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 079/119] cmd/compile/internal/ssa: remove double negation with + addition on riscv64 + +On riscv64, subtraction from a constant is typically implemented as an +ADDI with the negative constant, followed by a negation. However this can +lead to multiple NEG/ADDI/NEG sequences that can be optimised out. + +For example, runtime.(*_panic).nextDefer currently contains: + + lbu t0, 0(t0) + addi t0, t0, -8 + neg t0, t0 + addi t0, t0, -7 + neg t0, t0 + +Which is now optimised to: + + lbu t0, 0(t0) + addi t0, t0, -1 + +Change-Id: Idf5815e6db2e3705cc4a4811ca9130a064ae3d80 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652318 +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 1 + + .../compile/internal/ssa/rewriteRISCV64.go | 22 +++++++++++++++++++ + test/codegen/arithmetic.go | 2 +- + 3 files changed, 24 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index bc5a49be0b..58cadc8944 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -735,6 +735,7 @@ + + // Double negation. + (NEG (NEG x)) => x ++(NEG s:(ADDI [val] (NEG x))) && s.Uses == 1 && is32Bit(-val) => (ADDI [-val] x) + + // Addition of zero or two constants. 
+ (ADDI [0] x) => x +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 1675d61fe5..8f8c902df8 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -6118,6 +6118,28 @@ func rewriteValueRISCV64_OpRISCV64NEG(v *Value) bool { + v.copyOf(x) + return true + } ++ // match: (NEG s:(ADDI [val] (NEG x))) ++ // cond: s.Uses == 1 && is32Bit(-val) ++ // result: (ADDI [-val] x) ++ for { ++ s := v_0 ++ if s.Op != OpRISCV64ADDI { ++ break ++ } ++ val := auxIntToInt64(s.AuxInt) ++ s_0 := s.Args[0] ++ if s_0.Op != OpRISCV64NEG { ++ break ++ } ++ x := s_0.Args[0] ++ if !(s.Uses == 1 && is32Bit(-val)) { ++ break ++ } ++ v.reset(OpRISCV64ADDI) ++ v.AuxInt = int64ToAuxInt(-val) ++ v.AddArg(x) ++ return true ++ } + // match: (NEG (MOVDconst [x])) + // result: (MOVDconst [-x]) + for { +diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go +index e4e3a90cd1..976158326c 100644 +--- a/test/codegen/arithmetic.go ++++ b/test/codegen/arithmetic.go +@@ -51,7 +51,7 @@ func SubFromConst(a int) int { + + func SubFromConstNeg(a int) int { + // ppc64x: `ADD\t[$]40,\sR[0-9]+,\sR` +- // riscv64: "NEG","ADDI\t\\$-40","NEG" ++ // riscv64: "ADDI\t\\$40",-"NEG" + c := 40 - (-a) + return c + } +-- +2.39.5 + diff --git a/2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch b/2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch new file mode 100644 index 0000000..07439e3 --- /dev/null +++ b/2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch @@ -0,0 +1,189 @@ +From 933daf2afa0cc95422a22b88603a8df7969e4c03 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 080/119] cmd/internal/obj/riscv: prevent duplicate error + reports + +The riscv64 Go assembler can output certain errors, ones produced by +instructionsForProg, multiple times. These errors are guaranteed to +be output at least twice and can appear three or more times if a +rescan is needed to recompute branch addresses. For example, the +syntactically incorrect instruction + +MOV (X10), $1 + +will generate at least two identical errors + +asm: 86076 (asm.s:21524) MOV (X10), $1: unsupported MOV +asm: 86076 (asm.s:21524) MOV (X10), $1: unsupported MOV +asm: assembly failed + +In addition to confusing the user, these duplicate errors make it +difficult to write negative tests for certain types of instructions, +e.g., branches, whose duplicate errors are not always identical, +and so not ignored by endtoend_test.go. + +We fix the issue by returning from preprocess if any errors have been +generated by the time we reach the end of the rescan loop. One +implication of this change is that validation errors will no longer +be reported if an error is generated earlier in the preprocess stage. +Negative test cases for validation errors are therefore moved to +their own file as the existing riscv64error.s file contains errors +generated by instructionsForProg that will now suppress the +validation errors. 
+ +Change-Id: Iffacdbefce28f44970dd5dda44990b822b8a23d4 +Reviewed-on: https://go-review.googlesource.com/c/go/+/637315 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: David Chase +Reviewed-by: Cherry Mui +--- + src/cmd/asm/internal/asm/endtoend_test.go | 4 ++ + .../asm/internal/asm/testdata/riscv64error.s | 34 -------------- + .../internal/asm/testdata/riscv64validation.s | 46 +++++++++++++++++++ + src/cmd/internal/obj/riscv/obj.go | 5 ++ + 4 files changed, 55 insertions(+), 34 deletions(-) + create mode 100644 src/cmd/asm/internal/asm/testdata/riscv64validation.s + +diff --git a/src/cmd/asm/internal/asm/endtoend_test.go b/src/cmd/asm/internal/asm/endtoend_test.go +index 02bc6b7923..0b9b0cbe83 100644 +--- a/src/cmd/asm/internal/asm/endtoend_test.go ++++ b/src/cmd/asm/internal/asm/endtoend_test.go +@@ -480,6 +480,10 @@ func TestRISCVErrors(t *testing.T) { + testErrors(t, "riscv64", "riscv64error") + } + ++func TestRISCVValidation(t *testing.T) { ++ testErrors(t, "riscv64", "riscv64validation") ++} ++ + func TestS390XEndToEnd(t *testing.T) { + testEndToEnd(t, "s390x", "s390x") + } +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 82a2348894..e8855f6cd5 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -43,50 +43,16 @@ TEXT errors(SB),$0 + SRLIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" + SRAIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" + SD X5, 4294967296(X6) // ERROR "constant 4294967296 too large" +- SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" +- SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" + FNES F1, (X5) // ERROR "needs an integer register output" +- VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" +- VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" + VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" +- VSETVL X10, X11 // ERROR "expected integer register in rs1 position" +- VLE8V (X10), X10 // ERROR "expected vector register in rd position" +- VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" + VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" +- VSE8V X10, (X10) // ERROR "expected vector register in rs1 position" +- VSE8V V3, (V1) // ERROR "expected integer register in rd position" + VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" +- VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" +- VLSE8V (X10), X10, X11 // ERROR "expected vector register in rd position" +- VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" +- VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" + VLSE8V (X10), X10, V1, V3 // ERROR "invalid vector mask register" +- VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" +- VSSE8V X10, X11, (X10) // ERROR "expected vector register in rd position" +- VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" +- VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" + VSSE8V V3, X11, V1, (X10) // ERROR "invalid vector mask register" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" +- VLUXEI8V (V1), V2, V3 // ERROR "expected integer 
register in rs1 position" +- VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" + VLUXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" +- VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" +- VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" + VSUXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" +- VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" +- VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" + VLOXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" +- VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" +- VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" + VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" + VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" +- VL1RV (X10), X10 // ERROR "expected vector register in rd position" +- VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" + VS1RV V3, V0, (X11) // ERROR "too many operands for instruction" +- VS1RV X11, (X11) // ERROR "expected vector register in rs1 position" +- VS1RV V3, (V1) // ERROR "expected integer register in rd position" + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +new file mode 100644 +index 0000000000..773f275dd3 +--- /dev/null ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -0,0 +1,46 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++// This file is for validation errors only, i.e., errors reported by the validate function. ++// Negative test cases for errors generated earlier in the assembler's preprocess stage ++// should be added to riscv64error.s. If they are added to this file, they will prevent ++// the validate function from being run and TestRISCVValidation will report missing ++// errors. 
++ ++TEXT validation(SB),$0 ++ SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" ++ SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" ++ VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVL X10, X11 // ERROR "expected integer register in rs1 position" ++ VLE8V (X10), X10 // ERROR "expected vector register in rd position" ++ VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" ++ VSE8V X10, (X10) // ERROR "expected vector register in rs1 position" ++ VSE8V V3, (V1) // ERROR "expected integer register in rd position" ++ VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" ++ VLSE8V (X10), X10, X11 // ERROR "expected vector register in rd position" ++ VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" ++ VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" ++ VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" ++ VSSE8V X10, X11, (X10) // ERROR "expected vector register in rd position" ++ VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" ++ VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" ++ VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" ++ VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" ++ VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" ++ VL1RV (X10), X10 // ERROR "expected vector register in rd position" ++ VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" ++ VS1RV X11, (X11) // ERROR "expected vector register in rs1 position" ++ VS1RV V3, (V1) // ERROR "expected integer register in rd position" ++ RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index a558dc3596..d61cef9695 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -708,6 +708,11 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + } + } + ++ // Return if errors have been detected up to this point. Continuing ++ // may lead to duplicate errors being output. 
++ if ctxt.Errors > 0 { ++ return ++ } + if !rescan { + break + } +-- +2.39.5 + diff --git a/2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch b/2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch new file mode 100644 index 0000000..051136f --- /dev/null +++ b/2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch @@ -0,0 +1,74 @@ +From c61add823865d15d032f87fbc1bc1983e53f4437 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 081/119] cmd/internal/obj/riscv: prevent panics on bad + branches + +Syntactically incorrect branches, such as + +BEQ X5, X6, $1 +BEQ X5, X6, 31(X10) + +cause the assembler to panic, which they shouldn't really do. It's +better for the user to see a normal error, as reported for other +syntax errors in riscv64 assembly. The panics also prevent us +from writing negative tests for these sorts of errors. + +Here we fix the issue by ensuring we generate a normal error instead +of panicking when the user provides an invalid branch target. We +also add a couple of negative tests. + +Change-Id: I1da568999a75097484b61a01d418f5d4be3e04fa +Reviewed-on: https://go-review.googlesource.com/c/go/+/637316 +Reviewed-by: Cherry Mui +Reviewed-by: Joel Sing +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Meng Zhuo +--- + src/cmd/asm/internal/asm/testdata/riscv64error.s | 2 ++ + src/cmd/internal/obj/riscv/obj.go | 8 ++++++-- + 2 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index e8855f6cd5..005b794612 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -30,6 +30,8 @@ TEXT errors(SB),$0 + SLLI $64, X5, X6 // ERROR "immediate out of range 0 to 63" + SRLI $64, X5, X6 // ERROR "immediate out of range 0 to 63" + SRAI $64, X5, X6 // ERROR "immediate out of range 0 to 63" ++ BEQ X5, X6, $1 // ERROR "instruction with branch-like opcode lacks destination" ++ BEQ X5, X6, 31(X10) // ERROR "instruction with branch-like opcode lacks destination" + RORI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" + SLLI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" + SRLI $-1, X5, X6 // ERROR "immediate out of range 0 to 63" +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index d61cef9695..0a754231cc 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -644,7 +644,8 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + switch p.As { + case ABEQ, ABEQZ, ABGE, ABGEU, ABGEZ, ABGT, ABGTU, ABGTZ, ABLE, ABLEU, ABLEZ, ABLT, ABLTU, ABLTZ, ABNE, ABNEZ: + if p.To.Type != obj.TYPE_BRANCH { +- panic("assemble: instruction with branch-like opcode lacks destination") ++ ctxt.Diag("%v: instruction with branch-like opcode lacks destination", p) ++ break + } + offset := p.To.Target().Pc - p.Pc + if offset < -4096 || 4096 <= offset { +@@ -728,7 +729,10 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { + case obj.TYPE_BRANCH: + p.To.Type, p.To.Offset = obj.TYPE_CONST, p.To.Target().Pc-p.Pc + case obj.TYPE_MEM: +- panic("unhandled type") ++ if ctxt.Errors == 0 { ++ // An error should have already been reported for this instruction ++ panic("unhandled type") ++ } + } + + case AJAL: +-- +2.39.5 + diff --git a/2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch 
b/2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch new file mode 100644 index 0000000..34b871c --- /dev/null +++ b/2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch @@ -0,0 +1,41 @@ +From b72bd33745886f2064a1fd5c3e938f431f913a02 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 082/119] cmd/internal/obj/riscv: fix the encoding for REV8 and + ORCB + +The instructions are currently encoded and validated using an +iIIEncoding which is incorrect as these instructions do not +take an immediate operand. Encode them instead using an +rIIEncoding as is done for the other two register argument bitmanip +instructions. + +Change-Id: Ia4d9c6f6ebd2dfc381935ebc11afa8fc3664232b +Reviewed-on: https://go-review.googlesource.com/c/go/+/637317 +Reviewed-by: David Chase +Reviewed-by: Joel Sing +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/obj.go | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 0a754231cc..7d7a123bcf 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1968,8 +1968,8 @@ var instructions = [ALAST & obj.AMask]instructionData{ + ARORI & obj.AMask: {enc: iIIEncoding, ternary: true}, + ARORIW & obj.AMask: {enc: iIIEncoding, ternary: true}, + ARORW & obj.AMask: {enc: rIIIEncoding, immForm: ARORIW, ternary: true}, +- AORCB & obj.AMask: {enc: iIIEncoding}, +- AREV8 & obj.AMask: {enc: iIIEncoding}, ++ AORCB & obj.AMask: {enc: rIIEncoding}, ++ AREV8 & obj.AMask: {enc: rIIEncoding}, + + // 28.4.4: Single-bit Instructions (Zbs) + ABCLR & obj.AMask: {enc: rIIIEncoding, immForm: ABCLRI, ternary: true}, +-- +2.39.5 + diff --git a/2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch b/2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch new file mode 100644 index 0000000..88f4deb --- /dev/null +++ b/2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch @@ -0,0 +1,151 @@ +From 011266a777692d5761f994845823efe9f1b4d539 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 083/119] cmd/internal/obj/riscv: factor out shift constant + code + +Move the code that tests to see if a constant can be represented by a +32 bit signed integer and a logical left shift. This reduces duplication +and increases readability. Also add test coverage now that this is an +independent function. 
+ +Change-Id: Id25395b1380b00cf5b69ca201b7715ef84f7ade6 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652777 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +Reviewed-by: Cherry Mui +--- + src/cmd/internal/obj/riscv/obj.go | 30 ++++++++---- + src/cmd/internal/obj/riscv/obj_test.go | 64 ++++++++++++++++++++++++++ + 2 files changed, 86 insertions(+), 8 deletions(-) + create mode 100644 src/cmd/internal/obj/riscv/obj_test.go + +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 7d7a123bcf..795452bbcb 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -135,10 +135,7 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { + + case AMOV: + if p.From.Type == obj.TYPE_CONST && p.From.Name == obj.NAME_NONE && p.From.Reg == obj.REG_NONE && int64(int32(p.From.Offset)) != p.From.Offset { +- ctz := bits.TrailingZeros64(uint64(p.From.Offset)) +- val := p.From.Offset >> ctz +- if int64(int32(val)) == val { +- // It's ok. We can handle constants with many trailing zeros. ++ if isShiftConst(p.From.Offset) { + break + } + // Put >32-bit constants in memory and load them. +@@ -2097,6 +2094,24 @@ func encodingForAs(as obj.As) (*encoding, error) { + return &insData.enc, nil + } + ++// splitShiftConst attempts to split a constant into a signed 32 bit integer ++// and a corresponding left shift. ++func splitShiftConst(v int64) (imm int64, lsh int, ok bool) { ++ lsh = bits.TrailingZeros64(uint64(v)) ++ c := v >> lsh ++ if int64(int32(c)) != c { ++ return 0, 0, false ++ } ++ return c, lsh, true ++} ++ ++// isShiftConst indicates whether a constant can be represented as a signed ++// 32 bit integer that is left shifted. ++func isShiftConst(v int64) bool { ++ _, lsh, ok := splitShiftConst(v) ++ return ok && lsh > 0 ++} ++ + type instruction struct { + p *obj.Prog // Prog that instruction is for + as obj.As // Assembler opcode +@@ -2378,10 +2393,9 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + // SLLI $63, X10, X10 + var insSLLI *instruction + if err := immIFits(ins.imm, 32); err != nil { +- ctz := bits.TrailingZeros64(uint64(ins.imm)) +- if err := immIFits(ins.imm>>ctz, 32); err == nil { +- ins.imm = ins.imm >> ctz +- insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(ctz)} ++ if c, lsh, ok := splitShiftConst(ins.imm); ok { ++ ins.imm = c ++ insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(lsh)} + } + } + +diff --git a/src/cmd/internal/obj/riscv/obj_test.go b/src/cmd/internal/obj/riscv/obj_test.go +new file mode 100644 +index 0000000000..688f262d8f +--- /dev/null ++++ b/src/cmd/internal/obj/riscv/obj_test.go +@@ -0,0 +1,64 @@ ++// Copyright 2025 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++package riscv ++ ++import ( ++ "fmt" ++ "testing" ++) ++ ++func TestSplitShiftConst(t *testing.T) { ++ tests := []struct { ++ v int64 ++ wantImm int64 ++ wantLsh int ++ wantOk bool ++ }{ ++ {0x100000000, 1, 32, true}, ++ {0xfffff001, 0, 0, false}, ++ {0xfffff801, 0, 0, false}, ++ {0xfffffff1, 0, 0, false}, ++ {0xffffffff, 0, 0, false}, ++ {0xfffffffe, 0x7fffffff, 1, true}, ++ {0xfffffffffffda, 0, 0, false}, ++ {0xfffffffffffde, 0, 0, false}, ++ {0x000003ffffffffff, 0, 0, false}, ++ {0x0007ffffffffffff, 0, 0, false}, ++ {0x7fffffff00000000, 0x7fffffff, 32, true}, ++ {0x7fffffffffffffff, 0, 0, false}, ++ {0x7f7f7f7f7f7f7f7f, 0, 0, false}, ++ {0x0080000010000000, 0x8000001, 28, true}, ++ {0x0abcdabcd0000000, 0, 0, false}, ++ {-4503599610593281, 0, 0, false}, // 0x8abcdabcd0000000 ++ {-7543254330000000, 0, 0, false}, // 0xfff0000000ffffff ++ } ++ for _, test := range tests { ++ t.Run(fmt.Sprintf("0x%x", test.v), func(t *testing.T) { ++ c, l, ok := splitShiftConst(test.v) ++ ++ if got, want := c, test.wantImm; got != want { ++ t.Errorf("Got immediate %d, want %d", got, want) ++ } ++ if got, want := l, test.wantLsh; got != want { ++ t.Errorf("Got left shift %d, want %d", got, want) ++ } ++ switch { ++ case !ok && test.wantOk: ++ t.Error("Failed to split shift constant, want success") ++ case ok && !test.wantOk: ++ t.Error("Successfully split shift constant, want failure") ++ } ++ if !ok || ok != test.wantOk { ++ return ++ } ++ ++ // Reconstruct as a 32 bit signed constant. ++ v := int64(uint64(int32(test.wantImm)) << l) ++ if v != test.v { ++ t.Errorf("Got v = %d (%x), want v = %d (%x)", v, v, test.v, test.v) ++ } ++ }) ++ } ++} +-- +2.39.5 + diff --git a/2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch b/2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch new file mode 100644 index 0000000..35a3b04 --- /dev/null +++ b/2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch @@ -0,0 +1,75 @@ +From 19169a649428edec75a7f94aac545acb6f34fcca Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 084/119] cmd/asm/internal/asm: add additional tests for + constant loads on riscv64 + +This improves test coverage around the various constant load edge cases. 
+ +Change-Id: Ibafeec78e76d95e9f56b48fa6bd012772bf505c5 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652776 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +Reviewed-by: David Chase +Reviewed-by: Mark Ryan +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 40 ++++++++++++++------- + 1 file changed, 28 insertions(+), 12 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 49f3ac00f3..86b9eb1fe6 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -561,20 +561,36 @@ start: + WORD $0x9abcdef0 // WORD $2596069104 // f0debc9a + + // MOV pseudo-instructions +- MOV X5, X6 // 13830200 +- MOV $2047, X5 // 9302f07f +- MOV $-2048, X5 // 93020080 +- MOV $2048, X5 // b71200009b820280 +- MOV $-2049, X5 // b7f2ffff9b82f27f +- MOV $4096, X5 // b7120000 +- MOV $2147479552, X5 // b7f2ff7f +- MOV $2147483647, X5 // b70200809b82f2ff +- MOV $-2147483647, X5 // b70200809b821200 ++ MOV X5, X6 // 13830200 ++ MOV $2047, X5 // 9302f07f ++ MOV $-2048, X5 // 93020080 ++ MOV $2048, X5 // b71200009b820280 ++ MOV $-2049, X5 // b7f2ffff9b82f27f ++ MOV $4096, X5 // b7120000 ++ MOV $0x7ffff000, X5 // MOV $2147479552, X5 // b7f2ff7f ++ MOV $-0x7ffff000, X5 // MOV $-2147479552, X5 // b7120080 ++ MOV $0x7fffffff, X5 // MOV $2147483647, X5 // b70200809b82f2ff ++ MOV $-0x7fffffff, X5 // MOV $-2147483647, X5 // b70200809b821200 ++ ++ // Converted to load and shift (MOV + SLLI) ++ MOV $0x100000000, X5 // MOV $4294967296, X5 // 9302100093920202 ++ MOV $0x7fffffff00000000, X5 // MOV $9223372032559808512, X5 // b70200809b82f2ff93920202 ++ MOV $0x8000000100000000, X5 // MOV $-9223372032559808512, X5 // b70200809b82120093920202 ++ MOV $0xffffffff00000000, X5 // MOV $-4294967296, X5 // 9302f0ff93920202 + + // Converted to load of symbol (AUIPC + LD) +- MOV $4294967295, X5 // 9702000083b20200 +- // Converted to MOV $1, X5 + SLLI $32, X5 +- MOV $4294967296, X5 // 9302100093920202 ++ MOV $0x80000001, X5 // MOV $2147483649, X5 // 9702000083b20200 ++ MOV $0xffffffff, X5 // MOV $4294967295, X5 // 9702000083b20200 ++ MOV $0x100000001, X5 // MOV $4294967297, X5 // 9702000083b20200 ++ MOV $0xfffffffffffda, X5 // MOV $4503599627370458, X5 // 9702000083b20200 ++ MOV $0xffffffffffffe, X5 // MOV $4503599627370494, X5 // 9702000083b20200 ++ MOV $0x0800000010000000, X5 // MOV $576460752571858944, X5 // 9702000083b20200 ++ MOV $0x8000000010000000, X5 // MOV $-9223372036586340352, X5 // 9702000083b20200 ++ MOV $0x0abcdabcd0000000, X5 // MOV $773733740479250432, X5 // 9702000083b20200 ++ MOV $0x8abcdabcd0000000, X5 // MOV $-8449638296375525376, X5 // 9702000083b20200 ++ MOV $0x1ffffffff0000000, X5 // MOV $2305843008945258496, X5 // 9702000083b20200 ++ MOV $0x7fffffffffffffff, X5 // MOV $9223372036854775807, X5 // 9702000083b20200 ++ MOV $0xfff0000000ffffff, X5 // MOV $-4503599610593281, X5 // 9702000083b20200 + + MOV (X5), X6 // 03b30200 + MOV 4(X5), X6 // 03b34200 +-- +2.39.5 + diff --git a/2085-test-codegen-add-combined-conversion-and-shift-tests.patch b/2085-test-codegen-add-combined-conversion-and-shift-tests.patch new file mode 100644 index 0000000..e453dca --- /dev/null +++ b/2085-test-codegen-add-combined-conversion-and-shift-tests.patch @@ -0,0 +1,95 @@ +From a84d551feabbf5aa6f3e47cfcda19b5ba62af37d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 085/119] test/codegen: add combined conversion and shift tests + +This adds 
tests for type conversion and shifts, detailing various +poor bad code generation that currently exists for riscv64. This +will be addressed in future CLs. + +Change-Id: Ie1d366dfe878832df691600f8500ef383da92848 +Reviewed-on: https://go-review.googlesource.com/c/go/+/615678 +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Carlos Amedee +--- + test/codegen/shift.go | 64 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 64 insertions(+) + +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 6b1157d3fd..3836311d5d 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -491,3 +491,67 @@ func checkLeftShiftWithAddition(a int64, b int64) int64 { + a = a + b<<3 + return a + } ++ ++// ++// Convert and shift. ++// ++ ++func rsh64Uto32U(v uint64) uint32 { ++ x := uint32(v) ++ // riscv64:"MOVWU" ++ if x > 8 { ++ // riscv64:"SRLIW",-"MOVWU",-"SLLI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64Uto16U(v uint64) uint16 { ++ x := uint16(v) ++ // riscv64:"MOVHU" ++ if x > 8 { ++ // riscv64:"SLLI","SRLI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64Uto8U(v uint64) uint8 { ++ x := uint8(v) ++ // riscv64:"MOVBU" ++ if x > 8 { ++ // riscv64:"SLLI","SRLI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64to32(v int64) int32 { ++ x := int32(v) ++ // riscv64:"MOVW" ++ if x > 8 { ++ // riscv64:"SRAIW",-"MOVW",-"SLLI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64to16(v int64) int16 { ++ x := int16(v) ++ // riscv64:"MOVH" ++ if x > 8 { ++ // riscv64:"SLLI","SRAI" ++ x >>= 2 ++ } ++ return x ++} ++ ++func rsh64to8(v int64) int8 { ++ x := int8(v) ++ // riscv64:"MOVB" ++ if x > 8 { ++ // riscv64:"SLLI","SRAI" ++ x >>= 2 ++ } ++ return x ++} +-- +2.39.5 + diff --git a/2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch b/2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch new file mode 100644 index 0000000..94e47f8 --- /dev/null +++ b/2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch @@ -0,0 +1,453 @@ +From 7957258a309c668e7d59db35f443723f23d5e210 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 086/119] cmd/internal/obj/riscv,internal/bytealg: synthesize + MIN/MAX/MINU/MAXU instructions + +Provide a synthesized version of the MIN/MAX/MINU/MAXU instructions +if they're not natively available. This allows these instructions to +be used in assembly unconditionally. + +Use MIN in internal/bytealg.compare. 
+ +Cq-Include-Trybots: luci.golang.try:gotip-linux-riscv64 +Change-Id: I8a5a3a59f0a9205e136fc3d673b23eaf3ca469f8 +Reviewed-on: https://go-review.googlesource.com/c/go/+/653295 +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 16 +- + src/cmd/internal/obj/riscv/asm_test.go | 14 ++ + src/cmd/internal/obj/riscv/obj.go | 44 ++++++ + .../riscv/testdata/testminmax/minmax_test.go | 140 ++++++++++++++++++ + .../riscv/testdata/testminmax/minmax_test.s | 131 ++++++++++++++++ + src/internal/bytealg/compare_riscv64.s | 8 +- + 6 files changed, 339 insertions(+), 14 deletions(-) + create mode 100644 src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.go + create mode 100644 src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.s + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 86b9eb1fe6..eeaadf6298 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -376,14 +376,14 @@ start: + CPOPW X23, X24 // 1b9c2b60 + CTZ X24, X25 // 931c1c60 + CTZW X25, X26 // 1b9d1c60 +- MAX X26, X28, X29 // b36eae0b +- MAX X26, X28 // 336eae0b +- MAXU X28, X29, X30 // 33ffce0b +- MAXU X28, X29 // b3fece0b +- MIN X29, X30, X5 // b342df0b +- MIN X29, X30 // 334fdf0b +- MINU X30, X5, X6 // 33d3e20b +- MINU X30, X5 // b3d2e20b ++ MAX X26, X28, X29 // b36eae0b or b32fae01b30ff041b34eae01b3fedf01b34ede01 ++ MAX X26, X28 // 336eae0b or b32fcd01b30ff041334ecd0133fecf01334ecd01 ++ MAXU X28, X29, X30 // 33ffce0b or b3bfce01b30ff04133cfce0133ffef0133cfee01 ++ MAXU X28, X29 // b3fece0b or b33fde01b30ff041b34ede01b3fedf01b34ede01 ++ MIN X29, X30, X5 // b342df0b or b3afee01b30ff041b342df01b3f25f00b3425f00 ++ MIN X29, X30 // 334fdf0b or b32fdf01b30ff04133cfee0133ffef0133cfee01 ++ MINU X30, X5, X6 // 33d3e20b or b33f5f00b30ff04133c3e20133f36f0033c36200 ++ MINU X30, X5 // b3d2e20b or b3bfe201b30ff041b3425f00b3f25f00b3425f00 + ORN X6, X7, X8 // 33e46340 or 1344f3ff33e48300 + ORN X6, X7 // b3e36340 or 934ff3ffb3e3f301 + SEXTB X16, X17 // 93184860 +diff --git a/src/cmd/internal/obj/riscv/asm_test.go b/src/cmd/internal/obj/riscv/asm_test.go +index 96ea230841..35854516b9 100644 +--- a/src/cmd/internal/obj/riscv/asm_test.go ++++ b/src/cmd/internal/obj/riscv/asm_test.go +@@ -280,6 +280,20 @@ func TestBranch(t *testing.T) { + } + } + ++func TestMinMax(t *testing.T) { ++ if runtime.GOARCH != "riscv64" { ++ t.Skip("Requires riscv64 to run") ++ } ++ ++ testenv.MustHaveGoBuild(t) ++ ++ cmd := testenv.Command(t, testenv.GoToolPath(t), "test") ++ cmd.Dir = "testdata/testminmax" ++ if out, err := testenv.CleanCmdEnv(cmd).CombinedOutput(); err != nil { ++ t.Errorf("Min max test failed: %v\n%s", err, out) ++ } ++} ++ + func TestPCAlign(t *testing.T) { + dir := t.TempDir() + tmpfile := filepath.Join(dir, "x.s") +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 795452bbcb..83d06f09f1 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2625,6 +2625,47 @@ func instructionsForRotate(p *obj.Prog, ins *instruction) []*instruction { + } + } + ++// instructionsForMinMax returns the machine instructions for an integer minimum or maximum. ++func instructionsForMinMax(p *obj.Prog, ins *instruction) []*instruction { ++ if buildcfg.GORISCV64 >= 22 { ++ // Minimum and maximum instructions are supported natively. 
++ return []*instruction{ins} ++ } ++ ++ // Generate a move for identical inputs. ++ if ins.rs1 == ins.rs2 { ++ ins.as, ins.rs2, ins.imm = AADDI, obj.REG_NONE, 0 ++ return []*instruction{ins} ++ } ++ ++ // Ensure that if one of the source registers is the same as the destination, ++ // it is processed first. ++ if ins.rs1 == ins.rd { ++ ins.rs1, ins.rs2 = ins.rs2, ins.rs1 ++ } ++ sltReg1, sltReg2 := ins.rs2, ins.rs1 ++ ++ // MIN -> SLT/SUB/XOR/AND/XOR ++ // MAX -> SLT/SUB/XOR/AND/XOR with swapped inputs to SLT ++ switch ins.as { ++ case AMIN: ++ ins.as = ASLT ++ case AMAX: ++ ins.as, sltReg1, sltReg2 = ASLT, sltReg2, sltReg1 ++ case AMINU: ++ ins.as = ASLTU ++ case AMAXU: ++ ins.as, sltReg1, sltReg2 = ASLTU, sltReg2, sltReg1 ++ } ++ return []*instruction{ ++ &instruction{as: ins.as, rs1: sltReg1, rs2: sltReg2, rd: REG_TMP}, ++ &instruction{as: ASUB, rs1: REG_ZERO, rs2: REG_TMP, rd: REG_TMP}, ++ &instruction{as: AXOR, rs1: ins.rs1, rs2: ins.rs2, rd: ins.rd}, ++ &instruction{as: AAND, rs1: REG_TMP, rs2: ins.rd, rd: ins.rd}, ++ &instruction{as: AXOR, rs1: ins.rs1, rs2: ins.rd, rd: ins.rd}, ++ } ++} ++ + // instructionsForProg returns the machine instructions for an *obj.Prog. + func instructionsForProg(p *obj.Prog) []*instruction { + ins := instructionForProg(p) +@@ -2874,6 +2915,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AXOR + inss = append(inss, &instruction{as: AXORI, rs1: ins.rd, rs2: obj.REG_NONE, rd: ins.rd, imm: -1}) + ++ case AMIN, AMAX, AMINU, AMAXU: ++ inss = instructionsForMinMax(p, ins) ++ + case AVSETVLI, AVSETIVLI: + ins.rs1, ins.rs2 = ins.rs2, obj.REG_NONE + vtype, err := EncodeVectorType(p.RestArgs[0].Offset, p.RestArgs[1].Offset, p.RestArgs[2].Offset, p.RestArgs[3].Offset) +diff --git a/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.go b/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.go +new file mode 100644 +index 0000000000..46d321147b +--- /dev/null ++++ b/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.go +@@ -0,0 +1,140 @@ ++// Copyright 2025 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++//go:build riscv64 ++ ++package testminmax ++ ++import ( ++ "testing" ++) ++ ++func testMIN1(a int64) (r int64) ++func testMIN2(a, b int64) (r int64) ++func testMIN3(a, b int64) (r int64) ++func testMIN4(a, b int64) (r int64) ++func testMAX1(a int64) (r int64) ++func testMAX2(a, b int64) (r int64) ++func testMAX3(a, b int64) (r int64) ++func testMAX4(a, b int64) (r int64) ++func testMINU1(a int64) (r int64) ++func testMINU2(a, b int64) (r int64) ++func testMINU3(a, b int64) (r int64) ++func testMINU4(a, b int64) (r int64) ++func testMAXU1(a int64) (r int64) ++func testMAXU2(a, b int64) (r int64) ++func testMAXU3(a, b int64) (r int64) ++func testMAXU4(a, b int64) (r int64) ++ ++func TestMin(t *testing.T) { ++ tests := []struct { ++ a int64 ++ b int64 ++ want int64 ++ }{ ++ {1, 2, 1}, ++ {2, 1, 1}, ++ {2, 2, 2}, ++ {1, -1, -1}, ++ {-1, 1, -1}, ++ } ++ for _, test := range tests { ++ if got := testMIN1(test.a); got != test.a { ++ t.Errorf("Assembly testMIN1 %v = %v, want %v", test.a, got, test.a) ++ } ++ if got := testMIN2(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMIN2 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMIN3(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMIN3 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMIN4(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMIN4 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ } ++} ++ ++func TestMax(t *testing.T) { ++ tests := []struct { ++ a int64 ++ b int64 ++ want int64 ++ }{ ++ {1, 2, 2}, ++ {2, 1, 2}, ++ {2, 2, 2}, ++ {1, -1, 1}, ++ {-1, 1, 1}, ++ } ++ for _, test := range tests { ++ if got := testMAX1(test.a); got != test.a { ++ t.Errorf("Assembly testMAX1 %v = %v, want %v", test.a, got, test.a) ++ } ++ if got := testMAX2(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAX2 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMAX3(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAX3 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMAX4(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAX4 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ } ++} ++ ++func TestMinU(t *testing.T) { ++ tests := []struct { ++ a int64 ++ b int64 ++ want int64 ++ }{ ++ {1, 2, 1}, ++ {2, 1, 1}, ++ {2, 2, 2}, ++ {1, -1, 1}, ++ {-1, 1, 1}, ++ } ++ for _, test := range tests { ++ if got := testMINU1(test.a); got != test.a { ++ t.Errorf("Assembly testMINU1 %v = %v, want %v", test.a, got, test.a) ++ } ++ if got := testMINU2(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMINU2 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMINU3(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMINU3 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMINU4(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMINU4 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ } ++} ++ ++func TestMaxU(t *testing.T) { ++ tests := []struct { ++ a int64 ++ b int64 ++ want int64 ++ }{ ++ {1, 2, 2}, ++ {2, 1, 2}, ++ {2, 2, 2}, ++ {1, -1, -1}, ++ {-1, 1, -1}, ++ } ++ for _, test := range tests { ++ if got := testMAXU1(test.a); got != test.a { ++ t.Errorf("Assembly testMAXU1 %v = %v, want %v", test.a, got, test.a) ++ } ++ if got := testMAXU2(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAXU2 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got 
:= testMAXU3(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAXU3 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ if got := testMAXU4(test.a, test.b); got != test.want { ++ t.Errorf("Assembly testMAXU4 %v, %v = %v, want %v", test.a, test.b, got, test.want) ++ } ++ } ++} +diff --git a/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.s b/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.s +new file mode 100644 +index 0000000000..9d295791a5 +--- /dev/null ++++ b/src/cmd/internal/obj/riscv/testdata/testminmax/minmax_test.s +@@ -0,0 +1,131 @@ ++// Copyright 2025 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build riscv64 ++ ++#include "textflag.h" ++ ++// func testMIN1(a int64) (r int64) ++TEXT ·testMIN1(SB),NOSPLIT,$0-16 ++ MOV a+0(FP), X5 ++ MIN X5, X5, X6 ++ MOV X6, r+8(FP) ++ RET ++ ++// func testMIN2(a, b int64) (r int64) ++TEXT ·testMIN2(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MIN X5, X6, X6 ++ MOV X6, r+16(FP) ++ RET ++ ++// func testMIN3(a, b int64) (r int64) ++TEXT ·testMIN3(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MIN X6, X5, X5 ++ MOV X5, r+16(FP) ++ RET ++ ++// func testMIN4(a, b int64) (r int64) ++TEXT ·testMIN4(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MIN X5, X6, X7 ++ MOV X7, r+16(FP) ++ RET ++ ++// func testMAX1(a int64) (r int64) ++TEXT ·testMAX1(SB),NOSPLIT,$0-16 ++ MOV a+0(FP), X5 ++ MAX X5, X5, X6 ++ MOV X6, r+8(FP) ++ RET ++ ++// func testMAX2(a, b int64) (r int64) ++TEXT ·testMAX2(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAX X5, X6, X6 ++ MOV X6, r+16(FP) ++ RET ++ ++// func testMAX3(a, b int64) (r int64) ++TEXT ·testMAX3(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAX X6, X5, X5 ++ MOV X5, r+16(FP) ++ RET ++ ++// func testMAX4(a, b int64) (r int64) ++TEXT ·testMAX4(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAX X5, X6, X7 ++ MOV X7, r+16(FP) ++ RET ++ ++// func testMINU1(a int64) (r int64) ++TEXT ·testMINU1(SB),NOSPLIT,$0-16 ++ MOV a+0(FP), X5 ++ MINU X5, X5, X6 ++ MOV X6, r+8(FP) ++ RET ++ ++// func testMINU2(a, b int64) (r int64) ++TEXT ·testMINU2(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MINU X5, X6, X6 ++ MOV X6, r+16(FP) ++ RET ++ ++// func testMINU3(a, b int64) (r int64) ++TEXT ·testMINU3(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MINU X6, X5, X5 ++ MOV X5, r+16(FP) ++ RET ++ ++// func testMINU4(a, b int64) (r int64) ++TEXT ·testMINU4(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MINU X5, X6, X7 ++ MOV X7, r+16(FP) ++ RET ++ ++// func testMAXU1(a int64) (r int64) ++TEXT ·testMAXU1(SB),NOSPLIT,$0-16 ++ MOV a+0(FP), X5 ++ MAXU X5, X5, X6 ++ MOV X6, r+8(FP) ++ RET ++ ++// func testMAXU2(a, b int64) (r int64) ++TEXT ·testMAXU2(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAXU X5, X6, X6 ++ MOV X6, r+16(FP) ++ RET ++ ++// func testMAXU3(a, b int64) (r int64) ++TEXT ·testMAXU3(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAXU X6, X5, X5 ++ MOV X5, r+16(FP) ++ RET ++ ++// func testMAXU4(a, b int64) (r int64) ++TEXT ·testMAXU4(SB),NOSPLIT,$0-24 ++ MOV a+0(FP), X5 ++ MOV b+8(FP), X6 ++ MAXU X5, X6, X7 ++ MOV X7, r+16(FP) ++ RET +diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s +index b1e1f7bcc7..6388fcd209 100644 +--- a/src/internal/bytealg/compare_riscv64.s ++++ 
b/src/internal/bytealg/compare_riscv64.s +@@ -28,15 +28,11 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 + // X11 length of a + // X12 points to start of b + // X13 length of b +-// for non-regabi X14 points to the address to store the return value (-1/0/1) +-// for regabi the return value in X10 ++// return value in X10 (-1/0/1) + TEXT compare<>(SB),NOSPLIT|NOFRAME,$0 + BEQ X10, X12, cmp_len + +- MOV X11, X5 +- BGE X13, X5, use_a_len // X5 = min(len(a), len(b)) +- MOV X13, X5 +-use_a_len: ++ MIN X11, X13, X5 + BEQZ X5, cmp_len + + MOV $32, X6 +-- +2.39.5 + diff --git a/2087-cmd-internal-obj-riscv-improve-constant-construction.patch b/2087-cmd-internal-obj-riscv-improve-constant-construction.patch new file mode 100644 index 0000000..59bccad --- /dev/null +++ b/2087-cmd-internal-obj-riscv-improve-constant-construction.patch @@ -0,0 +1,239 @@ +From 88bf26fdb0c84eedfcc6e370840bccf34b9c9d46 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 087/119] cmd/internal/obj/riscv: improve constant construction + +Attempt to construct large constants that have a consecutive sequence +of ones from a small negative constant, with a logical right and/or +left shift. This allows for a large range of mask like constants to be +constructed with only two or three instructions, avoiding the need to +load from memory. + +Change-Id: I35a77fecdd2df0ed3f33b772d518f85119d4ff66 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652778 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 12 ++-- + src/cmd/internal/obj/riscv/obj.go | 62 ++++++++++++++++----- + src/cmd/internal/obj/riscv/obj_test.go | 48 +++++++++------- + 3 files changed, 82 insertions(+), 40 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index eeaadf6298..0b34bba032 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -572,24 +572,24 @@ start: + MOV $0x7fffffff, X5 // MOV $2147483647, X5 // b70200809b82f2ff + MOV $-0x7fffffff, X5 // MOV $-2147483647, X5 // b70200809b821200 + +- // Converted to load and shift (MOV + SLLI) ++ // Converted to load and shift(s) ++ MOV $0xffffffff, X5 // MOV $4294967295, X5 // 9302f0ff93d20202 + MOV $0x100000000, X5 // MOV $4294967296, X5 // 9302100093920202 ++ MOV $0xfffffffffffda, X5 // MOV $4503599627370458, X5 // 9302d0fe9392d20093d2c200 ++ MOV $0xffffffffffffe, X5 // MOV $4503599627370494, X5 // 9302f0ff9392d20093d2c200 + MOV $0x7fffffff00000000, X5 // MOV $9223372032559808512, X5 // b70200809b82f2ff93920202 + MOV $0x8000000100000000, X5 // MOV $-9223372032559808512, X5 // b70200809b82120093920202 + MOV $0xffffffff00000000, X5 // MOV $-4294967296, X5 // 9302f0ff93920202 ++ MOV $0x1ffffffff0000000, X5 // MOV $2305843008945258496, X5 // 9302f0ff9392f20193d23200 ++ MOV $0x7fffffffffffffff, X5 // MOV $9223372036854775807, X5 // 9302f0ff93d21200 + + // Converted to load of symbol (AUIPC + LD) + MOV $0x80000001, X5 // MOV $2147483649, X5 // 9702000083b20200 +- MOV $0xffffffff, X5 // MOV $4294967295, X5 // 9702000083b20200 + MOV $0x100000001, X5 // MOV $4294967297, X5 // 9702000083b20200 +- MOV $0xfffffffffffda, X5 // MOV $4503599627370458, X5 // 9702000083b20200 +- MOV $0xffffffffffffe, X5 // MOV $4503599627370494, X5 // 9702000083b20200 + MOV $0x0800000010000000, X5 // MOV $576460752571858944, X5 // 
9702000083b20200 + MOV $0x8000000010000000, X5 // MOV $-9223372036586340352, X5 // 9702000083b20200 + MOV $0x0abcdabcd0000000, X5 // MOV $773733740479250432, X5 // 9702000083b20200 + MOV $0x8abcdabcd0000000, X5 // MOV $-8449638296375525376, X5 // 9702000083b20200 +- MOV $0x1ffffffff0000000, X5 // MOV $2305843008945258496, X5 // 9702000083b20200 +- MOV $0x7fffffffffffffff, X5 // MOV $9223372036854775807, X5 // 9702000083b20200 + MOV $0xfff0000000ffffff, X5 // MOV $-4503599610593281, X5 // 9702000083b20200 + + MOV (X5), X6 // 03b30200 +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 83d06f09f1..b7989ddbd7 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2094,22 +2094,35 @@ func encodingForAs(as obj.As) (*encoding, error) { + return &insData.enc, nil + } + +-// splitShiftConst attempts to split a constant into a signed 32 bit integer +-// and a corresponding left shift. +-func splitShiftConst(v int64) (imm int64, lsh int, ok bool) { ++// splitShiftConst attempts to split a constant into a signed 12 bit or ++// 32 bit integer, with corresponding logical right shift and/or left shift. ++func splitShiftConst(v int64) (imm int64, lsh int, rsh int, ok bool) { ++ // See if we can reconstruct this value from a signed 32 bit integer. + lsh = bits.TrailingZeros64(uint64(v)) + c := v >> lsh +- if int64(int32(c)) != c { +- return 0, 0, false ++ if int64(int32(c)) == c { ++ return c, lsh, 0, true + } +- return c, lsh, true ++ ++ // See if we can reconstruct this value from a small negative constant. ++ rsh = bits.LeadingZeros64(uint64(v)) ++ ones := bits.OnesCount64((uint64(v) >> lsh) >> 11) ++ c = signExtend(1<<11|((v>>lsh)&0x7ff), 12) ++ if rsh+ones+lsh+11 == 64 { ++ if lsh > 0 || c != -1 { ++ lsh += rsh ++ } ++ return c, lsh, rsh, true ++ } ++ ++ return 0, 0, 0, false + } + + // isShiftConst indicates whether a constant can be represented as a signed + // 32 bit integer that is left shifted. + func isShiftConst(v int64) bool { +- _, lsh, ok := splitShiftConst(v) +- return ok && lsh > 0 ++ _, lsh, rsh, ok := splitShiftConst(v) ++ return ok && (lsh > 0 || rsh > 0) + } + + type instruction struct { +@@ -2386,16 +2399,34 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + // For constants larger than 32 bits in size that have trailing zeros, + // use the value with the trailing zeros removed and then use a SLLI + // instruction to restore the original constant. ++ // + // For example: +- // MOV $0x8000000000000000, X10 ++ // MOV $0x8000000000000000, X10 + // becomes +- // MOV $1, X10 +- // SLLI $63, X10, X10 +- var insSLLI *instruction ++ // MOV $1, X10 ++ // SLLI $63, X10, X10 ++ // ++ // Similarly, we can construct large constants that have a consecutive ++ // sequence of ones from a small negative constant, with a right and/or ++ // left shift. 
++ // ++ // For example: ++ // MOV $0x000fffffffffffda, X10 ++ // becomes ++ // MOV $-19, X10 ++ // SLLI $13, X10 ++ // SRLI $12, X10 ++ // ++ var insSLLI, insSRLI *instruction + if err := immIFits(ins.imm, 32); err != nil { +- if c, lsh, ok := splitShiftConst(ins.imm); ok { ++ if c, lsh, rsh, ok := splitShiftConst(ins.imm); ok { + ins.imm = c +- insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(lsh)} ++ if lsh > 0 { ++ insSLLI = &instruction{as: ASLLI, rd: ins.rd, rs1: ins.rd, imm: int64(lsh)} ++ } ++ if rsh > 0 { ++ insSRLI = &instruction{as: ASRLI, rd: ins.rd, rs1: ins.rd, imm: int64(rsh)} ++ } + } + } + +@@ -2422,6 +2453,9 @@ func instructionsForMOV(p *obj.Prog) []*instruction { + if insSLLI != nil { + inss = append(inss, insSLLI) + } ++ if insSRLI != nil { ++ inss = append(inss, insSRLI) ++ } + + case p.From.Type == obj.TYPE_CONST && p.To.Type != obj.TYPE_REG: + p.Ctxt.Diag("%v: constant load must target register", p) +diff --git a/src/cmd/internal/obj/riscv/obj_test.go b/src/cmd/internal/obj/riscv/obj_test.go +index 688f262d8f..87b31e5a89 100644 +--- a/src/cmd/internal/obj/riscv/obj_test.go ++++ b/src/cmd/internal/obj/riscv/obj_test.go +@@ -14,29 +14,30 @@ func TestSplitShiftConst(t *testing.T) { + v int64 + wantImm int64 + wantLsh int ++ wantRsh int + wantOk bool + }{ +- {0x100000000, 1, 32, true}, +- {0xfffff001, 0, 0, false}, +- {0xfffff801, 0, 0, false}, +- {0xfffffff1, 0, 0, false}, +- {0xffffffff, 0, 0, false}, +- {0xfffffffe, 0x7fffffff, 1, true}, +- {0xfffffffffffda, 0, 0, false}, +- {0xfffffffffffde, 0, 0, false}, +- {0x000003ffffffffff, 0, 0, false}, +- {0x0007ffffffffffff, 0, 0, false}, +- {0x7fffffff00000000, 0x7fffffff, 32, true}, +- {0x7fffffffffffffff, 0, 0, false}, +- {0x7f7f7f7f7f7f7f7f, 0, 0, false}, +- {0x0080000010000000, 0x8000001, 28, true}, +- {0x0abcdabcd0000000, 0, 0, false}, +- {-4503599610593281, 0, 0, false}, // 0x8abcdabcd0000000 +- {-7543254330000000, 0, 0, false}, // 0xfff0000000ffffff ++ {0x100000000, 1, 32, 0, true}, ++ {0xfffff001, 0, 0, 0, false}, ++ {0xfffff801, -2047, 32, 32, true}, ++ {0xfffffff1, -15, 32, 32, true}, ++ {0xffffffff, -1, 0, 32, true}, ++ {0xfffffffe, 0x7fffffff, 1, 0, true}, ++ {0xfffffffffffda, -19, 13, 12, true}, ++ {0xfffffffffffde, -17, 13, 12, true}, ++ {0x000003ffffffffff, -1, 0, 22, true}, ++ {0x0007ffffffffffff, -1, 0, 13, true}, ++ {0x7fffffff00000000, 0x7fffffff, 32, 0, true}, ++ {0x7fffffffffffffff, -1, 0, 1, true}, ++ {0x7f7f7f7f7f7f7f7f, 0, 0, 0, false}, ++ {0x0080000010000000, 0x8000001, 28, 0, true}, ++ {0x0abcdabcd0000000, 0, 0, 0, false}, ++ {-4503599610593281, 0, 0, 0, false}, // 0x8abcdabcd0000000 ++ {-7543254330000000, 0, 0, 0, false}, // 0xfff0000000ffffff + } + for _, test := range tests { + t.Run(fmt.Sprintf("0x%x", test.v), func(t *testing.T) { +- c, l, ok := splitShiftConst(test.v) ++ c, l, r, ok := splitShiftConst(test.v) + + if got, want := c, test.wantImm; got != want { + t.Errorf("Got immediate %d, want %d", got, want) +@@ -44,6 +45,9 @@ func TestSplitShiftConst(t *testing.T) { + if got, want := l, test.wantLsh; got != want { + t.Errorf("Got left shift %d, want %d", got, want) + } ++ if got, want := r, test.wantRsh; got != want { ++ t.Errorf("Got right shift %d, want %d", got, want) ++ } + switch { + case !ok && test.wantOk: + t.Error("Failed to split shift constant, want success") +@@ -54,8 +58,12 @@ func TestSplitShiftConst(t *testing.T) { + return + } + +- // Reconstruct as a 32 bit signed constant. 
+- v := int64(uint64(int32(test.wantImm)) << l) ++ // Reconstruct as either a 12 bit or 32 bit signed constant. ++ s := 64 - 12 ++ v := int64((uint64(((c << s) >> s)) << l) >> r) ++ if test.wantImm != ((test.wantImm << s) >> s) { ++ v = int64((uint64(int32(test.wantImm)) << l) >> r) ++ } + if v != test.v { + t.Errorf("Got v = %d (%x), want v = %d (%x)", v, v, test.v, test.v) + } +-- +2.39.5 + diff --git a/2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch b/2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch new file mode 100644 index 0000000..614e235 --- /dev/null +++ b/2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch @@ -0,0 +1,125 @@ +From 753bc12d386d9481c83086003a7dd85fcab1d9ec Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 088/119] cmd/compile/internal/ssa: optimise more branches with + zero on riscv64 + +Optimise more branches with zero on riscv64. In particular, BLTU with +zero occurs with IsInBounds checks for index zero. This currently results +in two instructions and requires an additional register: + + li t2, 0 + bltu t2, t1, 0x174b4 + +This is equivalent to checking if the bounds is not equal to zero. With +this change: + + bnez t1, 0x174c0 + +This removes more than 500 instructions from the Go binary on riscv64. + +Change-Id: I6cd861d853e3ef270bd46dacecdfaa205b1c4644 +Reviewed-on: https://go-review.googlesource.com/c/go/+/606715 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +Reviewed-by: Dmitri Shuralyov +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 18 +++++++------- + .../compile/internal/ssa/rewriteRISCV64.go | 24 +++++++++++++++++++ + test/codegen/compare_and_branch.go | 10 ++++++++ + 3 files changed, 44 insertions(+), 8 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 58cadc8944..93f4e6a948 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -545,14 +545,16 @@ + (BNEZ (SLTIU [x] y) yes no) => (BLTU y (MOVDconst [x]) yes no) + + // Convert branch with zero to more optimal branch zero. +-(BEQ (MOVDconst [0]) cond yes no) => (BEQZ cond yes no) +-(BEQ cond (MOVDconst [0]) yes no) => (BEQZ cond yes no) +-(BNE (MOVDconst [0]) cond yes no) => (BNEZ cond yes no) +-(BNE cond (MOVDconst [0]) yes no) => (BNEZ cond yes no) +-(BLT (MOVDconst [0]) cond yes no) => (BGTZ cond yes no) +-(BLT cond (MOVDconst [0]) yes no) => (BLTZ cond yes no) +-(BGE (MOVDconst [0]) cond yes no) => (BLEZ cond yes no) +-(BGE cond (MOVDconst [0]) yes no) => (BGEZ cond yes no) ++(BEQ (MOVDconst [0]) cond yes no) => (BEQZ cond yes no) ++(BEQ cond (MOVDconst [0]) yes no) => (BEQZ cond yes no) ++(BNE (MOVDconst [0]) cond yes no) => (BNEZ cond yes no) ++(BNE cond (MOVDconst [0]) yes no) => (BNEZ cond yes no) ++(BLT (MOVDconst [0]) cond yes no) => (BGTZ cond yes no) ++(BLT cond (MOVDconst [0]) yes no) => (BLTZ cond yes no) ++(BLTU (MOVDconst [0]) cond yes no) => (BNEZ cond yes no) ++(BGE (MOVDconst [0]) cond yes no) => (BLEZ cond yes no) ++(BGE cond (MOVDconst [0]) yes no) => (BGEZ cond yes no) ++(BGEU (MOVDconst [0]) cond yes no) => (BEQZ cond yes no) + + // Remove redundant NEG from SEQZ/SNEZ. 
+ (SEQZ (NEG x)) => (SEQZ x) +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 8f8c902df8..c3018f270c 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -9277,6 +9277,18 @@ func rewriteBlockRISCV64(b *Block) bool { + b.resetWithControl(BlockRISCV64BGEZ, cond) + return true + } ++ case BlockRISCV64BGEU: ++ // match: (BGEU (MOVDconst [0]) cond yes no) ++ // result: (BEQZ cond yes no) ++ for b.Controls[0].Op == OpRISCV64MOVDconst { ++ v_0 := b.Controls[0] ++ if auxIntToInt64(v_0.AuxInt) != 0 { ++ break ++ } ++ cond := b.Controls[1] ++ b.resetWithControl(BlockRISCV64BEQZ, cond) ++ return true ++ } + case BlockRISCV64BLT: + // match: (BLT (MOVDconst [0]) cond yes no) + // result: (BGTZ cond yes no) +@@ -9300,6 +9312,18 @@ func rewriteBlockRISCV64(b *Block) bool { + b.resetWithControl(BlockRISCV64BLTZ, cond) + return true + } ++ case BlockRISCV64BLTU: ++ // match: (BLTU (MOVDconst [0]) cond yes no) ++ // result: (BNEZ cond yes no) ++ for b.Controls[0].Op == OpRISCV64MOVDconst { ++ v_0 := b.Controls[0] ++ if auxIntToInt64(v_0.AuxInt) != 0 { ++ break ++ } ++ cond := b.Controls[1] ++ b.resetWithControl(BlockRISCV64BNEZ, cond) ++ return true ++ } + case BlockRISCV64BNE: + // match: (BNE (MOVDconst [0]) cond yes no) + // result: (BNEZ cond yes no) +diff --git a/test/codegen/compare_and_branch.go b/test/codegen/compare_and_branch.go +index b3feef0eb7..fe25ebb3d3 100644 +--- a/test/codegen/compare_and_branch.go ++++ b/test/codegen/compare_and_branch.go +@@ -239,4 +239,14 @@ func ui64x0(x chan uint64) { + for <-x < 1 { + dummy() + } ++ ++ // riscv64:"BNEZ" ++ for 0 < <-x { ++ dummy() ++ } ++ ++ // riscv64:"BEQZ" ++ for 0 >= <-x { ++ dummy() ++ } + } +-- +2.39.5 + diff --git a/2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch b/2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch new file mode 100644 index 0000000..9e7e372 --- /dev/null +++ b/2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch @@ -0,0 +1,1327 @@ +From 7b35c25b80e46a548fb4f606f2be44f245062bf5 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 089/119] cmd/internal/obj/riscv: add support for vector + integer arithmetic instructions + +Add support for vector integer arithmetic instructions to the RISC-V +assembler. This includes vector addition, subtraction, integer +extension, add-with-carry, subtract-with-borrow, bitwise logical +operations, comparison, min/max, integer division and multiplication +instructions. 
+ +Change-Id: I8c191ef8e31291e13743732903e4f12356133a46 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646775 +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Cherry Mui +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 317 ++++++++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 165 ++++++++ + .../internal/asm/testdata/riscv64validation.s | 225 +++++++++- + src/cmd/internal/obj/riscv/anames.go | 13 + + src/cmd/internal/obj/riscv/cpu.go | 13 + + src/cmd/internal/obj/riscv/obj.go | 393 +++++++++++++++++- + 6 files changed, 1101 insertions(+), 25 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 0b34bba032..852104375b 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -546,6 +546,323 @@ start: + VS4RV V4, (X11) // 27828562 + VS8RV V8, (X11) // 278485e2 + ++ // 31.11.1: Vector Single-Width Integer Add and Subtract ++ VADDVV V1, V2, V3 // d7812002 ++ VADDVV V1, V2, V0, V3 // d7812000 ++ VADDVX X10, V2, V3 // d7412502 ++ VADDVX X10, V2, V0, V3 // d7412500 ++ VADDVI $15, V2, V3 // d7b12702 ++ VADDVI $15, V2, V0, V3 // d7b12700 ++ VADDVI $-16, V2, V3 // d7312802 ++ VADDVI $-16, V2, V0, V3 // d7312800 ++ VSUBVV V1, V2, V3 // d781200a ++ VSUBVV V1, V2, V0, V3 // d7812008 ++ VSUBVX X10, V2, V3 // d741250a ++ VSUBVX X10, V2, V0, V3 // d7412508 ++ VRSUBVX X10, V2, V3 // d741250e ++ VRSUBVX X10, V2, V0, V3 // d741250c ++ VRSUBVI $15, V2, V0, V3 // d7b1270c ++ VRSUBVI $-16, V2, V0, V3 // d731280c ++ VNEGV V2, V3 // d741200e ++ VNEGV V2, V0, V3 // d741200c ++ ++ // 31.11.2: Vector Widening Integer Add/Subtract ++ VWADDUVV V1, V2, V3 // d7a120c2 ++ VWADDUVV V1, V2, V0, V3 // d7a120c0 ++ VWADDUVX X10, V2, V3 // d76125c2 ++ VWADDUVX X10, V2, V0, V3 // d76125c0 ++ VWSUBUVV V1, V2, V3 // d7a120ca ++ VWSUBUVV V1, V2, V0, V3 // d7a120c8 ++ VWSUBUVX X10, V2, V3 // d76125ca ++ VWSUBUVX X10, V2, V0, V3 // d76125c8 ++ VWADDVV V1, V2, V3 // d7a120c6 ++ VWADDVV V1, V2, V0, V3 // d7a120c4 ++ VWADDVX X10, V2, V3 // d76125c6 ++ VWADDVX X10, V2, V0, V3 // d76125c4 ++ VWSUBVV V1, V2, V3 // d7a120ce ++ VWSUBVV V1, V2, V0, V3 // d7a120cc ++ VWSUBVX X10, V2, V3 // d76125ce ++ VWSUBVX X10, V2, V0, V3 // d76125cc ++ VWADDUWV V1, V2, V3 // d7a120d2 ++ VWADDUWV V1, V2, V0, V3 // d7a120d0 ++ VWADDUWX X10, V2, V3 // d76125d2 ++ VWADDUWX X10, V2, V0, V3 // d76125d0 ++ VWSUBUWV V1, V2, V3 // d7a120da ++ VWSUBUWV V1, V2, V0, V3 // d7a120d8 ++ VWSUBUWX X10, V2, V3 // d76125da ++ VWSUBUWX X10, V2, V0, V3 // d76125d8 ++ VWADDWV V1, V2, V3 // d7a120d6 ++ VWADDWV V1, V2, V0, V3 // d7a120d4 ++ VWADDWX X10, V2, V3 // d76125d6 ++ VWADDWX X10, V2, V0, V3 // d76125d4 ++ VWSUBWV V1, V2, V3 // d7a120de ++ VWSUBWV V1, V2, V0, V3 // d7a120dc ++ VWSUBWX X10, V2, V3 // d76125de ++ VWSUBWX X10, V2, V0, V3 // d76125dc ++ VWCVTXXV V2, V3 // d76120c6 ++ VWCVTXXV V2, V0, V3 // d76120c4 ++ VWCVTUXXV V2, V3 // d76120c2 ++ VWCVTUXXV V2, V0, V3 // d76120c0 ++ ++ // 31.11.3: Vector Integer Extension ++ VZEXTVF2 V2, V3 // d721234a ++ VZEXTVF2 V2, V0, V3 // d7212348 ++ VSEXTVF2 V2, V3 // d7a1234a ++ VSEXTVF2 V2, V0, V3 // d7a12348 ++ VZEXTVF4 V2, V3 // d721224a ++ VZEXTVF4 V2, V0, V3 // d7212248 ++ VSEXTVF4 V2, V3 // d7a1224a ++ VSEXTVF4 V2, V0, V3 // d7a12248 ++ VZEXTVF8 V2, V3 // d721214a ++ VZEXTVF8 V2, V0, V3 // d7212148 ++ VSEXTVF8 V2, V3 // d7a1214a ++ VSEXTVF8 V2, V0, V3 // d7a12148 ++ ++ // 31.11.4: Vector Integer 
Add-with-Carry / Subtract-with-Borrow Instructions ++ VADCVVM V1, V2, V0, V3 // d7812040 ++ VADCVXM X11, V2, V0, V3 // d7c12540 ++ VADCVIM $15, V2, V0, V3 // d7b12740 ++ VMADCVVM V1, V2, V0, V3 // d7812044 ++ VMADCVXM X11, V2, V0, V3 // d7c12544 ++ VMADCVIM $15, V2, V0, V3 // d7b12744 ++ VMADCVV V1, V2, V3 // d7812046 ++ VMADCVX X11, V2, V3 // d7c12546 ++ VMADCVI $15, V2, V3 // d7b12746 ++ VSBCVVM V1, V2, V0, V3 // d7812048 ++ VSBCVXM X11, V2, V0, V3 // d7c12548 ++ VMSBCVVM V1, V2, V0, V3 // d781204c ++ VMSBCVXM X11, V2, V0, V3 // d7c1254c ++ VMSBCVV V1, V2, V3 // d781204e ++ VMSBCVX X11, V2, V3 // d7c1254e ++ ++ // 31.11.5: Vector Bitwise Logical Instructions ++ VANDVV V1, V2, V3 // d7812026 ++ VANDVV V1, V2, V0, V3 // d7812024 ++ VANDVX X11, V2, V3 // d7c12526 ++ VANDVX X11, V2, V0, V3 // d7c12524 ++ VANDVI $15, V2, V3 // d7b12726 ++ VANDVI $15, V2, V0, V3 // d7b12724 ++ VORVV V1, V2, V3 // d781202a ++ VORVV V1, V2, V0, V3 // d7812028 ++ VORVX X11, V2, V3 // d7c1252a ++ VORVX X11, V2, V0, V3 // d7c12528 ++ VORVI $15, V2, V3 // d7b1272a ++ VORVI $15, V2, V0, V3 // d7b12728 ++ VXORVV V1, V2, V3 // d781202e ++ VXORVV V1, V2, V0, V3 // d781202c ++ VXORVX X11, V2, V3 // d7c1252e ++ VXORVX X11, V2, V0, V3 // d7c1252c ++ VXORVI $15, V2, V3 // d7b1272e ++ VXORVI $15, V2, V0, V3 // d7b1272c ++ VNOTV V2, V3 // d7b12f2e ++ VNOTV V2, V0, V3 // d7b12f2c ++ ++ // 31.11.6: Vector Single-Width Shift Instructions ++ VSLLVV V1, V2, V3 // d7812096 ++ VSLLVV V1, V2, V0, V3 // d7812094 ++ VSLLVX X11, V2, V3 // d7c12596 ++ VSLLVX X11, V2, V0, V3 // d7c12594 ++ VSLLVI $15, V2, V3 // d7b12796 ++ VSLLVI $15, V2, V0, V3 // d7b12794 ++ VSRLVV V1, V2, V3 // d78120a2 ++ VSRLVV V1, V2, V0, V3 // d78120a0 ++ VSRLVX X11, V2, V3 // d7c125a2 ++ VSRLVX X11, V2, V0, V3 // d7c125a0 ++ VSRLVI $15, V2, V3 // d7b127a2 ++ VSRLVI $15, V2, V0, V3 // d7b127a0 ++ VSRAVV V1, V2, V3 // d78120a6 ++ VSRAVV V1, V2, V0, V3 // d78120a4 ++ VSRAVX X11, V2, V3 // d7c125a6 ++ VSRAVX X11, V2, V0, V3 // d7c125a4 ++ VSRAVI $15, V2, V3 // d7b127a6 ++ VSRAVI $15, V2, V0, V3 // d7b127a4 ++ ++ // 31.11.7: Vector Narrowing Integer Right Shift Instructions ++ VNSRLWV V1, V2, V3 // d78120b2 ++ VNSRLWV V1, V2, V0, V3 // d78120b0 ++ VNSRLWX X10, V2, V3 // d74125b2 ++ VNSRLWX X10, V2, V0, V3 // d74125b0 ++ VNSRLWI $31, V2, V3 // d7b12fb2 ++ VNSRLWI $31, V2, V0, V3 // d7b12fb0 ++ VNSRAWV V1, V2, V3 // d78120b6 ++ VNSRAWV V1, V2, V0, V3 // d78120b4 ++ VNSRAWX X10, V2, V3 // d74125b6 ++ VNSRAWX X10, V2, V0, V3 // d74125b4 ++ VNSRAWI $31, V2, V3 // d7b12fb6 ++ VNSRAWI $31, V2, V0, V3 // d7b12fb4 ++ VNCVTXXW V2, V3 // d74120b2 ++ VNCVTXXW V2, V0, V3 // d74120b0 ++ ++ // 31.11.8: Vector Integer Compare Instructions ++ VMSEQVV V1, V2, V3 // d7812062 ++ VMSEQVV V1, V2, V0, V3 // d7812060 ++ VMSEQVX X10, V2, V3 // d7412562 ++ VMSEQVX X10, V2, V0, V3 // d7412560 ++ VMSEQVI $15, V2, V3 // d7b12762 ++ VMSEQVI $15, V2, V0, V3 // d7b12760 ++ VMSNEVV V1, V2, V3 // d7812066 ++ VMSNEVV V1, V2, V0, V3 // d7812064 ++ VMSNEVX X10, V2, V3 // d7412566 ++ VMSNEVX X10, V2, V0, V3 // d7412564 ++ VMSNEVI $15, V2, V3 // d7b12766 ++ VMSNEVI $15, V2, V0, V3 // d7b12764 ++ VMSLTUVV V1, V2, V3 // d781206a ++ VMSLTUVV V1, V2, V0, V3 // d7812068 ++ VMSLTUVX X10, V2, V3 // d741256a ++ VMSLTUVX X10, V2, V0, V3 // d7412568 ++ VMSLTVV V1, V2, V3 // d781206e ++ VMSLTVV V1, V2, V0, V3 // d781206c ++ VMSLTVX X10, V2, V3 // d741256e ++ VMSLTVX X10, V2, V0, V3 // d741256c ++ VMSLEUVV V1, V2, V3 // d7812072 ++ VMSLEUVV V1, V2, V0, V3 // d7812070 ++ VMSLEUVX X10, V2, V3 // d7412572 ++ VMSLEUVX X10, 
V2, V0, V3 // d7412570 ++ VMSLEUVI $15, V2, V3 // d7b12772 ++ VMSLEUVI $15, V2, V0, V3 // d7b12770 ++ VMSLEVV V1, V2, V3 // d7812076 ++ VMSLEVV V1, V2, V0, V3 // d7812074 ++ VMSLEVX X10, V2, V3 // d7412576 ++ VMSLEVX X10, V2, V0, V3 // d7412574 ++ VMSLEVI $15, V2, V3 // d7b12776 ++ VMSLEVI $15, V2, V0, V3 // d7b12774 ++ VMSGTUVX X10, V2, V3 // d741257a ++ VMSGTUVX X10, V2, V0, V3 // d7412578 ++ VMSGTUVI $15, V2, V3 // d7b1277a ++ VMSGTUVI $15, V2, V0, V3 // d7b12778 ++ VMSGTVX X10, V2, V3 // d741257e ++ VMSGTVX X10, V2, V0, V3 // d741257c ++ VMSGTVI $15, V2, V3 // d7b1277e ++ VMSGTVI $15, V2, V0, V3 // d7b1277c ++ VMSGTVV V1, V2, V3 // d701116e ++ VMSGTVV V1, V2, V0, V3 // d701116c ++ VMSGTUVV V1, V2, V3 // d701116a ++ VMSGTUVV V1, V2, V0, V3 // d7011168 ++ VMSGEVV V1, V2, V3 // d7011176 ++ VMSGEVV V1, V2, V0, V3 // d7011174 ++ VMSGEUVV V1, V2, V3 // d7011172 ++ VMSGEUVV V1, V2, V0, V3 // d7011170 ++ VMSLTVI $15, V2, V3 // d7312776 ++ VMSLTVI $15, V2, V0, V3 // d7312774 ++ VMSLTUVI $15, V2, V3 // d7312772 ++ VMSLTUVI $15, V2, V0, V3 // d7312770 ++ VMSGEVI $15, V2, V3 // d731277e ++ VMSGEVI $15, V2, V0, V3 // d731277c ++ VMSGEUVI $15, V2, V3 // d731277a ++ VMSGEUVI $15, V2, V0, V3 // d7312778 ++ ++ // 31.11.9: Vector Integer Min/Max Instructions ++ VMINUVV V1, V2, V3 // d7812012 ++ VMINUVV V1, V2, V0, V3 // d7812010 ++ VMINUVX X10, V2, V3 // d7412512 ++ VMINUVX X10, V2, V0, V3 // d7412510 ++ VMINVV V1, V2, V3 // d7812016 ++ VMINVV V1, V2, V0, V3 // d7812014 ++ VMINVX X10, V2, V3 // d7412516 ++ VMINVX X10, V2, V0, V3 // d7412514 ++ VMAXUVV V1, V2, V3 // d781201a ++ VMAXUVV V1, V2, V0, V3 // d7812018 ++ VMAXUVX X10, V2, V3 // d741251a ++ VMAXUVX X10, V2, V0, V3 // d7412518 ++ VMAXVV V1, V2, V3 // d781201e ++ VMAXVV V1, V2, V0, V3 // d781201c ++ VMAXVX X10, V2, V3 // d741251e ++ VMAXVX X10, V2, V0, V3 // d741251c ++ ++ // 31.11.10: Vector Single-Width Integer Multiply Instructions ++ VMULVV V1, V2, V3 // d7a12096 ++ VMULVV V1, V2, V0, V3 // d7a12094 ++ VMULVX X10, V2, V3 // d7612596 ++ VMULVX X10, V2, V0, V3 // d7612594 ++ VMULHVV V1, V2, V3 // d7a1209e ++ VMULHVV V1, V2, V0, V3 // d7a1209c ++ VMULHVX X10, V2, V3 // d761259e ++ VMULHVX X10, V2, V0, V3 // d761259c ++ VMULHUVV V1, V2, V3 // d7a12092 ++ VMULHUVV V1, V2, V0, V3 // d7a12090 ++ VMULHUVX X10, V2, V3 // d7612592 ++ VMULHUVX X10, V2, V0, V3 // d7612590 ++ VMULHSUVV V1, V2, V3 // d7a1209a ++ VMULHSUVV V1, V2, V0, V3 // d7a12098 ++ VMULHSUVX X10, V2, V3 // d761259a ++ VMULHSUVX X10, V2, V0, V3 // d7612598 ++ ++ // 31.11.11: Vector Integer Divide Instructions ++ VDIVUVV V1, V2, V3 // d7a12082 ++ VDIVUVV V1, V2, V0, V3 // d7a12080 ++ VDIVUVX X10, V2, V3 // d7612582 ++ VDIVUVX X10, V2, V0, V3 // d7612580 ++ VDIVVV V1, V2, V3 // d7a12086 ++ VDIVVV V1, V2, V0, V3 // d7a12084 ++ VDIVVX X10, V2, V3 // d7612586 ++ VDIVVX X10, V2, V0, V3 // d7612584 ++ VREMUVV V1, V2, V3 // d7a1208a ++ VREMUVV V1, V2, V0, V3 // d7a12088 ++ VREMUVX X10, V2, V3 // d761258a ++ VREMUVX X10, V2, V0, V3 // d7612588 ++ VREMVV V1, V2, V3 // d7a1208e ++ VREMVV V1, V2, V0, V3 // d7a1208c ++ VREMVX X10, V2, V3 // d761258e ++ VREMVX X10, V2, V0, V3 // d761258c ++ ++ // 31.11.12: Vector Widening Integer Multiply Instructions ++ VWMULVV V1, V2, V3 // d7a120ee ++ VWMULVV V1, V2, V0, V3 // d7a120ec ++ VWMULVX X10, V2, V3 // d76125ee ++ VWMULVX X10, V2, V0, V3 // d76125ec ++ VWMULUVV V1, V2, V3 // d7a120e2 ++ VWMULUVV V1, V2, V0, V3 // d7a120e0 ++ VWMULUVX X10, V2, V3 // d76125e2 ++ VWMULUVX X10, V2, V0, V3 // d76125e0 ++ VWMULSUVV V1, V2, V3 // d7a120ea ++ VWMULSUVV V1, V2, V0, 
V3 // d7a120e8 ++ VWMULSUVX X10, V2, V3 // d76125ea ++ VWMULSUVX X10, V2, V0, V3 // d76125e8 ++ ++ // 31.11.13: Vector Single-Width Integer Multiply-Add Instructions ++ VMACCVV V1, V2, V3 // d7a120b6 ++ VMACCVV V1, V2, V0, V3 // d7a120b4 ++ VMACCVX X10, V2, V3 // d76125b6 ++ VMACCVX X10, V2, V0, V3 // d76125b4 ++ VNMSACVV V1, V2, V3 // d7a120be ++ VNMSACVV V1, V2, V0, V3 // d7a120bc ++ VNMSACVX X10, V2, V3 // d76125be ++ VNMSACVX X10, V2, V0, V3 // d76125bc ++ VMADDVV V1, V2, V3 // d7a120a6 ++ VMADDVV V1, V2, V0, V3 // d7a120a4 ++ VMADDVX X10, V2, V3 // d76125a6 ++ VMADDVX X10, V2, V0, V3 // d76125a4 ++ VNMSUBVV V1, V2, V3 // d7a120ae ++ VNMSUBVV V1, V2, V0, V3 // d7a120ac ++ VNMSUBVX X10, V2, V3 // d76125ae ++ VNMSUBVX X10, V2, V0, V3 // d76125ac ++ ++ // 31.11.14: Vector Widening Integer Multiply-Add Instructions ++ VWMACCUVV V1, V2, V3 // d7a120f2 ++ VWMACCUVV V1, V2, V0, V3 // d7a120f0 ++ VWMACCUVX X10, V2, V3 // d76125f2 ++ VWMACCUVX X10, V2, V0, V3 // d76125f0 ++ VWMACCVV V1, V2, V3 // d7a120f6 ++ VWMACCVV V1, V2, V0, V3 // d7a120f4 ++ VWMACCVX X10, V2, V3 // d76125f6 ++ VWMACCVX X10, V2, V0, V3 // d76125f4 ++ VWMACCSUVV V1, V2, V3 // d7a120fe ++ VWMACCSUVV V1, V2, V0, V3 // d7a120fc ++ VWMACCSUVX X10, V2, V3 // d76125fe ++ VWMACCSUVX X10, V2, V0, V3 // d76125fc ++ VWMACCUSVX X10, V2, V3 // d76125fa ++ VWMACCUSVX X10, V2, V0, V3 // d76125f8 ++ ++ // 31.11.15: Vector Integer Merge Instructions ++ VMERGEVVM V1, V2, V0, V3 // d781205c ++ VMERGEVXM X10, V2, V0, V3 // d741255c ++ VMERGEVIM $15, V2, V0, V3 // d7b1275c ++ ++ // 31.11.16: Vector Integer Move Instructions ++ VMVVV V2, V3 // d701015e ++ VMVVX X10, V3 // d741055e ++ VMVVI $15, V3 // d7b1075e ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 005b794612..025d63a15c 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -46,6 +46,10 @@ TEXT errors(SB),$0 + SRAIW $-1, X5, X6 // ERROR "immediate out of range 0 to 31" + SD X5, 4294967296(X6) // ERROR "constant 4294967296 too large" + FNES F1, (X5) // ERROR "needs an integer register output" ++ ++ // ++ // "V" Standard Extension for Vector Operations, Version 1.0 ++ // + VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" + VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" + VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" +@@ -57,4 +61,165 @@ TEXT errors(SB),$0 + VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" + VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" + VS1RV V3, V0, (X11) // ERROR "too many operands for instruction" ++ VADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADDVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VADDVI $15, V4, V1, V2 // ERROR "invalid vector mask register" ++ VSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VRSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VRSUBVI $15, V4, V1, V2 // ERROR "invalid vector mask register" ++ VNEGV V2, V3, V4 // ERROR "invalid vector mask register" ++ VWADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDVV V1, V2, V4, V3 // ERROR 
"invalid vector mask register" ++ VWADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWCVTXXV V2, V1, V3 // ERROR "invalid vector mask register" ++ VWCVTUXXV V2, V1, V3 // ERROR "invalid vector mask register" ++ VZEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" ++ VZEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" ++ VZEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" ++ VADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" ++ VMADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VANDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VANDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VANDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNOTV V1, V2, V3 // ERROR "invalid vector mask register" ++ VSLLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ 
VSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWI $31, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWI $31, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCVTXXW V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHSUVV V1, V2, V4, V3 
// ERROR "invalid vector mask register" ++ VMULHSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSACVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSACVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUSVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVIM $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMVVV V1, V2, V3 // ERROR "too many operands for instruction" ++ VMVVX X10, V2, V3 // ERROR "too many operands for instruction" ++ VMVVI $15, V2, V3 // ERROR "too many operands for instruction" ++ + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 773f275dd3..602cab2c2e 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -11,36 +11,231 @@ + TEXT validation(SB),$0 + SRLI $1, X5, F1 // ERROR "expected integer register in rd position but got non-integer register F1" + SRLI $1, F1, X5 // ERROR "expected integer register in rs1 position but got non-integer register F1" ++ ++ // ++ // "V" Standard Extension for Vector Operations, Version 1.0 ++ // + VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" + VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" + VSETVL X10, X11 // ERROR "expected integer register in rs1 position" +- VLE8V (X10), X10 // ERROR "expected 
vector register in rd position" ++ VLE8V (X10), X10 // ERROR "expected vector register in vd position" + VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" +- VSE8V X10, (X10) // ERROR "expected vector register in rs1 position" ++ VSE8V X10, (X10) // ERROR "expected vector register in vs1 position" + VSE8V V3, (V1) // ERROR "expected integer register in rd position" + VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" +- VLSE8V (X10), X10, X11 // ERROR "expected vector register in rd position" ++ VLSE8V (X10), X10, X11 // ERROR "expected vector register in vd position" + VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" + VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" + VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" +- VSSE8V X10, X11, (X10) // ERROR "expected vector register in rd position" ++ VSSE8V X10, X11, (X10) // ERROR "expected vector register in vd position" + VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" + VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" + VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" +- VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" ++ VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" + VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" +- VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in rd position" ++ VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" ++ VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" + VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in rs2 position" +- VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in rd position" ++ VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" ++ VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" + VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in rs2 position" +- VL1RV (X10), X10 // ERROR "expected vector register in rd position" ++ VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" ++ VL1RV (X10), X10 // ERROR "expected vector register in vd position" + VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" +- VS1RV X11, (X11) // ERROR "expected vector register in rs1 position" ++ VS1RV X11, (X11) // ERROR "expected vector register in vs1 position" + VS1RV V3, (V1) // ERROR "expected integer register in rd position" ++ VADDVV V1, X10, V3 // ERROR "expected vector register in vs2 position" ++ VADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VADDVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] 
(5 bits)" ++ VADDVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" ++ VSUBVV V1, X10, V3 // ERROR "expected vector register in vs2 position" ++ VSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRSUBVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" ++ VRSUBVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" ++ VNEGV X10, V3 // ERROR "expected vector register in vs2 position" ++ VNEGV V2 // ERROR "expected vector register in vd position" ++ VWADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWCVTXXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VWCVTUXXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" ++ VADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVV X10, V2, V3 // ERROR "expected vector register in vs1 
position" ++ VMADCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMADCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMADCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMADCVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMADCVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVI $15, V2, V0, V3 // ERROR "expected no register in rs3" ++ VSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSBCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMSBCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" ++ VANDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VANDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VANDVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VANDVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VXORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VXORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VXORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VXORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VNOTV V3 // ERROR "expected vector register in vd position" ++ VNOTV X10, V3 // ERROR "expected vector register in vs2 position" ++ VSLLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSLLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSLLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSLLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSRLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSRLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSRAVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSRAVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNSRLWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNSRLWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNSRLWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VNSRLWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNSRAWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNSRAWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNSRAWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in 
range [0, 31]" ++ VNSRAWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNCVTXXW X10, V3 // ERROR "expected vector register in vs2 position" ++ VMSEQVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSEQVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSEQVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSEQVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSNEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSNEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSNEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSNEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLTUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLTVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLEUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLEUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGTUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSGTUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGTUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGTVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSGTVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGTVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGEUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSLTVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLTVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLTUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLTUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGEVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGEUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMINUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMINUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMINVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMINVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMAXUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ 
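The immediate-range messages matched throughout this validation file reduce to two 5-bit checks in the new validators (wantImmI and wantImmU called with a width of 5): signed immediates must lie in [-16, 15] and unsigned shift amounts in [0, 31]. A reference-only restatement of those bounds, not taken from the patch itself:

    package main

    import "fmt"

    // fitsSigned and fitsUnsigned restate the 5-bit ranges behind the
    // "[-16, 15]" and "[0, 31]" errors in this file; the assembler's own
    // checks are wantImmI and wantImmU.
    func fitsSigned(x int64, bits uint) bool   { return x >= -1<<(bits-1) && x <= 1<<(bits-1)-1 }
    func fitsUnsigned(x int64, bits uint) bool { return x >= 0 && x <= 1<<bits-1 }

    func main() {
        fmt.Println(fitsSigned(15, 5), fitsSigned(16, 5))     // true false
        fmt.Println(fitsUnsigned(31, 5), fitsUnsigned(32, 5)) // true false
    }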
VMAXUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMAXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMAXVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VDIVUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VDIVUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VDIVVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VDIVVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VREMUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREMUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VREMVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREMVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNMSACVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSACVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNMSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCUSVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMERGEVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMERGEVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMERGEVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMERGEVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMVVV X10, V3 // ERROR "expected vector register in vs1 position" ++ VMVVX V1, V2 // ERROR "expected integer register in rs1 position" ++ VMVVI $16, V2 // ERROR "signed immediate 16 must be in 
range [-16, 15]" ++ VMVVI $-17, V2 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ + RET +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index 6df5f0a173..a65dfceea9 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -654,5 +654,18 @@ var Anames = []string{ + "VL2RV", + "VL4RV", + "VL8RV", ++ "VMSGEUVI", ++ "VMSGEUVV", ++ "VMSGEVI", ++ "VMSGEVV", ++ "VMSGTUVV", ++ "VMSGTVV", ++ "VMSLTUVI", ++ "VMSLTVI", ++ "VNCVTXXW", ++ "VNEGV", ++ "VNOTV", ++ "VWCVTUXXV", ++ "VWCVTXXV", + "LAST", + } +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index 8999ef149b..577b06f0ec 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1172,6 +1172,19 @@ const ( + AVL2RV + AVL4RV + AVL8RV ++ AVMSGEUVI ++ AVMSGEUVV ++ AVMSGEVI ++ AVMSGEVV ++ AVMSGTUVV ++ AVMSGTVV ++ AVMSLTUVI ++ AVMSLTVI ++ AVNCVTXXW ++ AVNEGV ++ AVNOTV ++ AVWCVTUXXV ++ AVWCVTXXV + + // End marker + ALAST +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index b7989ddbd7..d85bdd302c 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1203,6 +1203,43 @@ func validateRFF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateRVIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantIntReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVVi(ctxt *obj.Link, ins *instruction) { ++ wantImmI(ctxt, ins, ins.imm, 5) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVVu(ctxt *obj.Link, ins *instruction) { ++ wantImmU(ctxt, ins, ins.imm, 5) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVVV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantVectorReg(ctxt, ins, "vs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateIII(ctxt *obj.Link, ins *instruction) { + wantImmI(ctxt, ins, ins.imm, 12) + wantIntReg(ctxt, ins, "rd", ins.rd) +@@ -1220,23 +1257,23 @@ func validateIF(ctxt *obj.Link, ins *instruction) { + } + + func validateIV(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) + wantNoneReg(ctxt, ins, "rs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateIIIV(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) + wantIntReg(ctxt, ins, "rs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateIVIV(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +- wantVectorReg(ctxt, 
ins, "rs2", ins.rs2) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + +@@ -1258,22 +1295,22 @@ func validateSF(ctxt *obj.Link, ins *instruction) { + + func validateSV(ctxt *obj.Link, ins *instruction) { + wantIntReg(ctxt, ins, "rd", ins.rd) +- wantVectorReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs1", ins.rs1) + wantNoneReg(ctxt, ins, "rs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateSVII(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) + wantIntReg(ctxt, ins, "rs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + + func validateSVIV(ctxt *obj.Link, ins *instruction) { +- wantVectorReg(ctxt, ins, "rd", ins.rd) ++ wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +- wantVectorReg(ctxt, ins, "rs2", ins.rs2) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + +@@ -1350,11 +1387,15 @@ func encodeR(as obj.As, rs1, rs2, rd, funct3, funct7 uint32) uint32 { + if enc == nil { + panic("encodeR: could not encode instruction") + } ++ if enc.rs1 != 0 && rs1 != 0 { ++ panic("encodeR: instruction uses rs1, but rs1 is nonzero") ++ } + if enc.rs2 != 0 && rs2 != 0 { +- panic("encodeR: instruction uses rs2, but rs2 was nonzero") ++ panic("encodeR: instruction uses rs2, but rs2 is nonzero") + } + funct3 |= enc.funct3 + funct7 |= enc.funct7 ++ rs1 |= enc.rs1 + rs2 |= enc.rs2 + return funct7<<25 | rs2<<20 | rs1<<15 | funct3<<12 | rd<<7 | enc.opcode + } +@@ -1407,6 +1448,26 @@ func encodeRFF(ins *instruction) uint32 { + return encodeR(ins.as, regF(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7) + } + ++func encodeRVV(ins *instruction) uint32 { ++ return encodeR(ins.as, 0, regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVVi(ins *instruction) uint32 { ++ return encodeR(ins.as, immI(ins.as, ins.imm, 5), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVVu(ins *instruction) uint32 { ++ return encodeR(ins.as, immU(ins.as, ins.imm, 5), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVIV(ins *instruction) uint32 { ++ return encodeR(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVVV(ins *instruction) uint32 { ++ return encodeR(ins.as, regV(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) ++} ++ + // encodeI encodes an I-type RISC-V instruction. 
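As a cross-check of the new vector encoders against the hex strings in riscv64.s, the following standalone sketch (not part of the patch) rebuilds the word that the rVVV path effectively produces for the unmasked "VMINUVV V1, V2, V3 // d7812012" case above, assuming the standard RVV layout funct6 | vm | vs2 | vs1 | funct3 | vd | opcode; the testdata comment is simply the little-endian byte form of that word:

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func main() {
        const (
            opcodeOPV = 0x57 // OP-V major opcode
            funct3IVV = 0x0  // OPIVV: vector-vector integer operands
            funct6    = 0x04 // VMINU
        )
        var vd, vs1, vs2, vm uint32 = 3, 1, 2, 1 // V3, V1, V2, unmasked (vm=1)

        ins := funct6<<26 | vm<<25 | vs2<<20 | vs1<<15 | funct3IVV<<12 | vd<<7 | opcodeOPV

        var b [4]byte
        binary.LittleEndian.PutUint32(b[:], ins)
        fmt.Printf("%08x -> % x\n", ins, b) // 122081d7 -> d7 81 20 12
    }

Clearing vm (the masked "VMINUVV V1, V2, V0, V3" form) flips only bit 25, giving 102081d7, i.e. the d7812010 listed alongside it. This is the same vm bit that instructionsForProg sets via funct7 |= 1 when no mask operand is supplied, and any mask register other than V0 in that slot is rejected with "invalid vector mask register", which is what the riscv64error.s additions above exercise.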
+ func encodeI(as obj.As, rs1, rd, imm, funct7 uint32) uint32 { + enc := encode(as) +@@ -1690,6 +1751,11 @@ var ( + rFIEncoding = encoding{encode: encodeRFI, validate: validateRFI, length: 4} + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} + rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} ++ rVVEncoding = encoding{encode: encodeRVV, validate: validateRVV, length: 4} ++ rVViEncoding = encoding{encode: encodeRVVi, validate: validateRVVi, length: 4} ++ rVVuEncoding = encoding{encode: encodeRVVu, validate: validateRVVu, length: 4} ++ rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4} ++ rVVVEncoding = encoding{encode: encodeRVVV, validate: validateRVVV, length: 4} + + iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4} + iFEncoding = encoding{encode: encodeIF, validate: validateIF, length: 4} +@@ -2027,7 +2093,7 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVSOXEI32V & obj.AMask: {enc: sVIVEncoding}, + AVSOXEI64V & obj.AMask: {enc: sVIVEncoding}, + +- // 31.7.9. Vector Load/Store Whole Register Instructions ++ // 31.7.9: Vector Load/Store Whole Register Instructions + AVL1RE8V & obj.AMask: {enc: iVEncoding}, + AVL1RE16V & obj.AMask: {enc: iVEncoding}, + AVL1RE32V & obj.AMask: {enc: iVEncoding}, +@@ -2049,6 +2115,177 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVS4RV & obj.AMask: {enc: sVEncoding}, + AVS8RV & obj.AMask: {enc: sVEncoding}, + ++ // 31.11.1: Vector Single-Width Integer Add and Subtract ++ AVADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVADDVI & obj.AMask: {enc: rVViEncoding}, ++ AVSUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ AVRSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ AVRSUBVI & obj.AMask: {enc: rVViEncoding}, ++ ++ // 31.11.2: Vector Widening Integer Add/Subtract ++ AVWADDUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWADDUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWSUBUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWSUBUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWSUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWADDUWV & obj.AMask: {enc: rVVVEncoding}, ++ AVWADDUWX & obj.AMask: {enc: rVIVEncoding}, ++ AVWSUBUWV & obj.AMask: {enc: rVVVEncoding}, ++ AVWSUBUWX & obj.AMask: {enc: rVIVEncoding}, ++ AVWADDWV & obj.AMask: {enc: rVVVEncoding}, ++ AVWADDWX & obj.AMask: {enc: rVIVEncoding}, ++ AVWSUBWV & obj.AMask: {enc: rVVVEncoding}, ++ AVWSUBWX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.3: Vector Integer Extension ++ AVZEXTVF2 & obj.AMask: {enc: rVVEncoding}, ++ AVSEXTVF2 & obj.AMask: {enc: rVVEncoding}, ++ AVZEXTVF4 & obj.AMask: {enc: rVVEncoding}, ++ AVSEXTVF4 & obj.AMask: {enc: rVVEncoding}, ++ AVZEXTVF8 & obj.AMask: {enc: rVVEncoding}, ++ AVSEXTVF8 & obj.AMask: {enc: rVVEncoding}, ++ ++ // 31.11.4: Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions ++ AVADCVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVADCVXM & obj.AMask: {enc: rVIVEncoding}, ++ AVADCVIM & obj.AMask: {enc: rVViEncoding}, ++ AVMADCVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVMADCVXM & obj.AMask: {enc: rVIVEncoding}, ++ AVMADCVIM & obj.AMask: {enc: rVViEncoding}, ++ AVMADCVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMADCVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMADCVI & obj.AMask: {enc: rVViEncoding}, ++ AVSBCVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVSBCVXM & 
obj.AMask: {enc: rVIVEncoding}, ++ AVMSBCVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVMSBCVXM & obj.AMask: {enc: rVIVEncoding}, ++ AVMSBCVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSBCVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.5: Vector Bitwise Logical Instructions ++ AVANDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVANDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVANDVI & obj.AMask: {enc: rVViEncoding}, ++ AVORVV & obj.AMask: {enc: rVVVEncoding}, ++ AVORVX & obj.AMask: {enc: rVIVEncoding}, ++ AVORVI & obj.AMask: {enc: rVViEncoding}, ++ AVXORVV & obj.AMask: {enc: rVVVEncoding}, ++ AVXORVX & obj.AMask: {enc: rVIVEncoding}, ++ AVXORVI & obj.AMask: {enc: rVViEncoding}, ++ ++ // 31.11.6: Vector Single-Width Shift Instructions ++ AVSLLVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSLLVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSLLVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSRLVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSRLVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSRLVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSRAVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSRAVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSRAVI & obj.AMask: {enc: rVVuEncoding}, ++ ++ // 31.11.7: Vector Narrowing Integer Right Shift Instructions ++ AVNSRLWV & obj.AMask: {enc: rVVVEncoding}, ++ AVNSRLWX & obj.AMask: {enc: rVIVEncoding}, ++ AVNSRLWI & obj.AMask: {enc: rVVuEncoding}, ++ AVNSRAWV & obj.AMask: {enc: rVVVEncoding}, ++ AVNSRAWX & obj.AMask: {enc: rVIVEncoding}, ++ AVNSRAWI & obj.AMask: {enc: rVVuEncoding}, ++ ++ // 31.11.8: Vector Integer Compare Instructions ++ AVMSEQVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSEQVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSEQVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSNEVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSNEVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSNEVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSLTUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSLTUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSLTVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSLTVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSLEUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSLEUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSLEUVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSLEVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMSLEVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSLEVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSGTUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSGTUVI & obj.AMask: {enc: rVViEncoding}, ++ AVMSGTVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMSGTVI & obj.AMask: {enc: rVViEncoding}, ++ ++ // 31.11.9: Vector Integer Min/Max Instructions ++ AVMINUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMINUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMINVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMINVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMAXUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMAXUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMAXVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMAXVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.10: Vector Single-Width Integer Multiply Instructions ++ AVMULVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMULVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMULHVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMULHVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMULHUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMULHUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMULHSUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMULHSUVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.11: Vector Integer Divide Instructions ++ AVDIVUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVDIVUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVDIVVV & obj.AMask: {enc: rVVVEncoding}, ++ AVDIVVX & obj.AMask: {enc: 
rVIVEncoding}, ++ AVREMUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVREMUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVREMVV & obj.AMask: {enc: rVVVEncoding}, ++ AVREMVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.12: Vector Widening Integer Multiply Instructions ++ AVWMULVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMULVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMULUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMULUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMULSUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMULSUVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.13: Vector Single-Width Integer Multiply-Add Instructions ++ AVMACCVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMACCVX & obj.AMask: {enc: rVIVEncoding}, ++ AVNMSACVV & obj.AMask: {enc: rVVVEncoding}, ++ AVNMSACVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVNMSUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVNMSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.14: Vector Widening Integer Multiply-Add Instructions ++ AVWMACCUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMACCUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMACCVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMACCVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMACCSUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVWMACCSUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVWMACCUSVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.11.15: Vector Integer Merge Instructions ++ AVMERGEVVM & obj.AMask: {enc: rVVVEncoding}, ++ AVMERGEVXM & obj.AMask: {enc: rVIVEncoding}, ++ AVMERGEVIM & obj.AMask: {enc: rVViEncoding}, ++ ++ // 31.11.16: Vector Integer Move Instructions ++ AVMVVV & obj.AMask: {enc: rVVVEncoding}, ++ AVMVVX & obj.AMask: {enc: rVIVEncoding}, ++ AVMVVI & obj.AMask: {enc: rVViEncoding}, ++ + // + // Privileged ISA + // +@@ -3020,6 +3257,142 @@ func instructionsForProg(p *obj.Prog) []*instruction { + p.Ctxt.Diag("%v: too many operands for instruction", p) + } + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AVADDVV, AVADDVX, AVSUBVV, AVSUBVX, AVRSUBVX, AVWADDUVV, AVWADDUVX, AVWSUBUVV, AVWSUBUVX, ++ AVWADDVV, AVWADDVX, AVWSUBVV, AVWSUBVX, AVWADDUWV, AVWADDUWX, AVWSUBUWV, AVWSUBUWX, ++ AVWADDWV, AVWADDWX, AVWSUBWV, AVWSUBWX, AVANDVV, AVANDVX, AVORVV, AVORVX, AVXORVV, AVXORVX, ++ AVSLLVV, AVSLLVX, AVSRLVV, AVSRLVX, AVSRAVV, AVSRAVX, ++ AVMSEQVV, AVMSEQVX, AVMSNEVV, AVMSNEVX, AVMSLTUVV, AVMSLTUVX, AVMSLTVV, AVMSLTVX, ++ AVMSLEUVV, AVMSLEUVX, AVMSLEVV, AVMSLEVX, AVMSGTUVX, AVMSGTVX, ++ AVMINUVV, AVMINUVX, AVMINVV, AVMINVX, AVMAXUVV, AVMAXUVX, AVMAXVV, AVMAXVX, ++ AVMULVV, AVMULVX, AVMULHVV, AVMULHVX, AVMULHUVV, AVMULHUVX, AVMULHSUVV, AVMULHSUVX, ++ AVDIVUVV, AVDIVUVX, AVDIVVV, AVDIVVX, AVREMUVV, AVREMUVX, AVREMVV, AVREMVX, ++ AVWMULVV, AVWMULVX, AVWMULUVV, AVWMULUVX, AVWMULSUVV, AVWMULSUVX, ++ AVNSRLWV, AVNSRLWX, AVNSRAWV, AVNSRAWX, ++ AVMACCVV, AVMACCVX, AVNMSACVV, AVNMSACVX, AVMADDVV, AVMADDVX, AVNMSUBVV, AVNMSUBVX, ++ AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg), obj.REG_NONE ++ ++ case AVADDVI, AVRSUBVI, AVANDVI, AVORVI, AVXORVI, AVMSEQVI, AVMSNEVI, AVMSLEUVI, AVMSLEVI, AVMSGTUVI, AVMSGTVI, ++ AVSLLVI, AVSRLVI, AVSRAVI, AVNSRLWI, AVNSRAWI: ++ // Set mask bit ++ switch { ++ case ins.rs3 
== obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), obj.REG_NONE, uint32(p.Reg), obj.REG_NONE ++ ++ case AVZEXTVF2, AVSEXTVF2, AVZEXTVF4, AVSEXTVF4, AVZEXTVF8, AVSEXTVF8: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rs1 = obj.REG_NONE ++ ++ case AVMVVV, AVMVVX: ++ if ins.rs1 != obj.REG_NONE { ++ p.Ctxt.Diag("%v: too many operands for instruction", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), REG_V0 ++ ++ case AVMVVI: ++ if ins.rs1 != obj.REG_NONE { ++ p.Ctxt.Diag("%v: too many operands for instruction", p) ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), obj.REG_NONE, REG_V0 ++ ++ case AVADCVVM, AVADCVXM, AVMADCVVM, AVMADCVXM, AVSBCVVM, AVSBCVXM, AVMSBCVVM, AVMSBCVXM, AVADCVIM, AVMADCVIM, ++ AVMERGEVVM, AVMERGEVXM, AVMERGEVIM: ++ if ins.rs3 != REG_V0 { ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg), obj.REG_NONE ++ ++ case AVMADCVV, AVMADCVX, AVMSBCVV, AVMSBCVX, AVMADCVI: ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg) ++ ++ case AVNEGV, AVWCVTXXV, AVWCVTUXXV, AVNCVTXXW: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ switch ins.as { ++ case AVNEGV: ++ ins.as = AVRSUBVX ++ case AVWCVTXXV: ++ ins.as = AVWADDVX ++ case AVWCVTUXXV: ++ ins.as = AVWADDUVX ++ case AVNCVTXXW: ++ ins.as = AVNSRLWX ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), REG_X0, uint32(p.From.Reg) ++ ++ case AVNOTV: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.as = AVXORVI ++ ins.rd, ins.rs1, ins.rs2, ins.imm = uint32(p.To.Reg), obj.REG_NONE, uint32(p.From.Reg), -1 ++ ++ case AVMSGTVV, AVMSGTUVV, AVMSGEVV, AVMSGEUVV: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ switch ins.as { ++ case AVMSGTVV: ++ ins.as = AVMSLTVV ++ case AVMSGTUVV: ++ ins.as = AVMSLTUVV ++ case AVMSGEVV: ++ ins.as = AVMSLEVV ++ case AVMSGEUVV: ++ ins.as = AVMSLEUVV ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), obj.REG_NONE ++ ++ case AVMSLTVI, AVMSLTUVI, AVMSGEVI, AVMSGEUVI: ++ // Set mask bit ++ switch { ++ case ins.rs3 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs3 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ switch ins.as { ++ case AVMSLTVI: ++ ins.as = AVMSLEVI ++ case AVMSLTUVI: ++ ins.as = AVMSLEUVI ++ case AVMSGEVI: ++ ins.as = AVMSGTVI ++ case AVMSGEUVI: ++ ins.as = AVMSGTUVI ++ } ++ ins.rd, ins.rs1, ins.rs2, ins.rs3, ins.imm = uint32(p.To.Reg), obj.REG_NONE, uint32(p.Reg), obj.REG_NONE, ins.imm-1 + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch b/2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch new file mode 100644 index 0000000..1ffcdeb --- /dev/null +++ 
b/2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch @@ -0,0 +1,266 @@ +From 7be6b0fd3f1ef4891d22549b127a7a087fbbd45c Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 090/119] cmd/internal/obj/riscv: add support for vector + fixed-point arithmetic instructions + +Add support for vector fixed-point arithmetic instructions to the +RISC-V assembler. This includes single width saturating addition +and subtraction, averaging addition and subtraction and scaling +shift instructions. + +Change-Id: I9aa27e9565ad016ba5bb2b479e1ba70db24e4ff5 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646776 +Reviewed-by: Mark Ryan +Reviewed-by: Carlos Amedee +Reviewed-by: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 74 +++++++++++++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 32 ++++++++ + .../internal/asm/testdata/riscv64validation.s | 32 ++++++++ + src/cmd/internal/obj/riscv/obj.go | 51 ++++++++++++- + 4 files changed, 186 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 852104375b..506fe2a442 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -863,6 +863,80 @@ start: + VMVVX X10, V3 // d741055e + VMVVI $15, V3 // d7b1075e + ++ // 31.12.1: Vector Single-Width Saturating Add and Subtract ++ VSADDUVV V1, V2, V3 // d7812082 ++ VSADDUVV V1, V2, V0, V3 // d7812080 ++ VSADDUVX X10, V2, V3 // d7412582 ++ VSADDUVX X10, V2, V0, V3 // d7412580 ++ VSADDUVI $15, V2, V3 // d7b12782 ++ VSADDUVI $15, V2, V0, V3 // d7b12780 ++ VSADDVV V1, V2, V3 // d7812086 ++ VSADDVV V1, V2, V0, V3 // d7812084 ++ VSADDVX X10, V2, V3 // d7412586 ++ VSADDVX X10, V2, V0, V3 // d7412584 ++ VSADDVI $15, V2, V3 // d7b12786 ++ VSADDVI $15, V2, V0, V3 // d7b12784 ++ VSSUBUVV V1, V2, V3 // d781208a ++ VSSUBUVV V1, V2, V0, V3 // d7812088 ++ VSSUBUVX X10, V2, V3 // d741258a ++ VSSUBUVX X10, V2, V0, V3 // d7412588 ++ VSSUBVV V1, V2, V3 // d781208e ++ VSSUBVV V1, V2, V0, V3 // d781208c ++ VSSUBVX X10, V2, V3 // d741258e ++ VSSUBVX X10, V2, V0, V3 // d741258c ++ ++ // 31.12.2: Vector Single-Width Averaging Add and Subtract ++ VAADDUVV V1, V2, V3 // d7a12022 ++ VAADDUVV V1, V2, V0, V3 // d7a12020 ++ VAADDUVX X10, V2, V3 // d7612522 ++ VAADDUVX X10, V2, V0, V3 // d7612520 ++ VAADDVV V1, V2, V3 // d7a12026 ++ VAADDVV V1, V2, V0, V3 // d7a12024 ++ VAADDVX X10, V2, V3 // d7612526 ++ VAADDVX X10, V2, V0, V3 // d7612524 ++ VASUBUVV V1, V2, V3 // d7a1202a ++ VASUBUVV V1, V2, V0, V3 // d7a12028 ++ VASUBUVX X10, V2, V3 // d761252a ++ VASUBUVX X10, V2, V0, V3 // d7612528 ++ VASUBVV V1, V2, V3 // d7a1202e ++ VASUBVV V1, V2, V0, V3 // d7a1202c ++ VASUBVX X10, V2, V3 // d761252e ++ VASUBVX X10, V2, V0, V3 // d761252c ++ ++ // 31.12.3: Vector Single-Width Fractional Multiply with Rounding and Saturation ++ VSMULVV V1, V2, V3 // d781209e ++ VSMULVV V1, V2, V0, V3 // d781209c ++ VSMULVX X10, V2, V3 // d741259e ++ VSMULVX X10, V2, V0, V3 // d741259c ++ ++ // 31.12.4: Vector Single-Width Scaling Shift Instructions ++ VSSRLVV V1, V2, V3 // d78120aa ++ VSSRLVV V1, V2, V0, V3 // d78120a8 ++ VSSRLVX X10, V2, V3 // d74125aa ++ VSSRLVX X10, V2, V0, V3 // d74125a8 ++ VSSRLVI $15, V2, V3 // d7b127aa ++ VSSRLVI $15, V2, V0, V3 // d7b127a8 ++ VSSRAVV V1, V2, V3 // d78120ae ++ VSSRAVV V1, V2, V0, V3 // d78120ac ++ VSSRAVX X10, V2, V3 // d74125ae ++ VSSRAVX X10, V2, V0, V3 // d74125ac ++ VSSRAVI 
$16, V2, V3 // d73128ae ++ VSSRAVI $16, V2, V0, V3 // d73128ac ++ ++ // 31.12.5: Vector Narrowing Fixed-Point Clip Instructions ++ VNCLIPUWV V1, V2, V3 // d78120ba ++ VNCLIPUWV V1, V2, V0, V3 // d78120b8 ++ VNCLIPUWX X10, V2, V3 // d74125ba ++ VNCLIPUWX X10, V2, V0, V3 // d74125b8 ++ VNCLIPUWI $16, V2, V3 // d73128ba ++ VNCLIPUWI $16, V2, V0, V3 // d73128b8 ++ VNCLIPWV V1, V2, V3 // d78120be ++ VNCLIPWV V1, V2, V0, V3 // d78120bc ++ VNCLIPWX X10, V2, V3 // d74125be ++ VNCLIPWX X10, V2, V0, V3 // d74125bc ++ VNCLIPWI $16, V2, V3 // d73128be ++ VNCLIPWI $16, V2, V0, V3 // d73128bc ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 025d63a15c..6a7c9b9444 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -221,5 +221,37 @@ TEXT errors(SB),$0 + VMVVV V1, V2, V3 // ERROR "too many operands for instruction" + VMVVX X10, V2, V3 // ERROR "too many operands for instruction" + VMVVI $15, V2, V3 // ERROR "too many operands for instruction" ++ VSADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWI $16, V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 602cab2c2e..c6f71e64fb 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ 
b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -237,5 +237,37 @@ TEXT validation(SB),$0 + VMVVX V1, V2 // ERROR "expected integer register in rs1 position" + VMVVI $16, V2 // ERROR "signed immediate 16 must be in range [-16, 15]" + VMVVI $-17, V2 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VSADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSADDUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VSADDUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VSSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VAADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VAADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VAADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VAADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VASUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VASUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VASUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VASUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRLVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VSSRLVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VSSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRAVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VSSRAVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VNCLIPUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNCLIPUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNCLIPUWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VNCLIPUWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VNCLIPWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNCLIPWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNCLIPWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VNCLIPWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" + + RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index d85bdd302c..e7870000cf 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2286,6 +2286,48 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVMVVX & obj.AMask: {enc: rVIVEncoding}, + AVMVVI & obj.AMask: {enc: rVViEncoding}, + ++ // 31.12.1: Vector Single-Width Saturating Add and Subtract ++ AVSADDUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSADDUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSADDUVI & obj.AMask: {enc: rVViEncoding}, ++ AVSADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSADDVI & obj.AMask: {enc: rVViEncoding}, ++ AVSSUBUVV & obj.AMask: {enc: rVVVEncoding}, 
++ AVSSUBUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSSUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSSUBVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.12.2: Vector Single-Width Averaging Add and Subtract ++ AVAADDUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVAADDUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVAADDVV & obj.AMask: {enc: rVVVEncoding}, ++ AVAADDVX & obj.AMask: {enc: rVIVEncoding}, ++ AVASUBUVV & obj.AMask: {enc: rVVVEncoding}, ++ AVASUBUVX & obj.AMask: {enc: rVIVEncoding}, ++ AVASUBVV & obj.AMask: {enc: rVVVEncoding}, ++ AVASUBVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.12.3: Vector Single-Width Fractional Multiply with Rounding and Saturation ++ AVSMULVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSMULVX & obj.AMask: {enc: rVIVEncoding}, ++ ++ // 31.12.4: Vector Single-Width Scaling Shift Instructions ++ AVSSRLVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSSRLVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSSRLVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSSRAVV & obj.AMask: {enc: rVVVEncoding}, ++ AVSSRAVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSSRAVI & obj.AMask: {enc: rVVuEncoding}, ++ ++ // 31.12.5: Vector Narrowing Fixed-Point Clip Instructions ++ AVNCLIPUWV & obj.AMask: {enc: rVVVEncoding}, ++ AVNCLIPUWX & obj.AMask: {enc: rVIVEncoding}, ++ AVNCLIPUWI & obj.AMask: {enc: rVVuEncoding}, ++ AVNCLIPWV & obj.AMask: {enc: rVVVEncoding}, ++ AVNCLIPWX & obj.AMask: {enc: rVIVEncoding}, ++ AVNCLIPWI & obj.AMask: {enc: rVVuEncoding}, ++ + // + // Privileged ISA + // +@@ -3267,10 +3309,13 @@ func instructionsForProg(p *obj.Prog) []*instruction { + AVMINUVV, AVMINUVX, AVMINVV, AVMINVX, AVMAXUVV, AVMAXUVX, AVMAXVV, AVMAXVX, + AVMULVV, AVMULVX, AVMULHVV, AVMULHVX, AVMULHUVV, AVMULHUVX, AVMULHSUVV, AVMULHSUVX, + AVDIVUVV, AVDIVUVX, AVDIVVV, AVDIVVX, AVREMUVV, AVREMUVX, AVREMVV, AVREMVX, +- AVWMULVV, AVWMULVX, AVWMULUVV, AVWMULUVX, AVWMULSUVV, AVWMULSUVX, +- AVNSRLWV, AVNSRLWX, AVNSRAWV, AVNSRAWX, ++ AVWMULVV, AVWMULVX, AVWMULUVV, AVWMULUVX, AVWMULSUVV, AVWMULSUVX, AVNSRLWV, AVNSRLWX, AVNSRAWV, AVNSRAWX, + AVMACCVV, AVMACCVX, AVNMSACVV, AVNMSACVX, AVMADDVV, AVMADDVX, AVNMSUBVV, AVNMSUBVX, +- AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX: ++ AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX, ++ AVSADDUVV, AVSADDUVX, AVSADDUVI, AVSADDVV, AVSADDVX, AVSADDVI, AVSSUBUVV, AVSSUBUVX, AVSSUBVV, AVSSUBVX, ++ AVAADDUVV, AVAADDUVX, AVAADDVV, AVAADDVX, AVASUBUVV, AVASUBUVX, AVASUBVV, AVASUBVX, ++ AVSMULVV, AVSMULVX, AVSSRLVV, AVSSRLVX, AVSSRLVI, AVSSRAVV, AVSSRAVX, AVSSRAVI, ++ AVNCLIPUWV, AVNCLIPUWX, AVNCLIPUWI, AVNCLIPWV, AVNCLIPWX, AVNCLIPWI: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +-- +2.39.5 + diff --git a/2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch b/2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch new file mode 100644 index 0000000..0d5cc8e --- /dev/null +++ b/2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch @@ -0,0 +1,66 @@ +From cdba1be01401e1cbb07ea3f4a94b860c0917b7d0 Mon Sep 17 00:00:00 2001 +From: Julian Zhu +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 091/119] crypto/sha512: remove unnecessary move op, replace + with direct add +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: riscv64 +pkg: crypto/sha512 + │ o │ n │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-4 3.499µ ± 0% 3.444µ ± 0% -1.56% (p=0.000 n=10) +Hash8Bytes/Sum384-4 4.012µ ± 0% 3.957µ ± 0% -1.37% (p=0.000 n=10) 
+Hash8Bytes/Sum512-4 4.218µ ± 0% 4.162µ ± 0% -1.32% (p=0.000 n=10) +Hash1K/New-4 17.07µ ± 0% 16.57µ ± 0% -2.97% (p=0.000 n=10) +Hash1K/Sum384-4 17.59µ ± 0% 17.11µ ± 0% -2.76% (p=0.000 n=10) +Hash1K/Sum512-4 17.78µ ± 0% 17.30µ ± 0% -2.72% (p=0.000 n=10) +Hash8K/New-4 112.2µ ± 0% 108.7µ ± 0% -3.08% (p=0.000 n=10) +Hash8K/Sum384-4 112.7µ ± 0% 109.2µ ± 0% -3.09% (p=0.000 n=10) +Hash8K/Sum512-4 112.9µ ± 0% 109.4µ ± 0% -3.07% (p=0.000 n=10) +geomean 19.72µ 19.24µ -2.44% + + │ o │ n │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-4 2.184Mi ± 0% 2.213Mi ± 0% +1.31% (p=0.000 n=10) +Hash8Bytes/Sum384-4 1.898Mi ± 1% 1.926Mi ± 0% +1.51% (p=0.000 n=10) +Hash8Bytes/Sum512-4 1.812Mi ± 1% 1.831Mi ± 0% +1.05% (p=0.000 n=10) +Hash1K/New-4 57.20Mi ± 0% 58.95Mi ± 0% +3.06% (p=0.000 n=10) +Hash1K/Sum384-4 55.51Mi ± 0% 57.09Mi ± 0% +2.84% (p=0.000 n=10) +Hash1K/Sum512-4 54.91Mi ± 0% 56.44Mi ± 0% +2.79% (p=0.000 n=10) +Hash8K/New-4 69.63Mi ± 0% 71.84Mi ± 0% +3.17% (p=0.000 n=10) +Hash8K/Sum384-4 69.30Mi ± 0% 71.52Mi ± 0% +3.20% (p=0.000 n=10) +Hash8K/Sum512-4 69.19Mi ± 0% 71.39Mi ± 0% +3.18% (p=0.000 n=10) +geomean 19.65Mi 20.13Mi +2.45% + +Change-Id: Ib68b934276ec08246d4ae60ef9870c233f0eac69 +Reviewed-on: https://go-review.googlesource.com/c/go/+/665595 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +Reviewed-by: Joel Sing +Reviewed-by: Roland Shoemaker +--- + src/crypto/sha512/sha512block_riscv64.s | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/src/crypto/sha512/sha512block_riscv64.s b/src/crypto/sha512/sha512block_riscv64.s +index 0281464e4d..2bc5d889ea 100644 +--- a/src/crypto/sha512/sha512block_riscv64.s ++++ b/src/crypto/sha512/sha512block_riscv64.s +@@ -136,9 +136,8 @@ + #define SHA512ROUND(index, a, b, c, d, e, f, g, h) \ + SHA512T1(index, e, f, g, h); \ + SHA512T2(a, b, c); \ +- MOV X6, h; \ + ADD X5, d; \ +- ADD X5, h ++ ADD X6, X5, h + + #define SHA512ROUND0(index, a, b, c, d, e, f, g, h) \ + MSGSCHEDULE0(index); \ +-- +2.39.5 + diff --git a/2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch b/2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch new file mode 100644 index 0000000..2d269eb --- /dev/null +++ b/2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch @@ -0,0 +1,120 @@ +From a70b3bc13e145ac78e32c1104cce9a511200f286 Mon Sep 17 00:00:00 2001 +From: Julian Zhu +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 092/119] crypto/sha256: improve performance of riscv64 + assembly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Simplified the implementation of Ch and Maj by reducing instructions, based on CL 605495 which made the same change for SHA-512. 
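
The rewrite rests on two bit-level identities: Ch(x, y, z) = (x AND y) XOR (NOT x AND z) = ((y XOR z) AND x) XOR z, and Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) = ((y XOR z) AND x) XOR (y AND z); each form drops the NOT and one AND per round. The short Go sketch below is not part of the patch and assumes nothing beyond the standard library; it only sanity-checks those identities against the reference definitions used in the assembly comments.

// sketch: verify the Ch/Maj identities relied on by the riscv64 SHA-256/SHA-512 assembly.
package main

import (
	"fmt"
	"math/rand"
)

func main() {
	for i := 0; i < 1000000; i++ {
		x, y, z := rand.Uint32(), rand.Uint32(), rand.Uint32()
		// Reference definitions (FIPS 180-4).
		ch := (x & y) ^ (^x & z)
		maj := (x & y) ^ (x & z) ^ (y & z)
		// Rewritten forms: shared (y XOR z) term, no NOT.
		ch2 := ((y ^ z) & x) ^ z
		maj2 := ((y ^ z) & x) ^ (y & z)
		if ch != ch2 || maj != maj2 {
			panic(fmt.Sprintf("identity mismatch for x=%#x y=%#x z=%#x", x, y, z))
		}
	}
	fmt.Println("Ch/Maj identities hold")
}

Independently of the identities, the diff further down (as in the preceding SHA-512 patch) also removes the MOV in the round macro by forming T1+T2 directly into h with a three-operand ADD.
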
+ +goos: linux +goarch: riscv64 +pkg: crypto/sha256 +cpu: Spacemit(R) X60 + │ oldsha256 │ newsha256 │ + │ sec/op │ sec/op vs base │ +Hash8Bytes/New-8 2.303µ ± 0% 2.098µ ± 0% -8.90% (p=0.000 n=10) +Hash8Bytes/Sum224-8 2.535µ ± 0% 2.329µ ± 0% -8.13% (p=0.000 n=10) +Hash8Bytes/Sum256-8 2.558µ ± 0% 2.352µ ± 0% -8.04% (p=0.000 n=10) +Hash1K/New-8 28.67µ ± 0% 25.21µ ± 0% -12.06% (p=0.000 n=10) +Hash1K/Sum224-8 28.89µ ± 0% 25.43µ ± 0% -11.99% (p=0.000 n=10) +Hash1K/Sum256-8 28.91µ ± 0% 25.43µ ± 0% -12.04% (p=0.000 n=10) +Hash8K/New-8 218.0µ ± 1% 192.7µ ± 2% -11.58% (p=0.000 n=10) +Hash8K/Sum224-8 218.0µ ± 1% 193.6µ ± 1% -11.20% (p=0.000 n=10) +Hash8K/Sum256-8 219.1µ ± 1% 193.4µ ± 1% -11.74% (p=0.000 n=10) +geomean 24.93µ 22.28µ -10.65% + + │ oldsha256 │ newsha256 │ + │ B/s │ B/s vs base │ +Hash8Bytes/New-8 3.309Mi ± 0% 3.633Mi ± 0% +9.80% (p=0.000 n=10) +Hash8Bytes/Sum224-8 3.009Mi ± 0% 3.271Mi ± 0% +8.72% (p=0.000 n=10) +Hash8Bytes/Sum256-8 2.985Mi ± 0% 3.242Mi ± 0% +8.63% (p=0.000 n=10) +Hash1K/New-8 34.06Mi ± 0% 38.73Mi ± 0% +13.72% (p=0.000 n=10) +Hash1K/Sum224-8 33.80Mi ± 0% 38.40Mi ± 0% +13.63% (p=0.000 n=10) +Hash1K/Sum256-8 33.78Mi ± 0% 38.40Mi ± 0% +13.69% (p=0.000 n=10) +Hash8K/New-8 35.84Mi ± 1% 40.54Mi ± 2% +13.10% (p=0.000 n=10) +Hash8K/Sum224-8 35.83Mi ± 1% 40.35Mi ± 1% +12.61% (p=0.000 n=10) +Hash8K/Sum256-8 35.66Mi ± 1% 40.40Mi ± 1% +13.29% (p=0.000 n=10) +geomean 15.54Mi 17.39Mi +11.89% + +Change-Id: I9aa692fcfd70634dc6c308db9b5d06bd82ac2302 +Reviewed-on: https://go-review.googlesource.com/c/go/+/639495 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Joel Sing +Reviewed-by: Junyang Shao +Reviewed-by: David Chase +Reviewed-by: Meng Zhuo +--- + src/crypto/sha256/sha256block_riscv64.s | 27 ++++++++++++------------- + 1 file changed, 13 insertions(+), 14 deletions(-) + +diff --git a/src/crypto/sha256/sha256block_riscv64.s b/src/crypto/sha256/sha256block_riscv64.s +index fc7bf65e41..2cf7454ba4 100644 +--- a/src/crypto/sha256/sha256block_riscv64.s ++++ b/src/crypto/sha256/sha256block_riscv64.s +@@ -86,47 +86,46 @@ + // T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt + // BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x) + // Ch(x, y, z) = (x AND y) XOR (NOT x AND z) ++// = ((y XOR z) AND x) XOR z + #define SHA256T1(index, e, f, g, h) \ + MOVWU (index*4)(X18), X8; \ + ADD X5, h; \ + RORW $6, e, X6; \ + ADD X8, h; \ + RORW $11, e, X7; \ +- XOR X7, X6; \ + RORW $25, e, X8; \ ++ XOR X7, X6; \ ++ XOR f, g, X5; \ + XOR X8, X6; \ ++ AND e, X5; \ + ADD X6, h; \ +- AND e, f, X5; \ +- NOT e, X7; \ +- AND g, X7; \ +- XOR X7, X5; \ ++ XOR g, X5; \ + ADD h, X5 + + // Calculate T2 in X6. + // T2 = BIGSIGMA0(a) + Maj(a, b, c) + // BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x) + // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) ++// = ((y XOR z) AND x) XOR (y AND z) + #define SHA256T2(a, b, c) \ + RORW $2, a, X6; \ + RORW $13, a, X7; \ +- XOR X7, X6; \ + RORW $22, a, X8; \ ++ XOR X7, X6; \ ++ XOR b, c, X9; \ ++ AND b, c, X7; \ ++ AND a, X9; \ + XOR X8, X6; \ +- AND a, b, X7; \ +- AND a, c, X8; \ +- XOR X8, X7; \ +- AND b, c, X9; \ +- XOR X9, X7; \ +- ADD X7, X6 ++ XOR X7, X9; \ ++ ADD X9, X6 + + // Calculate T1 and T2, then e = d + T1 and a = T1 + T2. + // The values for e and a are stored in d and h, ready for rotation. 
+ #define SHA256ROUND(index, a, b, c, d, e, f, g, h) \ + SHA256T1(index, e, f, g, h); \ + SHA256T2(a, b, c); \ +- MOV X6, h; \ + ADD X5, d; \ +- ADD X5, h ++ ADD X6, X5, h + + #define SHA256ROUND0(index, a, b, c, d, e, f, g, h) \ + MSGSCHEDULE0(index); \ +-- +2.39.5 + diff --git a/2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch b/2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch new file mode 100644 index 0000000..196a397 --- /dev/null +++ b/2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch @@ -0,0 +1,81 @@ +From a21957e9bed511d3e108c993e88bfe14c3efc66b Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 093/119] cmd/link: fix cgo on riscv64 when building with + gcc-15 + +It's not currently possible to build cgo programs that are partially +compiled with gcc-15 on riscv64 using the internal linker. There are +two reasons for this. + +1. When gcc-15 compiles _cgo_export.c, which contains no actual code, + for a riscv64 target, it emits a label in the .text section called + .Letext0. This label is referred to by another section, .debug_line, + and an entry is generated in the symbol table for it. The Go linker + panics when processing the .Letext0 symbol in _cgo_export.o, as it + occurs in an empty section. +2. GCC-15 is generating additional debug symbols with the .LVUS + prefix, e.g., .LVUS33, that need to be ignored. + +We fix the issue by removing the check in +cmd/link/internal/loader/loader.go that panics if we encounter a +symbol in an empty section (the comments preceding this check suggest +it's safe to remove it) and by adding .LVUS to the list of symbol +prefixes to ignore. + +Fixes #72840 + +Change-Id: I00658b6bdd01606dde1581b5bc2f42edfc37de82 +Reviewed-on: https://go-review.googlesource.com/c/go/+/668276 +Auto-Submit: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Joel Sing +Reviewed-by: Carlos Amedee +Reviewed-by: Meng Zhuo +--- + src/cmd/link/internal/loadelf/ldelf.go | 8 +++++++- + src/cmd/link/internal/loader/loader.go | 8 -------- + 2 files changed, 7 insertions(+), 9 deletions(-) + +diff --git a/src/cmd/link/internal/loadelf/ldelf.go b/src/cmd/link/internal/loadelf/ldelf.go +index 942d54c06c..dea77bafcb 100644 +--- a/src/cmd/link/internal/loadelf/ldelf.go ++++ b/src/cmd/link/internal/loadelf/ldelf.go +@@ -604,7 +604,13 @@ func Load(l *loader.Loader, arch *sys.Arch, localSymVersion int, f *bio.Reader, + if strings.HasPrefix(elfsym.name, ".LASF") { // gcc on s390x does this + continue + } +- return errorf("%v: sym#%d (%s): ignoring symbol in section %d (type %d)", elfsym.sym, i, elfsym.name, elfsym.shndx, elfsym.type_) ++ ++ if strings.HasPrefix(elfsym.name, ".LASF") || strings.HasPrefix(elfsym.name, ".LLRL") || strings.HasPrefix(elfsym.name, ".LLST") || strings.HasPrefix(elfsym.name, ".LVUS") { ++ // gcc on s390x and riscv64 does this. 
++ continue ++ } ++ ++ return errorf("%v: sym#%d (%q): ignoring symbol in section %d (%q) (type %d)", elfsym.sym, i, elfsym.name, elfsym.shndx, sect.name, elfsym.type_) + } + + s := elfsym.sym +diff --git a/src/cmd/link/internal/loader/loader.go b/src/cmd/link/internal/loader/loader.go +index 4d0b497d8e..223ce5dd59 100644 +--- a/src/cmd/link/internal/loader/loader.go ++++ b/src/cmd/link/internal/loader/loader.go +@@ -1724,14 +1724,6 @@ func (l *Loader) GetVarDwarfAuxSym(i Sym) Sym { + // expected to have the actual content/payload) and then a set of + // interior loader.Sym's that point into a portion of the container. + func (l *Loader) AddInteriorSym(container Sym, interior Sym) { +- // Container symbols are expected to have content/data. +- // NB: this restriction may turn out to be too strict (it's possible +- // to imagine a zero-sized container with an interior symbol pointing +- // into it); it's ok to relax or remove it if we counter an +- // oddball host object that triggers this. +- if l.SymSize(container) == 0 && len(l.Data(container)) == 0 { +- panic("unexpected empty container symbol") +- } + // The interior symbols for a container are not expected to have + // content/data or relocations. + if len(l.Data(interior)) != 0 { +-- +2.39.5 + diff --git a/2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch b/2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch new file mode 100644 index 0000000..5e55e2f --- /dev/null +++ b/2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch @@ -0,0 +1,63 @@ +From 95ad58dbd546edbcc4109fb6603254c4bf60dcb1 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 094/119] internal/bytealg: deduplicate code between + Count/CountString for riscv64 + +Change-Id: I22eb4e7444e5fe5f6767cc960895f3c6e2fa13cc +Reviewed-on: https://go-review.googlesource.com/c/go/+/661615 +Reviewed-by: Keith Randall +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +Auto-Submit: Carlos Amedee +Reviewed-by: Carlos Amedee +--- + src/internal/bytealg/count_riscv64.s | 28 +++++++--------------------- + 1 file changed, 7 insertions(+), 21 deletions(-) + +diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s +index 3f255cd263..6cc49d1388 100644 +--- a/src/internal/bytealg/count_riscv64.s ++++ b/src/internal/bytealg/count_riscv64.s +@@ -5,6 +5,13 @@ + #include "go_asm.h" + #include "textflag.h" + ++TEXT ·CountString(SB),NOSPLIT,$0-32 ++ // X10 = s_base ++ // X11 = s_len ++ // X12 = byte to count ++ MOV X12, X13 ++ JMP ·Count(SB) ++ + TEXT ·Count(SB),NOSPLIT,$0-40 + // X10 = b_base + // X11 = b_len +@@ -26,24 +33,3 @@ loop: + done: + MOV X14, X10 + RET +- +-TEXT ·CountString(SB),NOSPLIT,$0-32 +- // X10 = s_base +- // X11 = s_len +- // X12 = byte to count +- AND $0xff, X12 +- MOV ZERO, X14 // count +- ADD X10, X11 // end +- +- PCALIGN $16 +-loop: +- BEQ X10, X11, done +- MOVBU (X10), X15 +- ADD $1, X10 +- BNE X12, X15, loop +- ADD $1, X14 +- JMP loop +- +-done: +- MOV X14, X10 +- RET +-- +2.39.5 + diff --git a/2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch b/2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch new file mode 100644 index 0000000..6931395 --- /dev/null +++ b/2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch @@ -0,0 +1,1735 @@ +From 6d2bdcd0a446828a3209372bfea6ccb948c40bcc Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 095/119] cmd/internal/obj/riscv: add support for 
vector + floating-point instructions + +Add support for vector floating-point instructions to the RISC-V +assembler. This includes single-width and widening addition and +subtraction, multiplication and division, fused multiply-addition, +comparison, min/max, sign-injection, classification and type +conversion instructions. + +Change-Id: I8bceb1c5d7eead0561ba5407ace00805a6144f51 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646777 +Reviewed-by: Carlos Amedee +Reviewed-by: Junyang Shao +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 224 +++++++ + .../asm/internal/asm/testdata/riscv64error.s | 500 +++++++++------ + .../internal/asm/testdata/riscv64validation.s | 603 ++++++++++-------- + src/cmd/internal/obj/riscv/anames.go | 4 + + src/cmd/internal/obj/riscv/cpu.go | 4 + + src/cmd/internal/obj/riscv/obj.go | 213 ++++++- + 6 files changed, 1076 insertions(+), 472 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 506fe2a442..7e2a070bd0 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -937,6 +937,230 @@ start: + VNCLIPWI $16, V2, V3 // d73128be + VNCLIPWI $16, V2, V0, V3 // d73128bc + ++ // 31.13.2: Vector Single-Width Floating-Point Add/Subtract Instructions ++ VFADDVV V1, V2, V3 // d7912002 ++ VFADDVV V1, V2, V0, V3 // d7912000 ++ VFADDVF F10, V2, V3 // d7512502 ++ VFADDVF F10, V2, V0, V3 // d7512500 ++ VFSUBVV V1, V2, V3 // d791200a ++ VFSUBVV V1, V2, V0, V3 // d7912008 ++ VFSUBVF F10, V2, V3 // d751250a ++ VFSUBVF F10, V2, V0, V3 // d7512508 ++ VFRSUBVF F10, V2, V3 // d751259e ++ VFRSUBVF F10, V2, V0, V3 // d751259c ++ ++ // 31.13.3: Vector Widening Floating-Point Add/Subtract Instructions ++ VFWADDVV V1, V2, V3 // d79120c2 ++ VFWADDVV V1, V2, V0, V3 // d79120c0 ++ VFWADDVF F10, V2, V3 // d75125c2 ++ VFWADDVF F10, V2, V0, V3 // d75125c0 ++ VFWSUBVV V1, V2, V3 // d79120ca ++ VFWSUBVV V1, V2, V0, V3 // d79120c8 ++ VFWSUBVF F10, V2, V3 // d75125ca ++ VFWSUBVF F10, V2, V0, V3 // d75125c8 ++ VFWADDWV V1, V2, V3 // d79120d2 ++ VFWADDWV V1, V2, V0, V3 // d79120d0 ++ VFWADDWF F10, V2, V3 // d75125d2 ++ VFWADDWF F10, V2, V0, V3 // d75125d0 ++ VFWSUBWV V1, V2, V3 // d79120da ++ VFWSUBWV V1, V2, V0, V3 // d79120d8 ++ VFWSUBWF F10, V2, V3 // d75125da ++ VFWSUBWF F10, V2, V0, V3 // d75125d8 ++ ++ // 31.13.4: Vector Single-Width Floating-Point Multiply/Divide Instructions ++ VFMULVV V1, V2, V3 // d7912092 ++ VFMULVV V1, V2, V0, V3 // d7912090 ++ VFMULVF F10, V2, V3 // d7512592 ++ VFMULVF F10, V2, V0, V3 // d7512590 ++ VFDIVVV V1, V2, V3 // d7912082 ++ VFDIVVV V1, V2, V0, V3 // d7912080 ++ VFDIVVF F10, V2, V3 // d7512582 ++ VFDIVVF F10, V2, V0, V3 // d7512580 ++ VFRDIVVF F10, V2, V3 // d7512586 ++ VFRDIVVF F10, V2, V0, V3 // d7512584 ++ ++ // 31.13.5: Vector Widening Floating-Point Multiply ++ VFWMULVV V1, V2, V3 // d79120e2 ++ VFWMULVV V1, V2, V0, V3 // d79120e0 ++ VFWMULVF F10, V2, V3 // d75125e2 ++ VFWMULVF F10, V2, V0, V3 // d75125e0 ++ ++ // 31.13.6: Vector Single-Width Floating-Point Fused Multiply-Add Instructions ++ VFMACCVV V2, V1, V3 // d79120b2 ++ VFMACCVV V2, V1, V0, V3 // d79120b0 ++ VFMACCVF V2, F10, V3 // d75125b2 ++ VFMACCVF V2, F10, V0, V3 // d75125b0 ++ VFNMACCVV V2, V1, V3 // d79120b6 ++ VFNMACCVV V2, V1, V0, V3 // d79120b4 ++ VFNMACCVF V2, F10, V3 // d75125b6 ++ VFNMACCVF V2, F10, V0, V3 // d75125b4 ++ VFMSACVV V2, V1, V3 // d79120ba ++ VFMSACVV V2, V1, V0, 
V3 // d79120b8 ++ VFMSACVF V2, F10, V3 // d75125ba ++ VFMSACVF V2, F10, V0, V3 // d75125b8 ++ VFNMSACVV V2, V1, V3 // d79120be ++ VFNMSACVV V2, V1, V0, V3 // d79120bc ++ VFNMSACVF V2, F10, V3 // d75125be ++ VFNMSACVF V2, F10, V0, V3 // d75125bc ++ VFMADDVV V2, V1, V3 // d79120a2 ++ VFMADDVV V2, V1, V0, V3 // d79120a0 ++ VFMADDVF V2, F10, V3 // d75125a2 ++ VFMADDVF V2, F10, V0, V3 // d75125a0 ++ VFNMADDVV V2, V1, V3 // d79120a6 ++ VFNMADDVV V2, V1, V0, V3 // d79120a4 ++ VFNMADDVF V2, F10, V3 // d75125a6 ++ VFNMADDVF V2, F10, V0, V3 // d75125a4 ++ VFMSUBVV V2, V1, V3 // d79120aa ++ VFMSUBVV V2, V1, V0, V3 // d79120a8 ++ VFMSUBVF V2, F10, V3 // d75125aa ++ VFMSUBVF V2, F10, V0, V3 // d75125a8 ++ VFNMSUBVV V2, V1, V3 // d79120ae ++ VFNMSUBVV V2, V1, V0, V3 // d79120ac ++ VFNMSUBVF V2, F10, V3 // d75125ae ++ VFNMSUBVF V2, F10, V0, V3 // d75125ac ++ ++ // 31.13.7: Vector Widening Floating-Point Fused Multiply-Add Instructions ++ VFWMACCVV V2, V1, V3 // d79120f2 ++ VFWMACCVV V2, V1, V0, V3 // d79120f0 ++ VFWMACCVF V2, F10, V3 // d75125f2 ++ VFWMACCVF V2, F10, V0, V3 // d75125f0 ++ VFWNMACCVV V2, V1, V3 // d79120f6 ++ VFWNMACCVV V2, V1, V0, V3 // d79120f4 ++ VFWNMACCVF V2, F10, V3 // d75125f6 ++ VFWNMACCVF V2, F10, V0, V3 // d75125f4 ++ VFWMSACVV V2, V1, V3 // d79120fa ++ VFWMSACVV V2, V1, V0, V3 // d79120f8 ++ VFWMSACVF V2, F10, V3 // d75125fa ++ VFWMSACVF V2, F10, V0, V3 // d75125f8 ++ VFWNMSACVV V2, V1, V3 // d79120fe ++ VFWNMSACVV V2, V1, V0, V3 // d79120fc ++ VFWNMSACVF V2, F10, V3 // d75125fe ++ VFWNMSACVF V2, F10, V0, V3 // d75125fc ++ ++ // 31.13.8: Vector Floating-Point Square-Root Instruction ++ VFSQRTV V2, V3 // d711204e ++ VFSQRTV V2, V0, V3 // d711204c ++ ++ // 31.13.9: Vector Floating-Point Reciprocal Square-Root Estimate Instruction ++ VFRSQRT7V V2, V3 // d711224e ++ VFRSQRT7V V2, V0, V3 // d711224c ++ ++ // 31.13.10: Vector Floating-Point Reciprocal Estimate Instruction ++ VFREC7V V2, V3 // d791224e ++ VFREC7V V2, V0, V3 // d791224c ++ ++ // 31.13.11: Vector Floating-Point MIN/MAX Instructions ++ VFMINVV V1, V2, V3 // d7912012 ++ VFMINVV V1, V2, V0, V3 // d7912010 ++ VFMINVF F10, V2, V3 // d7512512 ++ VFMINVF F10, V2, V0, V3 // d7512510 ++ VFMAXVV V1, V2, V3 // d791201a ++ VFMAXVV V1, V2, V0, V3 // d7912018 ++ VFMAXVF F10, V2, V3 // d751251a ++ VFMAXVF F10, V2, V0, V3 // d7512518 ++ ++ // 31.13.12: Vector Floating-Point Sign-Injection Instructions ++ VFSGNJVV V1, V2, V3 // d7912022 ++ VFSGNJVV V1, V2, V0, V3 // d7912020 ++ VFSGNJVF F10, V2, V3 // d7512522 ++ VFSGNJVF F10, V2, V0, V3 // d7512520 ++ VFSGNJNVV V1, V2, V3 // d7912026 ++ VFSGNJNVV V1, V2, V0, V3 // d7912024 ++ VFSGNJNVF F10, V2, V3 // d7512526 ++ VFSGNJNVF F10, V2, V0, V3 // d7512524 ++ VFSGNJXVV V1, V2, V3 // d791202a ++ VFSGNJXVV V1, V2, V0, V3 // d7912028 ++ VFSGNJXVF F10, V2, V3 // d751252a ++ VFSGNJXVF F10, V2, V0, V3 // d7512528 ++ VFNEGV V2, V3 // d7112126 ++ VFNEGV V2, V0, V3 // d7112124 ++ VFABSV V2, V3 // d711212a ++ VFABSV V2, V0, V3 // d7112128 ++ ++ // 31.13.13: Vector Floating-Point Compare Instructions ++ VMFEQVV V1, V2, V3 // d7912062 ++ VMFEQVV V1, V2, V0, V3 // d7912060 ++ VMFEQVF F10, V2, V3 // d7512562 ++ VMFEQVF F10, V2, V0, V3 // d7512560 ++ VMFNEVV V1, V2, V3 // d7912072 ++ VMFNEVV V1, V2, V0, V3 // d7912070 ++ VMFNEVF F10, V2, V3 // d7512572 ++ VMFNEVF F10, V2, V0, V3 // d7512570 ++ VMFLTVV V1, V2, V3 // d791206e ++ VMFLTVV V1, V2, V0, V3 // d791206c ++ VMFLTVF F10, V2, V3 // d751256e ++ VMFLTVF F10, V2, V0, V3 // d751256c ++ VMFLEVV V1, V2, V3 // d7912066 ++ VMFLEVV V1, V2, V0, V3 // d7912064 ++ 
VMFLEVF F10, V2, V3 // d7512566 ++ VMFLEVF F10, V2, V0, V3 // d7512564 ++ VMFGTVF F10, V2, V3 // d7512576 ++ VMFGTVF F10, V2, V0, V3 // d7512574 ++ VMFGEVF F10, V2, V3 // d751257e ++ VMFGEVF F10, V2, V0, V3 // d751257c ++ VMFGTVV V1, V2, V3 // d711116e ++ VMFGTVV V1, V2, V0, V3 // d711116c ++ VMFGEVV V1, V2, V3 // d7111166 ++ VMFGEVV V1, V2, V0, V3 // d7111164 ++ ++ // 31.13.14: Vector Floating-Point Classify Instruction ++ VFCLASSV V2, V3 // d711284e ++ VFCLASSV V2, V0, V3 // d711284c ++ ++ // 31.13.15: Vector Floating-Point Merge Instruction ++ VFMERGEVFM F10, V2, V0, V3 // d751255c ++ ++ // 31.13.16: Vector Floating-Point Move Instruction ++ VFMVVF F10, V3 // d751055e ++ ++ // 31.13.17: Single-Width Floating-Point/Integer Type-Convert Instructions ++ VFCVTXUFV V2, V3 // d711204a ++ VFCVTXUFV V2, V0, V3 // d7112048 ++ VFCVTXFV V2, V3 // d791204a ++ VFCVTXFV V2, V0, V3 // d7912048 ++ VFCVTRTZXUFV V2, V3 // d711234a ++ VFCVTRTZXUFV V2, V0, V3 // d7112348 ++ VFCVTRTZXFV V2, V3 // d791234a ++ VFCVTRTZXFV V2, V0, V3 // d7912348 ++ VFCVTFXUV V2, V3 // d711214a ++ VFCVTFXUV V2, V0, V3 // d7112148 ++ VFCVTFXV V2, V3 // d791214a ++ VFCVTFXV V2, V0, V3 // d7912148 ++ ++ // 31.13.18: Widening Floating-Point/Integer Type-Convert Instructions ++ VFWCVTXUFV V2, V3 // d711244a ++ VFWCVTXUFV V2, V0, V3 // d7112448 ++ VFWCVTXFV V2, V3 // d791244a ++ VFWCVTXFV V2, V0, V3 // d7912448 ++ VFWCVTRTZXUFV V2, V3 // d711274a ++ VFWCVTRTZXUFV V2, V0, V3 // d7112748 ++ VFWCVTRTZXFV V2, V3 // d791274a ++ VFWCVTRTZXFV V2, V0, V3 // d7912748 ++ VFWCVTFXUV V2, V3 // d711254a ++ VFWCVTFXUV V2, V0, V3 // d7112548 ++ VFWCVTFXV V2, V3 // d791254a ++ VFWCVTFXV V2, V0, V3 // d7912548 ++ VFWCVTFFV V2, V3 // d711264a ++ VFWCVTFFV V2, V0, V3 // d7112648 ++ ++ // 31.13.19: Narrowing Floating-Point/Integer Type-Convert Instructions ++ VFNCVTXUFW V2, V3 // d711284a ++ VFNCVTXUFW V2, V0, V3 // d7112848 ++ VFNCVTXFW V2, V3 // d791284a ++ VFNCVTXFW V2, V0, V3 // d7912848 ++ VFNCVTRTZXUFW V2, V3 // d7112b4a ++ VFNCVTRTZXUFW V2, V0, V3 // d7112b48 ++ VFNCVTRTZXFW V2, V3 // d7912b4a ++ VFNCVTRTZXFW V2, V0, V3 // d7912b48 ++ VFNCVTFXUW V2, V3 // d711294a ++ VFNCVTFXUW V2, V0, V3 // d7112948 ++ VFNCVTFXW V2, V3 // d791294a ++ VFNCVTFXW V2, V0, V3 // d7912948 ++ VFNCVTFFW V2, V3 // d7112a4a ++ VFNCVTFFW V2, V0, V3 // d7112a48 ++ VFNCVTRODFFW V2, V3 // d7912a4a ++ VFNCVTRODFFW V2, V0, V3 // d7912a48 ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 6a7c9b9444..3aeeadf848 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -50,208 +50,302 @@ TEXT errors(SB),$0 + // + // "V" Standard Extension for Vector Operations, Version 1.0 + // +- VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" +- VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" +- VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" +- VLSE8V (X10), X10, V1, V3 // ERROR "invalid vector mask register" +- VSSE8V V3, X11, V1, (X10) // ERROR "invalid vector mask register" +- VLUXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" +- VSUXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" +- VLOXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" +- VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" +- VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" +- VS1RV V3, V0, (X11) // ERROR "too many operands for 
instruction" +- VADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VADDVX X10, V2, V1, V3 // ERROR "invalid vector mask register" +- VADDVI $15, V4, V1, V2 // ERROR "invalid vector mask register" +- VSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" +- VRSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" +- VRSUBVI $15, V4, V1, V2 // ERROR "invalid vector mask register" +- VNEGV V2, V3, V4 // ERROR "invalid vector mask register" +- VWADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWADDWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWSUBWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWCVTXXV V2, V1, V3 // ERROR "invalid vector mask register" +- VWCVTUXXV V2, V1, V3 // ERROR "invalid vector mask register" +- VZEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" +- VSEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" +- VZEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" +- VSEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" +- VZEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" +- VSEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" +- VADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VADCVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VADCVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" +- VADCVIM $15, V2, V3 // ERROR "invalid vector mask register" +- VMADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMADCVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VMADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMADCVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VMADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" +- VMADCVIM $15, V2, V3 // ERROR "invalid vector mask register" +- VSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VMSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VMSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VANDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VANDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VANDVI 
$15, V2, V4, V3 // ERROR "invalid vector mask register" +- VORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VXORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VXORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VXORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VNOTV V1, V2, V3 // ERROR "invalid vector mask register" +- VSLLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSLLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSLLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRLWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRLWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRLWI $31, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRAWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRAWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNSRAWI $31, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCVTXXW V2, V4, V3 // ERROR "invalid vector mask register" +- VMSEQVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSEQVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSEQVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSNEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSNEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSNEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSLTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMSGEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMINUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMINUVX X10, V2, V4, V3 // ERROR "invalid 
vector mask register" +- VMINVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMINVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMAXUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMAXUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMAXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMAXVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMULHSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VDIVUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VDIVUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VDIVVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VDIVVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VREMUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VREMUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VREMVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VREMVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMULSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNMSACVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNMSACVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNMSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNMSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VWMACCUSVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMERGEVVM V1, V2, V3 // ERROR "invalid vector mask register" +- VMERGEVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VMERGEVXM X10, V2, V3 // ERROR "invalid vector mask register" +- VMERGEVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VMERGEVIM $15, V2, V3 // ERROR "invalid vector mask register" +- VMERGEVIM $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VMVVV V1, V2, V3 // ERROR "too many operands for instruction" +- VMVVX X10, V2, V3 // ERROR "too many operands for instruction" +- VMVVI $15, V2, V3 // ERROR "too many operands for instruction" +- VSADDUVV V1, V2, V4, V3 // ERROR "invalid vector 
mask register" +- VSADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSADDUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSADDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VAADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VAADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VAADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VAADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VASUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VASUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VASUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VASUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VSSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPUWI $16, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPWV V1, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPWX X10, V2, V4, V3 // ERROR "invalid vector mask register" +- VNCLIPWI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSETIVLI X10, E32, M2, TA, MA, X12 // ERROR "expected immediate value" ++ VLE8V (X10), V1, V3 // ERROR "invalid vector mask register" ++ VSE8V V3, V1, (X10) // ERROR "invalid vector mask register" ++ VLSE8V (X10), X10, V1, V3 // ERROR "invalid vector mask register" ++ VSSE8V V3, X11, V1, (X10) // ERROR "invalid vector mask register" ++ VLUXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" ++ VSUXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" ++ VLOXEI8V (X10), V2, V1, V3 // ERROR "invalid vector mask register" ++ VSOXEI8V V3, V2, V1, (X10) // ERROR "invalid vector mask register" ++ VL1RV (X10), V0, V3 // ERROR "too many operands for instruction" ++ VS1RV V3, V0, (X11) // ERROR "too many operands for instruction" ++ VADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADDVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VADDVI $15, V4, V1, V2 // ERROR "invalid vector mask register" ++ VSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VRSUBVX X10, V2, V1, V3 // ERROR "invalid vector mask register" ++ VRSUBVI $15, V4, V1, V2 // ERROR "invalid vector mask register" ++ VNEGV V2, V3, V4 // ERROR "invalid vector mask register" ++ VWADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUVV V1, V2, V4, V3 // ERROR "invalid 
vector mask register" ++ VWSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWADDWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWSUBWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWCVTXXV V2, V1, V3 // ERROR "invalid vector mask register" ++ VWCVTUXXV V2, V1, V3 // ERROR "invalid vector mask register" ++ VZEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF2 V2, V3, V4 // ERROR "invalid vector mask register" ++ VZEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF4 V2, V3, V4 // ERROR "invalid vector mask register" ++ VZEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" ++ VSEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" ++ VADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" ++ VMADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VANDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VANDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VANDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VXORVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNOTV V1, V2, V3 // ERROR "invalid vector mask register" ++ VSLLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVV 
V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRLWI $31, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNSRAWI $31, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCVTXXW V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSEQVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSNEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSLTUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSGEUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMINVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMAXVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHUVV V1, V2, V4, V3 // ERROR 
"invalid vector mask register" ++ VMULHUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMULHSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VDIVVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREMVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMULSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSACVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSACVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNMSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCSUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCSUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWMACCUSVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMERGEVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VMERGEVIM $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMVVV V1, V2, V3 // ERROR "too many operands for instruction" ++ VMVVX X10, V2, V3 // ERROR "too many operands for instruction" ++ VMVVI $15, V2, V3 // ERROR "too many operands for instruction" ++ VSADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDUVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSADDVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDUVX X10, V2, V4, V3 // 
ERROR "invalid vector mask register" ++ VAADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VAADDVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBUVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBUVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VASUBVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSMULVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRLVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSSRAVI $15, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPUWI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VNCLIPWI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFADDVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSUBVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFRSUBVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWADDVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWADDVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWSUBVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWSUBVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWADDWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWADDWF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWSUBWV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWSUBWF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMULVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFDIVVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFDIVVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFRDIVVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWMULVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWMULVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMACCVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFMACCVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFNMACCVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFNMACCVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFMSACVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFMSACVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFNMSACVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFNMSACVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFMADDVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFMADDVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFNMADDVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFNMADDVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFMSUBVV V2, 
V1, V4, V3 // ERROR "invalid vector mask register" ++ VFMSUBVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFNMSUBVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFNMSUBVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFWMACCVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFWMACCVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFWNMACCVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFWNMACCVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFWMSACVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFWMSACVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFWNMSACVV V2, V1, V4, V3 // ERROR "invalid vector mask register" ++ VFWNMSACVF V2, F10, V4, V3 // ERROR "invalid vector mask register" ++ VFSQRTV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFRSQRT7V V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREC7V V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMINVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMINVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMAXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMAXVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJNVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJNVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJXVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSGNJXVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNEGV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFABSV V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFEQVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFEQVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFNEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFNEVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFLTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFLTVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFLEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFLEVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFGTVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFGEVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFGTVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VMFGEVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFMERGEVFM X10, V2, V3 // ERROR "invalid vector mask register" ++ VFMERGEVFM F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTXUFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTXFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTRTZXUFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTRTZXFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTFXUV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFCVTFXV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTXUFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTXFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTRTZXUFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTRTZXFV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTFXUV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTFXV V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWCVTFFV V2, V4, V3 // ERROR "invalid 
vector mask register" ++ VFNCVTXUFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTXFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTRTZXUFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTRTZXFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTFXUW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTFXW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTFFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VFNCVTRODFFW V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index c6f71e64fb..2c509a1e91 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -15,259 +15,354 @@ TEXT validation(SB),$0 + // + // "V" Standard Extension for Vector Operations, Version 1.0 + // +- VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" +- VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" +- VSETVL X10, X11 // ERROR "expected integer register in rs1 position" +- VLE8V (X10), X10 // ERROR "expected vector register in vd position" +- VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" +- VSE8V X10, (X10) // ERROR "expected vector register in vs1 position" +- VSE8V V3, (V1) // ERROR "expected integer register in rd position" +- VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" +- VLSE8V (X10), X10, X11 // ERROR "expected vector register in vd position" +- VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" +- VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" +- VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" +- VSSE8V X10, X11, (X10) // ERROR "expected vector register in vd position" +- VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" +- VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" +- VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" +- VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" +- VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" +- VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" +- VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" +- VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" +- VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" +- VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" +- VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" +- VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" +- VL1RV (X10), X10 // ERROR "expected vector register in vd position" +- VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" +- VS1RV X11, (X11) // ERROR "expected vector register in vs1 position" +- VS1RV V3, (V1) // ERROR "expected integer register in rd position" +- VADDVV V1, X10, V3 // ERROR "expected vector register in vs2 position" +- VADDVX V1, V2, V3 // ERROR "expected integer register in rs1 
position" +- VADDVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" +- VADDVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" +- VSUBVV V1, X10, V3 // ERROR "expected vector register in vs2 position" +- VSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VRSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VRSUBVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" +- VRSUBVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" +- VNEGV X10, V3 // ERROR "expected vector register in vs2 position" +- VNEGV V2 // ERROR "expected vector register in vd position" +- VWADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWADDUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWADDUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWSUBUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWSUBUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWADDWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWADDWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWSUBWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWSUBWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWCVTXXV X10, V3 // ERROR "expected vector register in vs2 position" +- VWCVTUXXV X10, V3 // ERROR "expected vector register in vs2 position" +- VZEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VZEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" +- VSEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VSEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" +- VZEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VZEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" +- VSEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VSEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" +- VZEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VZEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" +- VSEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" +- VSEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" +- VADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VMADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VMADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in 
range [-16, 15]" +- VMADCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMADCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" +- VMADCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMADCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" +- VMADCVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMADCVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMADCVI $15, V2, V0, V3 // ERROR "expected no register in rs3" +- VSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VMSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VMSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VMSBCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSBCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" +- VMSBCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSBCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" +- VANDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VANDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VANDVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VANDVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VXORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VXORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VXORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VXORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VNOTV V3 // ERROR "expected vector register in vd position" +- VNOTV X10, V3 // ERROR "expected vector register in vs2 position" +- VSLLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSLLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSLLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VSLLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSRLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VSRLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSRAVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VSRAVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VNSRLWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNSRLWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNSRLWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VNSRLWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VNSRAWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNSRAWX V1, V2, V3 // ERROR "expected integer register 
in rs1 position" +- VNSRAWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" +- VNSRAWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" +- VNCVTXXW X10, V3 // ERROR "expected vector register in vs2 position" +- VMSEQVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSEQVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSEQVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSEQVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSNEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSNEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSNEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSNEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSLTUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSLTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSLTVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSLTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSLEUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSLEUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSLEUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSLEUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSLEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMSLEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSLEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSLEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGTUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" +- VMSGTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSGTUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSGTUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGTVV X10, V2, V3 // ERROR "expected vector register in vs2 position" +- VMSGTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMSGTVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSGTVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGEVV X10, V2, V3 // ERROR "expected vector register in vs2 position" +- VMSGEUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" +- VMSLTVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSLTVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSLTUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSLTUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGEVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSGEVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMSGEUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMSGEUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMINUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMINUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMINVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMINVX V1, V2, V3 // ERROR "expected integer register in rs1 position" 
+- VMAXUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMAXUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMAXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMAXVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMULHVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMULHVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMULHUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMULHUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMULHSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMULHSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VDIVUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VDIVUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VDIVVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VDIVVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VREMUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VREMUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VREMVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VREMVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMULUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMULUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMULSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMULSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNMSACVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNMSACVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VMADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNMSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNMSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMACCUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VWMACCSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCUSVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMERGEVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +- VMERGEVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" +- VMERGEVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMERGEVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VMVVV X10, V3 // ERROR "expected vector register in vs1 position" +- VMVVX V1, V2 // ERROR "expected integer 
register in rs1 position" +- VMVVI $16, V2 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VMVVI $-17, V2 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VSADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSADDUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" +- VSADDUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" +- VSSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VAADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VAADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VAADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VAADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VASUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VASUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VASUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VASUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSSRLVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" +- VSSRLVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" +- VSSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VSSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VSSRAVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" +- VSSRAVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" +- VNCLIPUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNCLIPUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNCLIPUWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" +- VNCLIPUWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" +- VNCLIPWV X10, V2, V3 // ERROR "expected vector register in vs1 position" +- VNCLIPWX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNCLIPWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" +- VNCLIPWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VSETVLI $32, E16, M1, TU, MU, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVLI $-1, E32, M2, TA, MA, X12 // ERROR "must be in range [0, 31] (5 bits)" ++ VSETVL X10, X11 // ERROR "expected integer register in rs1 position" ++ VLE8V (X10), X10 // ERROR "expected vector register in vd position" ++ VLE8V (V1), V3 // ERROR "expected integer register in rs1 position" ++ VSE8V X10, (X10) // ERROR "expected vector register in vs1 position" ++ VSE8V V3, (V1) // ERROR "expected integer register in rd position" ++ VLSE8V (X10), V3 // ERROR "expected integer register in rs2 position" ++ VLSE8V (X10), X10, X11 // ERROR "expected vector register in vd position" ++ VLSE8V (V1), X10, V3 // ERROR "expected integer register in rs1 position" ++ VLSE8V (X10), V1, V0, V3 // ERROR "expected integer register in rs2 position" ++ VSSE8V V3, (X10) // ERROR "expected integer register in rs2 position" ++ 
VSSE8V X10, X11, (X10) // ERROR "expected vector register in vd position" ++ VSSE8V V3, X11, (V1) // ERROR "expected integer register in rs1 position" ++ VSSE8V V3, V1, V0, (X10) // ERROR "expected integer register in rs2 position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" ++ VLUXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" ++ VLUXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLUXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" ++ VSUXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" ++ VSUXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSUXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" ++ VLOXEI8V (X10), V2, X11 // ERROR "expected vector register in vd position" ++ VLOXEI8V (V1), V2, V3 // ERROR "expected integer register in rs1 position" ++ VLOXEI8V (X10), X11, V0, V3 // ERROR "expected vector register in vs2 position" ++ VSOXEI8V X10, V2, (X10) // ERROR "expected vector register in vd position" ++ VSOXEI8V V3, V2, (V1) // ERROR "expected integer register in rs1 position" ++ VSOXEI8V V3, X11, V0, (X10) // ERROR "expected vector register in vs2 position" ++ VL1RV (X10), X10 // ERROR "expected vector register in vd position" ++ VL1RV (V1), V3 // ERROR "expected integer register in rs1 position" ++ VS1RV X11, (X11) // ERROR "expected vector register in vs1 position" ++ VS1RV V3, (V1) // ERROR "expected integer register in rd position" ++ VADDVV V1, X10, V3 // ERROR "expected vector register in vs2 position" ++ VADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VADDVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" ++ VADDVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" ++ VSUBVV V1, X10, V3 // ERROR "expected vector register in vs2 position" ++ VSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRSUBVI $16, V4, V2 // ERROR "signed immediate 16 must be in range [-16, 15] (5 bits)" ++ VRSUBVI $-17, V4, V2 // ERROR "signed immediate -17 must be in range [-16, 15] (5 bits)" ++ VNEGV X10, V3 // ERROR "expected vector register in vs2 position" ++ VNEGV V2 // ERROR "expected vector register in vd position" ++ VWADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWSUBUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWADDWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWADDWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWSUBWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ 
VWSUBWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWCVTXXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VWCVTUXXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF2 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF2 X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF4 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF4 X10, V3 // ERROR "expected vector register in vs2 position" ++ VZEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VZEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" ++ VSEXTVF8 V2, V0, V3, V4 // ERROR "expected no register in rs3" ++ VSEXTVF8 X10, V3 // ERROR "expected vector register in vs2 position" ++ VADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMADCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMADCVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMADCVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMADCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMADCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMADCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMADCVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMADCVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMADCVI $15, V2, V0, V3 // ERROR "expected no register in rs3" ++ VSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMSBCVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSBCVV V1, V2, V0, V3 // ERROR "expected no register in rs3" ++ VMSBCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSBCVX X10, V2, V0, V3 // ERROR "expected no register in rs3" ++ VANDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VANDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VANDVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VANDVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VORVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VXORVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VXORVX V1, V2, V3 // ERROR "expected 
integer register in rs1 position" ++ VXORVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VXORVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VNOTV V3 // ERROR "expected vector register in vd position" ++ VNOTV X10, V3 // ERROR "expected vector register in vs2 position" ++ VSLLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSLLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSLLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSLLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSRLVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSRLVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSRAVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSRAVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNSRLWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNSRLWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNSRLWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VNSRLWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNSRAWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNSRAWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNSRAWI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VNSRAWI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VNCVTXXW X10, V3 // ERROR "expected vector register in vs2 position" ++ VMSEQVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSEQVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSEQVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSEQVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSNEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSNEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSNEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSNEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLTUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLTVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLEUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLEUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMSLEVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSLEVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLEVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGTUVV X10, V2, V3 // ERROR "expected 
vector register in vs2 position" ++ VMSGTUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSGTUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGTUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGTVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGTVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMSGTVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGTVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSGEUVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMSLTVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLTVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSLTUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSLTUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGEVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMSGEUVI $17, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMSGEUVI $-16, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMINUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMINUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMINVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMINVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMAXUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMAXUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMAXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMAXVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMULHSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMULHSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VDIVUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VDIVUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VDIVVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VDIVVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VREMUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREMUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VREMVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREMVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMULUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMULSUVV X10, V2, V3 // ERROR "expected vector 
register in vs1 position" ++ VWMULSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNMSACVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSACVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNMSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VWMACCUSVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VMERGEVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" ++ VMERGEVXM V1, V2, V0, V3 // ERROR "expected integer register in rs1 position" ++ VMERGEVIM $16, V2, V0, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMERGEVIM $-17, V2, V0, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VMVVV X10, V3 // ERROR "expected vector register in vs1 position" ++ VMVVX V1, V2 // ERROR "expected integer register in rs1 position" ++ VMVVI $16, V2 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VMVVI $-17, V2 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VSADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSADDUVI $16, V2, V3 // ERROR "signed immediate 16 must be in range [-16, 15]" ++ VSADDUVI $-17, V2, V3 // ERROR "signed immediate -17 must be in range [-16, 15]" ++ VSSUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VAADDUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VAADDUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VAADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VAADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VASUBUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VASUBUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VASUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VASUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSMULVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRLVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSRLVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSSRLVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VSSRLVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VSSRAVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VSSRAVX V1, V2, V3 // ERROR "expected integer register in rs1 
position" ++ VSSRAVI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VSSRAVI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VNCLIPUWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNCLIPUWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNCLIPUWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VNCLIPUWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VNCLIPWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNCLIPWX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VNCLIPWI $32, V2, V3 // ERROR "signed immediate 32 must be in range [0, 31]" ++ VNCLIPWI $-1, V2, V3 // ERROR "signed immediate -1 must be in range [0, 31]" ++ VFADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFADDVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFSUBVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFRSUBVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWADDVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWSUBVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWADDWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWADDWF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWSUBWV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWSUBWF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFMULVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFDIVVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFDIVVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFRDIVVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFWMULVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWMULVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFMACCVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFNMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFNMACCVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFMSACVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFNMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFNMSACVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFMADDVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFMADDVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFNMADDVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFNMADDVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFMSUBVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFMSUBVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFNMSUBVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFNMSUBVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFWMACCVV V2, X10, V3 // ERROR 
"expected vector register in vs1 position" ++ VFWMACCVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFWNMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFWNMACCVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFWMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFWMSACVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFWNMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" ++ VFWNMSACVF V2, X10, V3 // ERROR "expected float register in rs1 position" ++ VFSQRTV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFRSQRT7V X10, V3 // ERROR "expected vector register in vs2 position" ++ VFREC7V X10, V3 // ERROR "expected vector register in vs2 position" ++ VFMINVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFMINVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFMAXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFMAXVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFSGNJVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFSGNJVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFSGNJNVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFSGNJNVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFSGNJXVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFSGNJXVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VFNEGV V2, X10 // ERROR "expected vector register in vd position" ++ VFABSV V2, X10 // ERROR "expected vector register in vd position" ++ VMFEQVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMFEQVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFNEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMFNEVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFLTVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMFLTVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFLEVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMFLEVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFGTVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFGEVF X10, V2, V3 // ERROR "expected float register in rs1 position" ++ VMFGTVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VMFGEVV X10, V2, V3 // ERROR "expected vector register in vs2 position" ++ VFCLASSV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFMERGEVFM X10, V2, V0, V3 // ERROR "expected float register in rs1 position" ++ VFMVVF X10, V3 // ERROR "expected float register in rs1 position" ++ VFCVTXUFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTXFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTRTZXUFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTRTZXFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTFXUV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFCVTFXV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFWCVTXUFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFWCVTXFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFWCVTRTZXUFV X10, V3 // ERROR "expected vector register in vs2 position" ++ VFWCVTRTZXFV X10, V3 // ERROR "expected vector register in vs2 position" ++ 
VFWCVTFXUV X10, V3 // ERROR "expected vector register in vs2 position"
++ VFWCVTFXV X10, V3 // ERROR "expected vector register in vs2 position"
++ VFWCVTFFV X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTXUFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTXFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTRTZXUFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTRTZXFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTFXUW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTFXW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTFFW X10, V3 // ERROR "expected vector register in vs2 position"
++ VFNCVTRODFFW X10, V3 // ERROR "expected vector register in vs2 position"
+
+ RET
+diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go
+index a65dfceea9..bf1fdb8b88 100644
+--- a/src/cmd/internal/obj/riscv/anames.go
++++ b/src/cmd/internal/obj/riscv/anames.go
+@@ -650,6 +650,10 @@ var Anames = []string{
+ "RDTIME",
+ "SEQZ",
+ "SNEZ",
++ "VFABSV",
++ "VFNEGV",
++ "VMFGEVV",
++ "VMFGTVV",
+ "VL1RV",
+ "VL2RV",
+ "VL4RV",
+diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go
+index 577b06f0ec..b641eadde7 100644
+--- a/src/cmd/internal/obj/riscv/cpu.go
++++ b/src/cmd/internal/obj/riscv/cpu.go
+@@ -1168,6 +1168,10 @@ const (
+ ARDTIME
+ ASEQZ
+ ASNEZ
++ AVFABSV
++ AVFNEGV
++ AVMFGEVV
++ AVMFGTVV
+ AVL1RV
+ AVL2RV
+ AVL4RV
+diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go
+index e7870000cf..6066c840ca 100644
+--- a/src/cmd/internal/obj/riscv/obj.go
++++ b/src/cmd/internal/obj/riscv/obj.go
+@@ -1189,17 +1189,24 @@ func validateRFI(ctxt *obj.Link, ins *instruction) {
+ wantNoneReg(ctxt, ins, "rs3", ins.rs3)
+ }
+
+-func validateRIF(ctxt *obj.Link, ins *instruction) {
++func validateRFF(ctxt *obj.Link, ins *instruction) {
+ wantFloatReg(ctxt, ins, "rd", ins.rd)
+ wantNoneReg(ctxt, ins, "rs1", ins.rs1)
+- wantIntReg(ctxt, ins, "rs2", ins.rs2)
++ wantFloatReg(ctxt, ins, "rs2", ins.rs2)
+ wantNoneReg(ctxt, ins, "rs3", ins.rs3)
+ }
+
+-func validateRFF(ctxt *obj.Link, ins *instruction) {
++func validateRIF(ctxt *obj.Link, ins *instruction) {
+ wantFloatReg(ctxt, ins, "rd", ins.rd)
+ wantNoneReg(ctxt, ins, "rs1", ins.rs1)
+- wantFloatReg(ctxt, ins, "rs2", ins.rs2)
++ wantIntReg(ctxt, ins, "rs2", ins.rs2)
++ wantNoneReg(ctxt, ins, "rs3", ins.rs3)
++}
++
++func validateRVFV(ctxt *obj.Link, ins *instruction) {
++ wantVectorReg(ctxt, ins, "vd", ins.rd)
++ wantFloatReg(ctxt, ins, "rs1", ins.rs1)
++ wantVectorReg(ctxt, ins, "vs2", ins.rs2)
+ wantNoneReg(ctxt, ins, "rs3", ins.rs3)
+ }
+
+@@ -1440,12 +1447,20 @@ func encodeRFI(ins *instruction) uint32 {
+ return encodeR(ins.as, regF(ins.rs2), 0, regI(ins.rd), ins.funct3, ins.funct7)
+ }
+
++func encodeRFF(ins *instruction) uint32 {
++ return encodeR(ins.as, regF(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7)
++}
++
+ func encodeRIF(ins *instruction) uint32 {
+ return encodeR(ins.as, regI(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7)
+ }
+
+-func encodeRFF(ins *instruction) uint32 {
+- return encodeR(ins.as, regF(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7)
++func encodeRVFV(ins *instruction) uint32 {
++ return encodeR(ins.as, regF(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
++}
++
++func encodeRVIV(ins *instruction) uint32 {
++ return encodeR(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
+ }
+
+ func encodeRVV(ins *instruction) uint32 {
+@@ -1460,10 +1475,6 @@ func encodeRVVu(ins *instruction) uint32 {
+ return encodeR(ins.as, immU(ins.as, ins.imm, 5), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
+ }
+
+-func encodeRVIV(ins *instruction) uint32 {
+- return encodeR(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
+-}
+-
+ func encodeRVVV(ins *instruction) uint32 {
+ return encodeR(ins.as, regV(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7)
+ }
+@@ -1751,10 +1762,11 @@ var (
+ rFIEncoding = encoding{encode: encodeRFI, validate: validateRFI, length: 4}
+ rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4}
+ rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4}
++ rVFVEncoding = encoding{encode: encodeRVFV, validate: validateRVFV, length: 4}
++ rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4}
+ rVVEncoding = encoding{encode: encodeRVV, validate: validateRVV, length: 4}
+ rVViEncoding = encoding{encode: encodeRVVi, validate: validateRVVi, length: 4}
+ rVVuEncoding = encoding{encode: encodeRVVu, validate: validateRVVu, length: 4}
+- rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4}
+ rVVVEncoding = encoding{encode: encodeRVVV, validate: validateRVVV, length: 4}
+
+ iIIEncoding = encoding{encode: encodeIII, validate: validateIII, length: 4}
+@@ -2328,6 +2340,133 @@ var instructions = [ALAST & obj.AMask]instructionData{
+ AVNCLIPWX & obj.AMask: {enc: rVIVEncoding},
+ AVNCLIPWI & obj.AMask: {enc: rVVuEncoding},
+
++ // 31.13.2: Vector Single-Width Floating-Point Add/Subtract Instructions
++ AVFADDVV & obj.AMask: {enc: rVVVEncoding},
++ AVFADDVF & obj.AMask: {enc: rVFVEncoding},
++ AVFSUBVV & obj.AMask: {enc: rVVVEncoding},
++ AVFSUBVF & obj.AMask: {enc: rVFVEncoding},
++ AVFRSUBVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.3: Vector Widening Floating-Point Add/Subtract Instructions
++ AVFWADDVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWADDVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWSUBVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWSUBVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWADDWV & obj.AMask: {enc: rVVVEncoding},
++ AVFWADDWF & obj.AMask: {enc: rVFVEncoding},
++ AVFWSUBWV & obj.AMask: {enc: rVVVEncoding},
++ AVFWSUBWF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.4: Vector Single-Width Floating-Point Multiply/Divide Instructions
++ AVFMULVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMULVF & obj.AMask: {enc: rVFVEncoding},
++ AVFDIVVV & obj.AMask: {enc: rVVVEncoding},
++ AVFDIVVF & obj.AMask: {enc: rVFVEncoding},
++ AVFRDIVVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.5: Vector Widening Floating-Point Multiply
++ AVFWMULVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWMULVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.6: Vector Single-Width Floating-Point Fused Multiply-Add Instructions
++ AVFMACCVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMACCVF & obj.AMask: {enc: rVFVEncoding},
++ AVFNMACCVV & obj.AMask: {enc: rVVVEncoding},
++ AVFNMACCVF & obj.AMask: {enc: rVFVEncoding},
++ AVFMSACVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMSACVF & obj.AMask: {enc: rVFVEncoding},
++ AVFNMSACVV & obj.AMask: {enc: rVVVEncoding},
++ AVFNMSACVF & obj.AMask: {enc: rVFVEncoding},
++ AVFMADDVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMADDVF & obj.AMask: {enc: rVFVEncoding},
++ AVFNMADDVV & obj.AMask: {enc: rVVVEncoding},
++ AVFNMADDVF & obj.AMask: {enc: rVFVEncoding},
++ AVFMSUBVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMSUBVF & obj.AMask: {enc: rVFVEncoding},
++ AVFNMSUBVV & obj.AMask: {enc: rVVVEncoding},
++ AVFNMSUBVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.7: Vector Widening Floating-Point Fused Multiply-Add Instructions
++ AVFWMACCVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWMACCVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWNMACCVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWNMACCVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWMSACVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWMSACVF & obj.AMask: {enc: rVFVEncoding},
++ AVFWNMSACVV & obj.AMask: {enc: rVVVEncoding},
++ AVFWNMSACVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.8: Vector Floating-Point Square-Root Instruction
++ AVFSQRTV & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.9: Vector Floating-Point Reciprocal Square-Root Estimate Instruction
++ AVFRSQRT7V & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.10: Vector Floating-Point Reciprocal Estimate Instruction
++ AVFREC7V & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.11: Vector Floating-Point MIN/MAX Instructions
++ AVFMINVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMINVF & obj.AMask: {enc: rVFVEncoding},
++ AVFMAXVV & obj.AMask: {enc: rVVVEncoding},
++ AVFMAXVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.12: Vector Floating-Point Sign-Injection Instructions
++ AVFSGNJVV & obj.AMask: {enc: rVVVEncoding},
++ AVFSGNJVF & obj.AMask: {enc: rVFVEncoding},
++ AVFSGNJNVV & obj.AMask: {enc: rVVVEncoding},
++ AVFSGNJNVF & obj.AMask: {enc: rVFVEncoding},
++ AVFSGNJXVV & obj.AMask: {enc: rVVVEncoding},
++ AVFSGNJXVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.13: Vector Floating-Point Compare Instructions
++ AVMFEQVV & obj.AMask: {enc: rVVVEncoding},
++ AVMFEQVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFNEVV & obj.AMask: {enc: rVVVEncoding},
++ AVMFNEVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFLTVV & obj.AMask: {enc: rVVVEncoding},
++ AVMFLTVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFLEVV & obj.AMask: {enc: rVVVEncoding},
++ AVMFLEVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFGTVF & obj.AMask: {enc: rVFVEncoding},
++ AVMFGEVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.14: Vector Floating-Point Classify Instruction
++ AVFCLASSV & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.15: Vector Floating-Point Merge Instruction
++ AVFMERGEVFM & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.16: Vector Floating-Point Move Instruction
++ AVFMVVF & obj.AMask: {enc: rVFVEncoding},
++
++ // 31.13.17: Single-Width Floating-Point/Integer Type-Convert Instructions
++ AVFCVTXUFV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTXFV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTRTZXUFV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTRTZXFV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTFXUV & obj.AMask: {enc: rVVEncoding},
++ AVFCVTFXV & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.18: Widening Floating-Point/Integer Type-Convert Instructions
++ AVFWCVTXUFV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTXFV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTRTZXUFV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTRTZXFV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTFXUV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTFXV & obj.AMask: {enc: rVVEncoding},
++ AVFWCVTFFV & obj.AMask: {enc: rVVEncoding},
++
++ // 31.13.19: Narrowing Floating-Point/Integer Type-Convert Instructions
++ AVFNCVTXUFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTXFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTRTZXUFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTRTZXFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTFXUW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTFXW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTFFW & obj.AMask: {enc: rVVEncoding},
++ AVFNCVTRODFFW & obj.AMask: {enc: rVVEncoding},
++
+ //
+ // Privileged ISA
+ //
+@@ -3315,7 +3454,13 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ AVSADDUVV, AVSADDUVX, AVSADDUVI, AVSADDVV, AVSADDVX, AVSADDVI, AVSSUBUVV, AVSSUBUVX, AVSSUBVV, AVSSUBVX,
+ AVAADDUVV, AVAADDUVX, AVAADDVV, AVAADDVX, AVASUBUVV, AVASUBUVX, AVASUBVV, AVASUBVX,
+ AVSMULVV, AVSMULVX, AVSSRLVV, AVSSRLVX, AVSSRLVI, AVSSRAVV, AVSSRAVX, AVSSRAVI,
+- AVNCLIPUWV, AVNCLIPUWX, AVNCLIPUWI, AVNCLIPWV, AVNCLIPWX, AVNCLIPWI:
++ AVNCLIPUWV, AVNCLIPUWX, AVNCLIPUWI, AVNCLIPWV, AVNCLIPWX, AVNCLIPWI,
++ AVFADDVV, AVFADDVF, AVFSUBVV, AVFSUBVF, AVFRSUBVF,
++ AVFWADDVV, AVFWADDVF, AVFWSUBVV, AVFWSUBVF, AVFWADDWV, AVFWADDWF, AVFWSUBWV, AVFWSUBWF,
++ AVFMULVV, AVFMULVF, AVFDIVVV, AVFDIVVF, AVFRDIVVF, AVFWMULVV, AVFWMULVF,
++ AVFMINVV, AVFMINVF, AVFMAXVV, AVFMAXVF,
++ AVFSGNJVV, AVFSGNJVF, AVFSGNJNVV, AVFSGNJNVF, AVFSGNJXVV, AVFSGNJXVF,
++ AVMFEQVV, AVMFEQVF, AVMFNEVV, AVMFNEVF, AVMFLTVV, AVMFLTVF, AVMFLEVV, AVMFLEVF, AVMFGTVF, AVMFGEVF:
+ // Set mask bit
+ switch {
+ case ins.rs3 == obj.REG_NONE:
+@@ -3325,6 +3470,17 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ }
+ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg), obj.REG_NONE
+
++ case AVFMACCVV, AVFMACCVF, AVFNMACCVV, AVFNMACCVF, AVFMSACVV, AVFMSACVF, AVFNMSACVV, AVFNMSACVF,
++ AVFMADDVV, AVFMADDVF, AVFNMADDVV, AVFNMADDVF, AVFMSUBVV, AVFMSUBVF, AVFNMSUBVV, AVFNMSUBVF,
++ AVFWMACCVV, AVFWMACCVF, AVFWNMACCVV, AVFWNMACCVF, AVFWMSACVV, AVFWMSACVF, AVFWNMSACVV, AVFWNMSACVF:
++ switch {
++ case ins.rs3 == obj.REG_NONE:
++ ins.funct7 |= 1 // unmasked
++ case ins.rs3 != REG_V0:
++ p.Ctxt.Diag("%v: invalid vector mask register", p)
++ }
++ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), obj.REG_NONE
++
+ case AVADDVI, AVRSUBVI, AVANDVI, AVORVI, AVXORVI, AVMSEQVI, AVMSNEVI, AVMSLEUVI, AVMSLEVI, AVMSGTUVI, AVMSGTVI,
+ AVSLLVI, AVSRLVI, AVSRAVI, AVNSRLWI, AVNSRAWI:
+ // Set mask bit
+@@ -3336,7 +3492,10 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ }
+ ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), obj.REG_NONE, uint32(p.Reg), obj.REG_NONE
+
+- case AVZEXTVF2, AVSEXTVF2, AVZEXTVF4, AVSEXTVF4, AVZEXTVF8, AVSEXTVF8:
++ case AVZEXTVF2, AVSEXTVF2, AVZEXTVF4, AVSEXTVF4, AVZEXTVF8, AVSEXTVF8, AVFSQRTV, AVFRSQRT7V, AVFREC7V, AVFCLASSV,
++ AVFCVTXUFV, AVFCVTXFV, AVFCVTRTZXUFV, AVFCVTRTZXFV, AVFCVTFXUV, AVFCVTFXV,
++ AVFWCVTXUFV, AVFWCVTXFV, AVFWCVTRTZXUFV, AVFWCVTRTZXFV, AVFWCVTFXUV, AVFWCVTFXV, AVFWCVTFFV,
++ AVFNCVTXUFW, AVFNCVTXFW, AVFNCVTRTZXUFW, AVFNCVTRTZXFW, AVFNCVTFXUW, AVFNCVTFXW, AVFNCVTFFW, AVFNCVTRODFFW:
+ // Set mask bit
+ switch {
+ case ins.rs1 == obj.REG_NONE:
+@@ -3358,8 +3517,12 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ }
+ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), obj.REG_NONE, REG_V0
+
++ case AVFMVVF:
++ ins.funct7 |= 1 // unmasked
++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), REG_V0
++
+ case AVADCVVM, AVADCVXM, AVMADCVVM, AVMADCVXM, AVSBCVVM, AVSBCVXM, AVMSBCVVM, AVMSBCVXM, AVADCVIM, AVMADCVIM,
+- AVMERGEVVM, AVMERGEVXM, AVMERGEVIM:
++ AVMERGEVVM, AVMERGEVXM, AVMERGEVIM, AVFMERGEVFM:
+ if ins.rs3 != REG_V0 {
+ p.Ctxt.Diag("%v: invalid vector mask register", p)
+ }
+@@ -3399,7 +3562,7 @@ func instructionsForProg(p *obj.Prog) []*instruction {
+ ins.as = AVXORVI
+ ins.rd, ins.rs1, ins.rs2, ins.imm = uint32(p.To.Reg), obj.REG_NONE, uint32(p.From.Reg), -1
+ +- case AVMSGTVV, AVMSGTUVV, AVMSGEVV, AVMSGEUVV: ++ case AVMSGTVV, AVMSGTUVV, AVMSGEVV, AVMSGEUVV, AVMFGTVV, AVMFGEVV: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +@@ -3416,6 +3579,10 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AVMSLEVV + case AVMSGEUVV: + ins.as = AVMSLEUVV ++ case AVMFGTVV: ++ ins.as = AVMFLTVV ++ case AVMFGEVV: ++ ins.as = AVMFLEVV + } + ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), obj.REG_NONE + +@@ -3438,6 +3605,22 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AVMSGTUVI + } + ins.rd, ins.rs1, ins.rs2, ins.rs3, ins.imm = uint32(p.To.Reg), obj.REG_NONE, uint32(p.Reg), obj.REG_NONE, ins.imm-1 ++ ++ case AVFABSV, AVFNEGV: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ switch ins.as { ++ case AVFABSV: ++ ins.as = AVFSGNJXVV ++ case AVFNEGV: ++ ins.as = AVFSGNJNVV ++ } ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.From.Reg) + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch b/2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch new file mode 100644 index 0000000..ea3d014 --- /dev/null +++ b/2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch @@ -0,0 +1,176 @@ +From a65365b6a8f51e2333a4228b2aaf64cf02902177 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 096/119] cmd/internal/obj/riscv: add support for vector + reduction instructions + +Add support for vector reduction instructions to the RISC-V assembler, +including single-width integer reduction, widening integer reduction, +single-width floating-point reduction and widening floating-point +reduction. 
+ +Change-Id: I8f17bef11389f3a017e0430275023fc5d75936e3 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646778 +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Carlos Amedee +Reviewed-by: Dmitri Shuralyov +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 40 +++++++++++++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 15 +++++++ + .../internal/asm/testdata/riscv64validation.s | 16 ++++++++ + src/cmd/internal/obj/riscv/obj.go | 28 ++++++++++++- + 4 files changed, 98 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 7e2a070bd0..13f0279fc7 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -1161,6 +1161,46 @@ start: + VFNCVTRODFFW V2, V3 // d7912a4a + VFNCVTRODFFW V2, V0, V3 // d7912a48 + ++ // 31.14.1: Vector Single-Width Integer Reduction Instructions ++ VREDSUMVS V1, V2, V3 // d7a12002 ++ VREDSUMVS V1, V2, V0, V3 // d7a12000 ++ VREDMAXUVS V1, V2, V3 // d7a1201a ++ VREDMAXUVS V1, V2, V0, V3 // d7a12018 ++ VREDMAXVS V1, V2, V3 // d7a1201e ++ VREDMAXVS V1, V2, V0, V3 // d7a1201c ++ VREDMINUVS V1, V2, V3 // d7a12012 ++ VREDMINUVS V1, V2, V0, V3 // d7a12010 ++ VREDMINVS V1, V2, V3 // d7a12016 ++ VREDMINVS V1, V2, V0, V3 // d7a12014 ++ VREDANDVS V1, V2, V3 // d7a12006 ++ VREDANDVS V1, V2, V0, V3 // d7a12004 ++ VREDORVS V1, V2, V3 // d7a1200a ++ VREDORVS V1, V2, V0, V3 // d7a12008 ++ VREDXORVS V1, V2, V3 // d7a1200e ++ VREDXORVS V1, V2, V0, V3 // d7a1200c ++ ++ // 31.14.2: Vector Widening Integer Reduction Instructions ++ VWREDSUMUVS V1, V2, V3 // d78120c2 ++ VWREDSUMUVS V1, V2, V0, V3 // d78120c0 ++ VWREDSUMVS V1, V2, V3 // d78120c6 ++ VWREDSUMVS V1, V2, V0, V3 // d78120c4 ++ ++ // 31.14.3: Vector Single-Width Floating-Point Reduction Instructions ++ VFREDOSUMVS V1, V2, V3 // d791200e ++ VFREDOSUMVS V1, V2, V0, V3 // d791200c ++ VFREDUSUMVS V1, V2, V3 // d7912006 ++ VFREDUSUMVS V1, V2, V0, V3 // d7912004 ++ VFREDMAXVS V1, V2, V3 // d791201e ++ VFREDMAXVS V1, V2, V0, V3 // d791201c ++ VFREDMINVS V1, V2, V3 // d7912016 ++ VFREDMINVS V1, V2, V0, V3 // d7912014 ++ ++ // 31.14.4: Vector Widening Floating-Point Reduction Instructions ++ VFWREDOSUMVS V1, V2, V3 // d79120ce ++ VFWREDOSUMVS V1, V2, V0, V3 // d79120cc ++ VFWREDUSUMVS V1, V2, V3 // d79120c6 ++ VFWREDUSUMVS V1, V2, V0, V3 // d79120c4 ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 3aeeadf848..3a4bb1c761 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -347,5 +347,20 @@ TEXT errors(SB),$0 + VFNCVTFXW V2, V4, V3 // ERROR "invalid vector mask register" + VFNCVTFFW V2, V4, V3 // ERROR "invalid vector mask register" + VFNCVTRODFFW V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDMAXUVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDMAXVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDMINUVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDMINVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDANDVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDORVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VREDXORVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VWREDSUMUVS V1, V2, V4, V3 // ERROR 
"invalid vector mask register" ++ VWREDSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREDOSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREDUSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREDMAXVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFREDMINVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFWREDOSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 2c509a1e91..adb10823d7 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -364,5 +364,21 @@ TEXT validation(SB),$0 + VFNCVTFXW X10, V3 // ERROR "expected vector register in vs2 position" + VFNCVTFFW X10, V3 // ERROR "expected vector register in vs2 position" + VFNCVTRODFFW X10, V3 // ERROR "expected vector register in vs2 position" ++ VREDSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDMAXUVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDMAXVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDMINUVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDMINVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDANDVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDORVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VREDXORVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWREDSUMUVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWREDSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFREDOSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFREDUSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFREDMAXVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFREDMINVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWREDOSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VFWREDUSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" + + RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 6066c840ca..9b99416b95 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -2467,6 +2467,30 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVFNCVTFFW & obj.AMask: {enc: rVVEncoding}, + AVFNCVTRODFFW & obj.AMask: {enc: rVVEncoding}, + ++ // 31.14.1: Vector Single-Width Integer Reduction Instructions ++ AVREDSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDMAXUVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDMAXVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDMINUVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDMINVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDANDVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDORVS & obj.AMask: {enc: rVVVEncoding}, ++ AVREDXORVS & obj.AMask: {enc: rVVVEncoding}, ++ ++ // 31.14.2: Vector Widening Integer Reduction Instructions ++ AVWREDSUMUVS & obj.AMask: {enc: rVVVEncoding}, ++ AVWREDSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ ++ // 31.14.3: Vector Single-Width Floating-Point Reduction Instructions ++ AVFREDOSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ AVFREDUSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ AVFREDMAXVS & obj.AMask: {enc: rVVVEncoding}, ++ AVFREDMINVS & obj.AMask: {enc: rVVVEncoding}, ++ ++ // 31.14.4: Vector 
Widening Floating-Point Reduction Instructions ++ AVFWREDOSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ AVFWREDUSUMVS & obj.AMask: {enc: rVVVEncoding}, ++ + // + // Privileged ISA + // +@@ -3460,7 +3484,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + AVFMULVV, AVFMULVF, AVFDIVVV, AVFDIVVF, AVFRDIVVF, AVFWMULVV, AVFWMULVF, + AVFMINVV, AVFMINVF, AVFMAXVV, AVFMAXVF, + AVFSGNJVV, AVFSGNJVF, AVFSGNJNVV, AVFSGNJNVF, AVFSGNJXVV, AVFSGNJXVF, +- AVMFEQVV, AVMFEQVF, AVMFNEVV, AVMFNEVF, AVMFLTVV, AVMFLTVF, AVMFLEVV, AVMFLEVF, AVMFGTVF, AVMFGEVF: ++ AVMFEQVV, AVMFEQVF, AVMFNEVV, AVMFNEVF, AVMFLTVV, AVMFLTVF, AVMFLEVV, AVMFLEVF, AVMFGTVF, AVMFGEVF, ++ AVREDSUMVS, AVREDMAXUVS, AVREDMAXVS, AVREDMINUVS, AVREDMINVS, AVREDANDVS, AVREDORVS, AVREDXORVS, ++ AVWREDSUMUVS, AVWREDSUMVS, AVFREDOSUMVS, AVFREDUSUMVS, AVFREDMAXVS, AVFREDMINVS, AVFWREDOSUMVS, AVFWREDUSUMVS: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +-- +2.39.5 + diff --git a/2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch b/2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch new file mode 100644 index 0000000..fb39fe4 --- /dev/null +++ b/2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch @@ -0,0 +1,269 @@ +From 1027809e88579f1ad50ce1acb5c2eb6bea8ca51d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 097/119] cmd/internal/obj/riscv: add support for vector mask + instructions + +Add support for vector mask instructions to the RISC-V assembler. +These allow manipulation of vector masks and include mask register +logical instructions, population count and find-first bit set +instructions. + +Change-Id: I3ab3aa0f918338aee9b37ac5a2b2fdc407875072 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646779 +Reviewed-by: Carlos Amedee +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Junyang Shao +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 28 ++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 6 ++ + .../internal/asm/testdata/riscv64validation.s | 19 +++++ + src/cmd/internal/obj/riscv/anames.go | 8 ++- + src/cmd/internal/obj/riscv/cpu.go | 8 ++- + src/cmd/internal/obj/riscv/obj.go | 70 +++++++++++++++++++ + 6 files changed, 135 insertions(+), 4 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 13f0279fc7..ffffbf4bd4 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -1201,6 +1201,34 @@ start: + VFWREDUSUMVS V1, V2, V3 // d79120c6 + VFWREDUSUMVS V1, V2, V0, V3 // d79120c4 + ++ // 31.15: Vector Mask Instructions ++ VMANDMM V1, V2, V3 // d7a12066 ++ VMNANDMM V1, V2, V3 // d7a12076 ++ VMANDNMM V1, V2, V3 // d7a12062 ++ VMXORMM V1, V2, V3 // d7a1206e ++ VMORMM V1, V2, V3 // d7a1206a ++ VMNORMM V1, V2, V3 // d7a1207a ++ VMORNMM V1, V2, V3 // d7a12072 ++ VMXNORMM V1, V2, V3 // d7a1207e ++ VMMVM V2, V3 // d7212166 ++ VMCLRM V3 // d7a1316e ++ VMSETM V3 // d7a1317e ++ VMNOTM V2, V3 // d7212176 ++ VCPOPM V2, X10 // 57252842 ++ VCPOPM V2, V0, X10 // 57252840 ++ VFIRSTM V2, X10 // 57a52842 ++ VFIRSTM V2, V0, X10 // 57a52840 ++ VMSBFM V2, V3 // d7a12052 ++ VMSBFM V2, V0, V3 // d7a12050 ++ VMSIFM V2, V3 // d7a12152 ++ VMSIFM V2, V0, V3 // d7a12150 ++ VMSOFM V2, V3 // d7212152 ++ VMSOFM V2, V0, V3 // d7212150 ++ VIOTAM V2, V3 // d7212852 ++ VIOTAM V2, V0, V3 // d7212850 ++ VIDV V3 // d7a10852 ++ VIDV V0, V3 // d7a10850 ++ + // + // Privileged ISA + // +diff --git 
a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 3a4bb1c761..b076cf50e0 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -362,5 +362,11 @@ TEXT errors(SB),$0 + VFREDMAXVS V1, V2, V4, V3 // ERROR "invalid vector mask register" + VFREDMINVS V1, V2, V4, V3 // ERROR "invalid vector mask register" + VFWREDOSUMVS V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VCPOPM V2, V4, X10 // ERROR "invalid vector mask register" ++ VFIRSTM V2, V4, X10 // ERROR "invalid vector mask register" ++ VMSBFM V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSIFM V2, V4, V3 // ERROR "invalid vector mask register" ++ VMSOFM V2, V4, V3 // ERROR "invalid vector mask register" ++ VIOTAM V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index adb10823d7..8b0349584f 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -380,5 +380,24 @@ TEXT validation(SB),$0 + VFREDMINVS X10, V2, V3 // ERROR "expected vector register in vs1 position" + VFWREDOSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" + VFWREDUSUMVS X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMANDMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMNANDMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMANDNMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMXORMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMORMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMNORMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMORNMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMXNORMM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMMVM V3, X10 // ERROR "expected vector register in vd position" ++ VMNOTM V3, X10 // ERROR "expected vector register in vd position" ++ VCPOPM V2, V1 // ERROR "expected integer register in rd position" ++ VCPOPM X11, X10 // ERROR "expected vector register in vs2 position" ++ VFIRSTM V2, V1 // ERROR "expected integer register in rd position" ++ VFIRSTM X11, X10 // ERROR "expected vector register in vs2 position" ++ VMSBFM X10, V3 // ERROR "expected vector register in vs2 position" ++ VMSIFM X10, V3 // ERROR "expected vector register in vs2 position" ++ VMSOFM X10, V3 // ERROR "expected vector register in vs2 position" ++ VIOTAM X10, V3 // ERROR "expected vector register in vs2 position" ++ VIDV X10 // ERROR "expected vector register in vd position" + + RET +diff --git a/src/cmd/internal/obj/riscv/anames.go b/src/cmd/internal/obj/riscv/anames.go +index bf1fdb8b88..a689f2de27 100644 +--- a/src/cmd/internal/obj/riscv/anames.go ++++ b/src/cmd/internal/obj/riscv/anames.go +@@ -652,12 +652,16 @@ var Anames = []string{ + "SNEZ", + "VFABSV", + "VFNEGV", +- "VMFGEVV", +- "VMFGTVV", + "VL1RV", + "VL2RV", + "VL4RV", + "VL8RV", ++ "VMCLRM", ++ "VMFGEVV", ++ "VMFGTVV", ++ "VMMVM", ++ "VMNOTM", ++ "VMSETM", + "VMSGEUVI", + "VMSGEUVV", + "VMSGEVI", +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index b641eadde7..aaf5db9e75 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1170,12 +1170,16 @@ const ( + ASNEZ + AVFABSV + 
AVFNEGV +- AVMFGEVV +- AVMFGTVV + AVL1RV + AVL2RV + AVL4RV + AVL8RV ++ AVMCLRM ++ AVMFGEVV ++ AVMFGTVV ++ AVMMVM ++ AVMNOTM ++ AVMSETM + AVMSGEUVI + AVMSGEUVV + AVMSGEVI +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 9b99416b95..0e1d482f1d 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1210,6 +1210,13 @@ func validateRVFV(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateRVI(ctxt *obj.Link, ins *instruction) { ++ wantIntReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRVIV(ctxt *obj.Link, ins *instruction) { + wantVectorReg(ctxt, ins, "vd", ins.rd) + wantIntReg(ctxt, ins, "rs1", ins.rs1) +@@ -1459,6 +1466,10 @@ func encodeRVFV(ins *instruction) uint32 { + return encodeR(ins.as, regF(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) + } + ++func encodeRVI(ins *instruction) uint32 { ++ return encodeR(ins.as, 0, regV(ins.rs2), regI(ins.rd), ins.funct3, ins.funct7) ++} ++ + func encodeRVIV(ins *instruction) uint32 { + return encodeR(ins.as, regI(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) + } +@@ -1763,6 +1774,7 @@ var ( + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} + rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} + rVFVEncoding = encoding{encode: encodeRVFV, validate: validateRVFV, length: 4} ++ rVIEncoding = encoding{encode: encodeRVI, validate: validateRVI, length: 4} + rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4} + rVVEncoding = encoding{encode: encodeRVV, validate: validateRVV, length: 4} + rVViEncoding = encoding{encode: encodeRVVi, validate: validateRVVi, length: 4} +@@ -2491,6 +2503,23 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVFWREDOSUMVS & obj.AMask: {enc: rVVVEncoding}, + AVFWREDUSUMVS & obj.AMask: {enc: rVVVEncoding}, + ++ // 31.15: Vector Mask Instructions ++ AVMANDMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMNANDMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMANDNMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMXORMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMORMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMNORMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMORNMM & obj.AMask: {enc: rVVVEncoding}, ++ AVMXNORMM & obj.AMask: {enc: rVVVEncoding}, ++ AVCPOPM & obj.AMask: {enc: rVIEncoding}, ++ AVFIRSTM & obj.AMask: {enc: rVIEncoding}, ++ AVMSBFM & obj.AMask: {enc: rVVEncoding}, ++ AVMSIFM & obj.AMask: {enc: rVVEncoding}, ++ AVMSOFM & obj.AMask: {enc: rVVEncoding}, ++ AVIOTAM & obj.AMask: {enc: rVVEncoding}, ++ AVIDV & obj.AMask: {enc: rVVEncoding}, ++ + // + // Privileged ISA + // +@@ -3647,6 +3676,47 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.as = AVFSGNJNVV + } + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.From.Reg) ++ ++ case AVMANDMM, AVMNANDMM, AVMANDNMM, AVMXORMM, AVMORMM, AVMNORMM, AVMORNMM, AVMXNORMM, AVMMVM, AVMNOTM: ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg) ++ switch ins.as { ++ case AVMMVM: ++ ins.as, ins.rs2 = AVMANDMM, ins.rs1 ++ case AVMNOTM: ++ ins.as, ins.rs2 = AVMNANDMM, ins.rs1 ++ } ++ ++ case AVMCLRM, AVMSETM: ++ ins.rd, ins.rs1, ins.rs2 = uint32(p.From.Reg), uint32(p.From.Reg), uint32(p.From.Reg) ++ switch ins.as { ++ case AVMCLRM: ++ ins.as = AVMXORMM ++ case AVMSETM: ++ ins.as = AVMXNORMM ++ 
} ++ ++ case AVCPOPM, AVFIRSTM, AVMSBFM, AVMSIFM, AVMSOFM, AVIOTAM: ++ // Set mask bit ++ switch { ++ case ins.rs1 == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rs1 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ ins.rs1 = obj.REG_NONE ++ ++ case AVIDV: ++ // Set mask bit ++ switch { ++ case ins.rd == obj.REG_NONE: ++ ins.funct7 |= 1 // unmasked ++ case ins.rd != obj.REG_NONE && ins.rs2 != REG_V0: ++ p.Ctxt.Diag("%v: invalid vector mask register", p) ++ } ++ if ins.rd == obj.REG_NONE { ++ ins.rd = uint32(p.From.Reg) ++ } ++ ins.rs1, ins.rs2 = obj.REG_NONE, REG_V0 + } + + for _, ins := range inss { +-- +2.39.5 + diff --git a/2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch b/2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch new file mode 100644 index 0000000..12f15e8 --- /dev/null +++ b/2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch @@ -0,0 +1,287 @@ +From 95cd5726bc0fd0b5f20049fad6b698981e404d04 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 098/119] cmd/internal/obj/riscv: add support for vector + permutation instructions + +Add support for vector permutation instructions to the RISC-V assembler. +This includes integer scalar move, floating point scalar move, slide up +and slide down, register gather, compression and whole vector register +move instructions. + +Change-Id: I1da9f393091504fd81714006355725b8b9ecadea +Reviewed-on: https://go-review.googlesource.com/c/go/+/646780 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Carlos Amedee +Reviewed-by: Mark Ryan +Reviewed-by: Junyang Shao +Reviewed-by: Meng Zhuo +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 45 +++++++++++ + .../asm/internal/asm/testdata/riscv64error.s | 12 +++ + .../internal/asm/testdata/riscv64validation.s | 28 +++++++ + src/cmd/internal/obj/riscv/obj.go | 77 ++++++++++++++++++- + 4 files changed, 159 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index ffffbf4bd4..2bab6842e7 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -1229,6 +1229,51 @@ start: + VIDV V3 // d7a10852 + VIDV V0, V3 // d7a10850 + ++ // 31.16.1: Integer Scalar Move Instructions ++ VMVXS V2, X10 // 57252042 ++ VMVSX X10, V2 // 57610542 ++ ++ // 31.16.2: Floating-Point Scalar Move Instructions ++ VFMVFS V2, F10 // 57152042 ++ VFMVSF F10, V2 // 57510542 ++ ++ // 31.16.3: Vector Slide Instructions ++ VSLIDEUPVX X10, V2, V3 // d741253a ++ VSLIDEUPVX X10, V2, V0, V3 // d7412538 ++ VSLIDEUPVI $16, V2, V3 // d731283a ++ VSLIDEUPVI $16, V2, V0, V3 // d7312838 ++ VSLIDEDOWNVX X10, V2, V3 // d741253e ++ VSLIDEDOWNVX X10, V2, V0, V3 // d741253c ++ VSLIDEDOWNVI $16, V2, V3 // d731283e ++ VSLIDEDOWNVI $16, V2, V0, V3 // d731283c ++ VSLIDE1UPVX X10, V2, V3 // d761253a ++ VSLIDE1UPVX X10, V2, V0, V3 // d7612538 ++ VFSLIDE1UPVF F10, V2, V3 // d751253a ++ VFSLIDE1UPVF F10, V2, V0, V3 // d7512538 ++ VSLIDE1DOWNVX X10, V2, V3 // d761253e ++ VSLIDE1DOWNVX X10, V2, V0, V3 // d761253c ++ VFSLIDE1DOWNVF F10, V2, V3 // d751253e ++ VFSLIDE1DOWNVF F10, V2, V0, V3 // d751253c ++ ++ // 31.16.4: Vector Register Gather Instructions ++ VRGATHERVV V1, V2, V3 // d7812032 ++ VRGATHERVV V1, V2, V0, V3 // d7812030 ++ VRGATHEREI16VV V1, V2, V3 // d781203a ++ VRGATHEREI16VV V1, V2, V0, V3 // d7812038 ++ VRGATHERVX X10, V2, V3 // d7412532 ++ VRGATHERVX X10, V2, V0, V3 // d7412530 ++ VRGATHERVI $16, V2, V3 // 
d7312832 ++ VRGATHERVI $16, V2, V0, V3 // d7312830 ++ ++ // 31.16.5: Vector Compress Instruction ++ VCOMPRESSVM V1, V2, V3 // d7a1205e ++ ++ // 31.16.6: Whole Vector Register Move ++ VMV1RV V2, V1 // d730209e ++ VMV2RV V12, V10 // 57b5c09e ++ VMV4RV V8, V4 // 57b2819e ++ VMV8RV V8, V0 // 57b0839e ++ + // + // Privileged ISA + // +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index b076cf50e0..4238197893 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -368,5 +368,17 @@ TEXT errors(SB),$0 + VMSIFM V2, V4, V3 // ERROR "invalid vector mask register" + VMSOFM V2, V4, V3 // ERROR "invalid vector mask register" + VIOTAM V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDEUPVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDEUPVI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDEDOWNVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDEDOWNVI $16, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDE1UPVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSLIDE1UPVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VSLIDE1DOWNVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VFSLIDE1DOWNVF F10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VRGATHERVV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VRGATHEREI16VV V1, V2, V4, V3 // ERROR "invalid vector mask register" ++ VRGATHERVX X10, V2, V4, V3 // ERROR "invalid vector mask register" ++ VRGATHERVI $16, V2, V4, V3 // ERROR "invalid vector mask register" + + RET +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 8b0349584f..374a97dcfe 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ -399,5 +399,33 @@ TEXT validation(SB),$0 + VMSOFM X10, V3 // ERROR "expected vector register in vs2 position" + VIOTAM X10, V3 // ERROR "expected vector register in vs2 position" + VIDV X10 // ERROR "expected vector register in vd position" ++ VMVXS X11, X10 // ERROR "expected vector register in vs2 position" ++ VMVXS V2, V1 // ERROR "expected integer register in rd position" ++ VMVSX X11, X10 // ERROR "expected vector register in vd position" ++ VMVSX V2, V1 // ERROR "expected integer register in rs2 position" ++ VFMVFS X10, F10 // ERROR "expected vector register in vs2 position" ++ VFMVFS V2, V1 // ERROR "expected float register in rd position" ++ VFMVSF X10, V2 // ERROR "expected float register in rs2 position" ++ VFMVSF V2, V1 // ERROR "expected float register in rs2 position" ++ VSLIDEUPVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSLIDEUPVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSLIDEUPVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSLIDEDOWNVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VSLIDEDOWNVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VSLIDEDOWNVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VSLIDE1UPVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VFSLIDE1UPVF V1, V2, V3 // ERROR "expected float register in rs1 position" ++ VSLIDE1DOWNVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VFSLIDE1DOWNVF V1, V2, V3 // ERROR "expected float 
register in rs1 position" ++ VRGATHERVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VRGATHEREI16VV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VRGATHERVX V1, V2, V3 // ERROR "expected integer register in rs1 position" ++ VRGATHERVI $-1, V2, V3 // ERROR "unsigned immediate -1 must be in range [0, 31]" ++ VRGATHERVI $32, V2, V3 // ERROR "unsigned immediate 32 must be in range [0, 31]" ++ VCOMPRESSVM X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMV1RV X10, V1 // ERROR "expected vector register in vs2 position" ++ VMV2RV X10, V10 // ERROR "expected vector register in vs2 position" ++ VMV4RV X10, V4 // ERROR "expected vector register in vs2 position" ++ VMV8RV X10, V0 // ERROR "expected vector register in vs2 position" + + RET +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 0e1d482f1d..4fbf4b4336 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -1189,6 +1189,13 @@ func validateRFI(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateRFV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantFloatReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRFF(ctxt *obj.Link, ins *instruction) { + wantFloatReg(ctxt, ins, "rd", ins.rd) + wantNoneReg(ctxt, ins, "rs1", ins.rs1) +@@ -1203,6 +1210,20 @@ func validateRIF(ctxt *obj.Link, ins *instruction) { + wantNoneReg(ctxt, ins, "rs3", ins.rs3) + } + ++func validateRIV(ctxt *obj.Link, ins *instruction) { ++ wantVectorReg(ctxt, ins, "vd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantIntReg(ctxt, ins, "rs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ ++func validateRVF(ctxt *obj.Link, ins *instruction) { ++ wantFloatReg(ctxt, ins, "rd", ins.rd) ++ wantNoneReg(ctxt, ins, "rs1", ins.rs1) ++ wantVectorReg(ctxt, ins, "vs2", ins.rs2) ++ wantNoneReg(ctxt, ins, "rs3", ins.rs3) ++} ++ + func validateRVFV(ctxt *obj.Link, ins *instruction) { + wantVectorReg(ctxt, ins, "vd", ins.rd) + wantFloatReg(ctxt, ins, "rs1", ins.rs1) +@@ -1458,10 +1479,22 @@ func encodeRFF(ins *instruction) uint32 { + return encodeR(ins.as, regF(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7) + } + ++func encodeRFV(ins *instruction) uint32 { ++ return encodeR(ins.as, regF(ins.rs2), 0, regV(ins.rd), ins.funct3, ins.funct7) ++} ++ + func encodeRIF(ins *instruction) uint32 { + return encodeR(ins.as, regI(ins.rs2), 0, regF(ins.rd), ins.funct3, ins.funct7) + } + ++func encodeRIV(ins *instruction) uint32 { ++ return encodeR(ins.as, regI(ins.rs2), 0, regV(ins.rd), ins.funct3, ins.funct7) ++} ++ ++func encodeRVF(ins *instruction) uint32 { ++ return encodeR(ins.as, 0, regV(ins.rs2), regF(ins.rd), ins.funct3, ins.funct7) ++} ++ + func encodeRVFV(ins *instruction) uint32 { + return encodeR(ins.as, regF(ins.rs1), regV(ins.rs2), regV(ins.rd), ins.funct3, ins.funct7) + } +@@ -1771,8 +1804,11 @@ var ( + rFFFFEncoding = encoding{encode: encodeRFFFF, validate: validateRFFFF, length: 4} + rFFIEncoding = encoding{encode: encodeRFFI, validate: validateRFFI, length: 4} + rFIEncoding = encoding{encode: encodeRFI, validate: validateRFI, length: 4} ++ rFVEncoding = encoding{encode: encodeRFV, validate: validateRFV, length: 4} + rIFEncoding = encoding{encode: encodeRIF, validate: validateRIF, length: 4} ++ rIVEncoding = encoding{encode: encodeRIV, validate: validateRIV, length: 4} + 
rFFEncoding = encoding{encode: encodeRFF, validate: validateRFF, length: 4} ++ rVFEncoding = encoding{encode: encodeRVF, validate: validateRVF, length: 4} + rVFVEncoding = encoding{encode: encodeRVFV, validate: validateRVFV, length: 4} + rVIEncoding = encoding{encode: encodeRVI, validate: validateRVI, length: 4} + rVIVEncoding = encoding{encode: encodeRVIV, validate: validateRVIV, length: 4} +@@ -2520,6 +2556,39 @@ var instructions = [ALAST & obj.AMask]instructionData{ + AVIOTAM & obj.AMask: {enc: rVVEncoding}, + AVIDV & obj.AMask: {enc: rVVEncoding}, + ++ // 31.16.1: Integer Scalar Move Instructions ++ AVMVXS & obj.AMask: {enc: rVIEncoding}, ++ AVMVSX & obj.AMask: {enc: rIVEncoding}, ++ ++ // 31.16.2: Floating-Point Scalar Move Instructions ++ AVFMVFS & obj.AMask: {enc: rVFEncoding}, ++ AVFMVSF & obj.AMask: {enc: rFVEncoding}, ++ ++ // 31.16.3: Vector Slide Instructions ++ AVSLIDEUPVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSLIDEUPVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSLIDEDOWNVX & obj.AMask: {enc: rVIVEncoding}, ++ AVSLIDEDOWNVI & obj.AMask: {enc: rVVuEncoding}, ++ AVSLIDE1UPVX & obj.AMask: {enc: rVIVEncoding}, ++ AVFSLIDE1UPVF & obj.AMask: {enc: rVFVEncoding}, ++ AVSLIDE1DOWNVX & obj.AMask: {enc: rVIVEncoding}, ++ AVFSLIDE1DOWNVF & obj.AMask: {enc: rVFVEncoding}, ++ ++ // 31.16.4: Vector Register Gather Instructions ++ AVRGATHERVV & obj.AMask: {enc: rVVVEncoding}, ++ AVRGATHEREI16VV & obj.AMask: {enc: rVVVEncoding}, ++ AVRGATHERVX & obj.AMask: {enc: rVIVEncoding}, ++ AVRGATHERVI & obj.AMask: {enc: rVVuEncoding}, ++ ++ // 31.16.5: Vector Compress Instruction ++ AVCOMPRESSVM & obj.AMask: {enc: rVVVEncoding}, ++ ++ // 31.16.6: Whole Vector Register Move ++ AVMV1RV & obj.AMask: {enc: rVVEncoding}, ++ AVMV2RV & obj.AMask: {enc: rVVEncoding}, ++ AVMV4RV & obj.AMask: {enc: rVVEncoding}, ++ AVMV8RV & obj.AMask: {enc: rVVEncoding}, ++ + // + // Privileged ISA + // +@@ -3515,7 +3584,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + AVFSGNJVV, AVFSGNJVF, AVFSGNJNVV, AVFSGNJNVF, AVFSGNJXVV, AVFSGNJXVF, + AVMFEQVV, AVMFEQVF, AVMFNEVV, AVMFNEVF, AVMFLTVV, AVMFLTVF, AVMFLEVV, AVMFLEVF, AVMFGTVF, AVMFGEVF, + AVREDSUMVS, AVREDMAXUVS, AVREDMAXVS, AVREDMINUVS, AVREDMINVS, AVREDANDVS, AVREDORVS, AVREDXORVS, +- AVWREDSUMUVS, AVWREDSUMVS, AVFREDOSUMVS, AVFREDUSUMVS, AVFREDMAXVS, AVFREDMINVS, AVFWREDOSUMVS, AVFWREDUSUMVS: ++ AVWREDSUMUVS, AVWREDSUMVS, AVFREDOSUMVS, AVFREDUSUMVS, AVFREDMAXVS, AVFREDMINVS, AVFWREDOSUMVS, AVFWREDUSUMVS, ++ AVSLIDEUPVX, AVSLIDEDOWNVX, AVSLIDE1UPVX, AVFSLIDE1UPVF, AVSLIDE1DOWNVX, AVFSLIDE1DOWNVF, ++ AVRGATHERVV, AVRGATHEREI16VV, AVRGATHERVX: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +@@ -3537,7 +3608,7 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.rd, ins.rs1, ins.rs2, ins.rs3 = uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), obj.REG_NONE + + case AVADDVI, AVRSUBVI, AVANDVI, AVORVI, AVXORVI, AVMSEQVI, AVMSNEVI, AVMSLEUVI, AVMSLEVI, AVMSGTUVI, AVMSGTVI, +- AVSLLVI, AVSRLVI, AVSRAVI, AVNSRLWI, AVNSRAWI: ++ AVSLLVI, AVSRLVI, AVSRAVI, AVNSRLWI, AVNSRAWI, AVRGATHERVI, AVSLIDEUPVI, AVSLIDEDOWNVI: + // Set mask bit + switch { + case ins.rs3 == obj.REG_NONE: +@@ -3677,7 +3748,7 @@ func instructionsForProg(p *obj.Prog) []*instruction { + } + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.From.Reg) + +- case AVMANDMM, AVMNANDMM, AVMANDNMM, AVMXORMM, AVMORMM, AVMNORMM, AVMORNMM, AVMXNORMM, AVMMVM, AVMNOTM: ++ case AVMANDMM, AVMNANDMM, AVMANDNMM, AVMXORMM, AVMORMM, AVMNORMM, AVMORNMM, AVMXNORMM, AVMMVM, 
AVMNOTM, AVCOMPRESSVM: + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg) + switch ins.as { + case AVMMVM: +-- +2.39.5 + diff --git a/2099-internal-bytealg-vector-implementation-of-equal-for-.patch b/2099-internal-bytealg-vector-implementation-of-equal-for-.patch new file mode 100644 index 0000000..ba0dfc3 --- /dev/null +++ b/2099-internal-bytealg-vector-implementation-of-equal-for-.patch @@ -0,0 +1,186 @@ +From aeae24dea6c37769461cc13c64acc8e4ce27afa9 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 099/119] internal/bytealg: vector implementation of equal for + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide a vector implementation of equal for riscv64, which is used +when compiled with the rva23u64 profile, or when vector is detected +to be available. Inputs that are 8 byte aligned will still be handled +via a the non-vector code if the length is less than or equal to 64 +bytes. + +On a Banana Pi F3, with GORISCV64=rva23u64: + + │ equal.1 │ equal.2 │ + │ sec/op │ sec/op vs base │ +Equal/0-8 1.254n ± 0% 1.254n ± 0% ~ (p=1.000 n=10) +Equal/same/1-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.466 n=10) +Equal/same/6-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.689 n=10) +Equal/same/9-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.861 n=10) +Equal/same/15-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.657 n=10) +Equal/same/16-8 21.32n ± 0% 21.33n ± 0% ~ (p=0.075 n=10) +Equal/same/20-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.249 n=10) +Equal/same/32-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.303 n=10) +Equal/same/4K-8 21.32n ± 0% 21.32n ± 0% ~ (p=1.000 n=10) +Equal/same/4M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.582 n=10) +Equal/same/64M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.930 n=10) +Equal/1-8 39.16n ± 1% 38.71n ± 0% -1.15% (p=0.000 n=10) +Equal/6-8 51.49n ± 1% 50.40n ± 1% -2.12% (p=0.000 n=10) +Equal/9-8 54.46n ± 1% 53.89n ± 0% -1.04% (p=0.000 n=10) +Equal/15-8 71.81n ± 1% 70.59n ± 0% -1.71% (p=0.000 n=10) +Equal/16-8 69.14n ± 0% 68.21n ± 0% -1.34% (p=0.000 n=10) +Equal/20-8 78.59n ± 0% 77.59n ± 0% -1.26% (p=0.000 n=10) +Equal/32-8 41.55n ± 0% 41.16n ± 0% -0.96% (p=0.000 n=10) +Equal/4K-8 925.5n ± 0% 561.4n ± 1% -39.34% (p=0.000 n=10) +Equal/4M-8 3.110m ± 32% 2.463m ± 16% -20.80% (p=0.000 n=10) +Equal/64M-8 47.34m ± 30% 39.89m ± 16% -15.75% (p=0.004 n=10) +EqualBothUnaligned/64_0-8 32.17n ± 1% 32.11n ± 1% ~ (p=0.184 n=10) +EqualBothUnaligned/64_1-8 79.48n ± 0% 48.24n ± 1% -39.31% (p=0.000 n=10) +EqualBothUnaligned/64_4-8 72.71n ± 0% 48.37n ± 1% -33.48% (p=0.000 n=10) +EqualBothUnaligned/64_7-8 77.12n ± 0% 48.16n ± 1% -37.56% (p=0.000 n=10) +EqualBothUnaligned/4096_0-8 908.4n ± 0% 562.4n ± 2% -38.09% (p=0.000 n=10) +EqualBothUnaligned/4096_1-8 956.6n ± 0% 571.4n ± 3% -40.26% (p=0.000 n=10) +EqualBothUnaligned/4096_4-8 949.6n ± 0% 571.6n ± 3% -39.81% (p=0.000 n=10) +EqualBothUnaligned/4096_7-8 954.2n ± 0% 571.7n ± 3% -40.09% (p=0.000 n=10) +EqualBothUnaligned/4194304_0-8 2.935m ± 29% 2.664m ± 19% ~ (p=0.089 n=10) +EqualBothUnaligned/4194304_1-8 3.341m ± 13% 2.896m ± 34% ~ (p=0.075 n=10) +EqualBothUnaligned/4194304_4-8 3.204m ± 39% 3.352m ± 33% ~ (p=0.796 n=10) +EqualBothUnaligned/4194304_7-8 3.226m ± 30% 2.737m ± 34% -15.16% (p=0.043 n=10) +EqualBothUnaligned/67108864_0-8 49.04m ± 17% 39.94m ± 12% -18.57% (p=0.005 n=10) +EqualBothUnaligned/67108864_1-8 51.96m ± 15% 42.48m ± 15% -18.23% (p=0.015 n=10) +EqualBothUnaligned/67108864_4-8 47.67m ± 17% 37.85m ± 41% -20.61% (p=0.035 n=10) +EqualBothUnaligned/67108864_7-8 53.00m ± 22% 38.76m 
± 21% -26.87% (p=0.000 n=10) +CompareBytesEqual-8 51.71n ± 1% 52.00n ± 0% +0.57% (p=0.002 n=10) +geomean 1.469µ 1.265µ -13.93% + + │ equal.1 │ equal.2 │ + │ B/s │ B/s vs base │ +Equal/same/1-8 44.73Mi ± 0% 44.72Mi ± 0% ~ (p=0.426 n=10) +Equal/same/6-8 268.3Mi ± 0% 268.4Mi ± 0% ~ (p=0.753 n=10) +Equal/same/9-8 402.6Mi ± 0% 402.5Mi ± 0% ~ (p=0.209 n=10) +Equal/same/15-8 670.9Mi ± 0% 670.9Mi ± 0% ~ (p=0.724 n=10) +Equal/same/16-8 715.6Mi ± 0% 715.4Mi ± 0% -0.04% (p=0.022 n=10) +Equal/same/20-8 894.6Mi ± 0% 894.5Mi ± 0% ~ (p=0.060 n=10) +Equal/same/32-8 1.398Gi ± 0% 1.398Gi ± 0% ~ (p=0.986 n=10) +Equal/same/4K-8 178.9Gi ± 0% 178.9Gi ± 0% ~ (p=0.853 n=10) +Equal/same/4M-8 178.9Ti ± 0% 178.9Ti ± 0% ~ (p=0.971 n=10) +Equal/same/64M-8 2862.8Ti ± 0% 2862.6Ti ± 0% ~ (p=0.971 n=10) +Equal/1-8 24.35Mi ± 1% 24.63Mi ± 0% +1.16% (p=0.000 n=10) +Equal/6-8 111.1Mi ± 1% 113.5Mi ± 1% +2.17% (p=0.000 n=10) +Equal/9-8 157.6Mi ± 1% 159.3Mi ± 0% +1.05% (p=0.000 n=10) +Equal/15-8 199.2Mi ± 1% 202.7Mi ± 0% +1.74% (p=0.000 n=10) +Equal/16-8 220.7Mi ± 0% 223.7Mi ± 0% +1.36% (p=0.000 n=10) +Equal/20-8 242.7Mi ± 0% 245.8Mi ± 0% +1.27% (p=0.000 n=10) +Equal/32-8 734.3Mi ± 0% 741.6Mi ± 0% +0.98% (p=0.000 n=10) +Equal/4K-8 4.122Gi ± 0% 6.795Gi ± 1% +64.84% (p=0.000 n=10) +Equal/4M-8 1.258Gi ± 24% 1.586Gi ± 14% +26.12% (p=0.000 n=10) +Equal/64M-8 1.320Gi ± 23% 1.567Gi ± 14% +18.69% (p=0.004 n=10) +EqualBothUnaligned/64_0-8 1.853Gi ± 1% 1.856Gi ± 1% ~ (p=0.190 n=10) +EqualBothUnaligned/64_1-8 767.9Mi ± 0% 1265.2Mi ± 1% +64.76% (p=0.000 n=10) +EqualBothUnaligned/64_4-8 839.4Mi ± 0% 1261.9Mi ± 1% +50.33% (p=0.000 n=10) +EqualBothUnaligned/64_7-8 791.4Mi ± 0% 1267.5Mi ± 1% +60.16% (p=0.000 n=10) +EqualBothUnaligned/4096_0-8 4.199Gi ± 0% 6.784Gi ± 2% +61.54% (p=0.000 n=10) +EqualBothUnaligned/4096_1-8 3.988Gi ± 0% 6.676Gi ± 3% +67.40% (p=0.000 n=10) +EqualBothUnaligned/4096_4-8 4.017Gi ± 0% 6.674Gi ± 3% +66.14% (p=0.000 n=10) +EqualBothUnaligned/4096_7-8 3.998Gi ± 0% 6.673Gi ± 3% +66.92% (p=0.000 n=10) +EqualBothUnaligned/4194304_0-8 1.332Gi ± 22% 1.468Gi ± 16% ~ (p=0.089 n=10) +EqualBothUnaligned/4194304_1-8 1.169Gi ± 12% 1.350Gi ± 25% ~ (p=0.075 n=10) +EqualBothUnaligned/4194304_4-8 1.222Gi ± 28% 1.165Gi ± 48% ~ (p=0.796 n=10) +EqualBothUnaligned/4194304_7-8 1.211Gi ± 23% 1.427Gi ± 26% +17.88% (p=0.043 n=10) +EqualBothUnaligned/67108864_0-8 1.274Gi ± 14% 1.567Gi ± 14% +22.97% (p=0.005 n=10) +EqualBothUnaligned/67108864_1-8 1.204Gi ± 14% 1.471Gi ± 13% +22.18% (p=0.015 n=10) +EqualBothUnaligned/67108864_4-8 1.311Gi ± 14% 1.651Gi ± 29% +25.92% (p=0.035 n=10) +EqualBothUnaligned/67108864_7-8 1.179Gi ± 18% 1.612Gi ± 17% +36.73% (p=0.000 n=10) +geomean 1.870Gi 2.190Gi +17.16% + +Change-Id: I9c5270bcc6997d020a96d1e97c7e7cfc7ca7fd34 +Reviewed-on: https://go-review.googlesource.com/c/go/+/646736 +Reviewed-by: Mark Ryan +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Mark Freeman +--- + src/internal/bytealg/bytealg.go | 10 ++++++---- + src/internal/bytealg/equal_riscv64.s | 30 ++++++++++++++++++++++++++++ + 2 files changed, 36 insertions(+), 4 deletions(-) + +diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go +index ae4b8b48d2..37881f75e6 100644 +--- a/src/internal/bytealg/bytealg.go ++++ b/src/internal/bytealg/bytealg.go +@@ -11,13 +11,15 @@ import ( + + // Offsets into internal/cpu records for use in assembly. 
+ const ( +- offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42) +- offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) +- offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) ++ offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9) ++ ++ offsetRISCV64HasV = unsafe.Offsetof(cpu.RISCV64.HasV) + + offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX) + +- offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9) ++ offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42) ++ offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) ++ offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) + ) + + // MaxLen is the maximum length of the string to be searched for (argument b) in Index. +diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s +index 87b2d79302..58e033f847 100644 +--- a/src/internal/bytealg/equal_riscv64.s ++++ b/src/internal/bytealg/equal_riscv64.s +@@ -2,6 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + ++#include "asm_riscv64.h" + #include "go_asm.h" + #include "textflag.h" + +@@ -28,6 +29,35 @@ length_check: + MOV $32, X23 + BLT X12, X23, loop4_check + ++#ifndef hasV ++ MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5 ++ BEQZ X5, equal_scalar ++#endif ++ ++ // Use vector if not 8 byte aligned. ++ OR X10, X11, X5 ++ AND $7, X5 ++ BNEZ X5, vector_loop ++ ++ // Use scalar if 8 byte aligned and <= 64 bytes. ++ SUB $64, X12, X6 ++ BLEZ X6, loop32_check ++ ++ PCALIGN $16 ++vector_loop: ++ VSETVLI X12, E8, M8, TA, MA, X5 ++ VLE8V (X10), V8 ++ VLE8V (X11), V16 ++ VMSNEVV V8, V16, V0 ++ VFIRSTM V0, X6 ++ BGEZ X6, done ++ ADD X5, X10 ++ ADD X5, X11 ++ SUB X5, X12 ++ BNEZ X12, vector_loop ++ JMP done ++ ++equal_scalar: + // Check alignment - if alignment differs we have to do one byte at a time. + AND $7, X10, X9 + AND $7, X11, X19 +-- +2.39.5 + diff --git a/2100-internal-bytealg-vector-implementation-of-indexbyte-.patch b/2100-internal-bytealg-vector-implementation-of-indexbyte-.patch new file mode 100644 index 0000000..48fd0f3 --- /dev/null +++ b/2100-internal-bytealg-vector-implementation-of-indexbyte-.patch @@ -0,0 +1,156 @@ +From 4d2345cf8ba4e61588e881af5d83cc496300c583 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 100/119] internal/bytealg: vector implementation of indexbyte + for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide a vector implementation of indexbyte for riscv64, which is used +when compiled with the rva23u64 profile, or when vector is detected +to be available. Inputs that are smaller than 24 bytes will continue +to use the non-vector path. 
+ +On a Banana Pi F3, with GORISCV64=rva23u64: + + │ indexbyte.1 │ indexbyte.2 │ + │ sec/op │ sec/op vs base │ +IndexByte/10-8 52.68n ± 0% 47.26n ± 0% -10.30% (p=0.000 n=10) +IndexByte/32-8 68.62n ± 0% 47.02n ± 0% -31.49% (p=0.000 n=10) +IndexByte/4K-8 2217.0n ± 0% 420.4n ± 0% -81.04% (p=0.000 n=10) +IndexByte/4M-8 2624.4µ ± 0% 767.5µ ± 0% -70.75% (p=0.000 n=10) +IndexByte/64M-8 68.08m ± 10% 47.84m ± 45% -29.73% (p=0.004 n=10) +geomean 17.03µ 8.073µ -52.59% + + │ indexbyte.1 │ indexbyte.2 │ + │ B/s │ B/s vs base │ +IndexByte/10-8 181.0Mi ± 0% 201.8Mi ± 0% +11.48% (p=0.000 n=10) +IndexByte/32-8 444.7Mi ± 0% 649.1Mi ± 0% +45.97% (p=0.000 n=10) +IndexByte/4K-8 1.721Gi ± 0% 9.076Gi ± 0% +427.51% (p=0.000 n=10) +IndexByte/4M-8 1.488Gi ± 0% 5.089Gi ± 0% +241.93% (p=0.000 n=10) +IndexByte/64M-8 940.3Mi ± 9% 1337.8Mi ± 31% +42.27% (p=0.004 n=10) +geomean 727.1Mi 1.498Gi +110.94% + +Change-Id: If7b0dbef38d76fa7a2021e4ecaed668a1d4b9783 +Reviewed-on: https://go-review.googlesource.com/c/go/+/648856 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Mark Freeman +Reviewed-by: Mark Ryan +Reviewed-by: Dmitri Shuralyov +--- + src/internal/bytealg/indexbyte_riscv64.s | 60 ++++++++++++++++-------- + 1 file changed, 41 insertions(+), 19 deletions(-) + +diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s +index fde00da0ea..527ae6d35e 100644 +--- a/src/internal/bytealg/indexbyte_riscv64.s ++++ b/src/internal/bytealg/indexbyte_riscv64.s +@@ -2,6 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + ++#include "asm_riscv64.h" + #include "go_asm.h" + #include "textflag.h" + +@@ -11,12 +12,14 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + // X12 = b_cap (unused) + // X13 = byte to find + AND $0xff, X13, X12 // x12 byte to look for +- MOV X10, X13 // store base for later + + SLTI $24, X11, X14 +- ADD X10, X11 // end +- BEQZ X14, bigBody ++ BNEZ X14, small ++ JMP indexByteBig<>(SB) + ++small: ++ MOV X10, X13 // store base for later ++ ADD X10, X11 // end + SUB $1, X10 + loop: + ADD $1, X10 +@@ -31,21 +34,19 @@ notfound: + MOV $-1, X10 + RET + +-bigBody: +- JMP indexByteBig<>(SB) +- + TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + // X10 = b_base + // X11 = b_len + // X12 = byte to find +- + AND $0xff, X12 // x12 byte to look for +- MOV X10, X13 // store base for later + + SLTI $24, X11, X14 +- ADD X10, X11 // end +- BEQZ X14, bigBody ++ BNEZ X14, small ++ JMP indexByteBig<>(SB) + ++small: ++ MOV X10, X13 // store base for later ++ ADD X10, X11 // end + SUB $1, X10 + loop: + ADD $1, X10 +@@ -60,20 +61,41 @@ notfound: + MOV $-1, X10 + RET + +-bigBody: +- JMP indexByteBig<>(SB) +- + TEXT indexByteBig<>(SB),NOSPLIT|NOFRAME,$0 +- // On entry ++ // On entry: + // X10 = b_base +- // X11 = end ++ // X11 = b_len (at least 16 bytes) + // X12 = byte to find +- // X13 = b_base +- // X11 is at least 16 bytes > X10 +- +- // On exit ++ // On exit: + // X10 = index of first instance of sought byte, if found, or -1 otherwise + ++ MOV X10, X13 // store base for later ++ ++#ifndef hasV ++ MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5 ++ BEQZ X5, indexbyte_scalar ++#endif ++ ++ PCALIGN $16 ++vector_loop: ++ VSETVLI X11, E8, M8, TA, MA, X5 ++ VLE8V (X10), V8 ++ VMSEQVX X12, V8, V0 ++ VFIRSTM V0, X6 ++ BGEZ X6, vector_found ++ ADD X5, X10 ++ SUB X5, X11 ++ BNEZ X11, vector_loop ++ JMP notfound ++ ++vector_found: ++ SUB X13, X10 ++ ADD X6, X10 ++ RET ++ ++indexbyte_scalar: ++ ADD X10, X11 // end ++ + // Process the 
first few bytes until we get to an 8 byte boundary + // No need to check for end here as we have at least 16 bytes in + // the buffer. +-- +2.39.5 + diff --git a/2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch b/2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch new file mode 100644 index 0000000..871b3de --- /dev/null +++ b/2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch @@ -0,0 +1,123 @@ +From 73a717cb05344f5ad53e15755c2ed0146220dd1b Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 101/119] cmd/internal/obj/riscv: reject invalid vadc/vsbc + encodings + +The RISC-V Instruction Set Manual Volume states that "for vadc and +vsbc, the instruction encoding is reserved if the destination vector +register is v0". The assembler currently allows instructions like + +VADCVVM V1, V2, V0, V0 + +to be assembled. It's not clear what the behaviour of such +instructions will be on target hardware so it's best to disallow +them. + +For reference, binutils (2.44-3.fc42) allows the instruction + +vadc.vvm v0, v4, v8, v0 + +to be assembled and the instruction actually executes on a Banana PI +F3 without crashing. However, clang (20.1.2) refuses to assemble the +instruction, producing the following error. + +error: the destination vector register group cannot be V0 + vadc.vvm v0, v4, v8, v0 + ^ +Change-Id: Ia913cbd864ae8dbcf9227f69b963c93a99481cff +Reviewed-on: https://go-review.googlesource.com/c/go/+/669315 +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Joel Sing +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 10 ++++++++++ + src/cmd/asm/internal/asm/testdata/riscv64error.s | 5 +++++ + src/cmd/internal/obj/riscv/obj.go | 9 +++++++-- + 3 files changed, 22 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 2bab6842e7..01838664a3 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -623,17 +623,27 @@ start: + VADCVXM X11, V2, V0, V3 // d7c12540 + VADCVIM $15, V2, V0, V3 // d7b12740 + VMADCVVM V1, V2, V0, V3 // d7812044 ++ VMADCVVM V1, V2, V0, V0 // 57802044 + VMADCVXM X11, V2, V0, V3 // d7c12544 ++ VMADCVXM X11, V2, V0, V0 // 57c02544 + VMADCVIM $15, V2, V0, V3 // d7b12744 ++ VMADCVIM $15, V2, V0, V0 // 57b02744 + VMADCVV V1, V2, V3 // d7812046 ++ VMADCVV V1, V2, V0 // 57802046 + VMADCVX X11, V2, V3 // d7c12546 ++ VMADCVX X11, V2, V0 // 57c02546 + VMADCVI $15, V2, V3 // d7b12746 ++ VMADCVI $15, V2, V0 // 57b02746 + VSBCVVM V1, V2, V0, V3 // d7812048 + VSBCVXM X11, V2, V0, V3 // d7c12548 + VMSBCVVM V1, V2, V0, V3 // d781204c ++ VMSBCVVM V1, V2, V0, V0 // 5780204c + VMSBCVXM X11, V2, V0, V3 // d7c1254c ++ VMSBCVXM X11, V2, V0, V0 // 57c0254c + VMSBCVV V1, V2, V3 // d781204e ++ VMSBCVV V1, V2, V0 // 5780204e + VMSBCVX X11, V2, V3 // d7c1254e ++ VMSBCVX X11, V2, V0 // 57c0254e + + // 31.11.5: Vector Bitwise Logical Instructions + VANDVV V1, V2, V3 // d7812026 +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64error.s b/src/cmd/asm/internal/asm/testdata/riscv64error.s +index 4238197893..4e6afa0ac2 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64error.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64error.s +@@ -95,10 +95,13 @@ TEXT errors(SB),$0 + VSEXTVF8 V2, V3, V4 // ERROR "invalid vector mask register" + VADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" + VADCVVM V1, V2, V3 // ERROR "invalid 
vector mask register" ++ VADCVVM V1, V2, V0, V0 // ERROR "invalid destination register V0" + VADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" + VADCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VADCVXM X10, V2, V0, V0 // ERROR "invalid destination register V0" + VADCVIM $15, V2, V1, V3 // ERROR "invalid vector mask register" + VADCVIM $15, V2, V3 // ERROR "invalid vector mask register" ++ VADCVIM $15, V2, V0, V0 // ERROR "invalid destination register V0" + VMADCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" + VMADCVVM V1, V2, V3 // ERROR "invalid vector mask register" + VMADCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +@@ -107,8 +110,10 @@ TEXT errors(SB),$0 + VMADCVIM $15, V2, V3 // ERROR "invalid vector mask register" + VSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" + VSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVVM V1, V2, V0, V0 // ERROR "invalid destination register V0" + VSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" + VSBCVXM X10, V2, V3 // ERROR "invalid vector mask register" ++ VSBCVXM X10, V2, V0, V0 // ERROR "invalid destination register V0" + VMSBCVVM V1, V2, V4, V3 // ERROR "invalid vector mask register" + VMSBCVVM V1, V2, V3 // ERROR "invalid vector mask register" + VMSBCVXM X10, V2, V4, V3 // ERROR "invalid vector mask register" +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 4fbf4b4336..592b7adba3 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -3647,8 +3647,13 @@ func instructionsForProg(p *obj.Prog) []*instruction { + ins.funct7 |= 1 // unmasked + ins.rd, ins.rs1, ins.rs2 = uint32(p.To.Reg), uint32(p.From.Reg), REG_V0 + +- case AVADCVVM, AVADCVXM, AVMADCVVM, AVMADCVXM, AVSBCVVM, AVSBCVXM, AVMSBCVVM, AVMSBCVXM, AVADCVIM, AVMADCVIM, +- AVMERGEVVM, AVMERGEVXM, AVMERGEVIM, AVFMERGEVFM: ++ case AVADCVIM, AVADCVVM, AVADCVXM, AVSBCVVM, AVSBCVXM: ++ if ins.rd == REG_V0 { ++ p.Ctxt.Diag("%v: invalid destination register V0", p) ++ } ++ fallthrough ++ ++ case AVMADCVVM, AVMADCVXM, AVMSBCVVM, AVMSBCVXM, AVMADCVIM, AVMERGEVVM, AVMERGEVXM, AVMERGEVIM, AVFMERGEVFM: + if ins.rs3 != REG_V0 { + p.Ctxt.Diag("%v: invalid vector mask register", p) + } +-- +2.39.5 + diff --git a/2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch b/2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch new file mode 100644 index 0000000..01182b5 --- /dev/null +++ b/2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch @@ -0,0 +1,68 @@ +From 7b07e3ed76af9a2e0e554cd06e9ce0f943a3f9f7 Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 102/119] cmd/internal/obj/riscv: fix LMUL encoding for MF2 and + MF8 + +The encodings for the riscv64 special operands SPOP_MF2 and SPOP_MF8 +are incorrect, i.e., their values are swapped. This leads to +incorrect encodings for the VSETVLI and VSETIVLI instructions. The +assembler currently encodes + +VSETVLI X10, E32, MF8, TA, MA, X12 + +as + +VSETVLI X10, E32, MF2, TA, MA, X12 + +We update the encodings for SPOP_MF2 and SPOP_MF8 so that they match +the LMUL table in section "31.3.4. Vector type register, vtype" of +the "RISC-V Instruction Set Manual Volume 1". 
+ +Change-Id: Ic73355533d7c2a901ee060b35c2f7af6d58453e4 +Reviewed-on: https://go-review.googlesource.com/c/go/+/670016 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Cherry Mui +Reviewed-by: Carlos Amedee +Reviewed-by: Meng Zhuo +Reviewed-by: Joel Sing +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 4 ++-- + src/cmd/internal/obj/riscv/cpu.go | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 01838664a3..103f1e3272 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -438,9 +438,9 @@ start: + VSETVLI X10, E32, M2, TA, MA, X12 // 5776150d + VSETVLI X10, E32, M4, TA, MA, X12 // 5776250d + VSETVLI X10, E32, M8, TA, MA, X12 // 5776350d +- VSETVLI X10, E32, MF2, TA, MA, X12 // 5776550d ++ VSETVLI X10, E32, MF8, TA, MA, X12 // 5776550d + VSETVLI X10, E32, MF4, TA, MA, X12 // 5776650d +- VSETVLI X10, E32, MF8, TA, MA, X12 // 5776750d ++ VSETVLI X10, E32, MF2, TA, MA, X12 // 5776750d + VSETVLI X10, E32, M1, TA, MA, X12 // 5776050d + VSETVLI $15, E32, M1, TA, MA, X12 // 57f607cd + VSETIVLI $0, E32, M1, TA, MA, X12 // 577600cd +diff --git a/src/cmd/internal/obj/riscv/cpu.go b/src/cmd/internal/obj/riscv/cpu.go +index aaf5db9e75..a2b6a436ba 100644 +--- a/src/cmd/internal/obj/riscv/cpu.go ++++ b/src/cmd/internal/obj/riscv/cpu.go +@@ -1287,9 +1287,9 @@ var specialOperands = map[SpecialOperand]struct { + SPOP_M2: {encoding: 1, name: "M2"}, + SPOP_M4: {encoding: 2, name: "M4"}, + SPOP_M8: {encoding: 3, name: "M8"}, +- SPOP_MF2: {encoding: 5, name: "MF2"}, ++ SPOP_MF8: {encoding: 5, name: "MF8"}, + SPOP_MF4: {encoding: 6, name: "MF4"}, +- SPOP_MF8: {encoding: 7, name: "MF8"}, ++ SPOP_MF2: {encoding: 7, name: "MF2"}, + + SPOP_E8: {encoding: 0, name: "E8"}, + SPOP_E16: {encoding: 1, name: "E16"}, +-- +2.39.5 + diff --git a/2103-cmd-compile-add-generic-simplifications-on-riscv64.patch b/2103-cmd-compile-add-generic-simplifications-on-riscv64.patch new file mode 100644 index 0000000..3609d04 --- /dev/null +++ b/2103-cmd-compile-add-generic-simplifications-on-riscv64.patch @@ -0,0 +1,203 @@ +From 8a20dfdf22e664784ddbaa7c1664c1cb6898ae3e Mon Sep 17 00:00:00 2001 +From: Julian Zhu +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 103/119] cmd/compile: add generic simplifications on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +file before after Δ % +addr2line 3636263 3636215 -48 -0.001% +asm 6318110 6317966 -144 -0.002% +buildid 3463352 3463224 -128 -0.004% +cgo 5672502 5672214 -288 -0.005% +compile 26904997 26905719 +722 +0.003% +cover 6405603 6405467 -136 -0.002% +dist 4092630 4092494 -136 -0.003% +doc 9728281 9723977 -4304 -0.044% +fix 4014891 4014835 -56 -0.001% +link 8327674 8327426 -248 -0.003% +nm 3628718 3628494 -224 -0.006% +objdump 5951778 5951626 -152 -0.003% +pack 2896080 2896040 -40 -0.001% +pprof 17596796 17591908 -4888 -0.028% +test2json 3346622 3346566 -56 -0.002% +trace 16179738 16175706 -4032 -0.025% +vet 9603472 9603264 -208 -0.002% +total 156070021 156055655 -14366 -0.009% + +Change-Id: Ie4a79a3c410eb79155ce2418ae64fa670d1ccd53 +Reviewed-on: https://go-review.googlesource.com/c/go/+/673477 +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +LUCI-TryBot-Result: Go LUCI +Auto-Submit: Keith Randall +Reviewed-by: David Chase +--- + .../compile/internal/ssa/_gen/RISCV64.rules | 9 ++ + .../compile/internal/ssa/rewriteRISCV64.go | 87 +++++++++++++++++++ 
+ 2 files changed, 96 insertions(+) + +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 93f4e6a948..f0d2d74b7b 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -719,6 +719,15 @@ + (ROL x (NEG y)) => (ROR x y) + (ROLW x (NEG y)) => (RORW x y) + ++// generic simplifications ++(ADD x (NEG y)) => (SUB x y) ++(SUB x (NEG y)) => (ADD x y) ++(SUB x x) => (MOVDconst [0]) ++(AND x x) => x ++(OR x x) => x ++(ORN x x) => (MOVDconst [-1]) ++(XOR x x) => (MOVDconst [0]) ++ + // Convert const subtraction into ADDI with negative immediate, where possible. + (SUB x (MOVDconst [val])) && is32Bit(-val) => (ADDI [-val] x) + (SUB (MOVDconst [val]) y) && is32Bit(-val) => (NEG (ADDI [-val] y)) +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index c3018f270c..966199c450 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -541,6 +541,8 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64OR(v) + case OpRISCV64ORI: + return rewriteValueRISCV64_OpRISCV64ORI(v) ++ case OpRISCV64ORN: ++ return rewriteValueRISCV64_OpRISCV64ORN(v) + case OpRISCV64ROL: + return rewriteValueRISCV64_OpRISCV64ROL(v) + case OpRISCV64ROLW: +@@ -3317,6 +3319,21 @@ func rewriteValueRISCV64_OpRISCV64ADD(v *Value) bool { + } + break + } ++ // match: (ADD x (NEG y)) ++ // result: (SUB x y) ++ for { ++ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ++ x := v_0 ++ if v_1.Op != OpRISCV64NEG { ++ continue ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64SUB) ++ v.AddArg2(x, y) ++ return true ++ } ++ break ++ } + // match: (ADD (SLLI [1] x) y) + // cond: buildcfg.GORISCV64 >= 22 + // result: (SH1ADD x y) +@@ -3467,6 +3484,16 @@ func rewriteValueRISCV64_OpRISCV64AND(v *Value) bool { + } + break + } ++ // match: (AND x x) ++ // result: x ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.copyOf(x) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64ANDI(v *Value) bool { +@@ -6191,6 +6218,16 @@ func rewriteValueRISCV64_OpRISCV64OR(v *Value) bool { + } + break + } ++ // match: (OR x x) ++ // result: x ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.copyOf(x) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64ORI(v *Value) bool { +@@ -6243,6 +6280,22 @@ func rewriteValueRISCV64_OpRISCV64ORI(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64ORN(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (ORN x x) ++ // result: (MOVDconst [-1]) ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.AuxInt = int64ToAuxInt(-1) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64ROL(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6888,6 +6941,29 @@ func rewriteValueRISCV64_OpRISCV64SUB(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block ++ // match: (SUB x (NEG y)) ++ // result: (ADD x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpRISCV64NEG { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpRISCV64ADD) ++ v.AddArg2(x, y) ++ return true ++ } ++ // match: (SUB x x) ++ // result: (MOVDconst [0]) ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } + // match: (SUB x (MOVDconst [val])) + // cond: 
is32Bit(-val) + // result: (ADDI [-val] x) +@@ -6999,6 +7075,17 @@ func rewriteValueRISCV64_OpRISCV64XOR(v *Value) bool { + } + break + } ++ // match: (XOR x x) ++ // result: (MOVDconst [0]) ++ for { ++ x := v_0 ++ if x != v_1 { ++ break ++ } ++ v.reset(OpRISCV64MOVDconst) ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRotateLeft16(v *Value) bool { +-- +2.39.5 + diff --git a/2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch b/2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch new file mode 100644 index 0000000..a94d9ef --- /dev/null +++ b/2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch @@ -0,0 +1,187 @@ +From 33771744d688d61520413c2fa204cfddce74c10b Mon Sep 17 00:00:00 2001 +From: Mark Ryan +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 104/119] cmd/internal/obj/riscv: fix vector integer multiply + add + +The RISC-V integer vector multiply add instructions are not encoded +correctly; the first and second arguments are swapped. For example, +the instruction + +VMACCVV V1, V2, V3 + +encodes to + +b620a1d7 or vmacc.vv v3,v1,v2 + +and not + +b61121d7 or vmacc.vv v3,v2,v1 + +as expected. + +This is inconsistent with the argument ordering we use for 3 +argument vector instructions, in which the argument order, as given +in the RISC-V specifications, is reversed, and also with the vector +FMA instructions which have the same argument ordering as the vector +integer multiply add instructions in the "The RISC-V Instruction Set +Manual Volume I". For example, in the ISA manual we have the +following instruction definitions + +; Integer multiply-add, overwrite addend +vmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i] + +; FP multiply-accumulate, overwrites addend +vfmacc.vv vd, vs1, vs2, vm # vd[i] = +(vs1[i] * vs2[i]) + vd[i] + +It's reasonable to expect that the Go assembler would use the same +argument ordering for both of these instructions. It currently does +not. + +We fix the issue by switching the argument ordering for the vector +integer multiply add instructions to match those of the vector FMA +instructions. 
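+
+After this change the Go operand order for the integer multiply-add
+forms matches the FMA forms, i.e. (vs2, vs1, vd). For example, as in
+the updated tests:
+
+VMACCVV V2, V1, V3 // vmacc.vv v3,v1,v2: v3[i] = (v1[i] * v2[i]) + v3[i]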
+ +Change-Id: Ib98e9999617f991969e5c831734b3bb3324439f6 +Reviewed-on: https://go-review.googlesource.com/c/go/+/670335 +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Cherry Mui +--- + src/cmd/asm/internal/asm/testdata/riscv64.s | 60 +++++++++---------- + .../internal/asm/testdata/riscv64validation.s | 14 ++--- + src/cmd/internal/obj/riscv/obj.go | 6 +- + 3 files changed, 40 insertions(+), 40 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64.s b/src/cmd/asm/internal/asm/testdata/riscv64.s +index 103f1e3272..1d8c2d3530 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64.s +@@ -830,38 +830,38 @@ start: + VWMULSUVX X10, V2, V0, V3 // d76125e8 + + // 31.11.13: Vector Single-Width Integer Multiply-Add Instructions +- VMACCVV V1, V2, V3 // d7a120b6 +- VMACCVV V1, V2, V0, V3 // d7a120b4 +- VMACCVX X10, V2, V3 // d76125b6 +- VMACCVX X10, V2, V0, V3 // d76125b4 +- VNMSACVV V1, V2, V3 // d7a120be +- VNMSACVV V1, V2, V0, V3 // d7a120bc +- VNMSACVX X10, V2, V3 // d76125be +- VNMSACVX X10, V2, V0, V3 // d76125bc +- VMADDVV V1, V2, V3 // d7a120a6 +- VMADDVV V1, V2, V0, V3 // d7a120a4 +- VMADDVX X10, V2, V3 // d76125a6 +- VMADDVX X10, V2, V0, V3 // d76125a4 +- VNMSUBVV V1, V2, V3 // d7a120ae +- VNMSUBVV V1, V2, V0, V3 // d7a120ac +- VNMSUBVX X10, V2, V3 // d76125ae +- VNMSUBVX X10, V2, V0, V3 // d76125ac ++ VMACCVV V2, V1, V3 // d7a120b6 ++ VMACCVV V2, V1, V0, V3 // d7a120b4 ++ VMACCVX V2, X10, V3 // d76125b6 ++ VMACCVX V2, X10, V0, V3 // d76125b4 ++ VNMSACVV V2, V1, V3 // d7a120be ++ VNMSACVV V2, V1, V0, V3 // d7a120bc ++ VNMSACVX V2, X10, V3 // d76125be ++ VNMSACVX V2, X10, V0, V3 // d76125bc ++ VMADDVV V2, V1, V3 // d7a120a6 ++ VMADDVV V2, V1, V0, V3 // d7a120a4 ++ VMADDVX V2, X10, V3 // d76125a6 ++ VMADDVX V2, X10, V0, V3 // d76125a4 ++ VNMSUBVV V2, V1, V3 // d7a120ae ++ VNMSUBVV V2, V1, V0, V3 // d7a120ac ++ VNMSUBVX V2, X10, V3 // d76125ae ++ VNMSUBVX V2, X10, V0, V3 // d76125ac + + // 31.11.14: Vector Widening Integer Multiply-Add Instructions +- VWMACCUVV V1, V2, V3 // d7a120f2 +- VWMACCUVV V1, V2, V0, V3 // d7a120f0 +- VWMACCUVX X10, V2, V3 // d76125f2 +- VWMACCUVX X10, V2, V0, V3 // d76125f0 +- VWMACCVV V1, V2, V3 // d7a120f6 +- VWMACCVV V1, V2, V0, V3 // d7a120f4 +- VWMACCVX X10, V2, V3 // d76125f6 +- VWMACCVX X10, V2, V0, V3 // d76125f4 +- VWMACCSUVV V1, V2, V3 // d7a120fe +- VWMACCSUVV V1, V2, V0, V3 // d7a120fc +- VWMACCSUVX X10, V2, V3 // d76125fe +- VWMACCSUVX X10, V2, V0, V3 // d76125fc +- VWMACCUSVX X10, V2, V3 // d76125fa +- VWMACCUSVX X10, V2, V0, V3 // d76125f8 ++ VWMACCUVV V2, V1, V3 // d7a120f2 ++ VWMACCUVV V2, V1, V0, V3 // d7a120f0 ++ VWMACCUVX V2, X10, V3 // d76125f2 ++ VWMACCUVX V2, X10, V0, V3 // d76125f0 ++ VWMACCVV V2, V1, V3 // d7a120f6 ++ VWMACCVV V2, V1, V0, V3 // d7a120f4 ++ VWMACCVX V2, X10, V3 // d76125f6 ++ VWMACCVX V2, X10, V0, V3 // d76125f4 ++ VWMACCSUVV V2, V1, V3 // d7a120fe ++ VWMACCSUVV V2, V1, V0, V3 // d7a120fc ++ VWMACCSUVX V2, X10, V3 // d76125fe ++ VWMACCSUVX V2, X10, V0, V3 // d76125fc ++ VWMACCUSVX V2, X10, V3 // d76125fa ++ VWMACCUSVX V2, X10, V0, V3 // d76125f8 + + // 31.11.15: Vector Integer Merge Instructions + VMERGEVVM V1, V2, V0, V3 // d781205c +diff --git a/src/cmd/asm/internal/asm/testdata/riscv64validation.s b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +index 374a97dcfe..55bf518e68 100644 +--- a/src/cmd/asm/internal/asm/testdata/riscv64validation.s ++++ b/src/cmd/asm/internal/asm/testdata/riscv64validation.s +@@ 
-214,19 +214,19 @@ TEXT validation(SB),$0 + VWMULUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" + VWMULSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" + VWMULSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNMSACVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSACVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VNMSACVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VMADDVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VMADDVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VMADDVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VNMSUBVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VNMSUBVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VNMSUBVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCUVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VWMACCUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VWMACCVX V1, V2, V3 // ERROR "expected integer register in rs1 position" +- VWMACCSUVV X10, V2, V3 // ERROR "expected vector register in vs1 position" ++ VWMACCSUVV V2, X10, V3 // ERROR "expected vector register in vs1 position" + VWMACCSUVX V1, V2, V3 // ERROR "expected integer register in rs1 position" + VWMACCUSVX V1, V2, V3 // ERROR "expected integer register in rs1 position" + VMERGEVVM X10, V2, V0, V3 // ERROR "expected vector register in vs1 position" +diff --git a/src/cmd/internal/obj/riscv/obj.go b/src/cmd/internal/obj/riscv/obj.go +index 592b7adba3..0b09a2e79c 100644 +--- a/src/cmd/internal/obj/riscv/obj.go ++++ b/src/cmd/internal/obj/riscv/obj.go +@@ -3571,8 +3571,6 @@ func instructionsForProg(p *obj.Prog) []*instruction { + AVMULVV, AVMULVX, AVMULHVV, AVMULHVX, AVMULHUVV, AVMULHUVX, AVMULHSUVV, AVMULHSUVX, + AVDIVUVV, AVDIVUVX, AVDIVVV, AVDIVVX, AVREMUVV, AVREMUVX, AVREMVV, AVREMVX, + AVWMULVV, AVWMULVX, AVWMULUVV, AVWMULUVX, AVWMULSUVV, AVWMULSUVX, AVNSRLWV, AVNSRLWX, AVNSRAWV, AVNSRAWX, +- AVMACCVV, AVMACCVX, AVNMSACVV, AVNMSACVX, AVMADDVV, AVMADDVX, AVNMSUBVV, AVNMSUBVX, +- AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX, + AVSADDUVV, AVSADDUVX, AVSADDUVI, AVSADDVV, AVSADDVX, AVSADDVI, AVSSUBUVV, AVSSUBUVX, AVSSUBVV, AVSSUBVX, + AVAADDUVV, AVAADDUVX, AVAADDVV, AVAADDVX, AVASUBUVV, AVASUBUVX, AVASUBVV, AVASUBVX, + AVSMULVV, AVSMULVX, AVSSRLVV, AVSSRLVX, AVSSRLVI, AVSSRAVV, AVSSRAVX, AVSSRAVI, +@@ -3598,7 +3596,9 @@ func instructionsForProg(p *obj.Prog) []*instruction { + + case AVFMACCVV, AVFMACCVF, AVFNMACCVV, AVFNMACCVF, AVFMSACVV, AVFMSACVF, AVFNMSACVV, AVFNMSACVF, + AVFMADDVV, AVFMADDVF, AVFNMADDVV, AVFNMADDVF, AVFMSUBVV, AVFMSUBVF, AVFNMSUBVV, AVFNMSUBVF, +- AVFWMACCVV, AVFWMACCVF, AVFWNMACCVV, AVFWNMACCVF, AVFWMSACVV, AVFWMSACVF, AVFWNMSACVV, AVFWNMSACVF: ++ AVFWMACCVV, AVFWMACCVF, AVFWNMACCVV, AVFWNMACCVF, AVFWMSACVV, AVFWMSACVF, AVFWNMSACVV, AVFWNMSACVF, ++ AVMACCVV, AVMACCVX, AVNMSACVV, AVNMSACVX, AVMADDVV, 
AVMADDVX, AVNMSUBVV, AVNMSUBVX, ++ AVWMACCUVV, AVWMACCUVX, AVWMACCVV, AVWMACCVX, AVWMACCSUVV, AVWMACCSUVX, AVWMACCUSVX: + switch { + case ins.rs3 == obj.REG_NONE: + ins.funct7 |= 1 // unmasked +-- +2.39.5 + diff --git a/2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch b/2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch new file mode 100644 index 0000000..b47b83a --- /dev/null +++ b/2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch @@ -0,0 +1,663 @@ +From eba33bc3c20003dd122cf703a93e6f66f5c85cf2 Mon Sep 17 00:00:00 2001 +From: Michael Munday +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 105/119] cmd/compile: optimise float <-> int register moves on + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Use the FMV* instructions to move values between the floating point and +integer register files. + +Note: I'm unsure why there is a slowdown in the Float32bits benchmark, +I've checked and an FMVXS instruction is being used as expected. There +are multiple loads and other instructions in the main loop. + +goos: linux +goarch: riscv64 +pkg: math +cpu: Spacemit(R) X60 + │ fmv-before.txt │ fmv-after.txt │ + │ sec/op │ sec/op vs base │ +Acos 122.7n ± 0% 122.7n ± 0% ~ (p=1.000 n=10) +Acosh 197.2n ± 0% 191.5n ± 0% -2.89% (p=0.000 n=10) +Asin 122.7n ± 0% 122.7n ± 0% ~ (p=0.474 n=10) +Asinh 231.0n ± 0% 224.1n ± 0% -2.99% (p=0.000 n=10) +Atan 91.39n ± 0% 91.41n ± 0% ~ (p=0.465 n=10) +Atanh 210.3n ± 0% 203.4n ± 0% -3.26% (p=0.000 n=10) +Atan2 149.6n ± 0% 149.6n ± 0% ~ (p=0.721 n=10) +Cbrt 176.5n ± 0% 165.9n ± 0% -6.01% (p=0.000 n=10) +Ceil 25.67n ± 0% 24.42n ± 0% -4.87% (p=0.000 n=10) +Copysign 3.756n ± 0% 3.756n ± 0% ~ (p=0.149 n=10) +Cos 95.15n ± 0% 95.15n ± 0% ~ (p=0.374 n=10) +Cosh 228.6n ± 0% 224.7n ± 0% -1.71% (p=0.000 n=10) +Erf 115.2n ± 0% 115.2n ± 0% ~ (p=0.474 n=10) +Erfc 116.4n ± 0% 116.4n ± 0% ~ (p=0.628 n=10) +Erfinv 133.3n ± 0% 133.3n ± 0% ~ (p=1.000 n=10) +Erfcinv 133.3n ± 0% 133.3n ± 0% ~ (p=1.000 n=10) +Exp 194.1n ± 0% 190.3n ± 0% -1.93% (p=0.000 n=10) +ExpGo 204.7n ± 0% 200.3n ± 0% -2.15% (p=0.000 n=10) +Expm1 137.7n ± 0% 135.2n ± 0% -1.82% (p=0.000 n=10) +Exp2 173.4n ± 0% 169.0n ± 0% -2.54% (p=0.000 n=10) +Exp2Go 182.8n ± 0% 178.4n ± 0% -2.41% (p=0.000 n=10) +Abs 3.756n ± 0% 3.756n ± 0% ~ (p=0.157 n=10) +Dim 12.52n ± 0% 12.52n ± 0% ~ (p=0.737 n=10) +Floor 25.67n ± 0% 24.42n ± 0% -4.87% (p=0.000 n=10) +Max 21.29n ± 0% 20.03n ± 0% -5.92% (p=0.000 n=10) +Min 21.28n ± 0% 20.04n ± 0% -5.85% (p=0.000 n=10) +Mod 344.9n ± 0% 319.2n ± 0% -7.45% (p=0.000 n=10) +Frexp 55.71n ± 0% 48.85n ± 0% -12.30% (p=0.000 n=10) +Gamma 165.9n ± 0% 167.8n ± 0% +1.15% (p=0.000 n=10) +Hypot 73.24n ± 0% 70.74n ± 0% -3.41% (p=0.000 n=10) +HypotGo 84.50n ± 0% 82.63n ± 0% -2.21% (p=0.000 n=10) +Ilogb 49.45n ± 0% 45.70n ± 0% -7.59% (p=0.000 n=10) +J0 556.5n ± 0% 544.0n ± 0% -2.25% (p=0.000 n=10) +J1 555.3n ± 0% 542.8n ± 0% -2.24% (p=0.000 n=10) +Jn 1.181µ ± 0% 1.156µ ± 0% -2.12% (p=0.000 n=10) +Ldexp 59.47n ± 0% 53.84n ± 0% -9.47% (p=0.000 n=10) +Lgamma 167.2n ± 0% 154.6n ± 0% -7.51% (p=0.000 n=10) +Log 160.9n ± 0% 154.6n ± 0% -3.92% (p=0.000 n=10) +Logb 49.45n ± 0% 45.70n ± 0% -7.58% (p=0.000 n=10) +Log1p 147.1n ± 0% 137.1n ± 0% -6.80% (p=0.000 n=10) +Log10 162.1n ± 1% 154.6n ± 0% -4.63% (p=0.000 n=10) +Log2 66.99n ± 0% 60.72n ± 0% -9.36% (p=0.000 n=10) +Modf 29.42n ± 0% 26.29n ± 0% -10.64% (p=0.000 n=10) +Nextafter32 41.95n ± 0% 37.88n ± 0% -9.70% (p=0.000 n=10) +Nextafter64 38.82n ± 0% 33.49n ± 0% -13.73% 
(p=0.000 n=10) +PowInt 252.3n ± 0% 237.3n ± 0% -5.95% (p=0.000 n=10) +PowFrac 615.5n ± 0% 589.7n ± 0% -4.19% (p=0.000 n=10) +Pow10Pos 10.64n ± 0% 10.64n ± 0% ~ (p=1.000 n=10) +Pow10Neg 24.42n ± 0% 15.02n ± 0% -38.49% (p=0.000 n=10) +Round 21.91n ± 0% 18.16n ± 0% -17.12% (p=0.000 n=10) +RoundToEven 24.42n ± 0% 21.29n ± 0% -12.84% (p=0.000 n=10) +Remainder 308.0n ± 0% 291.2n ± 0% -5.44% (p=0.000 n=10) +Signbit 10.02n ± 0% 10.02n ± 0% ~ (p=1.000 n=10) +Sin 102.7n ± 0% 102.7n ± 0% ~ (p=0.211 n=10) +Sincos 124.0n ± 1% 123.3n ± 0% -0.56% (p=0.002 n=10) +Sinh 239.1n ± 0% 234.7n ± 0% -1.84% (p=0.000 n=10) +SqrtIndirect 2.504n ± 0% 2.504n ± 0% ~ (p=0.303 n=10) +SqrtLatency 15.03n ± 0% 15.02n ± 0% ~ (p=0.598 n=10) +SqrtIndirectLatency 15.02n ± 0% 15.02n ± 0% ~ (p=0.907 n=10) +SqrtGoLatency 165.3n ± 0% 157.2n ± 0% -4.90% (p=0.000 n=10) +SqrtPrime 3.801µ ± 0% 3.802µ ± 0% ~ (p=1.000 n=10) +Tan 125.2n ± 0% 125.2n ± 0% ~ (p=0.458 n=10) +Tanh 244.2n ± 0% 239.9n ± 0% -1.76% (p=0.000 n=10) +Trunc 25.67n ± 0% 24.42n ± 0% -4.87% (p=0.000 n=10) +Y0 550.2n ± 0% 538.1n ± 0% -2.21% (p=0.000 n=10) +Y1 552.8n ± 0% 540.6n ± 0% -2.21% (p=0.000 n=10) +Yn 1.168µ ± 0% 1.143µ ± 0% -2.14% (p=0.000 n=10) +Float64bits 8.139n ± 0% 4.385n ± 0% -46.13% (p=0.000 n=10) +Float64frombits 7.512n ± 0% 3.759n ± 0% -49.96% (p=0.000 n=10) +Float32bits 8.138n ± 0% 9.393n ± 0% +15.42% (p=0.000 n=10) +Float32frombits 7.513n ± 0% 3.757n ± 0% -49.98% (p=0.000 n=10) +FMA 3.756n ± 0% 3.756n ± 0% ~ (p=0.246 n=10) +geomean 77.43n 72.42n -6.47% + +Change-Id: I8dac69b1d17cb3d2af78d1c844d2b5d80000d667 +Reviewed-on: https://go-review.googlesource.com/c/go/+/599235 +Reviewed-by: Keith Randall +Auto-Submit: Michael Munday +Reviewed-by: Dmitri Shuralyov +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/riscv64/ssa.go | 2 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 23 ++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 6 +- + src/cmd/compile/internal/ssa/opGen.go | 28 ++ + .../compile/internal/ssa/rewriteRISCV64.go | 312 ++++++++++++++++++ + test/codegen/math.go | 4 + + 6 files changed, 372 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 759d8d7cf4..4aac891e13 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -416,7 +416,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case ssa.OpRISCV64FSQRTS, ssa.OpRISCV64FNEGS, ssa.OpRISCV64FABSD, ssa.OpRISCV64FSQRTD, ssa.OpRISCV64FNEGD, +- ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVDX, ++ ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVXS, ssa.OpRISCV64FMVDX, ssa.OpRISCV64FMVXD, + ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS, + ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD, + ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW: +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index f0d2d74b7b..9e39a58197 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -275,6 +275,11 @@ + (base.Op != OpSB || !config.ctxt.Flag_dynlink) => + (MOV(B|BU|H|HU|W|WU|D)load [off1+off2] {mergeSym(sym1,sym2)} base mem) + ++(FMOV(W|D)load [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) && ++ is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ++ (base.Op != OpSB || 
!config.ctxt.Flag_dynlink) => ++ (FMOV(W|D)load [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ + (MOV(B|H|W|D)store [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && + is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && + (base.Op != OpSB || !config.ctxt.Flag_dynlink) => +@@ -285,15 +290,26 @@ + (base.Op != OpSB || !config.ctxt.Flag_dynlink) => + (MOV(B|H|W|D)storezero [off1+off2] {mergeSym(sym1,sym2)} base mem) + ++(FMOV(W|D)store [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) && ++ is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && ++ (base.Op != OpSB || !config.ctxt.Flag_dynlink) => ++ (FMOV(W|D)store [off1+off2] {mergeSym(sym1,sym2)} base val mem) ++ + (MOV(B|BU|H|HU|W|WU|D)load [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOV(B|BU|H|HU|W|WU|D)load [off1+int32(off2)] {sym} base mem) + ++(FMOV(W|D)load [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => ++ (FMOV(W|D)load [off1+int32(off2)] {sym} base mem) ++ + (MOV(B|H|W|D)store [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => + (MOV(B|H|W|D)store [off1+int32(off2)] {sym} base val mem) + + (MOV(B|H|W|D)storezero [off1] {sym} (ADDI [off2] base) mem) && is32Bit(int64(off1)+off2) => + (MOV(B|H|W|D)storezero [off1+int32(off2)] {sym} base mem) + ++(FMOV(W|D)store [off1] {sym} (ADDI [off2] base) val mem) && is32Bit(int64(off1)+off2) => ++ (FMOV(W|D)store [off1+int32(off2)] {sym} base val mem) ++ + // Similarly, fold ADDI into MOVaddr to avoid confusing live variable analysis + // with OffPtr -> ADDI. + (ADDI [c] (MOVaddr [d] {s} x)) && is32Bit(c+int64(d)) => (MOVaddr [int32(c)+d] {s} x) +@@ -675,6 +691,13 @@ + (MOVHUreg x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHUload [off] {sym} ptr mem) + (MOVWUreg x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWUload [off] {sym} ptr mem) + ++// Replace load from same location as preceding store with copy. ++(MOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (FMVXD x) ++(FMOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (FMVDX x) ++(MOVWload [off] {sym} ptr1 (FMOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (FMVXS x) ++(MOVWUload [off] {sym} ptr1 (FMOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (MOVWUreg (FMVXS x)) ++(FMOVWload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) && isSamePtr(ptr1, ptr2) => (FMVSX x) ++ + // If a register move has only 1 use, just use the same register without emitting instruction + // MOVnop does not emit an instruction, only for ensuring the type. 
+ (MOVDreg x) && x.Uses == 1 => (MOVDnop x) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index 7f3c4a2bf4..a69b347a84 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -440,7 +440,8 @@ func init() { + {name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS", commutative: true, typ: "Float32"}, // -(arg0 * arg1) - arg2 + {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS", typ: "Float32"}, // sqrt(arg0) + {name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS", typ: "Float32"}, // -arg0 +- {name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float ++ {name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float32 ++ {name: "FMVXS", argLength: 1, reg: fpgp, asm: "FMVXS", typ: "Int32"}, // reinterpret arg0 as int32, sign extended to 64 bits + {name: "FCVTSW", argLength: 1, reg: gpfp, asm: "FCVTSW", typ: "Float32"}, // float32(low 32 bits of arg0) + {name: "FCVTSL", argLength: 1, reg: gpfp, asm: "FCVTSL", typ: "Float32"}, // float32(arg0) + {name: "FCVTWS", argLength: 1, reg: fpgp, asm: "FCVTWS", typ: "Int32"}, // int32(arg0) +@@ -467,7 +468,8 @@ func init() { + {name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD", typ: "Float64"}, // -arg0 + {name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD", typ: "Float64"}, // abs(arg0) + {name: "FSGNJD", argLength: 2, reg: fp21, asm: "FSGNJD", typ: "Float64"}, // copy sign of arg1 to arg0 +- {name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"}, // reinterpret arg0 as float ++ {name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"}, // reinterpret arg0 as float64 ++ {name: "FMVXD", argLength: 1, reg: fpgp, asm: "FMVXD", typ: "Int64"}, // reinterpret arg0 as int64 + {name: "FCVTDW", argLength: 1, reg: gpfp, asm: "FCVTDW", typ: "Float64"}, // float64(low 32 bits of arg0) + {name: "FCVTDL", argLength: 1, reg: gpfp, asm: "FCVTDL", typ: "Float64"}, // float64(arg0) + {name: "FCVTWD", argLength: 1, reg: fpgp, asm: "FCVTWD", typ: "Int32"}, // int32(arg0) +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index a02afc2da0..5fda7ffc2f 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2454,6 +2454,7 @@ const ( + OpRISCV64FSQRTS + OpRISCV64FNEGS + OpRISCV64FMVSX ++ OpRISCV64FMVXS + OpRISCV64FCVTSW + OpRISCV64FCVTSL + OpRISCV64FCVTWS +@@ -2479,6 +2480,7 @@ const ( + OpRISCV64FABSD + OpRISCV64FSGNJD + OpRISCV64FMVDX ++ OpRISCV64FMVXD + OpRISCV64FCVTDW + OpRISCV64FCVTDL + OpRISCV64FCVTWD +@@ -32948,6 +32950,19 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "FMVXS", ++ argLen: 1, ++ asm: riscv.AFMVXS, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "FCVTSW", + argLen: 1, +@@ -33308,6 +33323,19 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "FMVXD", ++ argLen: 1, ++ asm: riscv.AFMVXD, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 ++ }, 
++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "FCVTDW", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 966199c450..a449ce01c6 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -471,6 +471,14 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpRISCV64FMADDD(v) + case OpRISCV64FMADDS: + return rewriteValueRISCV64_OpRISCV64FMADDS(v) ++ case OpRISCV64FMOVDload: ++ return rewriteValueRISCV64_OpRISCV64FMOVDload(v) ++ case OpRISCV64FMOVDstore: ++ return rewriteValueRISCV64_OpRISCV64FMOVDstore(v) ++ case OpRISCV64FMOVWload: ++ return rewriteValueRISCV64_OpRISCV64FMOVWload(v) ++ case OpRISCV64FMOVWstore: ++ return rewriteValueRISCV64_OpRISCV64FMOVWstore(v) + case OpRISCV64FMSUBD: + return rewriteValueRISCV64_OpRISCV64FMSUBD(v) + case OpRISCV64FMSUBS: +@@ -3686,6 +3694,250 @@ func rewriteValueRISCV64_OpRISCV64FMADDS(v *Value) bool { + } + return false + } ++func rewriteValueRISCV64_OpRISCV64FMOVDload(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config ++ // match: (FMOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (FMOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym1 := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64MOVaddr { ++ break ++ } ++ off2 := auxIntToInt32(v_0.AuxInt) ++ sym2 := auxToSym(v_0.Aux) ++ base := v_0.Args[0] ++ mem := v_1 ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVDload) ++ v.AuxInt = int32ToAuxInt(off1 + off2) ++ v.Aux = symToAux(mergeSym(sym1, sym2)) ++ v.AddArg2(base, mem) ++ return true ++ } ++ // match: (FMOVDload [off1] {sym} (ADDI [off2] base) mem) ++ // cond: is32Bit(int64(off1)+off2) ++ // result: (FMOVDload [off1+int32(off2)] {sym} base mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64ADDI { ++ break ++ } ++ off2 := auxIntToInt64(v_0.AuxInt) ++ base := v_0.Args[0] ++ mem := v_1 ++ if !(is32Bit(int64(off1) + off2)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVDload) ++ v.AuxInt = int32ToAuxInt(off1 + int32(off2)) ++ v.Aux = symToAux(sym) ++ v.AddArg2(base, mem) ++ return true ++ } ++ // match: (FMOVDload [off] {sym} ptr1 (MOVDstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (FMVDX x) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64MOVDstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64FMVDX) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64FMOVDstore(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config ++ // match: (FMOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (FMOVDstore [off1+off2] 
{mergeSym(sym1,sym2)} base val mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym1 := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64MOVaddr { ++ break ++ } ++ off2 := auxIntToInt32(v_0.AuxInt) ++ sym2 := auxToSym(v_0.Aux) ++ base := v_0.Args[0] ++ val := v_1 ++ mem := v_2 ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVDstore) ++ v.AuxInt = int32ToAuxInt(off1 + off2) ++ v.Aux = symToAux(mergeSym(sym1, sym2)) ++ v.AddArg3(base, val, mem) ++ return true ++ } ++ // match: (FMOVDstore [off1] {sym} (ADDI [off2] base) val mem) ++ // cond: is32Bit(int64(off1)+off2) ++ // result: (FMOVDstore [off1+int32(off2)] {sym} base val mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64ADDI { ++ break ++ } ++ off2 := auxIntToInt64(v_0.AuxInt) ++ base := v_0.Args[0] ++ val := v_1 ++ mem := v_2 ++ if !(is32Bit(int64(off1) + off2)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVDstore) ++ v.AuxInt = int32ToAuxInt(off1 + int32(off2)) ++ v.Aux = symToAux(sym) ++ v.AddArg3(base, val, mem) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64FMOVWload(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config ++ // match: (FMOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (FMOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym1 := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64MOVaddr { ++ break ++ } ++ off2 := auxIntToInt32(v_0.AuxInt) ++ sym2 := auxToSym(v_0.Aux) ++ base := v_0.Args[0] ++ mem := v_1 ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVWload) ++ v.AuxInt = int32ToAuxInt(off1 + off2) ++ v.Aux = symToAux(mergeSym(sym1, sym2)) ++ v.AddArg2(base, mem) ++ return true ++ } ++ // match: (FMOVWload [off1] {sym} (ADDI [off2] base) mem) ++ // cond: is32Bit(int64(off1)+off2) ++ // result: (FMOVWload [off1+int32(off2)] {sym} base mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64ADDI { ++ break ++ } ++ off2 := auxIntToInt64(v_0.AuxInt) ++ base := v_0.Args[0] ++ mem := v_1 ++ if !(is32Bit(int64(off1) + off2)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVWload) ++ v.AuxInt = int32ToAuxInt(off1 + int32(off2)) ++ v.Aux = symToAux(sym) ++ v.AddArg2(base, mem) ++ return true ++ } ++ // match: (FMOVWload [off] {sym} ptr1 (MOVWstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (FMVSX x) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64MOVWstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64FMVSX) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} ++func rewriteValueRISCV64_OpRISCV64FMOVWstore(v *Value) bool { ++ v_2 := v.Args[2] ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ b := v.Block ++ config := b.Func.Config ++ // match: (FMOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem) ++ // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) ++ // result: (FMOVWstore [off1+off2] 
{mergeSym(sym1,sym2)} base val mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym1 := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64MOVaddr { ++ break ++ } ++ off2 := auxIntToInt32(v_0.AuxInt) ++ sym2 := auxToSym(v_0.Aux) ++ base := v_0.Args[0] ++ val := v_1 ++ mem := v_2 ++ if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVWstore) ++ v.AuxInt = int32ToAuxInt(off1 + off2) ++ v.Aux = symToAux(mergeSym(sym1, sym2)) ++ v.AddArg3(base, val, mem) ++ return true ++ } ++ // match: (FMOVWstore [off1] {sym} (ADDI [off2] base) val mem) ++ // cond: is32Bit(int64(off1)+off2) ++ // result: (FMOVWstore [off1+int32(off2)] {sym} base val mem) ++ for { ++ off1 := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ if v_0.Op != OpRISCV64ADDI { ++ break ++ } ++ off2 := auxIntToInt64(v_0.AuxInt) ++ base := v_0.Args[0] ++ val := v_1 ++ mem := v_2 ++ if !(is32Bit(int64(off1) + off2)) { ++ break ++ } ++ v.reset(OpRISCV64FMOVWstore) ++ v.AuxInt = int32ToAuxInt(off1 + int32(off2)) ++ v.Aux = symToAux(sym) ++ v.AddArg3(base, val, mem) ++ return true ++ } ++ return false ++} + func rewriteValueRISCV64_OpRISCV64FMSUBD(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] +@@ -4739,6 +4991,25 @@ func rewriteValueRISCV64_OpRISCV64MOVDload(v *Value) bool { + v.AddArg2(base, mem) + return true + } ++ // match: (MOVDload [off] {sym} ptr1 (FMOVDstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (FMVXD x) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64FMOVDstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64FMVXD) ++ v.AddArg(x) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64MOVDnop(v *Value) bool { +@@ -5420,6 +5691,7 @@ func rewriteValueRISCV64_OpRISCV64MOVWUload(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + config := b.Func.Config ++ typ := &b.Func.Config.Types + // match: (MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem) + // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_dynlink) + // result: (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem) +@@ -5463,6 +5735,27 @@ func rewriteValueRISCV64_OpRISCV64MOVWUload(v *Value) bool { + v.AddArg2(base, mem) + return true + } ++ // match: (MOVWUload [off] {sym} ptr1 (FMOVWstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (MOVWUreg (FMVXS x)) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64FMOVWstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64MOVWUreg) ++ v0 := b.NewValue0(v_1.Pos, OpRISCV64FMVXS, typ.Int32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64MOVWUreg(v *Value) bool { +@@ -5653,6 +5946,25 @@ func rewriteValueRISCV64_OpRISCV64MOVWload(v *Value) bool { + v.AddArg2(base, mem) + return true + } ++ // match: (MOVWload [off] {sym} ptr1 (FMOVWstore [off] {sym} ptr2 x _)) ++ // cond: isSamePtr(ptr1, ptr2) ++ // result: (FMVXS x) ++ for { ++ off := auxIntToInt32(v.AuxInt) ++ sym := auxToSym(v.Aux) ++ ptr1 := v_0 ++ if v_1.Op != OpRISCV64FMOVWstore 
|| auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { ++ break ++ } ++ x := v_1.Args[1] ++ ptr2 := v_1.Args[0] ++ if !(isSamePtr(ptr1, ptr2)) { ++ break ++ } ++ v.reset(OpRISCV64FMVXS) ++ v.AddArg(x) ++ return true ++ } + return false + } + func rewriteValueRISCV64_OpRISCV64MOVWreg(v *Value) bool { +diff --git a/test/codegen/math.go b/test/codegen/math.go +index 331ebbe609..3a2fac3e2e 100644 +--- a/test/codegen/math.go ++++ b/test/codegen/math.go +@@ -158,6 +158,7 @@ func fromFloat64(f64 float64) uint64 { + // arm64:"FMOVD\tF.*, R.*" + // ppc64x:"MFVSRD" + // mips64/hardfloat:"MOVV\tF.*, R.*" ++ // riscv64:"FMVXD" + return math.Float64bits(f64+1) + 1 + } + +@@ -165,6 +166,7 @@ func fromFloat32(f32 float32) uint32 { + // amd64:"MOVL\tX.*, [^X].*" + // arm64:"FMOVS\tF.*, R.*" + // mips64/hardfloat:"MOVW\tF.*, R.*" ++ // riscv64:"FMVXW" + return math.Float32bits(f32+1) + 1 + } + +@@ -173,6 +175,7 @@ func toFloat64(u64 uint64) float64 { + // arm64:"FMOVD\tR.*, F.*" + // ppc64x:"MTVSRD" + // mips64/hardfloat:"MOVV\tR.*, F.*" ++ // riscv64:"FMVDX" + return math.Float64frombits(u64+1) + 1 + } + +@@ -180,6 +183,7 @@ func toFloat32(u32 uint32) float32 { + // amd64:"MOVL\t[^X].*, X.*" + // arm64:"FMOVS\tR.*, F.*" + // mips64/hardfloat:"MOVW\tR.*, F.*" ++ // riscv64:"FMVWX" + return math.Float32frombits(u32+1) + 1 + } + +-- +2.39.5 + diff --git a/2106-internal-bytealg-vector-implementation-of-compare-fo.patch b/2106-internal-bytealg-vector-implementation-of-compare-fo.patch new file mode 100644 index 0000000..a26f057 --- /dev/null +++ b/2106-internal-bytealg-vector-implementation-of-compare-fo.patch @@ -0,0 +1,163 @@ +From c06ec43f26c1a46351bb2320dacac177db1e0d9c Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:40 +0800 +Subject: [PATCH 106/119] internal/bytealg: vector implementation of compare + for riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Provide a vector implementation of compare for riscv64, which is used +when compiled with the rva23u64 profile, or when vector is detected +to be available. Inputs that are 8 byte aligned will still be handled +via a the non-vector code if the length is less than or equal to 128 +bytes. 
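+
+The resulting dispatch is roughly (an illustrative sketch of the
+assembly below, not additional code):
+
+len < 16                     -> small scalar path
+no vector support            -> existing scalar path
+unaligned input or len > 128 -> vector loop (vle8.v/vmsne.vv/vfirst.m)
+otherwise                    -> existing 8 byte aligned scalar path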
+ +On a Banana Pi F3, with GORISCV64=rva23u64: + + │ compare.1 │ compare.2 │ + │ sec/op │ sec/op vs base │ +BytesCompare/1-8 24.36n ± 0% 24.15n ± 0% -0.84% (p=0.007 n=10) +BytesCompare/2-8 26.75n ± 0% 26.97n ± 0% +0.82% (p=0.000 n=10) +BytesCompare/4-8 27.63n ± 0% 27.80n ± 0% +0.60% (p=0.001 n=10) +BytesCompare/8-8 35.91n ± 0% 35.19n ± 0% -2.01% (p=0.000 n=10) +BytesCompare/16-8 53.22n ± 0% 24.04n ± 1% -54.82% (p=0.000 n=10) +BytesCompare/32-8 25.12n ± 0% 26.09n ± 1% +3.86% (p=0.000 n=10) +BytesCompare/64-8 32.52n ± 0% 33.43n ± 1% +2.78% (p=0.000 n=10) +BytesCompare/128-8 46.59n ± 0% 48.22n ± 1% +3.50% (p=0.000 n=10) +BytesCompare/256-8 74.25n ± 0% 50.18n ± 0% -32.42% (p=0.000 n=10) +BytesCompare/512-8 129.85n ± 0% 83.12n ± 0% -35.98% (p=0.000 n=10) +BytesCompare/1024-8 244.6n ± 0% 148.0n ± 1% -39.49% (p=0.000 n=10) +BytesCompare/2048-8 465.9n ± 0% 282.8n ± 2% -39.30% (p=0.000 n=10) +CompareBytesEqual-8 51.96n ± 0% 52.90n ± 1% +1.80% (p=0.000 n=10) +CompareBytesToNil-8 15.77n ± 1% 15.68n ± 0% -0.57% (p=0.000 n=10) +CompareBytesEmpty-8 14.21n ± 1% 14.20n ± 1% ~ (p=1.000 n=10) +CompareBytesIdentical-8 14.20n ± 1% 15.07n ± 1% +6.20% (p=0.000 n=10) +CompareBytesSameLength-8 31.38n ± 0% 30.52n ± 0% -2.74% (p=0.000 n=10) +CompareBytesDifferentLength-8 31.38n ± 0% 30.53n ± 0% -2.71% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=1-8 2401.0µ ± 0% 437.6µ ± 0% -81.77% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=2-8 2376.8µ ± 0% 437.4µ ± 0% -81.60% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=3-8 2384.1µ ± 0% 437.5µ ± 0% -81.65% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=4-8 2377.7µ ± 0% 437.4µ ± 0% -81.60% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=5-8 2366.3µ ± 0% 437.5µ ± 0% -81.51% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=6-8 2357.3µ ± 0% 437.3µ ± 0% -81.45% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=7-8 2385.3µ ± 0% 437.6µ ± 0% -81.65% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=0-8 447.2µ ± 0% 464.8µ ± 0% +3.94% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=1-8 447.7µ ± 0% 453.1µ ± 0% +1.20% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=2-8 447.9µ ± 0% 453.0µ ± 0% +1.15% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=3-8 448.0µ ± 0% 452.5µ ± 0% +1.02% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=4-8 448.0µ ± 0% 452.1µ ± 0% +0.92% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=5-8 447.8µ ± 0% 452.8µ ± 0% +1.12% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=6-8 447.9µ ± 0% 452.4µ ± 0% +1.01% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=7-8 447.9µ ± 0% 452.8µ ± 0% +1.09% (p=0.000 n=10) +CompareBytesBig-8 441.2µ ± 0% 461.8µ ± 0% +4.66% (p=0.000 n=10) +CompareBytesBigIdentical-8 13.81n ± 0% 13.80n ± 0% ~ (p=0.519 n=10) +geomean 3.980µ 2.651µ -33.40% + + │ compare.1 │ compare.2 │ + │ B/s │ B/s vs base │ +CompareBytesBigUnaligned/offset=1-8 416.5Mi ± 0% 2285.1Mi ± 0% +448.64% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=2-8 420.7Mi ± 0% 2286.4Mi ± 0% +443.43% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=3-8 419.5Mi ± 0% 2285.9Mi ± 0% +444.97% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=4-8 420.6Mi ± 0% 2286.1Mi ± 0% +443.57% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=5-8 422.6Mi ± 0% 2285.7Mi ± 0% +440.86% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=6-8 424.2Mi ± 0% 2286.8Mi ± 0% +439.07% (p=0.000 n=10) +CompareBytesBigUnaligned/offset=7-8 419.2Mi ± 0% 2285.2Mi ± 0% +445.07% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=0-8 2.184Gi ± 0% 2.101Gi ± 0% -3.79% (p=0.000 n=10) 
+CompareBytesBigBothUnaligned/offset=1-8 2.181Gi ± 0% 2.155Gi ± 0% -1.18% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=2-8 2.180Gi ± 0% 2.156Gi ± 0% -1.13% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=3-8 2.180Gi ± 0% 2.158Gi ± 0% -1.01% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=4-8 2.180Gi ± 0% 2.160Gi ± 0% -0.91% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=5-8 2.181Gi ± 0% 2.157Gi ± 0% -1.11% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=6-8 2.181Gi ± 0% 2.159Gi ± 0% -1.00% (p=0.000 n=10) +CompareBytesBigBothUnaligned/offset=7-8 2.180Gi ± 0% 2.157Gi ± 0% -1.08% (p=0.000 n=10) +CompareBytesBig-8 2.213Gi ± 0% 2.115Gi ± 0% -4.45% (p=0.000 n=10) +CompareBytesBigIdentical-8 69.06Ti ± 0% 69.09Ti ± 0% ~ (p=0.315 n=10) +geomean 2.022Gi 4.022Gi +98.95% + +Change-Id: Id3012faf8d353eb1be0e1fb01b78ac43fa4c7e8b +Reviewed-on: https://go-review.googlesource.com/c/go/+/646737 +Reviewed-by: Mark Ryan +Reviewed-by: Mark Freeman +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Dmitri Shuralyov +Reviewed-by: Meng Zhuo +--- + src/internal/bytealg/compare_riscv64.s | 47 ++++++++++++++++++++++++-- + 1 file changed, 44 insertions(+), 3 deletions(-) + +diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s +index 6388fcd209..3b1523dfbf 100644 +--- a/src/internal/bytealg/compare_riscv64.s ++++ b/src/internal/bytealg/compare_riscv64.s +@@ -2,6 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + ++#include "asm_riscv64.h" + #include "go_asm.h" + #include "textflag.h" + +@@ -35,6 +36,46 @@ TEXT compare<>(SB),NOSPLIT|NOFRAME,$0 + MIN X11, X13, X5 + BEQZ X5, cmp_len + ++ MOV $16, X6 ++ BLT X5, X6, check8_unaligned ++ ++#ifndef hasV ++ MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X6 ++ BEQZ X6, compare_scalar ++#endif ++ ++ // Use vector if not 8 byte aligned. ++ OR X10, X12, X6 ++ AND $7, X6 ++ BNEZ X6, vector_loop ++ ++ // Use scalar if 8 byte aligned and <= 128 bytes. ++ SUB $128, X5, X6 ++ BLEZ X6, compare_scalar_aligned ++ ++ PCALIGN $16 ++vector_loop: ++ VSETVLI X5, E8, M8, TA, MA, X6 ++ VLE8V (X10), V8 ++ VLE8V (X12), V16 ++ VMSNEVV V8, V16, V0 ++ VFIRSTM V0, X7 ++ BGEZ X7, vector_not_eq ++ ADD X6, X10 ++ ADD X6, X12 ++ SUB X6, X5 ++ BNEZ X5, vector_loop ++ JMP cmp_len ++ ++vector_not_eq: ++ // Load first differing bytes in X8/X9. ++ ADD X7, X10 ++ ADD X7, X12 ++ MOVBU (X10), X8 ++ MOVBU (X12), X9 ++ JMP cmp ++ ++compare_scalar: + MOV $32, X6 + BLT X5, X6, check8_unaligned + +@@ -57,9 +98,9 @@ align: + ADD $1, X12 + BNEZ X7, align + +-check32: +- // X6 contains $32 +- BLT X5, X6, compare16 ++compare_scalar_aligned: ++ MOV $32, X6 ++ BLT X5, X6, check16 + compare32: + MOV 0(X10), X15 + MOV 0(X12), X16 +-- +2.39.5 + diff --git a/2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch b/2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch new file mode 100644 index 0000000..6104747 --- /dev/null +++ b/2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch @@ -0,0 +1,101 @@ +From c395e1476dbd60c4045c1061e5ef4d0283b31603 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:38:41 +0800 +Subject: [PATCH 107/119] cmd/compile/internal/ssagen: improve intrinsic + architecture handling + +The architecture handling code for intrinsics is more complex than +it needs to be. sys.Archs is already an array of *sys.Arch and the +existing InFamily function can be used instead of a reimplementation. 
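+
+For example (mirroring the new test below):
+
+ArchPPC64LE.InFamily(PPC64)          // true
+ArchPPC64LE.InFamily(AMD64, RISCV64) // false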
+ +Add some test coverage for sys.Arch.InFamily while here. + +Change-Id: Ia764f211114fea65424c09a421c5ccb02b7187b0 +Reviewed-on: https://go-review.googlesource.com/c/go/+/605476 +Reviewed-by: Carlos Amedee +Reviewed-by: Keith Randall +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/compile/internal/ssagen/ssa.go | 16 ++++------------ + src/cmd/internal/sys/arch_test.go | 24 ++++++++++++++++++++++++ + 2 files changed, 28 insertions(+), 12 deletions(-) + create mode 100644 src/cmd/internal/sys/arch_test.go + +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index a0a3470ea2..cbf9587a56 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -4047,12 +4047,10 @@ type intrinsicKey struct { + func InitTables() { + intrinsics = map[intrinsicKey]intrinsicBuilder{} + +- var all []*sys.Arch + var p4 []*sys.Arch + var p8 []*sys.Arch + var lwatomics []*sys.Arch +- for _, a := range &sys.Archs { +- all = append(all, a) ++ for _, a := range sys.Archs { + if a.PtrSize == 4 { + p4 = append(p4, a) + } else { +@@ -4062,6 +4060,7 @@ func InitTables() { + lwatomics = append(lwatomics, a) + } + } ++ all := sys.Archs[:] + + // add adds the intrinsic b for pkg.fn for the given list of architectures. + add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { +@@ -4071,15 +4070,8 @@ func InitTables() { + } + // addF does the same as add but operates on architecture families. + addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { +- m := 0 +- for _, f := range archFamilies { +- if f >= 32 { +- panic("too many architecture families") +- } +- m |= 1 << uint(f) +- } +- for _, a := range all { +- if m>>uint(a.Family)&1 != 0 { ++ for _, a := range sys.Archs { ++ if a.InFamily(archFamilies...) { + intrinsics[intrinsicKey{a, pkg, fn}] = b + } + } +diff --git a/src/cmd/internal/sys/arch_test.go b/src/cmd/internal/sys/arch_test.go +new file mode 100644 +index 0000000000..011d0923d5 +--- /dev/null ++++ b/src/cmd/internal/sys/arch_test.go +@@ -0,0 +1,24 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++package sys ++ ++import ( ++ "testing" ++) ++ ++func TestArchInFamily(t *testing.T) { ++ if got, want := ArchPPC64LE.InFamily(AMD64), false; got != want { ++ t.Errorf("Got ArchPPC64LE.InFamily(AMD64) = %v, want %v", got, want) ++ } ++ if got, want := ArchPPC64LE.InFamily(PPC64), true; got != want { ++ t.Errorf("Got ArchPPC64LE.InFamily(PPC64) = %v, want %v", got, want) ++ } ++ if got, want := ArchPPC64LE.InFamily(AMD64, RISCV64), false; got != want { ++ t.Errorf("Got ArchPPC64LE.InFamily(AMD64, RISCV64) = %v, want %v", got, want) ++ } ++ if got, want := ArchPPC64LE.InFamily(AMD64, PPC64), true; got != want { ++ t.Errorf("Got ArchPPC64LE.InFamily(AMD64, PPC64) = %v, want %v", got, want) ++ } ++} +-- +2.39.5 + diff --git a/2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch b/2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch new file mode 100644 index 0000000..4128118 --- /dev/null +++ b/2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch @@ -0,0 +1,2066 @@ +From c6eff25852170f02d0526aee69ee0110ab5f9f9e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:42:51 +0800 +Subject: [PATCH 108/119] cmd/compile/internal/ssagen: factor out intrinsics + code + +The intrinsic handling code is a good thousand lines in the fairly +large ssa.go file. This code is already reasonably self-contained - factor +it out into a separate file so that future changes are easier to manage +(and it becomes easier to add/change intrinsics for an architecture). + +Change-Id: I3c18d3d1bb6332f1817d902150e736373bf1ac81 +Reviewed-on: https://go-review.googlesource.com/c/go/+/605477 +Reviewed-by: Carlos Amedee +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +Reviewed-by: Cherry Mui +--- + src/cmd/compile/internal/ssagen/intrinsics.go | 1047 +++++++++++++++++ + src/cmd/compile/internal/ssagen/ssa.go | 969 +-------------- + 2 files changed, 1056 insertions(+), 960 deletions(-) + create mode 100644 src/cmd/compile/internal/ssagen/intrinsics.go + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +new file mode 100644 +index 0000000000..59eb1869bb +--- /dev/null ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -0,0 +1,1047 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++package ssagen ++ ++import ( ++ "fmt" ++ "internal/buildcfg" ++ ++ "cmd/compile/internal/base" ++ "cmd/compile/internal/ir" ++ "cmd/compile/internal/ssa" ++ "cmd/compile/internal/types" ++ "cmd/internal/sys" ++) ++ ++var intrinsics map[intrinsicKey]intrinsicBuilder ++ ++// An intrinsicBuilder converts a call node n into an ssa value that ++// implements that call as an intrinsic. args is a list of arguments to the func. ++type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value ++ ++type intrinsicKey struct { ++ arch *sys.Arch ++ pkg string ++ fn string ++} ++ ++func initIntrinsics() { ++ intrinsics = map[intrinsicKey]intrinsicBuilder{} ++ ++ var p4 []*sys.Arch ++ var p8 []*sys.Arch ++ var lwatomics []*sys.Arch ++ for _, a := range sys.Archs { ++ if a.PtrSize == 4 { ++ p4 = append(p4, a) ++ } else { ++ p8 = append(p8, a) ++ } ++ if a.Family != sys.PPC64 { ++ lwatomics = append(lwatomics, a) ++ } ++ } ++ all := sys.Archs[:] ++ ++ // add adds the intrinsic b for pkg.fn for the given list of architectures. 
++ add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { ++ for _, a := range archs { ++ intrinsics[intrinsicKey{a, pkg, fn}] = b ++ } ++ } ++ // addF does the same as add but operates on architecture families. ++ addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { ++ for _, a := range sys.Archs { ++ if a.InFamily(archFamilies...) { ++ intrinsics[intrinsicKey{a, pkg, fn}] = b ++ } ++ } ++ } ++ // alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists. ++ alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) { ++ aliased := false ++ for _, a := range archs { ++ if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok { ++ intrinsics[intrinsicKey{a, pkg, fn}] = b ++ aliased = true ++ } ++ } ++ if !aliased { ++ panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn)) ++ } ++ } ++ ++ /******** runtime ********/ ++ if !base.Flag.Cfg.Instrumenting { ++ add("runtime", "slicebytetostringtmp", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ // Compiler frontend optimizations emit OBYTES2STRTMP nodes ++ // for the backend instead of slicebytetostringtmp calls ++ // when not instrumenting. ++ return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1]) ++ }, ++ all...) ++ } ++ addF("internal/runtime/math", "MulUintptr", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) ++ } ++ return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) ++ }, ++ sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.RISCV64, sys.ARM64) ++ add("runtime", "KeepAlive", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0]) ++ s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem()) ++ return nil ++ }, ++ all...) ++ add("runtime", "getclosureptr", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr) ++ }, ++ all...) ++ ++ add("runtime", "getcallerpc", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr) ++ }, ++ all...) ++ ++ add("runtime", "getcallersp", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem()) ++ }, ++ all...) ++ ++ addF("runtime", "publicationBarrier", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem()) ++ return nil ++ }, ++ sys.ARM64, sys.PPC64, sys.RISCV64) ++ ++ brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} ++ if buildcfg.GOPPC64 >= 10 { ++ // Use only on Power10 as the new byte reverse instructions that Power10 provide ++ // make it worthwhile as an intrinsic ++ brev_arch = append(brev_arch, sys.PPC64) ++ } ++ /******** internal/runtime/sys ********/ ++ addF("internal/runtime/sys", "Bswap32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) ++ }, ++ brev_arch...) 
++ addF("internal/runtime/sys", "Bswap64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) ++ }, ++ brev_arch...) ++ ++ /****** Prefetch ******/ ++ makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem()) ++ return nil ++ } ++ } ++ ++ // Make Prefetch intrinsics for supported platforms ++ // On the unsupported platforms stub function will be eliminated ++ addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache), ++ sys.AMD64, sys.ARM64, sys.PPC64) ++ addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed), ++ sys.AMD64, sys.ARM64, sys.PPC64) ++ ++ /******** internal/runtime/atomic ********/ ++ addF("internal/runtime/atomic", "Load", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Load8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Load64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "LoadAcq", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) ++ }, ++ sys.PPC64, sys.S390X) ++ addF("internal/runtime/atomic", "LoadAcq64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) ++ }, ++ sys.PPC64) ++ addF("internal/runtime/atomic", "Loadp", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ ++ 
addF("internal/runtime/atomic", "Store", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Store8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Store64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "StorepNoWB", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "StoreRel", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.PPC64, sys.S390X) ++ addF("internal/runtime/atomic", "StoreRel64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.PPC64) ++ ++ addF("internal/runtime/atomic", "Xchg", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Xchg64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ ++ type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) ++ ++ makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder { ++ ++ return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if buildcfg.GOARM64.LSE || buildcfg.GOARM64.KPAtomicOpt { ++ emit(s, n, args, op1, typ, needReturn) ++ } else { ++ // Target Atomic feature is identified by dynamic detection ++ addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb) ++ v := s.load(types.Types[types.TBOOL], addr) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ 
b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely ++ ++ // We have atomic instructions - use it directly. ++ s.startBlock(bTrue) ++ emit(s, n, args, op1, typ, needReturn) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Use original instruction sequence. ++ s.startBlock(bFalse) ++ emit(s, n, args, op0, typ, needReturn) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. ++ s.startBlock(bEnd) ++ } ++ if needReturn { ++ return s.variable(n, types.Types[typ]) ++ } else { ++ return nil ++ } ++ } ++ } ++ makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { ++ return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true) ++ } ++ makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { ++ return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false) ++ } ++ ++ atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { ++ v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ if needReturn { ++ s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) ++ } ++ } ++ addF("internal/runtime/atomic", "Xchg", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Xchg64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64), ++ sys.ARM64) ++ ++ addF("internal/runtime/atomic", "Xadd", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Xadd64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ ++ addF("internal/runtime/atomic", "Xadd", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Xadd64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64), ++ sys.ARM64) ++ ++ addF("internal/runtime/atomic", "Cas", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Cas64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := 
s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) ++ }, ++ sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "CasRel", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) ++ }, ++ sys.PPC64) ++ ++ atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { ++ v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) ++ s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) ++ if needReturn { ++ s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) ++ } ++ } ++ ++ addF("internal/runtime/atomic", "Cas", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Cas64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64), ++ sys.ARM64) ++ ++ // Old-style atomic logical operation API (all supported archs except arm64). ++ addF("internal/runtime/atomic", "And8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "And", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Or8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("internal/runtime/atomic", "Or", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem()) ++ return nil ++ }, ++ sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) ++ ++ // arm64 always uses the new-style atomic logical operations, for both the ++ // old and new style API. 
++ addF("internal/runtime/atomic", "And8", ++ makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Or8", ++ makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "And64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "And32", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "And", ++ makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Or64", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Or32", ++ makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ addF("internal/runtime/atomic", "Or", ++ makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), ++ sys.ARM64) ++ ++ // New-style atomic logical operations, which return the old memory value. ++ addF("internal/runtime/atomic", "And64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) ++ p0, p1 := s.split(v) ++ s.vars[memVar] = p1 ++ return p0 ++ }, ++ sys.AMD64) ++ addF("internal/runtime/atomic", "And32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) ++ p0, p1 := s.split(v) ++ s.vars[memVar] = p1 ++ return p0 ++ }, ++ sys.AMD64) ++ addF("internal/runtime/atomic", "Or64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) ++ p0, p1 := s.split(v) ++ s.vars[memVar] = p1 ++ return p0 ++ }, ++ sys.AMD64) ++ addF("internal/runtime/atomic", "Or32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) ++ p0, p1 := s.split(v) ++ s.vars[memVar] = p1 ++ return p0 ++ }, ++ sys.AMD64) ++ ++ // Aliases for atomic load operations ++ alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...) ++ alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...) ++ alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...) ++ alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...) ++ alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...) ++ alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...) ++ alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...) 
++ alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...) ++ alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) ++ alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed ++ alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) ++ alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed ++ ++ // Aliases for atomic store operations ++ alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...) ++ alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...) ++ alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...) ++ alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...) ++ alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...) ++ alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...) ++ alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) ++ alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed ++ alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) ++ alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed ++ ++ // Aliases for atomic swap operations ++ alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...) ++ alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...) ++ alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...) ++ alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...) ++ ++ // Aliases for atomic add operations ++ alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...) ++ alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...) ++ alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...) ++ alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...) ++ ++ // Aliases for atomic CAS operations ++ alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...) ++ alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...) ++ alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...) ++ alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...) ++ alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...) ++ alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...) ++ alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...) 
++ ++ // Aliases for atomic And/Or operations ++ alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64) ++ alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64) ++ ++ /******** math ********/ ++ addF("math", "sqrt", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) ++ addF("math", "Trunc", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) ++ addF("math", "Ceil", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) ++ addF("math", "Floor", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) ++ addF("math", "Round", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.PPC64, sys.S390X) ++ addF("math", "RoundToEven", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.S390X, sys.Wasm) ++ addF("math", "Abs", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0]) ++ }, ++ sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64) ++ addF("math", "Copysign", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1]) ++ }, ++ sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm) ++ addF("math", "FMA", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) ++ }, ++ sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X) ++ addF("math", "FMA", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if !s.config.UseFMA { ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ } ++ ++ if buildcfg.GOAMD64 >= 3 { ++ return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) ++ } ++ ++ v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely // >= haswell cpus are common ++ ++ // We have the intrinsic - use it directly. ++ s.startBlock(bTrue) ++ s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Call the pure Go version. ++ s.startBlock(bFalse) ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. 
++ s.startBlock(bEnd) ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ }, ++ sys.AMD64) ++ addF("math", "FMA", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if !s.config.UseFMA { ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ } ++ addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb) ++ v := s.load(types.Types[types.TBOOL], addr) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely ++ ++ // We have the intrinsic - use it directly. ++ s.startBlock(bTrue) ++ s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Call the pure Go version. ++ s.startBlock(bFalse) ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. ++ s.startBlock(bEnd) ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ }, ++ sys.ARM) ++ ++ makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if buildcfg.GOAMD64 >= 2 { ++ return s.newValue1(op, types.Types[types.TFLOAT64], args[0]) ++ } ++ ++ v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays ++ ++ // We have the intrinsic - use it directly. ++ s.startBlock(bTrue) ++ s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0]) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Call the pure Go version. ++ s.startBlock(bFalse) ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. 
++ s.startBlock(bEnd) ++ return s.variable(n, types.Types[types.TFLOAT64]) ++ } ++ } ++ addF("math", "RoundToEven", ++ makeRoundAMD64(ssa.OpRoundToEven), ++ sys.AMD64) ++ addF("math", "Floor", ++ makeRoundAMD64(ssa.OpFloor), ++ sys.AMD64) ++ addF("math", "Ceil", ++ makeRoundAMD64(ssa.OpCeil), ++ sys.AMD64) ++ addF("math", "Trunc", ++ makeRoundAMD64(ssa.OpTrunc), ++ sys.AMD64) ++ ++ /******** math/bits ********/ ++ addF("math/bits", "TrailingZeros64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "TrailingZeros32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "TrailingZeros16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) ++ c := s.constInt32(types.Types[types.TUINT32], 1<<16) ++ y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) ++ return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) ++ }, ++ sys.MIPS) ++ addF("math/bits", "TrailingZeros16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) ++ addF("math/bits", "TrailingZeros16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) ++ c := s.constInt64(types.Types[types.TUINT64], 1<<16) ++ y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) ++ return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) ++ }, ++ sys.S390X, sys.PPC64) ++ addF("math/bits", "TrailingZeros8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) ++ c := s.constInt32(types.Types[types.TUINT32], 1<<8) ++ y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) ++ return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) ++ }, ++ sys.MIPS) ++ addF("math/bits", "TrailingZeros8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) ++ addF("math/bits", "TrailingZeros8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) ++ c := s.constInt64(types.Types[types.TUINT64], 1<<8) ++ y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) ++ return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) ++ }, ++ sys.S390X) ++ alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...) ++ alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...) ++ // ReverseBytes inlines correctly, no need to intrinsify it. 
++ // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate ++ // On Power10, 16-bit rotate is not available so use BRH instruction ++ if buildcfg.GOPPC64 >= 10 { ++ addF("math/bits", "ReverseBytes16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) ++ }, ++ sys.PPC64) ++ } ++ ++ addF("math/bits", "Len64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "Len32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.ARM64, sys.PPC64) ++ addF("math/bits", "Len32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) ++ } ++ x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0]) ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) ++ }, ++ sys.ARM, sys.S390X, sys.MIPS, sys.Wasm) ++ addF("math/bits", "Len16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) ++ } ++ x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) ++ }, ++ sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "Len16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64) ++ addF("math/bits", "Len8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) ++ } ++ x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) ++ }, ++ sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ addF("math/bits", "Len8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64) ++ addF("math/bits", "Len", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if s.config.PtrSize == 4 { ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) ++ } ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) ++ }, ++ sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ // LeadingZeros is handled because it trivially calls Len. 
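The MIPS, S390X and PPC64 TrailingZeros16/TrailingZeros8 builders above rely on a small arithmetic identity: after zero-extension, OR-ing in a constant bit just past the narrow width guarantees the wide count instruction never reports more than that width, so a zero input yields 16 (or 8) instead of 32/64. A quick standalone check of the 16-bit case (the helper name is invented):

package main

import (
	"fmt"
	"math/bits"
)

// trailingZeros16ViaCtz32 mirrors the TrailingZeros16 builder for targets
// without a native 16-bit count: zero-extend, then OR in 1<<16 so that
// x == 0 produces 16 rather than 32.
func trailingZeros16ViaCtz32(x uint16) int {
	return bits.TrailingZeros32(uint32(x) | 1<<16)
}

func main() {
	for _, x := range []uint16{0, 1, 0x0040, 0x8000} {
		fmt.Println(x, trailingZeros16ViaCtz32(x), bits.TrailingZeros16(x))
	}
}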
++ addF("math/bits", "Reverse64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "Reverse32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "Reverse16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "Reverse8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "Reverse", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64) ++ addF("math/bits", "RotateLeft8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1]) ++ }, ++ sys.AMD64, sys.RISCV64) ++ addF("math/bits", "RotateLeft16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1]) ++ }, ++ sys.AMD64, sys.RISCV64) ++ addF("math/bits", "RotateLeft32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1]) ++ }, ++ sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) ++ addF("math/bits", "RotateLeft64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1]) ++ }, ++ sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) ++ alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...) ++ ++ makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ if buildcfg.GOAMD64 >= 2 { ++ return s.newValue1(op, types.Types[types.TINT], args[0]) ++ } ++ ++ v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT) ++ b := s.endBlock() ++ b.Kind = ssa.BlockIf ++ b.SetControl(v) ++ bTrue := s.f.NewBlock(ssa.BlockPlain) ++ bFalse := s.f.NewBlock(ssa.BlockPlain) ++ bEnd := s.f.NewBlock(ssa.BlockPlain) ++ b.AddEdgeTo(bTrue) ++ b.AddEdgeTo(bFalse) ++ b.Likely = ssa.BranchLikely // most machines have popcnt nowadays ++ ++ // We have the intrinsic - use it directly. ++ s.startBlock(bTrue) ++ s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0]) ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Call the pure Go version. ++ s.startBlock(bFalse) ++ s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT] ++ s.endBlock().AddEdgeTo(bEnd) ++ ++ // Merge results. 
++ s.startBlock(bEnd) ++ return s.variable(n, types.Types[types.TINT]) ++ } ++ } ++ addF("math/bits", "OnesCount64", ++ makeOnesCountAMD64(ssa.OpPopCount64), ++ sys.AMD64) ++ addF("math/bits", "OnesCount64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0]) ++ }, ++ sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) ++ addF("math/bits", "OnesCount32", ++ makeOnesCountAMD64(ssa.OpPopCount32), ++ sys.AMD64) ++ addF("math/bits", "OnesCount32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0]) ++ }, ++ sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) ++ addF("math/bits", "OnesCount16", ++ makeOnesCountAMD64(ssa.OpPopCount16), ++ sys.AMD64) ++ addF("math/bits", "OnesCount16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0]) ++ }, ++ sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm) ++ addF("math/bits", "OnesCount8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0]) ++ }, ++ sys.S390X, sys.PPC64, sys.Wasm) ++ addF("math/bits", "OnesCount", ++ makeOnesCountAMD64(ssa.OpPopCount64), ++ sys.AMD64) ++ addF("math/bits", "Mul64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1]) ++ }, ++ sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64) ++ alias("math/bits", "Mul", "math/bits", "Mul64", p8...) ++ alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...) ++ addF("math/bits", "Add64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) ++ }, ++ sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64) ++ alias("math/bits", "Add", "math/bits", "Add64", p8...) ++ alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...) ++ addF("math/bits", "Sub64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) ++ }, ++ sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64) ++ alias("math/bits", "Sub", "math/bits", "Sub64", p8...) ++ addF("math/bits", "Div64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ // check for divide-by-zero/overflow and panic with appropriate message ++ cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64])) ++ s.check(cmpZero, ir.Syms.Panicdivide) ++ cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2]) ++ s.check(cmpOverflow, ir.Syms.Panicoverflow) ++ return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) ++ }, ++ sys.AMD64) ++ alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64) ++ ++ alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...) 
++ alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...) ++ alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...) ++ alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...) ++ alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...) ++ alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...) ++ ++ /******** sync/atomic ********/ ++ ++ // Note: these are disabled by flag_race in findIntrinsic below. ++ alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...) ++ alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...) ++ alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...) ++ alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...) ++ alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...) ++ alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...) ++ alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...) ++ ++ alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...) ++ alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...) ++ // Note: not StorePointer, that needs a write barrier. Same below for {CompareAnd}Swap. ++ alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...) ++ alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...) ++ alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...) ++ alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...) ++ ++ alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...) ++ alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...) ++ alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...) ++ alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...) ++ alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...) ++ alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...) ++ ++ alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...) ++ alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...) ++ alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...) ++ alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...) ++ alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...) ++ alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...) ++ ++ alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...) ++ alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...) ++ alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...) ++ alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...) ++ alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...) ++ alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...) 
++ ++ alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) ++ alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) ++ ++ /******** math/big ********/ ++ alias("math/big", "mulWW", "math/bits", "Mul64", p8...) ++} ++ ++// findIntrinsic returns a function which builds the SSA equivalent of the ++// function identified by the symbol sym. If sym is not an intrinsic call, returns nil. ++func findIntrinsic(sym *types.Sym) intrinsicBuilder { ++ if sym == nil || sym.Pkg == nil { ++ return nil ++ } ++ pkg := sym.Pkg.Path ++ if sym.Pkg == ir.Pkgs.Runtime { ++ pkg = "runtime" ++ } ++ if base.Flag.Race && pkg == "sync/atomic" { ++ // The race detector needs to be able to intercept these calls. ++ // We can't intrinsify them. ++ return nil ++ } ++ // Skip intrinsifying math functions (which may contain hard-float ++ // instructions) when soft-float ++ if Arch.SoftFloat && pkg == "math" { ++ return nil ++ } ++ ++ fn := sym.Name ++ if ssa.IntrinsicsDisable { ++ if pkg == "runtime" && (fn == "getcallerpc" || fn == "getcallersp" || fn == "getclosureptr") { ++ // These runtime functions don't have definitions, must be intrinsics. ++ } else { ++ return nil ++ } ++ } ++ return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}] ++} ++ ++func IsIntrinsicCall(n *ir.CallExpr) bool { ++ if n == nil { ++ return false ++ } ++ name, ok := n.Fun.(*ir.Name) ++ if !ok { ++ return false ++ } ++ return findIntrinsic(name.Sym()) != nil ++} +diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go +index cbf9587a56..0f6f2de4a7 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -208,6 +208,10 @@ func InitConfig() { + ir.Syms.SigPanic = typecheck.LookupRuntimeFunc("sigpanic") + } + ++func InitTables() { ++ initIntrinsics() ++} ++ + // AbiForBodylessFuncStackMap returns the ABI for a bodyless function's stack map. + // This is not necessarily the ABI used to call it. + // Currently (1.17 dev) such a stack map is always ABI0; +@@ -4032,966 +4036,11 @@ func (s *state) sfcall(op ssa.Op, args ...*ssa.Value) (*ssa.Value, bool) { + return nil, false + } + +-var intrinsics map[intrinsicKey]intrinsicBuilder +- +-// An intrinsicBuilder converts a call node n into an ssa value that +-// implements that call as an intrinsic. args is a list of arguments to the func. 
+-type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value +- +-type intrinsicKey struct { +- arch *sys.Arch +- pkg string +- fn string +-} +- +-func InitTables() { +- intrinsics = map[intrinsicKey]intrinsicBuilder{} +- +- var p4 []*sys.Arch +- var p8 []*sys.Arch +- var lwatomics []*sys.Arch +- for _, a := range sys.Archs { +- if a.PtrSize == 4 { +- p4 = append(p4, a) +- } else { +- p8 = append(p8, a) +- } +- if a.Family != sys.PPC64 { +- lwatomics = append(lwatomics, a) +- } +- } +- all := sys.Archs[:] +- +- // add adds the intrinsic b for pkg.fn for the given list of architectures. +- add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { +- for _, a := range archs { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- } +- } +- // addF does the same as add but operates on architecture families. +- addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { +- for _, a := range sys.Archs { +- if a.InFamily(archFamilies...) { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- } +- } +- } +- // alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists. +- alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) { +- aliased := false +- for _, a := range archs { +- if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- aliased = true +- } +- } +- if !aliased { +- panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn)) +- } +- } +- +- /******** runtime ********/ +- if !base.Flag.Cfg.Instrumenting { +- add("runtime", "slicebytetostringtmp", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- // Compiler frontend optimizations emit OBYTES2STRTMP nodes +- // for the backend instead of slicebytetostringtmp calls +- // when not instrumenting. +- return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1]) +- }, +- all...) +- } +- addF("runtime/internal/math", "MulUintptr", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) +- } +- return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) +- }, +- sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.RISCV64, sys.ARM64) +- alias("runtime", "mulUintptr", "runtime/internal/math", "MulUintptr", all...) +- add("runtime", "KeepAlive", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0]) +- s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem()) +- return nil +- }, +- all...) +- add("runtime", "getclosureptr", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr) +- }, +- all...) +- +- add("runtime", "getcallerpc", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr) +- }, +- all...) +- +- add("runtime", "getcallersp", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem()) +- }, +- all...) 
+- +- addF("runtime", "publicationBarrier", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem()) +- return nil +- }, +- sys.ARM64, sys.PPC64, sys.RISCV64) +- +- brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} +- if buildcfg.GOPPC64 >= 10 { +- // Use only on Power10 as the new byte reverse instructions that Power10 provide +- // make it worthwhile as an intrinsic +- brev_arch = append(brev_arch, sys.PPC64) +- } +- /******** runtime/internal/sys ********/ +- addF("runtime/internal/sys", "Bswap32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) +- }, +- brev_arch...) +- addF("runtime/internal/sys", "Bswap64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) +- }, +- brev_arch...) +- +- /****** Prefetch ******/ +- makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem()) +- return nil +- } +- } +- +- // Make Prefetch intrinsics for supported platforms +- // On the unsupported platforms stub function will be eliminated +- addF("runtime/internal/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache), +- sys.AMD64, sys.ARM64, sys.PPC64) +- addF("runtime/internal/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed), +- sys.AMD64, sys.ARM64, sys.PPC64) +- +- /******** runtime/internal/atomic ********/ +- addF("runtime/internal/atomic", "Load", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Load8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Load64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "LoadAcq", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) +- }, +- sys.PPC64, sys.S390X) +- addF("runtime/internal/atomic", "LoadAcq64", +- func(s *state, n 
*ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) +- }, +- sys.PPC64) +- addF("runtime/internal/atomic", "Loadp", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- +- addF("runtime/internal/atomic", "Store", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Store8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Store64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "StorepNoWB", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "StoreRel", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.PPC64, sys.S390X) +- addF("runtime/internal/atomic", "StoreRel64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.PPC64) +- +- addF("runtime/internal/atomic", "Xchg", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Xchg64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- +- type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op 
ssa.Op, typ types.Kind) +- +- makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ, rtyp types.Kind, emit atomicOpEmitter) intrinsicBuilder { +- +- return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOARM64.KPAtomicOpt { +- emit(s, n, args, op0, typ) +- } else { +- // Target Atomic feature is identified by dynamic detection +- addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb) +- v := s.load(types.Types[types.TBOOL], addr) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely +- +- // We have atomic instructions - use it directly. +- s.startBlock(bTrue) +- emit(s, n, args, op1, typ) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Use original instruction sequence. +- s.startBlock(bFalse) +- emit(s, n, args, op0, typ) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. +- s.startBlock(bEnd) +- } +- if rtyp == types.TNIL { +- return nil +- } else { +- return s.variable(n, types.Types[rtyp]) +- } +- +- } +- } +- +- atomicXchgXaddEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind) { +- v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) +- } +- addF("runtime/internal/atomic", "Xchg", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, types.TUINT32, atomicXchgXaddEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Xchg64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, types.TUINT64, atomicXchgXaddEmitterARM64), +- sys.ARM64) +- +- addF("runtime/internal/atomic", "Xadd", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Xadd64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- +- addF("runtime/internal/atomic", "Xadd", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, types.TUINT32, atomicXchgXaddEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Xadd64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, types.TUINT64, atomicXchgXaddEmitterARM64), +- sys.ARM64) +- +- addF("runtime/internal/atomic", "Cas", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) +- 
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Cas64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) +- }, +- sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "CasRel", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) +- }, +- sys.PPC64) +- +- atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind) { +- v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) +- s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) +- s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) +- } +- +- addF("runtime/internal/atomic", "Cas", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TUINT32, types.TBOOL, atomicCasEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Cas64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TUINT64, types.TBOOL, atomicCasEmitterARM64), +- sys.ARM64) +- +- addF("runtime/internal/atomic", "And8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "And", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Or8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("runtime/internal/atomic", "Or", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem()) +- return nil +- }, +- sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) +- +- atomicAndOrEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind) { +- s.vars[memVar] = s.newValue3(op, types.TypeMem, args[0], args[1], s.mem()) +- } +- +- addF("runtime/internal/atomic", "And8", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd8, ssa.OpAtomicAnd8Variant, types.TNIL, types.TNIL, atomicAndOrEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "And", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32, ssa.OpAtomicAnd32Variant, 
types.TNIL, types.TNIL, atomicAndOrEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Or8", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr8, ssa.OpAtomicOr8Variant, types.TNIL, types.TNIL, atomicAndOrEmitterARM64), +- sys.ARM64) +- addF("runtime/internal/atomic", "Or", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32, ssa.OpAtomicOr32Variant, types.TNIL, types.TNIL, atomicAndOrEmitterARM64), +- sys.ARM64) +- +- // Aliases for atomic load operations +- alias("runtime/internal/atomic", "Loadint32", "runtime/internal/atomic", "Load", all...) +- alias("runtime/internal/atomic", "Loadint64", "runtime/internal/atomic", "Load64", all...) +- alias("runtime/internal/atomic", "Loaduintptr", "runtime/internal/atomic", "Load", p4...) +- alias("runtime/internal/atomic", "Loaduintptr", "runtime/internal/atomic", "Load64", p8...) +- alias("runtime/internal/atomic", "Loaduint", "runtime/internal/atomic", "Load", p4...) +- alias("runtime/internal/atomic", "Loaduint", "runtime/internal/atomic", "Load64", p8...) +- alias("runtime/internal/atomic", "LoadAcq", "runtime/internal/atomic", "Load", lwatomics...) +- alias("runtime/internal/atomic", "LoadAcq64", "runtime/internal/atomic", "Load64", lwatomics...) +- alias("runtime/internal/atomic", "LoadAcquintptr", "runtime/internal/atomic", "LoadAcq", p4...) +- alias("sync", "runtime_LoadAcquintptr", "runtime/internal/atomic", "LoadAcq", p4...) // linknamed +- alias("runtime/internal/atomic", "LoadAcquintptr", "runtime/internal/atomic", "LoadAcq64", p8...) +- alias("sync", "runtime_LoadAcquintptr", "runtime/internal/atomic", "LoadAcq64", p8...) // linknamed +- +- // Aliases for atomic store operations +- alias("runtime/internal/atomic", "Storeint32", "runtime/internal/atomic", "Store", all...) +- alias("runtime/internal/atomic", "Storeint64", "runtime/internal/atomic", "Store64", all...) +- alias("runtime/internal/atomic", "Storeuintptr", "runtime/internal/atomic", "Store", p4...) +- alias("runtime/internal/atomic", "Storeuintptr", "runtime/internal/atomic", "Store64", p8...) +- alias("runtime/internal/atomic", "StoreRel", "runtime/internal/atomic", "Store", lwatomics...) +- alias("runtime/internal/atomic", "StoreRel64", "runtime/internal/atomic", "Store64", lwatomics...) +- alias("runtime/internal/atomic", "StoreReluintptr", "runtime/internal/atomic", "StoreRel", p4...) +- alias("sync", "runtime_StoreReluintptr", "runtime/internal/atomic", "StoreRel", p4...) // linknamed +- alias("runtime/internal/atomic", "StoreReluintptr", "runtime/internal/atomic", "StoreRel64", p8...) +- alias("sync", "runtime_StoreReluintptr", "runtime/internal/atomic", "StoreRel64", p8...) // linknamed +- +- // Aliases for atomic swap operations +- alias("runtime/internal/atomic", "Xchgint32", "runtime/internal/atomic", "Xchg", all...) +- alias("runtime/internal/atomic", "Xchgint64", "runtime/internal/atomic", "Xchg64", all...) +- alias("runtime/internal/atomic", "Xchguintptr", "runtime/internal/atomic", "Xchg", p4...) +- alias("runtime/internal/atomic", "Xchguintptr", "runtime/internal/atomic", "Xchg64", p8...) +- +- // Aliases for atomic add operations +- alias("runtime/internal/atomic", "Xaddint32", "runtime/internal/atomic", "Xadd", all...) +- alias("runtime/internal/atomic", "Xaddint64", "runtime/internal/atomic", "Xadd64", all...) +- alias("runtime/internal/atomic", "Xadduintptr", "runtime/internal/atomic", "Xadd", p4...) +- alias("runtime/internal/atomic", "Xadduintptr", "runtime/internal/atomic", "Xadd64", p8...) 
+- +- // Aliases for atomic CAS operations +- alias("runtime/internal/atomic", "Casint32", "runtime/internal/atomic", "Cas", all...) +- alias("runtime/internal/atomic", "Casint64", "runtime/internal/atomic", "Cas64", all...) +- alias("runtime/internal/atomic", "Casuintptr", "runtime/internal/atomic", "Cas", p4...) +- alias("runtime/internal/atomic", "Casuintptr", "runtime/internal/atomic", "Cas64", p8...) +- alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas", p4...) +- alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas64", p8...) +- alias("runtime/internal/atomic", "CasRel", "runtime/internal/atomic", "Cas", lwatomics...) +- +- /******** math ********/ +- addF("math", "sqrt", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) +- addF("math", "Trunc", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) +- addF("math", "Ceil", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) +- addF("math", "Floor", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) +- addF("math", "Round", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.PPC64, sys.S390X) +- addF("math", "RoundToEven", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.S390X, sys.Wasm) +- addF("math", "Abs", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0]) +- }, +- sys.ARM64, sys.ARM, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64) +- addF("math", "Copysign", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1]) +- }, +- sys.PPC64, sys.RISCV64, sys.Wasm) +- addF("math", "FMA", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) +- }, +- sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X) +- addF("math", "FMA", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if !s.config.UseFMA { +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- return s.variable(n, types.Types[types.TFLOAT64]) +- } +- +- if buildcfg.GOAMD64 >= 3 { +- return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) +- } +- +- v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely // >= haswell cpus are common +- +- // We have the intrinsic 
- use it directly. +- s.startBlock(bTrue) +- s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Call the pure Go version. +- s.startBlock(bFalse) +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. +- s.startBlock(bEnd) +- return s.variable(n, types.Types[types.TFLOAT64]) +- }, +- sys.AMD64) +- addF("math", "FMA", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if !s.config.UseFMA { +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- return s.variable(n, types.Types[types.TFLOAT64]) +- } +- addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb) +- v := s.load(types.Types[types.TBOOL], addr) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely +- +- // We have the intrinsic - use it directly. +- s.startBlock(bTrue) +- s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Call the pure Go version. +- s.startBlock(bFalse) +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. +- s.startBlock(bEnd) +- return s.variable(n, types.Types[types.TFLOAT64]) +- }, +- sys.ARM) +- +- makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOAMD64 >= 2 { +- return s.newValue1(op, types.Types[types.TFLOAT64], args[0]) +- } +- +- v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays +- +- // We have the intrinsic - use it directly. +- s.startBlock(bTrue) +- s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0]) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Call the pure Go version. +- s.startBlock(bFalse) +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. 
+- s.startBlock(bEnd) +- return s.variable(n, types.Types[types.TFLOAT64]) +- } +- } +- addF("math", "RoundToEven", +- makeRoundAMD64(ssa.OpRoundToEven), +- sys.AMD64) +- addF("math", "Floor", +- makeRoundAMD64(ssa.OpFloor), +- sys.AMD64) +- addF("math", "Ceil", +- makeRoundAMD64(ssa.OpCeil), +- sys.AMD64) +- addF("math", "Trunc", +- makeRoundAMD64(ssa.OpTrunc), +- sys.AMD64) +- +- /******** math/bits ********/ +- addF("math/bits", "TrailingZeros64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "TrailingZeros32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) +- c := s.constInt32(types.Types[types.TUINT32], 1<<16) +- y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) +- }, +- sys.MIPS) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) +- c := s.constInt64(types.Types[types.TUINT64], 1<<16) +- y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) +- }, +- sys.S390X, sys.PPC64) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) +- c := s.constInt32(types.Types[types.TUINT32], 1<<8) +- y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) +- }, +- sys.MIPS) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) +- c := s.constInt64(types.Types[types.TUINT64], 1<<8) +- y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) +- }, +- sys.S390X) +- alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) +- alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) +- // ReverseBytes inlines correctly, no need to intrinsify it. 
+- // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate +- // On Power10, 16-bit rotate is not available so use BRH instruction +- if buildcfg.GOPPC64 >= 10 { +- addF("math/bits", "ReverseBytes16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) +- }, +- sys.PPC64) +- } +- +- addF("math/bits", "Len64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "Len32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.ARM64, sys.PPC64) +- addF("math/bits", "Len32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- } +- x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM, sys.S390X, sys.MIPS, sys.Wasm) +- addF("math/bits", "Len16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) +- } +- x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "Len16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64) +- addF("math/bits", "Len8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) +- } +- x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "Len8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64) +- addF("math/bits", "Len", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- } +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- // LeadingZeros is handled because it trivially calls Len. 
+- addF("math/bits", "Reverse64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "Reverse32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "Reverse16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "Reverse8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "Reverse", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64) +- addF("math/bits", "RotateLeft8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1]) +- }, +- sys.AMD64, sys.RISCV64) +- addF("math/bits", "RotateLeft16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1]) +- }, +- sys.AMD64, sys.RISCV64) +- addF("math/bits", "RotateLeft32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1]) +- }, +- sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) +- addF("math/bits", "RotateLeft64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1]) +- }, +- sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) +- alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...) +- +- makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOAMD64 >= 2 { +- return s.newValue1(op, types.Types[types.TINT], args[0]) +- } +- +- v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT) +- b := s.endBlock() +- b.Kind = ssa.BlockIf +- b.SetControl(v) +- bTrue := s.f.NewBlock(ssa.BlockPlain) +- bFalse := s.f.NewBlock(ssa.BlockPlain) +- bEnd := s.f.NewBlock(ssa.BlockPlain) +- b.AddEdgeTo(bTrue) +- b.AddEdgeTo(bFalse) +- b.Likely = ssa.BranchLikely // most machines have popcnt nowadays +- +- // We have the intrinsic - use it directly. +- s.startBlock(bTrue) +- s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0]) +- s.endBlock().AddEdgeTo(bEnd) +- +- // Call the pure Go version. +- s.startBlock(bFalse) +- s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT] +- s.endBlock().AddEdgeTo(bEnd) +- +- // Merge results. 
+- s.startBlock(bEnd) +- return s.variable(n, types.Types[types.TINT]) +- } +- } +- addF("math/bits", "OnesCount64", +- makeOnesCountAMD64(ssa.OpPopCount64), +- sys.AMD64) +- addF("math/bits", "OnesCount64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0]) +- }, +- sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) +- addF("math/bits", "OnesCount32", +- makeOnesCountAMD64(ssa.OpPopCount32), +- sys.AMD64) +- addF("math/bits", "OnesCount32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0]) +- }, +- sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) +- addF("math/bits", "OnesCount16", +- makeOnesCountAMD64(ssa.OpPopCount16), +- sys.AMD64) +- addF("math/bits", "OnesCount16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0]) +- }, +- sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm) +- addF("math/bits", "OnesCount8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0]) +- }, +- sys.S390X, sys.PPC64, sys.Wasm) +- addF("math/bits", "OnesCount", +- makeOnesCountAMD64(ssa.OpPopCount64), +- sys.AMD64) +- addF("math/bits", "Mul64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1]) +- }, +- sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64) +- alias("math/bits", "Mul", "math/bits", "Mul64", p8...) +- alias("runtime/internal/math", "Mul64", "math/bits", "Mul64", p8...) +- addF("math/bits", "Add64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) +- }, +- sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64) +- alias("math/bits", "Add", "math/bits", "Add64", p8...) +- addF("math/bits", "Sub64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) +- }, +- sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64) +- alias("math/bits", "Sub", "math/bits", "Sub64", p8...) +- addF("math/bits", "Div64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- // check for divide-by-zero/overflow and panic with appropriate message +- cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64])) +- s.check(cmpZero, ir.Syms.Panicdivide) +- cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2]) +- s.check(cmpOverflow, ir.Syms.Panicoverflow) +- return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) +- }, +- sys.AMD64) +- alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64) +- +- alias("runtime/internal/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...) +- alias("runtime/internal/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...) 
+- alias("runtime/internal/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...) +- alias("runtime/internal/sys", "Len8", "math/bits", "Len8", all...) +- alias("runtime/internal/sys", "Len64", "math/bits", "Len64", all...) +- alias("runtime/internal/sys", "OnesCount64", "math/bits", "OnesCount64", all...) +- +- /******** sync/atomic ********/ +- +- // Note: these are disabled by flag_race in findIntrinsic below. +- alias("sync/atomic", "LoadInt32", "runtime/internal/atomic", "Load", all...) +- alias("sync/atomic", "LoadInt64", "runtime/internal/atomic", "Load64", all...) +- alias("sync/atomic", "LoadPointer", "runtime/internal/atomic", "Loadp", all...) +- alias("sync/atomic", "LoadUint32", "runtime/internal/atomic", "Load", all...) +- alias("sync/atomic", "LoadUint64", "runtime/internal/atomic", "Load64", all...) +- alias("sync/atomic", "LoadUintptr", "runtime/internal/atomic", "Load", p4...) +- alias("sync/atomic", "LoadUintptr", "runtime/internal/atomic", "Load64", p8...) +- +- alias("sync/atomic", "StoreInt32", "runtime/internal/atomic", "Store", all...) +- alias("sync/atomic", "StoreInt64", "runtime/internal/atomic", "Store64", all...) +- // Note: not StorePointer, that needs a write barrier. Same below for {CompareAnd}Swap. +- alias("sync/atomic", "StoreUint32", "runtime/internal/atomic", "Store", all...) +- alias("sync/atomic", "StoreUint64", "runtime/internal/atomic", "Store64", all...) +- alias("sync/atomic", "StoreUintptr", "runtime/internal/atomic", "Store", p4...) +- alias("sync/atomic", "StoreUintptr", "runtime/internal/atomic", "Store64", p8...) +- +- alias("sync/atomic", "SwapInt32", "runtime/internal/atomic", "Xchg", all...) +- alias("sync/atomic", "SwapInt64", "runtime/internal/atomic", "Xchg64", all...) +- alias("sync/atomic", "SwapUint32", "runtime/internal/atomic", "Xchg", all...) +- alias("sync/atomic", "SwapUint64", "runtime/internal/atomic", "Xchg64", all...) +- alias("sync/atomic", "SwapUintptr", "runtime/internal/atomic", "Xchg", p4...) +- alias("sync/atomic", "SwapUintptr", "runtime/internal/atomic", "Xchg64", p8...) +- +- alias("sync/atomic", "CompareAndSwapInt32", "runtime/internal/atomic", "Cas", all...) +- alias("sync/atomic", "CompareAndSwapInt64", "runtime/internal/atomic", "Cas64", all...) +- alias("sync/atomic", "CompareAndSwapUint32", "runtime/internal/atomic", "Cas", all...) +- alias("sync/atomic", "CompareAndSwapUint64", "runtime/internal/atomic", "Cas64", all...) +- alias("sync/atomic", "CompareAndSwapUintptr", "runtime/internal/atomic", "Cas", p4...) +- alias("sync/atomic", "CompareAndSwapUintptr", "runtime/internal/atomic", "Cas64", p8...) +- +- alias("sync/atomic", "AddInt32", "runtime/internal/atomic", "Xadd", all...) +- alias("sync/atomic", "AddInt64", "runtime/internal/atomic", "Xadd64", all...) +- alias("sync/atomic", "AddUint32", "runtime/internal/atomic", "Xadd", all...) +- alias("sync/atomic", "AddUint64", "runtime/internal/atomic", "Xadd64", all...) +- alias("sync/atomic", "AddUintptr", "runtime/internal/atomic", "Xadd", p4...) +- alias("sync/atomic", "AddUintptr", "runtime/internal/atomic", "Xadd64", p8...) +- +- /******** math/big ********/ +- alias("math/big", "mulWW", "math/bits", "Mul64", p8...) +-} +- +-// findIntrinsic returns a function which builds the SSA equivalent of the +-// function identified by the symbol sym. If sym is not an intrinsic call, returns nil. 
+-func findIntrinsic(sym *types.Sym) intrinsicBuilder { +- if sym == nil || sym.Pkg == nil { +- return nil +- } +- pkg := sym.Pkg.Path +- if sym.Pkg == ir.Pkgs.Runtime { +- pkg = "runtime" +- } +- if base.Flag.Race && pkg == "sync/atomic" { +- // The race detector needs to be able to intercept these calls. +- // We can't intrinsify them. +- return nil +- } +- // Skip intrinsifying math functions (which may contain hard-float +- // instructions) when soft-float +- if Arch.SoftFloat && pkg == "math" { +- return nil +- } +- +- fn := sym.Name +- if ssa.IntrinsicsDisable { +- if pkg == "runtime" && (fn == "getcallerpc" || fn == "getcallersp" || fn == "getclosureptr") { +- // These runtime functions don't have definitions, must be intrinsics. +- } else { +- return nil +- } +- } +- return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}] +-} +- +-func IsIntrinsicCall(n *ir.CallExpr) bool { +- if n == nil { +- return false +- } +- name, ok := n.X.(*ir.Name) +- if !ok { +- return false +- } +- return findIntrinsic(name.Sym()) != nil ++// split breaks up a tuple-typed value into its 2 parts. ++func (s *state) split(v *ssa.Value) (*ssa.Value, *ssa.Value) { ++ p0 := s.newValue1(ssa.OpSelect0, v.Type.FieldType(0), v) ++ p1 := s.newValue1(ssa.OpSelect1, v.Type.FieldType(1), v) ++ return p0, p1 + } + + // intrinsicCall converts a call to a recognized intrinsic function into the intrinsic SSA operation. +-- +2.39.5 + diff --git a/2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch b/2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch new file mode 100644 index 0000000..b48f59a --- /dev/null +++ b/2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch @@ -0,0 +1,1254 @@ +From 7ccd47a906363e0bc476ca09cbcf09dcf961149e Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:42:56 +0800 +Subject: [PATCH 109/119] cmd/compile/internal/ssagen: add initial test + coverage for intrinsics + +Add basic test coverage for the intrinisic table - this at least allows +us to tell if intrinsics are added or removed unexpectedly. Code +generation changes resulting from intrinsics is not covered and is +left for test/codegen and others. + +Change-Id: I3d538708b90cd04d3f449945e0fd9388097d683e +Reviewed-on: https://go-review.googlesource.com/c/go/+/605475 +Reviewed-by: Cherry Mui +LUCI-TryBot-Result: Go LUCI +Reviewed-by: David Chase +--- + .../internal/ssagen/intrinsics_test.go | 1224 +++++++++++++++++ + 1 file changed, 1224 insertions(+) + create mode 100644 src/cmd/compile/internal/ssagen/intrinsics_test.go + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +new file mode 100644 +index 0000000000..74ea276cc0 +--- /dev/null ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -0,0 +1,1224 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++package ssagen ++ ++import ( ++ "internal/buildcfg" ++ "testing" ++) ++ ++type testIntrinsicKey struct { ++ archName string ++ pkg string ++ fn string ++} ++ ++var wantIntrinsics = map[testIntrinsicKey]struct{}{ ++ {"386", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"386", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"386", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"386", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"386", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"386", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"386", "math", "sqrt"}: struct{}{}, ++ {"386", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"386", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"386", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"386", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"386", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"386", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"386", "runtime", "KeepAlive"}: struct{}{}, ++ {"386", "runtime", "getcallerpc"}: struct{}{}, ++ {"386", "runtime", "getcallersp"}: struct{}{}, ++ {"386", "runtime", "getclosureptr"}: struct{}{}, ++ {"386", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ 
{"amd64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"amd64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"amd64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"amd64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "Prefetch"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"amd64", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"amd64", "math", "Ceil"}: struct{}{}, ++ {"amd64", "math", "FMA"}: struct{}{}, ++ {"amd64", "math", "Floor"}: struct{}{}, ++ {"amd64", "math", "RoundToEven"}: struct{}{}, ++ {"amd64", "math", "Trunc"}: struct{}{}, ++ {"amd64", "math", "sqrt"}: struct{}{}, ++ {"amd64", "math/big", "mulWW"}: struct{}{}, ++ {"amd64", "math/bits", "Add"}: struct{}{}, ++ {"amd64", "math/bits", "Add64"}: struct{}{}, ++ {"amd64", "math/bits", "Div"}: struct{}{}, ++ {"amd64", "math/bits", "Div64"}: struct{}{}, ++ {"amd64", "math/bits", "Len"}: struct{}{}, ++ {"amd64", "math/bits", "Len16"}: struct{}{}, ++ {"amd64", "math/bits", "Len32"}: struct{}{}, ++ {"amd64", "math/bits", "Len64"}: struct{}{}, ++ {"amd64", "math/bits", "Len8"}: struct{}{}, ++ {"amd64", "math/bits", "Mul"}: struct{}{}, ++ {"amd64", "math/bits", "Mul64"}: struct{}{}, ++ {"amd64", "math/bits", "OnesCount"}: struct{}{}, ++ {"amd64", "math/bits", "OnesCount16"}: struct{}{}, ++ {"amd64", "math/bits", "OnesCount32"}: struct{}{}, ++ {"amd64", "math/bits", "OnesCount64"}: struct{}{}, ++ {"amd64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"amd64", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft16"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"amd64", "math/bits", "RotateLeft8"}: struct{}{}, ++ {"amd64", "math/bits", "Sub"}: struct{}{}, ++ {"amd64", "math/bits", "Sub64"}: struct{}{}, ++ {"amd64", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"amd64", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"amd64", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"amd64", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"amd64", "runtime", "KeepAlive"}: struct{}{}, ++ {"amd64", "runtime", "getcallerpc"}: struct{}{}, ++ {"amd64", "runtime", "getcallersp"}: struct{}{}, ++ {"amd64", "runtime", "getclosureptr"}: struct{}{}, ++ {"amd64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"amd64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"amd64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", 
"AddUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"amd64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"arm", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"arm", "math", "Abs"}: struct{}{}, ++ {"arm", "math", "FMA"}: struct{}{}, ++ {"arm", "math", "sqrt"}: struct{}{}, ++ {"arm", "math/bits", "Len"}: struct{}{}, ++ {"arm", "math/bits", "Len16"}: struct{}{}, ++ {"arm", "math/bits", "Len32"}: struct{}{}, ++ {"arm", "math/bits", "Len64"}: struct{}{}, ++ {"arm", "math/bits", "Len8"}: struct{}{}, ++ {"arm", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"arm", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"arm", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"arm", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"arm", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"arm", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"arm", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"arm", "runtime", "KeepAlive"}: struct{}{}, ++ {"arm", "runtime", "getcallerpc"}: struct{}{}, ++ {"arm", "runtime", "getcallersp"}: struct{}{}, ++ {"arm", "runtime", "getclosureptr"}: struct{}{}, ++ {"arm", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"arm64", 
"internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"arm64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"arm64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"arm64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "Prefetch"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"arm64", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"arm64", "math", "Abs"}: struct{}{}, ++ {"arm64", "math", "Ceil"}: struct{}{}, ++ {"arm64", "math", "FMA"}: struct{}{}, ++ {"arm64", "math", "Floor"}: struct{}{}, ++ {"arm64", "math", "Round"}: struct{}{}, ++ {"arm64", "math", "RoundToEven"}: struct{}{}, ++ {"arm64", "math", "Trunc"}: struct{}{}, ++ {"arm64", "math", "sqrt"}: struct{}{}, ++ {"arm64", "math/big", "mulWW"}: struct{}{}, ++ {"arm64", "math/bits", "Add"}: struct{}{}, ++ {"arm64", "math/bits", "Add64"}: struct{}{}, ++ {"arm64", "math/bits", "Len"}: struct{}{}, ++ {"arm64", "math/bits", "Len16"}: struct{}{}, ++ {"arm64", "math/bits", "Len32"}: struct{}{}, ++ {"arm64", "math/bits", "Len64"}: struct{}{}, ++ {"arm64", "math/bits", "Len8"}: struct{}{}, ++ {"arm64", "math/bits", "Mul"}: struct{}{}, ++ {"arm64", "math/bits", "Mul64"}: struct{}{}, ++ {"arm64", "math/bits", "OnesCount16"}: struct{}{}, ++ {"arm64", "math/bits", "OnesCount32"}: struct{}{}, ++ {"arm64", "math/bits", "OnesCount64"}: struct{}{}, ++ {"arm64", "math/bits", "Reverse"}: struct{}{}, ++ {"arm64", "math/bits", "Reverse16"}: struct{}{}, ++ {"arm64", 
"math/bits", "Reverse32"}: struct{}{}, ++ {"arm64", "math/bits", "Reverse64"}: struct{}{}, ++ {"arm64", "math/bits", "Reverse8"}: struct{}{}, ++ {"arm64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"arm64", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"arm64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"arm64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"arm64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"arm64", "math/bits", "Sub"}: struct{}{}, ++ {"arm64", "math/bits", "Sub64"}: struct{}{}, ++ {"arm64", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"arm64", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"arm64", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"arm64", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"arm64", "runtime", "KeepAlive"}: struct{}{}, ++ {"arm64", "runtime", "getcallerpc"}: struct{}{}, ++ {"arm64", "runtime", "getcallersp"}: struct{}{}, ++ {"arm64", "runtime", "getclosureptr"}: struct{}{}, ++ {"arm64", "runtime", "publicationBarrier"}: struct{}{}, ++ {"arm64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"arm64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"arm64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"arm64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"loong64", 
"internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"loong64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"loong64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"loong64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"loong64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"loong64", "math", "Abs"}: struct{}{}, ++ {"loong64", "math", "Copysign"}: struct{}{}, ++ {"loong64", "math", "sqrt"}: struct{}{}, ++ {"loong64", "math/big", "mulWW"}: struct{}{}, ++ {"loong64", "math/bits", "Add"}: struct{}{}, ++ {"loong64", "math/bits", "Add64"}: struct{}{}, ++ {"loong64", "math/bits", "Mul"}: struct{}{}, ++ {"loong64", "math/bits", "Mul64"}: struct{}{}, ++ {"loong64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"loong64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"loong64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"loong64", "math/bits", "Sub"}: struct{}{}, ++ {"loong64", "math/bits", "Sub64"}: struct{}{}, ++ {"loong64", "runtime", "KeepAlive"}: struct{}{}, ++ {"loong64", "runtime", "getcallerpc"}: struct{}{}, ++ {"loong64", "runtime", "getcallersp"}: struct{}{}, ++ {"loong64", "runtime", "getclosureptr"}: struct{}{}, ++ {"loong64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"loong64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"loong64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"loong64", 
"sync/atomic", "LoadInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"loong64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"mips", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"mips", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"mips", "math", "Abs"}: struct{}{}, ++ {"mips", "math", "sqrt"}: struct{}{}, ++ {"mips", "math/bits", "Len"}: struct{}{}, ++ {"mips", "math/bits", "Len16"}: struct{}{}, ++ {"mips", "math/bits", "Len32"}: struct{}{}, ++ {"mips", "math/bits", "Len64"}: struct{}{}, ++ {"mips", "math/bits", "Len8"}: struct{}{}, ++ {"mips", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"mips", "math/bits", "TrailingZeros32"}: struct{}{}, ++ 
{"mips", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"mips", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"mips", "runtime", "KeepAlive"}: struct{}{}, ++ {"mips", "runtime", "getcallerpc"}: struct{}{}, ++ {"mips", "runtime", "getcallersp"}: struct{}{}, ++ {"mips", "runtime", "getclosureptr"}: struct{}{}, ++ {"mips", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"mips", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"mips", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"mips", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"mips", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"mips", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"mips", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ 
{"mips64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"mips64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"mips64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"mips64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"mips64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"mips64", "math", "Abs"}: struct{}{}, ++ {"mips64", "math", "sqrt"}: struct{}{}, ++ {"mips64", "math/big", "mulWW"}: struct{}{}, ++ {"mips64", "math/bits", "Add"}: struct{}{}, ++ {"mips64", "math/bits", "Add64"}: struct{}{}, ++ {"mips64", "math/bits", "Mul"}: struct{}{}, ++ {"mips64", "math/bits", "Mul64"}: struct{}{}, ++ {"mips64", "math/bits", "Sub"}: struct{}{}, ++ {"mips64", "math/bits", "Sub64"}: struct{}{}, ++ {"mips64", "runtime", "KeepAlive"}: struct{}{}, ++ {"mips64", "runtime", "getcallerpc"}: struct{}{}, ++ {"mips64", "runtime", "getcallersp"}: struct{}{}, ++ {"mips64", "runtime", "getclosureptr"}: struct{}{}, ++ {"mips64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"mips64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"mips64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"mips64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Casint64"}: 
struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"mips64le", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"mips64le", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"mips64le", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"mips64le", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"mips64le", "math", "Abs"}: struct{}{}, ++ {"mips64le", "math", "sqrt"}: struct{}{}, ++ {"mips64le", "math/big", "mulWW"}: struct{}{}, ++ {"mips64le", "math/bits", "Add"}: struct{}{}, ++ {"mips64le", "math/bits", "Add64"}: struct{}{}, ++ {"mips64le", "math/bits", "Mul"}: struct{}{}, ++ {"mips64le", "math/bits", "Mul64"}: struct{}{}, ++ {"mips64le", "math/bits", "Sub"}: struct{}{}, ++ {"mips64le", "math/bits", "Sub64"}: struct{}{}, ++ {"mips64le", "runtime", "KeepAlive"}: struct{}{}, ++ {"mips64le", "runtime", "getcallerpc"}: struct{}{}, ++ {"mips64le", "runtime", "getcallersp"}: struct{}{}, ++ {"mips64le", "runtime", "getclosureptr"}: struct{}{}, ++ {"mips64le", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"mips64le", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"mips64le", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "AddInt64"}: struct{}{}, ++ 
{"mips64le", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"mips64le", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"mipsle", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"mipsle", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"mipsle", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"mipsle", 
"internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"mipsle", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"mipsle", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"mipsle", "math", "Abs"}: struct{}{}, ++ {"mipsle", "math", "sqrt"}: struct{}{}, ++ {"mipsle", "math/bits", "Len"}: struct{}{}, ++ {"mipsle", "math/bits", "Len16"}: struct{}{}, ++ {"mipsle", "math/bits", "Len32"}: struct{}{}, ++ {"mipsle", "math/bits", "Len64"}: struct{}{}, ++ {"mipsle", "math/bits", "Len8"}: struct{}{}, ++ {"mipsle", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"mipsle", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"mipsle", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"mipsle", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"mipsle", "runtime", "KeepAlive"}: struct{}{}, ++ {"mipsle", "runtime", "getcallerpc"}: struct{}{}, ++ {"mipsle", "runtime", "getcallersp"}: struct{}{}, ++ {"mipsle", "runtime", "getclosureptr"}: struct{}{}, ++ {"mipsle", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"mipsle", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"mipsle", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"mipsle", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"mipsle", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"mipsle", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"ppc64", 
"internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"ppc64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"ppc64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"ppc64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "Prefetch"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"ppc64", "math", "Abs"}: struct{}{}, ++ {"ppc64", "math", "Ceil"}: struct{}{}, ++ {"ppc64", "math", "Copysign"}: struct{}{}, ++ {"ppc64", "math", "FMA"}: struct{}{}, ++ {"ppc64", "math", "Floor"}: struct{}{}, ++ {"ppc64", "math", "Round"}: struct{}{}, ++ {"ppc64", "math", "Trunc"}: struct{}{}, ++ {"ppc64", "math", "sqrt"}: struct{}{}, ++ {"ppc64", "math/big", "mulWW"}: struct{}{}, ++ {"ppc64", "math/bits", "Add"}: struct{}{}, ++ {"ppc64", "math/bits", "Add64"}: struct{}{}, ++ {"ppc64", "math/bits", "Len"}: struct{}{}, ++ {"ppc64", "math/bits", "Len16"}: struct{}{}, ++ {"ppc64", "math/bits", "Len32"}: struct{}{}, ++ {"ppc64", "math/bits", "Len64"}: struct{}{}, ++ {"ppc64", "math/bits", "Len8"}: struct{}{}, ++ {"ppc64", "math/bits", "Mul"}: struct{}{}, ++ {"ppc64", "math/bits", "Mul64"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount16"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount32"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount64"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount8"}: struct{}{}, ++ {"ppc64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"ppc64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"ppc64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"ppc64", "math/bits", "Sub"}: struct{}{}, ++ {"ppc64", "math/bits", "Sub64"}: struct{}{}, ++ {"ppc64", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"ppc64", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"ppc64", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"ppc64", "runtime", "KeepAlive"}: struct{}{}, ++ {"ppc64", "runtime", "getcallerpc"}: struct{}{}, ++ {"ppc64", "runtime", "getcallersp"}: struct{}{}, ++ {"ppc64", "runtime", "getclosureptr"}: struct{}{}, ++ {"ppc64", "runtime", "publicationBarrier"}: struct{}{}, ++ {"ppc64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"ppc64", "sync", 
"runtime_LoadAcquintptr"}: struct{}{}, ++ {"ppc64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"ppc64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Storeint64"}: 
struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"ppc64le", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Prefetch"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"ppc64le", "math", "Abs"}: struct{}{}, ++ {"ppc64le", "math", "Ceil"}: struct{}{}, ++ {"ppc64le", "math", "Copysign"}: struct{}{}, ++ {"ppc64le", "math", "FMA"}: struct{}{}, ++ {"ppc64le", "math", "Floor"}: struct{}{}, ++ {"ppc64le", "math", "Round"}: struct{}{}, ++ {"ppc64le", "math", "Trunc"}: struct{}{}, ++ {"ppc64le", "math", "sqrt"}: struct{}{}, ++ {"ppc64le", "math/big", "mulWW"}: struct{}{}, ++ {"ppc64le", "math/bits", "Add"}: struct{}{}, ++ {"ppc64le", "math/bits", "Add64"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len16"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len32"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len64"}: struct{}{}, ++ {"ppc64le", "math/bits", "Len8"}: struct{}{}, ++ {"ppc64le", "math/bits", "Mul"}: struct{}{}, ++ {"ppc64le", "math/bits", "Mul64"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount16"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount32"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount64"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount8"}: struct{}{}, ++ {"ppc64le", "math/bits", "RotateLeft"}: struct{}{}, ++ {"ppc64le", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"ppc64le", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"ppc64le", "math/bits", "Sub"}: struct{}{}, ++ {"ppc64le", "math/bits", "Sub64"}: struct{}{}, ++ {"ppc64le", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"ppc64le", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"ppc64le", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"ppc64le", "runtime", "KeepAlive"}: struct{}{}, ++ {"ppc64le", "runtime", "getcallerpc"}: struct{}{}, ++ {"ppc64le", "runtime", "getcallersp"}: struct{}{}, ++ {"ppc64le", "runtime", "getclosureptr"}: struct{}{}, ++ {"ppc64le", "runtime", "publicationBarrier"}: struct{}{}, ++ {"ppc64le", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"ppc64le", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"ppc64le", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "AddUint64"}: struct{}{}, ++ 
{"ppc64le", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"ppc64le", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "And"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xadd64"}: 
struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"riscv64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"riscv64", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"riscv64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"riscv64", "math", "Abs"}: struct{}{}, ++ {"riscv64", "math", "Copysign"}: struct{}{}, ++ {"riscv64", "math", "FMA"}: struct{}{}, ++ {"riscv64", "math", "sqrt"}: struct{}{}, ++ {"riscv64", "math/big", "mulWW"}: struct{}{}, ++ {"riscv64", "math/bits", "Add"}: struct{}{}, ++ {"riscv64", "math/bits", "Add64"}: struct{}{}, ++ {"riscv64", "math/bits", "Mul"}: struct{}{}, ++ {"riscv64", "math/bits", "Mul64"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft16"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"riscv64", "math/bits", "RotateLeft8"}: struct{}{}, ++ {"riscv64", "math/bits", "Sub"}: struct{}{}, ++ {"riscv64", "math/bits", "Sub64"}: struct{}{}, ++ {"riscv64", "runtime", "KeepAlive"}: struct{}{}, ++ {"riscv64", "runtime", "getcallerpc"}: struct{}{}, ++ {"riscv64", "runtime", "getcallersp"}: struct{}{}, ++ {"riscv64", "runtime", "getclosureptr"}: struct{}{}, ++ {"riscv64", "runtime", "publicationBarrier"}: struct{}{}, ++ {"riscv64", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"riscv64", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"riscv64", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"riscv64", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"s390x", 
"internal/runtime/atomic", "And"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "And8"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Cas"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Cas64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "CasRel"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Casint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Casint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Casp1"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Casuintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Load"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Load64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Load8"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "LoadAcq"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "LoadAcq64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "LoadAcquintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loadint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loadint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loadp"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loaduint"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Loaduintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Or"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Or8"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Store"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Store64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Store8"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "StoreRel"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "StoreRel64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "StoreReluintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Storeint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Storeint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "StorepNoWB"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Storeuintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xadd"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xadd64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xaddint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xaddint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xadduintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchg"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchg64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, ++ {"s390x", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, ++ {"s390x", "internal/runtime/math", "Add64"}: struct{}{}, ++ {"s390x", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"s390x", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"s390x", "math", "Ceil"}: struct{}{}, ++ {"s390x", "math", "FMA"}: struct{}{}, ++ {"s390x", "math", "Floor"}: struct{}{}, ++ {"s390x", "math", "Round"}: struct{}{}, ++ {"s390x", "math", "RoundToEven"}: struct{}{}, ++ 
{"s390x", "math", "Trunc"}: struct{}{}, ++ {"s390x", "math", "sqrt"}: struct{}{}, ++ {"s390x", "math/big", "mulWW"}: struct{}{}, ++ {"s390x", "math/bits", "Add"}: struct{}{}, ++ {"s390x", "math/bits", "Add64"}: struct{}{}, ++ {"s390x", "math/bits", "Len"}: struct{}{}, ++ {"s390x", "math/bits", "Len16"}: struct{}{}, ++ {"s390x", "math/bits", "Len32"}: struct{}{}, ++ {"s390x", "math/bits", "Len64"}: struct{}{}, ++ {"s390x", "math/bits", "Len8"}: struct{}{}, ++ {"s390x", "math/bits", "Mul"}: struct{}{}, ++ {"s390x", "math/bits", "Mul64"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount16"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount32"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount64"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount8"}: struct{}{}, ++ {"s390x", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"s390x", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"s390x", "math/bits", "RotateLeft"}: struct{}{}, ++ {"s390x", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"s390x", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"s390x", "math/bits", "Sub"}: struct{}{}, ++ {"s390x", "math/bits", "Sub64"}: struct{}{}, ++ {"s390x", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"s390x", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"s390x", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"s390x", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"s390x", "runtime", "KeepAlive"}: struct{}{}, ++ {"s390x", "runtime", "getcallerpc"}: struct{}{}, ++ {"s390x", "runtime", "getcallersp"}: struct{}{}, ++ {"s390x", "runtime", "getclosureptr"}: struct{}{}, ++ {"s390x", "runtime", "slicebytetostringtmp"}: struct{}{}, ++ {"s390x", "sync", "runtime_LoadAcquintptr"}: struct{}{}, ++ {"s390x", "sync", "runtime_StoreReluintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "AddUintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "CompareAndSwapUintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadPointer"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "LoadUintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "StoreUintptr"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapInt32"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapInt64"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapUint32"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapUint64"}: struct{}{}, ++ {"s390x", "sync/atomic", "SwapUintptr"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "Len64"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "Len8"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "OnesCount64"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, ++ {"wasm", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ 
{"wasm", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"wasm", "math", "Abs"}: struct{}{}, ++ {"wasm", "math", "Ceil"}: struct{}{}, ++ {"wasm", "math", "Copysign"}: struct{}{}, ++ {"wasm", "math", "Floor"}: struct{}{}, ++ {"wasm", "math", "RoundToEven"}: struct{}{}, ++ {"wasm", "math", "Trunc"}: struct{}{}, ++ {"wasm", "math", "sqrt"}: struct{}{}, ++ {"wasm", "math/bits", "Len"}: struct{}{}, ++ {"wasm", "math/bits", "Len16"}: struct{}{}, ++ {"wasm", "math/bits", "Len32"}: struct{}{}, ++ {"wasm", "math/bits", "Len64"}: struct{}{}, ++ {"wasm", "math/bits", "Len8"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount16"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount32"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount64"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount8"}: struct{}{}, ++ {"wasm", "math/bits", "RotateLeft"}: struct{}{}, ++ {"wasm", "math/bits", "RotateLeft32"}: struct{}{}, ++ {"wasm", "math/bits", "RotateLeft64"}: struct{}{}, ++ {"wasm", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"wasm", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"wasm", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"wasm", "math/bits", "TrailingZeros8"}: struct{}{}, ++ {"wasm", "runtime", "KeepAlive"}: struct{}{}, ++ {"wasm", "runtime", "getcallerpc"}: struct{}{}, ++ {"wasm", "runtime", "getcallersp"}: struct{}{}, ++ {"wasm", "runtime", "getclosureptr"}: struct{}{}, ++ {"wasm", "runtime", "slicebytetostringtmp"}: struct{}{}, ++} ++ ++var wantIntrinsicsPower10 = map[testIntrinsicKey]struct{}{ ++ {"ppc64", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"ppc64", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes64"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Bswap32"}: struct{}{}, ++ {"ppc64le", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes64"}: struct{}{}, ++} ++ ++func TestIntrinsics(t *testing.T) { ++ initIntrinsics() ++ ++ want := make(map[testIntrinsicKey]struct{}) ++ for ik, iv := range wantIntrinsics { ++ want[ik] = iv ++ } ++ if buildcfg.GOPPC64 >= 10 { ++ for ik, iv := range wantIntrinsicsPower10 { ++ want[ik] = iv ++ } ++ } ++ ++ got := make(map[testIntrinsicKey]struct{}) ++ for ik, _ := range intrinsics { ++ got[testIntrinsicKey{ik.arch.Name, ik.pkg, ik.fn}] = struct{}{} ++ } ++ for ik, _ := range got { ++ if _, found := want[ik]; !found { ++ t.Errorf("Got unwanted intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) ++ } ++ } ++ for ik, _ := range want { ++ if _, found := got[ik]; !found { ++ t.Errorf("Want intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) ++ } ++ } ++} +-- +2.39.5 + diff --git a/2110-cmd-dist-internal-add-GOARM64-environment-variable.patch b/2110-cmd-dist-internal-add-GOARM64-environment-variable.patch new file mode 100644 index 0000000..847f7bb --- /dev/null +++ b/2110-cmd-dist-internal-add-GOARM64-environment-variable.patch @@ -0,0 +1,232 @@ +From 8891253e664aabdeb96a60aab2097b34076f0b18 Mon Sep 17 00:00:00 2001 +From: Andrey Bokhanko +Date: Fri, 26 Sep 2025 17:47:28 +0800 +Subject: [PATCH 110/119] cmd/dist,internal: add GOARM64 environment variable + +Adds GOARM64 environment variable with accepted range of values "v8.{0-9}", +"v9.{0-5}" and optional ",lse" and ",crypto" suffixes. 
+ +Right now it doesn't affect anything, but can be used in the future to +selectively target specific versions of different ARM64 hardware. + +For #60905 + +Change-Id: I6d530041b6931aa884e34f719f8ec41b1cb03ece +Reviewed-on: https://go-review.googlesource.com/c/go/+/559555 +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mauri de Souza Meneguzzo +Reviewed-by: Cherry Mui +Reviewed-by: Shu-Chun Weng +Reviewed-by: Fannie Zhang +--- + src/cmd/go/alldocs.go | 2 + + src/cmd/go/internal/help/helpdoc.go | 2 + + src/internal/buildcfg/cfg.go | 29 ++++++++-- + src/internal/buildcfg/cfg_test.go | 85 +++++++++++++++++++++++++++++ + 4 files changed, 114 insertions(+), 4 deletions(-) + +diff --git a/src/cmd/go/alldocs.go b/src/cmd/go/alldocs.go +index 32e2ba15e9..fd9602a9b3 100644 +--- a/src/cmd/go/alldocs.go ++++ b/src/cmd/go/alldocs.go +@@ -1965,6 +1965,8 @@ + // correspond to the amd64.v1, amd64.v2, and amd64.v3 feature build tags. + // - For GOARCH=arm, GOARM=5, 6, and 7 + // correspond to the arm.5, arm.6, and arm.7 feature build tags. ++// - For GOARCH=arm64, GOARM64=v8.{0-9} and v9.{0-5} ++// correspond to the arm64.v8.{0-9} and arm64.v9.{0-5} feature build tags. + // - For GOARCH=mips or mipsle, + // GOMIPS=hardfloat and softfloat + // correspond to the mips.hardfloat and mips.softfloat +diff --git a/src/cmd/go/internal/help/helpdoc.go b/src/cmd/go/internal/help/helpdoc.go +index 12b667e9be..b5f820c159 100644 +--- a/src/cmd/go/internal/help/helpdoc.go ++++ b/src/cmd/go/internal/help/helpdoc.go +@@ -897,6 +897,8 @@ The defined architecture feature build tags are: + correspond to the amd64.v1, amd64.v2, and amd64.v3 feature build tags. + - For GOARCH=arm, GOARM=5, 6, and 7 + correspond to the arm.5, arm.6, and arm.7 feature build tags. ++ - For GOARCH=arm64, GOARM64=v8.{0-9} and v9.{0-5} ++ correspond to the arm64.v8.{0-9} and arm64.v9.{0-5} feature build tags. + - For GOARCH=mips or mipsle, + GOMIPS=hardfloat and softfloat + correspond to the mips.hardfloat and mips.softfloat +diff --git a/src/internal/buildcfg/cfg.go b/src/internal/buildcfg/cfg.go +index f6fb2d232f..97d078e954 100644 +--- a/src/internal/buildcfg/cfg.go ++++ b/src/internal/buildcfg/cfg.go +@@ -93,6 +93,12 @@ type Goarm64Features struct { + Version string + // Large System Extension + LSE bool ++ // ARM v8.0 Cryptographic Extension. It includes the following features: ++ // * FEAT_AES, which includes the AESD and AESE instructions. ++ // * FEAT_PMULL, which includes the PMULL, PMULL2 instructions. ++ // * FEAT_SHA1, which includes the SHA1* instructions. ++ // * FEAT_SHA256, which includes the SHA256* instructions. 
++ Crypto bool + // Kunpeng atomic optimize + KPAtomicOpt bool + } +@@ -102,7 +108,9 @@ func (g Goarm64Features) String() string { + if g.LSE { + arm64Str += ",lse" + } +- ++ if g.Crypto { ++ arm64Str += ",crypto" ++ } + if g.KPAtomicOpt { + arm64Str += ",kpatomicopt" + } +@@ -113,10 +121,12 @@ func (g Goarm64Features) String() string { + func ParseGoarm64(v string) (g Goarm64Features, e error) { + const ( + lseOpt = ",lse" ++ cryptoOpt = ",crypto" + kpAtomicOpt = ",kpatomicopt" + ) + + g.LSE = false ++ g.Crypto = false + g.KPAtomicOpt = false + + // We allow any combination of suffixes, in any order +@@ -127,6 +137,12 @@ func ParseGoarm64(v string) (g Goarm64Features, e error) { + continue + } + ++ if strings.HasSuffix(v, cryptoOpt) { ++ g.Crypto = true ++ v = v[:len(v)-len(cryptoOpt)] ++ continue ++ } ++ + if strings.HasSuffix(v, kpAtomicOpt) { + if os.Getenv("AI_OPT") == "1" { + g.KPAtomicOpt = true +@@ -139,12 +155,16 @@ func ParseGoarm64(v string) (g Goarm64Features, e error) { + } + + switch v { +- case "v8.0", "v8.1", "v8.2", "v8.3", "v8.4", "v8.5", "v8.6", "v8.7", "v8.8", "v8.9", +- "v9.0", "v9.1", "v9.2", "v9.4", "v9.5": ++ case "v8.0": ++ g.Version = v ++ case "v8.1", "v8.2", "v8.3", "v8.4", "v8.5", "v8.6", "v8.7", "v8.8", "v8.9", ++ "v9.0", "v9.1", "v9.2", "v9.3", "v9.4", "v9.5": + g.Version = v ++ // LSE extension is mandatory starting from 8.1 ++ g.LSE = true + default: + e = fmt.Errorf("invalid GOARM64: must start with v8.{0-9} or v9.{0-5} and may optionally end in %q and/or %q", +- lseOpt, kpAtomicOpt) ++ lseOpt, cryptoOpt, kpAtomicOpt) + g.Version = defaultGOARM64 + } + +@@ -177,6 +197,7 @@ func (g Goarm64Features) Supports(s string) bool { + if major == g_major { + return minor <= g_minor + } else if g_major == '9' { ++ // v9.0 diverged from v8.5. This means we should compare with g_minor increased by five. 
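// For illustration (not part of the patch): a minimal sketch of what ParseGoarm64
// accepts after this change, assuming the internal/buildcfg package context; the
// exampleParseGoarm64 name is hypothetical and the inputs are examples, with the
// behaviour following the suffix handling and switch in the hunk above.
func exampleParseGoarm64() {
	g, err := ParseGoarm64("v8.0,lse,crypto")
	if err != nil || g.Version != "v8.0" || !g.LSE || !g.Crypto {
		panic("suffixes should set LSE and Crypto on a v8.0 base")
	}

	g, err = ParseGoarm64("v9.3") // newly accepted by this patch
	if err != nil || !g.LSE {
		panic("LSE is implied from v8.1 onwards")
	}

	if _, err := ParseGoarm64("v7.0"); err == nil {
		panic("only v8.{0-9} and v9.{0-5}, plus optional suffixes, are valid")
	}
}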
+ return minor <= g_minor+5 + } else { + return false +diff --git a/src/internal/buildcfg/cfg_test.go b/src/internal/buildcfg/cfg_test.go +index 1513cdc9b0..67f9d82d92 100644 +--- a/src/internal/buildcfg/cfg_test.go ++++ b/src/internal/buildcfg/cfg_test.go +@@ -41,4 +41,89 @@ func TestConfigFlags(t *testing.T) { + if _ = goriscv64(); Error == nil { + t.Errorf("Wrong parsing of RISCV64=rva22") + } ++ Error = nil ++ os.Setenv("GOARM64", "v7.0") ++ if _ = goarm64(); Error == nil { ++ t.Errorf("Wrong parsing of GOARM64=7.0") ++ } ++ Error = nil ++ os.Setenv("GOARM64", "8.0") ++ if _ = goarm64(); Error == nil { ++ t.Errorf("Wrong parsing of GOARM64=8.0") ++ } ++ Error = nil ++ os.Setenv("GOARM64", "v8.0,lsb") ++ if _ = goarm64(); Error == nil { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,lsb") ++ } ++ os.Setenv("GOARM64", "v8.0,lse") ++ if goarm64().Version != "v8.0" || goarm64().LSE != true || goarm64().Crypto != false { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,lse") ++ } ++ os.Setenv("GOARM64", "v8.0,crypto") ++ if goarm64().Version != "v8.0" || goarm64().LSE != false || goarm64().Crypto != true { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,crypto") ++ } ++ os.Setenv("GOARM64", "v8.0,crypto,lse") ++ if goarm64().Version != "v8.0" || goarm64().LSE != true || goarm64().Crypto != true { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,crypto,lse") ++ } ++ os.Setenv("GOARM64", "v8.0,lse,crypto") ++ if goarm64().Version != "v8.0" || goarm64().LSE != true || goarm64().Crypto != true { ++ t.Errorf("Wrong parsing of GOARM64=v8.0,lse,crypto") ++ } ++ os.Setenv("GOARM64", "v9.0") ++ if goarm64().Version != "v9.0" || goarm64().LSE != true || goarm64().Crypto != false { ++ t.Errorf("Wrong parsing of GOARM64=v9.0") ++ } ++} ++ ++func TestGoarm64FeaturesSupports(t *testing.T) { ++ g := parseGoarm64("v9.3") ++ ++ if !g.Supports("v9.3") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, v9.3") ++ } ++ ++ if g.Supports("v9.4") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, v9.4") ++ } ++ ++ if !g.Supports("v8.8") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, v8.8") ++ } ++ ++ if g.Supports("v8.9") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, v8.9") ++ } ++ ++ if g.Supports(",lse") { ++ t.Errorf("Wrong goarm64Features.Supports for v9.3, ,lse") ++ } ++} ++ ++func TestGogoarchTags(t *testing.T) { ++ old_goarch := GOARCH ++ old_goarm64 := GOARM64 ++ ++ GOARCH = "arm64" ++ ++ os.Setenv("GOARM64", "v9.5") ++ GOARM64 = goarm64() ++ tags := gogoarchTags() ++ want := []string{"arm64.v9.0", "arm64.v9.1", "arm64.v9.2", "arm64.v9.3", "arm64.v9.4", "arm64.v9.5", ++ "arm64.v8.0", "arm64.v8.1", "arm64.v8.2", "arm64.v8.3", "arm64.v8.4", "arm64.v8.5", "arm64.v8.6", "arm64.v8.7", "arm64.v8.8", "arm64.v8.9"} ++ if len(tags) != len(want) { ++ t.Errorf("Wrong number of tags for GOARM64=v9.5") ++ } else { ++ for i, v := range tags { ++ if v != want[i] { ++ t.Error("Wrong tags for GOARM64=v9.5") ++ break ++ } ++ } ++ } ++ ++ GOARCH = old_goarch ++ GOARM64 = old_goarm64 + } +-- +2.39.5 + diff --git a/2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch b/2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch new file mode 100644 index 0000000..8975dbe --- /dev/null +++ b/2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch @@ -0,0 +1,706 @@ +From 6f517fe363d2bc8ce99f839f6736f6802790fa1a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:20 +0800 +Subject: [PATCH 111/119] cmd/compile/internal/ssagen: provide + intrinsicBuilders + +Create an 
intrinsicBuilders type that has functions for adding and +looking up intrinsics. This makes the implementation more self contained, +readable and testable. Additionally, pass an *intrinsicBuildConfig to +initIntrinsics to improve testability without needing to modify package +level variables. + +Change-Id: I0ee0a19c192dd6da9f1c5f1c29b98a3ad8161fe2 +Reviewed-on: https://go-review.googlesource.com/c/go/+/605478 +Reviewed-by: David Chase +Reviewed-by: Keith Randall +Auto-Submit: Joel Sing +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/ssagen/intrinsics.go | 227 ++++++++---------- + .../internal/ssagen/intrinsics_test.go | 195 +++++++++------ + src/cmd/compile/internal/ssagen/ssa.go | 2 +- + 3 files changed, 224 insertions(+), 200 deletions(-) + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index 59eb1869bb..c62837cd5b 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -15,7 +15,7 @@ import ( + "cmd/internal/sys" + ) + +-var intrinsics map[intrinsicKey]intrinsicBuilder ++var intrinsics intrinsicBuilders + + // An intrinsicBuilder converts a call node n into an ssa value that + // implements that call as an intrinsic. args is a list of arguments to the func. +@@ -27,8 +27,80 @@ type intrinsicKey struct { + fn string + } + +-func initIntrinsics() { +- intrinsics = map[intrinsicKey]intrinsicBuilder{} ++// intrinsicBuildConfig specifies the config to use for intrinsic building. ++type intrinsicBuildConfig struct { ++ instrumenting bool ++ ++ go386 string ++ goamd64 int ++ goarm int ++ goarm64 buildcfg.Goarm64Features ++ gomips string ++ gomips64 string ++ goppc64 int ++ goriscv64 int ++} ++ ++type intrinsicBuilders map[intrinsicKey]intrinsicBuilder ++ ++// add adds the intrinsic builder b for pkg.fn for the given architecture. ++func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) { ++ ib[intrinsicKey{arch, pkg, fn}] = b ++} ++ ++// addForArchs adds the intrinsic builder b for pkg.fn for the given architectures. ++func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { ++ for _, arch := range archs { ++ ib.add(arch, pkg, fn, b) ++ } ++} ++ ++// addForFamilies does the same as addForArchs but operates on architecture families. ++func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { ++ for _, arch := range sys.Archs { ++ if arch.InFamily(archFamilies...) { ++ intrinsics.add(arch, pkg, fn, b) ++ } ++ } ++} ++ ++// alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs ++// for which targetPkg.targetFn already exists. ++func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) { ++ // TODO(jsing): Consider making this work even if the alias is added ++ // before the intrinsic. ++ aliased := false ++ for _, arch := range archs { ++ if b := intrinsics.lookup(arch, targetPkg, targetFn); b != nil { ++ intrinsics.add(arch, pkg, fn, b) ++ aliased = true ++ } ++ } ++ if !aliased { ++ panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn)) ++ } ++} ++ ++// lookup looks up the intrinsic for a pkg.fn on the specified architecture. 
++func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder { ++ return intrinsics[intrinsicKey{arch, pkg, fn}] ++} ++ ++func initIntrinsics(cfg *intrinsicBuildConfig) { ++ if cfg == nil { ++ cfg = &intrinsicBuildConfig{ ++ instrumenting: base.Flag.Cfg.Instrumenting, ++ go386: buildcfg.GO386, ++ goamd64: buildcfg.GOAMD64, ++ goarm: buildcfg.GOARM, ++ goarm64: buildcfg.GOARM64, ++ gomips: buildcfg.GOMIPS, ++ gomips64: buildcfg.GOMIPS64, ++ goppc64: buildcfg.GOPPC64, ++ goriscv64: buildcfg.GORISCV64, ++ } ++ } ++ intrinsics = intrinsicBuilders{} + + var p4 []*sys.Arch + var p8 []*sys.Arch +@@ -45,36 +117,18 @@ func initIntrinsics() { + } + all := sys.Archs[:] + +- // add adds the intrinsic b for pkg.fn for the given list of architectures. + add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { +- for _, a := range archs { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- } ++ intrinsics.addForArchs(pkg, fn, b, archs...) + } +- // addF does the same as add but operates on architecture families. + addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { +- for _, a := range sys.Archs { +- if a.InFamily(archFamilies...) { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- } +- } ++ intrinsics.addForFamilies(pkg, fn, b, archFamilies...) + } +- // alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists. + alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) { +- aliased := false +- for _, a := range archs { +- if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok { +- intrinsics[intrinsicKey{a, pkg, fn}] = b +- aliased = true +- } +- } +- if !aliased { +- panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn)) +- } ++ intrinsics.alias(pkg, fn, pkg2, fn2, archs...) + } + + /******** runtime ********/ +- if !base.Flag.Cfg.Instrumenting { ++ if !cfg.instrumenting { + add("runtime", "slicebytetostringtmp", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + // Compiler frontend optimizations emit OBYTES2STRTMP nodes +@@ -125,18 +179,18 @@ func initIntrinsics() { + sys.ARM64, sys.PPC64, sys.RISCV64) + + brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} +- if buildcfg.GOPPC64 >= 10 { ++ if cfg.goppc64 >= 10 { + // Use only on Power10 as the new byte reverse instructions that Power10 provide + // make it worthwhile as an intrinsic + brev_arch = append(brev_arch, sys.PPC64) + } +- /******** internal/runtime/sys ********/ +- addF("internal/runtime/sys", "Bswap32", ++ /******** runtime/internal/sys ********/ ++ addF("runtime/internal/sys", "Bswap32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) + }, + brev_arch...) 
+- addF("internal/runtime/sys", "Bswap64", ++ addF("runtime/internal/sys", "Bswap64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) + }, +@@ -152,9 +206,9 @@ func initIntrinsics() { + + // Make Prefetch intrinsics for supported platforms + // On the unsupported platforms stub function will be eliminated +- addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache), ++ addF("runtime/internal/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache), + sys.AMD64, sys.ARM64, sys.PPC64) +- addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed), ++ addF("runtime/internal/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed), + sys.AMD64, sys.ARM64, sys.PPC64) + + /******** internal/runtime/atomic ********/ +@@ -258,7 +312,7 @@ func initIntrinsics() { + makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder { + + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOARM64.LSE || buildcfg.GOARM64.KPAtomicOpt { ++ if cfg.goarm64.LSE || cfg.goarm64.KPAtomicOpt { + emit(s, n, args, op1, typ, needReturn) + } else { + // Target Atomic feature is identified by dynamic detection +@@ -297,9 +351,6 @@ func initIntrinsics() { + makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { + return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true) + } +- makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { +- return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false) +- } + + atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { + v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem()) +@@ -400,67 +451,6 @@ func initIntrinsics() { + }, + sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) + +- // arm64 always uses the new-style atomic logical operations, for both the +- // old and new style API. 
+- addF("internal/runtime/atomic", "And8", +- makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "Or8", +- makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "And64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "And32", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "And", +- makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "Or64", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "Or32", +- makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), +- sys.ARM64) +- addF("internal/runtime/atomic", "Or", +- makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), +- sys.ARM64) +- +- // New-style atomic logical operations, which return the old memory value. +- addF("internal/runtime/atomic", "And64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) +- p0, p1 := s.split(v) +- s.vars[memVar] = p1 +- return p0 +- }, +- sys.AMD64) +- addF("internal/runtime/atomic", "And32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) +- p0, p1 := s.split(v) +- s.vars[memVar] = p1 +- return p0 +- }, +- sys.AMD64) +- addF("internal/runtime/atomic", "Or64", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) +- p0, p1 := s.split(v) +- s.vars[memVar] = p1 +- return p0 +- }, +- sys.AMD64) +- addF("internal/runtime/atomic", "Or32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) +- p0, p1 := s.split(v) +- s.vars[memVar] = p1 +- return p0 +- }, +- sys.AMD64) +- + // Aliases for atomic load operations + alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...) + alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...) +@@ -508,9 +498,9 @@ func initIntrinsics() { + alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...) + alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...) 
+ +- // Aliases for atomic And/Or operations +- alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64) +- alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64) ++ // // Aliases for atomic And/Or operations ++ // alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64) ++ // alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64) + + /******** math ********/ + addF("math", "sqrt", +@@ -565,7 +555,7 @@ func initIntrinsics() { + return s.variable(n, types.Types[types.TFLOAT64]) + } + +- if buildcfg.GOAMD64 >= 3 { ++ if cfg.goamd64 >= 3 { + return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) + } + +@@ -631,7 +621,7 @@ func initIntrinsics() { + + makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOAMD64 >= 2 { ++ if cfg.goamd64 >= 2 { + return s.newValue1(op, types.Types[types.TFLOAT64], args[0]) + } + +@@ -727,12 +717,12 @@ func initIntrinsics() { + return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) + }, + sys.S390X) +- alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...) +- alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...) ++ alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) ++ alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) + // ReverseBytes inlines correctly, no need to intrinsify it. + // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate + // On Power10, 16-bit rotate is not available so use BRH instruction +- if buildcfg.GOPPC64 >= 10 { ++ if cfg.goppc64 >= 10 { + addF("math/bits", "ReverseBytes16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) +@@ -847,7 +837,7 @@ func initIntrinsics() { + + makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if buildcfg.GOAMD64 >= 2 { ++ if cfg.goamd64 >= 2 { + return s.newValue1(op, types.Types[types.TINT], args[0]) + } + +@@ -941,12 +931,12 @@ func initIntrinsics() { + sys.AMD64) + alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64) + +- alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...) +- alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...) +- alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...) +- alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...) +- alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...) +- alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...) ++ alias("runtime/internal/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...) ++ alias("runtime/internal/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...) ++ alias("runtime/internal/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...) ++ alias("runtime/internal/sys", "Len8", "math/bits", "Len8", all...) ++ alias("runtime/internal/sys", "Len64", "math/bits", "Len64", all...) ++ alias("runtime/internal/sys", "OnesCount64", "math/bits", "OnesCount64", all...) 
+ + /******** sync/atomic ********/ + +@@ -988,17 +978,6 @@ func initIntrinsics() { + alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...) + alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...) + +- alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) +- alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) +- + /******** math/big ********/ + alias("math/big", "mulWW", "math/bits", "Mul64", p8...) + } +@@ -1032,14 +1011,14 @@ func findIntrinsic(sym *types.Sym) intrinsicBuilder { + return nil + } + } +- return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}] ++ return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn) + } + + func IsIntrinsicCall(n *ir.CallExpr) bool { + if n == nil { + return false + } +- name, ok := n.Fun.(*ir.Name) ++ name, ok := n.X.(*ir.Name) + if !ok { + return false + } +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index 74ea276cc0..d725e4092a 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -7,6 +7,8 @@ package ssagen + import ( + "internal/buildcfg" + "testing" ++ ++ "cmd/internal/sys" + ) + + type testIntrinsicKey struct { +@@ -17,11 +19,11 @@ type testIntrinsicKey struct { + + var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"386", "internal/runtime/math", "MulUintptr"}: struct{}{}, +- {"386", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"386", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"386", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"386", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"386", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"386", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"386", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"386", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"386", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"386", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"386", "math", "sqrt"}: struct{}{}, + {"386", "math/bits", "ReverseBytes32"}: struct{}{}, + {"386", "math/bits", "ReverseBytes64"}: struct{}{}, +@@ -79,16 +81,16 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"amd64", "internal/runtime/math", "Add64"}: struct{}{}, + {"amd64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"amd64", "internal/runtime/math", "MulUintptr"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "Len64"}: 
struct{}{}, +- {"amd64", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "Prefetch"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"amd64", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "Prefetch"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"amd64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"amd64", "math", "Ceil"}: struct{}{}, + {"amd64", "math", "FMA"}: struct{}{}, + {"amd64", "math", "Floor"}: struct{}{}, +@@ -157,13 +159,13 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"amd64", "sync/atomic", "SwapUint32"}: struct{}{}, + {"amd64", "sync/atomic", "SwapUint64"}: struct{}{}, + {"amd64", "sync/atomic", "SwapUintptr"}: struct{}{}, +- {"arm", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"arm", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"arm", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"arm", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"arm", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"arm", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"arm", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"arm", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"arm", "math", "Abs"}: struct{}{}, + {"arm", "math", "FMA"}: struct{}{}, + {"arm", "math", "sqrt"}: struct{}{}, +@@ -225,16 +227,16 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"arm64", "internal/runtime/math", "Add64"}: struct{}{}, + {"arm64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"arm64", "internal/runtime/math", "MulUintptr"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "Prefetch"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"arm64", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"arm64", 
"runtime/internal/sys", "Len8"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "Prefetch"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"arm64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"arm64", "math", "Abs"}: struct{}{}, + {"arm64", "math", "Ceil"}: struct{}{}, + {"arm64", "math", "FMA"}: struct{}{}, +@@ -423,11 +425,11 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"mips", "internal/runtime/atomic", "Xchg"}: struct{}{}, + {"mips", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, + {"mips", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, +- {"mips", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"mips", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"mips", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"mips", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"mips", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"mips", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"mips", "math", "Abs"}: struct{}{}, + {"mips", "math", "sqrt"}: struct{}{}, + {"mips", "math/bits", "Len"}: struct{}{}, +@@ -666,11 +668,11 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"mipsle", "internal/runtime/atomic", "Xchg"}: struct{}{}, + {"mipsle", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, + {"mipsle", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"mipsle", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"mipsle", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"mipsle", "math", "Abs"}: struct{}{}, + {"mipsle", "math", "sqrt"}: struct{}{}, + {"mipsle", "math/bits", "Len"}: struct{}{}, +@@ -748,13 +750,13 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"ppc64", "internal/runtime/math", "Add64"}: struct{}{}, + {"ppc64", "internal/runtime/math", "Mul64"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "Prefetch"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"ppc64", 
"runtime/internal/sys", "Prefetch"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, + {"ppc64", "math", "Abs"}: struct{}{}, + {"ppc64", "math", "Ceil"}: struct{}{}, + {"ppc64", "math", "Copysign"}: struct{}{}, +@@ -862,13 +864,13 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"ppc64le", "internal/runtime/math", "Add64"}: struct{}{}, + {"ppc64le", "internal/runtime/math", "Mul64"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Prefetch"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Prefetch"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, + {"ppc64le", "math", "Abs"}: struct{}{}, + {"ppc64le", "math", "Ceil"}: struct{}{}, + {"ppc64le", "math", "Copysign"}: struct{}{}, +@@ -1072,14 +1074,14 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"s390x", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"s390x", "internal/runtime/math", "Add64"}: struct{}{}, + {"s390x", "internal/runtime/math", "Mul64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "Bswap64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"s390x", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "Bswap64"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"s390x", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"s390x", "math", "Ceil"}: struct{}{}, + {"s390x", "math", "FMA"}: struct{}{}, + {"s390x", "math", "Floor"}: struct{}{}, +@@ -1145,12 +1147,12 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"s390x", "sync/atomic", "SwapUint32"}: struct{}{}, + {"s390x", "sync/atomic", "SwapUint64"}: struct{}{}, + {"s390x", "sync/atomic", "SwapUintptr"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "Len64"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "Len8"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "OnesCount64"}: struct{}{}, +- {"wasm", 
"internal/runtime/sys", "TrailingZeros32"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "TrailingZeros64"}: struct{}{}, +- {"wasm", "internal/runtime/sys", "TrailingZeros8"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "Len8"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "OnesCount64"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, ++ {"wasm", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"wasm", "math", "Abs"}: struct{}{}, + {"wasm", "math", "Ceil"}: struct{}{}, + {"wasm", "math", "Copysign"}: struct{}{}, +@@ -1182,20 +1184,20 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + } + + var wantIntrinsicsPower10 = map[testIntrinsicKey]struct{}{ +- {"ppc64", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"ppc64", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"ppc64", "math/bits", "ReverseBytes16"}: struct{}{}, + {"ppc64", "math/bits", "ReverseBytes32"}: struct{}{}, + {"ppc64", "math/bits", "ReverseBytes64"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Bswap32"}: struct{}{}, +- {"ppc64le", "internal/runtime/sys", "Bswap64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"ppc64le", "math/bits", "ReverseBytes16"}: struct{}{}, + {"ppc64le", "math/bits", "ReverseBytes32"}: struct{}{}, + {"ppc64le", "math/bits", "ReverseBytes64"}: struct{}{}, + } + + func TestIntrinsics(t *testing.T) { +- initIntrinsics() ++ initIntrinsics(nil) + + want := make(map[testIntrinsicKey]struct{}) + for ik, iv := range wantIntrinsics { +@@ -1222,3 +1224,46 @@ func TestIntrinsics(t *testing.T) { + } + } + } ++ ++func TestIntrinsicBuilders(t *testing.T) { ++ cfg := &intrinsicBuildConfig{} ++ initIntrinsics(cfg) ++ ++ for _, arch := range sys.Archs { ++ if intrinsics.lookup(arch, "runtime", "getcallersp") == nil { ++ t.Errorf("No intrinsic for runtime.getcallersp on arch %v", arch) ++ } ++ } ++ ++ if intrinsics.lookup(sys.ArchAMD64, "runtime", "slicebytetostringtmp") == nil { ++ t.Error("No intrinsic for runtime.slicebytetostringtmp") ++ } ++ ++ if intrinsics.lookup(sys.ArchRISCV64, "runtime", "publicationBarrier") == nil { ++ t.Errorf("No intrinsic for runtime.publicationBarrier on arch %v", sys.ArchRISCV64) ++ } ++ ++ if intrinsics.lookup(sys.ArchAMD64, "runtime/internal/sys", "Bswap32") == nil { ++ t.Errorf("No intrinsic for runtime/internal/sys.Bswap32 on arch %v", sys.ArchAMD64) ++ } ++ if intrinsics.lookup(sys.ArchAMD64, "runtime/internal/sys", "Bswap64") == nil { ++ t.Errorf("No intrinsic for runtime/internal/sys.Bswap64 on arch %v", sys.ArchAMD64) ++ } ++ ++ if intrinsics.lookup(sys.ArchPPC64, "runtime/internal/sys", "Bswap64") != nil { ++ t.Errorf("Found intrinsic for runtime/internal/sys.Bswap64 on arch %v", sys.ArchPPC64) ++ } ++ ++ cfg.goppc64 = 10 ++ cfg.instrumenting = true ++ ++ initIntrinsics(cfg) ++ ++ if intrinsics.lookup(sys.ArchAMD64, "runtime", "slicebytetostringtmp") != nil { ++ t.Error("Intrinsic incorrectly exists for runtime.slicebytetostringtmp") ++ } ++ ++ if intrinsics.lookup(sys.ArchPPC64, "runtime/internal/sys", "Bswap64") == nil { ++ t.Errorf("No intrinsic for runtime/internal/sys.Bswap64 on arch %v", sys.ArchPPC64) ++ } ++} +diff --git a/src/cmd/compile/internal/ssagen/ssa.go 
b/src/cmd/compile/internal/ssagen/ssa.go +index 0f6f2de4a7..71ad2df0a7 100644 +--- a/src/cmd/compile/internal/ssagen/ssa.go ++++ b/src/cmd/compile/internal/ssagen/ssa.go +@@ -209,7 +209,7 @@ func InitConfig() { + } + + func InitTables() { +- initIntrinsics() ++ initIntrinsics(nil) + } + + // AbiForBodylessFuncStackMap returns the ABI for a bodyless function's stack map. +-- +2.39.5 + diff --git a/2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch b/2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch new file mode 100644 index 0000000..02c47f7 --- /dev/null +++ b/2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch @@ -0,0 +1,155 @@ +From e22dc7c7ba5db0c2850f8d11cc5bacc0c99598dd Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 112/119] cmd/compile/internal/ssagen: improve intrinsic test + +Now that we can pass configuration to initIntrinsics, clean up the +intrinsic test and always enable power10. Additionally, provide an +-update flag that prints out updated golden values. + +Change-Id: Ibfef339d513a4d67d53a5a310a82165592ca338f +Reviewed-on: https://go-review.googlesource.com/c/go/+/607055 +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +--- + .../internal/ssagen/intrinsics_test.go | 72 +++++++++++-------- + 1 file changed, 44 insertions(+), 28 deletions(-) + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index d725e4092a..4bf5fce2a5 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -5,12 +5,17 @@ + package ssagen + + import ( +- "internal/buildcfg" ++ "flag" ++ "fmt" ++ "slices" ++ "strings" + "testing" + + "cmd/internal/sys" + ) + ++var updateIntrinsics = flag.Bool("update", false, "Print an updated intrinsics table") ++ + type testIntrinsicKey struct { + archName string + pkg string +@@ -750,6 +755,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"ppc64", "internal/runtime/math", "Add64"}: struct{}{}, + {"ppc64", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "Len64"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "Len8"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, +@@ -779,6 +786,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "math/bits", "OnesCount32"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount64"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount8"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"ppc64", "math/bits", "ReverseBytes64"}: struct{}{}, + {"ppc64", "math/bits", "RotateLeft"}: struct{}{}, + {"ppc64", "math/bits", "RotateLeft32"}: struct{}{}, + {"ppc64", "math/bits", "RotateLeft64"}: struct{}{}, +@@ -864,6 +874,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, + {"ppc64le", "internal/runtime/math", "Add64"}: struct{}{}, + {"ppc64le", "internal/runtime/math", "Mul64"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", 
"Len64"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "Len8"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "OnesCount64"}: struct{}{}, +@@ -893,6 +905,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "math/bits", "OnesCount32"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount64"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount8"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"ppc64le", "math/bits", "ReverseBytes64"}: struct{}{}, + {"ppc64le", "math/bits", "RotateLeft"}: struct{}{}, + {"ppc64le", "math/bits", "RotateLeft32"}: struct{}{}, + {"ppc64le", "math/bits", "RotateLeft64"}: struct{}{}, +@@ -1183,43 +1198,44 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"wasm", "runtime", "slicebytetostringtmp"}: struct{}{}, + } + +-var wantIntrinsicsPower10 = map[testIntrinsicKey]struct{}{ +- {"ppc64", "runtime/internal/sys", "Bswap32"}: struct{}{}, +- {"ppc64", "runtime/internal/sys", "Bswap64"}: struct{}{}, +- {"ppc64", "math/bits", "ReverseBytes16"}: struct{}{}, +- {"ppc64", "math/bits", "ReverseBytes32"}: struct{}{}, +- {"ppc64", "math/bits", "ReverseBytes64"}: struct{}{}, +- {"ppc64le", "runtime/internal/sys", "Bswap32"}: struct{}{}, +- {"ppc64le", "runtime/internal/sys", "Bswap64"}: struct{}{}, +- {"ppc64le", "math/bits", "ReverseBytes16"}: struct{}{}, +- {"ppc64le", "math/bits", "ReverseBytes32"}: struct{}{}, +- {"ppc64le", "math/bits", "ReverseBytes64"}: struct{}{}, +-} +- + func TestIntrinsics(t *testing.T) { +- initIntrinsics(nil) +- +- want := make(map[testIntrinsicKey]struct{}) +- for ik, iv := range wantIntrinsics { +- want[ik] = iv ++ cfg := &intrinsicBuildConfig{ ++ goppc64: 10, + } +- if buildcfg.GOPPC64 >= 10 { +- for ik, iv := range wantIntrinsicsPower10 { +- want[ik] = iv ++ initIntrinsics(cfg) ++ ++ if *updateIntrinsics { ++ var updatedIntrinsics []*testIntrinsicKey ++ for ik, _ := range intrinsics { ++ updatedIntrinsics = append(updatedIntrinsics, &testIntrinsicKey{ik.arch.Name, ik.pkg, ik.fn}) + } ++ slices.SortFunc(updatedIntrinsics, func(a, b *testIntrinsicKey) int { ++ if n := strings.Compare(a.archName, b.archName); n != 0 { ++ return n ++ } ++ if n := strings.Compare(a.pkg, b.pkg); n != 0 { ++ return n ++ } ++ return strings.Compare(a.fn, b.fn) ++ }) ++ for _, tik := range updatedIntrinsics { ++ fmt.Printf("\t{%q, %q, %q}: struct{}{},\n", tik.archName, tik.pkg, tik.fn) ++ } ++ return + } + +- got := make(map[testIntrinsicKey]struct{}) ++ gotIntrinsics := make(map[testIntrinsicKey]struct{}) + for ik, _ := range intrinsics { +- got[testIntrinsicKey{ik.arch.Name, ik.pkg, ik.fn}] = struct{}{} ++ gotIntrinsics[testIntrinsicKey{ik.arch.Name, ik.pkg, ik.fn}] = struct{}{} + } +- for ik, _ := range got { +- if _, found := want[ik]; !found { ++ for ik, _ := range gotIntrinsics { ++ if _, found := wantIntrinsics[ik]; !found { + t.Errorf("Got unwanted intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) + } + } +- for ik, _ := range want { +- if _, found := got[ik]; !found { ++ ++ for ik, _ := range wantIntrinsics { ++ if _, found := gotIntrinsics[ik]; !found { + t.Errorf("Want intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) + } + } +-- +2.39.5 + diff --git a/2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch b/2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch new file mode 100644 index 0000000..a1ac37c --- /dev/null +++ b/2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch @@ -0,0 +1,582 @@ +From 
0967ad259be6b3a768723327135dcbe368f655a2 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 113/119] cmd/compile: simplify intrinsification of BitLen16 + and BitLen8 + +Decompose BitLen16 and BitLen8 within the SSA rules for architectures that +support BitLen32 or BitLen64, rather than having a custom intrinsic. + +Change-Id: Ie4188ce69d1021e63cec27a8e7418efb0714812b +Reviewed-on: https://go-review.googlesource.com/c/go/+/651817 +Reviewed-by: Keith Randall +Reviewed-by: Michael Pratt +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Junyang Shao +TryBot-Result: Gopher Robot +Run-TryBot: Joel Sing +Reviewed-by: Michael Knyszek +--- + src/cmd/compile/internal/ssa/_gen/ARM.rules | 1 + + src/cmd/compile/internal/ssa/_gen/ARM64.rules | 1 + + src/cmd/compile/internal/ssa/_gen/MIPS.rules | 1 + + src/cmd/compile/internal/ssa/_gen/PPC64.rules | 1 + + src/cmd/compile/internal/ssa/_gen/S390X.rules | 1 + + src/cmd/compile/internal/ssa/_gen/Wasm.rules | 1 + + src/cmd/compile/internal/ssa/rewriteARM.go | 34 +++++++++++++ + src/cmd/compile/internal/ssa/rewriteARM64.go | 34 +++++++++++++ + src/cmd/compile/internal/ssa/rewriteMIPS.go | 34 +++++++++++++ + src/cmd/compile/internal/ssa/rewritePPC64.go | 34 +++++++++++++ + src/cmd/compile/internal/ssa/rewriteS390X.go | 51 +++++++++++++++++++ + src/cmd/compile/internal/ssa/rewriteWasm.go | 51 +++++++++++++++++++ + src/cmd/compile/internal/ssagen/intrinsics.go | 39 ++------------ + 13 files changed, 249 insertions(+), 34 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/ARM.rules b/src/cmd/compile/internal/ssa/_gen/ARM.rules +index a60afb000a..46dbe31f3a 100644 +--- a/src/cmd/compile/internal/ssa/_gen/ARM.rules ++++ b/src/cmd/compile/internal/ssa/_gen/ARM.rules +@@ -80,6 +80,7 @@ + + // bit length + (BitLen32 x) => (RSBconst [32] (CLZ x)) ++(BitLen(16|8) x) => (BitLen32 (ZeroExt(16|8)to32 x)) + + // byte swap for ARMv5 + // let (a, b, c, d) be the bytes of x from high to low +diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules +index 94032d6ca4..4d0affae79 100644 +--- a/src/cmd/compile/internal/ssa/_gen/ARM64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules +@@ -103,6 +103,7 @@ + + (BitLen64 x) => (SUB (MOVDconst [64]) (CLZ x)) + (BitLen32 x) => (SUB (MOVDconst [32]) (CLZW x)) ++(BitLen(16|8) x) => (BitLen64 (ZeroExt(16|8)to64 x)) + + (Bswap64 ...) => (REV ...) + (Bswap32 ...) => (REVW ...) +diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS.rules b/src/cmd/compile/internal/ssa/_gen/MIPS.rules +index d6ae0101cb..e10cf359e3 100644 +--- a/src/cmd/compile/internal/ssa/_gen/MIPS.rules ++++ b/src/cmd/compile/internal/ssa/_gen/MIPS.rules +@@ -135,6 +135,7 @@ + + // bit length + (BitLen32 x) => (SUB (MOVWconst [32]) (CLZ x)) ++(BitLen(16|8) x) => (BitLen32 (ZeroExt(16|8)to32 x)) + + // boolean ops -- booleans are represented with 0=false, 1=true + (AndB ...) => (AND ...) +diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64.rules b/src/cmd/compile/internal/ssa/_gen/PPC64.rules +index 97e592fd7e..d1e8bba7ef 100644 +--- a/src/cmd/compile/internal/ssa/_gen/PPC64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/PPC64.rules +@@ -252,6 +252,7 @@ + + (BitLen64 x) => (SUBFCconst [64] (CNTLZD x)) + (BitLen32 x) => (SUBFCconst [32] (CNTLZW x)) ++(BitLen(16|8) x) => (BitLen64 (ZeroExt(16|8)to64 x)) + + (PopCount64 ...) => (POPCNTD ...) 
+ (PopCount(32|16|8) x) => (POPCNT(W|W|B) (MOV(W|H|B)Zreg x)) +diff --git a/src/cmd/compile/internal/ssa/_gen/S390X.rules b/src/cmd/compile/internal/ssa/_gen/S390X.rules +index a9d62c79ce..78ef1214d7 100644 +--- a/src/cmd/compile/internal/ssa/_gen/S390X.rules ++++ b/src/cmd/compile/internal/ssa/_gen/S390X.rules +@@ -89,6 +89,7 @@ + (Ctz32 x) => (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW (SUBWconst [1] x) (NOTW x))))) + + (BitLen64 x) => (SUB (MOVDconst [64]) (FLOGR x)) ++(BitLen(32|16|8) x) => (BitLen64 (ZeroExt(32|16|8)to64 x)) + + // POPCNT treats the input register as a vector of 8 bytes, producing + // a population count for each individual byte. For inputs larger than +diff --git a/src/cmd/compile/internal/ssa/_gen/Wasm.rules b/src/cmd/compile/internal/ssa/_gen/Wasm.rules +index 91a9fc5e4a..03c681f440 100644 +--- a/src/cmd/compile/internal/ssa/_gen/Wasm.rules ++++ b/src/cmd/compile/internal/ssa/_gen/Wasm.rules +@@ -329,6 +329,7 @@ + (Ctz(64|32|16|8)NonZero ...) => (I64Ctz ...) + + (BitLen64 x) => (I64Sub (I64Const [64]) (I64Clz x)) ++(BitLen(32|16|8) x) => (BitLen64 (ZeroExt(32|16|8)to64 x)) + + (PopCount64 ...) => (I64Popcnt ...) + (PopCount32 x) => (I64Popcnt (ZeroExt32to64 x)) +diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go +index 70cacb90ed..d622a0cd3a 100644 +--- a/src/cmd/compile/internal/ssa/rewriteARM.go ++++ b/src/cmd/compile/internal/ssa/rewriteARM.go +@@ -466,8 +466,12 @@ func rewriteValueARM(v *Value) bool { + return true + case OpAvg32u: + return rewriteValueARM_OpAvg32u(v) ++ case OpBitLen16: ++ return rewriteValueARM_OpBitLen16(v) + case OpBitLen32: + return rewriteValueARM_OpBitLen32(v) ++ case OpBitLen8: ++ return rewriteValueARM_OpBitLen8(v) + case OpBswap32: + return rewriteValueARM_OpBswap32(v) + case OpClosureCall: +@@ -13042,6 +13046,21 @@ func rewriteValueARM_OpAvg32u(v *Value) bool { + return true + } + } ++func rewriteValueARM_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen32 (ZeroExt16to32 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen32) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to32, typ.UInt32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueARM_OpBitLen32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -13058,6 +13077,21 @@ func rewriteValueARM_OpBitLen32(v *Value) bool { + return true + } + } ++func rewriteValueARM_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen32 (ZeroExt8to32 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen32) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueARM_OpBswap32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go +index 93a741ad87..65d99f5a9f 100644 +--- a/src/cmd/compile/internal/ssa/rewriteARM64.go ++++ b/src/cmd/compile/internal/ssa/rewriteARM64.go +@@ -535,10 +535,14 @@ func rewriteValueARM64(v *Value) bool { + return true + case OpAvg64u: + return rewriteValueARM64_OpAvg64u(v) ++ case OpBitLen16: ++ return rewriteValueARM64_OpBitLen16(v) + case OpBitLen32: + return rewriteValueARM64_OpBitLen32(v) + case OpBitLen64: + return rewriteValueARM64_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValueARM64_OpBitLen8(v) + case OpBitRev16: + return rewriteValueARM64_OpBitRev16(v) 
+ case OpBitRev32: +@@ -18425,6 +18429,21 @@ func rewriteValueARM64_OpAvg64u(v *Value) bool { + return true + } + } ++func rewriteValueARM64_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueARM64_OpBitLen32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -18459,6 +18478,21 @@ func rewriteValueARM64_OpBitLen64(v *Value) bool { + return true + } + } ++func rewriteValueARM64_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueARM64_OpBitRev16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssa/rewriteMIPS.go b/src/cmd/compile/internal/ssa/rewriteMIPS.go +index 6a259f5a47..978be79417 100644 +--- a/src/cmd/compile/internal/ssa/rewriteMIPS.go ++++ b/src/cmd/compile/internal/ssa/rewriteMIPS.go +@@ -82,8 +82,12 @@ func rewriteValueMIPS(v *Value) bool { + return true + case OpAvg32u: + return rewriteValueMIPS_OpAvg32u(v) ++ case OpBitLen16: ++ return rewriteValueMIPS_OpBitLen16(v) + case OpBitLen32: + return rewriteValueMIPS_OpBitLen32(v) ++ case OpBitLen8: ++ return rewriteValueMIPS_OpBitLen8(v) + case OpClosureCall: + v.Op = OpMIPSCALLclosure + return true +@@ -792,6 +796,21 @@ func rewriteValueMIPS_OpAvg32u(v *Value) bool { + return true + } + } ++func rewriteValueMIPS_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen32 (ZeroExt16to32 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen32) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to32, typ.UInt32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueMIPS_OpBitLen32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -810,6 +829,21 @@ func rewriteValueMIPS_OpBitLen32(v *Value) bool { + return true + } + } ++func rewriteValueMIPS_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen32 (ZeroExt8to32 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen32) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueMIPS_OpCom16(v *Value) bool { + v_0 := v.Args[0] + // match: (Com16 x) +diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go +index d1c0c2b07f..bdc690212e 100644 +--- a/src/cmd/compile/internal/ssa/rewritePPC64.go ++++ b/src/cmd/compile/internal/ssa/rewritePPC64.go +@@ -103,10 +103,14 @@ func rewriteValuePPC64(v *Value) bool { + return rewriteValuePPC64_OpAtomicStoreRel64(v) + case OpAvg64u: + return rewriteValuePPC64_OpAvg64u(v) ++ case OpBitLen16: ++ return rewriteValuePPC64_OpBitLen16(v) + case OpBitLen32: + return rewriteValuePPC64_OpBitLen32(v) + case OpBitLen64: + return rewriteValuePPC64_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValuePPC64_OpBitLen8(v) + case OpBswap16: + return rewriteValuePPC64_OpBswap16(v) + case OpBswap32: +@@ -1106,6 +1110,21 @@ func rewriteValuePPC64_OpAvg64u(v *Value) bool { + return 
true + } + } ++func rewriteValuePPC64_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValuePPC64_OpBitLen32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -1138,6 +1157,21 @@ func rewriteValuePPC64_OpBitLen64(v *Value) bool { + return true + } + } ++func rewriteValuePPC64_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValuePPC64_OpBswap16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go +index a3d621898f..bf3073eea9 100644 +--- a/src/cmd/compile/internal/ssa/rewriteS390X.go ++++ b/src/cmd/compile/internal/ssa/rewriteS390X.go +@@ -88,8 +88,14 @@ func rewriteValueS390X(v *Value) bool { + return rewriteValueS390X_OpAtomicStoreRel32(v) + case OpAvg64u: + return rewriteValueS390X_OpAvg64u(v) ++ case OpBitLen16: ++ return rewriteValueS390X_OpBitLen16(v) ++ case OpBitLen32: ++ return rewriteValueS390X_OpBitLen32(v) + case OpBitLen64: + return rewriteValueS390X_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValueS390X_OpBitLen8(v) + case OpBswap16: + return rewriteValueS390X_OpBswap16(v) + case OpBswap32: +@@ -1261,6 +1267,36 @@ func rewriteValueS390X_OpAvg64u(v *Value) bool { + return true + } + } ++func rewriteValueS390X_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueS390X_OpBitLen32(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen32 x) ++ // result: (BitLen64 (ZeroExt32to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueS390X_OpBitLen64(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -1278,6 +1314,21 @@ func rewriteValueS390X_OpBitLen64(v *Value) bool { + return true + } + } ++func rewriteValueS390X_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueS390X_OpBswap16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssa/rewriteWasm.go b/src/cmd/compile/internal/ssa/rewriteWasm.go +index 6f83aea13a..f3b8205b24 100644 +--- a/src/cmd/compile/internal/ssa/rewriteWasm.go ++++ b/src/cmd/compile/internal/ssa/rewriteWasm.go +@@ -49,8 +49,14 @@ func rewriteValueWasm(v *Value) bool { + case OpAndB: + v.Op = OpWasmI64And + return true ++ case OpBitLen16: ++ return rewriteValueWasm_OpBitLen16(v) ++ case OpBitLen32: ++ return 
rewriteValueWasm_OpBitLen32(v) + case OpBitLen64: + return rewriteValueWasm_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValueWasm_OpBitLen8(v) + case OpCeil: + v.Op = OpWasmF64Ceil + return true +@@ -679,6 +685,36 @@ func rewriteValueWasm_OpAddr(v *Value) bool { + return true + } + } ++func rewriteValueWasm_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueWasm_OpBitLen32(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen32 x) ++ // result: (BitLen64 (ZeroExt32to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueWasm_OpBitLen64(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -696,6 +732,21 @@ func rewriteValueWasm_OpBitLen64(v *Value) bool { + return true + } + } ++func rewriteValueWasm_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueWasm_OpCom16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index c62837cd5b..6fcdcf57ed 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -734,51 +734,22 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "Len32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.ARM64, sys.PPC64) +- addF("math/bits", "Len32", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- } +- x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM, sys.S390X, sys.MIPS, sys.Wasm) +- addF("math/bits", "Len16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) +- } +- x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "Len16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0]) + }, +- sys.AMD64) 
+- addF("math/bits", "Len8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) +- } +- x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) +- }, +- sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "Len8", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) + }, +- sys.AMD64) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "Len", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + if s.config.PtrSize == 4 { +@@ -786,7 +757,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + } + return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + // LeadingZeros is handled because it trivially calls Len. + addF("math/bits", "Reverse64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +-- +2.39.5 + diff --git a/2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch b/2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch new file mode 100644 index 0000000..937cab4 --- /dev/null +++ b/2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch @@ -0,0 +1,563 @@ +From a0cbf6b18dada4ea1b2a86df54d509c1151f47cd Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 114/119] cmd/compile: simplify intrinsification of + TrailingZeros16 and TrailingZeros8 + +Decompose Ctz16 and Ctz8 within the SSA rules for LOONG64, MIPS, PPC64 +and S390X, rather than having a custom intrinsic. Note that for PPC64 this +actually allows the existing Ctz16 and Ctz8 rules to be used. + +Change-Id: I27a5e978f852b9d75396d2a80f5d7dfcb5ef7dd4 +Reviewed-on: https://go-review.googlesource.com/c/go/+/651816 +Reviewed-by: Paul Murphy +TryBot-Result: Gopher Robot +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Michael Pratt +Run-TryBot: Joel Sing +Reviewed-by: Keith Randall +Reviewed-by: Keith Randall +--- + src/cmd/compile/internal/ssa/_gen/MIPS.rules | 5 +- + src/cmd/compile/internal/ssa/_gen/PPC64.rules | 21 +++--- + src/cmd/compile/internal/ssa/_gen/S390X.rules | 6 +- + src/cmd/compile/internal/ssa/rewriteMIPS.go | 44 +++++++++++++ + src/cmd/compile/internal/ssa/rewritePPC64.go | 64 ++++++++++++++++++- + src/cmd/compile/internal/ssa/rewriteS390X.go | 46 ++++++++++++- + src/cmd/compile/internal/ssagen/intrinsics.go | 47 ++------------ + .../internal/ssagen/intrinsics_test.go | 6 +- + test/codegen/mathbits.go | 6 +- + 9 files changed, 181 insertions(+), 64 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/MIPS.rules b/src/cmd/compile/internal/ssa/_gen/MIPS.rules +index e10cf359e3..80c7f71685 100644 +--- a/src/cmd/compile/internal/ssa/_gen/MIPS.rules ++++ b/src/cmd/compile/internal/ssa/_gen/MIPS.rules +@@ -126,12 +126,13 @@ + (Sqrt ...) => (SQRTD ...) + (Sqrt32 ...) => (SQRTF ...) + +-// TODO: optimize this case? +-(Ctz32NonZero ...) => (Ctz32 ...) ++(Ctz(32|16|8)NonZero ...) => (Ctz32 ...) 
+ + // count trailing zero + // 32 - CLZ(x&-x - 1) + (Ctz32 x) => (SUB (MOVWconst [32]) (CLZ (SUBconst [1] (AND x (NEG x))))) ++(Ctz16 x) => (Ctz32 (Or32 x (MOVWconst [1<<16]))) ++(Ctz8 x) => (Ctz32 (Or32 x (MOVWconst [1<<8]))) + + // bit length + (BitLen32 x) => (SUB (MOVWconst [32]) (CLZ x)) +diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64.rules b/src/cmd/compile/internal/ssa/_gen/PPC64.rules +index d1e8bba7ef..1a34de9aad 100644 +--- a/src/cmd/compile/internal/ssa/_gen/PPC64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/PPC64.rules +@@ -239,16 +239,17 @@ + (LocalAddr {sym} base _) && !t.Elem().HasPointers() => (MOVDaddr {sym} base) + (OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr) + +-// TODO: optimize these cases? +-(Ctz32NonZero ...) => (Ctz32 ...) +-(Ctz64NonZero ...) => (Ctz64 ...) +- +-(Ctz64 x) && buildcfg.GOPPC64<=8 => (POPCNTD (ANDN (ADDconst [-1] x) x)) +-(Ctz64 x) => (CNTTZD x) +-(Ctz32 x) && buildcfg.GOPPC64<=8 => (POPCNTW (MOVWZreg (ANDN (ADDconst [-1] x) x))) +-(Ctz32 x) => (CNTTZW (MOVWZreg x)) +-(Ctz16 x) => (POPCNTW (MOVHZreg (ANDN (ADDconst [-1] x) x))) +-(Ctz8 x) => (POPCNTB (MOVBZreg (ANDN (ADDconst [-1] x) x))) ++(Ctz(64|32|16|8)NonZero ...) => (Ctz64 ...) ++ ++(Ctz64 x) && buildcfg.GOPPC64 <= 8 => (POPCNTD (ANDN (ADDconst [-1] x) x)) ++(Ctz32 x) && buildcfg.GOPPC64 <= 8 => (POPCNTW (MOVWZreg (ANDN (ADDconst [-1] x) x))) ++(Ctz16 x) && buildcfg.GOPPC64 <= 8 => (POPCNTW (MOVHZreg (ANDN (ADDconst [-1] x) x))) ++(Ctz8 x) && buildcfg.GOPPC64 <= 8 => (POPCNTB (MOVBZreg (ANDN (ADDconst [-1] x) x))) ++ ++(Ctz64 x) && buildcfg.GOPPC64 >= 9 => (CNTTZD x) ++(Ctz32 x) && buildcfg.GOPPC64 >= 9 => (CNTTZW (MOVWZreg x)) ++(Ctz16 x) && buildcfg.GOPPC64 >= 9 => (CNTTZD (OR x (MOVDconst [1<<16]))) ++(Ctz8 x) && buildcfg.GOPPC64 >= 9 => (CNTTZD (OR x (MOVDconst [1<<8]))) + + (BitLen64 x) => (SUBFCconst [64] (CNTLZD x)) + (BitLen32 x) => (SUBFCconst [32] (CNTLZW x)) +diff --git a/src/cmd/compile/internal/ssa/_gen/S390X.rules b/src/cmd/compile/internal/ssa/_gen/S390X.rules +index 78ef1214d7..7505a5ff06 100644 +--- a/src/cmd/compile/internal/ssa/_gen/S390X.rules ++++ b/src/cmd/compile/internal/ssa/_gen/S390X.rules +@@ -80,13 +80,13 @@ + (OffPtr [off] ptr) && is32Bit(off) => (ADDconst [int32(off)] ptr) + (OffPtr [off] ptr) => (ADD (MOVDconst [off]) ptr) + +-// TODO: optimize these cases? +-(Ctz64NonZero ...) => (Ctz64 ...) +-(Ctz32NonZero ...) => (Ctz32 ...) ++(Ctz(64|32|16|8)NonZero ...) => (Ctz64 ...) 
+ + // Ctz(x) = 64 - findLeftmostOne((x-1)&^x) + (Ctz64 x) => (SUB (MOVDconst [64]) (FLOGR (AND (SUBconst [1] x) (NOT x)))) + (Ctz32 x) => (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW (SUBWconst [1] x) (NOTW x))))) ++(Ctz16 x) => (Ctz64 (Or64 x (MOVDconst [1<<16]))) ++(Ctz8 x) => (Ctz64 (Or64 x (MOVDconst [1<<8]))) + + (BitLen64 x) => (SUB (MOVDconst [64]) (FLOGR x)) + (BitLen(32|16|8) x) => (BitLen64 (ZeroExt(32|16|8)to64 x)) +diff --git a/src/cmd/compile/internal/ssa/rewriteMIPS.go b/src/cmd/compile/internal/ssa/rewriteMIPS.go +index 978be79417..eb34dfd03e 100644 +--- a/src/cmd/compile/internal/ssa/rewriteMIPS.go ++++ b/src/cmd/compile/internal/ssa/rewriteMIPS.go +@@ -113,11 +113,21 @@ func rewriteValueMIPS(v *Value) bool { + return rewriteValueMIPS_OpConstBool(v) + case OpConstNil: + return rewriteValueMIPS_OpConstNil(v) ++ case OpCtz16: ++ return rewriteValueMIPS_OpCtz16(v) ++ case OpCtz16NonZero: ++ v.Op = OpCtz32 ++ return true + case OpCtz32: + return rewriteValueMIPS_OpCtz32(v) + case OpCtz32NonZero: + v.Op = OpCtz32 + return true ++ case OpCtz8: ++ return rewriteValueMIPS_OpCtz8(v) ++ case OpCtz8NonZero: ++ v.Op = OpCtz32 ++ return true + case OpCvt32Fto32: + v.Op = OpMIPSTRUNCFW + return true +@@ -929,6 +939,23 @@ func rewriteValueMIPS_OpConstNil(v *Value) bool { + return true + } + } ++func rewriteValueMIPS_OpCtz16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz16 x) ++ // result: (Ctz32 (Or32 x (MOVWconst [1<<16]))) ++ for { ++ x := v_0 ++ v.reset(OpCtz32) ++ v0 := b.NewValue0(v.Pos, OpOr32, typ.UInt32) ++ v1 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32) ++ v1.AuxInt = int32ToAuxInt(1 << 16) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueMIPS_OpCtz32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -954,6 +981,23 @@ func rewriteValueMIPS_OpCtz32(v *Value) bool { + return true + } + } ++func rewriteValueMIPS_OpCtz8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz8 x) ++ // result: (Ctz32 (Or32 x (MOVWconst [1<<8]))) ++ for { ++ x := v_0 ++ v.reset(OpCtz32) ++ v0 := b.NewValue0(v.Pos, OpOr32, typ.UInt32) ++ v1 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32) ++ v1.AuxInt = int32ToAuxInt(1 << 8) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueMIPS_OpDiv16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go +index bdc690212e..19b779e02c 100644 +--- a/src/cmd/compile/internal/ssa/rewritePPC64.go ++++ b/src/cmd/compile/internal/ssa/rewritePPC64.go +@@ -155,10 +155,13 @@ func rewriteValuePPC64(v *Value) bool { + return rewriteValuePPC64_OpCopysign(v) + case OpCtz16: + return rewriteValuePPC64_OpCtz16(v) ++ case OpCtz16NonZero: ++ v.Op = OpCtz64 ++ return true + case OpCtz32: + return rewriteValuePPC64_OpCtz32(v) + case OpCtz32NonZero: +- v.Op = OpCtz32 ++ v.Op = OpCtz64 + return true + case OpCtz64: + return rewriteValuePPC64_OpCtz64(v) +@@ -167,6 +170,9 @@ func rewriteValuePPC64(v *Value) bool { + return true + case OpCtz8: + return rewriteValuePPC64_OpCtz8(v) ++ case OpCtz8NonZero: ++ v.Op = OpCtz64 ++ return true + case OpCvt32Fto32: + return rewriteValuePPC64_OpCvt32Fto32(v) + case OpCvt32Fto64: +@@ -1520,9 +1526,13 @@ func rewriteValuePPC64_OpCtz16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz16 x) ++ // cond: buildcfg.GOPPC64 <= 8 + // result: (POPCNTW 
(MOVHZreg (ANDN (ADDconst [-1] x) x))) + for { + x := v_0 ++ if !(buildcfg.GOPPC64 <= 8) { ++ break ++ } + v.reset(OpPPC64POPCNTW) + v0 := b.NewValue0(v.Pos, OpPPC64MOVHZreg, typ.Int64) + v1 := b.NewValue0(v.Pos, OpPPC64ANDN, typ.Int16) +@@ -1534,13 +1544,30 @@ func rewriteValuePPC64_OpCtz16(v *Value) bool { + v.AddArg(v0) + return true + } ++ // match: (Ctz16 x) ++ // cond: buildcfg.GOPPC64 >= 9 ++ // result: (CNTTZD (OR x (MOVDconst [1<<16]))) ++ for { ++ x := v_0 ++ if !(buildcfg.GOPPC64 >= 9) { ++ break ++ } ++ v.reset(OpPPC64CNTTZD) ++ v0 := b.NewValue0(v.Pos, OpPPC64OR, typ.UInt64) ++ v1 := b.NewValue0(v.Pos, OpPPC64MOVDconst, typ.Int64) ++ v1.AuxInt = int64ToAuxInt(1 << 16) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++ return false + } + func rewriteValuePPC64_OpCtz32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz32 x) +- // cond: buildcfg.GOPPC64<=8 ++ // cond: buildcfg.GOPPC64 <= 8 + // result: (POPCNTW (MOVWZreg (ANDN (ADDconst [-1] x) x))) + for { + x := v_0 +@@ -1559,22 +1586,27 @@ func rewriteValuePPC64_OpCtz32(v *Value) bool { + return true + } + // match: (Ctz32 x) ++ // cond: buildcfg.GOPPC64 >= 9 + // result: (CNTTZW (MOVWZreg x)) + for { + x := v_0 ++ if !(buildcfg.GOPPC64 >= 9) { ++ break ++ } + v.reset(OpPPC64CNTTZW) + v0 := b.NewValue0(v.Pos, OpPPC64MOVWZreg, typ.Int64) + v0.AddArg(x) + v.AddArg(v0) + return true + } ++ return false + } + func rewriteValuePPC64_OpCtz64(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz64 x) +- // cond: buildcfg.GOPPC64<=8 ++ // cond: buildcfg.GOPPC64 <= 8 + // result: (POPCNTD (ANDN (ADDconst [-1] x) x)) + for { + x := v_0 +@@ -1591,22 +1623,31 @@ func rewriteValuePPC64_OpCtz64(v *Value) bool { + return true + } + // match: (Ctz64 x) ++ // cond: buildcfg.GOPPC64 >= 9 + // result: (CNTTZD x) + for { + x := v_0 ++ if !(buildcfg.GOPPC64 >= 9) { ++ break ++ } + v.reset(OpPPC64CNTTZD) + v.AddArg(x) + return true + } ++ return false + } + func rewriteValuePPC64_OpCtz8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz8 x) ++ // cond: buildcfg.GOPPC64 <= 8 + // result: (POPCNTB (MOVBZreg (ANDN (ADDconst [-1] x) x))) + for { + x := v_0 ++ if !(buildcfg.GOPPC64 <= 8) { ++ break ++ } + v.reset(OpPPC64POPCNTB) + v0 := b.NewValue0(v.Pos, OpPPC64MOVBZreg, typ.Int64) + v1 := b.NewValue0(v.Pos, OpPPC64ANDN, typ.UInt8) +@@ -1618,6 +1659,23 @@ func rewriteValuePPC64_OpCtz8(v *Value) bool { + v.AddArg(v0) + return true + } ++ // match: (Ctz8 x) ++ // cond: buildcfg.GOPPC64 >= 9 ++ // result: (CNTTZD (OR x (MOVDconst [1<<8]))) ++ for { ++ x := v_0 ++ if !(buildcfg.GOPPC64 >= 9) { ++ break ++ } ++ v.reset(OpPPC64CNTTZD) ++ v0 := b.NewValue0(v.Pos, OpPPC64OR, typ.UInt64) ++ v1 := b.NewValue0(v.Pos, OpPPC64MOVDconst, typ.Int64) ++ v1.AuxInt = int64ToAuxInt(1 << 8) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++ return false + } + func rewriteValuePPC64_OpCvt32Fto32(v *Value) bool { + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go +index bf3073eea9..e54e3ba7fc 100644 +--- a/src/cmd/compile/internal/ssa/rewriteS390X.go ++++ b/src/cmd/compile/internal/ssa/rewriteS390X.go +@@ -139,16 +139,26 @@ func rewriteValueS390X(v *Value) bool { + return rewriteValueS390X_OpConstBool(v) + case OpConstNil: + return rewriteValueS390X_OpConstNil(v) ++ case OpCtz16: ++ return rewriteValueS390X_OpCtz16(v) ++ case OpCtz16NonZero: ++ 
v.Op = OpCtz64 ++ return true + case OpCtz32: + return rewriteValueS390X_OpCtz32(v) + case OpCtz32NonZero: +- v.Op = OpCtz32 ++ v.Op = OpCtz64 + return true + case OpCtz64: + return rewriteValueS390X_OpCtz64(v) + case OpCtz64NonZero: + v.Op = OpCtz64 + return true ++ case OpCtz8: ++ return rewriteValueS390X_OpCtz8(v) ++ case OpCtz8NonZero: ++ v.Op = OpCtz64 ++ return true + case OpCvt32Fto32: + v.Op = OpS390XCFEBRA + return true +@@ -1449,6 +1459,23 @@ func rewriteValueS390X_OpConstNil(v *Value) bool { + return true + } + } ++func rewriteValueS390X_OpCtz16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz16 x) ++ // result: (Ctz64 (Or64 x (MOVDconst [1<<16]))) ++ for { ++ x := v_0 ++ v.reset(OpCtz64) ++ v0 := b.NewValue0(v.Pos, OpOr64, typ.UInt64) ++ v1 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64) ++ v1.AuxInt = int64ToAuxInt(1 << 16) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueS390X_OpCtz32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block +@@ -1501,6 +1528,23 @@ func rewriteValueS390X_OpCtz64(v *Value) bool { + return true + } + } ++func rewriteValueS390X_OpCtz8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz8 x) ++ // result: (Ctz64 (Or64 x (MOVDconst [1<<8]))) ++ for { ++ x := v_0 ++ v.reset(OpCtz64) ++ v0 := b.NewValue0(v.Pos, OpOr64, typ.UInt64) ++ v1 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64) ++ v1.AuxInt = int64ToAuxInt(1 << 8) ++ v0.AddArg2(x, v1) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueS390X_OpDiv16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index 6fcdcf57ed..a337ef7f1b 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -675,48 +675,16 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) + }, + sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) +- c := s.constInt32(types.Types[types.TUINT32], 1<<16) +- y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) +- }, +- sys.MIPS) + addF("math/bits", "TrailingZeros16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) +- addF("math/bits", "TrailingZeros16", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) +- c := s.constInt64(types.Types[types.TUINT64], 1<<16) +- y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) +- }, +- sys.S390X, sys.PPC64) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) +- c := s.constInt32(types.Types[types.TUINT32], 1<<8) +- y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) +- return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) +- }, +- sys.MIPS) ++ 
sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + addF("math/bits", "TrailingZeros8", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) + }, +- sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) +- addF("math/bits", "TrailingZeros8", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) +- c := s.constInt64(types.Types[types.TUINT64], 1<<8) +- y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) +- return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) +- }, +- sys.S390X) ++ sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) + alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) + alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) + // ReverseBytes inlines correctly, no need to intrinsify it. +@@ -750,14 +718,9 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) + }, + sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) +- addF("math/bits", "Len", +- func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +- if s.config.PtrSize == 4 { +- return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) +- } +- return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) +- }, +- sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) ++ alias("math/bits", "Len", "math/bits", "Len64", p8...) ++ alias("math/bits", "Len", "math/bits", "Len32", p4...) ++ + // LeadingZeros is handled because it trivially calls Len. + addF("math/bits", "Reverse64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index 4bf5fce2a5..b3f43eff5e 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -762,6 +762,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "runtime/internal/sys", "OnesCount64"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "Prefetch"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, + {"ppc64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, + {"ppc64", "math", "Abs"}: struct{}{}, +@@ -794,6 +795,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "math/bits", "RotateLeft64"}: struct{}{}, + {"ppc64", "math/bits", "Sub"}: struct{}{}, + {"ppc64", "math/bits", "Sub64"}: struct{}{}, ++ {"ppc64", "math/bits", "TrailingZeros8"}: struct{}{}, + {"ppc64", "math/bits", "TrailingZeros16"}: struct{}{}, + {"ppc64", "math/bits", "TrailingZeros32"}: struct{}{}, + {"ppc64", "math/bits", "TrailingZeros64"}: struct{}{}, +@@ -881,6 +883,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "runtime/internal/sys", "OnesCount64"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "Prefetch"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "PrefetchStreamed"}: struct{}{}, ++ {"ppc64le", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, + {"ppc64le", "runtime/internal/sys", 
"TrailingZeros64"}: struct{}{}, + {"ppc64le", "math", "Abs"}: struct{}{}, +@@ -913,6 +916,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "math/bits", "RotateLeft64"}: struct{}{}, + {"ppc64le", "math/bits", "Sub"}: struct{}{}, + {"ppc64le", "math/bits", "Sub64"}: struct{}{}, ++ {"ppc64le", "math/bits", "TrailingZeros8"}: struct{}{}, + {"ppc64le", "math/bits", "TrailingZeros16"}: struct{}{}, + {"ppc64le", "math/bits", "TrailingZeros32"}: struct{}{}, + {"ppc64le", "math/bits", "TrailingZeros64"}: struct{}{}, +@@ -1236,7 +1240,7 @@ func TestIntrinsics(t *testing.T) { + + for ik, _ := range wantIntrinsics { + if _, found := gotIntrinsics[ik]; !found { +- t.Errorf("Want intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) ++ t.Errorf("Want missing intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn) + } + } + } +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index bf2e8130c4..b6375c5e7a 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -345,8 +345,8 @@ func TrailingZeros16(n uint16) int { + // arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR" + // arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t" + // s390x:"FLOGR","OR\t\\$65536" +- // ppc64x/power8:"POPCNTD","OR\\t\\$65536" +- // ppc64x/power9:"CNTTZD","OR\\t\\$65536" ++ // ppc64x/power8:"POPCNTW","ADD\t\\$-1" ++ // ppc64x/power9:"CNTTZD","ORIS\\t\\$1" + // wasm:"I64Ctz" + return bits.TrailingZeros16(n) + } +@@ -356,6 +356,8 @@ func TrailingZeros8(n uint8) int { + // 386:"BSFL" + // arm:"ORR\t\\$256","CLZ",-"MOVBU\tR" + // arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t" ++ // ppc64x/power8:"POPCNTB","ADD\t\\$-1" ++ // ppc64x/power9:"CNTTZD","OR\t\\$256" + // s390x:"FLOGR","OR\t\\$256" + // wasm:"I64Ctz" + return bits.TrailingZeros8(n) +-- +2.39.5 + diff --git a/2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch b/2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch new file mode 100644 index 0000000..3f5b7ef --- /dev/null +++ b/2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch @@ -0,0 +1,375 @@ +From 49515bbf06a159e4af4681ce5b8081af7462766d Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 115/119] cmd/compile: intrinsify math/bits.TrailingZeros on + riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For riscv64/rva22u64 and above, we can intrinsify math/bits.TrailingZeros +using the CTZ/CTZW machine instructions. 
+ +On a StarFive VisionFive 2 with GORISCV64=rva22u64: + + │ ctz.b.1 │ ctz.b.2 │ + │ sec/op │ sec/op vs base │ +TrailingZeros-4 25.500n ± 0% 8.052n ± 0% -68.42% (p=0.000 n=10) +TrailingZeros8-4 14.76n ± 0% 10.74n ± 0% -27.24% (p=0.000 n=10) +TrailingZeros16-4 26.84n ± 0% 10.74n ± 0% -59.99% (p=0.000 n=10) +TrailingZeros32-4 25.500n ± 0% 8.052n ± 0% -68.42% (p=0.000 n=10) +TrailingZeros64-4 25.500n ± 0% 8.052n ± 0% -68.42% (p=0.000 n=10) +geomean 23.09n 9.035n -60.88% + +Change-Id: I71edf2b988acb7a68e797afda4ee66d7a57d587e +Reviewed-on: https://go-review.googlesource.com/c/go/+/652320 +Reviewed-by: Cherry Mui +Reviewed-by: Mark Ryan +Reviewed-by: David Chase +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +--- + src/cmd/compile/internal/riscv64/ssa.go | 2 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 7 +++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 2 + + src/cmd/compile/internal/ssa/opGen.go | 28 ++++++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 54 +++++++++++++++++++ + src/cmd/compile/internal/ssagen/intrinsics.go | 24 +++++++++ + .../internal/ssagen/intrinsics_test.go | 10 +++- + test/codegen/mathbits.go | 19 +++++-- + 8 files changed, 141 insertions(+), 5 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 4aac891e13..ba982a13cd 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -419,7 +419,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVXS, ssa.OpRISCV64FMVDX, ssa.OpRISCV64FMVXD, + ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS, + ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD, +- ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW: ++ ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 9e39a58197..72e4e8d7b3 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -218,6 +218,13 @@ + (RotateLeft32 ...) => (ROLW ...) + (RotateLeft64 ...) => (ROL ...) + ++// Count trailing zeros (note that these will only be emitted for rva22u64 and above). ++(Ctz(64|32|16|8)NonZero ...) => (Ctz64 ...) ++(Ctz64 ...) => (CTZ ...) ++(Ctz32 ...) => (CTZW ...) ++(Ctz16 x) => (CTZW (ORI [1<<16] x)) ++(Ctz8 x) => (CTZW (ORI [1<<8] x)) ++ + (Less64 ...) => (SLT ...) 
+ (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) + (Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y)) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index a69b347a84..f62bce8980 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -229,6 +229,8 @@ func init() { + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // ^arg0 & arg1 + {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint ++ {name: "CTZ", argLength: 1, reg: gp11, asm: "CTZ"}, // count trailing zeros ++ {name: "CTZW", argLength: 1, reg: gp11, asm: "CTZW"}, // count trailing zeros of least significant word + {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 + {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // ^arg0 | arg1 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 5fda7ffc2f..6afa1662c3 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2387,6 +2387,8 @@ const ( + OpRISCV64AND + OpRISCV64ANDN + OpRISCV64ANDI ++ OpRISCV64CTZ ++ OpRISCV64CTZW + OpRISCV64NOT + OpRISCV64OR + OpRISCV64ORN +@@ -32041,6 +32043,32 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "CTZ", ++ argLen: 1, ++ asm: riscv.ACTZ, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "CTZW", ++ argLen: 1, ++ asm: riscv.ACTZW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "NOT", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index a449ce01c6..85b6a05d7c 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -138,6 +138,28 @@ func rewriteValueRISCV64(v *Value) bool { + case OpCopysign: + v.Op = OpRISCV64FSGNJD + return true ++ case OpCtz16: ++ return rewriteValueRISCV64_OpCtz16(v) ++ case OpCtz16NonZero: ++ v.Op = OpCtz64 ++ return true ++ case OpCtz32: ++ v.Op = OpRISCV64CTZW ++ return true ++ case OpCtz32NonZero: ++ v.Op = OpCtz64 ++ return true ++ case OpCtz64: ++ v.Op = OpRISCV64CTZ ++ return true ++ case OpCtz64NonZero: ++ v.Op = OpCtz64 ++ return true ++ case OpCtz8: ++ return rewriteValueRISCV64_OpCtz8(v) ++ case OpCtz8NonZero: ++ v.Op = OpCtz64 ++ return true + case OpCvt32Fto32: + v.Op = OpRISCV64FCVTWS + return true +@@ -1005,6 +1027,38 @@ func rewriteValueRISCV64_OpConstNil(v *Value) bool { + return true + } + } ++func rewriteValueRISCV64_OpCtz16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz16 x) ++ // result: (CTZW (ORI [1<<16] x)) ++ for { ++ x := v_0 ++ v.reset(OpRISCV64CTZW) ++ v0 := b.NewValue0(v.Pos, 
OpRISCV64ORI, typ.UInt32) ++ v0.AuxInt = int64ToAuxInt(1 << 16) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpCtz8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (Ctz8 x) ++ // result: (CTZW (ORI [1<<8] x)) ++ for { ++ x := v_0 ++ v.reset(OpRISCV64CTZW) ++ v0 := b.NewValue0(v.Pos, OpRISCV64ORI, typ.UInt32) ++ v0.AuxInt = int64ToAuxInt(1 << 8) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueRISCV64_OpDiv16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index a337ef7f1b..3554558519 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -685,6 +685,30 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) + }, + sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) ++ ++ if cfg.goriscv64 >= 22 { ++ addF("math/bits", "TrailingZeros64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "TrailingZeros32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "TrailingZeros16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "TrailingZeros8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ } ++ + alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) + alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) + // ReverseBytes inlines correctly, no need to intrinsify it. 
+diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index b3f43eff5e..e8803f1ddf 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -999,6 +999,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "internal/runtime/math", "Add64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, + {"riscv64", "math", "Abs"}: struct{}{}, + {"riscv64", "math", "Copysign"}: struct{}{}, + {"riscv64", "math", "FMA"}: struct{}{}, +@@ -1015,6 +1018,10 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "math/bits", "RotateLeft8"}: struct{}{}, + {"riscv64", "math/bits", "Sub"}: struct{}{}, + {"riscv64", "math/bits", "Sub64"}: struct{}{}, ++ {"riscv64", "math/bits", "TrailingZeros16"}: struct{}{}, ++ {"riscv64", "math/bits", "TrailingZeros32"}: struct{}{}, ++ {"riscv64", "math/bits", "TrailingZeros64"}: struct{}{}, ++ {"riscv64", "math/bits", "TrailingZeros8"}: struct{}{}, + {"riscv64", "runtime", "KeepAlive"}: struct{}{}, + {"riscv64", "runtime", "getcallerpc"}: struct{}{}, + {"riscv64", "runtime", "getcallersp"}: struct{}{}, +@@ -1204,7 +1211,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + + func TestIntrinsics(t *testing.T) { + cfg := &intrinsicBuildConfig{ +- goppc64: 10, ++ goppc64: 10, ++ goriscv64: 23, + } + initIntrinsics(cfg) + +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index b6375c5e7a..97926c4ddc 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -296,26 +296,30 @@ func RotateLeftVariable32(n uint32, m int) uint32 { + // ------------------------ // + + func TrailingZeros(n uint) int { ++ // 386:"BSFL" + // amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ" + // amd64/v3:"TZCNTQ" +- // 386:"BSFL" + // arm:"CLZ" + // arm64:"RBIT","CLZ" + // s390x:"FLOGR" + // ppc64x/power8:"ANDN","POPCNTD" + // ppc64x/power9: "CNTTZD" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" ++ // s390x:"FLOGR" + // wasm:"I64Ctz" + return bits.TrailingZeros(n) + } + + func TrailingZeros64(n uint64) int { ++ // 386:"BSFL","JNE" + // amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ" + // amd64/v3:"TZCNTQ" +- // 386:"BSFL","JNE" + // arm64:"RBIT","CLZ" + // s390x:"FLOGR" + // ppc64x/power8:"ANDN","POPCNTD" + // ppc64x/power9: "CNTTZD" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" ++ // s390x:"FLOGR" + // wasm:"I64Ctz" + return bits.TrailingZeros64(n) + } +@@ -327,14 +331,16 @@ func TrailingZeros64Subtract(n uint64) int { + } + + func TrailingZeros32(n uint32) int { ++ // 386:"BSFL" + // amd64/v1,amd64/v2:"BTSQ\\t\\$32","BSFQ" + // amd64/v3:"TZCNTL" +- // 386:"BSFL" + // arm:"CLZ" + // arm64:"RBITW","CLZW" + // s390x:"FLOGR","MOVWZ" + // ppc64x/power8:"ANDN","POPCNTW" + // ppc64x/power9: "CNTTZW" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZW" ++ // s390x:"FLOGR","MOVWZ" + // wasm:"I64Ctz" + return bits.TrailingZeros32(n) + } +@@ -342,11 +348,14 @@ func TrailingZeros32(n uint32) int { + func TrailingZeros16(n uint16) int { + // amd64:"BSFL","BTSL\\t\\$16" + // 386:"BSFL\t" ++ // amd64:"BSFL","ORL\\t\\$65536" + // arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR" + // 
arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t" + // s390x:"FLOGR","OR\t\\$65536" + // ppc64x/power8:"POPCNTW","ADD\t\\$-1" + // ppc64x/power9:"CNTTZD","ORIS\\t\\$1" ++ // riscv64/rva22u64,riscv64/rva23u64: "ORI\t\\$65536","CTZW" ++ // s390x:"FLOGR","OR\t\\$65536" + // wasm:"I64Ctz" + return bits.TrailingZeros16(n) + } +@@ -354,10 +363,12 @@ func TrailingZeros16(n uint16) int { + func TrailingZeros8(n uint8) int { + // amd64:"BSFL","BTSL\\t\\$8" + // 386:"BSFL" ++ // amd64:"BSFL","ORL\\t\\$256" + // arm:"ORR\t\\$256","CLZ",-"MOVBU\tR" + // arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t" + // ppc64x/power8:"POPCNTB","ADD\t\\$-1" + // ppc64x/power9:"CNTTZD","OR\t\\$256" ++ // riscv64/rva22u64,riscv64/rva23u64: "ORI\t\\$256","CTZW" + // s390x:"FLOGR","OR\t\\$256" + // wasm:"I64Ctz" + return bits.TrailingZeros8(n) +@@ -404,6 +415,7 @@ func IterateBits16(n uint16) int { + // amd64/v1,amd64/v2:"BSFL",-"BTSL" + // amd64/v3:"TZCNTL" + // arm64:"RBITW","CLZW",-"ORR" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t",-"ORR" + i += bits.TrailingZeros16(n) + n &= n - 1 + } +@@ -416,6 +428,7 @@ func IterateBits8(n uint8) int { + // amd64/v1,amd64/v2:"BSFL",-"BTSL" + // amd64/v3:"TZCNTL" + // arm64:"RBITW","CLZW",-"ORR" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t",-"ORR" + i += bits.TrailingZeros8(n) + n &= n - 1 + } +-- +2.39.5 + diff --git a/2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch b/2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch new file mode 100644 index 0000000..a92b41f --- /dev/null +++ b/2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch @@ -0,0 +1,86 @@ +From 7cd780b2c6e1b1d3b58a407b98e8efbaf25ca56a Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 116/119] cmd/compile/internal/ssagen: use an alias for + math/bits.OnesCount + +Currently, only amd64 has an intrinsic for math/bits.OnesCount, which +generates the same code as math/bits.OnesCount64. Replace this with +an alias that maps math/bits.OnesCount to math/bits.OnesCount64 on +64 bit platforms. + +Change-Id: Ifa12a2173a201aacd52c3c22b9a948be6e314405 +Reviewed-on: https://go-review.googlesource.com/c/go/+/659215 +Reviewed-by: Keith Randall +Reviewed-by: Cherry Mui +Reviewed-by: Keith Randall +Auto-Submit: Keith Randall +LUCI-TryBot-Result: Go LUCI +--- + src/cmd/compile/internal/ssagen/intrinsics.go | 5 ++--- + src/cmd/compile/internal/ssagen/intrinsics_test.go | 5 +++++ + 2 files changed, 7 insertions(+), 3 deletions(-) + +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index 3554558519..52c0db64a9 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -854,9 +854,8 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0]) + }, + sys.S390X, sys.PPC64, sys.Wasm) +- addF("math/bits", "OnesCount", +- makeOnesCountAMD64(ssa.OpPopCount64), +- sys.AMD64) ++ alias("math/bits", "OnesCount", "math/bits", "OnesCount64", p8...) 
++ + addF("math/bits", "Mul64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1]) +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index e8803f1ddf..23a0b1678f 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -260,6 +260,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"arm64", "math/bits", "Len8"}: struct{}{}, + {"arm64", "math/bits", "Mul"}: struct{}{}, + {"arm64", "math/bits", "Mul64"}: struct{}{}, ++ {"arm64", "math/bits", "OnesCount"}: struct{}{}, + {"arm64", "math/bits", "OnesCount16"}: struct{}{}, + {"arm64", "math/bits", "OnesCount32"}: struct{}{}, + {"arm64", "math/bits", "OnesCount64"}: struct{}{}, +@@ -783,6 +784,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64", "math/bits", "Len8"}: struct{}{}, + {"ppc64", "math/bits", "Mul"}: struct{}{}, + {"ppc64", "math/bits", "Mul64"}: struct{}{}, ++ {"ppc64", "math/bits", "OnesCount"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount16"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount32"}: struct{}{}, + {"ppc64", "math/bits", "OnesCount64"}: struct{}{}, +@@ -904,6 +906,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"ppc64le", "math/bits", "Len8"}: struct{}{}, + {"ppc64le", "math/bits", "Mul"}: struct{}{}, + {"ppc64le", "math/bits", "Mul64"}: struct{}{}, ++ {"ppc64le", "math/bits", "OnesCount"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount16"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount32"}: struct{}{}, + {"ppc64le", "math/bits", "OnesCount64"}: struct{}{}, +@@ -1125,6 +1128,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"s390x", "math/bits", "Len8"}: struct{}{}, + {"s390x", "math/bits", "Mul"}: struct{}{}, + {"s390x", "math/bits", "Mul64"}: struct{}{}, ++ {"s390x", "math/bits", "OnesCount"}: struct{}{}, + {"s390x", "math/bits", "OnesCount16"}: struct{}{}, + {"s390x", "math/bits", "OnesCount32"}: struct{}{}, + {"s390x", "math/bits", "OnesCount64"}: struct{}{}, +@@ -1191,6 +1195,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"wasm", "math/bits", "Len32"}: struct{}{}, + {"wasm", "math/bits", "Len64"}: struct{}{}, + {"wasm", "math/bits", "Len8"}: struct{}{}, ++ {"wasm", "math/bits", "OnesCount"}: struct{}{}, + {"wasm", "math/bits", "OnesCount16"}: struct{}{}, + {"wasm", "math/bits", "OnesCount32"}: struct{}{}, + {"wasm", "math/bits", "OnesCount64"}: struct{}{}, +-- +2.39.5 + diff --git a/2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch b/2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch new file mode 100644 index 0000000..e4d725a --- /dev/null +++ b/2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch @@ -0,0 +1,446 @@ +From 2406dc38b01afcf8c11c3a1e87f76613bc64684b Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 117/119] cmd/compile: intrinsify math/bits.Len on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For riscv64/rva22u64 and above, we can intrinsify math/bits.Len using the +CLZ/CLZW machine instructions. 
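+
+For illustration, a minimal, hypothetical snippet of the kind this helps
+(the constants are arbitrary examples; the single-instruction lowering
+assumes a build with GORISCV64=rva22u64 or above):
+
+    package main
+
+    import (
+        "fmt"
+        "math/bits"
+    )
+
+    func main() {
+        // Lowered as 64 - CLZ(x) per the new BitLen64 rule.
+        fmt.Println(bits.Len64(1 << 40)) // 41
+        // CLZW of zero is 32, so 32 - 32 gives the documented result.
+        fmt.Println(bits.Len32(0)) // 0
+    }
+
+Because CLZ/CLZW of zero return the full register or word width, the
+subtraction alone handles a zero input with no extra branches.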
+ +On a StarFive VisionFive 2 with GORISCV64=rva22u64: + + │ clz.b.1 │ clz.b.2 │ + │ sec/op │ sec/op vs base │ +LeadingZeros-4 28.89n ± 0% 12.08n ± 0% -58.19% (p=0.000 n=10) +LeadingZeros8-4 18.79n ± 0% 14.76n ± 0% -21.45% (p=0.000 n=10) +LeadingZeros16-4 25.27n ± 0% 14.76n ± 0% -41.59% (p=0.000 n=10) +LeadingZeros32-4 25.12n ± 0% 12.08n ± 0% -51.92% (p=0.000 n=10) +LeadingZeros64-4 25.89n ± 0% 12.08n ± 0% -53.35% (p=0.000 n=10) +geomean 24.55n 13.09n -46.70% + +Change-Id: I0dda684713dbdf5336af393f5ccbdae861c4f694 +Reviewed-on: https://go-review.googlesource.com/c/go/+/652321 +Reviewed-by: David Chase +Reviewed-by: Meng Zhuo +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Mark Ryan +Reviewed-by: Cherry Mui +--- + src/cmd/compile/internal/riscv64/ssa.go | 2 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 6 ++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 2 + + src/cmd/compile/internal/ssa/opGen.go | 28 +++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 74 +++++++++++++++++++ + src/cmd/compile/internal/ssagen/intrinsics.go | 24 ++++++ + .../internal/ssagen/intrinsics_test.go | 7 ++ + test/codegen/mathbits.go | 45 ++++++++--- + 8 files changed, 175 insertions(+), 13 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index ba982a13cd..1f0880f80b 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -419,7 +419,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVXS, ssa.OpRISCV64FMVDX, ssa.OpRISCV64FMVXD, + ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS, + ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD, +- ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW: ++ ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 72e4e8d7b3..36c9b53eef 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -225,6 +225,12 @@ + (Ctz16 x) => (CTZW (ORI [1<<16] x)) + (Ctz8 x) => (CTZW (ORI [1<<8] x)) + ++// Bit length (note that these will only be emitted for rva22u64 and above). ++(BitLen64 x) => (SUB (MOVDconst [64]) (CLZ x)) ++(BitLen32 x) => (SUB (MOVDconst [32]) (CLZW x)) ++(BitLen16 x) => (BitLen64 (ZeroExt16to64 x)) ++(BitLen8 x) => (BitLen64 (ZeroExt8to64 x)) ++ + (Less64 ...) => (SLT ...) 
+ (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) + (Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y)) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index f62bce8980..b411766354 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -229,6 +229,8 @@ func init() { + {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 + {name: "ANDN", argLength: 2, reg: gp21, asm: "ANDN"}, // ^arg0 & arg1 + {name: "ANDI", argLength: 1, reg: gp11, asm: "ANDI", aux: "Int64"}, // arg0 & auxint ++ {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zeros ++ {name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // count leading zeros of least significant word + {name: "CTZ", argLength: 1, reg: gp11, asm: "CTZ"}, // count trailing zeros + {name: "CTZW", argLength: 1, reg: gp11, asm: "CTZW"}, // count trailing zeros of least significant word + {name: "NOT", argLength: 1, reg: gp11, asm: "NOT"}, // ^arg0 +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 6afa1662c3..28bd5a34bf 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2387,6 +2387,8 @@ const ( + OpRISCV64AND + OpRISCV64ANDN + OpRISCV64ANDI ++ OpRISCV64CLZ ++ OpRISCV64CLZW + OpRISCV64CTZ + OpRISCV64CTZW + OpRISCV64NOT +@@ -32043,6 +32045,32 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "CLZ", ++ argLen: 1, ++ asm: riscv.ACLZ, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, ++ { ++ name: "CLZW", ++ argLen: 1, ++ asm: riscv.ACLZW, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "CTZ", + argLen: 1, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 85b6a05d7c..474deeede6 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -104,6 +104,14 @@ func rewriteValueRISCV64(v *Value) bool { + return true + case OpAvg64u: + return rewriteValueRISCV64_OpAvg64u(v) ++ case OpBitLen16: ++ return rewriteValueRISCV64_OpBitLen16(v) ++ case OpBitLen32: ++ return rewriteValueRISCV64_OpBitLen32(v) ++ case OpBitLen64: ++ return rewriteValueRISCV64_OpBitLen64(v) ++ case OpBitLen8: ++ return rewriteValueRISCV64_OpBitLen8(v) + case OpClosureCall: + v.Op = OpRISCV64CALLclosure + return true +@@ -940,6 +948,72 @@ func rewriteValueRISCV64_OpAvg64u(v *Value) bool { + return true + } + } ++func rewriteValueRISCV64_OpBitLen16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen16 x) ++ // result: (BitLen64 (ZeroExt16to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpBitLen32(v *Value) bool { ++ v_0 
:= v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen32 x) ++ // result: (SUB (MOVDconst [32]) (CLZW x)) ++ for { ++ t := v.Type ++ x := v_0 ++ v.reset(OpRISCV64SUB) ++ v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(32) ++ v1 := b.NewValue0(v.Pos, OpRISCV64CLZW, t) ++ v1.AddArg(x) ++ v.AddArg2(v0, v1) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpBitLen64(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen64 x) ++ // result: (SUB (MOVDconst [64]) (CLZ x)) ++ for { ++ t := v.Type ++ x := v_0 ++ v.reset(OpRISCV64SUB) ++ v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(64) ++ v1 := b.NewValue0(v.Pos, OpRISCV64CLZ, t) ++ v1.AddArg(x) ++ v.AddArg2(v0, v1) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpBitLen8(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (BitLen8 x) ++ // result: (BitLen64 (ZeroExt8to64 x)) ++ for { ++ x := v_0 ++ v.reset(OpBitLen64) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueRISCV64_OpConst16(v *Value) bool { + // match: (Const16 [val]) + // result: (MOVDconst [int64(val)]) +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go +index 52c0db64a9..0888ef27d5 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -742,6 +742,30 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) + }, + sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm) ++ ++ if cfg.goriscv64 >= 22 { ++ addF("math/bits", "Len64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "Len32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "Len16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ addF("math/bits", "Len8", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) ++ }, ++ sys.RISCV64) ++ } ++ + alias("math/bits", "Len", "math/bits", "Len64", p8...) + alias("math/bits", "Len", "math/bits", "Len32", p4...) 
+ +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index 23a0b1678f..c31a6ee609 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -1002,6 +1002,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "internal/runtime/math", "Add64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "Len64"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "Len8"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "TrailingZeros32"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "TrailingZeros64"}: struct{}{}, +@@ -1012,6 +1014,11 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "math/big", "mulWW"}: struct{}{}, + {"riscv64", "math/bits", "Add"}: struct{}{}, + {"riscv64", "math/bits", "Add64"}: struct{}{}, ++ {"riscv64", "math/bits", "Len"}: struct{}{}, ++ {"riscv64", "math/bits", "Len16"}: struct{}{}, ++ {"riscv64", "math/bits", "Len32"}: struct{}{}, ++ {"riscv64", "math/bits", "Len64"}: struct{}{}, ++ {"riscv64", "math/bits", "Len8"}: struct{}{}, + {"riscv64", "math/bits", "Mul"}: struct{}{}, + {"riscv64", "math/bits", "Mul64"}: struct{}{}, + {"riscv64", "math/bits", "RotateLeft"}: struct{}{}, +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index 97926c4ddc..45048f86eb 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -18,8 +18,10 @@ func LeadingZeros(n uint) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"SUB" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros(n) + } + +@@ -29,8 +31,10 @@ func LeadingZeros64(n uint64) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"ADDI" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros64(n) + } + +@@ -40,8 +44,10 @@ func LeadingZeros32(n uint32) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZW" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZW" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZW",-"ADDI" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros32(n) + } + +@@ -51,8 +57,10 @@ func LeadingZeros16(n uint16) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-48",-"NEG" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros16(n) + } + +@@ -62,8 +70,10 @@ func LeadingZeros8(n uint8) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-56",-"NEG" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.LeadingZeros8(n) + } + +@@ -77,8 +87,10 @@ func Len(n uint) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"SUBC","CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len(n) + } + +@@ -88,13 +100,16 @@ func Len64(n uint64) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // 
ppc64x:"SUBC","CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len64(n) + } + + func SubFromLen64(n uint64) int { + // ppc64x:"CNTLZD",-"SUBC" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"ADDI",-"NEG" + return 64 - bits.Len64(n) + } + +@@ -104,8 +119,10 @@ func Len32(n uint32) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x: "CNTLZW" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZW","ADDI\t\\$-32" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len32(n) + } + +@@ -115,8 +132,10 @@ func Len16(n uint16) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"SUBC","CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len16(n) + } + +@@ -126,8 +145,10 @@ func Len8(n uint8) int { + // s390x:"FLOGR" + // arm:"CLZ" arm64:"CLZ" + // mips:"CLZ" +- // wasm:"I64Clz" + // ppc64x:"SUBC","CNTLZD" ++ // riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64" ++ // s390x:"FLOGR" ++ // wasm:"I64Clz" + return bits.Len8(n) + } + +@@ -348,7 +369,6 @@ func TrailingZeros32(n uint32) int { + func TrailingZeros16(n uint16) int { + // amd64:"BSFL","BTSL\\t\\$16" + // 386:"BSFL\t" +- // amd64:"BSFL","ORL\\t\\$65536" + // arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR" + // arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t" + // s390x:"FLOGR","OR\t\\$65536" +@@ -363,7 +383,6 @@ func TrailingZeros16(n uint16) int { + func TrailingZeros8(n uint8) int { + // amd64:"BSFL","BTSL\\t\\$8" + // 386:"BSFL" +- // amd64:"BSFL","ORL\\t\\$256" + // arm:"ORR\t\\$256","CLZ",-"MOVBU\tR" + // arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t" + // ppc64x/power8:"POPCNTB","ADD\t\\$-1" +@@ -392,6 +411,7 @@ func IterateBits64(n uint64) int { + for n != 0 { + // amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ" + // amd64/v3:"TZCNTQ" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" + i += bits.TrailingZeros64(n) + n &= n - 1 + } +@@ -403,6 +423,7 @@ func IterateBits32(n uint32) int { + for n != 0 { + // amd64/v1,amd64/v2:"BSFL",-"BTSQ" + // amd64/v3:"TZCNTL" ++ // riscv64/rva22u64,riscv64/rva23u64: "CTZ\t" + i += bits.TrailingZeros32(n) + n &= n - 1 + } +-- +2.39.5 + diff --git a/2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch b/2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch new file mode 100644 index 0000000..01733ef --- /dev/null +++ b/2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch @@ -0,0 +1,331 @@ +From 4f4047a3396eeedb4e6972e7ac23073706bd9f57 Mon Sep 17 00:00:00 2001 +From: Joel Sing +Date: Fri, 26 Sep 2025 17:48:22 +0800 +Subject: [PATCH 118/119] cmd/compile: intrinsify math/bits.Bswap on riscv64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For riscv64/rva22u64 and above, we can intrinsify math/bits.Bswap +using the REV8 machine instruction. 
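+
+As a sketch of the user-visible effect, a minimal, hypothetical snippet
+(the constants are arbitrary examples; the REV8 lowering assumes a build
+with GORISCV64=rva22u64 or above):
+
+    package main
+
+    import (
+        "fmt"
+        "math/bits"
+    )
+
+    func main() {
+        // REV8 alone: all eight bytes of the register are reversed.
+        fmt.Printf("%#x\n", bits.ReverseBytes64(0x1122334455667788)) // 0x8877665544332211
+        // REV8 then SRLI $32 per the new Bswap32 rule.
+        fmt.Printf("%#x\n", bits.ReverseBytes32(0x11223344)) // 0x44332211
+    }
+
+The 32- and 16-bit forms shift the reversed register right by 32 and 48
+bits respectively, keeping only the bytes that belonged to the original
+narrow value.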
+ +On a StarFive VisionFive 2 with GORISCV64=rva22u64: + + │ rb.1 │ rb.2 │ + │ sec/op │ sec/op vs base │ +ReverseBytes-4 18.790n ± 0% 4.026n ± 0% -78.57% (p=0.000 n=10) +ReverseBytes16-4 6.710n ± 0% 5.368n ± 0% -20.00% (p=0.000 n=10) +ReverseBytes32-4 13.420n ± 0% 5.368n ± 0% -60.00% (p=0.000 n=10) +ReverseBytes64-4 17.450n ± 0% 4.026n ± 0% -76.93% (p=0.000 n=10) +geomean 13.11n 4.649n -64.54% + +Change-Id: I26eee34270b1721f7304bb1cddb0fda129b20ece +Reviewed-on: https://go-review.googlesource.com/c/go/+/660855 +Reviewed-by: Mark Ryan +LUCI-TryBot-Result: Go LUCI +Reviewed-by: Meng Zhuo +Reviewed-by: Carlos Amedee +Reviewed-by: Junyang Shao +--- + src/cmd/compile/internal/riscv64/ssa.go | 3 +- + .../compile/internal/ssa/_gen/RISCV64.rules | 5 ++ + .../compile/internal/ssa/_gen/RISCV64Ops.go | 1 + + src/cmd/compile/internal/ssa/opGen.go | 14 ++++++ + .../compile/internal/ssa/rewriteRISCV64.go | 39 +++++++++++++++ + src/cmd/compile/internal/ssagen/intrinsics.go | 50 +++++++++++++++---- + .../internal/ssagen/intrinsics_test.go | 5 ++ + test/codegen/mathbits.go | 13 +++-- + 8 files changed, 111 insertions(+), 19 deletions(-) + +diff --git a/src/cmd/compile/internal/riscv64/ssa.go b/src/cmd/compile/internal/riscv64/ssa.go +index 1f0880f80b..96b0e605e8 100644 +--- a/src/cmd/compile/internal/riscv64/ssa.go ++++ b/src/cmd/compile/internal/riscv64/ssa.go +@@ -419,7 +419,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVXS, ssa.OpRISCV64FMVDX, ssa.OpRISCV64FMVXD, + ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS, + ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD, +- ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW: ++ ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW, ++ ssa.OpRISCV64REV8: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +index 36c9b53eef..93c4e790f8 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64.rules +@@ -231,6 +231,11 @@ + (BitLen16 x) => (BitLen64 (ZeroExt16to64 x)) + (BitLen8 x) => (BitLen64 (ZeroExt8to64 x)) + ++// Byte swap (note that these will only be emitted for rva22u64 and above). ++(Bswap64 ...) => (REV8 ...) ++(Bswap32 x) => (SRLI [32] (REV8 x)) ++(Bswap16 x) => (SRLI [48] (REV8 x)) ++ + (Less64 ...) => (SLT ...) 
+ (Less32 x y) => (SLT (SignExt32to64 x) (SignExt32to64 y)) + (Less16 x y) => (SLT (SignExt16to64 x) (SignExt16to64 y)) +diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +index b411766354..0a46dc7d5b 100644 +--- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +@@ -237,6 +237,7 @@ func init() { + {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1 + {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // ^arg0 | arg1 + {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"}, // arg0 | auxint ++ {name: "REV8", argLength: 1, reg: gp11, asm: "REV8"}, // reverse bytes + {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"}, // rotate left arg0 by (arg1 & 63) + {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"}, // rotate left least significant word of arg0 by (arg1 & 31), sign extended + {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"}, // rotate right arg0 by (arg1 & 63) +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index 28bd5a34bf..f906f86ec1 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -2395,6 +2395,7 @@ const ( + OpRISCV64OR + OpRISCV64ORN + OpRISCV64ORI ++ OpRISCV64REV8 + OpRISCV64ROL + OpRISCV64ROLW + OpRISCV64ROR +@@ -32153,6 +32154,19 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "REV8", ++ argLen: 1, ++ asm: riscv.AREV8, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ outputs: []outputInfo{ ++ {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30 ++ }, ++ }, ++ }, + { + name: "ROL", + argLen: 2, +diff --git a/src/cmd/compile/internal/ssa/rewriteRISCV64.go b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +index 474deeede6..5a0c9b70c4 100644 +--- a/src/cmd/compile/internal/ssa/rewriteRISCV64.go ++++ b/src/cmd/compile/internal/ssa/rewriteRISCV64.go +@@ -112,6 +112,13 @@ func rewriteValueRISCV64(v *Value) bool { + return rewriteValueRISCV64_OpBitLen64(v) + case OpBitLen8: + return rewriteValueRISCV64_OpBitLen8(v) ++ case OpBswap16: ++ return rewriteValueRISCV64_OpBswap16(v) ++ case OpBswap32: ++ return rewriteValueRISCV64_OpBswap32(v) ++ case OpBswap64: ++ v.Op = OpRISCV64REV8 ++ return true + case OpClosureCall: + v.Op = OpRISCV64CALLclosure + return true +@@ -1014,6 +1021,38 @@ func rewriteValueRISCV64_OpBitLen8(v *Value) bool { + return true + } + } ++func rewriteValueRISCV64_OpBswap16(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Bswap16 x) ++ // result: (SRLI [48] (REV8 x)) ++ for { ++ t := v.Type ++ x := v_0 ++ v.reset(OpRISCV64SRLI) ++ v.AuxInt = int64ToAuxInt(48) ++ v0 := b.NewValue0(v.Pos, OpRISCV64REV8, t) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} ++func rewriteValueRISCV64_OpBswap32(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ // match: (Bswap32 x) ++ // result: (SRLI [32] (REV8 x)) ++ for { ++ t := v.Type ++ x := v_0 ++ v.reset(OpRISCV64SRLI) ++ v.AuxInt = int64ToAuxInt(32) ++ v0 := b.NewValue0(v.Pos, OpRISCV64REV8, t) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++} + func rewriteValueRISCV64_OpConst16(v *Value) bool { + // match: (Const16 [val]) + // result: (MOVDconst [int64(val)]) +diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go 
b/src/cmd/compile/internal/ssagen/intrinsics.go +index 0888ef27d5..019b76195d 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics.go +@@ -178,23 +178,44 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + }, + sys.ARM64, sys.PPC64, sys.RISCV64) + +- brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X} +- if cfg.goppc64 >= 10 { +- // Use only on Power10 as the new byte reverse instructions that Power10 provide +- // make it worthwhile as an intrinsic +- brev_arch = append(brev_arch, sys.PPC64) +- } +- /******** runtime/internal/sys ********/ + addF("runtime/internal/sys", "Bswap32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) + }, +- brev_arch...) ++ sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X) + addF("runtime/internal/sys", "Bswap64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) + }, +- brev_arch...) ++ sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X) ++ ++ if cfg.goppc64 >= 10 { ++ // Use only on Power10 as the new byte reverse instructions that Power10 provide ++ // make it worthwhile as an intrinsic ++ addF("runtime/internal/sys", "Bswap32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) ++ }, ++ sys.PPC64) ++ addF("runtime/internal/sys", "Bswap64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) ++ }, ++ sys.PPC64) ++ } ++ ++ if cfg.goriscv64 >= 22 { ++ addF("runtime/internal/sys", "Bswap32", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) ++ }, ++ sys.RISCV64) ++ addF("runtime/internal/sys", "Bswap64", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) ++ }, ++ sys.RISCV64) ++ } + + /****** Prefetch ******/ + makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +@@ -709,18 +730,25 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { + sys.RISCV64) + } + ++ // ReverseBytes inlines correctly, no need to intrinsify it. + alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) + alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) +- // ReverseBytes inlines correctly, no need to intrinsify it. 
+ // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate +- // On Power10, 16-bit rotate is not available so use BRH instruction + if cfg.goppc64 >= 10 { ++ // On Power10, 16-bit rotate is not available so use BRH instruction + addF("math/bits", "ReverseBytes16", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) + }, + sys.PPC64) + } ++ if cfg.goriscv64 >= 22 { ++ addF("math/bits", "ReverseBytes16", ++ func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { ++ return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0]) ++ }, ++ sys.RISCV64) ++ } + + addF("math/bits", "Len64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go +index c31a6ee609..3062c4e489 100644 +--- a/src/cmd/compile/internal/ssagen/intrinsics_test.go ++++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go +@@ -1002,6 +1002,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "internal/runtime/math", "Add64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "Mul64"}: struct{}{}, + {"riscv64", "internal/runtime/math", "MulUintptr"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "Bswap32"}: struct{}{}, ++ {"riscv64", "runtime/internal/sys", "Bswap64"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "Len64"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "Len8"}: struct{}{}, + {"riscv64", "runtime/internal/sys", "TrailingZeros8"}: struct{}{}, +@@ -1021,6 +1023,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ + {"riscv64", "math/bits", "Len8"}: struct{}{}, + {"riscv64", "math/bits", "Mul"}: struct{}{}, + {"riscv64", "math/bits", "Mul64"}: struct{}{}, ++ {"riscv64", "math/bits", "ReverseBytes16"}: struct{}{}, ++ {"riscv64", "math/bits", "ReverseBytes32"}: struct{}{}, ++ {"riscv64", "math/bits", "ReverseBytes64"}: struct{}{}, + {"riscv64", "math/bits", "RotateLeft"}: struct{}{}, + {"riscv64", "math/bits", "RotateLeft16"}: struct{}{}, + {"riscv64", "math/bits", "RotateLeft32"}: struct{}{}, +diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go +index 45048f86eb..4ddb87a980 100644 +--- a/test/codegen/mathbits.go ++++ b/test/codegen/mathbits.go +@@ -209,38 +209,37 @@ func OnesCount8(n uint8) int { + // ----------------------- // + + func ReverseBytes(n uint) uint { +- // amd64:"BSWAPQ" + // 386:"BSWAPL" +- // s390x:"MOVDBR" ++ // amd64:"BSWAPQ" + // arm64:"REV" + return bits.ReverseBytes(n) + } + + func ReverseBytes64(n uint64) uint64 { +- // amd64:"BSWAPQ" + // 386:"BSWAPL" +- // s390x:"MOVDBR" ++ // amd64:"BSWAPQ" + // arm64:"REV" + // ppc64x/power10: "BRD" + return bits.ReverseBytes64(n) + } + + func ReverseBytes32(n uint32) uint32 { +- // amd64:"BSWAPL" + // 386:"BSWAPL" +- // s390x:"MOVWBR" ++ // amd64:"BSWAPL" + // arm64:"REVW" + // ppc64x/power10: "BRW" ++ // riscv64/rva22u64,riscv64/rva23u64:"REV8","SRLI\t\\$32" ++ // s390x:"MOVWBR" + return bits.ReverseBytes32(n) + } + + func ReverseBytes16(n uint16) uint16 { + // amd64:"ROLW" +- // arm64:"REV16W",-"UBFX",-"ORR" + // arm/5:"SLL","SRL","ORR" + // arm/6:"REV16" + // arm/7:"REV16" + // ppc64x/power10: "BRH" ++ // riscv64/rva22u64,riscv64/rva23u64:"REV8","SRLI\t\\$48" + return bits.ReverseBytes16(n) + } + +-- +2.39.5 + diff --git a/golang.spec b/golang.spec index aa468c1..946155f 100644 --- a/golang.spec +++ b/golang.spec @@ -66,7 +66,7 @@ Name: golang Version: 
1.21.4 -Release: 37 +Release: 38 Summary: The Go Programming Language License: BSD and Public Domain URL: https://golang.org/ @@ -164,6 +164,130 @@ Patch8002: 8002-cmd-go-Use-AI-to-guide-optimization.patch Patch8003: 8003-internal-buildcfg-add-Kunpeng-atomic-optimize.patch Patch8004: 8004-runtime-add-gcRatio-option.patch +# Part 2000 ~ 2119 +# RISC-V RVA23 support backport +%ifarch riscv64 +Patch2000: 2000-cmd-asm-cmd-internal-obj-riscv-cmd-link-improve-TLS-.patch +Patch2001: 2001-cmd-compile-fold-most-repetitive-operations-to-simpl.patch +Patch2002: 2002-crypto-internal-bigmod-provide-assembly-addMulVVW-fo.patch +Patch2003: 2003-cmd-compile-sign-or-zero-extend-for-32-bit-equality-.patch +Patch2004: 2004-cmd-compile-improve-FP-FMA-performance-on-riscv64.patch +Patch2005: 2005-cmd-compile-add-single-precision-FMA-code-generation.patch +Patch2006: 2006-NOT-FULL-BACKPORT-cmd-internal-obj-riscv-cmd-link-ad.patch +Patch2007: 2007-cmd-internal-obj-riscv-clean-up-error-checking-for-e.patch +Patch2008: 2008-cmd-internal-obj-riscv-correct-message-in-regVal-pan.patch +Patch2009: 2009-cmd-internal-obj-riscv-simplify-instructionsForMOV.patch +Patch2010: 2010-internal-cpu-fix-wrong-cache-line-size-of-riscv64.patch +Patch2011: 2011-cmd-internal-obj-riscv-clean-up-immediate-checking.patch +Patch2012: 2012-cmd-compile-internal-intrinsify-publicationBarrier-o.patch +Patch2013: 2013-cmd-compile-internal-stop-lowering-OpConvert-on-risc.patch +Patch2014: 2014-cmd-compile-optimize-right-shifts-of-uint32-on-riscv.patch +Patch2015: 2015-cmd-link-internal-ld-assign-temporary-addresses-to-p.patch +Patch2016: 2016-cmd-compile-optimize-right-shifts-of-int32-on-riscv6.patch +Patch2017: 2017-cmd-internal-obj-riscv-support-subtraction-with-a-co.patch +Patch2018: 2018-cmd-internal-obj-riscv-fix-the-offset-of-JALR-transf.patch +Patch2019: 2019-cmd-internal-obj-riscv-improve-handling-of-invalid-a.patch +Patch2020: 2020-all-clean-up-addition-of-constants-in-riscv64-assemb.patch +Patch2021: 2021-cmd-internal-obj-riscv-add-support-of-PCALIGN-direct.patch +Patch2022: 2022-internal-bytealg-optimize-Count-with-PCALIGN-in-risc.patch +Patch2023: 2023-cmd-compile-correct-code-generation-for-right-shifts.patch +Patch2024: 2024-crypto-sha512-provide-optimised-assembly-for-riscv64.patch +Patch2025: 2025-cmd-go-add-GORISCV64-environment-variable.patch +Patch2026: 2026-cmd-compile-implement-float-min-max-in-hardware-for-.patch +Patch2027: 2027-cmd-compile-implement-float-min-max-in-hardware-for-.patch +Patch2028: 2028-cmd-compile-improve-rotations-for-riscv64.patch +Patch2029: 2029-cmd-asm-cmd-internal-obj-enable-rounding-mode-suffix.patch +Patch2030: 2030-math-add-round-assembly-implementations-on-riscv64.patch +Patch2031: 2031-cmd-link-internal-riscv64-generate-local-text-symbol.patch +Patch2032: 2032-cmd-compile-cmd-internal-obj-provide-rotation-pseudo.patch +Patch2033: 2033-cmd-internal-obj-support-Zba-Zbb-Zbs-extensions-in-r.patch +Patch2034: 2034-cmd-internal-obj-riscv-improve-register-MOVB-MOVH-MO.patch +Patch2035: 2035-cmd-internal-obj-riscv-use-native-rotation-instructi.patch +Patch2036: 2036-cmd-internal-obj-riscv-check-immediate-for-rotation-.patch +Patch2037: 2037-test-codegen-add-Mul-test-for-riscv64.patch +Patch2038: 2038-math-remove-riscv64-assembly-implementations-of-roun.patch +Patch2039: 2039-cmd-compile-drop-TODO-in-NilCheck-for-riscv64.patch +Patch2040: 2040-math-big-implement-addVV-in-riscv64-assembly.patch +Patch2041: 2041-math-big-implement-subVV-in-riscv64-assembly.patch +Patch2042: 
2042-cmd-compile-use-integer-min-max-instructions-on-risc.patch +Patch2043: 2043-math-big-implement-addVW-in-riscv64-assembly.patch +Patch2044: 2044-math-big-implement-subVW-in-riscv64-assembly.patch +Patch2045: 2045-crypto-sha256-provide-optimised-assembly-for-riscv64.patch +Patch2046: 2046-math-big-implement-mulAddVWW-in-riscv64-assembly.patch +Patch2047: 2047-math-big-implement-addMulVVW-in-riscv64-assembly.patch +Patch2048: 2048-test-codegen-add-initial-codegen-tests-for-integer-m.patch +Patch2049: 2049-cmd-compile-internal-ssa-combine-shift-and-addition-.patch +Patch2050: 2050-math-add-round-assembly-implementations-on-riscv64.patch +Patch2051: 2051-test-codegen-add-Rotate-test-for-riscv64.patch +Patch2052: 2052-runtime-add-asm_riscv64.h.patch +Patch2053: 2053-cmd-compile-cmd-internal-obj-riscv-always-provide-AN.patch +Patch2054: 2054-crypto-md5-provide-optimised-assembly-for-riscv64.patch +Patch2055: 2055-cmd-internal-obj-riscv-rename-the-iIEncoding.patch +Patch2056: 2056-cmd-internal-obj-riscv-add-vector-instruction-encodi.patch +Patch2057: 2057-cmd-internal-obj-cmd-asm-add-vector-registers-to-ris.patch +Patch2058: 2058-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch +Patch2059: 2059-cmd-internal-obj-add-prologue_end-DWARF-stmt-for-ris.patch +Patch2060: 2060-cmd-internal-obj-riscv-update-RISC-V-instruction-tab.patch +Patch2061: 2061-crypto-sha512-improve-performance-of-riscv64-assembl.patch +Patch2062: 2062-internal-bytealg-optimize-IndexByte-for-riscv64.patch +Patch2063: 2063-cmd-internal-obj-riscv-rework-instruction-encoding-i.patch +Patch2064: 2064-cpu-internal-provide-runtime-detection-of-RISC-V-ext.patch +Patch2065: 2065-cmd-go-add-rva23u64-as-a-valid-value-for-GORISCV64.patch +Patch2066: 2066-cmd-internal-obj-riscv-update-references-to-RISC-V-s.patch +Patch2067: 2067-cmd-compile-don-t-merge-symbols-on-riscv64-when-dyna.patch +Patch2068: 2068-cmd-internal-obj-riscv-support-MOVD-with-floating-po.patch +Patch2069: 2069-cmd-asm-cmd-internal-obj-riscv-implement-vector-conf.patch +Patch2070: 2070-internal-bytealg-clean-up-and-simplify-the-riscv64-e.patch +Patch2071: 2071-bytes-internal-bytealg-eliminate-HashStrBytes-HashSt.patch +Patch2072: 2072-cmd-internal-obj-riscv-implement-vector-load-store-i.patch +Patch2073: 2073-cmd-internal-obj-riscv-add-riscv64-CSR-map.patch +Patch2074: 2074-test-codegen-tighten-the-TrailingZeros64-test-on-386.patch +Patch2075: 2075-test-codegen-add-riscv64-codegen-for-arithmetic-test.patch +Patch2076: 2076-test-codegen-add-riscv64-rva23u64-specifiers-to-exis.patch +Patch2077: 2077-test-codegen-add-a-test-for-negation-and-conversion-.patch +Patch2078: 2078-cmd-compile-combine-negation-and-word-sign-extension.patch +Patch2079: 2079-cmd-compile-internal-ssa-remove-double-negation-with.patch +Patch2080: 2080-cmd-internal-obj-riscv-prevent-duplicate-error-repor.patch +Patch2081: 2081-cmd-internal-obj-riscv-prevent-panics-on-bad-branche.patch +Patch2082: 2082-cmd-internal-obj-riscv-fix-the-encoding-for-REV8-and.patch +Patch2083: 2083-cmd-internal-obj-riscv-factor-out-shift-constant-cod.patch +Patch2084: 2084-cmd-asm-internal-asm-add-additional-tests-for-consta.patch +Patch2085: 2085-test-codegen-add-combined-conversion-and-shift-tests.patch +Patch2086: 2086-cmd-internal-obj-riscv-internal-bytealg-synthesize-M.patch +Patch2087: 2087-cmd-internal-obj-riscv-improve-constant-construction.patch +Patch2088: 2088-cmd-compile-internal-ssa-optimise-more-branches-with.patch +Patch2089: 2089-cmd-internal-obj-riscv-add-support-for-vector-intege.patch +Patch2090: 
2090-cmd-internal-obj-riscv-add-support-for-vector-fixed-.patch +Patch2091: 2091-crypto-sha512-remove-unnecessary-move-op-replace-wit.patch +Patch2092: 2092-crypto-sha256-improve-performance-of-riscv64-assembl.patch +Patch2093: 2093-cmd-link-fix-cgo-on-riscv64-when-building-with-gcc-1.patch +Patch2094: 2094-internal-bytealg-deduplicate-code-between-Count-Coun.patch +Patch2095: 2095-cmd-internal-obj-riscv-add-support-for-vector-floati.patch +Patch2096: 2096-cmd-internal-obj-riscv-add-support-for-vector-reduct.patch +Patch2097: 2097-cmd-internal-obj-riscv-add-support-for-vector-mask-i.patch +Patch2098: 2098-cmd-internal-obj-riscv-add-support-for-vector-permut.patch +Patch2099: 2099-internal-bytealg-vector-implementation-of-equal-for-.patch +Patch2100: 2100-internal-bytealg-vector-implementation-of-indexbyte-.patch +Patch2101: 2101-cmd-internal-obj-riscv-reject-invalid-vadc-vsbc-enco.patch +Patch2102: 2102-cmd-internal-obj-riscv-fix-LMUL-encoding-for-MF2-and.patch +Patch2103: 2103-cmd-compile-add-generic-simplifications-on-riscv64.patch +Patch2104: 2104-cmd-internal-obj-riscv-fix-vector-integer-multiply-a.patch +Patch2105: 2105-cmd-compile-optimise-float-int-register-moves-on-ris.patch +Patch2106: 2106-internal-bytealg-vector-implementation-of-compare-fo.patch +Patch2107: 2107-cmd-compile-internal-ssagen-improve-intrinsic-archit.patch +Patch2108: 2108-cmd-compile-internal-ssagen-factor-out-intrinsics-co.patch +Patch2109: 2109-cmd-compile-internal-ssagen-add-initial-test-coverag.patch +Patch2110: 2110-cmd-dist-internal-add-GOARM64-environment-variable.patch +Patch2111: 2111-cmd-compile-internal-ssagen-provide-intrinsicBuilder.patch +Patch2112: 2112-cmd-compile-internal-ssagen-improve-intrinsic-test.patch +Patch2113: 2113-cmd-compile-simplify-intrinsification-of-BitLen16-an.patch +Patch2114: 2114-cmd-compile-simplify-intrinsification-of-TrailingZer.patch +Patch2115: 2115-cmd-compile-intrinsify-math-bits.TrailingZeros-on-ri.patch +Patch2116: 2116-cmd-compile-internal-ssagen-use-an-alias-for-math-bi.patch +Patch2117: 2117-cmd-compile-intrinsify-math-bits.Len-on-riscv64.patch +Patch2118: 2118-cmd-compile-intrinsify-math-bits.Bswap-on-riscv64.patch +%endif + ExclusiveArch: %{golang_arches} %description @@ -401,6 +525,12 @@ fi %files devel -f go-tests.list -f go-misc.list -f go-src.list %changelog +* Fri Sep 26 2025 Julian Zhu - 1.21.4-38 +- Type:Feature +- CVE:NA +- SUG:NA +- DESC: Backport RISC-V RVA23 support for RISC-V 64 + * Mon Sep 15 2025 songliyang - 1.21.4-37 - Type:CVE - CVE:CVE-2025-22871 -- Gitee
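
For reference, below is a minimal, illustrative Go sketch (not part of the patch series, and deliberately kept outside the diffs above) of the math/bits calls that Patch2118 intrinsifies on riscv64. Per the RISCV64.rules and test/codegen/mathbits.go hunks in that patch, with GORISCV64=rva22u64 or rva23u64 these calls are expected to lower to REV8, plus SRLI $32 / SRLI $48 for the 32- and 16-bit variants; the package layout and output formatting here are arbitrary choices for the example.

// Illustrative only; assumes a toolchain built with these backports and
// GORISCV64=rva22u64 (or rva23u64).
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// Bswap64 lowers to a single REV8 (rule: (Bswap64 ...) => (REV8 ...)).
	fmt.Printf("%#016x\n", bits.ReverseBytes64(0x0102030405060708))
	// Bswap32 lowers to REV8 followed by SRLI $32 (rule: (Bswap32 x) => (SRLI [32] (REV8 x))).
	fmt.Printf("%#08x\n", bits.ReverseBytes32(0x01020304))
	// Bswap16 lowers to REV8 followed by SRLI $48 (rule: (Bswap16 x) => (SRLI [48] (REV8 x))).
	fmt.Printf("%#04x\n", bits.ReverseBytes16(0x0102))
}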