diff --git a/dist b/dist new file mode 100644 index 0000000000000000000000000000000000000000..9c0e36ec42a2d9bfefacb21ac6354c9ddd910533 --- /dev/null +++ b/dist @@ -0,0 +1 @@ +an8 diff --git a/openblas-0.3.15-opt-loongarch64.patch b/openblas-0.3.15-opt-loongarch64.patch deleted file mode 100644 index d83710a33156f4582a0ebecc30557058441ee54e..0000000000000000000000000000000000000000 --- a/openblas-0.3.15-opt-loongarch64.patch +++ /dev/null @@ -1,31426 +0,0 @@ -From b844ee9613bf88f20bbd78ff1fbad29740c99ba2 Mon Sep 17 00:00:00 2001 -From: Shiyou Yin -Date: Wed, 7 Sep 2022 18:06:33 +0800 -Subject: [PATCH] Add support for LoongArch64. - -include latest commit(fbfe1daf6ea71a50bc36cb29d0d27e0359926ef7) in branch develop. ---- - Makefile.loongarch64 | 3 + - Makefile.system | 14 + - TargetList.txt | 4 + - c_check | 53 +- - cmake/arch.cmake | 4 + - cmake/cc.cmake | 9 + - cmake/fc.cmake | 7 + - cmake/system_check.cmake | 4 +- - common.h | 6 +- - common_loongarch64.h | 199 ++ - common_macro.h | 3 +- - cpuid_loongarch64.c | 143 + - ctest.c | 4 + - driver/others/Makefile | 8 + - driver/others/dynamic_loongarch64.c | 128 + - getarch.c | 58 +- - kernel/loongarch64/KERNEL | 238 ++ - kernel/loongarch64/KERNEL.LOONGSON3R5 | 14 + - kernel/loongarch64/KERNEL.generic | 167 + - kernel/loongarch64/Makefile | 1 + - kernel/loongarch64/amax.S | 230 ++ - kernel/loongarch64/amin.S | 186 ++ - kernel/loongarch64/asum.S | 232 ++ - kernel/loongarch64/cnrm2.S | 159 + - kernel/loongarch64/copy.S | 225 ++ - kernel/loongarch64/dgemm_kernel_16x4.S | 4250 ++++++++++++++++++++++++ - kernel/loongarch64/dgemm_ncopy_16.S | 691 ++++ - kernel/loongarch64/dgemm_ncopy_4.S | 237 ++ - kernel/loongarch64/dgemm_tcopy_16.S | 710 ++++ - kernel/loongarch64/dgemm_tcopy_4.S | 270 ++ - kernel/loongarch64/dnrm2.S | 324 ++ - kernel/loongarch64/dot.S | 391 +++ - kernel/loongarch64/gemm_kernel.S | 1859 +++++++++++ - kernel/loongarch64/gemv_n.S | 531 +++ - kernel/loongarch64/gemv_t.S | 436 +++ - kernel/loongarch64/iamax.S | 
233 ++ - kernel/loongarch64/iamin.S | 233 ++ - kernel/loongarch64/izamax.S | 217 ++ - kernel/loongarch64/izamin.S | 217 ++ - kernel/loongarch64/max.S | 174 + - kernel/loongarch64/min.S | 174 + - kernel/loongarch64/scal.S | 330 ++ - kernel/loongarch64/snrm2.S | 249 ++ - kernel/loongarch64/swap.S | 330 ++ - kernel/loongarch64/trsm_kernel_LN.S | 2863 ++++++++++++++++ - kernel/loongarch64/trsm_kernel_LT.S | 2854 ++++++++++++++++ - kernel/loongarch64/trsm_kernel_RT.S | 2850 ++++++++++++++++ - kernel/loongarch64/zamax.S | 190 ++ - kernel/loongarch64/zamin.S | 198 ++ - kernel/loongarch64/zasum.S | 158 + - kernel/loongarch64/zcopy.S | 217 ++ - kernel/loongarch64/zdot.S | 330 ++ - kernel/loongarch64/zgemm3m_kernel.S | 1359 ++++++++ - kernel/loongarch64/zgemm_kernel.S | 1047 ++++++ - kernel/loongarch64/zgemv_n.S | 648 ++++ - kernel/loongarch64/zgemv_t.S | 556 ++++ - kernel/loongarch64/znrm2.S | 304 ++ - kernel/loongarch64/zscal.S | 356 ++ - kernel/loongarch64/ztrsm_kernel_LT.S | 1344 ++++++++ - kernel/loongarch64/ztrsm_kernel_RT.S | 1343 ++++++++ - kernel/mips64/dnrm2.S | 9 + - kernel/setparam-ref.c | 29 + - lapack/laswp/loongarch64/Makefile | 12 + - param.h | 116 + - 64 files changed, 30708 insertions(+), 30 deletions(-) - create mode 100644 Makefile.loongarch64 - create mode 100644 common_loongarch64.h - create mode 100644 cpuid_loongarch64.c - create mode 100644 driver/others/dynamic_loongarch64.c - create mode 100644 kernel/loongarch64/KERNEL - create mode 100644 kernel/loongarch64/KERNEL.LOONGSON3R5 - create mode 100644 kernel/loongarch64/KERNEL.generic - create mode 100644 kernel/loongarch64/Makefile - create mode 100644 kernel/loongarch64/amax.S - create mode 100644 kernel/loongarch64/amin.S - create mode 100644 kernel/loongarch64/asum.S - create mode 100644 kernel/loongarch64/cnrm2.S - create mode 100644 kernel/loongarch64/copy.S - create mode 100644 kernel/loongarch64/dgemm_kernel_16x4.S - create mode 100644 kernel/loongarch64/dgemm_ncopy_16.S - create mode 100644 
kernel/loongarch64/dgemm_ncopy_4.S - create mode 100644 kernel/loongarch64/dgemm_tcopy_16.S - create mode 100644 kernel/loongarch64/dgemm_tcopy_4.S - create mode 100644 kernel/loongarch64/dnrm2.S - create mode 100644 kernel/loongarch64/dot.S - create mode 100644 kernel/loongarch64/gemm_kernel.S - create mode 100644 kernel/loongarch64/gemv_n.S - create mode 100644 kernel/loongarch64/gemv_t.S - create mode 100644 kernel/loongarch64/iamax.S - create mode 100644 kernel/loongarch64/iamin.S - create mode 100644 kernel/loongarch64/izamax.S - create mode 100644 kernel/loongarch64/izamin.S - create mode 100644 kernel/loongarch64/max.S - create mode 100644 kernel/loongarch64/min.S - create mode 100644 kernel/loongarch64/scal.S - create mode 100644 kernel/loongarch64/snrm2.S - create mode 100644 kernel/loongarch64/swap.S - create mode 100644 kernel/loongarch64/trsm_kernel_LN.S - create mode 100644 kernel/loongarch64/trsm_kernel_LT.S - create mode 100644 kernel/loongarch64/trsm_kernel_RT.S - create mode 100644 kernel/loongarch64/zamax.S - create mode 100644 kernel/loongarch64/zamin.S - create mode 100644 kernel/loongarch64/zasum.S - create mode 100644 kernel/loongarch64/zcopy.S - create mode 100644 kernel/loongarch64/zdot.S - create mode 100644 kernel/loongarch64/zgemm3m_kernel.S - create mode 100644 kernel/loongarch64/zgemm_kernel.S - create mode 100644 kernel/loongarch64/zgemv_n.S - create mode 100644 kernel/loongarch64/zgemv_t.S - create mode 100644 kernel/loongarch64/znrm2.S - create mode 100644 kernel/loongarch64/zscal.S - create mode 100644 kernel/loongarch64/ztrsm_kernel_LT.S - create mode 100644 kernel/loongarch64/ztrsm_kernel_RT.S - create mode 100644 lapack/laswp/loongarch64/Makefile - -diff --git a/Makefile.loongarch64 b/Makefile.loongarch64 -new file mode 100644 -index 0000000..05ea9c6 ---- /dev/null -+++ b/Makefile.loongarch64 -@@ -0,0 +1,3 @@ -+ifdef BINARY64 -+else -+endif -diff --git a/Makefile.system b/Makefile.system -index 80739dc..5aca7c0 100644 ---- 
a/Makefile.system -+++ b/Makefile.system -@@ -636,6 +636,10 @@ ifeq ($(ARCH), mips64) - DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 - endif - -+ifeq ($(ARCH), loongarch64) -+DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC -+endif -+ - ifeq ($(ARCH), zarch) - DYNAMIC_CORE = ZARCH_GENERIC - -@@ -772,6 +776,11 @@ NO_BINARY_MODE = 1 - BINARY_DEFINED = 1 - endif - -+ifeq ($(ARCH), loongarch64) -+NO_BINARY_MODE = 1 -+BINARY_DEFINED = 1 -+endif -+ - - # - # C Compiler dependent settings -@@ -842,6 +851,11 @@ ifeq ($(OSNAME), AIX) - BINARY_DEFINED = 1 - endif - -+ifeq ($(ARCH), loongarch64) -+CCOMMON_OPT += -march=loongarch64 -mabi=lp64 -+FCOMMON_OPT += -march=loongarch64 -mabi=lp64 -+endif -+ - endif - - ifndef BINARY_DEFINED -diff --git a/TargetList.txt b/TargetList.txt -index d199649..cd3e756 100644 ---- a/TargetList.txt -+++ b/TargetList.txt -@@ -109,3 +109,7 @@ Z14 - RISCV64_GENERIC - C910V - -+11.LOONGARCH64: -+LOONGSONGENERIC -+LOONGSON3R5 -+LOONGSON2K1000 -diff --git a/c_check b/c_check -index e24943a..030f5e6 100644 ---- a/c_check -+++ b/c_check -@@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/); - $os = Android if ($data =~ /OS_ANDROID/); - $os = Haiku if ($data =~ /OS_HAIKU/); - --$architecture = x86 if ($data =~ /ARCH_X86/); --$architecture = x86_64 if ($data =~ /ARCH_X86_64/); --$architecture = power if ($data =~ /ARCH_POWER/); --$architecture = mips if ($data =~ /ARCH_MIPS/); --$architecture = mips64 if ($data =~ /ARCH_MIPS64/); --$architecture = alpha if ($data =~ /ARCH_ALPHA/); --$architecture = sparc if ($data =~ /ARCH_SPARC/); --$architecture = ia64 if ($data =~ /ARCH_IA64/); --$architecture = arm if ($data =~ /ARCH_ARM/); --$architecture = arm64 if ($data =~ /ARCH_ARM64/); --$architecture = zarch if ($data =~ /ARCH_ZARCH/); --$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); -+$architecture = x86 if ($data =~ /ARCH_X86/); -+$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -+$architecture = power if ($data =~ /ARCH_POWER/); 
-+$architecture = mips if ($data =~ /ARCH_MIPS/); -+$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -+$architecture = alpha if ($data =~ /ARCH_ALPHA/); -+$architecture = sparc if ($data =~ /ARCH_SPARC/); -+$architecture = ia64 if ($data =~ /ARCH_IA64/); -+$architecture = arm if ($data =~ /ARCH_ARM/); -+$architecture = arm64 if ($data =~ /ARCH_ARM64/); -+$architecture = zarch if ($data =~ /ARCH_ZARCH/); -+$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); -+$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); - - $defined = 0; - -@@ -143,6 +144,11 @@ if ($architecture eq "riscv64") { - $binary = 64; - } - -+if ($architecture eq "loongarch64") { -+ $defined = 1; -+ $binary = 64; -+} -+ - if ($compiler eq "PGI") { - $compiler_name .= " -tp p7" if ($binary eq "32"); - $compiler_name .= " -tp p7-64" if ($binary eq "64"); -@@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { - } - } - --$architecture = x86 if ($data =~ /ARCH_X86/); --$architecture = x86_64 if ($data =~ /ARCH_X86_64/); --$architecture = power if ($data =~ /ARCH_POWER/); --$architecture = mips if ($data =~ /ARCH_MIPS/); --$architecture = mips64 if ($data =~ /ARCH_MIPS64/); --$architecture = alpha if ($data =~ /ARCH_ALPHA/); --$architecture = sparc if ($data =~ /ARCH_SPARC/); --$architecture = ia64 if ($data =~ /ARCH_IA64/); --$architecture = arm if ($data =~ /ARCH_ARM/); --$architecture = arm64 if ($data =~ /ARCH_ARM64/); --$architecture = zarch if ($data =~ /ARCH_ZARCH/); -+$architecture = x86 if ($data =~ /ARCH_X86/); -+$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -+$architecture = power if ($data =~ /ARCH_POWER/); -+$architecture = mips if ($data =~ /ARCH_MIPS/); -+$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -+$architecture = alpha if ($data =~ /ARCH_ALPHA/); -+$architecture = sparc if ($data =~ /ARCH_SPARC/); -+$architecture = ia64 if ($data =~ /ARCH_IA64/); -+$architecture = arm if ($data =~ /ARCH_ARM/); -+$architecture = arm64 if ($data 
=~ /ARCH_ARM64/); -+$architecture = zarch if ($data =~ /ARCH_ZARCH/); -+$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); - - $binformat = bin32; - $binformat = bin64 if ($data =~ /BINARY_64/); -diff --git a/cmake/arch.cmake b/cmake/arch.cmake -index 4451f9e..b1d18cc 100644 ---- a/cmake/arch.cmake -+++ b/cmake/arch.cmake -@@ -113,6 +113,10 @@ if (MIPS64) - set(NO_BINARY_MODE 1) - endif () - -+if (LOONGARCH64) -+ set(NO_BINARY_MODE 1) -+endif () -+ - if (${ARCH} STREQUAL "alpha") - set(NO_BINARY_MODE 1) - set(BINARY_DEFINED 1) -diff --git a/cmake/cc.cmake b/cmake/cc.cmake -index 7695215..d0b195c 100644 ---- a/cmake/cc.cmake -+++ b/cmake/cc.cmake -@@ -29,6 +29,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS - set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") - endif () - -+ if (LOONGARCH64) -+ if (BINARY64) -+ set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") -+ else () -+ set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") -+ endif () -+ set(BINARY_DEFINED 1) -+ endif () -+ - if (CMAKE_SYSTEM_NAME STREQUAL "AIX") - set(BINARY_DEFINED 1) - endif () -diff --git a/cmake/fc.cmake b/cmake/fc.cmake -index fc1f9bb..6316645 100644 ---- a/cmake/fc.cmake -+++ b/cmake/fc.cmake -@@ -61,6 +61,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") - endif () - endif () -+ if (LOONGARCH64) -+ if (BINARY64) -+ set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") -+ else () -+ set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") -+ endif () -+ endif () - else () - if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -m64") -diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake -index fdc79c8..8d0558c 100644 ---- a/cmake/system_check.cmake -+++ b/cmake/system_check.cmake -@@ -38,6 +38,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") - set(PPC 1) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") - set(MIPS64 1) -+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") -+ set(LOONGARCH64 1) - 
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") - if (NOT BINARY) - if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") -@@ -95,7 +97,7 @@ else() - endif () - - if (NOT BINARY) -- if (X86_64 OR ARM64 OR PPC OR MIPS64) -+ if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) - set(BINARY 64) - else () - set(BINARY 32) -diff --git a/common.h b/common.h -index ac79593..ff5254a 100644 ---- a/common.h -+++ b/common.h -@@ -449,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 - #include "common_mips.h" - #endif - -- -+ - #ifdef ARCH_RISCV64 - #include "common_riscv64.h" - #endif -@@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 - #include "common_zarch.h" - #endif - -+#ifdef ARCH_LOONGARCH64 -+#include "common_loongarch64.h" -+#endif -+ - #ifndef ASSEMBLER - #ifdef OS_WINDOWSSTORE - typedef char env_var_t[MAX_PATH]; -diff --git a/common_loongarch64.h b/common_loongarch64.h -new file mode 100644 -index 0000000..e15539b ---- /dev/null -+++ b/common_loongarch64.h -@@ -0,0 +1,199 @@ -+/***************************************************************************** -+Copyright (c) 2011-2020, The OpenBLAS Project -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+ -+ 1. Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ -+ 2. Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in -+ the documentation and/or other materials provided with the -+ distribution. -+ 3. Neither the name of the OpenBLAS project nor the names of -+ its contributors may be used to endorse or promote products -+ derived from this software without specific prior written -+ permission. 
-+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+**********************************************************************************/ -+ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#ifndef COMMON_LOONGARCH64 -+#define COMMON_LOONGARCH64 -+ -+#define MB __sync_synchronize() -+#define WMB __sync_synchronize() -+#define RMB __sync_synchronize() -+ -+#define INLINE inline -+ -+#ifndef ASSEMBLER -+ -+static inline int blas_quickdivide(blasint x, blasint y){ -+ return x / y; -+} -+ -+#ifdef DOUBLE -+#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") -+#else -+#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") -+#endif -+ -+#define GET_IMAGE_CANCEL -+ -+#else -+ -+#ifdef DOUBLE -+#define LD fld.d -+#define ST fst.d -+#define MADD fmadd.d -+#define NMADD fnmadd.d -+#define MSUB fmsub.d -+#define NMSUB fnmsub.d -+#define ADD fadd.d -+#define SUB fsub.d -+#define MUL fmul.d -+#define MOV fmov.d -+#define CMOVT fsel -+#define MTC movgr2fr.d -+#define FABS fabs.d -+#define CMPEQ fcmp.ceq.d -+#define CMPLE fcmp.cle.d -+#define CMPLT fcmp.clt.d -+#define NEG fneg.d -+#else -+#define LD fld.s -+#define ST fst.s -+#define MADD fmadd.s -+#define NMADD 
fnmadd.s -+#define MSUB fmsub.s -+#define NMSUB fnmsub.s -+#define ADD fadd.s -+#define SUB fsub.s -+#define MUL fmul.s -+#define MOV fmov.s -+#define CMOVT fsel -+#define MTC movgr2fr.w -+#define FABS fabs.s -+#define CMPEQ fcmp.ceq.s -+#define CMPLE fcmp.cle.s -+#define CMPLT fcmp.clt.s -+#define NEG fneg.s -+#endif /* defined(DOUBLE) */ -+ -+#if defined(__64BIT__) && defined(USE64BITINT) -+#define LDINT ld.d -+#define LDARG ld.d -+#define SDARG st.d -+#elif defined(__64BIT__) && !defined(USE64BITINT) -+#define LDINT ld.w -+#define LDARG ld.d -+#define SDARG st.d -+#else -+#define LDINT ld.w -+#define LDARG ld.w -+#define SDARG st.w -+#endif -+ -+ -+#ifndef F_INTERFACE -+#define REALNAME ASMNAME -+#else -+#define REALNAME ASMFNAME -+#endif /* defined(F_INTERFACE) */ -+ -+#if defined(ASSEMBLER) && !defined(NEEDPARAM) -+ -+#define PROLOGUE \ -+ .text ;\ -+ .align 5 ;\ -+ .globl REALNAME ;\ -+ .type REALNAME, @function ;\ -+REALNAME: ;\ -+ -+#if defined(__linux__) && defined(__ELF__) -+#define GNUSTACK .section .note.GNU-stack,"",@progbits -+#else -+#define GNUSTACK -+#endif /* defined(__linux__) && defined(__ELF__) */ -+ -+#define EPILOGUE \ -+ .end REALNAME ;\ -+ GNUSTACK -+ -+#define PROFCODE -+ -+#define MOVT(dst, src, cc) \ -+ bceqz cc, 1f; \ -+ add.d dst, src, $r0; \ -+ 1: -+ -+#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ -+ -+#endif /* defined(ASSEMBLER) */ -+ -+#define SEEK_ADDRESS -+ -+#define BUFFER_SIZE ( 32 << 20) -+ -+#define PAGESIZE (16UL << 10) -+#define FIXED_PAGESIZE (16UL << 10) -+#define HUGE_PAGESIZE ( 2 << 20) -+ -+#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) -+ -+#ifndef MAP_ANONYMOUS -+#define MAP_ANONYMOUS MAP_ANON -+#endif -+ -+#endif -diff --git a/common_macro.h b/common_macro.h -index c6ea1bf..0136f18 100644 ---- a/common_macro.h -+++ b/common_macro.h -@@ -2490,7 +2490,8 @@ - #endif - - #ifndef ASSEMBLER --#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || 
defined(ARCH_ARM64) -+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ -+|| defined(ARCH_LOONGARCH64) - extern BLASLONG gemm_offset_a; - extern BLASLONG gemm_offset_b; - extern BLASLONG sbgemm_p; -diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c -new file mode 100644 -index 0000000..ca07c7f ---- /dev/null -+++ b/cpuid_loongarch64.c -@@ -0,0 +1,143 @@ -+/***************************************************************************** -+Copyright (c) 2011-2020, The OpenBLAS Project -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+ -+ 1. Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ -+ 2. Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in -+ the documentation and/or other materials provided with the -+ distribution. -+ 3. Neither the name of the OpenBLAS project nor the names of -+ its contributors may be used to endorse or promote products -+ derived from this software without specific prior written -+ permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+**********************************************************************************/ -+ -+#include -+ -+/* If LASX extension instructions supported, -+ * using core LOONGSON3R5 -+ * If only LSX extension instructions supported, -+ * using core LOONGSON2K1000 -+ * If neither LASX nor LSX extension instructions supported, -+ * using core LOONGSONGENERIC (As far as I know, there is no such -+ * CPU yet) -+ */ -+ -+#define CPU_GENERIC 0 -+#define CPU_LOONGSON3R5 1 -+#define CPU_LOONGSON2K1000 2 -+ -+#define LOONGARCH_CFG2 0x02 -+#define LOONGARCH_LASX 1<<7 -+#define LOONGARCH_LSX 1<<6 -+ -+static char *cpuname[] = { -+ "LOONGSONGENERIC", -+ "LOONGSON3R5", -+ "LOONGSON2K1000" -+}; -+ -+static char *cpuname_lower[] = { -+ "loongsongeneric", -+ "loongson3r5", -+ "loongson2k1000" -+}; -+ -+int detect(void) { -+#ifdef __linux -+ uint32_t reg = 0; -+ -+ __asm__ volatile ( -+ "cpucfg %0, %1 \n\t" -+ : "+&r"(reg) -+ : "r"(LOONGARCH_CFG2) -+ ); -+ -+ if (reg & LOONGARCH_LASX) -+ return CPU_LOONGSON3R5; -+ else if (reg & LOONGARCH_LSX) -+ return CPU_LOONGSON2K1000; -+ else -+ return CPU_GENERIC; -+#endif -+ return CPU_GENERIC; -+} -+ -+char *get_corename(void) { -+ return cpuname[detect()]; -+} -+ -+void get_architecture(void) { -+ printf("LOONGARCH64"); -+} -+ -+void get_subarchitecture(void) { -+ int d = detect(); -+ printf("%s", cpuname[d]); -+} -+ -+void get_subdirname(void) { -+ printf("loongarch64"); -+} -+ -+void get_cpuconfig(void) { -+ int d = 
detect(); -+ switch (d) { -+ case CPU_LOONGSON3R5: -+ printf("#define LOONGSON3R5\n"); -+ printf("#define L1_DATA_SIZE 65536\n"); -+ printf("#define L1_DATA_LINESIZE 64\n"); -+ printf("#define L2_SIZE 1048576\n"); -+ printf("#define L2_LINESIZE 64\n"); -+ printf("#define DTB_DEFAULT_ENTRIES 64\n"); -+ printf("#define DTB_SIZE 4096\n"); -+ printf("#define L2_ASSOCIATIVE 16\n"); -+ break; -+ -+ case CPU_LOONGSON2K1000: -+ printf("#define LOONGSON2K1000\n"); -+ printf("#define L1_DATA_SIZE 65536\n"); -+ printf("#define L1_DATA_LINESIZE 64\n"); -+ printf("#define L2_SIZE 262144\n"); -+ printf("#define L2_LINESIZE 64\n"); -+ printf("#define DTB_DEFAULT_ENTRIES 64\n"); -+ printf("#define DTB_SIZE 4096\n"); -+ printf("#define L2_ASSOCIATIVE 16\n"); -+ break; -+ -+ default: -+ printf("#define LOONGSONGENERIC\n"); -+ printf("#define L1_DATA_SIZE 65536\n"); -+ printf("#define L1_DATA_LINESIZE 64\n"); -+ printf("#define L2_SIZE 262144\n"); -+ printf("#define L2_LINESIZE 64\n"); -+ printf("#define DTB_DEFAULT_ENTRIES 64\n"); -+ printf("#define DTB_SIZE 4096\n"); -+ printf("#define L2_ASSOCIATIVE 16\n"); -+ break; -+ } -+} -+ -+void get_libname(void){ -+ int d = detect(); -+ printf("%s", cpuname_lower[d]); -+} -diff --git a/ctest.c b/ctest.c -index d674a8c..4f18918 100644 ---- a/ctest.c -+++ b/ctest.c -@@ -157,6 +157,10 @@ ARCH_ARM64 - ARCH_RISCV64 - #endif - -+#ifdef __loongarch64 -+ARCH_LOONGARCH64 -+#endif -+ - #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) - HAVE_C11 - #endif -diff --git a/driver/others/Makefile b/driver/others/Makefile -index 4a421ef..e4e9ee1 100644 ---- a/driver/others/Makefile -+++ b/driver/others/Makefile -@@ -27,11 +27,15 @@ else - ifeq ($(ARCH),mips64) - COMMONOBJS += dynamic_mips64.$(SUFFIX) - else -+ifeq ($(ARCH),loongarch64) -+COMMONOBJS += dynamic_loongarch64.$(SUFFIX) -+else - COMMONOBJS += dynamic.$(SUFFIX) - endif - endif - endif - endif -+endif - else - COMMONOBJS += parameter.$(SUFFIX) - endif -@@ -99,11 +103,15 @@ else - ifeq 
($(ARCH),mips64) - HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) - else -+ifeq ($(ARCH),loongarch64) -+HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX) -+else - HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) - endif - endif - endif - endif -+endif - else - HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) - endif -diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c -new file mode 100644 -index 0000000..52f8bcb ---- /dev/null -+++ b/driver/others/dynamic_loongarch64.c -@@ -0,0 +1,128 @@ -+/******************************************************************************* -+Copyright (c) 2022, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+ -+#include "common.h" -+ -+extern gotoblas_t gotoblas_LOONGSON3R5; -+extern gotoblas_t gotoblas_LOONGSON2K1000; -+extern gotoblas_t gotoblas_LOONGSONGENERIC; -+ -+extern void openblas_warning(int verbose, const char * msg); -+ -+#define NUM_CORETYPES 3 -+ -+static char *corename[] = { -+ "loongson3r5", -+ "loongson2k1000", -+ "loongsongeneric", -+ "unknown" -+}; -+ -+char *gotoblas_corename(void) { -+ if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0]; -+ if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1]; -+ if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2]; -+ return corename[NUM_CORETYPES]; -+} -+ -+static gotoblas_t *force_coretype(char *coretype) { -+ int i; -+ int found = -1; -+ char message[128]; -+ -+ for ( i=0 ; i < NUM_CORETYPES; i++) -+ { -+ if (!strncasecmp(coretype, corename[i], 20)) -+ { -+ found = i; -+ break; -+ } -+ } -+ -+ switch (found) -+ { -+ case 0: return (&gotoblas_LOONGSON3R5); -+ case 1: return (&gotoblas_LOONGSON2K1000); -+ case 2: return (&gotoblas_LOONGSONGENERIC); -+ } -+ snprintf(message, 128, "Core not found: %s\n", coretype); -+ openblas_warning(1, message); -+ return NULL; -+} -+ -+#define LASX_MASK 1<<7 -+#define LSX_MASK 1<<6 -+#define LOONGARCH_CFG2 0x02 -+ -+static gotoblas_t *get_coretype(void) { -+ int ret = 0; -+ __asm__ volatile ( -+ "cpucfg %0, %1 \n\t" -+ : "+&r"(ret) -+ : "r"(LOONGARCH_CFG2) 
-+ ); -+ -+ if (ret & LASX_MASK) -+ return &gotoblas_LOONGSON3R5; -+ else if (ret & LSX_MASK) -+ return &gotoblas_LOONGSON2K1000; -+ else -+ return &gotoblas_LOONGSONGENERIC; -+} -+ -+void gotoblas_dynamic_init(void) { -+ char coremsg[128]; -+ char coren[22]; -+ char *p; -+ -+ if (gotoblas) return; -+ -+ p = getenv("OPENBLAS_CORETYPE"); -+ if ( p ) -+ { -+ gotoblas = force_coretype(p); -+ } -+ else -+ { -+ gotoblas = get_coretype(); -+ } -+ -+ if (gotoblas && gotoblas->init) { -+ strncpy(coren, gotoblas_corename(), 20); -+ sprintf(coremsg, "Core: %s\n", coren); -+ openblas_warning(2, coremsg); -+ gotoblas -> init(); -+ } else { -+ openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); -+ exit(1); -+ } -+ -+} -+ -+void gotoblas_dynamic_quit(void) { -+ gotoblas = NULL; -+} -diff --git a/getarch.c b/getarch.c -index f48944f..5906458 100644 ---- a/getarch.c -+++ b/getarch.c -@@ -140,8 +140,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - /* #define FORCE_PPC440FP2 */ - /* #define FORCE_CELL */ - /* #define FORCE_SICORTEX */ --/* #define FORCE_LOONGSON3R3 */ --/* #define FORCE_LOONGSON3R4 */ -+/* #define FORCE_LOONGSON3R3 */ -+/* #define FORCE_LOONGSON3R4 */ -+/* #define FORCE_LOONGSON3R5 */ -+/* #define FORCE_LOONGSON2K1000 */ -+/* #define FORCE_LOONGSONGENERIC */ - /* #define FORCE_I6400 */ - /* #define FORCE_P6600 */ - /* #define FORCE_P5600 */ -@@ -842,6 +845,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #else - #endif - -+#ifdef FORCE_LOONGSON3R5 -+#define FORCE -+#define ARCHITECTURE "LOONGARCH" -+#define SUBARCHITECTURE "LOONGSON3R5" -+#define SUBDIRNAME "loongarch64" -+#define ARCHCONFIG "-DLOONGSON3R5 " \ -+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ -+ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ -+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " -+#define LIBNAME "loongson3r5" -+#define CORENAME "LOONGSON3R5" -+#else -+#endif -+ -+#ifdef FORCE_LOONGSON2K1000 -+#define FORCE -+#define ARCHITECTURE "LOONGARCH" -+#define SUBARCHITECTURE "LOONGSON2K1000" -+#define SUBDIRNAME "loongarch64" -+#define ARCHCONFIG "-DLOONGSON2K1000 " \ -+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ -+ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ -+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " -+#define LIBNAME "loongson2k1000" -+#define CORENAME "LOONGSON2K1000" -+#else -+#endif -+ -+#ifdef FORCE_LOONGSONGENERIC -+#define FORCE -+#define ARCHITECTURE "LOONGARCH" -+#define SUBARCHITECTURE "LOONGSONGENERIC" -+#define SUBDIRNAME "loongarch64" -+#define ARCHCONFIG "-DLOONGSONGENERIC " \ -+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ -+ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ -+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " -+#define LIBNAME "loongsongeneric" -+#define CORENAME "LOONGSONGENERIC" -+#else -+#endif -+ - #ifdef FORCE_I6400 - #define FORCE - #define ARCHITECTURE "MIPS" -@@ -1373,6 +1418,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #define OPENBLAS_SUPPORTED - #endif - -+#ifdef __loongarch64 -+#include "cpuid_loongarch64.c" -+#define OPENBLAS_SUPPORTED -+#endif -+ - #ifdef __riscv - #include "cpuid_riscv64.c" - #define OPENBLAS_SUPPORTED -@@ -1448,7 +1498,7 @@ int main(int argc, char *argv[]){ - #ifdef FORCE - printf("CORE=%s\n", CORENAME); - #else --#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) -+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) - printf("CORE=%s\n", get_corename()); - #endif - #endif -@@ -1596,7 +1646,7 @@ printf("ELF_VERSION=2\n"); - #ifdef FORCE - printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); - #else --#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) -+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) - printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); - #endif - #endif -diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL -new file mode 100644 -index 0000000..e5d145a ---- /dev/null -+++ b/kernel/loongarch64/KERNEL -@@ -0,0 +1,238 @@ -+ifndef SAXPYKERNEL -+SAXPYKERNEL = ../arm/axpy.c -+endif -+ -+ifndef DAXPYKERNEL -+DAXPYKERNEL = ../arm/axpy.c -+endif -+ -+ifndef CAXPYKERNEL -+CAXPYKERNEL = ../arm/zaxpy.c -+endif -+ -+ifndef ZAXPYKERNEL -+ZAXPYKERNEL = ../arm/zaxpy.c -+endif -+ -+ifndef SROTKERNEL -+SROTKERNEL = ../arm/rot.c -+endif -+ -+ifndef DROTKERNEL -+DROTKERNEL = ../arm/rot.c -+endif -+ -+ifndef CROTKERNEL -+CROTKERNEL = ../arm/zrot.c -+endif -+ -+ifndef ZROTKERNEL -+ZROTKERNEL = ../arm/zrot.c -+endif -+ -+ifndef CSWAPKERNEL -+CSWAPKERNEL = ../arm/zswap.c -+endif -+ -+ifndef ZSWAPKERNEL -+ZSWAPKERNEL = ../arm/zswap.c 
-+endif -+ -+ifndef SSUMKERNEL -+SSUMKERNEL = ../arm/sum.c -+endif -+ -+ifndef DSUMKERNEL -+DSUMKERNEL = ../arm/sum.c -+endif -+ -+ifndef CSUMKERNEL -+CSUMKERNEL = ../arm/zsum.c -+endif -+ -+ifndef ZSUMKERNEL -+ZSUMKERNEL = ../arm/zsum.c -+endif -+ -+ifndef ISMAXKERNEL -+ISMAXKERNEL = ../arm/imax.c -+endif -+ -+ifndef IDMAXKERNEL -+IDMAXKERNEL = ../arm/imax.c -+endif -+ -+ifndef ISMINKERNEL -+ISMINKERNEL = ../arm/imin.c -+endif -+ -+ifndef IDMINKERNEL -+IDMINKERNEL = ../arm/imin.c -+endif -+ -+ifndef SNRM2KERNEL -+SNRM2KERNEL = snrm2.S -+endif -+ -+ifndef DNRM2KERNEL -+DNRM2KERNEL = dnrm2.S -+endif -+ -+ifndef CNRM2KERNEL -+CNRM2KERNEL = cnrm2.S -+endif -+ -+ifndef ZNRM2KERNEL -+ZNRM2KERNEL = znrm2.S -+endif -+ -+ifndef SCABS_KERNEL -+SCABS_KERNEL = ../generic/cabs.c -+endif -+ -+ifndef DCABS_KERNEL -+DCABS_KERNEL = ../generic/cabs.c -+endif -+ -+ifndef QCABS_KERNEL -+QCABS_KERNEL = ../generic/cabs.c -+endif -+ -+ifndef LSAME_KERNEL -+LSAME_KERNEL = ../generic/lsame.c -+endif -+ -+ifndef SGEMMKERNEL -+SGEMMKERNEL = gemm_kernel.S -+SGEMMINCOPY = ../generic/gemm_ncopy_2.c -+SGEMMITCOPY = ../generic/gemm_tcopy_2.c -+SGEMMONCOPY = ../generic/gemm_ncopy_8.c -+SGEMMOTCOPY = ../generic/gemm_tcopy_8.c -+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -+endif -+ -+ifndef DGEMMKERNEL -+DGEMMKERNEL = gemm_kernel.S -+DGEMMINCOPY = ../generic/gemm_ncopy_2.c -+DGEMMITCOPY = ../generic/gemm_tcopy_2.c -+DGEMMONCOPY = ../generic/gemm_ncopy_8.c -+DGEMMOTCOPY = ../generic/gemm_tcopy_8.c -+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -+endif -+ -+ifndef CGEMMKERNEL -+CGEMMKERNEL = zgemm_kernel.S -+CGEMMINCOPY = ../generic/zgemm_ncopy_1.c -+CGEMMITCOPY = 
../generic/zgemm_tcopy_1.c -+CGEMMONCOPY = ../generic/zgemm_ncopy_4.c -+CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -+endif -+ -+ifndef ZGEMMKERNEL -+ZGEMMKERNEL = zgemm_kernel.S -+ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c -+ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c -+ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c -+ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -+endif -+ -+ifndef SGEMM_BETA -+SGEMM_BETA = ../generic/gemm_beta.c -+endif -+ifndef DGEMM_BETA -+DGEMM_BETA = ../generic/gemm_beta.c -+endif -+ifndef CGEMM_BETA -+CGEMM_BETA = ../generic/zgemm_beta.c -+endif -+ifndef ZGEMM_BETA -+ZGEMM_BETA = ../generic/zgemm_beta.c -+endif -+ -+ifndef STRSMKERNEL_LN -+STRSMKERNEL_LN = trsm_kernel_LN.S -+endif -+ -+ifndef STRSMKERNEL_LT -+STRSMKERNEL_LT = trsm_kernel_LT.S -+endif -+ -+ifndef STRSMKERNEL_RN -+STRSMKERNEL_RN = trsm_kernel_LT.S -+endif -+ -+ifndef STRSMKERNEL_RT -+STRSMKERNEL_RT = trsm_kernel_RT.S -+endif -+ -+ifndef DTRSMKERNEL_LN -+DTRSMKERNEL_LN = trsm_kernel_LN.S -+endif -+ -+ifndef DTRSMKERNEL_LT -+DTRSMKERNEL_LT = trsm_kernel_LT.S -+endif -+ -+ifndef DTRSMKERNEL_RN -+DTRSMKERNEL_RN = trsm_kernel_LT.S -+endif -+ -+ifndef DTRSMKERNEL_RT -+DTRSMKERNEL_RT = trsm_kernel_RT.S -+endif -+ -+ifndef CTRSMKERNEL_LN -+CTRSMKERNEL_LN = ztrsm_kernel_LT.S -+endif -+ -+ifndef CTRSMKERNEL_LT -+CTRSMKERNEL_LT = ztrsm_kernel_LT.S -+endif -+ -+ifndef CTRSMKERNEL_RN -+CTRSMKERNEL_RN = ztrsm_kernel_LT.S -+endif -+ -+ifndef CTRSMKERNEL_RT -+CTRSMKERNEL_RT = ztrsm_kernel_RT.S -+endif -+ -+ifndef ZTRSMKERNEL_LN -+ZTRSMKERNEL_LN = ztrsm_kernel_LT.S -+endif -+ -+ifndef ZTRSMKERNEL_LT 
-+ZTRSMKERNEL_LT = ztrsm_kernel_LT.S -+endif -+ -+ifndef ZTRSMKERNEL_RN -+ZTRSMKERNEL_RN = ztrsm_kernel_LT.S -+endif -+ -+ifndef ZTRSMKERNEL_RT -+ZTRSMKERNEL_RT = ztrsm_kernel_RT.S -+endif -+ -+ifndef CGEMM3MKERNEL -+CGEMM3MKERNEL = zgemm3m_kernel.S -+endif -+ -+ifndef ZGEMM3MKERNEL -+ZGEMM3MKERNEL = zgemm3m_kernel.S -+endif -+ -+DSDOTKERNEL = dot.S -diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 -new file mode 100644 -index 0000000..cda3590 ---- /dev/null -+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 -@@ -0,0 +1,14 @@ -+DGEMMKERNEL = dgemm_kernel_16x4.S -+DGEMMINCOPY = dgemm_ncopy_16.S -+DGEMMITCOPY = dgemm_tcopy_16.S -+DGEMMONCOPY = dgemm_ncopy_4.S -+DGEMMOTCOPY = dgemm_tcopy_4.S -+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -+ -+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic -new file mode 100644 -index 0000000..b772a6f ---- /dev/null -+++ b/kernel/loongarch64/KERNEL.generic -@@ -0,0 +1,167 @@ -+SGEMM_BETA = ../generic/gemm_beta.c -+DGEMM_BETA = ../generic/gemm_beta.c -+CGEMM_BETA = ../generic/zgemm_beta.c -+ZGEMM_BETA = ../generic/zgemm_beta.c -+ -+STRMMKERNEL = ../generic/trmmkernel_2x2.c -+DTRMMKERNEL = ../generic/trmmkernel_2x2.c -+CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -+ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -+ -+SGEMMKERNEL = ../generic/gemmkernel_2x2.c -+SGEMMONCOPY = ../generic/gemm_ncopy_2.c -+SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -+ -+DGEMMKERNEL = ../generic/gemmkernel_2x2.c -+DGEMMONCOPY = ../generic/gemm_ncopy_2.c 
-+DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -+ -+CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -+ -+ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -+ -+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -+ -+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -+ -+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -+ -+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -+ -+#Pure C for other kernels -+SAMAXKERNEL = ../arm/amax.c -+DAMAXKERNEL = ../arm/amax.c -+CAMAXKERNEL = ../arm/zamax.c -+ZAMAXKERNEL = ../arm/zamax.c -+ -+SAMINKERNEL = ../arm/amin.c -+DAMINKERNEL = ../arm/amin.c -+CAMINKERNEL = ../arm/zamin.c -+ZAMINKERNEL = ../arm/zamin.c -+ -+SMAXKERNEL = ../arm/max.c -+DMAXKERNEL = ../arm/max.c -+ -+SMINKERNEL = ../arm/min.c -+DMINKERNEL = ../arm/min.c -+ -+ISAMAXKERNEL = ../arm/iamax.c -+IDAMAXKERNEL = ../arm/iamax.c -+ICAMAXKERNEL = ../arm/izamax.c -+IZAMAXKERNEL = ../arm/izamax.c -+ -+ISAMINKERNEL = ../arm/iamin.c -+IDAMINKERNEL = ../arm/iamin.c -+ICAMINKERNEL = 
../arm/izamin.c -+IZAMINKERNEL = ../arm/izamin.c -+ -+ISMAXKERNEL = ../arm/imax.c -+IDMAXKERNEL = ../arm/imax.c -+ -+ISMINKERNEL = ../arm/imin.c -+IDMINKERNEL = ../arm/imin.c -+ -+SASUMKERNEL = ../arm/asum.c -+DASUMKERNEL = ../arm/asum.c -+CASUMKERNEL = ../arm/zasum.c -+ZASUMKERNEL = ../arm/zasum.c -+ -+SSUMKERNEL = ../arm/sum.c -+DSUMKERNEL = ../arm/sum.c -+CSUMKERNEL = ../arm/zsum.c -+ZSUMKERNEL = ../arm/zsum.c -+ -+ -+SAXPYKERNEL = ../arm/axpy.c -+DAXPYKERNEL = ../arm/axpy.c -+CAXPYKERNEL = ../arm/zaxpy.c -+ZAXPYKERNEL = ../arm/zaxpy.c -+ -+SCOPYKERNEL = ../arm/copy.c -+DCOPYKERNEL = ../arm/copy.c -+CCOPYKERNEL = ../arm/zcopy.c -+ZCOPYKERNEL = ../arm/zcopy.c -+ -+SDOTKERNEL = ../generic/dot.c -+DDOTKERNEL = ../arm/dot.c -+CDOTKERNEL = ../arm/zdot.c -+ZDOTKERNEL = ../arm/zdot.c -+ -+SNRM2KERNEL = ../arm/nrm2.c -+DNRM2KERNEL = ../arm/nrm2.c -+CNRM2KERNEL = ../arm/znrm2.c -+ZNRM2KERNEL = ../arm/znrm2.c -+ -+SROTKERNEL = ../arm/rot.c -+DROTKERNEL = ../arm/rot.c -+CROTKERNEL = ../arm/zrot.c -+ZROTKERNEL = ../arm/zrot.c -+ -+SSCALKERNEL = ../arm/scal.c -+DSCALKERNEL = ../arm/scal.c -+CSCALKERNEL = ../arm/zscal.c -+ZSCALKERNEL = ../arm/zscal.c -+ -+SSWAPKERNEL = ../arm/swap.c -+DSWAPKERNEL = ../arm/swap.c -+CSWAPKERNEL = ../arm/zswap.c -+ZSWAPKERNEL = ../arm/zswap.c -+ -+SGEMVNKERNEL = ../arm/gemv_n.c -+DGEMVNKERNEL = ../arm/gemv_n.c -+CGEMVNKERNEL = ../arm/zgemv_n.c -+ZGEMVNKERNEL = ../arm/zgemv_n.c -+ -+SGEMVTKERNEL = ../arm/gemv_t.c -+DGEMVTKERNEL = ../arm/gemv_t.c -+CGEMVTKERNEL = ../arm/zgemv_t.c -+ZGEMVTKERNEL = ../arm/zgemv_t.c -+ -+SSYMV_U_KERNEL = ../generic/symv_k.c -+SSYMV_L_KERNEL = ../generic/symv_k.c -+DSYMV_U_KERNEL = ../generic/symv_k.c -+DSYMV_L_KERNEL = ../generic/symv_k.c -+QSYMV_U_KERNEL = ../generic/symv_k.c -+QSYMV_L_KERNEL = ../generic/symv_k.c -+CSYMV_U_KERNEL = ../generic/zsymv_k.c -+CSYMV_L_KERNEL = ../generic/zsymv_k.c -+ZSYMV_U_KERNEL = ../generic/zsymv_k.c -+ZSYMV_L_KERNEL = ../generic/zsymv_k.c -+XSYMV_U_KERNEL = ../generic/zsymv_k.c 
-+XSYMV_L_KERNEL = ../generic/zsymv_k.c -+ -+ZHEMV_U_KERNEL = ../generic/zhemv_k.c -+ZHEMV_L_KERNEL = ../generic/zhemv_k.c -+ -+LSAME_KERNEL = ../generic/lsame.c -+SCABS_KERNEL = ../generic/cabs.c -+DCABS_KERNEL = ../generic/cabs.c -+QCABS_KERNEL = ../generic/cabs.c -+ -+#Dump kernel -+CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c -+ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c -diff --git a/kernel/loongarch64/Makefile b/kernel/loongarch64/Makefile -new file mode 100644 -index 0000000..520349b ---- /dev/null -+++ b/kernel/loongarch64/Makefile -@@ -0,0 +1 @@ -+clean :: -diff --git a/kernel/loongarch64/amax.S b/kernel/loongarch64/amax.S -new file mode 100644 -index 0000000..4b135c5 ---- /dev/null -+++ b/kernel/loongarch64/amax.S -@@ -0,0 +1,230 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+ -+#define I $r17 -+#define TEMP $r18 -+ -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+ -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+ -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ MTC s1, $r0 -+ bge $r0, N, .L999 -+ -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, INCX, .L999 -+ -+ LD a1, X, 0 * SIZE -+ addi.d N, N, -1 -+ -+ add.d X, X, INCX -+ FABS s1, a1 -+ -+ FABS s2, a1 -+ bge $r0, N, .L999 -+ -+ FABS s3, a1 -+ srai.d I, N, 3 -+ -+ FABS s4, a1 -+ bge $r0, I, .L15 -+ -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a8, X, 0 * SIZE -+ addi.d I, I, -1 -+ -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ FABS t1, a1 -+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ add.d X, X, INCX -+ -+ FABS t3, a3 -+ LD a2, X, 0 * SIZE -+ FABS t4, a4 -+ add.d X, X, INCX -+ -+ CMPLT $fcc0, s1, t1 -+ 
LD a3, X, 0 * SIZE -+ CMPLT $fcc1, s2, t2 -+ add.d X, X, INCX -+ -+ CMPLT $fcc2, s3, t3 -+ LD a4, X, 0 * SIZE -+ CMPLT $fcc3, s4, t4 -+ add.d X, X, INCX -+ -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ -+ FABS t1, a5 -+ LD a5, X, 0 * SIZE -+ FABS t2, a6 -+ add.d X, X, INCX -+ -+ FABS t3, a7 -+ LD a6, X, 0 * SIZE -+ FABS t4, a8 -+ add.d X, X, INCX -+ -+ CMPLT $fcc0, s1, t1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, s2, t2 -+ add.d X, X, INCX -+ -+ CMPLT $fcc2, s3, t3 -+ LD a8, X, 0 * SIZE -+ CMPLT $fcc3, s4, t4 -+ add.d X, X, INCX -+ -+ CMOVT s1, s1, t1, $fcc0 -+ addi.d I, I, -1 -+ -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ -+ CMOVT s4, s4, t4, $fcc3 -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t2 -+ CMPLT $fcc2, s3, t3 -+ CMPLT $fcc3, s4, t4 -+ -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ -+ FABS t1, a5 -+ FABS t2, a6 -+ FABS t3, a7 -+ FABS t4, a8 -+ -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t2 -+ CMPLT $fcc2, s3, t3 -+ CMPLT $fcc3, s4, t4 -+ -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ -+ FABS t1, a1 -+ -+ CMPLT $fcc0, s1, t1 -+ -+ CMOVT s1, s1, t1, $fcc0 -+ -+ add.d X, X, INCX -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L998: -+ CMPLT $fcc0, s1, s2 -+ CMPLT $fcc1, s3, s4 -+ -+ CMOVT s1, s1, s2, $fcc0 -+ CMOVT s3, s3, s4, $fcc1 -+ -+ CMPLT $fcc0, s1, s3 -+ CMOVT s1, s1, s3, $fcc0 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/amin.S b/kernel/loongarch64/amin.S -new file mode 100644 -index 0000000..ff9978f ---- /dev/null -+++ b/kernel/loongarch64/amin.S -@@ 
-0,0 +1,186 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+ -+ PROLOGUE -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ MTC s1, $r0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, INCX, .L999 -+ LD a1, X, 0 * SIZE -+ addi.d N, N, -1 -+ add.d X, X, INCX -+ FABS s1, a1 -+ FABS s2, a1 -+ bge $r0, N, .L999 -+ FABS s3, a1 -+ srai.d I, N, 3 -+ FABS s4, a1 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a8, X, 0 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+.L12: -+ FABS t1, a1 -+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ add.d X, X, INCX -+ FABS t3, a3 -+ LD a2, X, 0 * SIZE -+ FABS t4, a4 -+ add.d X, X, INCX -+ CMPLT $fcc0, t1, s1 -+ LD a3, X, 0 * SIZE -+ CMPLT $fcc1, t2, s2 -+ add.d X, X, INCX -+ CMPLT $fcc2, t3, s3 -+ LD a4, X, 0 * SIZE -+ CMPLT $fcc3, t4, s4 -+ add.d X, X, INCX -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ FABS t1, a5 -+ LD a5, X, 0 * SIZE -+ FABS t2, a6 -+ add.d X, X, INCX -+ FABS t3, a7 -+ LD a6, X, 0 * SIZE -+ FABS t4, a8 -+ add.d X, X, INCX -+ CMPLT $fcc0, t1, s1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, t2, s2 -+ add.d X, X, INCX -+ CMPLT $fcc2, t3, s3 -+ LD a8, X, 0 * SIZE -+ CMPLT $fcc3, t4, s4 -+ add.d X, X, INCX -+ 
CMOVT s1, s1, t1, $fcc0 -+ addi.d I, I, -1 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ blt $r0, I, .L12 -+ .align 3 -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ CMPLT $fcc0, t1, s1 -+ CMPLT $fcc1, t2, s2 -+ CMPLT $fcc2, t3, s3 -+ CMPLT $fcc3, t4, s4 -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ FABS t1, a5 -+ FABS t2, a6 -+ FABS t3, a7 -+ FABS t4, a8 -+ CMPLT $fcc0, t1, s1 -+ CMPLT $fcc1, t2, s2 -+ CMPLT $fcc2, t3, s3 -+ CMPLT $fcc3, t4, s4 -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ .align 3 -+.L15: -+ andi I, N, 7 -+NOP -+ bge $r0, I, .L998 -+ .align 3 -+.L16: -+ LD a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ FABS t1, a1 -+ CMPLT $fcc0, t1, s1 -+ CMOVT s1, s1, t1, $fcc0 -+ add.d X, X, INCX -+ blt $r0, I, .L16 -+ .align 3 -+.L998: -+ CMPLT $fcc0, s2, s1 -+ CMPLT $fcc1, s4, s3 -+ CMOVT s1, s1, s2, $fcc0 -+ CMOVT s3, s3, s4, $fcc1 -+ CMPLT $fcc0, s3, s1 -+ CMOVT s1, s1, s3, $fcc0 -+ .align 3 -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S -new file mode 100644 -index 0000000..7d21ce0 ---- /dev/null -+++ b/kernel/loongarch64/asum.S -@@ -0,0 +1,232 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. 
-+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f23 -+#define a2 $f9 -+#define a3 $f10 -+#define a4 $f11 -+#define a5 $f12 -+#define a6 $f13 -+#define a7 $f14 -+#define a8 $f15 -+#define t1 $f16 -+#define t2 $f17 -+#define t3 $f0 -+#define t4 $f1 -+#define s1 $f22 -+#define s2 $f8 -+ PROLOGUE -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ MTC s1, $r0 -+ MTC s2, $r0 -+ slli.d INCX, INCX, BASE_SHIFT -+ li.d TEMP, SIZE -+ bge $r0, N, .L999 -+ srai.d I, N, 3 -+ bne INCX, TEMP, .L20 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ LD a3, X, 2 * SIZE -+ LD a4, X, 3 * SIZE -+ LD a5, X, 4 * SIZE -+ FABS t1, a1 -+ LD a6, X, 5 * SIZE -+ FABS t2, a2 -+ LD a7, X, 6 * SIZE -+ FABS t3, a3 -+ FABS t4, a4 -+ addi.d I, I, -1 -+ LD a8, X, 7 * SIZE -+ bge $r0, I, .L13 -+ .align 3 -+.L12: -+ ADD s1, s1, t1 -+ 
LD a1, X, 8 * SIZE -+ FABS t1, a5 -+ addi.d I, I, -1 -+ ADD s2, s2, t2 -+ LD a2, X, 9 * SIZE -+ FABS t2, a6 -+ NOP -+ ADD s1, s1, t3 -+ LD a3, X, 10 * SIZE -+ FABS t3, a7 -+ NOP -+ ADD s2, s2, t4 -+ LD a4, X, 11 * SIZE -+ FABS t4, a8 -+ addi.d X, X, 8 * SIZE -+ ADD s1, s1, t1 -+ LD a5, X, 4 * SIZE -+ FABS t1, a1 -+ NOP -+ ADD s2, s2, t2 -+ LD a6, X, 5 * SIZE -+ FABS t2, a2 -+ NOP -+ ADD s1, s1, t3 -+ LD a7, X, 6 * SIZE -+ FABS t3, a3 -+ NOP -+ ADD s2, s2, t4 -+ LD a8, X, 7 * SIZE -+ FABS t4, a4 -+ blt $r0, I, .L12 -+ .align 3 -+.L13: -+ ADD s1, s1, t1 -+ addi.d X, X, 8 * SIZE -+ FABS t1, a5 -+ NOP -+ ADD s2, s2, t2 -+ FABS t2, a6 -+ ADD s1, s1, t3 -+ FABS t3, a7 -+ ADD s2, s2, t4 -+ FABS t4, a8 -+ ADD s1, s1, t1 -+ ADD s2, s2, t2 -+ ADD s1, s1, t3 -+ ADD s2, s2, t4 -+ .align 3 -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+.L16: -+ LD a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ FABS t1, a1 -+ ADD s1, s1, t1 -+ addi.d X, X, SIZE -+ blt $r0, I, .L16 -+ b .L999 -+ .align 3 -+.L20: -+ bge $r0, I, .L25 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ FABS t1, a1 -+ LD a7, X, 0 * SIZE -+ FABS t2, a2 -+ add.d X, X, INCX -+ FABS t3, a3 -+ LD a8, X, 0 * SIZE -+ FABS t4, a4 -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L24 -+ .align 3 -+.L23: -+ ADD s1, s1, t1 -+ LD a1, X, 0 * SIZE -+ FABS t1, a5 -+ add.d X, X, INCX -+ ADD s2, s2, t2 -+ LD a2, X, 0 * SIZE -+ FABS t2, a6 -+ add.d X, X, INCX -+ ADD s1, s1, t3 -+ LD a3, X, 0 * SIZE -+ FABS t3, a7 -+ add.d X, X, INCX -+ ADD s2, s2, t4 -+ LD a4, X, 0 * SIZE -+ FABS t4, a8 -+ add.d X, X, INCX -+ ADD s1, s1, t1 -+ LD a5, X, 0 * SIZE -+ FABS t1, a1 -+ add.d X, X, INCX -+ ADD s2, s2, t2 -+ LD a6, X, 0 * SIZE -+ FABS t2, a2 -+ add.d X, X, INCX -+ ADD s1, s1, t3 -+ LD a7, X, 0 * SIZE -+ FABS t3, a3 -+ add.d X, X, INCX 
-+ ADD s2, s2, t4 -+ LD a8, X, 0 * SIZE -+ FABS t4, a4 -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ blt $r0, I, .L23 -+ .align 3 -+.L24: -+ ADD s1, s1, t1 -+ FABS t1, a5 -+ ADD s2, s2, t2 -+ FABS t2, a6 -+ ADD s1, s1, t3 -+ FABS t3, a7 -+ ADD s2, s2, t4 -+ FABS t4, a8 -+ ADD s1, s1, t1 -+ ADD s2, s2, t2 -+ ADD s1, s1, t3 -+ ADD s2, s2, t4 -+ .align 3 -+.L25: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+.L26: -+ LD a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ FABS t1, a1 -+ add.d X, X, INCX -+ ADD s1, s1, t1 -+ blt $r0, I, .L26 -+ .align 3 -+.L999: -+ ADD s1, s1, s2 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S -new file mode 100644 -index 0000000..9d27987 ---- /dev/null -+++ b/kernel/loongarch64/cnrm2.S -@@ -0,0 +1,159 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f12 -+#define a2 $f13 -+#define a3 $f14 -+#define a4 $f15 -+#define a5 $f16 -+#define a6 $f17 -+#define a7 $f0 -+#define a8 $f1 -+#define s1 $f22 -+#define s2 $f8 -+#define t1 $f23 -+#define t2 $f9 -+#define t3 $f10 -+#define t4 $f11 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ movgr2fr.d s1, $r0 -+ li.d TEMP, 2 * SIZE -+ fmov.d s2, s1 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, ZBASE_SHIFT -+ bge $r0, INCX, .L999 -+ srai.d I, N, 2 -+ bge $r0, I, .L25 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ fcvt.d.s t1, a1 -+ LD a7, X, 0 * SIZE -+ fcvt.d.s t2, a2 -+ LD a8, X, 1 * SIZE -+ fcvt.d.s t3, a3 -+ addi.d I, I, -1 -+ fcvt.d.s t4, a4 -+ add.d X, X, INCX -+ bge $r0, I, .L24 -+ .align 3 -+ -+.L23: -+ fmadd.d s1, t1, t1, s1 -+ LD a1, X, 0 * SIZE -+ fcvt.d.s t1, a5 -+ fmadd.d s2, t2, t2, s2 -+ LD a2, X, 1 * SIZE -+ fcvt.d.s t2, a6 -+ add.d X, X, INCX -+ fmadd.d s1, t3, t3, s1 -+ LD a3, X, 0 * SIZE -+ fcvt.d.s t3, a7 -+ fmadd.d s2, t4, t4, s2 -+ LD a4, X, 1 * SIZE -+ fcvt.d.s t4, a8 -+ add.d X, X, INCX -+ fmadd.d s1, t1, t1, s1 -+ LD 
a5, X, 0 * SIZE -+ fcvt.d.s t1, a1 -+ addi.d I, I, -1 -+ fmadd.d s2, t2, t2, s2 -+ LD a6, X, 1 * SIZE -+ fcvt.d.s t2, a2 -+ add.d X, X, INCX -+ fmadd.d s1, t3, t3, s1 -+ LD a7, X, 0 * SIZE -+ fcvt.d.s t3, a3 -+ LD a8, X, 1 * SIZE -+ fmadd.d s2, t4, t4, s2 -+ add.d X, X, INCX -+ fcvt.d.s t4, a4 -+ blt $r0, I, .L23 -+ .align 3 -+ -+.L24: -+ fmadd.d s1, t1, t1, s1 -+ fcvt.d.s t1, a5 -+ fmadd.d s2, t2, t2, s2 -+ fcvt.d.s t2, a6 -+ fmadd.d s1, t3, t3, s1 -+ fcvt.d.s t3, a7 -+ fmadd.d s2, t4, t4, s2 -+ fcvt.d.s t4, a8 -+ fmadd.d s1, t1, t1, s1 -+ fmadd.d s2, t2, t2, s2 -+ fmadd.d s1, t3, t3, s1 -+ fmadd.d s2, t4, t4, s2 -+ .align 3 -+ -+.L25: -+ andi I, N, 3 -+ bge $r0, I, .L999 -+ .align 3 -+ -+.L26: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ addi.d I, I, -1 -+ fcvt.d.s t1, a1 -+ fcvt.d.s t2, a2 -+ fmadd.d s1, t1, t1, s1 -+ add.d X, X, INCX -+ fmadd.d s2, t2, t2, s2 -+ blt $r0, I, .L26 -+ .align 3 -+ -+.L999: -+ fadd.d s1, s1, s2 -+ fsqrt.d s1, s1 -+ move $r4, $r17 -+ fcvt.s.d $f0, s1 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S -new file mode 100644 -index 0000000..3156f60 ---- /dev/null -+++ b/kernel/loongarch64/copy.S -@@ -0,0 +1,225 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. 
Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define Y $r7 -+#define INCY $r8 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f23 -+#define a4 $f9 -+#define a5 $f10 -+#define a6 $f11 -+#define a7 $f12 -+#define a8 $f13 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+ LDINT INCY, 0(INCY) -+#endif -+ -+ li.d TEMP, SIZE -+ NOP -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, N, .L999 -+ slli.d INCY, INCY, BASE_SHIFT -+ bne INCX, TEMP, .L20 -+ srai.d I, N, 3 -+ bne INCY, TEMP, .L20 -+ addi.d I, I, -1 -+ blt I, $r0, .L15 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ LD a3, X, 2 * SIZE -+ LD a4, X, 3 * SIZE -+ LD a5, X, 4 * SIZE -+ LD a6, X, 5 * SIZE -+ LD a7, X, 6 * SIZE -+ LD a8, X, 7 * SIZE -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ ST a1, Y, 0 * SIZE -+ LD a1, X, 8 * SIZE -+ ST a2, Y, 1 * SIZE -+ LD a2, X, 9 * 
SIZE -+ ST a3, Y, 2 * SIZE -+ LD a3, X, 10 * SIZE -+ ST a4, Y, 3 * SIZE -+ LD a4, X, 11 * SIZE -+ ST a5, Y, 4 * SIZE -+ LD a5, X, 12 * SIZE -+ ST a6, Y, 5 * SIZE -+ LD a6, X, 13 * SIZE -+ ST a7, Y, 6 * SIZE -+ LD a7, X, 14 * SIZE -+ ST a8, Y, 7 * SIZE -+ LD a8, X, 15 * SIZE -+ addi.d I, I, -1 -+ addi.d X, X, 8 * SIZE -+ addi.d Y, Y, 8 * SIZE -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ ST a1, Y, 0 * SIZE -+ ST a2, Y, 1 * SIZE -+ ST a3, Y, 2 * SIZE -+ ST a4, Y, 3 * SIZE -+ ST a5, Y, 4 * SIZE -+ ST a6, Y, 5 * SIZE -+ ST a7, Y, 6 * SIZE -+ ST a8, Y, 7 * SIZE -+ addi.d X, X, 8 * SIZE -+ addi.d Y, Y, 8 * SIZE -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ addi.d X, X, SIZE -+ addi.d I, I, -1 -+ addi.d Y, Y, SIZE -+ ST a1, Y, -1 * SIZE -+ blt $r0, I, .L16 -+ b .L999 -+ .align 3 -+ -+.L20: -+ srai.d I, N, 3 -+ addi.d I, I, -1 -+ blt I, $r0, .L25 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a8, X, 0 * SIZE -+ add.d X, X, INCX -+ bge $r0, I, .L23 -+ .align 3 -+ -+.L22: -+ ST a1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a2, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a3, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a4, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a5, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a6, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a7, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a8, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a8, X, 0 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ blt $r0, I, 
.L22 -+ .align 3 -+ -+.L23: -+ ST a1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a2, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a3, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a4, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a5, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a6, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a7, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a8, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ .align 3 -+ -+.L25: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+ -+.L26: -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ addi.d I, I, -1 -+ ST a1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ blt $r0, I, .L26 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S -new file mode 100644 -index 0000000..13faa97 ---- /dev/null -+++ b/kernel/loongarch64/dgemm_kernel_16x4.S -@@ -0,0 +1,4250 @@ -+/******************************************************************************* -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+/* Function parameters */ -+#define M $r4 // param 1: bm -+#define N $r5 // param 2: bn -+#define K $r6 // param 3: bk -+#define ALPHA $f0 // param 4: alpha -+#define A $r7 // param 5: ba -+#define B $r8 // param 6: bb -+#define C $r9 // param 7: bc -+#define LDC $r10 // param 8: ldc -+ -+#ifdef TRMMKERNEL -+#define OFFSET $r11 // param 9: offset -+#endif -+#define OFF $r12 -+ -+/* Cycle control parameters */ -+#define I $r13 -+#define J $r14 -+#define L $r15 -+#define TL $r16 -+/* Matrix address */ -+#define A0 $r17 -+#define B0 $r18 -+#define C0 $r19 -+#define C1 $r20 -+#define C2 $r23 -+#define C3 $r24 -+#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! 
*/ -+#define T1 $r26 -+#define T2 $r27 -+#define ZERO $r0 -+ -+/* LASX vectors */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define D0 $xr7 -+#define D1 $xr8 -+#define D2 $xr9 -+#define D3 $xr10 -+#define D4 $xr11 -+#define D5 $xr12 -+#define D6 $xr13 -+#define D7 $xr14 -+#define D8 $xr15 -+#define D9 $xr16 -+#define D10 $xr17 -+#define D11 $xr18 -+#define D12 $xr19 -+#define D13 $xr20 -+#define D14 $xr21 -+#define D15 $xr22 -+#define VALPHA $xr23 -+ -+/* Prefetch interval */ -+#define A_PRE 0x200 -+#define B_PRE 0x100 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -56 -+ /* Store regs */ -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 -+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ ST $f23, $sp, 40 -+ ST ALPHA, $sp, 48 -+ -+ /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ -+ xvld VALPHA, $sp, 48 -+ xvreplve0.d VALPHA, VALPHA -+ -+#if defined (TRMMKERNEL) && !defined(LEFT) -+ sub.d OFF, ZERO, OFFSET -+#else -+ xor OFF, OFF, OFF -+#endif -+ -+ /* if (!(N >> 2)) goto L_N3 */ -+ srai.d J, N, 2 /* J = bn >> 2 */ -+ andi N, N, 0x03 -+ beq ZERO, J, .L_N3 -+ -+.L_J1: /* J-- && This loop include Condition 1 */ -+ -+/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
************************* -+* dgemm_core_16x4 */ -+ move C0, C -+ move A0, A -+ slli.d T0, LDC, 3 -+ add.d C1, C0, T0 -+ addi.d J, J, -1 /* J-- */ -+ add.d C2, C1, T0 -+ add.d C3, C2, T0 -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move OFF, OFFSET -+#endif -+ -+ /* if (!(M >> 4)) goto L_M8 */ -+ srai.d I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_M8 -+ -+.L_I1: /* I-- */ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x07 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x05 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 16 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ /* Calculate the first set of D0~D15, -+ * avoidig set 0 operation -+ * Load 16 * 64 from A0 -+ * U0 = {a3, a2, a1, a0} -+ * U1 = {a7, a6, a5, a4} -+ * U2 = {a11, a10, a9, a8} -+ * U3 = {a15, a14, a13, a12} -+ */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ preld 0, C0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ preld 0, C0, 0x40 -+ xvfmul.d D2, U2, U4 -+ xvfmul.d D3, U3, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ preld 0, C1, 0x00 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ xvfmul.d D5, U1, U4 -+ preld 0, C1, 0x40 -+ xvfmul.d D6, U2, U4 -+ xvfmul.d D7, U3, U4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ preld 0, C2, 0x00 -+ /* line 3 */ -+ xvfmul.d D8, U0, U4 -+ xvfmul.d D9, U1, U4 -+ preld 0, C2, 0x40 -+ xvfmul.d D10, U2, U4 -+ xvfmul.d D11, U3, U4 -+ -+ xvldrepl.d U4, B0, 0x18 -+ preld 0, C3, 0x00 -+ /* line 4 */ -+ xvfmul.d D12, U0, U4 -+ xvfmul.d D13, U1, U4 -+ preld 0, C3, 0x40 -+ xvfmul.d D14, U2, U4 -+ xvfmul.d D15, U3, U4 -+ -+ /* Add stride for A0 and B0 */ -+ 
addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_L7 */ -+ beq ZERO,TL, .L_L7 -+ -+ /* Calculate 8 sets of D0~D15 */ -+.L_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ /* Cumulative D0~D15 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ preld 0, B0, B_PRE -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ preld 0, A0, A_PRE -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D10, U2, U4, D10 -+ xvfmadd.d D11, U3, U4, D11 -+ preld 0, A0, A_PRE + 0x40 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D14, U2, U4, D14 -+ xvfmadd.d D15, U3, U4, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ /***8-2***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ /* Cumulative D0~D15 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ preld 0, B0, B_PRE -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ preld 0, A0, A_PRE -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D10, U2, U4, D10 -+ xvfmadd.d D11, U3, U4, D11 -+ preld 0, A0, A_PRE + 0x40 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D14, U2, U4, D14 -+ xvfmadd.d D15, U3, U4, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ /***8-3***/ -+ /* Load 16 * 64 from A0 
*/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ /* Cumulative D0~D15 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ preld 0, B0, B_PRE -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ preld 0, A0, A_PRE -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D10, U2, U4, D10 -+ xvfmadd.d D11, U3, U4, D11 -+ preld 0, A0, A_PRE + 0x40 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D14, U2, U4, D14 -+ xvfmadd.d D15, U3, U4, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ /***8-4***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ /* Cumulative D0~D15 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ preld 0, B0, B_PRE -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ preld 0, A0, A_PRE -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D10, U2, U4, D10 -+ xvfmadd.d D11, U3, U4, D11 -+ preld 0, A0, A_PRE + 0x40 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D14, U2, U4, D14 -+ xvfmadd.d D15, U3, U4, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ /***8-5***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ /* Cumulative D0~D15 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ preld 0, B0, B_PRE 
-+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ preld 0, A0, A_PRE -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D10, U2, U4, D10 -+ xvfmadd.d D11, U3, U4, D11 -+ preld 0, A0, A_PRE + 0x40 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D14, U2, U4, D14 -+ xvfmadd.d D15, U3, U4, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ /***8-6***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ /* Cumulative D0~D15 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ preld 0, B0, B_PRE -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ preld 0, A0, A_PRE -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D10, U2, U4, D10 -+ xvfmadd.d D11, U3, U4, D11 -+ preld 0, A0, A_PRE + 0x40 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D14, U2, U4, D14 -+ xvfmadd.d D15, U3, U4, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ /***8-7***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ /* Cumulative D0~D15 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ preld 0, B0, B_PRE -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ preld 0, A0, A_PRE -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D10, U2, U4, 
D10 -+ xvfmadd.d D11, U3, U4, D11 -+ preld 0, A0, A_PRE + 0x40 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D14, U2, U4, D14 -+ xvfmadd.d D15, U3, U4, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ /***8-8***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ /* Cumulative D0~D15 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ preld 0, B0, B_PRE -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ preld 0, A0, A_PRE -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D10, U2, U4, D10 -+ xvfmadd.d D11, U3, U4, D11 -+ preld 0, A0, A_PRE + 0x40 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D14, U2, U4, D14 -+ xvfmadd.d D15, U3, U4, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_TL1 -+ -+ /* Maybe we need calculate the last -+ * 7 sets of D0~D15? 
-+ */ -+.L_L7: -+ /* if (!(L & 7)) goto L_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_L0 -+ -+.L_L71: -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ /* Cumulative D0~D15 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D10, U2, U4, D10 -+ xvfmadd.d D11, U3, U4, D11 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D14, U2, U4, D14 -+ xvfmadd.d D15, U3, U4, D15 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_L71 -+ -+.L_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D1, D1, VALPHA -+ xvfmul.d D2, D2, VALPHA -+ xvfmul.d D3, D3, VALPHA -+ xvfmul.d D4, D4, VALPHA -+ xvfmul.d D5, D5, VALPHA -+ xvfmul.d D6, D6, VALPHA -+ xvfmul.d D7, D7, VALPHA -+ xvfmul.d D8, D8, VALPHA -+ xvfmul.d D9, D9, VALPHA -+ xvfmul.d D10, D10, VALPHA -+ xvfmul.d D11, D11, VALPHA -+ xvfmul.d D12, D12, VALPHA -+ xvfmul.d D13, D13, VALPHA -+ xvfmul.d D14, D14, VALPHA -+ xvfmul.d D15, D15, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ xvfmadd.d D1, D1, VALPHA, U1 -+ xvfmadd.d D2, D2, VALPHA, U2 -+ xvfmadd.d D3, D3, VALPHA, U3 -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvld U1, C1, 0x20 -+ xvld U2, C1, 0x40 -+ xvld U3, C1, 0x60 -+ xvfmadd.d D4, D4, VALPHA, U0 -+ xvfmadd.d D5, D5, VALPHA, U1 -+ xvfmadd.d D6, D6, VALPHA, U2 -+ xvfmadd.d D7, D7, VALPHA, U3 -+ -+ /* Load C2 */ -+ xvld U0, C2, 0x00 -+ xvld U1, C2, 0x20 -+ xvld 
U2, C2, 0x40 -+ xvld U3, C2, 0x60 -+ xvfmadd.d D8, D8, VALPHA, U0 -+ xvfmadd.d D9, D9, VALPHA, U1 -+ xvfmadd.d D10, D10, VALPHA, U2 -+ xvfmadd.d D11, D11, VALPHA, U3 -+ -+ /* Load C3 */ -+ xvld U0, C3, 0x00 -+ xvld U1, C3, 0x20 -+ xvld U2, C3, 0x40 -+ xvld U3, C3, 0x60 -+ xvfmadd.d D12, D12, VALPHA, U0 -+ xvfmadd.d D13, D13, VALPHA, U1 -+ xvfmadd.d D14, D14, VALPHA, U2 -+ xvfmadd.d D15, D15, VALPHA, U3 -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ xvst D1, C0, 0x20 -+ xvst D2, C0, 0x40 -+ xvst D3, C0, 0x60 -+ /* Store C1 */ -+ xvst D4, C1, 0x00 -+ xvst D5, C1, 0x20 -+ xvst D6, C1, 0x40 -+ xvst D7, C1, 0x60 -+ /* Store C2 */ -+ xvst D8, C2, 0x00 -+ xvst D9, C2, 0x20 -+ xvst D10, C2, 0x40 -+ xvst D11, C2, 0x60 -+ /* Store C3 */ -+ xvst D12, C3, 0x00 -+ xvst D13, C3, 0x20 -+ xvst D14, C3, 0x40 -+ xvst D15, C3, 0x60 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x80 -+ addi.d C1, C1, 0x80 -+ addi.d C2, C2, 0x80 -+ addi.d C3, C3, 0x80 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d L, L, -16 -+#else -+ /* number of values in B */ -+ addi.d L, L, -4 -+#endif -+ slli.d T0, L, 0x07 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x05 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x10 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+ addi.d I, I, -1 /* I-- */ -+ blt ZERO,I, .L_I1 -+ -+.L_M8: -+ /* We have done M & 16, considering M=8/4/2/1 */ -+ andi I, M, 15 -+ beq ZERO,I, .L_M0 -+ -+ andi I, M, 8 -+ beq ZERO,I, .L_M4 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x06 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x05 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ 
-+ addi.d L, OFF, 8 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Load 8 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ xvfmul.d D5, U1, U4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ /* line 3 */ -+ xvfmul.d D8, U0, U4 -+ xvfmul.d D9, U1, U4 -+ -+ xvldrepl.d U4, B0, 0x18 -+ /* line 4 */ -+ xvfmul.d D12, U0, U4 -+ xvfmul.d D13, U1, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M8_L7 */ -+ beq ZERO,TL, .L_M8_L7 -+ -+.L_M8_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, 
B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ -+ xvldrepl.d U4, 
B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_M8_TL1 -+ -+.L_M8_L7: -+ /* if (!(L & 7)) goto L_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M8_L0 -+ -+.L_M8_L71: -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D9, U1, U4, D9 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D13, U1, U4, D13 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_M8_L71 -+ -+.L_M8_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D1, D1, VALPHA -+ xvfmul.d D4, D4, VALPHA -+ xvfmul.d D5, D5, VALPHA -+ xvfmul.d D8, D8, VALPHA -+ xvfmul.d D9, D9, VALPHA -+ xvfmul.d D12, D12, VALPHA -+ xvfmul.d D13, D13, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ xvfmadd.d D1, D1, VALPHA, U1 -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvld U1, C1, 0x20 -+ xvfmadd.d D4, D4, VALPHA, U0 -+ xvfmadd.d D5, D5, VALPHA, U1 -+ -+ /* Load C2 */ -+ xvld U0, C2, 0x00 -+ xvld U1, C2, 0x20 -+ xvfmadd.d D8, D8, VALPHA, U0 -+ xvfmadd.d D9, D9, VALPHA, U1 -+ -+ /* Load C3 */ -+ xvld U0, C3, 0x00 -+ xvld U1, C3, 
0x20 -+ xvfmadd.d D12, D12, VALPHA, U0 -+ xvfmadd.d D13, D13, VALPHA, U1 -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ xvst D1, C0, 0x20 -+ /* Store C1 */ -+ xvst D4, C1, 0x00 -+ xvst D5, C1, 0x20 -+ /* Store C2 */ -+ xvst D8, C2, 0x00 -+ xvst D9, C2, 0x20 -+ /* Store C3 */ -+ xvst D12, C3, 0x00 -+ xvst D13, C3, 0x20 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x40 -+ addi.d C1, C1, 0x40 -+ addi.d C2, C2, 0x40 -+ addi.d C3, C3, 0x40 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d L, L, -8 -+#else -+ /* number of values in B */ -+ addi.d L, L, -4 -+#endif -+ slli.d T0, L, 0x06 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x05 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d OFF, OFF, 0x08 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N >> 2 ) && (M & 8)) End************/ -+ -+.L_M4: -+ andi I, M, 4 -+ beq ZERO,I, .L_M2 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x05 -+ add.d A0, A0, T0 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 4 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 4 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ /* line 3 */ -+ xvfmul.d D8, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x18 -+ /* line 4 */ -+ xvfmul.d D12, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 
0x20 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M4_L7 */ -+ beq ZERO,TL, .L_M4_L7 -+ -+.L_M4_TL1: /* TL-- */ -+ /***8-1***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ 
xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_M4_TL1 -+ -+.L_M4_L7: -+ /* if (!(L & 7)) goto L_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M4_L0 -+ -+.L_M4_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_M4_L71 -+ -+.L_M4_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D4, D4, VALPHA -+ xvfmul.d D8, D8, VALPHA -+ xvfmul.d D12, D12, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U0 -+ -+ /* Load C2 */ -+ xvld U0, C2, 0x00 -+ xvfmadd.d D8, D8, VALPHA, U0 -+ -+ /* Load C3 */ -+ xvld U0, C3, 0x00 -+ xvfmadd.d D12, D12, VALPHA, U0 -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ /* Store C1 */ -+ xvst D4, C1, 0x00 -+ /* Store C2 */ -+ xvst D8, C2, 0x00 -+ /* Store C3 */ -+ xvst D12, C3, 0x00 -+ -+ /* Add stride 
for C */ -+ addi.d C0, C0, 0x20 -+ addi.d C1, C1, 0x20 -+ addi.d C2, C2, 0x20 -+ addi.d C3, C3, 0x20 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d L, L, -4 -+#else -+ /* number of values in B */ -+ addi.d L, L, -4 -+#endif -+ slli.d T0, L, 0x05 -+ add.d A0, A0, T0 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d OFF, OFF, 0x04 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ -+ -+.L_M2: -+ andi I, M, 2 -+ beq ZERO,I, .L_M1 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x04 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x05 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 2 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 2 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ /* line 3 */ -+ xvfmul.d D8, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x18 -+ /* line 4 */ -+ xvfmul.d D12, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M2_L7 */ -+ beq ZERO,TL, .L_M2_L7 -+ -+.L_M2_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 2 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, 
B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 
-+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_M2_TL1 -+ -+.L_M2_L7: -+ /* if (!(L & 7)) goto L_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M2_L0 -+ -+.L_M2_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_M2_L71 -+ -+.L_M2_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D4, D4, VALPHA -+ xvfmul.d D8, D8, VALPHA -+ xvfmul.d D12, D12, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U0 -+ -+ /* Load C2 */ -+ xvld U0, C2, 0x00 -+ xvfmadd.d D8, D8, VALPHA, U0 -+ -+ /* Load C3 */ -+ xvld U0, C3, 0x00 -+ xvfmadd.d D12, D12, VALPHA, U0 -+#endif // #if defined(TRMMKERNEL) -+ -+ xvstelm.d D0, C0, 0x00, 0x00 -+ xvstelm.d D4, C1, 0x00, 0x00 -+ xvstelm.d D8, C2, 0x00, 0x00 -+ xvstelm.d D12, C3, 0x00, 0x00 -+ xvstelm.d D0, C0, 0x08, 0x01 -+ xvstelm.d D4, C1, 0x08, 0x01 -+ xvstelm.d D8, C2, 0x08, 0x01 -+ xvstelm.d D12, C3, 0x08, 0x01 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x10 -+ addi.d C1, C1, 0x10 -+ addi.d C2, C2, 0x10 -+ addi.d C3, C3, 0x10 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ 
/* number of values in A */ -+ addi.d L, L, -2 -+#else -+ /* number of values in B */ -+ addi.d L, L, -4 -+#endif -+ slli.d T0, L, 0x04 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x05 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d OFF, OFF, 0x02 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ -+ -+.L_M1: -+ andi I, M, 1 -+ beq ZERO,I, .L_M0 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x03 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x05 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 1 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 1 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ /* line 3 */ -+ xvfmul.d D8, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x18 -+ /* line 4 */ -+ xvfmul.d D12, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M1_L7 */ -+ beq ZERO,TL, .L_M1_L7 -+ -+.L_M1_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 1 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d 
D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, 
U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_M1_TL1 -+ -+.L_M1_L7: -+ /* if (!(L & 7)) goto L_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M1_L0 -+ -+.L_M1_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_M1_L71 -+ -+.L_M1_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D4, D4, VALPHA -+ xvfmul.d D8, D8, VALPHA -+ xvfmul.d D12, D12, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U0 -+ -+ /* Load C2 */ -+ xvld U0, C2, 0x00 -+ xvfmadd.d D8, D8, VALPHA, U0 -+ -+ /* Load C3 */ -+ xvld U0, C3, 0x00 -+ xvfmadd.d D12, D12, VALPHA, U0 -+#endif // #if defined(TRMMKERNEL) -+ -+ xvstelm.d D0, C0, 0x00, 0x00 -+ xvstelm.d D4, C1, 0x00, 0x00 -+ xvstelm.d D8, C2, 0x00, 0x00 -+ xvstelm.d D12, C3, 0x00, 0x00 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x08 -+ addi.d C1, C1, 0x08 -+ addi.d C2, C2, 0x08 -+ addi.d C3, C3, 0x08 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d L, L, -1 -+#else -+ /* number of values in B */ -+ addi.d L, L, -4 -+#endif -+ slli.d T0, L, 0x03 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x05 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d OFF, OFF, 0x01 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N >> 2 
) && (M & 1) ) End************/ -+ -+.L_M0: -+ /* Add stride for B and C -+ * B += (K * 32) -+ * C += (LDC * 32) -+ */ -+ /* since the array type is double, -+ * so we must mul 32 -+ */ -+ slli.d T0, K, 5 -+ slli.d T1, LDC, 5 -+ add.d B, B, T0 -+ add.d C, C, T1 -+ -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addi.d OFF, OFF, 0x04 -+#endif -+ -+ blt ZERO, J, .L_J1 -+ -+//////////////// go back to L_J1 ///////////////// -+///////////////////////////////////////////////// -+/************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ -+ -+.L_N3: -+ andi J, N, 2 -+ beq ZERO, J, .L_N1 -+ -+/************************* Condition 2 if((N & 2) && (M >> 4)) START !!! ************************* -+* dgemm_core_16x2 */ -+ -+ move C0, C -+ move A0, A -+ slli.d T0, LDC, 3 -+ add.d C1, C0, T0 -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move OFF, OFFSET -+#endif -+ -+ /* if (!(M >> 4)) goto L_N3_M8 */ -+ srai.d I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_N3_M8 -+ -+.L_N3_I1: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x07 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x04 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 16 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 16 * 64 from A0 -+ * U0 = {a3, a2, a1, a0} -+ * U1 = {a7, a6, a5, a4} -+ * U2 = {a11, a10, a9, a8} -+ * U3 = {a15, a14, a13, a12} -+ */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ xvfmul.d D2, U2, U4 -+ xvfmul.d D3, U3, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d 
D4, U0, U4 -+ xvfmul.d D5, U1, U4 -+ xvfmul.d D6, U2, U4 -+ xvfmul.d D7, U3, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_L7 */ -+ beq ZERO,TL, .L_N3_L7 -+ -+.L_N3_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ /***8-2***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ /***8-3***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ /***8-4***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ 
xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ /***8-5***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ /***8-6***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ /***8-7***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ /***8-8***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ 
-+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N3_TL1 -+ -+.L_N3_L7: -+ /* if (!(L & 7)) goto L_N3_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N3_L0 -+ -+.L_N3_L71: -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D6, U2, U4, D6 -+ xvfmadd.d D7, U3, U4, D7 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N3_L71 -+ -+.L_N3_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D1, D1, VALPHA -+ xvfmul.d D2, D2, VALPHA -+ xvfmul.d D3, D3, VALPHA -+ xvfmul.d D4, D4, VALPHA -+ xvfmul.d D5, D5, VALPHA -+ xvfmul.d D6, D6, VALPHA -+ xvfmul.d D7, D7, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ xvfmadd.d D1, D1, VALPHA, U1 -+ xvfmadd.d D2, D2, VALPHA, U2 -+ xvfmadd.d D3, D3, VALPHA, U3 -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvld U1, C1, 0x20 -+ xvld U2, C1, 0x40 -+ xvld U3, C1, 0x60 -+ xvfmadd.d D4, D4, VALPHA, U0 -+ xvfmadd.d D5, D5, VALPHA, U1 -+ xvfmadd.d D6, D6, VALPHA, U2 -+ xvfmadd.d D7, D7, VALPHA, U3 -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ xvst D1, C0, 0x20 -+ xvst D2, C0, 0x40 -+ xvst D3, C0, 0x60 -+ /* Store C1 */ -+ xvst D4, C1, 0x00 -+ xvst D5, C1, 0x20 -+ xvst D6, C1, 0x40 -+ xvst D7, C1, 0x60 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x80 -+ addi.d C1, C1, 0x80 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -16 -+#else -+ addi.d 
L, L, -2 -+#endif -+ slli.d T0, L, 0x07 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x04 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x10 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+ addi.d I, I, -1 /* I-- */ -+ blt ZERO,I, .L_N3_I1 -+ -+.L_N3_M8: -+ /* The M >> 4 blocks of 16 rows are done; handle the remaining M & 15 rows as 8/4/2/1 */ -+ andi I, M, 15 -+ beq ZERO,I, .L_N3_M0 -+ -+ andi I, M, 8 -+ beq ZERO,I, .L_N3_M4 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x06 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x04 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 8 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 8 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ xvfmul.d D5, U1, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_M8_L7 */ -+ beq ZERO,TL, .L_N3_M8_L7 -+ -+.L_N3_M8_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 8 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ 
xvfmadd.d D5, U1, U4, D5 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ /* Accumulate into D0, D1, D4, D5 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N3_M8_TL1 -+ -+.L_N3_M8_L7: -+ /* if (!(L & 7)) goto L_N3_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N3_M8_L0 -+ -+.L_N3_M8_L71: -+ xvld U0, A0, 0x00 -+ 
xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ xvfmadd.d D5, U1, U4, D5 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N3_M8_L71 -+ -+.L_N3_M8_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D1, D1, VALPHA -+ xvfmul.d D4, D4, VALPHA -+ xvfmul.d D5, D5, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ xvfmadd.d D1, D1, VALPHA, U1 -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvld U1, C1, 0x20 -+ xvfmadd.d D4, D4, VALPHA, U0 -+ xvfmadd.d D5, D5, VALPHA, U1 -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ xvst D1, C0, 0x20 -+ /* Store C1 */ -+ xvst D4, C1, 0x00 -+ xvst D5, C1, 0x20 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x40 -+ addi.d C1, C1, 0x40 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -8 -+#else -+ addi.d L, L, -2 -+#endif -+ slli.d T0, L, 0x06 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x04 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x08 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N & 2) && (M & 8) ) End************/ -+ -+.L_N3_M4: -+ andi I, M, 4 -+ beq ZERO,I, .L_N3_M2 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x05 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x04 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 4 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ 
move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 4 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_M4_L7 */ -+ beq ZERO,TL, .L_N3_M4_L7 -+ -+.L_N3_M4_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 4 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ -+ /* Accumulate into D0 and D4 */ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ /***8-8***/ -+ xvld 
U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N3_M4_TL1 -+ -+.L_N3_M4_L7: -+ /* if (!(L & 7)) goto L_N3_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N3_M4_L0 -+ -+.L_N3_M4_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N3_M4_L71 -+ -+.L_N3_M4_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D4, D4, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U0 -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ /* Store C1 */ -+ xvst D4, C1, 0x00 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x20 -+ addi.d C1, C1, 0x20 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -4 -+#else -+ addi.d L, L, -2 -+#endif -+ slli.d T0, L, 0x05 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x04 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x04 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N & 2 ) && (M & 4) ) End************/ -+ -+.L_N3_M2: -+ andi I, M, 2 -+ beq ZERO,I, .L_N3_M1 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x04 -+ add.d A0, A0, T0 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 2 -+#else -+ /* number 
of values in B */ -+ addi.d L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 2 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_M2_L7 */ -+ beq ZERO,TL, .L_N3_M2_L7 -+ -+.L_N3_M2_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 2 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d 
A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N3_M2_TL1 -+ -+.L_N3_M2_L7: -+ /* if (!(L & 7)) goto L_N3_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N3_M2_L0 -+ -+.L_N3_M2_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N3_M2_L71 -+ -+.L_N3_M2_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D4, D4, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U0 -+#endif // #if defined(TRMMKERNEL) -+ -+ xvstelm.d D0, C0, 0x00, 0x00 -+ xvstelm.d D4, C1, 0x00, 0x00 -+ xvstelm.d D0, C0, 0x08, 0x01 -+ xvstelm.d D4, C1, 0x08, 0x01 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x10 -+ addi.d C1, C1, 0x10 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -2 -+#else -+ addi.d L, L, -2 -+#endif -+ slli.d T0, L, 0x04 -+ add.d A0, A0, T0 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x02 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N & 2 ) && (M & 2) ) End************/ -+ -+.L_N3_M1: -+ andi I, M, 1 -+ beq ZERO,I, .L_N3_M0 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x03 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x04 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
-+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 1 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 1 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_M1_L7 */ -+ beq ZERO,TL, .L_N3_M1_L7 -+ -+.L_N3_M1_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 1 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d 
U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N3_M1_TL1 -+ -+.L_N3_M1_L7: -+ /* if (!(L & 7)) goto L_N3_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N3_M1_L0 -+ -+.L_N3_M1_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N3_M1_L71 -+ -+.L_N3_M1_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D4, D4, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ -+ /* Load C1 */ -+ xvld U0, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U0 -+#endif // #if defined(TRMMKERNEL) -+ -+ xvstelm.d D0, C0, 0x00, 0x00 -+ xvstelm.d D4, C1, 0x00, 0x00 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x08 -+ addi.d C1, C1, 0x08 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -1 -+#else -+ addi.d L, L, -2 -+#endif -+ slli.d T0, L, 0x03 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x04 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x01 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N & 2 ) && (M & 1) ) End************/ -+ -+.L_N3_M0: -+ /* Add stride for B and C -+ * B += (K * 16) -+ * C += (LDC * 16) -+ */ -+ /* since the array type is double, -+ * so we must mul 16 -+ */ -+ slli.d T0, K, 4 -+ slli.d T1, LDC, 4 -+ add.d B, B, T0 -+ add.d C, C, T1 -+ -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addi.d OFF, 
OFF, 0x02 -+#endif -+ -+ /* We must reinit I */ -+ srai.d I, M, 4 /* I = bm >> 4 */ -+ -+/************************* Condition 2 if((N & 2) && (M >> 4)) End !!! ************************* -+* dgemm_core_16x2 */ -+ -+.L_N1: -+ andi J, N, 1 -+ beq ZERO, J, .L_N0 -+ -+/************************* Condition 3 if((N & 1) && (M >> 4)) START !!! ************************* -+* dgemm_core_16x1 */ -+ -+ move C0, C -+ move A0, A -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move OFF, OFFSET -+#endif -+ -+ /* if (!(M >> 4)) goto L_N1_M8 */ -+ srai.d I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_N1_M8 -+ -+.L_N1_I1: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x07 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x03 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 16 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 16 * 64 from A0 -+ * U0 = {a3, a2, a1, a0} -+ * U1 = {a7, a6, a5, a4} -+ * U2 = {a11, a10, a9, a8} -+ * U3 = {a15, a14, a13, a12} -+ */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ xvfmul.d D2, U2, U4 -+ xvfmul.d D3, U3, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_L7 */ -+ beq ZERO,TL, .L_N1_L7 -+ -+.L_N1_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ 
xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ /***8-2***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ /***8-3***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ /***8-4***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ /***8-5***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ /***8-6***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ /***8-7***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ 
addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ /***8-8***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_TL1 -+ -+.L_N1_L7: -+ /* if (!(L & 7)) goto L_N1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_L0 -+ -+.L_N1_L71: -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N1_L71 -+ -+.L_N1_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D1, D1, VALPHA -+ xvfmul.d D2, D2, VALPHA -+ xvfmul.d D3, D3, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ xvfmadd.d D1, D1, VALPHA, U1 -+ xvfmadd.d D2, D2, VALPHA, U2 -+ xvfmadd.d D3, D3, VALPHA, U3 -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ xvst D1, C0, 0x20 -+ xvst D2, C0, 0x40 -+ xvst D3, C0, 0x60 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x80 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -16 -+#else -+ addi.d L, L, -1 -+#endif -+ slli.d T0, L, 0x07 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x03 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x10 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+ addi.d I, I, -1 /* I-- */ -+ blt ZERO,I, .L_N1_I1 -+ -+.L_N1_M8: -+ /* We have done 
M & 16, considering M=8/4/2/1 */ -+ andi I, M, 15 -+ beq ZERO,I, .L_N1_M0 -+ -+ andi I, M, 8 -+ beq ZERO,I, .L_N1_M4 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x06 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x03 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 8 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 8 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M8_L7 */ -+ beq ZERO,TL, .L_N1_M8_L7 -+ -+.L_N1_M8_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ 
xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M8_TL1 -+ -+.L_N1_M8_L7: -+ /* if (!(L & 7)) goto L_N1_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M8_L0 -+ -+.L_N1_M8_L71: -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N1_M8_L71 -+ -+.L_N1_M8_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D1, D1, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ xvfmadd.d D1, D1, VALPHA, U1 -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ xvst D1, C0, 0x20 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x40 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -8 -+#else -+ addi.d L, L, -1 -+#endif -+ slli.d T0, L, 0x06 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x03 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x08 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N & 1) && (M & 8) ) End************/ -+ -+.L_N1_M4: -+ 
andi I, M, 4 -+ beq ZERO,I, .L_N1_M2 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x05 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x03 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 4 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 4 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M4_L7 */ -+ beq ZERO,TL, .L_N1_M4_L7 -+ -+.L_N1_M4_TL1: /* TL-- */ -+ /***8-1***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ 
/***8-8***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M4_TL1 -+ -+.L_N1_M4_L7: -+ /* if (!(L & 7)) goto L_N1_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M4_L0 -+ -+.L_N1_M4_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N1_M4_L71 -+ -+.L_N1_M4_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+#endif // #if defined(TRMMKERNEL) -+ -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x20 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -4 -+#else -+ addi.d L, L, -1 -+#endif -+ slli.d T0, L, 0x05 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x03 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x04 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N & 1) && (M & 4) ) End************/ -+ -+.L_N1_M2: -+ andi I, M, 2 -+ beq ZERO,I, .L_N1_M1 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x04 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x03 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 2 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 2 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ 
xvfmul.d D0, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M2_L7 */ -+ beq ZERO,TL, .L_N1_M2_L7 -+ -+.L_N1_M2_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 2 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M2_TL1 -+ -+.L_N1_M2_L7: -+ /* if (!(L & 7)) goto L_N1_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M2_L0 -+ -+.L_N1_M2_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N1_M2_L71 -+ -+.L_N1_M2_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d 
D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+#endif // #if defined(TRMMKERNEL) -+ -+ xvstelm.d D0, C0, 0x00, 0x00 -+ xvstelm.d D0, C0, 0x08, 0x01 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x10 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -2 -+#else -+ addi.d L, L, -1 -+#endif -+ slli.d T0, L, 0x04 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x03 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x02 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N & 1 ) && (M & 2) ) End************/ -+ -+.L_N1_M1: -+ andi I, M, 1 -+ beq ZERO,I, .L_N1_M0 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x03 -+ add.d A0, A0, T0 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 1 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 1 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M1_L7 */ -+ beq ZERO,TL, .L_N1_M1_L7 -+ -+.L_N1_M1_TL1: /* TL-- */ -+ /***8-1***/ -+ /* Load 1 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ /***8-2***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ /***8-3***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ 
xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ /***8-4***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ /***8-5***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ /***8-6***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ /***8-7***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ /***8-8***/ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M1_TL1 -+ -+.L_N1_M1_L7: -+ /* if (!(L & 7)) goto L_N1_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M1_L0 -+ -+.L_N1_M1_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N1_M1_L71 -+ -+.L_N1_M1_L0: -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+#endif // #if defined(TRMMKERNEL) -+ -+ xvstelm.d D0, C0, 0x00, 0x00 -+ -+ /* Add stride for C */ -+ addi.d C0, C0, 0x08 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ addi.d L, L, -1 -+#else -+ addi.d L, L, -1 -+#endif -+ slli.d T0, L, 0x03 -+ add.d A0, A0, T0 -+ add.d B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ addi.d OFF, OFF, 0x01 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+/********LOOP (if(N & 1 ) && (M & 1) ) End************/ -+ -+.L_N1_M0: -+ -+/************************* Condition 3 if((N & 1) && (M >> 
4)) End !!! ************************* -+* dgemm_core_16x1 */ -+ -+.L_N0: -+ /* Restore regs */ -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 16 -+ LDARG $r26, $sp, 24 -+ LDARG $r27, $sp, 32 -+ LD $f23, $sp, 40 -+ addi.d $sp, $sp, 56 -+ -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/dgemm_ncopy_16.S b/kernel/loongarch64/dgemm_ncopy_16.S -new file mode 100644 -index 0000000..95c8790 ---- /dev/null -+++ b/kernel/loongarch64/dgemm_ncopy_16.S -@@ -0,0 +1,691 @@ -+/******************************************************************************* -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+/* Function parameters */ -+#define M $r4 // param 1: m -+#define N $r5 // param 2: n -+#define SRC $r6 // param 3: src -+#define LDA $r7 // param 4: lda -+#define DST $r8 // param 5: dst -+ -+#define I $r9 -+#define J $r10 -+#define S1 $r12 -+#define S2 $r13 -+#define S3 $r14 -+#define S4 $r15 -+#define S5 $r16 -+#define S6 $r17 -+#define S7 $r18 -+#define S8 $r19 -+#define S9 $r20 -+#define S10 $r23 -+#define S11 $r24 -+#define S12 $r25 -+#define S13 $r26 -+#define S14 $r27 -+#define S15 $r28 -+#define S16 $r29 -+#define TD $r30 -+#define TS $r31 -+#define TL $r7 -+#define T0 $r6 -+#define ZERO $r0 -+ -+#define F0 $f0 -+#define F1 $f1 -+#define F2 $f2 -+#define F3 $f3 -+#define F4 $f4 -+#define F5 $f5 -+#define F6 $f6 -+#define F7 $f7 -+/* LASX vectors */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+#define U8 $xr8 -+#define U9 $xr9 -+#define U10 $xr10 -+#define U11 $xr11 -+#define U12 $xr12 -+#define U13 $xr13 -+#define U14 $xr14 -+#define U15 $xr15 -+#define D0 $xr16 -+#define D1 $xr17 -+#define D2 $xr18 -+#define D3 $xr19 -+#define D4 $xr20 -+#define D5 $xr21 -+#define D6 $xr22 -+#define D7 $xr23 -+#define D8 $xr24 -+#define D9 $xr25 -+#define D10 $xr26 -+#define D11 $xr27 -+#define D12 $xr28 -+#define D13 $xr29 
-+#define D14 $xr30 -+#define D15 $xr31 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -0x90 -+ SDARG $r23, $sp, 0x00 -+ SDARG $r24, $sp, 0x08 -+ SDARG $r25, $sp, 0x10 -+ SDARG $r26, $sp, 0x18 -+ SDARG $r27, $sp, 0x20 -+ SDARG $r28, $sp, 0x28 -+ SDARG $r29, $sp, 0x30 -+ SDARG $r30, $sp, 0x38 -+ SDARG $r31, $sp, 0x40 -+ ST $f23, $sp, 0x48 -+ ST $f24, $sp, 0x50 -+ ST $f25, $sp, 0x58 -+ ST $f26, $sp, 0x60 -+ ST $f27, $sp, 0x68 -+ ST $f28, $sp, 0x70 -+ ST $f29, $sp, 0x78 -+ ST $f30, $sp, 0x80 -+ ST $f31, $sp, 0x88 -+ -+ move TD, DST -+ move TS, SRC -+ slli.d TL, LDA, 0x03 -+ slli.d T0, TL, 0x01 -+ srai.d J, N, 0x04 -+ beq J, ZERO, .L_N8 -+ -+.L_J1: /* J-- */ -+ move S1, TS -+ add.d S2, TS, TL -+ srai.d I, M, 0x03 -+ add.d S3, S2, TL -+ addi.d J, J, -1 -+ add.d S4, S3, TL -+ add.d S5, S3, T0 -+ add.d S6, S4, T0 -+ add.d S7, S5, T0 -+ add.d S8, S6, T0 -+ add.d S9, S7, T0 -+ add.d S10, S8, T0 -+ add.d S11, S9, T0 -+ add.d S12, S10, T0 -+ add.d S13, S11, T0 -+ add.d S14, S12, T0 -+ add.d S15, S13, T0 -+ add.d S16, S14, T0 -+ add.d TS, S15, T0 -+ beq I, ZERO, .L_I7 -+ -+.L_I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ xvld U4, S5, 0x00 -+ xvld U5, S6, 0x00 -+ xvld U6, S7, 0x00 -+ xvld U7, S8, 0x00 -+ xvld U8, S9, 0x00 -+ xvld U9, S10, 0x00 -+ xvld U10, S11, 0x00 -+ xvld U11, S12, 0x00 -+ xvld U12, S13, 0x00 -+ xvld U13, S14, 0x00 -+ xvld U14, S15, 0x00 -+ xvld U15, S16, 0x00 -+ -+ xvpackev.d D0, U1, U0 -+ xvpackod.d D1, U1, U0 -+ xvpackev.d D2, U3, U2 -+ xvpackod.d D3, U3, U2 -+ xvpackev.d D4, U5, U4 -+ xvpackod.d D5, U5, U4 -+ xvpackev.d D6, U7, U6 -+ xvpackod.d D7, U7, U6 -+ -+ xvpackev.d D8, U9, U8 -+ xvpackod.d D9, U9, U8 -+ xvpackev.d D10, U11, U10 -+ xvpackod.d D11, U11, U10 -+ xvpackev.d D12, U13, U12 -+ xvpackod.d D13, U13, U12 -+ xvpackev.d D14, U15, U14 -+ xvpackod.d D15, U15, U14 -+ -+ xvand.v U0, D0, D0 -+ xvpermi.q D0, D2, 0x02 // 0 -+ xvand.v U4, D4, D4 -+ xvpermi.q D4, D6, 0x02 // 1 -+ xvand.v U1, D1, D1 -+ 
xvpermi.q D1, D3, 0x02 // 4 -+ xvand.v U5, D5, D5 -+ xvpermi.q D5, D7, 0x02 // 5 -+ xvpermi.q D2, U0, 0x31 // 8 -+ xvpermi.q D6, U4, 0x31 // 9 -+ xvpermi.q D3, U1, 0x31 // 12 -+ xvpermi.q D7, U5, 0x31 // 13 -+ -+ xvand.v U8, D8, D8 -+ xvpermi.q D8, D10, 0x02 // 2 -+ xvand.v U12, D12, D12 -+ xvpermi.q D12, D14, 0x02 // 3 -+ xvand.v U9, D9, D9 -+ xvpermi.q D9, D11, 0x02 // 6 -+ xvand.v U13, D13, D13 -+ xvpermi.q D13, D15, 0x02 // 7 -+ xvpermi.q D10, U8, 0x31 // 10 -+ xvpermi.q D14, U12, 0x31 // 11 -+ xvpermi.q D11, U9, 0x31 // 14 -+ xvpermi.q D15, U13, 0x31 // 15 -+ -+ xvst D0, TD, 0x00 // 0 -+ xvst D4, TD, 0x20 // 1 -+ xvst D8, TD, 0x40 // 2 -+ xvst D12, TD, 0x60 // 3 -+ xvst D1, TD, 0x80 // 4 -+ xvst D5, TD, 0xA0 // 5 -+ xvst D9, TD, 0xC0 // 6 -+ xvst D13, TD, 0xE0 // 7 -+ addi.d TD, TD, 0x100 -+ xvst D2, TD, 0x00 // 8 -+ xvst D6, TD, 0x20 // 9 -+ xvst D10, TD, 0x40 // 10 -+ xvst D14, TD, 0x60 // 11 -+ xvst D3, TD, 0x80 // 12 -+ xvst D7, TD, 0xA0 // 13 -+ xvst D11, TD, 0xC0 // 14 -+ xvst D15, TD, 0xE0 // 15 -+ addi.d TD, TD, 0x100 -+ -+ xvld U0, S1, 0x20 -+ xvld U1, S2, 0x20 -+ xvld U2, S3, 0x20 -+ xvld U3, S4, 0x20 -+ xvld U4, S5, 0x20 -+ xvld U5, S6, 0x20 -+ xvld U6, S7, 0x20 -+ xvld U7, S8, 0x20 -+ xvld U8, S9, 0x20 -+ xvld U9, S10, 0x20 -+ xvld U10, S11, 0x20 -+ xvld U11, S12, 0x20 -+ xvld U12, S13, 0x20 -+ xvld U13, S14, 0x20 -+ xvld U14, S15, 0x20 -+ xvld U15, S16, 0x20 -+ -+ xvpackev.d D0, U1, U0 -+ xvpackod.d D1, U1, U0 -+ xvpackev.d D2, U3, U2 -+ xvpackod.d D3, U3, U2 -+ xvpackev.d D4, U5, U4 -+ xvpackod.d D5, U5, U4 -+ xvpackev.d D6, U7, U6 -+ xvpackod.d D7, U7, U6 -+ -+ xvpackev.d D8, U9, U8 -+ xvpackod.d D9, U9, U8 -+ xvpackev.d D10, U11, U10 -+ xvpackod.d D11, U11, U10 -+ xvpackev.d D12, U13, U12 -+ xvpackod.d D13, U13, U12 -+ xvpackev.d D14, U15, U14 -+ xvpackod.d D15, U15, U14 -+ -+ xvand.v U0, D0, D0 -+ xvpermi.q D0, D2, 0x02 // 0 -+ xvand.v U4, D4, D4 -+ xvpermi.q D4, D6, 0x02 // 1 -+ xvand.v U1, D1, D1 -+ xvpermi.q D1, D3, 0x02 // 4 -+ xvand.v U5, 
D5, D5 -+ xvpermi.q D5, D7, 0x02 // 5 -+ xvpermi.q D2, U0, 0x31 // 8 -+ xvpermi.q D6, U4, 0x31 // 9 -+ xvpermi.q D3, U1, 0x31 // 12 -+ xvpermi.q D7, U5, 0x31 // 13 -+ -+ xvand.v U8, D8, D8 -+ xvpermi.q D8, D10, 0x02 // 2 -+ xvand.v U12, D12, D12 -+ xvpermi.q D12, D14, 0x02 // 3 -+ xvand.v U9, D9, D9 -+ xvpermi.q D9, D11, 0x02 // 6 -+ xvand.v U13, D13, D13 -+ xvpermi.q D13, D15, 0x02 // 7 -+ xvpermi.q D10, U8, 0x31 // 10 -+ xvpermi.q D14, U12, 0x31 // 11 -+ xvpermi.q D11, U9, 0x31 // 14 -+ xvpermi.q D15, U13, 0x31 // 15 -+ -+ xvst D0, TD, 0x00 // 0 -+ xvst D4, TD, 0x20 // 1 -+ xvst D8, TD, 0x40 // 2 -+ xvst D12, TD, 0x60 // 3 -+ xvst D1, TD, 0x80 // 4 -+ xvst D5, TD, 0xA0 // 5 -+ xvst D9, TD, 0xC0 // 6 -+ xvst D13, TD, 0xE0 // 7 -+ addi.d TD, TD, 0x100 -+ xvst D2, TD, 0x00 // 8 -+ xvst D6, TD, 0x20 // 9 -+ xvst D10, TD, 0x40 // 10 -+ xvst D14, TD, 0x60 // 11 -+ xvst D3, TD, 0x80 // 12 -+ xvst D7, TD, 0xA0 // 13 -+ xvst D11, TD, 0xC0 // 14 -+ xvst D15, TD, 0xE0 // 15 -+ addi.d TD, TD, 0x100 -+ -+ -+ addi.d S1, S1, 0x40 -+ addi.d S2, S2, 0x40 -+ addi.d S3, S3, 0x40 -+ addi.d S4, S4, 0x40 -+ addi.d S5, S5, 0x40 -+ addi.d S6, S6, 0x40 -+ addi.d S7, S7, 0x40 -+ addi.d S8, S8, 0x40 -+ addi.d S9, S9, 0x40 -+ addi.d S10, S10, 0x40 -+ addi.d S11, S11, 0x40 -+ addi.d S12, S12, 0x40 -+ addi.d S13, S13, 0x40 -+ addi.d S14, S14, 0x40 -+ addi.d S15, S15, 0x40 -+ addi.d S16, S16, 0x40 -+ -+ addi.d I, I, -1 -+ blt ZERO, I, .L_I1 -+ -+.L_I7: -+ andi I, M, 0x07 -+ beq I, ZERO, .L_I0 -+ -+.L_II1: /* I-- */ -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ fld.d F2, S3, 0x00 -+ fld.d F3, S4, 0x00 -+ fld.d F4, S5, 0x00 -+ fld.d F5, S6, 0x00 -+ fld.d F6, S7, 0x00 -+ fld.d F7, S8, 0x00 -+ -+ fst.d F0, TD, 0x00 -+ addi.d S1, S1, 0x08 -+ fst.d F1, TD, 0x08 -+ addi.d S2, S2, 0x08 -+ fst.d F2, TD, 0x10 -+ addi.d S3, S3, 0x08 -+ fst.d F3, TD, 0x18 -+ addi.d S4, S4, 0x08 -+ fst.d F4, TD, 0x20 -+ addi.d S5, S5, 0x08 -+ fst.d F5, TD, 0x28 -+ addi.d S6, S6, 0x08 -+ fst.d F6, TD, 0x30 -+ addi.d S7, 
S7, 0x08 -+ fst.d F7, TD, 0x38 -+ addi.d S8, S8, 0x08 -+ addi.d TD, TD, 0x40 -+ -+ fld.d F0, S9, 0x00 -+ fld.d F1, S10, 0x00 -+ fld.d F2, S11, 0x00 -+ fld.d F3, S12, 0x00 -+ fld.d F4, S13, 0x00 -+ fld.d F5, S14, 0x00 -+ fld.d F6, S15, 0x00 -+ fld.d F7, S16, 0x00 -+ -+ fst.d F0, TD, 0x00 -+ addi.d S9, S9, 0x08 -+ fst.d F1, TD, 0x08 -+ addi.d S10, S10, 0x08 -+ fst.d F2, TD, 0x10 -+ addi.d S11, S11, 0x08 -+ fst.d F3, TD, 0x18 -+ addi.d S12, S12, 0x08 -+ fst.d F4, TD, 0x20 -+ addi.d S13, S13, 0x08 -+ fst.d F5, TD, 0x28 -+ addi.d S14, S14, 0x08 -+ fst.d F6, TD, 0x30 -+ addi.d S15, S15, 0x08 -+ fst.d F7, TD, 0x38 -+ addi.d S16, S16, 0x08 -+ addi.d TD, TD, 0x40 -+ -+ addi.d I, I, -1 -+ blt ZERO, I, .L_II1 -+ -+.L_I0: -+ blt ZERO, J, .L_J1 -+ -+.L_N8: -+ andi J, N, 0x08 -+ beq ZERO, J, .L_N4 -+ -+ move S1, TS -+ add.d S2, TS, TL -+ srai.d I, M, 0x03 -+ add.d S3, S2, TL -+ add.d S4, S2, T0 -+ add.d S5, S3, T0 -+ add.d S6, S4, T0 -+ add.d S7, S5, T0 -+ add.d S8, S6, T0 -+ add.d TS, S7, T0 -+ beq I, ZERO, .L_8I3 -+ -+.L_8I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ xvld U4, S5, 0x00 -+ xvld U5, S6, 0x00 -+ xvld U6, S7, 0x00 -+ xvld U7, S8, 0x00 -+ -+ xvpackev.d D0, U1, U0 -+ xvpackod.d D1, U1, U0 -+ xvpackev.d D2, U3, U2 -+ xvpackod.d D3, U3, U2 -+ xvpackev.d D4, U5, U4 -+ xvpackod.d D5, U5, U4 -+ xvpackev.d D6, U7, U6 -+ xvpackod.d D7, U7, U6 -+ -+ xvand.v U0, D0, D0 -+ xvpermi.q D0, D2, 0x02 // 0 -+ xvand.v U4, D4, D4 -+ xvpermi.q D4, D6, 0x02 // 1 -+ xvand.v U1, D1, D1 -+ xvpermi.q D1, D3, 0x02 // 2 -+ xvand.v U5, D5, D5 -+ xvpermi.q D5, D7, 0x02 // 3 -+ xvpermi.q D2, U0, 0x31 // 4 -+ xvpermi.q D6, U4, 0x31 // 5 -+ xvpermi.q D3, U1, 0x31 // 6 -+ xvpermi.q D7, U5, 0x31 // 7 -+ -+ xvst D0, TD, 0x00 -+ xvst D4, TD, 0x20 -+ xvst D1, TD, 0x40 -+ xvst D5, TD, 0x60 -+ xvst D2, TD, 0x80 -+ xvst D6, TD, 0xA0 -+ xvst D3, TD, 0xC0 -+ xvst D7, TD, 0xE0 -+ addi.d TD, TD, 0x100 -+ -+ xvld U0, S1, 0x20 -+ xvld U1, S2, 0x20 -+ xvld 
U2, S3, 0x20 -+ xvld U3, S4, 0x20 -+ xvld U4, S5, 0x20 -+ xvld U5, S6, 0x20 -+ xvld U6, S7, 0x20 -+ xvld U7, S8, 0x20 -+ -+ xvpackev.d D0, U1, U0 -+ xvpackod.d D1, U1, U0 -+ xvpackev.d D2, U3, U2 -+ xvpackod.d D3, U3, U2 -+ xvpackev.d D4, U5, U4 -+ xvpackod.d D5, U5, U4 -+ xvpackev.d D6, U7, U6 -+ xvpackod.d D7, U7, U6 -+ -+ xvand.v U0, D0, D0 -+ xvpermi.q D0, D2, 0x02 // 0 -+ xvand.v U4, D4, D4 -+ xvpermi.q D4, D6, 0x02 // 1 -+ xvand.v U1, D1, D1 -+ xvpermi.q D1, D3, 0x02 // 2 -+ xvand.v U5, D5, D5 -+ xvpermi.q D5, D7, 0x02 // 3 -+ xvpermi.q D2, U0, 0x31 // 4 -+ xvpermi.q D6, U4, 0x31 // 5 -+ xvpermi.q D3, U1, 0x31 // 6 -+ xvpermi.q D7, U5, 0x31 // 7 -+ -+ xvst D0, TD, 0x00 -+ xvst D4, TD, 0x20 -+ xvst D1, TD, 0x40 -+ xvst D5, TD, 0x60 -+ xvst D2, TD, 0x80 -+ xvst D6, TD, 0xA0 -+ xvst D3, TD, 0xC0 -+ xvst D7, TD, 0xE0 -+ addi.d TD, TD, 0x100 -+ -+ addi.d S1, S1, 0x40 -+ addi.d S2, S2, 0x40 -+ addi.d S3, S3, 0x40 -+ addi.d S4, S4, 0x40 -+ addi.d S5, S5, 0x40 -+ addi.d S6, S6, 0x40 -+ addi.d S7, S7, 0x40 -+ addi.d S8, S8, 0x40 -+ -+ addi.d I, I, -1 -+ blt ZERO, I, .L_8I1 -+ -+.L_8I3: -+ andi I, M, 0x07 -+ beq I, ZERO, .L_N4 -+ -+.L_8I11: -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ fld.d F2, S3, 0x00 -+ fld.d F3, S4, 0x00 -+ fld.d F4, S5, 0x00 -+ fld.d F5, S6, 0x00 -+ fld.d F6, S7, 0x00 -+ fld.d F7, S8, 0x00 -+ -+ fst.d F0, TD, 0x00 -+ addi.d S1, S1, 0x08 -+ fst.d F1, TD, 0x08 -+ addi.d S2, S2, 0x08 -+ fst.d F2, TD, 0x10 -+ addi.d S3, S3, 0x08 -+ fst.d F3, TD, 0x18 -+ addi.d S4, S4, 0x08 -+ fst.d F4, TD, 0x20 -+ addi.d S5, S5, 0x08 -+ fst.d F5, TD, 0x28 -+ addi.d S6, S6, 0x08 -+ fst.d F6, TD, 0x30 -+ addi.d S7, S7, 0x08 -+ fst.d F7, TD, 0x38 -+ addi.d S8, S8, 0x08 -+ -+ addi.d TD, TD, 0x40 -+ addi.d I, I, -1 -+ blt ZERO, I, .L_8I11 -+ -+.L_N4: -+ andi J, N, 0x04 -+ beq ZERO, J, .L_N2 -+ -+ move S1, TS -+ add.d S2, TS, TL -+ srai.d I, M, 0x02 -+ add.d S3, S2, TL -+ add.d S4, S2, T0 -+ add.d TS, S3, T0 -+ beq I, ZERO, .L_I3 -+ -+.L_4I1: /* I-- */ -+ xvld U0, S1, 
0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ -+ xvpackev.d D0, U1, U0 -+ xvpackod.d D1, U1, U0 -+ xvpackev.d D2, U3, U2 -+ xvpackod.d D3, U3, U2 -+ -+ xvand.v U0, D0, D0 -+ xvpermi.q D0, D2, 0x02 // 0 -+ xvand.v U1, D1, D1 -+ xvpermi.q D1, D3, 0x02 // 1 -+ xvpermi.q D2, U0, 0x31 // 2 -+ xvpermi.q D3, U1, 0x31 // 3 -+ -+ xvst D0, TD, 0x00 -+ xvst D1, TD, 0x20 -+ xvst D2, TD, 0x40 -+ xvst D3, TD, 0x60 -+ -+ addi.d S1, S1, 0x20 -+ addi.d S2, S2, 0x20 -+ addi.d S3, S3, 0x20 -+ addi.d S4, S4, 0x20 -+ addi.d TD, TD, 0x80 -+ -+ addi.d I, I, -1 -+ blt ZERO, I, .L_4I1 -+ -+.L_I3: -+ andi I, M, 0x03 -+ beq I, ZERO, .L_N2 -+ -+.L_4II1: -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ fld.d F2, S3, 0x00 -+ fld.d F3, S4, 0x00 -+ -+ fst.d F0, TD, 0x00 -+ addi.d S1, S1, 0x08 -+ fst.d F1, TD, 0x08 -+ addi.d S2, S2, 0x08 -+ fst.d F2, TD, 0x10 -+ addi.d S3, S3, 0x08 -+ fst.d F3, TD, 0x18 -+ addi.d S4, S4, 0x08 -+ -+ addi.d TD, TD, 0x20 -+ addi.d I, I, -1 -+ blt ZERO, I, .L_4II1 -+ -+.L_N2: -+ andi J, N, 0x02 -+ beq ZERO, J, .L_N1 -+ -+ move S1, TS -+ add.d S2, TS, TL -+ srai.d I, M, 0x01 -+ add.d TS, S2, TL -+ beq I, ZERO, .L_NI1 -+ -+.L_2I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ -+ xvpackev.d D0, U1, U0 -+ xvpackod.d D1, U1, U0 -+ -+ xvpermi.q D0, D1, 0x02 // 0 -+ -+ xvst D0, TD, 0x00 -+ -+ addi.d S1, S1, 0x10 -+ addi.d S2, S2, 0x10 -+ addi.d TD, TD, 0x20 -+ -+ addi.d I, I, -1 -+ blt ZERO, I, .L_2I1 -+ -+.L_NI1: -+ andi I, M, 0x01 -+ beq I, ZERO, .L_N1 -+ -+ -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ -+ fst.d F0, TD, 0x00 -+ addi.d S1, S1, 0x08 -+ fst.d F1, TD, 0x08 -+ addi.d S2, S2, 0x08 -+ addi.d TD, TD, 0x10 -+ -+.L_N1: -+ move S1, TS -+ beq ZERO, M, .L_N0 -+ -+.L_M1: -+ fld.d F0, S1, 0x00 -+ addi.d S1, S1, 0x08 -+ fst.d F0, TD, 0x00 -+ addi.d TD, TD, 0x08 -+ addi.d M, M, -1 -+ blt ZERO, M, .L_M1 -+ -+.L_N0: -+ LDARG $r23, $sp, 0x00 -+ LDARG $r24, $sp, 0x08 -+ LDARG $r25, $sp, 0x10 -+ LDARG $r26, $sp, 0x18 -+ LDARG $r27, $sp, 0x20 -+ 
LDARG $r28, $sp, 0x28 -+ LDARG $r29, $sp, 0x30 -+ LDARG $r30, $sp, 0x38 -+ LDARG $r31, $sp, 0x40 -+ LD $f23, $sp, 0x48 -+ LD $f24, $sp, 0x50 -+ LD $f25, $sp, 0x58 -+ LD $f26, $sp, 0x60 -+ LD $f27, $sp, 0x68 -+ LD $f28, $sp, 0x70 -+ LD $f29, $sp, 0x78 -+ LD $f30, $sp, 0x80 -+ LD $f31, $sp, 0x88 -+ addi.d $sp, $sp, 0x90 -+ jirl $r0, $r1, 0x00 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/dgemm_ncopy_4.S b/kernel/loongarch64/dgemm_ncopy_4.S -new file mode 100644 -index 0000000..b1f322a ---- /dev/null -+++ b/kernel/loongarch64/dgemm_ncopy_4.S -@@ -0,0 +1,237 @@ -+/******************************************************************************* -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+/* Function parameters */ -+#define M $r4 // param 1: m -+#define N $r5 // param 2: n -+#define SRC $r6 // param 3: src -+#define LDA $r7 // param 4: lda -+#define DST $r8 // param 5: dst -+ -+#define I $r9 -+#define J $r10 -+#define S1 $r12 -+#define S2 $r13 -+#define S3 $r14 -+#define S4 $r15 -+#define S5 $r16 -+#define S6 $r17 -+#define S7 $r18 -+#define S8 $r19 -+#define TD $r20 -+#define TS $r11 -+#define TL $r7 -+#define T0 $r23 -+#define ZERO $r0 -+ -+#define F0 $f0 -+#define F1 $f1 -+#define F2 $f2 -+#define F3 $f3 -+#define F4 $f4 -+#define F5 $f5 -+#define F6 $f6 -+#define F7 $f7 -+/* LASX vectors */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+#define D0 $xr14 -+#define D1 $xr8 -+#define D2 $xr9 -+#define D3 $xr10 -+#define D4 $xr11 -+#define D5 $xr12 -+#define D6 $xr13 -+#define D7 $xr15 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -8 -+ SDARG $r23, $sp, 0 -+ -+ move TD, DST -+ move TS, SRC -+ slli.d TL, LDA, 0x03 -+ slli.d T0, TL, 0x01 -+ srai.d J, N, 0x02 -+ beq J, ZERO, .L_N2 -+ -+.L_J1: /* J-- */ -+ move S1, TS -+ add.d S2, TS, TL -+ srai.d I, M, 0x02 -+ add.d S3, S2, TL -+ add.d S4, S2, T0 -+ add.d TS, S3, T0 -+ addi.d J, J, -1 -+ beq I, ZERO, .L_I3 -+ -+.L_I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 
0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ -+ xvpackev.d D0, U1, U0 -+ xvpackod.d D1, U1, U0 -+ xvpackev.d D2, U3, U2 -+ xvpackod.d D3, U3, U2 -+ -+ xvand.v U0, D0, D0 -+ xvpermi.q D0, D2, 0x02 // 0 -+ xvand.v U1, D1, D1 -+ xvpermi.q D1, D3, 0x02 // 1 -+ xvpermi.q D2, U0, 0x31 // 2 -+ xvpermi.q D3, U1, 0x31 // 3 -+ -+ xvst D0, TD, 0x00 -+ xvst D1, TD, 0x20 -+ xvst D2, TD, 0x40 -+ xvst D3, TD, 0x60 -+ -+ addi.d S1, S1, 0x20 -+ addi.d S2, S2, 0x20 -+ addi.d S3, S3, 0x20 -+ addi.d S4, S4, 0x20 -+ addi.d TD, TD, 0x80 -+ -+ addi.d I, I, -1 -+ blt ZERO, I, .L_I1 -+ -+.L_I3: -+ andi I, M, 0x03 -+ beq I, ZERO, .L_I0 -+ -+.L_II1: -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ fld.d F2, S3, 0x00 -+ fld.d F3, S4, 0x00 -+ -+ fst.d F0, TD, 0x00 -+ addi.d S1, S1, 0x08 -+ fst.d F1, TD, 0x08 -+ addi.d S2, S2, 0x08 -+ fst.d F2, TD, 0x10 -+ addi.d S3, S3, 0x08 -+ fst.d F3, TD, 0x18 -+ addi.d S4, S4, 0x08 -+ -+ addi.d TD, TD, 0x20 -+ addi.d I, I, -1 -+ blt ZERO, I, .L_II1 -+ -+.L_I0: -+ blt ZERO, J, .L_J1 -+ -+.L_N2: -+ andi J, N, 0x02 -+ beq ZERO, J, .L_N1 -+ -+ move S1, TS -+ add.d S2, TS, TL -+ srai.d I, M, 0x02 -+ add.d TS, S2, TL -+ beq I, ZERO, .L_2I3 -+ -+.L_2I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ -+ xvpackev.d D0, U1, U0 -+ xvpackod.d D1, U1, U0 -+ -+ xvand.v U0, D0, D0 -+ xvpermi.q D0, D1, 0x02 // 0 -+ xvpermi.q D1, U0, 0x31 // 1 -+ -+ xvst D0, TD, 0x00 -+ xvst D1, TD, 0x20 -+ addi.d S1, S1, 0x20 -+ addi.d S2, S2, 0x20 -+ addi.d TD, TD, 0x40 -+ addi.d I, I, -1 -+ blt ZERO, I, .L_2I1 -+ -+.L_2I3: -+ andi I, M, 0x03 -+ beq ZERO, I, .L_N1 -+ -+.L_2II1: /* I-- */ -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ fst.d F0, TD, 0x00 -+ addi.d I, I, -1 -+ fst.d F1, TD, 0x08 -+ addi.d S1, S1, 0x08 -+ addi.d S2, S2, 0x08 -+ addi.d TD, TD, 0x10 -+ blt ZERO, I, .L_2II1 -+ -+.L_N1: -+ andi J, N, 0x01 -+ beq ZERO, J, .L_N0 -+ -+ move S1, TS -+ srai.d I, M, 0x02 -+ beq ZERO, I, .L_1I3 -+ -+.L_1I1: -+ xvld U0, S1, 0x00 -+ addi.d S1, S1, 0x20 -+ xvst U0, TD, 0x00 -+ 
addi.d I, I, -1 -+ addi.d TD, TD, 0x20 -+ blt ZERO, I, .L_1I1 -+ -+.L_1I3: -+ andi I, M, 0x03 -+ beq ZERO, I, .L_N0 -+ -+.L_1II1: -+ fld.d F0, S1, 0x00 -+ addi.d S1, S1, 0x08 -+ fst.d F0, TD, 0x00 -+ addi.d I, I, -1 -+ addi.d TD, TD, 0x08 -+ blt ZERO, I, .L_1II1 -+ -+.L_N0: -+ LDARG $r23, $sp, 0 -+ addi.d $sp, $sp, 8 -+ jirl $r0, $r1, 0x00 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/dgemm_tcopy_16.S b/kernel/loongarch64/dgemm_tcopy_16.S -new file mode 100644 -index 0000000..afafe5b ---- /dev/null -+++ b/kernel/loongarch64/dgemm_tcopy_16.S -@@ -0,0 +1,710 @@ -+/******************************************************************************* -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+/* Function parameters */ -+#define M $r4 // param 1: m -+#define N $r5 // param 2: n -+#define SRC $r6 // param 3: src -+#define LDA $r7 // param 4: lda -+#define DST $r8 // param 5: dst -+ -+#define I $r9 -+#define J $r10 -+#define S0 $r11 -+#define S1 $r12 -+#define S2 $r13 -+#define S3 $r14 -+#define S4 $r15 -+#define S5 $r16 -+#define S6 $r17 -+#define S7 $r18 -+#define S8 $r19 -+#define P0 $r20 -+#define P1 $r23 -+#define P2 $r24 -+#define P3 $r25 -+#define P4 $r26 -+#define P5 $r27 -+#define T0 $r28 -+#define T1 $r29 -+#define TL $r7 -+#define ZERO $r0 -+ -+#define F0 $f0 -+#define F1 $f1 -+#define F2 $f2 -+#define F3 $f3 -+#define F4 $f4 -+#define F5 $f5 -+#define F6 $f6 -+#define F7 $f7 -+/* LASX vectors */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -56 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 -+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ SDARG $r28, $sp, 40 -+ SDARG $r29, $sp, 48 -+ -+ move S0, SRC -+ move P0, DST -+ -+ srai.d T0, N, 0x04 -+ srai.d T1, N, 0x03 -+ slli.d T0, T0, 0x04 -+ slli.d T1, T1, 0x03 -+ mul.d P2, M, T0 -+ mul.d P3, M, T1 -+ slli.d P2, P2, 0x03 -+ slli.d P3, P3, 0x03 -+ add.d P2, DST, P2 -+ add.d P3, DST, P3 -+ -+ srai.d T0, 
N, 0x02 -+ srai.d T1, N, 0x01 -+ slli.d T0, T0, 0x02 -+ slli.d T1, T1, 0x01 -+ mul.d P4, M, T0 -+ mul.d P5, M, T1 -+ slli.d P4, P4, 0x03 -+ slli.d P5, P5, 0x03 -+ add.d P4, DST, P4 -+ add.d P5, DST, P5 -+ -+ slli.d TL, LDA, 0x03 -+ srai.d J, M, 0x03 -+ slli.d T0, TL, 0x01 -+ slli.d T1, M, 0x07 -+ beq ZERO, J, .L_M7 -+ -+.L_J1: /* J-- */ -+ move S1, S0 -+ add.d S2, S0, TL -+ add.d S3, S1, T0 -+ add.d S4, S2, T0 -+ add.d S5, S3, T0 -+ add.d S6, S4, T0 -+ add.d S7, S5, T0 -+ add.d S8, S6, T0 -+ add.d S0, S7, T0 -+ -+ move P1, P0 -+ addi.d P0, P0, 0x400 -+ -+ srai.d I, N, 0x04 -+ addi.d J, J, -1 -+ beq ZERO, I, .L_N15 -+ -+.L_I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S1, 0x40 -+ xvld U3, S1, 0x60 -+ xvld U4, S2, 0x00 -+ xvld U5, S2, 0x20 -+ xvld U6, S2, 0x40 -+ xvld U7, S2, 0x60 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ xvst U2, P1, 0x40 -+ xvst U3, P1, 0x60 -+ xvst U4, P1, 0x80 -+ xvst U5, P1, 0xA0 -+ xvst U6, P1, 0xC0 -+ xvst U7, P1, 0xE0 -+ -+ xvld U0, S3, 0x00 -+ xvld U1, S3, 0x20 -+ xvld U2, S3, 0x40 -+ xvld U3, S3, 0x60 -+ xvld U4, S4, 0x00 -+ xvld U5, S4, 0x20 -+ xvld U6, S4, 0x40 -+ xvld U7, S4, 0x60 -+ -+ xvst U0, P1, 0x100 -+ xvst U1, P1, 0x120 -+ xvst U2, P1, 0x140 -+ xvst U3, P1, 0x160 -+ xvst U4, P1, 0x180 -+ xvst U5, P1, 0x1A0 -+ xvst U6, P1, 0x1C0 -+ xvst U7, P1, 0x1E0 -+ -+ xvld U0, S5, 0x00 -+ xvld U1, S5, 0x20 -+ xvld U2, S5, 0x40 -+ xvld U3, S5, 0x60 -+ xvld U4, S6, 0x00 -+ xvld U5, S6, 0x20 -+ xvld U6, S6, 0x40 -+ xvld U7, S6, 0x60 -+ -+ xvst U0, P1, 0x200 -+ xvst U1, P1, 0x220 -+ xvst U2, P1, 0x240 -+ xvst U3, P1, 0x260 -+ xvst U4, P1, 0x280 -+ xvst U5, P1, 0x2A0 -+ xvst U6, P1, 0x2C0 -+ xvst U7, P1, 0x2E0 -+ -+ xvld U0, S7, 0x00 -+ xvld U1, S7, 0x20 -+ xvld U2, S7, 0x40 -+ xvld U3, S7, 0x60 -+ xvld U4, S8, 0x00 -+ xvld U5, S8, 0x20 -+ xvld U6, S8, 0x40 -+ xvld U7, S8, 0x60 -+ -+ xvst U0, P1, 0x300 -+ xvst U1, P1, 0x320 -+ xvst U2, P1, 0x340 -+ xvst U3, P1, 0x360 -+ xvst U4, P1, 0x380 -+ xvst U5, P1, 0x3A0 -+ xvst 
U6, P1, 0x3C0 -+ xvst U7, P1, 0x3E0 -+ -+ addi.d S1, S1, 0x80 -+ addi.d S2, S2, 0x80 -+ addi.d S3, S3, 0x80 -+ addi.d S4, S4, 0x80 -+ addi.d S5, S5, 0x80 -+ addi.d S6, S6, 0x80 -+ addi.d S7, S7, 0x80 -+ addi.d S8, S8, 0x80 -+ addi.d I, I, -1 -+ add.d P1, P1, T1 -+ blt ZERO, I, .L_I1 -+ -+.L_N15: -+ andi I, N, 0x08 -+ beq ZERO, I, .L_N7 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S2, 0x00 -+ xvld U3, S2, 0x20 -+ xvld U4, S3, 0x00 -+ xvld U5, S3, 0x20 -+ xvld U6, S4, 0x00 -+ xvld U7, S4, 0x20 -+ -+ xvst U0, P2, 0x00 -+ xvst U1, P2, 0x20 -+ xvst U2, P2, 0x40 -+ xvst U3, P2, 0x60 -+ xvst U4, P2, 0x80 -+ xvst U5, P2, 0xA0 -+ xvst U6, P2, 0xC0 -+ xvst U7, P2, 0xE0 -+ -+ xvld U0, S5, 0x00 -+ xvld U1, S5, 0x20 -+ xvld U2, S6, 0x00 -+ xvld U3, S6, 0x20 -+ xvld U4, S7, 0x00 -+ xvld U5, S7, 0x20 -+ xvld U6, S8, 0x00 -+ xvld U7, S8, 0x20 -+ -+ xvst U0, P2, 0x100 -+ xvst U1, P2, 0x120 -+ xvst U2, P2, 0x140 -+ xvst U3, P2, 0x160 -+ xvst U4, P2, 0x180 -+ xvst U5, P2, 0x1A0 -+ xvst U6, P2, 0x1C0 -+ xvst U7, P2, 0x1E0 -+ -+ addi.d S1, S1, 0x40 -+ addi.d S2, S2, 0x40 -+ addi.d S3, S3, 0x40 -+ addi.d S4, S4, 0x40 -+ addi.d S5, S5, 0x40 -+ addi.d S6, S6, 0x40 -+ addi.d S7, S7, 0x40 -+ addi.d S8, S8, 0x40 -+ addi.d P2, P2, 0x200 -+ -+.L_N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_N3 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ xvld U4, S5, 0x00 -+ xvld U5, S6, 0x00 -+ xvld U6, S7, 0x00 -+ xvld U7, S8, 0x00 -+ -+ xvst U0, P3, 0x00 -+ xvst U1, P3, 0x20 -+ xvst U2, P3, 0x40 -+ xvst U3, P3, 0x60 -+ xvst U4, P3, 0x80 -+ xvst U5, P3, 0xA0 -+ xvst U6, P3, 0xC0 -+ xvst U7, P3, 0xE0 -+ -+ addi.d S1, S1, 0x20 -+ addi.d S2, S2, 0x20 -+ addi.d S3, S3, 0x20 -+ addi.d S4, S4, 0x20 -+ addi.d S5, S5, 0x20 -+ addi.d S6, S6, 0x20 -+ addi.d S7, S7, 0x20 -+ addi.d S8, S8, 0x20 -+ addi.d P3, P3, 0x100 -+ -+.L_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_N1 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ xvld U4, 
S5, 0x00 -+ xvld U5, S6, 0x00 -+ xvld U6, S7, 0x00 -+ xvld U7, S8, 0x00 -+ -+ xvpermi.q U0, U1, 0x02 -+ xvpermi.q U2, U3, 0x02 -+ xvpermi.q U4, U5, 0x02 -+ xvpermi.q U6, U7, 0x02 -+ -+ xvst U0, P4, 0x00 -+ xvst U2, P4, 0x20 -+ xvst U4, P4, 0x40 -+ xvst U6, P4, 0x60 -+ -+ addi.d S1, S1, 0x10 -+ addi.d S2, S2, 0x10 -+ addi.d S3, S3, 0x10 -+ addi.d S4, S4, 0x10 -+ addi.d S5, S5, 0x10 -+ addi.d S6, S6, 0x10 -+ addi.d S7, S7, 0x10 -+ addi.d S8, S8, 0x10 -+ addi.d P4, P4, 0x80 -+ -+.L_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_N0 -+ -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ fld.d F2, S3, 0x00 -+ fld.d F3, S4, 0x00 -+ fld.d F4, S5, 0x00 -+ fld.d F5, S6, 0x00 -+ fld.d F6, S7, 0x00 -+ fld.d F7, S8, 0x00 -+ -+ fst.d F0, P5, 0x00 -+ fst.d F1, P5, 0x08 -+ fst.d F2, P5, 0x10 -+ fst.d F3, P5, 0x18 -+ fst.d F4, P5, 0x20 -+ fst.d F5, P5, 0x28 -+ fst.d F6, P5, 0x30 -+ fst.d F7, P5, 0x38 -+ -+ addi.d S1, S1, 0x08 -+ addi.d S2, S2, 0x08 -+ addi.d S3, S3, 0x08 -+ addi.d S4, S4, 0x08 -+ addi.d S5, S5, 0x08 -+ addi.d S6, S6, 0x08 -+ addi.d S7, S7, 0x08 -+ addi.d S8, S8, 0x08 -+ addi.d P5, P5, 0x40 -+ -+.L_N0: -+ blt ZERO, J, .L_J1 -+ -+.L_M7: -+ andi J, M, 0x04 -+ beq ZERO, J, .L_M3 -+ -+ move S1, S0 -+ add.d S2, S0, TL -+ add.d S3, S1, T0 -+ add.d S4, S2, T0 -+ add.d S0, S3, T0 -+ -+ move P1, P0 -+ addi.d P0, P0, 0x200 -+ -+ srai.d I, N, 0x04 -+ beq ZERO, I, .L_4N15 -+ -+.L_4I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S1, 0x40 -+ xvld U3, S1, 0x60 -+ xvld U4, S2, 0x00 -+ xvld U5, S2, 0x20 -+ xvld U6, S2, 0x40 -+ xvld U7, S2, 0x60 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ xvst U2, P1, 0x40 -+ xvst U3, P1, 0x60 -+ xvst U4, P1, 0x80 -+ xvst U5, P1, 0xA0 -+ xvst U6, P1, 0xC0 -+ xvst U7, P1, 0xE0 -+ -+ xvld U0, S3, 0x00 -+ xvld U1, S3, 0x20 -+ xvld U2, S3, 0x40 -+ xvld U3, S3, 0x60 -+ xvld U4, S4, 0x00 -+ xvld U5, S4, 0x20 -+ xvld U6, S4, 0x40 -+ xvld U7, S4, 0x60 -+ -+ xvst U0, P1, 0x100 -+ xvst U1, P1, 0x120 -+ xvst U2, P1, 0x140 -+ xvst U3, P1, 0x160 -+ 
xvst U4, P1, 0x180 -+ xvst U5, P1, 0x1A0 -+ xvst U6, P1, 0x1C0 -+ xvst U7, P1, 0x1E0 -+ -+ addi.d S1, S1, 0x80 -+ addi.d S2, S2, 0x80 -+ addi.d S3, S3, 0x80 -+ addi.d S4, S4, 0x80 -+ addi.d I, I, -1 -+ add.d P1, P1, T1 -+ blt ZERO, I, .L_4I1 -+ -+.L_4N15: -+ andi I, N, 0x08 -+ beq ZERO, I, .L_4N7 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S2, 0x00 -+ xvld U3, S2, 0x20 -+ xvld U4, S3, 0x00 -+ xvld U5, S3, 0x20 -+ xvld U6, S4, 0x00 -+ xvld U7, S4, 0x20 -+ -+ xvst U0, P2, 0x00 -+ xvst U1, P2, 0x20 -+ xvst U2, P2, 0x40 -+ xvst U3, P2, 0x60 -+ xvst U4, P2, 0x80 -+ xvst U5, P2, 0xA0 -+ xvst U6, P2, 0xC0 -+ xvst U7, P2, 0xE0 -+ -+ addi.d S1, S1, 0x40 -+ addi.d S2, S2, 0x40 -+ addi.d S3, S3, 0x40 -+ addi.d S4, S4, 0x40 -+ addi.d P2, P2, 0x100 -+ -+.L_4N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_4N3 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ -+ xvst U0, P3, 0x00 -+ xvst U1, P3, 0x20 -+ xvst U2, P3, 0x40 -+ xvst U3, P3, 0x60 -+ -+ addi.d S1, S1, 0x20 -+ addi.d S2, S2, 0x20 -+ addi.d S3, S3, 0x20 -+ addi.d S4, S4, 0x20 -+ addi.d P3, P3, 0x80 -+ -+.L_4N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_4N1 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ -+ xvpermi.q U0, U1, 0x02 -+ xvpermi.q U2, U3, 0x02 -+ -+ xvst U0, P4, 0x00 -+ xvst U2, P4, 0x20 -+ -+ addi.d S1, S1, 0x10 -+ addi.d S2, S2, 0x10 -+ addi.d S3, S3, 0x10 -+ addi.d S4, S4, 0x10 -+ addi.d P4, P4, 0x40 -+ -+.L_4N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_M3 -+ -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ fld.d F2, S3, 0x00 -+ fld.d F3, S4, 0x00 -+ -+ fst.d F0, P5, 0x00 -+ fst.d F1, P5, 0x08 -+ fst.d F2, P5, 0x10 -+ fst.d F3, P5, 0x18 -+ -+ addi.d S1, S1, 0x08 -+ addi.d S2, S2, 0x08 -+ addi.d S3, S3, 0x08 -+ addi.d S4, S4, 0x08 -+ addi.d P5, P5, 0x20 -+ -+.L_M3: -+ andi J, M, 0x02 -+ beq ZERO, J, .L_M1 -+ -+ move S1, S0 -+ add.d S2, S0, TL -+ add.d S0, S0, T0 -+ -+ move P1, P0 -+ addi.d P0, P0, 0x100 -+ -+ srai.d I, N, 0x04 -+ 
beq ZERO, I, .L_2N15 -+ -+.L_2I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S1, 0x40 -+ xvld U3, S1, 0x60 -+ xvld U4, S2, 0x00 -+ xvld U5, S2, 0x20 -+ xvld U6, S2, 0x40 -+ xvld U7, S2, 0x60 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ xvst U2, P1, 0x40 -+ xvst U3, P1, 0x60 -+ xvst U4, P1, 0x80 -+ xvst U5, P1, 0xA0 -+ xvst U6, P1, 0xC0 -+ xvst U7, P1, 0xE0 -+ -+ addi.d S1, S1, 0x80 -+ addi.d S2, S2, 0x80 -+ addi.d I, I, -1 -+ add.d P1, P1, T1 -+ blt ZERO, I, .L_2I1 -+ -+.L_2N15: -+ andi I, N, 0x08 -+ beq ZERO, I, .L_2N7 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S2, 0x00 -+ xvld U3, S2, 0x20 -+ -+ xvst U0, P2, 0x00 -+ xvst U1, P2, 0x20 -+ xvst U2, P2, 0x40 -+ xvst U3, P2, 0x60 -+ -+ addi.d S1, S1, 0x40 -+ addi.d S2, S2, 0x40 -+ addi.d P2, P2, 0x80 -+ -+.L_2N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_2N3 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ -+ xvst U0, P3, 0x00 -+ xvst U1, P3, 0x20 -+ -+ addi.d S1, S1, 0x20 -+ addi.d S2, S2, 0x20 -+ addi.d P3, P3, 0x40 -+ -+.L_2N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_2N1 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ -+ xvpermi.q U0, U1, 0x02 -+ -+ xvst U0, P4, 0x00 -+ -+ addi.d S1, S1, 0x10 -+ addi.d S2, S2, 0x10 -+ addi.d P4, P4, 0x20 -+ -+.L_2N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_M1 -+ -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ -+ fst.d F0, P5, 0x00 -+ fst.d F1, P5, 0x08 -+ -+ addi.d S1, S1, 0x08 -+ addi.d S2, S2, 0x08 -+ addi.d P5, P5, 0x10 -+ -+.L_M1: -+ andi J, M, 0x01 -+ beq ZERO, J, .L_M0 -+ -+ move S1, S0 -+ add.d S2, S0, TL -+ -+ move P1, P0 -+ addi.d P0, P0, 0x80 -+ -+ srai.d I, N, 0x04 -+ beq ZERO, I, .L_1N15 -+ -+.L_1I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S1, 0x40 -+ xvld U3, S1, 0x60 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ xvst U2, P1, 0x40 -+ xvst U3, P1, 0x60 -+ -+ addi.d S1, S1, 0x80 -+ addi.d I, I, -1 -+ add.d P1, P1, T1 -+ blt ZERO, I, .L_1I1 -+ -+.L_1N15: -+ andi I, N, 0x08 -+ beq ZERO, I, .L_1N7 -+ -+ xvld U0, S1, 0x00 
-+ xvld U1, S1, 0x20 -+ -+ xvst U0, P2, 0x00 -+ xvst U1, P2, 0x20 -+ -+ addi.d S1, S1, 0x40 -+ addi.d P2, P2, 0x40 -+ -+.L_1N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_1N3 -+ -+ xvld U0, S1, 0x00 -+ -+ xvst U0, P3, 0x00 -+ -+ addi.d S1, S1, 0x20 -+ addi.d P3, P3, 0x20 -+ -+.L_1N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_1N1 -+ -+ fld.d F0, S1, 0x00 -+ fld.d F1, S1, 0x08 -+ -+ fst.d F0, P4, 0x00 -+ fst.d F1, P4, 0x08 -+ -+ addi.d S1, S1, 0x10 -+ addi.d P4, P4, 0x10 -+ -+.L_1N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_M0 -+ -+ fld.d F0, S1, 0x00 -+ -+ fst.d F0, P5, 0x00 -+ -+ addi.d S1, S1, 0x08 -+ addi.d P5, P5, 0x08 -+ -+.L_M0: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 16 -+ LDARG $r26, $sp, 24 -+ LDARG $r27, $sp, 32 -+ LDARG $r28, $sp, 40 -+ LDARG $r29, $sp, 48 -+ addi.d $sp, $sp, 56 -+ jirl $r0, $r1, 0x00 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/dgemm_tcopy_4.S b/kernel/loongarch64/dgemm_tcopy_4.S -new file mode 100644 -index 0000000..700989c ---- /dev/null -+++ b/kernel/loongarch64/dgemm_tcopy_4.S -@@ -0,0 +1,270 @@ -+/******************************************************************************* -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+/* Function parameters */ -+#define M $r4 // param 1: m -+#define N $r5 // param 2: n -+#define SRC $r6 // param 3: src -+#define LDA $r7 // param 4: lda -+#define DST $r8 // param 5: dst -+ -+#define I $r9 -+#define J $r10 -+#define S0 $r11 -+#define S1 $r12 -+#define S2 $r13 -+#define S3 $r14 -+#define S4 $r15 -+#define P0 $r16 -+#define P1 $r17 -+#define P2 $r18 -+#define P3 $r19 -+#define T0 $r20 -+#define T1 $r23 -+#define TL $r7 -+#define ZERO $r0 -+ -+#define F0 $f0 -+#define F1 $f1 -+#define F2 $f2 -+#define F3 $f3 -+/* LASX vectors */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -8 -+ SDARG $r23, $sp, 0 -+ -+ move S0, SRC -+ move P0, DST -+ -+ srai.d T0, N, 0x02 -+ slli.d T0, T0, 0x02 -+ srai.d T1, N, 0x01 -+ slli.d T1, T1, 0x01 -+ mul.d T0, M, T0 -+ mul.d T1, M, T1 -+ slli.d T0, T0, 0x03 -+ slli.d T1, T1, 0x03 -+ add.d P2, DST, T0 -+ add.d P3, DST, T1 -+ -+ slli.d TL, LDA, 0x03 -+ srai.d J, M, 0x02 -+ slli.d T0, TL, 0x01 -+ slli.d T1, M, 0x05 -+ beq ZERO, J, .L_M3 -+ -+.L_J1: /* J-- */ -+ move S1, S0 -+ 
add.d S2, S0, TL -+ add.d S3, S1, T0 -+ add.d S4, S2, T0 -+ add.d S0, S3, T0 -+ -+ move P1, P0 -+ addi.d P0, P0, 0x80 -+ -+ srai.d I, N, 0x02 -+ addi.d J, J, -1 -+ beq ZERO, I, .L_N3 -+ -+.L_I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ xvst U2, P1, 0x40 -+ xvst U3, P1, 0x60 -+ -+ addi.d S1, S1, 0x20 -+ addi.d S2, S2, 0x20 -+ addi.d S3, S3, 0x20 -+ addi.d S4, S4, 0x20 -+ add.d P1, P1, T1 -+ -+ addi.d I, I, -1 -+ blt ZERO, I, .L_I1 -+ -+.L_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_N1 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ -+ xvpermi.q U0, U1, 0x02 -+ xvpermi.q U2, U3, 0x02 -+ -+ xvst U0, P2, 0x00 -+ xvst U2, P2, 0x20 -+ -+ addi.d S1, S1, 0x10 -+ addi.d S2, S2, 0x10 -+ addi.d S3, S3, 0x10 -+ addi.d S4, S4, 0x10 -+ addi.d P2, P2, 0x40 -+ -+.L_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_N0 -+ -+ fld.d F0, S1, 0x00 -+ fld.d F1, S2, 0x00 -+ fld.d F2, S3, 0x00 -+ fld.d F3, S4, 0x00 -+ -+ fst.d F0, P3, 0x00 -+ fst.d F1, P3, 0x08 -+ fst.d F2, P3, 0x10 -+ fst.d F3, P3, 0x18 -+ -+ addi.d S1, S1, 0x08 -+ addi.d S2, S2, 0x08 -+ addi.d S3, S3, 0x08 -+ addi.d S4, S4, 0x08 -+ addi.d P3, P3, 0x20 -+ -+.L_N0: -+ blt ZERO, J, .L_J1 -+ -+.L_M3: -+ andi J, M, 0x02 -+ beq ZERO, J, .L_M1 -+ -+ move S1, S0 -+ add.d S2, S0, TL -+ add.d S0, S0, T0 -+ -+ move P1, P0 -+ addi.d P0, P0, 0x40 -+ -+ srai.d I, N, 0x02 -+ beq ZERO, I, .L_2N3 -+ -+.L_2I1: /* I-- */ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ -+ addi.d S1, S1, 0x20 -+ addi.d S2, S2, 0x20 -+ addi.d I, I, -1 -+ add.d P1, P1, T1 -+ -+ blt ZERO, I, .L_2I1 -+ -+.L_2N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_2N1 -+ -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ -+ xvpermi.q U0, U1, 0x02 -+ -+ xvst U0, P2, 0x00 -+ -+ addi.d S1, S1, 0x10 -+ addi.d S2, S2, 0x10 -+ addi.d P2, P2, 0x20 -+ -+.L_2N1: -+ addi.d I, N, 0x01 -+ beq ZERO, I, .L_M1 -+ -+ fld.d F0, 
S1, 0x00 -+ fld.d F1, S2, 0x00 -+ -+ fst.d F0, P3, 0x00 -+ fst.d F1, P3, 0x08 -+ -+ addi.d S1, S1, 0x08 -+ addi.d S2, S2, 0x08 -+ addi.d P3, P3, 0x10 -+ -+.L_M1: -+ andi J, M, 0x01 -+ beq ZERO, J, .L_M0 -+ -+ move S1, S0 -+ move P1, P0 -+ -+ srai.d I, N, 0x02 -+ beq ZERO, I, .L_1N3 -+ -+.L_1I1: -+ xvld U0, S1, 0x00 -+ -+ xvst U0, P1, 0x00 -+ -+ addi.d S1, S1, 0x20 -+ addi.d I, I, -1 -+ add.d P1, P1, T1 -+ -+ blt ZERO, I, .L_1I1 -+ -+.L_1N3: -+ andi I, N, 0x02 -+ beq I, ZERO, .L_1N1 -+ -+ fld.d F0, S1, 0x00 -+ fld.d F1, S1, 0x08 -+ -+ fst.d F0, P2, 0x00 -+ fst.d F1, P2, 0x08 -+ -+ addi.d S1, S1, 0x10 -+ addi.d P2, P2, 0x10 -+ -+.L_1N1: -+ andi I, N, 0x01 -+ beq I, ZERO, .L_M0 -+ -+ fld.d F0, S1, 0x00 -+ -+ fst.d F0, P3, 0x00 -+ -+.L_M0: -+ LDARG $r23, $sp, 0 -+ addi.d $sp, $sp, 8 -+ jirl $r0, $r1, 0x00 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S -new file mode 100644 -index 0000000..ff937ae ---- /dev/null -+++ b/kernel/loongarch64/dnrm2.S -@@ -0,0 +1,324 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define XX $r7 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+#define ALPHA $f4 -+#define max $f5 -+#define INF $f6 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ // Init INF -+ addi.d TEMP, $r0, 0x7FF -+ slli.d TEMP, TEMP, 52 -+ MTC INF, TEMP -+ -+ MTC s1, $r0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, INCX, .L999 -+ move XX, X -+ NOP -+ LD a1, X, 0 * SIZE -+ addi.d N, N, -1 -+ add.d X, X, INCX -+ FABS s1, a1 -+ FABS s2, a1 -+ bge $r0, N, .L999 -+ FABS s3, a1 -+ srai.d I, N, 3 -+ FABS s4, a1 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD 
a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a8, X, 0 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ FABS t1, a1 -+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ add.d X, X, INCX -+ FABS t3, a3 -+ LD a2, X, 0 * SIZE -+ FABS t4, a4 -+ add.d X, X, INCX -+ CMPLT $fcc0, s1, t1 -+ LD a3, X, 0 * SIZE -+ CMPLT $fcc1, s2, t2 -+ add.d X, X, INCX -+ CMPLT $fcc2, s3, t3 -+ LD a4, X, 0 * SIZE -+ CMPLT $fcc3, s4, t4 -+ add.d X, X, INCX -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ FABS t1, a5 -+ LD a5, X, 0 * SIZE -+ FABS t2, a6 -+ add.d X, X, INCX -+ FABS t3, a7 -+ LD a6, X, 0 * SIZE -+ FABS t4, a8 -+ add.d X, X, INCX -+ CMPLT $fcc0, s1, t1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, s2, t2 -+ add.d X, X, INCX -+ CMPLT $fcc2, s3, t3 -+ LD a8, X, 0 * SIZE -+ CMPLT $fcc3, s4, t4 -+ add.d X, X, INCX -+ CMOVT s1, s1, t1, $fcc0 -+ addi.d I, I, -1 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t2 -+ CMPLT $fcc2, s3, t3 -+ CMPLT $fcc3, s4, t4 -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ FABS t1, a5 -+ FABS t2, a6 -+ FABS t3, a7 -+ FABS t4, a8 -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t2 -+ CMPLT $fcc2, s3, t3 -+ CMPLT $fcc3, s4, t4 -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L100 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ FABS t1, a1 -+ CMPLT $fcc0, s1, t1 -+ CMOVT s1, s1, t1, $fcc0 -+ add.d X, X, INCX -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L100: -+ CMPLT $fcc0, s1, s2 -+ CMPLT $fcc1, s3, s4 -+ CMOVT s1, s1, s2, $fcc0 -+ CMOVT 
s3, s3, s4, $fcc1 -+ CMPLT $fcc0, s1, s3 -+ CMOVT s1, s1, s3, $fcc0 -+ addi.d N, N, 1 -+ lu12i.w TEMP, 0x3f800 -+ movgr2fr.d a1, $r0 -+ movgr2fr.w ALPHA, TEMP -+ CMPEQ $fcc0, s1, a1 -+ fcvt.d.s ALPHA, ALPHA -+ bcnez $fcc0, .L999 -+ -+ fdiv.d ALPHA, ALPHA, s1 -+ CMPEQ $fcc0, INF, ALPHA -+ bcnez $fcc0, .L999 -+ -+ MOV max, s1 -+ MOV s1, a1 -+ MOV s2, a1 -+ MOV s3, a1 -+ MOV s4, a1 -+ srai.d I, N, 3 -+ bge $r0, I, .L105 -+ LD a1, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD a2, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD a3, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD a4, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD a5, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD a6, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD a7, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD a8, XX, 0 * SIZE -+ addi.d I, I, -1 -+ add.d XX, XX, INCX -+ bge $r0, I, .L104 -+ .align 3 -+ -+.L103: -+ MUL t1, ALPHA, a1 -+ LD a1, XX, 0 * SIZE -+ MUL t2, ALPHA, a2 -+ add.d XX, XX, INCX -+ MUL t3, ALPHA, a3 -+ LD a2, XX, 0 * SIZE -+ MUL t4, ALPHA, a4 -+ add.d XX, XX, INCX -+ MADD s1, t1, t1, s1 -+ LD a3, XX, 0 * SIZE -+ MADD s2, t2, t2, s2 -+ add.d XX, XX, INCX -+ MADD s3, t3, t3, s3 -+ LD a4, XX, 0 * SIZE -+ MADD s4, t4, t4, s4 -+ add.d XX, XX, INCX -+ MUL t1, ALPHA, a5 -+ LD a5, XX, 0 * SIZE -+ MUL t2, ALPHA, a6 -+ add.d XX, XX, INCX -+ MUL t3, ALPHA, a7 -+ LD a6, XX, 0 * SIZE -+ MUL t4, ALPHA, a8 -+ add.d XX, XX, INCX -+ MADD s1, t1, t1, s1 -+ LD a7, XX, 0 * SIZE -+ MADD s2, t2, t2, s2 -+ add.d XX, XX, INCX -+ MADD s3, t3, t3, s3 -+ LD a8, XX, 0 * SIZE -+ MADD s4, t4, t4, s4 -+ addi.d I, I, -1 -+ add.d XX, XX, INCX -+ blt $r0, I, .L103 -+ .align 3 -+ -+.L104: -+ MUL t1, ALPHA, a1 -+ MUL t2, ALPHA, a2 -+ MUL t3, ALPHA, a3 -+ MUL t4, ALPHA, a4 -+ MADD s1, t1, t1, s1 -+ MADD s2, t2, t2, s2 -+ MADD s3, t3, t3, s3 -+ MADD s4, t4, t4, s4 -+ MUL t1, ALPHA, a5 -+ MUL t2, ALPHA, a6 -+ MUL t3, ALPHA, a7 -+ MUL t4, ALPHA, a8 -+ MADD s1, t1, t1, s1 -+ MADD s2, t2, t2, s2 -+ MADD s3, t3, t3, s3 -+ MADD s4, t4, t4, s4 -+ .align 3 -+ -+.L105: -+ 
andi I, N, 7 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L106: -+ LD a1, XX, 0 * SIZE -+ addi.d I, I, -1 -+ MUL t1, ALPHA, a1 -+ add.d XX, XX, INCX -+ MADD s1, t1, t1, s1 -+ blt $r0, I, .L106 -+ .align 3 -+ -+.L998: -+ ADD s1, s1, s2 -+ ADD s3, s3, s4 -+ ADD s1, s1, s3 -+ fsqrt.d s1, s1 -+ move $r4, $r17 -+ MUL $f0, max, s1 -+ jirl $r0, $r1, 0x0 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S -new file mode 100644 -index 0000000..1e4c81a ---- /dev/null -+++ b/kernel/loongarch64/dot.S -@@ -0,0 +1,391 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define Y $r7 -+#define INCY $r8 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f23 -+#define a2 $f9 -+#define a3 $f10 -+#define a4 $f11 -+#define b1 $f12 -+#define b2 $f13 -+#define b3 $f14 -+#define b4 $f15 -+#define s1 $f22 -+#define s2 $f8 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+ LDINT INCY, 0(INCY) -+#endif -+ -+ MTC s1, $r0 -+ MTC s2, $r0 -+ slli.d INCX, INCX, BASE_SHIFT -+ li.d TEMP, SIZE -+ slli.d INCY, INCY, BASE_SHIFT -+ bge $r0, N, .L999 -+ srai.d I, N, 3 -+ bne INCX, TEMP, .L20 -+ bne INCY, TEMP, .L20 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ LD b1, Y, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ LD b2, Y, 1 * SIZE -+ LD a3, X, 2 * SIZE -+ LD b3, Y, 2 * SIZE -+ LD a4, X, 3 * SIZE -+ addi.d I, I, -1 -+ LD b4, Y, 3 * SIZE -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ LD a1, X, 4 * SIZE -+ LD b1, Y, 4 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a2, a2 -+ fcvt.d.s b2, b2 -+ fmadd.d s2, b2, a2, s2 -+#else -+ MADD s2, b2, a2, s2 -+#endif -+ LD a2, X, 5 * SIZE -+ LD b2, Y, 5 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a3, a3 -+ fcvt.d.s b3, b3 -+ fmadd.d s1, b3, a3, s1 -+#else -+ MADD s1, b3, a3, s1 -+#endif -+ LD a3, X, 6 * SIZE 
-+ LD b3, Y, 6 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a4, a4 -+ fcvt.d.s b4, b4 -+ fmadd.d s2, b4, a4, s2 -+#else -+ MADD s2, b4, a4, s2 -+#endif -+ LD a4, X, 7 * SIZE -+ LD b4, Y, 7 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ LD a1, X, 8 * SIZE -+ LD b1, Y, 8 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a2, a2 -+ fcvt.d.s b2, b2 -+ fmadd.d s2, b2, a2, s2 -+#else -+ MADD s2, b2, a2, s2 -+#endif -+ LD a2, X, 9 * SIZE -+ LD b2, Y, 9 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a3, a3 -+ fcvt.d.s b3, b3 -+ fmadd.d s1, b3, a3, s1 -+#else -+ MADD s1, b3, a3, s1 -+#endif -+ LD a3, X, 10 * SIZE -+ LD b3, Y, 10 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a4, a4 -+ fcvt.d.s b4, b4 -+ fmadd.d s2, b4, a4, s2 -+#else -+ MADD s2, b4, a4, s2 -+#endif -+ LD a4, X, 11 * SIZE -+ LD b4, Y, 11 * SIZE -+ addi.d I, I, -1 -+ addi.d X, X, 8 * SIZE -+addi.d Y, Y, 8 * SIZE -+ blt $r0, I, .L12 -+ .align 3 -+.L13: -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ LD a1, X, 4 * SIZE -+ LD b1, Y, 4 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a2, a2 -+ fcvt.d.s b2, b2 -+ fmadd.d s2, b2, a2, s2 -+#else -+ MADD s2, b2, a2, s2 -+#endif -+ LD a2, X, 5 * SIZE -+ LD b2, Y, 5 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a3, a3 -+ fcvt.d.s b3, b3 -+ fmadd.d s1, b3, a3, s1 -+#else -+ MADD s1, b3, a3, s1 -+#endif -+ LD a3, X, 6 * SIZE -+ LD b3, Y, 6 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a4, a4 -+ fcvt.d.s b4, b4 -+ fmadd.d s2, b4, a4, s2 -+#else -+ MADD s2, b4, a4, s2 -+#endif -+ LD a4, X, 7 * SIZE -+ LD b4, Y, 7 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ addi.d X, X, 8 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a2, a2 -+ fcvt.d.s b2, b2 -+ fmadd.d s2, b2, a2, s2 -+#else -+ MADD s2, b2, a2, s2 -+#endif -+ addi.d Y, Y, 8 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a3, a3 -+ fcvt.d.s b3, b3 -+ fmadd.d s1, b3, a3, s1 -+#else -+ MADD s1, b3, a3, s1 
-+#endif -+#ifdef DSDOT -+ fcvt.d.s a4, a4 -+ fcvt.d.s b4, b4 -+ fmadd.d s2, b4, a4, s2 -+#else -+ MADD s2, b4, a4, s2 -+#endif -+ .align 3 -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+.L16: -+ LD a1, X, 0 * SIZE -+ LD b1, Y, 0 * SIZE -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ addi.d I, I, -1 -+ addi.d X, X, SIZE -+ addi.d Y, Y, SIZE -+ blt $r0, I, .L16 -+ b .L999 -+ .align 3 -+ -+.L20: -+#ifdef F_INTERFACE -+ bgez INCX, .L21 -+ addi.d TEMP, N, -1 -+ mult TEMP, INCX -+ mflo TEMP -+ dsub X, X, TEMP -+ .align 3 -+ -+.L21: -+ bgez INCY, .L22 -+ addi.d TEMP, N, -1 -+ mult TEMP, INCY -+ mflo TEMP -+ dsub Y, Y, TEMP -+ .align 3 -+ -+.L22: -+#endif -+ bge $r0, I, .L25 -+ .align 3 -+ -+.L23: -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s2, b1, a1, s2 -+#else -+ MADD s2, b1, a1, s2 -+#endif -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s2, b1, a1, s2 -+#else -+ MADD s2, b1, a1, s2 -+#endif -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s2, b1, a1, s2 -+#else -+ MADD s2, b1, a1, s2 
-+#endif -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ addi.d I, I, -1 -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s2, b1, a1, s2 -+#else -+ MADD s2, b1, a1, s2 -+#endif -+ blt $r0, I, .L23 -+ .align 3 -+ -+.L25: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+ -+.L26: -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ addi.d I, I, -1 -+#ifdef DSDOT -+ fcvt.d.s a1, a1 -+ fcvt.d.s b1, b1 -+ fmadd.d s1, b1, a1, s1 -+#else -+ MADD s1, b1, a1, s1 -+#endif -+ blt $r0, I, .L26 -+ .align 3 -+ -+.L999: -+#ifdef DSDOT -+ fadd.d $f0, s1, s2 -+#else -+ ADD $f0, s1, s2 -+#endif -+ move $r4, $r17 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/gemm_kernel.S b/kernel/loongarch64/gemm_kernel.S -new file mode 100644 -index 0000000..8926bf1 ---- /dev/null -+++ b/kernel/loongarch64/gemm_kernel.S -@@ -0,0 +1,1859 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define K $r6 -+#define A $r7 -+#define B $r8 -+#define C $r9 -+#define LDC $r10 -+#define AO $r12 -+#define BO $r13 -+#define I $r17 -+#define J $r18 -+#define L $r30 -+#define PREFETCHSIZE (4 * 10) -+#define CO1 $r14 -+#define CO2 $r15 -+#define CO3 $r23 -+#define CO4 $r24 -+#define CO5 $r25 -+#define CO6 $r26 -+#define CO7 $r27 -+#define CO8 $r28 -+#define BB $r29 -+ -+#if defined(TRMMKERNEL) -+#define OFFSET $r11 -+#define KK $r20 -+#define TEMP $r16 -+#endif -+ -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f27 -+#define a4 $f28 -+#define b1 $f23 -+#define b2 $f9 -+#define b3 $f10 -+#define b4 $f11 -+#define b5 $f12 -+#define b6 $f13 -+#define b7 $f14 -+#define b8 $f15 -+#define a5 b8 -+#define c11 $f16 -+#define c12 $f17 -+#define c21 $f3 -+#define c22 $f1 -+#define c31 $f2 -+#define c32 $f4 -+#define c41 $f5 -+#define c42 $f6 -+#define c51 $f7 -+#define c52 $f18 -+#define c61 $f19 -+#define c62 $f20 -+#define c71 $f21 -+#define c72 $f24 -+#define c81 $f25 -+#define c82 $f26 -+#define ALPHA $f0 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, 
-160 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 -+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ SDARG $r28, $sp, 40 -+ SDARG $r29, $sp, 48 -+ SDARG $r30, $sp, 96 -+ fst.d $f24, $sp, 56 -+ fst.d $f25, $sp, 64 -+ fst.d $f26, $sp, 72 -+ fst.d $f27, $sp, 80 -+ fst.d $f28, $sp, 88 -+#if defined(TRMMKERNEL) -+ SDARG $r20, $sp, 104 -+ SDARG $r16, $sp, 112 -+#endif -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 120 -+ fst.d $f19, $sp, 128 -+ fst.d $f20, $sp, 136 -+ fst.d $f21, $sp, 144 -+#endif -+ slli.d LDC, LDC, BASE_SHIFT -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ sub.d KK, $r0, OFFSET -+#endif -+ srai.d J, N, 3 -+nop -+ bge $r0, J, .L30 -+.L10: -+ move CO1, C -+ MTC c11, $r0 -+ add.d CO2, C, LDC -+ move AO, A -+ add.d CO3, CO2, LDC -+ addi.d J, J, -1 -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ add.d CO5, CO4, LDC -+ MOV c31, c11 -+ add.d CO6, CO5, LDC -+ MOV c41, c11 -+ add.d CO7, CO6, LDC -+ MOV c51, c11 -+ add.d CO8, CO7, LDC -+ srai.d I, M, 1 -+ add.d C, CO8, LDC -+ slli.d BB, K, 2 + BASE_SHIFT -+ add.d BB, B, BB -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move KK, OFFSET -+#endif -+MOV c61, c11 -+ bge $r0, I, .L20 -+.L11: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ MOV c32, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c42, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c52, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c62, c11 -+ LD b6, BO, 8 * SIZE -+ MOV c72, c11 -+ LD b7, BO, 12 * SIZE -+ MOV c82, c11 -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 2 -+#else -+ addi.d TEMP, KK, 8 -+#endif -+ srai.d L, TEMP, 2 -+ bge $r0, L, 
.L15 -+#else -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, B, 0 * SIZE -+ MOV c81, c11 -+ preld 1, CO1, 3 * SIZE -+ preld 1, CO2, 3 * SIZE -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, K, 2 -+ MOV c32, c11 -+ LD b3, B, 2 * SIZE -+ MOV c42, c11 -+ LD b4, B, 3 * SIZE -+ MOV c52, c11 -+ LD b5, B, 4 * SIZE -+ MOV c62, c11 -+ LD b6, B, 8 * SIZE -+ MOV c72, c11 -+ LD b7, B, 12 * SIZE -+ MOV c82, c11 -+move BO, B -+ bge $r0, L, .L15 -+#endif -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ bge $r0, L, .L13 -+ preld 1, CO3, 2 * SIZE -+ .align 3 -+.L12: -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ LD a4, AO, 2 * SIZE -+ MADD c61, b2, a1, c61 -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ MADD c61, b2, a4, c61 -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD 
b1, BO, 32 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ LD a4, AO, 6 * SIZE -+ MADD c61, b2, a3, c61 -+ MADD c71, b3, a3, c71 -+ MADD c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ addi.d L, L, -1 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ blt $r0, L, .L12 -+ .align 3 -+ -+.L13: -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ MADD c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ 
preld 1, CO4, 3 * SIZE -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ preld 1, CO5, 3 * SIZE -+ MADD c61, b2, a4, c61 -+ MADD c71, b3, a4, c71 -+ preld 1, CO6, 3 * SIZE -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ preld 1, CO7, 3 * SIZE -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ MADD c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD c71, b3, a3, c71 -+ MADD c81, b4, a3, c81 -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ .align 3 -+ -+.L15: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi 
L, TEMP, 3 -+#endif -+ preld 1, CO8, 3 * SIZE -+ bge $r0, L, .L18 -+ .align 3 -+.L16: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ addi.d L, L, -1 -+ MADD c61, b2, a1, c61 -+ addi.d AO, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ addi.d BO, BO, 8 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 4 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L16 -+.L18: -+#ifndef TRMMKERNEL -+ LD $f22, CO1, 0 * SIZE -+ addi.d CO3,CO3, 2 * SIZE -+ LD $f8, CO1, 1 * SIZE -+ addi.d CO1,CO1, 2 * SIZE -+ LD $f23, CO2, 0 * SIZE -+ addi.d CO4,CO4, 2 * SIZE -+ LD $f9, CO2, 1 * SIZE -+ addi.d CO2,CO2, 2 * SIZE -+ LD $f10, CO3, -2 * SIZE -+ addi.d CO5,CO5, 2 * SIZE -+ LD $f11, CO3, -1 * SIZE -+ addi.d CO6,CO6, 2 * SIZE -+ LD $f12, CO4, -2 * SIZE -+ addi.d CO7,CO7, 2 * SIZE -+ LD $f13, CO4, -1 * SIZE -+ addi.d I, I, -1 -+ MADD c11, c11, ALPHA, $f22 -+ LD $f22, CO5, -2 * SIZE -+ MADD c12, c12, ALPHA, $f8 -+ LD $f8, CO5, -1 * SIZE -+ MADD c21, c21, ALPHA, $f23 -+ LD $f23, CO6, -2 * SIZE -+ MADD c22, c22, ALPHA, $f9 -+ LD $f9, CO6, -1 * SIZE -+ MADD c31, c31, ALPHA, $f10 -+ LD $f10, CO7, -2 * SIZE -+ MADD c32, c32, ALPHA, $f11 -+ LD $f11, CO7, -1 * SIZE -+ MADD c41, c41, ALPHA, $f12 -+ LD $f12, CO8, 0 * SIZE -+ MADD c42, c42, ALPHA, $f13 -+ LD $f13, CO8, 1 * SIZE -+ preld 0, BB, 0 * SIZE -+ preld 0, BB, 8 * SIZE -+ ST c11, CO1, -2 * SIZE -+ MTC c11, $r0 -+ ST c12, CO1, -1 * SIZE -+ addi.d CO8,CO8, 2 * SIZE -+ ST c21, CO2, -2 * SIZE -+ MOV c21, c11 -+ ST c22, CO2, -1 * SIZE -+ addi.d BB, BB, 16 * SIZE -+ MADD c51, c51, ALPHA, $f22 -+ ST c31, 
CO3, -2 * SIZE -+ MADD c52, c52, ALPHA, $f8 -+ ST c32, CO3, -1 * SIZE -+ MADD c61, c61, ALPHA, $f23 -+ ST c41, CO4, -2 * SIZE -+ MADD c62, c62, ALPHA, $f9 -+ ST c42, CO4, -1 * SIZE -+ MADD c71, c71, ALPHA, $f10 -+ ST c51, CO5, -2 * SIZE -+ MADD c72, c72, ALPHA, $f11 -+ ST c52, CO5, -1 * SIZE -+ MADD c81, c81, ALPHA, $f12 -+ ST c61, CO6, -2 * SIZE -+ MADD c82, c82, ALPHA, $f13 -+ ST c62, CO6, -1 * SIZE -+ ST c71, CO7, -2 * SIZE -+ MOV c31, c11 -+ ST c72, CO7, -1 * SIZE -+ MOV c41, c11 -+ ST c81, CO8, -2 * SIZE -+ MOV c51, c11 -+ ST c82, CO8, -1 * SIZE -+MOV c61, c11 -+ blt $r0, I, .L11 -+#else -+ addi.d CO4,CO4, 2 * SIZE -+ addi.d CO5,CO5, 2 * SIZE -+ addi.d CO6,CO6, 2 * SIZE -+ addi.d CO7,CO7, 2 * SIZE -+ preld 0, BB, 0 * SIZE -+ preld 0, BB, 8 * SIZE -+ MUL c11, ALPHA, c11 -+ addi.d CO1,CO1, 2 * SIZE -+ MUL c12, ALPHA, c12 -+ MTC a1, $r0 -+ MUL c21, ALPHA, c21 -+ addi.d CO2,CO2, 2 * SIZE -+ MUL c22, ALPHA, c22 -+ addi.d CO3,CO3, 2 * SIZE -+ ST c11, CO1, -2 * SIZE -+ MUL c31, ALPHA, c31 -+ ST c12, CO1, -1 * SIZE -+ MUL c32, ALPHA, c32 -+ ST c21, CO2, -2 * SIZE -+ MUL c41, ALPHA, c41 -+ ST c22, CO2, -1 * SIZE -+ MUL c42, ALPHA, c42 -+ ST c31, CO3, -2 * SIZE -+ MUL c51, ALPHA, c51 -+ ST c32, CO3, -1 * SIZE -+ MUL c52, ALPHA, c52 -+ ST c41, CO4, -2 * SIZE -+ MUL c61, ALPHA, c61 -+ ST c42, CO4, -1 * SIZE -+ MUL c62, ALPHA, c62 -+ ST c51, CO5, -2 * SIZE -+ MUL c71, ALPHA, c71 -+ ST c52, CO5, -1 * SIZE -+ MUL c72, ALPHA, c72 -+ ST c61, CO6, -2 * SIZE -+ MUL c81, ALPHA, c81 -+ ST c62, CO6, -1 * SIZE -+ MUL c82, ALPHA, c82 -+ ST c71, CO7, -2 * SIZE -+ MOV c11, a1 -+ ST c72, CO7, -1 * SIZE -+ MOV c21, a1 -+ addi.d CO8,CO8, 2 * SIZE -+ addi.d BB, BB, 16 * SIZE -+ ST c81, CO8, -2 * SIZE -+ MOV c31, a1 -+ ST c82, CO8, -1 * SIZE -+ MOV c41, a1 -+ addi.d I, I, -1 -+ MOV c51, a1 -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -2 -+#else -+ addi.d TEMP, TEMP, -8 -+#endif -+ slli.d L, 
TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LEFT -+ addi.d KK, KK, 2 -+#endif -+MOV c61, a1 -+ blt $r0, I, .L11 -+#endif -+ .align 3 -+ -+.L20: -+ andi I, M, 1 -+ MOV c61, c11 -+MOV c71, c11 -+ bge $r0, I, .L29 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 1 -+#else -+ addi.d TEMP, KK, 8 -+#endif -+ srai.d L, TEMP, 2 -+MOV c81, c11 -+ bge $r0, L, .L25 -+#else -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, K, 2 -+ MOV c81, c11 -+move BO, B -+ bge $r0, L, .L25 -+#endif -+ .align 3 -+.L22: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, BO, 20 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 9 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 10 * SIZE -+ MADD c81, b4, a1, c81 -+ LD b4, BO, 11 * SIZE -+ LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * 
SIZE -+ MADD c51, b7, a2, c51 -+ LD b7, BO, 28 * SIZE -+ MADD c61, b2, a2, c61 -+ LD b2, BO, 17 * SIZE -+ MADD c71, b3, a2, c71 -+ LD b3, BO, 18 * SIZE -+ MADD c81, b4, a2, c81 -+ LD b4, BO, 19 * SIZE -+ LD a2, AO, 5 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 32 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 21 * SIZE -+ MADD c31, b3, a3, c31 -+ LD b3, BO, 22 * SIZE -+ MADD c41, b4, a3, c41 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ LD b5, BO, 36 * SIZE -+ MADD c61, b2, a3, c61 -+ LD b2, BO, 25 * SIZE -+ MADD c71, b3, a3, c71 -+ LD b3, BO, 26 * SIZE -+ MADD c81, b4, a3, c81 -+ LD b4, BO, 27 * SIZE -+ LD a3, AO, 2 * SIZE -+ addi.d BO, BO, 32 * SIZE -+ MADD c11, b6, a4, c11 -+ LD b6, BO, 8 * SIZE -+ MADD c21, b2, a4, c21 -+ LD b2, BO, -3 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, -2 * SIZE -+ MADD c41, b4, a4, c41 -+ LD b4, BO, -1 * SIZE -+ MADD c51, b7, a4, c51 -+ LD b7, BO, 12 * SIZE -+ MADD c61, b2, a4, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a4, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a4, c81 -+ LD b4, BO, 3 * SIZE -+ LD a4, AO, 3 * SIZE -+ blt $r0, L, .L22 -+ .align 3 -+ -+.L25: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L28 -+ .align 3 -+.L26: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ MOV a2, a2 -+ addi.d AO, AO, 1 * SIZE -+ addi.d BO, BO, 8 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, BO, 4 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L26 -+.L28: -+#ifndef TRMMKERNEL -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO2, 0 * SIZE -+ LD $f23, CO3, 0 * SIZE -+ LD $f9, CO4, 0 * SIZE -+ MADD c11, c11, ALPHA, $f22 -+ LD $f10, CO5, 0 * SIZE -+ MADD c21, c21, ALPHA, $f8 -+ LD 
$f11, CO6, 0 * SIZE -+ MADD c31, c31, ALPHA, $f23 -+ LD $f12, CO7, 0 * SIZE -+ MADD c41, c41, ALPHA, $f9 -+ LD $f13, CO8, 0 * SIZE -+ MADD c51, c51, ALPHA, $f10 -+ ST c11, CO1, 0 * SIZE -+ MADD c61, c61, ALPHA, $f11 -+ ST c21, CO2, 0 * SIZE -+ MADD c71, c71, ALPHA, $f12 -+ ST c31, CO3, 0 * SIZE -+ MADD c81, c81, ALPHA, $f13 -+ ST c41, CO4, 0 * SIZE -+ ST c51, CO5, 0 * SIZE -+ ST c61, CO6, 0 * SIZE -+ ST c71, CO7, 0 * SIZE -+ ST c81, CO8, 0 * SIZE -+#else -+ MUL c11, ALPHA, c11 -+ MUL c21, ALPHA, c21 -+ MUL c31, ALPHA, c31 -+ MUL c41, ALPHA, c41 -+ ST c11, CO1, 0 * SIZE -+ MUL c51, ALPHA, c51 -+ ST c21, CO2, 0 * SIZE -+ MUL c61, ALPHA, c61 -+ ST c31, CO3, 0 * SIZE -+ MUL c71, ALPHA, c71 -+ ST c41, CO4, 0 * SIZE -+ MUL c81, ALPHA, c81 -+ ST c51, CO5, 0 * SIZE -+ ST c61, CO6, 0 * SIZE -+ ST c71, CO7, 0 * SIZE -+ ST c81, CO8, 0 * SIZE -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -1 -+#else -+ addi.d TEMP, TEMP, -8 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LEFT -+ addi.d KK, KK, 1 -+#endif -+#endif -+ .align 3 -+ -+.L29: -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addi.d KK, KK, 8 -+#endif -+move B, BO -+ blt $r0, J, .L10 -+ .align 3 -+ -+.L30: -+ andi J, N, 4 -+move AO, A -+ bge $r0, J, .L50 -+ move CO1, C -+ MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ add.d C, CO4, LDC -+ MOV c31, c11 -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move KK, OFFSET -+#endif -+ srai.d I, M, 1 -+MOV c41, c11 -+ bge $r0, I, .L40 -+.L31: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 2 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ LD a3, AO, 4 * SIZE -+ LD b1, BO, 0 * 
SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c32, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c42, c11 -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 2 -+#else -+ addi.d TEMP, KK, 4 -+#endif -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L35 -+#else -+ LD a1, AO, 0 * SIZE -+ LD a3, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ MOV c32, c11 -+ LD b4, B, 3 * SIZE -+ MOV c42, c11 -+ LD b5, B, 4 * SIZE -+ srai.d L, K, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L35 -+#endif -+ .align 3 -+.L32: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c11, b5, a1, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 8 * SIZE -+ MADD c12, b5, a2, c12 -+ LD b5, BO, 20 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 9 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 10 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ LD a3, AO, 6 * SIZE -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c11, b7, a3, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a3, c21 -+ addi.d AO, AO, 8 * SIZE -+ MADD c31, b3, a3, c31 -+ addi.d BO, BO, 16 * SIZE -+ MADD c41, b4, 
a3, c41 -+ LD a3, AO, 4 * SIZE -+ MADD c12, b7, a2, c12 -+ LD b7, BO, 12 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 1 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 2 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L32 -+ .align 3 -+ -+.L35: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L38 -+ .align 3 -+.L36: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ addi.d AO, AO, 2 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 0 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 4 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L36 -+.L38: -+#ifndef TRMMKERNEL -+ LD $f22, CO1, 0 * SIZE -+ addi.d CO3,CO3, 2 * SIZE -+ LD $f8, CO1, 1 * SIZE -+ addi.d CO1,CO1, 2 * SIZE -+ LD $f23, CO2, 0 * SIZE -+ addi.d CO4,CO4, 2 * SIZE -+ LD $f9, CO2, 1 * SIZE -+ addi.d CO2,CO2, 2 * SIZE -+ LD $f10, CO3, -2 * SIZE -+ MADD c11, c11, ALPHA, $f22 -+ LD $f11, CO3, -1 * SIZE -+ MADD c12, c12, ALPHA, $f8 -+ LD $f12, CO4, -2 * SIZE -+ MADD c21, c21, ALPHA, $f23 -+ LD $f13, CO4, -1 * SIZE -+ MADD c22, c22, ALPHA, $f9 -+ MADD c31, c31, ALPHA, $f10 -+ ST c11, CO1, -2 * SIZE -+ MADD c32, c32, ALPHA, $f11 -+ ST c12, CO1, -1 * SIZE -+ MADD c41, c41, ALPHA, $f12 -+ ST c21, CO2, -2 * SIZE -+ MADD c42, c42, ALPHA, $f13 -+ ST c22, CO2, -1 * SIZE -+ ST c31, CO3, -2 * SIZE -+ MTC c11, $r0 -+ ST c32, CO3, -1 * SIZE -+ addi.d I, I, -1 -+ ST c41, CO4, -2 * SIZE -+ MOV c21, c11 -+ ST c42, CO4, -1 * SIZE -+ MOV c31, c11 -+#else -+ MUL c11, ALPHA, c11 -+ addi.d CO3,CO3, 2 * SIZE -+ MUL c12, ALPHA, c12 -+ addi.d CO1,CO1, 2 * SIZE -+ MUL c21, ALPHA, c21 -+ addi.d CO4,CO4, 2 * SIZE -+ MUL c22, ALPHA, c22 -+ addi.d CO2,CO2, 2 * SIZE -+ ST c11, CO1, -2 * SIZE -+ MUL c31, ALPHA, c31 -+ ST c12, CO1, -1 * SIZE -+ MUL c32, ALPHA, c32 -+ ST c21, CO2, 
-2 * SIZE -+ MUL c41, ALPHA, c41 -+ ST c22, CO2, -1 * SIZE -+ MUL c42, ALPHA, c42 -+ ST c31, CO3, -2 * SIZE -+ MTC c11, $r0 -+ ST c32, CO3, -1 * SIZE -+ addi.d I, I, -1 -+ ST c41, CO4, -2 * SIZE -+ MOV c21, c11 -+ ST c42, CO4, -1 * SIZE -+ MOV c31, c11 -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -2 -+#else -+ addi.d TEMP, TEMP, -4 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LEFT -+ addi.d KK, KK, 2 -+#endif -+#endif -+MOV c41, c11 -+ blt $r0, I, .L31 -+ .align 3 -+ -+.L40: -+ andi I, M, 1 -+MOV c61, c11 -+ bge $r0, I, .L49 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 2 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c81, c11 -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 1 -+#else -+ addi.d TEMP, KK, 4 -+#endif -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L45 -+#else -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c81, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, K, 2 -+move BO, B -+ bge $r0, L, .L45 -+#endif -+ .align 3 -+.L42: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD 
c11, b5, a2, c11 -+ LD b5, BO, 20 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 10 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 11 * SIZE -+ LD a2, AO, 2 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * SIZE -+ LD a2, AO, -1 * SIZE -+ addi.d BO, BO, 16 * SIZE -+ MADD c11, b7, a2, c11 -+ LD b7, BO, 12 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 1 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 2 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 3 * SIZE -+ LD a2, AO, 1 * SIZE -+ blt $r0, L, .L42 -+ .align 3 -+ -+.L45: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L48 -+ .align 3 -+.L46: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 1 * SIZE -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+ MOV a2, a2 -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L46 -+.L48: -+#ifndef TRMMKERNEL -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO2, 0 * SIZE -+ LD $f23, CO3, 0 * SIZE -+ LD $f9, CO4, 0 * SIZE -+ MADD c11, c11, ALPHA, $f22 -+ MADD c21, c21, ALPHA, $f8 -+ MADD c31, c31, ALPHA, $f23 -+ MADD c41, c41, ALPHA, $f9 -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c41, CO4, 0 * SIZE -+#else -+ MUL c11, ALPHA, c11 -+ MUL c21, ALPHA, c21 -+ MUL c31, ALPHA, c31 -+ MUL c41, ALPHA, c41 -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c41, CO4, 0 * SIZE -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -1 -+#else -+ addi.d TEMP, TEMP, -4 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, 
AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LEFT -+ addi.d KK, KK, 1 -+#endif -+#endif -+ .align 3 -+ -+.L49: -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addi.d KK, KK, 4 -+#endif -+ move B, BO -+ .align 3 -+ -+.L50: -+ andi J, N, 2 -+move AO, A -+ bge $r0, J, .L70 -+ move CO1, C -+ add.d CO2, C, LDC -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move KK, OFFSET -+#endif -+ srai.d I, M, 1 -+add.d C, CO2, LDC -+ bge $r0, I, .L60 -+.L51: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 1 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 2 -+#else -+ addi.d TEMP, KK, 2 -+#endif -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L55 -+#else -+ LD a1, AO, 0 * SIZE -+ MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, K, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L55 -+#endif -+ .align 3 -+.L52: -+ MADD c11, b1, a1, c11 -+ LD a3, AO, 2 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b4, BO, 3 * SIZE -+ MADD c12, b1, a2, c12 -+ LD a4, AO, 3 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b1, BO, 8 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a1, AO, 8 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 5 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 6 * SIZE -+ MADD c11, b5, a5, c11 -+ LD a3, AO, 6 * SIZE -+ MADD c21, b2, a5, c21 
-+ LD b4, BO, 7 * SIZE -+ MADD c12, b5, a2, c12 -+ LD a4, AO, 7 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b5, BO, 12 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a5, AO, 12 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 9 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 10 * SIZE -+ addi.d AO, AO, 8 * SIZE -+ addi.d L, L, -1 -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L52 -+ .align 3 -+ -+.L55: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L58 -+ .align 3 -+.L56: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 3 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L56 -+.L58: -+#ifndef TRMMKERNEL -+ LD $f22, CO1, 0 * SIZE -+ addi.d I, I, -1 -+ LD $f8, CO1, 1 * SIZE -+ addi.d CO1,CO1, 2 * SIZE -+ LD $f23, CO2, 0 * SIZE -+ LD $f9, CO2, 1 * SIZE -+ addi.d CO2,CO2, 2 * SIZE -+ MADD c11, c11, ALPHA, $f22 -+ MADD c12, c12, ALPHA, $f8 -+ MADD c21, c21, ALPHA, $f23 -+ MADD c22, c22, ALPHA, $f9 -+ ST c11, CO1, -2 * SIZE -+ ST c12, CO1, -1 * SIZE -+ ST c21, CO2, -2 * SIZE -+ ST c22, CO2, -1 * SIZE -+ blt $r0, I, .L51 -+#else -+ addi.d I, I, -1 -+ addi.d CO1,CO1, 2 * SIZE -+ addi.d CO2,CO2, 2 * SIZE -+ MUL c11, ALPHA, c11 -+ MUL c12, ALPHA, c12 -+ MUL c21, ALPHA, c21 -+ MUL c22, ALPHA, c22 -+ ST c11, CO1, -2 * SIZE -+ ST c12, CO1, -1 * SIZE -+ ST c21, CO2, -2 * SIZE -+ ST c22, CO2, -1 * SIZE -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -2 -+#else -+ addi.d TEMP, TEMP, -2 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LEFT -+ addi.d KK, KK, 2 -+#endif -+ blt $r0, I, .L51 -+#endif -+ .align 3 -+ -+.L60: -+ andi I, M, 1 -+ bge $r0, I, .L69 -+#if 
defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 1 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ MOV c31, c11 -+ LD a4, AO, 3 * SIZE -+ MOV c41, c11 -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 1 -+#else -+ addi.d TEMP, KK, 2 -+#endif -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L65 -+#else -+ srai.d L, K, 2 -+ LD a1, AO, 0 * SIZE -+ MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ MOV c31, c11 -+ LD a4, AO, 3 * SIZE -+ MOV c41, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L65 -+#endif -+ .align 3 -+.L62: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 7 * SIZE -+ LD a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, 10 * SIZE -+ MADD c41, b4, a4, c41 -+ LD b4, BO, 11 * SIZE -+ LD a3, AO, 6 * SIZE -+ LD a4, AO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L62 -+ .align 3 -+ -+.L65: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L68 -+ .align 3 -+.L66: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 2 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 3 * SIZE -+ LD a1, AO, 1 
* SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L66 -+.L68: -+#ifndef TRMMKERNEL -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO2, 0 * SIZE -+ ADD c11, c11, c31 -+ ADD c21, c21, c41 -+ MADD c11, c11, ALPHA, $f22 -+ MADD c21, c21, ALPHA, $f8 -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+#else -+ ADD c11, c11, c31 -+ ADD c21, c21, c41 -+ MUL c11, ALPHA, c11 -+ MUL c21, ALPHA, c21 -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -1 -+#else -+ addi.d TEMP, TEMP, -2 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LEFT -+ addi.d KK, KK, 1 -+#endif -+#endif -+ .align 3 -+ -+.L69: -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addi.d KK, KK, 2 -+#endif -+ move B, BO -+ .align 3 -+ -+.L70: -+ andi J, N, 1 -+move AO, A -+ bge $r0, J, .L999 -+ move CO1, C -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move KK, OFFSET -+#endif -+ srai.d I, M, 1 -+add.d C, CO1, LDC -+ bge $r0, I, .L80 -+.L71: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 0 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 2 -+#else -+ addi.d TEMP, KK, 1 -+#endif -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L75 -+#else -+ LD a1, AO, 0 * SIZE -+ MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, 
c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, K, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L75 -+#endif -+ .align 3 -+.L72: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 2 * SIZE -+ LD a2, AO, 3 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 6 * SIZE -+ LD a2, AO, 7 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 8 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L72 -+ .align 3 -+ -+.L75: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L78 -+ .align 3 -+.L76: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L76 -+.L78: -+#ifndef TRMMKERNEL -+ LD $f22, CO1, 0 * SIZE -+ addi.d I, I, -1 -+ LD $f8, CO1, 1 * SIZE -+ addi.d CO1,CO1, 2 * SIZE -+ ADD c11, c11, c21 -+ ADD c12, c12, c22 -+ MADD c11, c11, ALPHA, $f22 -+ MADD c12, c12, ALPHA, $f8 -+ ST c11, CO1, -2 * SIZE -+ ST c12, CO1, -1 * SIZE -+ blt $r0, I, .L71 -+#else -+ ADD c11, c11, c21 -+ addi.d I, I, -1 -+ ADD c12, c12, c22 -+ addi.d CO1,CO1, 2 * SIZE -+ MUL c11, ALPHA, c11 -+ MUL c12, ALPHA, c12 -+ ST c11, CO1, -2 * SIZE -+ ST c12, CO1, -1 * SIZE -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -2 -+#else -+ addi.d TEMP, TEMP, -1 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif 
-+#ifdef LEFT -+ addi.d KK, KK, 2 -+#endif -+ blt $r0, I, .L71 -+#endif -+ .align 3 -+ -+.L80: -+ andi I, M, 1 -+ bge $r0, I, .L89 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 0 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 1 -+#else -+ addi.d TEMP, KK, 1 -+#endif -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L85 -+#else -+ LD a1, AO, 0 * SIZE -+ MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, K, 2 -+move BO, B -+ bge $r0, L, .L85 -+#endif -+ .align 3 -+.L82: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 1 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c21, b1, a1, c21 -+ LD a1, AO, 2 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 3 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c21, b1, a1, c21 -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L82 -+ .align 3 -+ -+.L85: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L88 -+ .align 3 -+.L86: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L86 -+.L88: -+#ifndef TRMMKERNEL -+ LD $f22, CO1, 0 * SIZE -+ ADD c11, 
c11, c21 -+ MADD c11, c11, ALPHA, $f22 -+ ST c11, CO1, 0 * SIZE -+#else -+ ADD c11, c11, c21 -+ MUL c11, ALPHA, c11 -+ ST c11, CO1, 0 * SIZE -+#endif -+ .align 3 -+ -+.L89: -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addi.d KK, KK, 1 -+#endif -+ move B, BO -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 16 -+ LDARG $r26, $sp, 24 -+ LDARG $r27, $sp, 32 -+ LDARG $r28, $sp, 40 -+ LDARG $r29, $sp, 48 -+ LDARG $r30, $sp, 96 -+ fld.d $f24, $sp, 56 -+ fld.d $f25, $sp, 64 -+ fld.d $f26, $sp, 72 -+ fld.d $f27, $sp, 80 -+ fld.d $f28, $sp, 88 -+#if defined(TRMMKERNEL) -+ LDARG $r20, $sp, 104 -+ LDARG $r16, $sp, 112 -+#endif -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 120 -+ fld.d $f19, $sp, 128 -+ fld.d $f20, $sp, 136 -+ fld.d $f21, $sp, 144 -+#endif -+ addi.d $sp, $sp, 160 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S -new file mode 100644 -index 0000000..9ab43ae ---- /dev/null -+++ b/kernel/loongarch64/gemv_n.S -@@ -0,0 +1,531 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+/* Unused param dummy1 */ -+#define M $r4 -+#define N $r5 -+#define A $r7 -+#define LDA $r8 -+#define X $r9 -+#define INCX $r10 -+#define Y $r11 -+#define INCY $r6 -+#define BUFFER $r16 -+#define YORIG $r18 -+#define XX $r12 -+#define YY $r13 -+#define I $r14 -+#define J $r15 -+#define AO1 $r23 -+#define AO2 $r24 -+#define ALPHA $f0 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f23 -+#define a4 $f9 -+#define a5 $f10 -+#define a6 $f11 -+#define a7 $f12 -+#define a8 $f13 -+#define x1 $f14 -+#define x2 $f15 -+#define y1 $f16 -+#define y2 $f17 -+#define y3 $f3 -+#define y4 $f1 -+#define y5 $f2 -+#define y6 $f4 -+#define y7 $f5 -+#define y8 $f6 -+#define t1 $f7 -+#define t2 $f18 -+#define t3 $f19 -+#define t4 $f20 -+ -+ PROLOGUE -+ -+ LDARG INCY, $sp, 0 -+ LDARG BUFFER, $sp, 8 -+#ifdef __64BIT__ -+ addi.d $sp, $sp, -16 -+#else -+ addi.d $sp, $sp, -48 -+#endif -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ slli.d LDA, LDA, BASE_SHIFT -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 16 -+ fst.d $f19, $sp, 24 -+ fst.d $f20, $sp, 32 -+#endif -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, 
M, .L999 -+ slli.d INCY, INCY, BASE_SHIFT -+ bge $r0, N, .L999 -+ li.d I, SIZE -+ move YORIG, Y -+ beq INCY, I, .L10 -+ srai.d I, M, 2 -+ move YORIG, BUFFER -+ move XX, Y -+ move YY, BUFFER -+ bge $r0, I, .L05 -+ .align 3 -+ -+.L02: -+ LD a1, XX, 0 * SIZE -+ add.d XX, XX, INCY -+ LD a2, XX, 0 * SIZE -+ add.d XX, XX, INCY -+ LD a3, XX, 0 * SIZE -+ add.d XX, XX, INCY -+ LD a4, XX, 0 * SIZE -+ add.d XX, XX, INCY -+ ST a1, YY, 0 * SIZE -+ ST a2, YY, 1 * SIZE -+ ST a3, YY, 2 * SIZE -+ ST a4, YY, 3 * SIZE -+ addi.d I, I, -1 -+ addi.d YY, YY, 4 * SIZE -+ blt $r0, I, .L02 -+ .align 3 -+ -+.L05: -+ andi I, M, 3 -+ bge $r0, I, .L10 -+ .align 3 -+ -+.L06: -+ LD a1, XX, 0 * SIZE -+ add.d XX, XX, INCY -+ ST a1, YY, 0 * SIZE -+ addi.d I, I, -1 -+ addi.d YY, YY, 1 * SIZE -+ blt $r0, I, .L06 -+ .align 3 -+ -+.L10: -+ srai.d J, N, 1 -+ bge $r0, J, .L20 -+ .align 3 -+ -+.L11: -+ LD x1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD x2, X, 0 * SIZE -+ add.d X, X, INCX -+ move AO1, A -+ add.d AO2, A, LDA -+ add.d A, AO2, LDA -+ move YY, YORIG -+ MUL x1, ALPHA, x1 -+ srai.d I, M, 3 -+ MUL x2, ALPHA, x2 -+ bge $r0, I, .L15 -+ LD a1, AO1, 0 * SIZE -+ LD y1, YY, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a3, AO1, 2 * SIZE -+ LD y3, YY, 2 * SIZE -+ LD a4, AO1, 3 * SIZE -+ LD y4, YY, 3 * SIZE -+ LD a5, AO2, 0 * SIZE -+ LD y5, YY, 4 * SIZE -+ LD a6, AO2, 1 * SIZE -+ LD y6, YY, 5 * SIZE -+ LD a7, AO2, 2 * SIZE -+ LD y7, YY, 6 * SIZE -+ LD a8, AO2, 3 * SIZE -+ addi.d I, I, -1 -+ LD y8, YY, 7 * SIZE -+ bge $r0, I, .L13 -+ .align 3 -+.L12: -+ MADD t1, a1, x1, y1 -+ LD a1, AO1, 4 * SIZE -+ MADD t2, a2, x1, y2 -+ LD a2, AO1, 5 * SIZE -+ LD y1, YY, 8 * SIZE -+ LD y2, YY, 9 * SIZE -+ MADD t3, a3, x1, y3 -+ LD a3, AO1, 6 * SIZE -+ MADD t4, a4, x1, y4 -+ LD a4, AO1, 7 * SIZE -+ LD y3, YY, 10 * SIZE -+ LD y4, YY, 11 * SIZE -+ MADD t1, a5, x2, t1 -+ LD a5, AO2, 4 * SIZE -+ MADD t2, a6, x2, t2 -+ LD a6, AO2, 5 * SIZE -+ MADD t3, a7, x2, t3 -+ LD a7, AO2, 6 * SIZE -+ MADD t4, a8, x2, t4 -+ LD 
a8, AO2, 7 * SIZE -+ ST t1, YY, 0 * SIZE -+ ST t2, YY, 1 * SIZE -+ ST t3, YY, 2 * SIZE -+ ST t4, YY, 3 * SIZE -+ MADD t1, a1, x1, y5 -+ LD a1, AO1, 8 * SIZE -+ MADD t2, a2, x1, y6 -+ LD a2, AO1, 9 * SIZE -+ LD y5, YY, 12 * SIZE -+ LD y6, YY, 13 * SIZE -+ MADD t3, a3, x1, y7 -+ LD a3, AO1, 10 * SIZE -+ MADD t4, a4, x1, y8 -+ LD a4, AO1, 11 * SIZE -+ LD y7, YY, 14 * SIZE -+ LD y8, YY, 15 * SIZE -+ MADD t1, a5, x2, t1 -+ LD a5, AO2, 8 * SIZE -+ MADD t2, a6, x2, t2 -+ LD a6, AO2, 9 * SIZE -+ MADD t3, a7, x2, t3 -+ LD a7, AO2, 10 * SIZE -+ MADD t4, a8, x2, t4 -+ LD a8, AO2, 11 * SIZE -+ ST t1, YY, 4 * SIZE -+ ST t2, YY, 5 * SIZE -+ ST t3, YY, 6 * SIZE -+ ST t4, YY, 7 * SIZE -+ addi.d I, I, -1 -+ addi.d YY, YY, 8 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ addi.d AO2, AO2, 8 * SIZE -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ MADD t1, a1, x1, y1 -+ LD a1, AO1, 4 * SIZE -+ MADD t2, a2, x1, y2 -+ LD a2, AO1, 5 * SIZE -+ MADD t3, a3, x1, y3 -+ LD a3, AO1, 6 * SIZE -+ MADD t4, a4, x1, y4 -+ LD a4, AO1, 7 * SIZE -+ MADD t1, a5, x2, t1 -+ LD a5, AO2, 4 * SIZE -+ MADD t2, a6, x2, t2 -+ LD a6, AO2, 5 * SIZE -+ MADD t3, a7, x2, t3 -+ LD a7, AO2, 6 * SIZE -+ MADD t4, a8, x2, t4 -+ LD a8, AO2, 7 * SIZE -+ ST t1, YY, 0 * SIZE -+ MADD t1, a1, x1, y5 -+ ST t2, YY, 1 * SIZE -+ MADD t2, a2, x1, y6 -+ ST t3, YY, 2 * SIZE -+ MADD t3, a3, x1, y7 -+ ST t4, YY, 3 * SIZE -+ MADD t4, a4, x1, y8 -+ MADD t1, a5, x2, t1 -+ addi.d AO1, AO1, 8 * SIZE -+ MADD t2, a6, x2, t2 -+ addi.d AO2, AO2, 8 * SIZE -+ MADD t3, a7, x2, t3 -+ addi.d YY, YY, 8 * SIZE -+ MADD t4, a8, x2, t4 -+ ST t1, YY, -4 * SIZE -+ ST t2, YY, -3 * SIZE -+ ST t3, YY, -2 * SIZE -+ ST t4, YY, -1 * SIZE -+ .align 3 -+ -+.L15: -+ andi I, M, 4 -+ bge $r0, I, .L16 -+ LD a1, AO1, 0 * SIZE -+ LD y1, YY, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a3, AO1, 2 * SIZE -+ LD y3, YY, 2 * SIZE -+ LD a4, AO1, 3 * SIZE -+ LD y4, YY, 3 * SIZE -+ LD a5, AO2, 0 * SIZE -+ MADD y1, a1, x1, y1 -+ LD a6, AO2, 1 * SIZE -+ MADD y2, a2, x1, y2 
-+ LD a7, AO2, 2 * SIZE -+ MADD y3, a3, x1, y3 -+ LD a8, AO2, 3 * SIZE -+ MADD y4, a4, x1, y4 -+ MADD y1, a5, x2, y1 -+ addi.d YY, YY, 4 * SIZE -+ MADD y2, a6, x2, y2 -+ addi.d AO1, AO1, 4 * SIZE -+ MADD y3, a7, x2, y3 -+ addi.d AO2, AO2, 4 * SIZE -+ MADD y4, a8, x2, y4 -+ ST y1, YY, -4 * SIZE -+ ST y2, YY, -3 * SIZE -+ ST y3, YY, -2 * SIZE -+ ST y4, YY, -1 * SIZE -+ .align 3 -+ -+.L16: -+ andi I, M, 2 -+ bge $r0, I, .L17 -+ LD a1, AO1, 0 * SIZE -+ LD y1, YY, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a5, AO2, 0 * SIZE -+ LD a6, AO2, 1 * SIZE -+ MADD y1, a1, x1, y1 -+ MADD y2, a2, x1, y2 -+ addi.d YY, YY, 2 * SIZE -+ MADD y1, a5, x2, y1 -+ addi.d AO1, AO1, 2 * SIZE -+ MADD y2, a6, x2, y2 -+ addi.d AO2, AO2, 2 * SIZE -+ ST y1, YY, -2 * SIZE -+ ST y2, YY, -1 * SIZE -+ .align 3 -+ -+.L17: -+ andi I, M, 1 -+ bge $r0, I, .L19 -+ LD y1, YY, 0 * SIZE -+ LD a1, AO1, 0 * SIZE -+ LD a5, AO2, 0 * SIZE -+ MADD y1, a1, x1, y1 -+ MADD y1, a5, x2, y1 -+ ST y1, YY, 0 * SIZE -+ .align 3 -+ -+.L19: -+ addi.d J, J, -1 -+ blt $r0, J, .L11 -+ .align 3 -+ -+.L20: -+ andi J, N, 1 -+ bge $r0, J, .L900 -+ .align 3 -+ -+.L21: -+ LD x1, X, 0 * SIZE -+ add.d X, X, INCX -+ move YY, YORIG -+ move AO1, A -+ srai.d I, M, 3 -+ MUL x1, ALPHA, x1 -+ bge $r0, I, .L25 -+ LD a1, AO1, 0 * SIZE -+ LD y1, YY, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a3, AO1, 2 * SIZE -+ LD y3, YY, 2 * SIZE -+ LD a4, AO1, 3 * SIZE -+ LD y4, YY, 3 * SIZE -+ LD y5, YY, 4 * SIZE -+ LD y6, YY, 5 * SIZE -+ LD y7, YY, 6 * SIZE -+ addi.d I, I, -1 -+ LD y8, YY, 7 * SIZE -+ bge $r0, I, .L23 -+ .align 3 -+.L22: -+ MADD t1, a1, x1, y1 -+ LD a1, AO1, 4 * SIZE -+ MADD t2, a2, x1, y2 -+ LD a2, AO1, 5 * SIZE -+ LD y1, YY, 8 * SIZE -+ LD y2, YY, 9 * SIZE -+ MADD t3, a3, x1, y3 -+ LD a3, AO1, 6 * SIZE -+ MADD t4, a4, x1, y4 -+ LD a4, AO1, 7 * SIZE -+ LD y3, YY, 10 * SIZE -+ LD y4, YY, 11 * SIZE -+ ST t1, YY, 0 * SIZE -+ ST t2, YY, 1 * SIZE -+ ST t3, YY, 2 * SIZE -+ ST t4, YY, 3 * SIZE -+ MADD t1, a1, 
x1, y5 -+ LD a1, AO1, 8 * SIZE -+ MADD t2, a2, x1, y6 -+ LD a2, AO1, 9 * SIZE -+ LD y5, YY, 12 * SIZE -+ LD y6, YY, 13 * SIZE -+ MADD t3, a3, x1, y7 -+ LD a3, AO1, 10 * SIZE -+ MADD t4, a4, x1, y8 -+ LD a4, AO1, 11 * SIZE -+ LD y7, YY, 14 * SIZE -+ LD y8, YY, 15 * SIZE -+ ST t1, YY, 4 * SIZE -+ ST t2, YY, 5 * SIZE -+ ST t3, YY, 6 * SIZE -+ ST t4, YY, 7 * SIZE -+ addi.d I, I, -1 -+ addi.d YY, YY, 8 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ blt $r0, I, .L22 -+ .align 3 -+ -+.L23: -+ MADD t1, a1, x1, y1 -+ LD a1, AO1, 4 * SIZE -+ MADD t2, a2, x1, y2 -+ LD a2, AO1, 5 * SIZE -+ MADD t3, a3, x1, y3 -+ LD a3, AO1, 6 * SIZE -+ MADD t4, a4, x1, y4 -+ LD a4, AO1, 7 * SIZE -+ ST t1, YY, 0 * SIZE -+ MADD t1, a1, x1, y5 -+ ST t2, YY, 1 * SIZE -+ MADD t2, a2, x1, y6 -+ ST t3, YY, 2 * SIZE -+ MADD t3, a3, x1, y7 -+ ST t4, YY, 3 * SIZE -+ MADD t4, a4, x1, y8 -+ ST t1, YY, 4 * SIZE -+ ST t2, YY, 5 * SIZE -+ ST t3, YY, 6 * SIZE -+ ST t4, YY, 7 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ addi.d YY, YY, 8 * SIZE -+ .align 3 -+ -+.L25: -+ andi I, M, 4 -+ bge $r0, I, .L26 -+ LD a1, AO1, 0 * SIZE -+ LD y1, YY, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a3, AO1, 2 * SIZE -+ LD y3, YY, 2 * SIZE -+ LD a4, AO1, 3 * SIZE -+ LD y4, YY, 3 * SIZE -+ MADD y1, a1, x1, y1 -+ MADD y2, a2, x1, y2 -+ MADD y3, a3, x1, y3 -+ addi.d YY, YY, 4 * SIZE -+ MADD y4, a4, x1, y4 -+ addi.d AO1, AO1, 4 * SIZE -+ ST y1, YY, -4 * SIZE -+ ST y2, YY, -3 * SIZE -+ ST y3, YY, -2 * SIZE -+ ST y4, YY, -1 * SIZE -+ .align 3 -+ -+.L26: -+ andi I, M, 2 -+ bge $r0, I, .L27 -+ LD a1, AO1, 0 * SIZE -+ LD y1, YY, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y2, YY, 1 * SIZE -+ MADD y1, a1, x1, y1 -+ addi.d YY, YY, 2 * SIZE -+ MADD y2, a2, x1, y2 -+ addi.d AO1, AO1, 2 * SIZE -+ ST y1, YY, -2 * SIZE -+ ST y2, YY, -1 * SIZE -+ .align 3 -+ -+.L27: -+ andi I, M, 1 -+ bge $r0, I, .L900 -+ LD y1, YY, 0 * SIZE -+ LD a1, AO1, 0 * SIZE -+ MADD y1, a1, x1, y1 -+ ST y1, YY, 0 * SIZE -+ .align 3 -+ -+.L900: -+ li.d YORIG, SIZE -+ 
srai.d I, M, 2 -+ beq INCY, YORIG, .L999 -+ move XX, BUFFER -+ bge $r0, I, .L905 -+ .align 3 -+ -+.L902: -+ LD a1, XX, 0 * SIZE -+ LD a2, XX, 1 * SIZE -+ LD a3, XX, 2 * SIZE -+ LD a4, XX, 3 * SIZE -+ ST a1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a2, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a3, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a4, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ addi.d I, I, -1 -+ addi.d XX, XX, 4 * SIZE -+ blt $r0, I, .L902 -+ .align 3 -+ -+.L905: -+ andi I, M, 3 -+ bge $r0, I, .L999 -+ .align 3 -+ -+.L906: -+ LD a1, XX, 0 * SIZE -+ addi.d XX, XX, 1 * SIZE -+ ST a1, Y, 0 * SIZE -+ addi.d I, I, -1 -+ add.d Y, Y, INCY -+ blt $r0, I, .L906 -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 16 -+ fld.d $f19, $sp, 24 -+ fld.d $f20, $sp, 32 -+#endif -+#ifdef __64BIT__ -+ addi.d $sp, $sp, 16 -+#else -+ addi.d $sp, $sp, 48 -+#endif -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S -new file mode 100644 -index 0000000..af42327 ---- /dev/null -+++ b/kernel/loongarch64/gemv_t.S -@@ -0,0 +1,436 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+/* Unused param dummy1 */ -+#define M $r4 -+#define N $r5 -+#define A $r7 -+#define LDA $r8 -+#define X $r9 -+#define INCX $r10 -+#define Y $r11 -+#define INCY $r6 -+#define BUFFER $r16 -+#define XORIG $r18 -+#define XX $r12 -+#define YY $r13 -+#define I $r14 -+#define J $r15 -+#define AO1 $r23 -+#define AO2 $r24 -+#define ALPHA $f0 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f23 -+#define a4 $f9 -+#define a5 $f10 -+#define a6 $f11 -+#define a7 $f12 -+#define a8 $f13 -+#define y1 $f14 -+#define y2 $f15 -+#define y3 $f16 -+#define y4 $f17 -+#define x1 $f3 -+#define x2 $f1 -+#define x3 $f2 -+#define x4 $f4 -+#define x5 $f5 -+#define x6 $f6 -+#define x7 $f7 -+#define x8 $f18 -+ -+ PROLOGUE -+ -+ LDARG INCY, $sp, 0 -+ LDARG BUFFER, $sp, 8 -+#ifdef __64BIT__ -+ addi.d $sp, $sp, -16 -+#else -+ addi.d $sp, $sp, -32 -+#endif -+ MTC y1, $r0 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ slli.d LDA, LDA, BASE_SHIFT -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 16 -+#endif -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, M, .L999 -+ slli.d INCY, INCY, BASE_SHIFT -+ bge $r0, N, .L999 -+ 
li.d I, SIZE -+ move XORIG, X -+ beq INCX, I, .L10 -+ srai.d I, M, 2 -+ move XORIG, BUFFER -+ move YY, BUFFER -+ bge $r0, I, .L05 -+ .align 3 -+ -+.L02: -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a1, YY, 0 * SIZE -+ ST a2, YY, 1 * SIZE -+ ST a3, YY, 2 * SIZE -+ ST a4, YY, 3 * SIZE -+ addi.d I, I, -1 -+ addi.d YY, YY, 4 * SIZE -+ blt $r0, I, .L02 -+ .align 3 -+ -+.L05: -+ andi I, M, 3 -+ bge $r0, I, .L10 -+ .align 3 -+ -+.L06: -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a1, YY, 0 * SIZE -+ addi.d I, I, -1 -+ addi.d YY, YY, 1 * SIZE -+ blt $r0, I, .L06 -+ .align 3 -+ -+.L10: -+ srai.d J, N, 1 -+ move YY, Y -+ bge $r0, J, .L20 -+ .align 3 -+ -+.L11: -+ move AO1, A -+ MOV y2, y1 -+ add.d AO2, A, LDA -+ MOV y3, y1 -+ add.d A, AO2, LDA -+ MOV y4, y1 -+ srai.d I, M, 3 -+ move XX, XORIG -+ bge $r0, I, .L15 -+ LD a1, AO1, 0 * SIZE -+ LD x1, XX, 0 * SIZE -+ LD a2, AO2, 0 * SIZE -+ LD x2, XX, 1 * SIZE -+ LD a3, AO1, 1 * SIZE -+ LD x3, XX, 2 * SIZE -+ LD a4, AO2, 1 * SIZE -+ LD x4, XX, 3 * SIZE -+ LD a5, AO1, 2 * SIZE -+ LD x5, XX, 4 * SIZE -+ LD a6, AO2, 2 * SIZE -+ LD x6, XX, 5 * SIZE -+ LD a7, AO1, 3 * SIZE -+ LD x7, XX, 6 * SIZE -+ LD a8, AO2, 3 * SIZE -+ addi.d I, I, -1 -+ LD x8, XX, 7 * SIZE -+ bge $r0, I, .L13 -+ .align 3 -+.L12: -+ MADD y1, a1, x1, y1 -+ LD a1, AO1, 4 * SIZE -+ MADD y2, a2, x1, y2 -+ LD a2, AO2, 4 * SIZE -+ MADD y3, a3, x2, y3 -+ LD a3, AO1, 5 * SIZE -+ MADD y4, a4, x2, y4 -+ LD a4, AO2, 5 * SIZE -+ LD x1, XX, 8 * SIZE -+ LD x2, XX, 9 * SIZE -+ MADD y1, a5, x3, y1 -+ LD a5, AO1, 6 * SIZE -+ MADD y2, a6, x3, y2 -+ LD a6, AO2, 6 * SIZE -+ MADD y3, a7, x4, y3 -+ LD a7, AO1, 7 * SIZE -+ MADD y4, a8, x4, y4 -+ LD a8, AO2, 7 * SIZE -+ LD x3, XX, 10 * SIZE -+ LD x4, XX, 11 * SIZE -+ MADD y1, a1, x5, y1 -+ LD a1, AO1, 8 * SIZE -+ MADD y2, a2, x5, y2 -+ LD a2, AO2, 8 * SIZE -+ MADD y3, a3, x6, y3 -+ LD a3, AO1, 9 * SIZE -+ MADD 
y4, a4, x6, y4 -+ LD a4, AO2, 9 * SIZE -+ LD x5, XX, 12 * SIZE -+ LD x6, XX, 13 * SIZE -+ MADD y1, a5, x7, y1 -+ LD a5, AO1, 10 * SIZE -+ MADD y2, a6, x7, y2 -+ LD a6, AO2, 10 * SIZE -+ MADD y3, a7, x8, y3 -+ LD a7, AO1, 11 * SIZE -+ MADD y4, a8, x8, y4 -+ LD a8, AO2, 11 * SIZE -+ LD x7, XX, 14 * SIZE -+ LD x8, XX, 15 * SIZE -+ addi.d I, I, -1 -+ addi.d XX, XX, 8 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ addi.d AO2, AO2, 8 * SIZE -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ MADD y1, a1, x1, y1 -+ LD a1, AO1, 4 * SIZE -+ MADD y2, a2, x1, y2 -+ LD a2, AO2, 4 * SIZE -+ MADD y3, a3, x2, y3 -+ LD a3, AO1, 5 * SIZE -+ MADD y4, a4, x2, y4 -+ LD a4, AO2, 5 * SIZE -+ MADD y1, a5, x3, y1 -+ LD a5, AO1, 6 * SIZE -+ MADD y2, a6, x3, y2 -+ LD a6, AO2, 6 * SIZE -+ MADD y3, a7, x4, y3 -+ LD a7, AO1, 7 * SIZE -+ MADD y4, a8, x4, y4 -+ LD a8, AO2, 7 * SIZE -+ MADD y1, a1, x5, y1 -+ MADD y2, a2, x5, y2 -+ MADD y3, a3, x6, y3 -+ MADD y4, a4, x6, y4 -+ MADD y1, a5, x7, y1 -+ addi.d XX, XX, 8 * SIZE -+ MADD y2, a6, x7, y2 -+ addi.d AO1, AO1, 8 * SIZE -+ MADD y3, a7, x8, y3 -+ addi.d AO2, AO2, 8 * SIZE -+ MADD y4, a8, x8, y4 -+ .align 3 -+ -+.L15: -+ andi I, M, 4 -+ bge $r0, I, .L17 -+ LD a1, AO1, 0 * SIZE -+ LD x1, XX, 0 * SIZE -+ LD a2, AO2, 0 * SIZE -+ LD a3, AO1, 1 * SIZE -+ LD x2, XX, 1 * SIZE -+ LD a4, AO2, 1 * SIZE -+ LD a5, AO1, 2 * SIZE -+ LD x3, XX, 2 * SIZE -+ MADD y1, a1, x1, y1 -+ LD a6, AO2, 2 * SIZE -+ MADD y2, a2, x1, y2 -+ LD a7, AO1, 3 * SIZE -+ MADD y3, a3, x2, y3 -+ LD x4, XX, 3 * SIZE -+ MADD y4, a4, x2, y4 -+ LD a8, AO2, 3 * SIZE -+ MADD y1, a5, x3, y1 -+ MADD y2, a6, x3, y2 -+ addi.d XX, XX, 4 * SIZE -+ MADD y3, a7, x4, y3 -+ addi.d AO1, AO1, 4 * SIZE -+ MADD y4, a8, x4, y4 -+ addi.d AO2, AO2, 4 * SIZE -+ .align 3 -+ -+.L17: -+ andi I, M, 3 -+ ADD y1, y1, y3 -+ ADD y2, y2, y4 -+ bge $r0, I, .L19 -+ .align 3 -+.L18: -+ LD x1, XX, 0 * SIZE -+ LD a1, AO1, 0 * SIZE -+ LD a2, AO2, 0 * SIZE -+ addi.d I, I, -1 -+ addi.d XX, XX, 1 * SIZE -+ addi.d AO1, AO1, 1 * SIZE -+ 
addi.d AO2, AO2, 1 * SIZE -+ MADD y1, a1, x1, y1 -+ MADD y2, a2, x1, y2 -+ blt $r0, I, .L18 -+ .align 3 -+ -+.L19: -+ LD a1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a2, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ MADD a1, y1, ALPHA, a1 -+ addi.d J, J, -1 -+ MADD a2, y2, ALPHA, a2 -+ MTC y1, $r0 -+ ST a1, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ ST a2, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ blt $r0, J, .L11 -+ .align 3 -+ -+.L20: -+ andi J, N, 1 -+ MOV y3, y1 -+ move AO1, A -+ bge $r0, J, .L999 -+ srai.d I, M, 3 -+ move XX, XORIG -+ bge $r0, I, .L25 -+ LD a1, AO1, 0 * SIZE -+ LD x1, XX, 0 * SIZE -+ LD a3, AO1, 1 * SIZE -+ LD x2, XX, 1 * SIZE -+ LD a5, AO1, 2 * SIZE -+ LD x3, XX, 2 * SIZE -+ LD a7, AO1, 3 * SIZE -+ LD x4, XX, 3 * SIZE -+ LD x5, XX, 4 * SIZE -+ LD x6, XX, 5 * SIZE -+ LD x7, XX, 6 * SIZE -+ addi.d I, I, -1 -+ LD x8, XX, 7 * SIZE -+ bge $r0, I, .L23 -+ .align 3 -+.L22: -+ MADD y1, a1, x1, y1 -+ LD a1, AO1, 4 * SIZE -+ MADD y3, a3, x2, y3 -+ LD a3, AO1, 5 * SIZE -+ LD x1, XX, 8 * SIZE -+ LD x2, XX, 9 * SIZE -+ MADD y1, a5, x3, y1 -+ LD a5, AO1, 6 * SIZE -+ MADD y3, a7, x4, y3 -+ LD a7, AO1, 7 * SIZE -+ LD x3, XX, 10 * SIZE -+ LD x4, XX, 11 * SIZE -+ MADD y1, a1, x5, y1 -+ LD a1, AO1, 8 * SIZE -+ MADD y3, a3, x6, y3 -+ LD a3, AO1, 9 * SIZE -+ LD x5, XX, 12 * SIZE -+ LD x6, XX, 13 * SIZE -+ MADD y1, a5, x7, y1 -+ LD a5, AO1, 10 * SIZE -+ MADD y3, a7, x8, y3 -+ LD a7, AO1, 11 * SIZE -+ LD x7, XX, 14 * SIZE -+ LD x8, XX, 15 * SIZE -+ addi.d I, I, -1 -+ addi.d XX, XX, 8 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ blt $r0, I, .L22 -+ .align 3 -+ -+.L23: -+ MADD y1, a1, x1, y1 -+ LD a1, AO1, 4 * SIZE -+ MADD y3, a3, x2, y3 -+ LD a3, AO1, 5 * SIZE -+ MADD y1, a5, x3, y1 -+ LD a5, AO1, 6 * SIZE -+ MADD y3, a7, x4, y3 -+ LD a7, AO1, 7 * SIZE -+ MADD y1, a1, x5, y1 -+ MADD y3, a3, x6, y3 -+ MADD y1, a5, x7, y1 -+ MADD y3, a7, x8, y3 -+ addi.d XX, XX, 8 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ .align 3 -+ -+.L25: -+ andi I, M, 4 -+ bge $r0, I, .L27 -+ LD a1, AO1, 0 * SIZE -+ LD x1, XX, 0 
* SIZE -+ LD a3, AO1, 1 * SIZE -+ LD x2, XX, 1 * SIZE -+ LD a5, AO1, 2 * SIZE -+ LD x3, XX, 2 * SIZE -+ MADD y1, a1, x1, y1 -+ LD a7, AO1, 3 * SIZE -+ MADD y3, a3, x2, y3 -+ LD x4, XX, 3 * SIZE -+ MADD y1, a5, x3, y1 -+ addi.d XX, XX, 4 * SIZE -+ MADD y3, a7, x4, y3 -+ addi.d AO1, AO1, 4 * SIZE -+ .align 3 -+ -+.L27: -+ andi I, M, 3 -+ ADD y1, y1, y3 -+ bge $r0, I, .L29 -+ .align 3 -+.L28: -+ LD x1, XX, 0 * SIZE -+ LD a1, AO1, 0 * SIZE -+ addi.d I, I, -1 -+ addi.d XX, XX, 1 * SIZE -+ addi.d AO1, AO1, 1 * SIZE -+ MADD y1, a1, x1, y1 -+ blt $r0, I, .L28 -+ .align 3 -+ -+.L29: -+ LD a1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ MADD a1, y1, ALPHA, a1 -+ ST a1, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 16 -+#endif -+#ifdef __64BIT__ -+ addi.d $sp, $sp, 16 -+#else -+ addi.d $sp, $sp, 32 -+#endif -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S -new file mode 100644 -index 0000000..31b1a9e ---- /dev/null -+++ b/kernel/loongarch64/iamax.S -@@ -0,0 +1,233 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r18 -+#define TEMP $r7 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+#define x1 $r17 -+#define x2 $r8 -+#define x3 $r9 -+#define x4 $r10 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ li.d x1, 0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, INCX, .L999 -+ LD a1, X, 0 * SIZE -+ addi.d N, N, -1 -+ li.d x1, 1 -+ bge $r0, N, .L999 -+ FABS s1, a1 -+ add.d X, X, INCX -+ FABS s2, a1 -+ li.d x2, 1 -+ FABS s3, a1 -+ srai.d I, N, 3 -+ FABS s4, a1 -+ li.d x3, 1 -+ li.d TEMP, 2 -+ li.d x4, 1 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD 
a6, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a8, X, 0 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ FABS t1, a1 -+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ add.d X, X, INCX -+ FABS t3, a3 -+ LD a2, X, 0 * SIZE -+ FABS t4, a4 -+ add.d X, X, INCX -+ CMPLT $fcc0, s1, t1 -+ LD a3, X, 0 * SIZE -+ CMPLT $fcc1, s2, t2 -+ add.d X, X, INCX -+ CMPLT $fcc2, s3, t3 -+ LD a4, X, 0 * SIZE -+ CMPLT $fcc3, s4, t4 -+ add.d X, X, INCX -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t2, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t3, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t4, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ addi.d I, I, -1 -+ FABS t1, a5 -+ LD a5, X, 0 * SIZE -+ FABS t2, a6 -+ add.d X, X, INCX -+ FABS t3, a7 -+ LD a6, X, 0 * SIZE -+ FABS t4, a8 -+ add.d X, X, INCX -+ CMPLT $fcc0, s1, t1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, s2, t2 -+ add.d X, X, INCX -+ CMPLT $fcc2, s3, t3 -+ LD a8, X, 0 * SIZE -+ CMPLT $fcc3, s4, t4 -+ add.d X, X, INCX -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t2, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t3, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t4, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t2 -+ CMPLT $fcc2, s3, t3 -+ CMPLT $fcc3, s4, t4 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t2, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t3, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t4, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ FABS t1, a5 -+ addi.d TEMP, TEMP, 4 -+ FABS t2, a6 -+ FABS t3, a7 -+ FABS t4, a8 -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t2 -+ CMPLT $fcc2, s3, t3 -+ CMPLT $fcc3, s4, t4 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t2, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT 
s3, s3, t3, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t4, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ addi.d x2, x2, 1 -+ addi.d x3, x3, 2 -+ addi.d x4, x4, 3 -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ FABS t1, a1 -+ addi.d I, I, -1 -+ CMPLT $fcc0, s1, t1 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ addi.d TEMP, TEMP, 1 -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L998: -+ CMPLT $fcc0, s1, s2 -+ CMPLT $fcc1, s3, s4 -+ CMOVT s1, s1, s2, $fcc0 -+ MOVT(x1, x2, $fcc0) -+ CMOVT s3, s3, s4, $fcc1 -+ MOVT(x3, x4, $fcc1) -+ CMPLT $fcc0, s1, s3 -+ CMOVT s1, s1, s3, $fcc0 -+ MOVT(x1, x3, $fcc0) -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S -new file mode 100644 -index 0000000..9364b97 ---- /dev/null -+++ b/kernel/loongarch64/iamin.S -@@ -0,0 +1,233 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r18 -+#define TEMP $r7 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+#define x1 $r17 -+#define x2 $r8 -+#define x3 $r9 -+#define x4 $r10 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ li.d x1, 0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, INCX, .L999 -+ LD a1, X, 0 * SIZE -+ addi.d N, N, -1 -+ li.d x1, 1 -+ bge $r0, N, .L999 -+ FABS s1, a1 -+ add.d X, X, INCX -+ FABS s2, a1 -+ li.d x2, 1 -+ FABS s3, a1 -+ srai.d I, N, 3 -+ FABS s4, a1 -+ li.d x3, 1 -+ li.d TEMP, 2 -+ li.d x4, 1 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD 
a6, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a8, X, 0 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ FABS t1, a1 -+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ add.d X, X, INCX -+ FABS t3, a3 -+ LD a2, X, 0 * SIZE -+ FABS t4, a4 -+ add.d X, X, INCX -+ CMPLT $fcc0, t1, s1 -+ LD a3, X, 0 * SIZE -+ CMPLT $fcc1, t2, s2 -+ add.d X, X, INCX -+ CMPLT $fcc2, t3, s3 -+ LD a4, X, 0 * SIZE -+ CMPLT $fcc3, t4, s4 -+ add.d X, X, INCX -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t2, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t3, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t4, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ addi.d I, I, -1 -+ FABS t1, a5 -+ LD a5, X, 0 * SIZE -+ FABS t2, a6 -+ add.d X, X, INCX -+ FABS t3, a7 -+ LD a6, X, 0 * SIZE -+ FABS t4, a8 -+ add.d X, X, INCX -+ CMPLT $fcc0, t1, s1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, t2, s2 -+ add.d X, X, INCX -+ CMPLT $fcc2, t3, s3 -+ LD a8, X, 0 * SIZE -+ CMPLT $fcc3, t4, s4 -+ add.d X, X, INCX -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t2, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t3, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t4, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ CMPLT $fcc0, t1, s1 -+ CMPLT $fcc1, t2, s2 -+ CMPLT $fcc2, t3, s3 -+ CMPLT $fcc3, t4, s4 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t2, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t3, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t4, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ FABS t1, a5 -+ addi.d TEMP, TEMP, 4 -+ FABS t2, a6 -+ FABS t3, a7 -+ FABS t4, a8 -+ CMPLT $fcc0, t1, s1 -+ CMPLT $fcc1, t2, s2 -+ CMPLT $fcc2, t3, s3 -+ CMPLT $fcc3, t4, s4 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t2, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT 
s3, s3, t3, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t4, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ addi.d x2, x2, 1 -+ addi.d x3, x3, 2 -+ addi.d x4, x4, 3 -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ FABS t1, a1 -+ addi.d I, I, -1 -+ CMPLT $fcc0, t1, s1 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ addi.d TEMP, TEMP, 1 -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L998: -+ CMPLT $fcc0, s2, s1 -+ CMPLT $fcc1, s4, s3 -+ CMOVT s1, s1, s2, $fcc0 -+ MOVT(x1, x2, $fcc0) -+ CMOVT s3, s3, s4, $fcc1 -+ MOVT(x3, x4, $fcc1) -+ CMPLT $fcc0, s3, s1 -+ CMOVT s1, s1, s3, $fcc0 -+ MOVT(x1, x3, $fcc0) -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S -new file mode 100644 -index 0000000..8d3ae52 ---- /dev/null -+++ b/kernel/loongarch64/izamax.S -@@ -0,0 +1,217 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r18 -+#define TEMP $r7 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+#define t5 $f4 -+#define t6 $f5 -+#define t7 $f6 -+#define t8 $f7 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+#define x1 $r17 -+#define x2 $r8 -+#define x3 $r9 -+#define x4 $r10 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ li.d x1, 0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, ZBASE_SHIFT -+ bge $r0, INCX, .L999 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ FABS t1, a1 -+ FABS t2, a2 -+ ADD s1, t1, t2 -+ ADD s2, t1, t2 -+ ADD s3, t1, t2 -+ ADD s4, t1, t2 -+ addi.d N, N, -1 -+ li.d x1, 1 -+ bge $r0, N, .L999 -+ add.d X, X, INCX -+ li.d x2, 1 -+ srai.d I, N, 2 -+ li.d x3, 1 -+ li.d TEMP, 2 -+ li.d x4, 1 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * 
SIZE -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ LD a8, X, 1 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ FABS t1, a1 -+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ LD a2, X, 1 * SIZE -+ FABS t3, a3 -+ add.d X, X, INCX -+ FABS t4, a4 -+ FABS t5, a5 -+ LD a3, X, 0 * SIZE -+ FABS t6, a6 -+ LD a4, X, 1 * SIZE -+ FABS t7, a7 -+ add.d X, X, INCX -+ FABS t8, a8 -+ ADD t1, t1, t2 -+ LD a5, X, 0 * SIZE -+ ADD t3, t3, t4 -+ LD a6, X, 1 * SIZE -+ ADD t5, t5, t6 -+ add.d X, X, INCX -+ ADD t7, t7, t8 -+ CMPLT $fcc0, s1, t1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, s2, t3 -+ LD a8, X, 1 * SIZE -+ CMPLT $fcc2, s3, t5 -+ add.d X, X, INCX -+ CMPLT $fcc3, s4, t7 -+ addi.d I, I, -1 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t3, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t5, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t7, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ FABS t5, a5 -+ FABS t6, a6 -+ FABS t7, a7 -+ FABS t8, a8 -+ ADD t1, t1, t2 -+ ADD t3, t3, t4 -+ ADD t5, t5, t6 -+ ADD t7, t7, t8 -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t3 -+ CMPLT $fcc2, s3, t5 -+ CMPLT $fcc3, s4, t7 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t3, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t5, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t7, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ addi.d x2, x2, 1 -+ addi.d x3, x3, 2 -+ addi.d x4, x4, 3 -+ .align 3 -+ -+.L15: -+ andi I, N, 3 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ FABS t1, a1 -+ FABS t2, a2 -+ ADD t1, t1, t2 -+ addi.d I, I, -1 -+ CMPLT $fcc0, s1, t1 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ addi.d TEMP, TEMP, 1 -+ blt $r0, I, .L16 -+ .align 3 -+ 
-+.L998: -+ CMPLT $fcc0, s1, s2 -+ CMPLT $fcc1, s3, s4 -+ CMOVT s1, s1, s2, $fcc0 -+ MOVT(x1, x2, $fcc0) -+ CMOVT s3, s3, s4, $fcc1 -+ MOVT(x3, x4, $fcc1) -+ CMPLT $fcc0, s1, s3 -+ CMOVT s1, s1, s3, $fcc0 -+ MOVT(x1, x3, $fcc0) -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S -new file mode 100644 -index 0000000..38a109c ---- /dev/null -+++ b/kernel/loongarch64/izamin.S -@@ -0,0 +1,217 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r18 -+#define TEMP $r7 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+#define t5 $f4 -+#define t6 $f5 -+#define t7 $f6 -+#define t8 $f7 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+#define x1 $r17 -+#define x2 $r8 -+#define x3 $r9 -+#define x4 $r10 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ li.d x1, 0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, ZBASE_SHIFT -+ bge $r0, INCX, .L999 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ FABS t1, a1 -+ FABS t2, a2 -+ ADD s1, t1, t2 -+ ADD s2, t1, t2 -+ ADD s3, t1, t2 -+ ADD s4, t1, t2 -+ addi.d N, N, -1 -+ li.d x1, 1 -+ bge $r0, N, .L999 -+ add.d X, X, INCX -+ li.d x2, 1 -+ srai.d I, N, 2 -+ li.d x3, 1 -+ li.d TEMP, 2 -+ li.d x4, 1 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ LD a8, X, 1 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ FABS t1, a1 
-+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ LD a2, X, 1 * SIZE -+ FABS t3, a3 -+ add.d X, X, INCX -+ FABS t4, a4 -+ FABS t5, a5 -+ LD a3, X, 0 * SIZE -+ FABS t6, a6 -+ LD a4, X, 1 * SIZE -+ FABS t7, a7 -+ add.d X, X, INCX -+ FABS t8, a8 -+ ADD t1, t1, t2 -+ LD a5, X, 0 * SIZE -+ ADD t3, t3, t4 -+ LD a6, X, 1 * SIZE -+ ADD t5, t5, t6 -+ add.d X, X, INCX -+ ADD t7, t7, t8 -+ CMPLT $fcc0, t1, s1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, t3, s2 -+ LD a8, X, 1 * SIZE -+ CMPLT $fcc2, t5, s3 -+ add.d X, X, INCX -+ CMPLT $fcc3, t7, s4 -+ addi.d I, I, -1 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t3, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t5, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t7, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ FABS t5, a5 -+ FABS t6, a6 -+ FABS t7, a7 -+ FABS t8, a8 -+ ADD t1, t1, t2 -+ ADD t3, t3, t4 -+ ADD t5, t5, t6 -+ ADD t7, t7, t8 -+ CMPLT $fcc0, t1, s1 -+ CMPLT $fcc1, t3, s2 -+ CMPLT $fcc2, t5, s3 -+ CMPLT $fcc3, t7, s4 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ CMOVT s2, s2, t3, $fcc1 -+ MOVT(x2, TEMP, $fcc1) -+ CMOVT s3, s3, t5, $fcc2 -+ MOVT(x3, TEMP, $fcc2) -+ CMOVT s4, s4, t7, $fcc3 -+ MOVT(x4, TEMP, $fcc3) -+ addi.d TEMP, TEMP, 4 -+ addi.d x2, x2, 1 -+ addi.d x3, x3, 2 -+ addi.d x4, x4, 3 -+ .align 3 -+ -+.L15: -+ andi I, N, 3 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ FABS t1, a1 -+ FABS t2, a2 -+ ADD t1, t1, t2 -+ addi.d I, I, -1 -+ CMPLT $fcc0, t1, s1 -+ CMOVT s1, s1, t1, $fcc0 -+ MOVT(x1, TEMP, $fcc0) -+ addi.d TEMP, TEMP, 1 -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L998: -+ CMPLT $fcc0, s2, s1 -+ CMPLT $fcc1, s4, s3 -+ CMOVT s1, s1, s2, $fcc0 -+ MOVT(x1, x2, $fcc0) -+ CMOVT s3, s3, s4, $fcc1 -+ MOVT(x3, x4, $fcc1) -+ CMPLT $fcc0, s3, s1 -+ CMOVT s1, s1, s3, $fcc0 -+ MOVT(x1, x3, $fcc0) -+ .align 3 -+ -+.L999: -+ 
move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/max.S b/kernel/loongarch64/max.S -new file mode 100644 -index 0000000..56c3f99 ---- /dev/null -+++ b/kernel/loongarch64/max.S -@@ -0,0 +1,174 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ MTC s1, $r0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, INCX, .L999 -+ LD s1, X, 0 * SIZE -+ addi.d N, N, -1 -+ add.d X, X, INCX -+ MOV s2, s1 -+ bge $r0, N, .L999 -+ MOV s3, s1 -+ srai.d I, N, 3 -+ MOV s4, s1 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a6, X, 0 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ CMPLT $fcc0, s1, a1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, s2, a2 -+ add.d X, X, INCX -+ CMPLT $fcc2, s3, a3 -+ LD a8, X, 0 * SIZE -+ CMPLT $fcc3, s4, a4 -+ add.d X, X, INCX -+ CMOVT s1, s1, a1, $fcc0 -+ LD a1, X, 0 * SIZE -+ CMOVT s2, s2, a2, $fcc1 -+ add.d X, X, INCX -+ CMOVT s3, s3, a3, $fcc2 -+ LD a2, X, 0 * SIZE -+ CMOVT s4, s4, a4, $fcc3 -+ add.d X, X, INCX -+ CMPLT $fcc0, s1, a5 -+ LD a3, X, 0 * SIZE -+ CMPLT $fcc1, s2, a6 -+ add.d X, X, INCX -+ CMPLT $fcc2, s3, a7 -+ LD a4, X, 0 * SIZE -+ CMPLT $fcc3, s4, a8 -+ add.d X, X, INCX -+ CMOVT s1, s1, a5, $fcc0 -+ LD a5, X, 0 * SIZE -+ CMOVT s2, s2, a6, $fcc1 -+ add.d X, X, INCX -+ CMOVT s3, s3, a7, $fcc2 -+ LD a6, X, 0 * SIZE -+ CMOVT s4, s4, a8, $fcc3 -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ CMPLT $fcc0, s1, a1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, s2, a2 -+ add.d X, X, INCX -+ CMPLT $fcc2, s3, 
a3 -+ LD a8, X, 0 * SIZE -+ CMPLT $fcc3, s4, a4 -+ add.d X, X, INCX -+ CMOVT s1, s1, a1, $fcc0 -+ CMOVT s2, s2, a2, $fcc1 -+ CMOVT s3, s3, a3, $fcc2 -+ CMOVT s4, s4, a4, $fcc3 -+ CMPLT $fcc0, s1, a5 -+ CMPLT $fcc1, s2, a6 -+ CMPLT $fcc2, s3, a7 -+ CMPLT $fcc3, s4, a8 -+ CMOVT s1, s1, a5, $fcc0 -+ CMOVT s2, s2, a6, $fcc1 -+ CMOVT s3, s3, a7, $fcc2 -+ CMOVT s4, s4, a8, $fcc3 -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ CMPLT $fcc0, s1, a1 -+ CMOVT s1, s1, a1, $fcc0 -+ add.d X, X, INCX -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L998: -+ CMPLT $fcc0, s1, s2 -+ CMPLT $fcc1, s3, s4 -+ CMOVT s1, s1, s2, $fcc0 -+ CMOVT s3, s3, s4, $fcc1 -+ CMPLT $fcc0, s1, s3 -+ CMOVT s1, s1, s3, $fcc0 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/min.S b/kernel/loongarch64/min.S -new file mode 100644 -index 0000000..bb2fcfb ---- /dev/null -+++ b/kernel/loongarch64/min.S -@@ -0,0 +1,174 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ MTC s1, $r0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, INCX, .L999 -+ LD s1, X, 0 * SIZE -+ addi.d N, N, -1 -+ add.d X, X, INCX -+ MOV s2, s1 -+ bge $r0, N, .L999 -+ MOV s3, s1 -+ srai.d I, N, 3 -+ MOV s4, s1 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a6, X, 0 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ CMPLT $fcc0, a1, s1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, a2, s2 -+ add.d X, X, INCX -+ CMPLT $fcc2, a3, s3 -+ LD a8, X, 0 * 
SIZE -+ CMPLT $fcc3, a4, s4 -+ add.d X, X, INCX -+ CMOVT s1, s1, a1, $fcc0 -+ LD a1, X, 0 * SIZE -+ CMOVT s2, s2, a2, $fcc1 -+ add.d X, X, INCX -+ CMOVT s3, s3, a3, $fcc2 -+ LD a2, X, 0 * SIZE -+ CMOVT s4, s4, a4, $fcc3 -+ add.d X, X, INCX -+ CMPLT $fcc0, a5, s1 -+ LD a3, X, 0 * SIZE -+ CMPLT $fcc1, a6, s2 -+ add.d X, X, INCX -+ CMPLT $fcc2, a7, s3 -+ LD a4, X, 0 * SIZE -+ CMPLT $fcc3, a8, s4 -+ add.d X, X, INCX -+ CMOVT s1, s1, a5, $fcc0 -+ LD a5, X, 0 * SIZE -+ CMOVT s2, s2, a6, $fcc1 -+ add.d X, X, INCX -+ CMOVT s3, s3, a7, $fcc2 -+ LD a6, X, 0 * SIZE -+ CMOVT s4, s4, a8, $fcc3 -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ CMPLT $fcc0, a1, s1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, a2, s2 -+ add.d X, X, INCX -+ CMPLT $fcc2, a3, s3 -+ LD a8, X, 0 * SIZE -+ CMPLT $fcc3, a4, s4 -+ add.d X, X, INCX -+ CMOVT s1, s1, a1, $fcc0 -+ CMOVT s2, s2, a2, $fcc1 -+ CMOVT s3, s3, a3, $fcc2 -+ CMOVT s4, s4, a4, $fcc3 -+ CMPLT $fcc0, a5, s1 -+ CMPLT $fcc1, a6, s2 -+ CMPLT $fcc2, a7, s3 -+ CMPLT $fcc3, a8, s4 -+ CMOVT s1, s1, a5, $fcc0 -+ CMOVT s2, s2, a6, $fcc1 -+ CMOVT s3, s3, a7, $fcc2 -+ CMOVT s4, s4, a8, $fcc3 -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ CMPLT $fcc0, a1, s1 -+ CMOVT s1, s1, a1, $fcc0 -+ add.d X, X, INCX -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L998: -+ CMPLT $fcc0, s2, s1 -+ CMPLT $fcc1, s4, s3 -+ CMOVT s1, s1, s2, $fcc0 -+ CMOVT s3, s3, s4, $fcc1 -+ CMPLT $fcc0, s3, s1 -+ CMOVT s1, s1, s3, $fcc0 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S -new file mode 100644 -index 0000000..566bce6 ---- /dev/null -+++ b/kernel/loongarch64/scal.S -@@ -0,0 +1,330 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. 
-+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r7 -+#define INCX $r8 -+ -+#define I $r17 -+#define TEMP $r18 -+#define XX $r5 -+#define ALPHA $f0 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f23 -+#define a4 $f9 -+#define a5 $f10 -+#define a6 $f11 -+#define a7 $f12 -+#define a8 $f13 -+#define t1 $f14 -+#define t2 $f15 -+#define t3 $f16 -+#define t4 $f17 -+ -+ PROLOGUE -+ -+ li.d TEMP, SIZE -+ MTC a1, $r0 -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, N, .L999 -+ CMPEQ $fcc0, ALPHA, a1 -+ bceqz $fcc0, .L50 -+ srai.d I, N, 3 -+ bne INCX, TEMP, .L20 -+ bge $r0, I, .L15 -+ .align 3 -+ -+.L12: -+ ST a1, X, 0 * SIZE -+ ST a1, X, 1 * SIZE -+ ST a1, X, 2 * SIZE -+ ST a1, X, 3 * SIZE -+ ST a1, X, 4 * SIZE -+ ST a1, X, 5 * SIZE -+ ST a1, X, 6 * SIZE -+ ST a1, X, 7 * SIZE -+ addi.w I, I, -1 -+ addi.d X, X, 8 * SIZE -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+.L16: -+ ST a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ addi.d X, X, SIZE -+ blt $r0, I, .L16 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ .align 3 -+ -+.L20: -+ srai.d I, N, 3 -+ bge $r0, I, .L25 -+ .align 3 -+ -+.L22: -+ ST a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ blt $r0, I, .L22 -+ .align 3 -+ -+.L25: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+.L26: -+ addi.d I, I, -1 -+ ST a1, X, 0 * SIZE -+ add.d X, X, INCX -+ blt $r0, I, .L26 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ .align 3 -+ -+.L50: -+ srai.d I, N, 3 -+ bne INCX, TEMP, .L60 -+ addi.d I, I, -1 -+ blt I, $r0, .L55 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * 
SIZE -+ LD a3, X, 2 * SIZE -+ LD a4, X, 3 * SIZE -+ LD a5, X, 4 * SIZE -+ LD a6, X, 5 * SIZE -+ LD a7, X, 6 * SIZE -+ LD a8, X, 7 * SIZE -+ bge $r0, I, .L53 -+ .align 3 -+ -+.L52: -+ MUL t1, ALPHA, a1 -+ LD a1, X, 8 * SIZE -+ MUL t2, ALPHA, a2 -+ LD a2, X, 9 * SIZE -+ MUL t3, ALPHA, a3 -+ LD a3, X, 10 * SIZE -+ MUL t4, ALPHA, a4 -+ LD a4, X, 11 * SIZE -+ ST t1, X, 0 * SIZE -+ MUL t1, ALPHA, a5 -+ LD a5, X, 12 * SIZE -+ ST t2, X, 1 * SIZE -+ MUL t2, ALPHA, a6 -+ LD a6, X, 13 * SIZE -+ ST t3, X, 2 * SIZE -+ MUL t3, ALPHA, a7 -+ LD a7, X, 14 * SIZE -+ ST t4, X, 3 * SIZE -+ MUL t4, ALPHA, a8 -+ LD a8, X, 15 * SIZE -+ addi.d I, I, -1 -+ ST t1, X, 4 * SIZE -+ ST t2, X, 5 * SIZE -+ ST t3, X, 6 * SIZE -+ ST t4, X, 7 * SIZE -+ addi.d X, X, 8 * SIZE -+ blt $r0, I, .L52 -+ .align 3 -+ -+.L53: -+ MUL t1, ALPHA, a1 -+ MUL t2, ALPHA, a2 -+ MUL t3, ALPHA, a3 -+ MUL t4, ALPHA, a4 -+ ST t1, X, 0 * SIZE -+ MUL t1, ALPHA, a5 -+ ST t2, X, 1 * SIZE -+ MUL t2, ALPHA, a6 -+ ST t3, X, 2 * SIZE -+ MUL t3, ALPHA, a7 -+ ST t4, X, 3 * SIZE -+ MUL t4, ALPHA, a8 -+ ST t1, X, 4 * SIZE -+ ST t2, X, 5 * SIZE -+ ST t3, X, 6 * SIZE -+ ST t4, X, 7 * SIZE -+ addi.d X, X, 8 * SIZE -+ .align 3 -+ -+.L55: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+.L56: -+ LD a1, X, 0 * SIZE -+ MUL t1, ALPHA, a1 -+ addi.d X, X, SIZE -+ addi.d I, I, -1 -+ ST t1, X, -1 * SIZE -+ blt $r0, I, .L56 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ .align 3 -+ -+.L60: -+ srai.d I, N, 3 -+ move XX, X -+ addi.d I, I, -1 -+ blt I, $r0, .L65 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a8, X, 0 * SIZE -+ add.d X, X, INCX -+ bge $r0, I, .L63 -+ .align 3 -+ -+.L62: -+ MUL t1, ALPHA, a1 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ MUL t2, ALPHA, a2 -+ LD a2, X, 0 
* SIZE -+ add.d X, X, INCX -+ MUL t3, ALPHA, a3 -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ MUL t4, ALPHA, a4 -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ ST t1, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t2, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t3, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t4, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ MUL t1, ALPHA, a5 -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ MUL t2, ALPHA, a6 -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ MUL t3, ALPHA, a7 -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ MUL t4, ALPHA, a8 -+ LD a8, X, 0 * SIZE -+ add.d X, X, INCX -+ ST t1, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t2, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t3, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t4, XX, 0 * SIZE -+ addi.d I, I, -1 -+ add.d XX, XX, INCX -+ blt $r0, I, .L62 -+ .align 3 -+ -+.L63: -+ MUL t1, ALPHA, a1 -+ MUL t2, ALPHA, a2 -+ MUL t3, ALPHA, a3 -+ MUL t4, ALPHA, a4 -+ ST t1, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t2, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t3, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t4, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ MUL t1, ALPHA, a5 -+ MUL t2, ALPHA, a6 -+ MUL t3, ALPHA, a7 -+ MUL t4, ALPHA, a8 -+ ST t1, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t2, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t3, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST t4, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ .align 3 -+ -+.L65: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+.L66: -+ LD a1, X, 0 * SIZE -+ MUL t1, ALPHA, a1 -+ addi.d I, I, -1 -+ ST t1, X, 0 * SIZE -+ add.d X, X, INCX -+ blt $r0, I, .L66 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S -new file mode 100644 -index 0000000..57c21a0 ---- /dev/null -+++ b/kernel/loongarch64/snrm2.S -@@ -0,0 +1,249 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. 
-+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f12 -+#define a2 $f13 -+#define a3 $f14 -+#define a4 $f15 -+#define a5 $f16 -+#define a6 $f17 -+#define a7 $f0 -+#define a8 $f1 -+#define s1 $f22 -+#define s2 $f8 -+#define t1 $f23 -+#define t2 $f9 -+#define t3 $f10 -+#define t4 $f11 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ movgr2fr.d s1, $r0 -+ li.d TEMP, SIZE -+ fmov.d s2, s1 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, INCX, .L999 -+ srai.d I, N, 3 -+ bne INCX, TEMP, .L20 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ LD a3, X, 2 * SIZE -+ LD a4, X, 3 * SIZE -+ LD a5, X, 4 * SIZE -+ addi.d I, I, -1 -+ fcvt.d.s t1, a1 -+ LD a6, X, 5 * SIZE -+ fcvt.d.s t2, a2 -+ LD a7, X, 6 * SIZE -+ fcvt.d.s t3, a3 -+ LD a8, X, 7 * SIZE -+ fcvt.d.s t4, a4 -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ fmadd.d s1, t1, t1, s1 -+ LD a1, X, 8 * SIZE -+ fcvt.d.s t1, a5 -+ NOP -+ fmadd.d s2, t2, t2, s2 -+ LD a2, X, 9 * SIZE -+ fcvt.d.s t2, a6 -+ NOP -+ fmadd.d s1, t3, t3, s1 -+ LD a3, X, 10 * SIZE -+ fcvt.d.s t3, a7 -+ NOP -+ fmadd.d s2, t4, t4, s2 -+ LD a4, X, 11 * SIZE -+ fcvt.d.s t4, a8 -+ NOP -+ fmadd.d s1, t1, t1, s1 -+ LD a5, X, 12 * SIZE -+ fcvt.d.s t1, a1 -+ NOP -+ fmadd.d s2, t2, t2, s2 -+ LD a6, X, 13 * SIZE -+ fcvt.d.s t2, a2 -+ addi.d I, I, -1 -+ fmadd.d s1, t3, t3, s1 -+ LD a7, X, 14 * SIZE -+ fcvt.d.s t3, a3 -+ addi.d X, X, 8 * SIZE -+ fmadd.d s2, t4, t4, s2 -+ LD a8, X, 7 * SIZE -+ fcvt.d.s t4, a4 -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ fmadd.d s1, t1, t1, s1 -+ fcvt.d.s t1, a5 -+ fmadd.d s2, t2, t2, s2 -+ fcvt.d.s t2, a6 -+ fmadd.d s1, t3, t3, s1 -+ fcvt.d.s t3, a7 -+ fmadd.d s2, t4, t4, s2 -+ fcvt.d.s t4, a8 -+ fmadd.d s1, t1, t1, s1 -+ fmadd.d s2, t2, t2, s2 -+ fmadd.d s1, t3, t3, s1 -+ 
fmadd.d s2, t4, t4, s2 -+ addi.d X, X, 8 * SIZE -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ fcvt.d.s t1, a1 -+ fmadd.d s1, t1, t1, s1 -+ addi.d X, X, SIZE -+ blt $r0, I, .L16 -+ b .L999 -+ .align 3 -+ -+.L20: -+ bge $r0, I, .L25 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ LD a8, X, 0 * SIZE -+ addi.d I, I, -1 -+ fcvt.d.s t1, a1 -+ fcvt.d.s t2, a2 -+ fcvt.d.s t3, a3 -+ fcvt.d.s t4, a4 -+ add.d X, X, INCX -+ bge $r0, I, .L24 -+ .align 3 -+ -+.L23: -+ fmadd.d s1, t1, t1, s1 -+ LD a1, X, 0 * SIZE -+ fcvt.d.s t1, a5 -+ add.d X, X, INCX -+ fmadd.d s2, t2, t2, s2 -+ LD a2, X, 0 * SIZE -+ fcvt.d.s t2, a6 -+ add.d X, X, INCX -+ fmadd.d s1, t3, t3, s1 -+ LD a3, X, 0 * SIZE -+ fcvt.d.s t3, a7 -+ add.d X, X, INCX -+ fmadd.d s2, t4, t4, s2 -+ LD a4, X, 0 * SIZE -+ fcvt.d.s t4, a8 -+ add.d X, X, INCX -+ fmadd.d s1, t1, t1, s1 -+ LD a5, X, 0 * SIZE -+ fcvt.d.s t1, a1 -+ add.d X, X, INCX -+ fmadd.d s2, t2, t2, s2 -+ LD a6, X, 0 * SIZE -+ fcvt.d.s t2, a2 -+ add.d X, X, INCX -+ fmadd.d s1, t3, t3, s1 -+ LD a7, X, 0 * SIZE -+ fcvt.d.s t3, a3 -+ add.d X, X, INCX -+ fmadd.d s2, t4, t4, s2 -+ LD a8, X, 0 * SIZE -+ fcvt.d.s t4, a4 -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ blt $r0, I, .L23 -+ .align 3 -+ -+.L24: -+ fmadd.d s1, t1, t1, s1 -+ fcvt.d.s t1, a5 -+ fmadd.d s2, t2, t2, s2 -+ fcvt.d.s t2, a6 -+ fmadd.d s1, t3, t3, s1 -+ fcvt.d.s t3, a7 -+ fmadd.d s2, t4, t4, s2 -+ fcvt.d.s t4, a8 -+ fmadd.d s1, t1, t1, s1 -+ fmadd.d s2, t2, t2, s2 -+ fmadd.d s1, t3, t3, s1 -+ fmadd.d s2, t4, t4, s2 -+ .align 3 -+ -+.L25: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+ -+.L26: -+ LD a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ fcvt.d.s t1, a1 -+ add.d X, X, INCX -+ 
fmadd.d s1, t1, t1, s1 -+ blt $r0, I, .L26 -+ .align 3 -+ -+.L999: -+ fadd.d s1, s1, s2 -+ fsqrt.d s1, s1 -+ move $r4, $r17 -+ fcvt.s.d $f0, s1 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S -new file mode 100644 -index 0000000..4578a8d ---- /dev/null -+++ b/kernel/loongarch64/swap.S -@@ -0,0 +1,330 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r7 -+#define INCX $r8 -+#define Y $r9 -+#define INCY $r10 -+ -+#define I $r17 -+#define TEMP $r18 -+#define XX $r5 -+#define YY $r6 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f23 -+#define a4 $f9 -+#define a5 $f10 -+#define a6 $f11 -+#define a7 $f12 -+#define a8 $f13 -+#define b1 $f14 -+#define b2 $f15 -+#define b3 $f16 -+#define b4 $f17 -+#define b5 $f0 -+#define b6 $f1 -+#define b7 $f2 -+#define b8 $f3 -+ -+ PROLOGUE -+ -+ li.d TEMP, SIZE -+ slli.d INCX, INCX, BASE_SHIFT -+ bge $r0, N, .L999 -+ slli.d INCY, INCY, BASE_SHIFT -+ bne INCX, TEMP, .L20 -+ srai.d I, N, 3 -+ bne INCY, TEMP, .L20 -+ addi.d I, I, -1 -+ blt I, $r0, .L15 -+ LD a1, X, 0 * SIZE -+ LD b1, Y, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ LD b2, Y, 1 * SIZE -+ LD a3, X, 2 * SIZE -+ LD b3, Y, 2 * SIZE -+ LD a4, X, 3 * SIZE -+ LD b4, Y, 3 * SIZE -+ LD a5, X, 4 * SIZE -+ LD b5, Y, 4 * SIZE -+ LD a6, X, 5 * SIZE -+ LD b6, Y, 5 * SIZE -+ LD a7, X, 6 * SIZE -+ LD b7, Y, 6 * SIZE -+ LD a8, X, 7 * SIZE -+ LD b8, Y, 7 * SIZE -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ ST a1, Y, 0 * SIZE -+ LD a1, X, 8 * SIZE -+ ST b1, X, 0 * SIZE -+ LD b1, Y, 8 * SIZE -+ ST a2, Y, 1 * SIZE -+ LD a2, X, 9 * SIZE -+ ST b2, X, 1 * SIZE -+ LD b2, Y, 9 * SIZE -+ ST a3, Y, 2 * SIZE -+ LD a3, X, 10 * SIZE -+ ST b3, X, 2 * SIZE -+ LD b3, Y, 10 * SIZE -+ ST a4, Y, 3 * SIZE -+ LD a4, X, 11 * SIZE -+ ST b4, X, 3 * SIZE -+ LD b4, Y, 11 * SIZE -+ ST a5, Y, 4 * SIZE -+ LD a5, X, 12 * SIZE -+ ST b5, X, 4 * SIZE -+ LD b5, Y, 12 * SIZE -+ ST a6, Y, 5 * SIZE -+ LD a6, X, 13 * SIZE -+ ST b6, X, 5 * SIZE -+ LD b6, Y, 13 * SIZE -+ ST a7, Y, 6 * SIZE -+ LD a7, X, 14 * SIZE -+ ST b7, X, 6 * SIZE -+ LD b7, Y, 14 * SIZE -+ ST a8, Y, 7 * SIZE -+ LD a8, X, 15 * SIZE -+ ST b8, X, 7 * SIZE -+ LD b8, Y, 15 * SIZE -+ addi.d I, I, -1 -+ addi.d X, X, 8 * SIZE -+ addi.d Y, Y, 8 * SIZE -+ blt $r0, I, 
.L12 -+ .align 3 -+ -+.L13: -+ ST a1, Y, 0 * SIZE -+ ST b1, X, 0 * SIZE -+ ST a2, Y, 1 * SIZE -+ ST b2, X, 1 * SIZE -+ ST a3, Y, 2 * SIZE -+ ST b3, X, 2 * SIZE -+ ST a4, Y, 3 * SIZE -+ ST b4, X, 3 * SIZE -+ ST a5, Y, 4 * SIZE -+ ST b5, X, 4 * SIZE -+ ST a6, Y, 5 * SIZE -+ ST b6, X, 5 * SIZE -+ ST a7, Y, 6 * SIZE -+ ST b7, X, 6 * SIZE -+ ST a8, Y, 7 * SIZE -+ ST b8, X, 7 * SIZE -+ addi.d X, X, 8 * SIZE -+ addi.d Y, Y, 8 * SIZE -+ .align 3 -+ -+.L15: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+.L16: -+ LD a1, X, 0 * SIZE -+ LD b1, Y, 0 * SIZE -+ addi.d X, X, SIZE -+ addi.d I, I, -1 -+ addi.d Y, Y, SIZE -+ ST b1, X, -1 * SIZE -+ ST a1, Y, -1 * SIZE -+ blt $r0, I, .L16 -+ b .L999 -+ .align 3 -+ -+.L20: -+ srai.d I, N, 3 -+ move XX, X -+ move YY, Y -+ addi.d I, I, -1 -+ blt I, $r0, .L25 -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b2, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b3, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b4, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b5, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b6, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b7, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ LD a8, X, 0 * SIZE -+ add.d X, X, INCX -+ LD b8, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ bge $r0, I, .L23 -+ .align 3 -+ -+.L22: -+ ST a1, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ LD a1, X, 0 * SIZE -+ add.d X, X, INCX -+ ST b1, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD b1, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a2, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ LD a2, X, 0 * SIZE -+ add.d X, X, INCX -+ ST b2, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD b2, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a3, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ LD a3, X, 0 * SIZE -+ add.d X, X, INCX -+ ST b3, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ 
LD b3, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a4, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ LD a4, X, 0 * SIZE -+ add.d X, X, INCX -+ ST b4, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD b4, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a5, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ LD a5, X, 0 * SIZE -+ add.d X, X, INCX -+ ST b5, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD b5, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a6, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ LD a6, X, 0 * SIZE -+ add.d X, X, INCX -+ ST b6, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD b6, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a7, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ LD a7, X, 0 * SIZE -+ add.d X, X, INCX -+ ST b7, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD b7, Y, 0 * SIZE -+ add.d Y, Y, INCY -+ ST a8, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ LD a8, X, 0 * SIZE -+ add.d X, X, INCX -+ ST b8, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ LD b8, Y, 0 * SIZE -+ addi.d I, I, -1 -+ add.d Y, Y, INCY -+ blt $r0, I, .L22 -+ .align 3 -+ -+.L23: -+ ST a1, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ ST b1, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST a2, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ ST b2, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST a3, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ ST b3, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST a4, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ ST b4, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST a5, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ ST b5, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST a6, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ ST b6, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST a7, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ ST b7, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ ST a8, YY, 0 * SIZE -+ add.d YY, YY, INCY -+ ST b8, XX, 0 * SIZE -+ add.d XX, XX, INCX -+ .align 3 -+ -+.L25: -+ andi I, N, 7 -+ bge $r0, I, .L999 -+ .align 3 -+.L26: -+ LD a1, X, 0 * SIZE -+ LD b1, Y, 0 * SIZE -+ addi.d I, I, -1 -+ ST a1, Y, 0 * SIZE -+ ST b1, X, 0 * SIZE -+ add.d X, X, INCX -+ add.d Y, Y, INCY -+ blt $r0, I, .L26 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 
-+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/trsm_kernel_LN.S b/kernel/loongarch64/trsm_kernel_LN.S -new file mode 100644 -index 0000000..a0bd29f ---- /dev/null -+++ b/kernel/loongarch64/trsm_kernel_LN.S -@@ -0,0 +1,2863 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define K $r6 -+#define A $r7 -+#define B $r8 -+#define C $r9 -+#define LDC $r10 -+#define OFFSET $r11 -+#define AO $r12 -+#define BO $r13 -+#define I $r17 -+#define J $r18 -+#define L $r29 -+#define CO1 $r14 -+#define CO2 $r15 -+#define CO3 $r23 -+#define CO4 $r24 -+#define CO5 $r25 -+#define CO6 $r26 -+#define CO7 $r27 -+#define CO8 $r28 -+#define KK $r30 -+#define TEMP $r20 -+#define AORIG $r16 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f27 -+#define a4 $f28 -+#define b1 $f23 -+#define b2 $f9 -+#define b3 $f10 -+#define b4 $f11 -+#define b5 $f12 -+#define b6 $f13 -+#define b7 $f14 -+#define b8 $f15 -+#define a5 b8 -+#define c11 $f16 -+#define c12 $f17 -+#define c21 $f3 -+#define c22 $f1 -+#define c31 $f2 -+#define c32 $f4 -+#define c41 $f5 -+#define c42 $f6 -+#define c51 $f7 -+#define c52 $f18 -+#define c61 $f19 -+#define c62 $f20 -+#define c71 $f21 -+#define c72 $f24 -+#define c81 $f25 -+#define c82 $f26 -+#define ALPHA $f0 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -144 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 -+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ SDARG $r28, $sp, 40 -+ fst.d $f24, $sp, 48 -+ fst.d $f25, $sp, 56 -+ fst.d $f26, $sp, 64 -+ fst.d $f27, $sp, 72 -+ fst.d $f28, $sp, 80 -+ SDARG $r29, $sp, 88 -+ SDARG $r30, $sp, 96 -+ SDARG $r20, $sp, 104 -+ SDARG $r16, $sp, 112 -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 112 -+ fst.d $f19, $sp, 120 -+ fst.d $f20, $sp, 128 -+ fst.d $f21, $sp, 136 -+#endif -+ slli.d LDC, LDC, BASE_SHIFT -+#ifdef LN -+ mul.w TEMP, M, K -+ slli.d TEMP, TEMP, BASE_SHIFT -+ add.d A, A, TEMP -+ slli.d TEMP, M, BASE_SHIFT -+ add.d C, C, TEMP -+#endif -+#ifdef RN -+ neg KK, OFFSET -+#endif -+#ifdef RT -+ mul.w TEMP, N, K -+ slli.d TEMP, TEMP, BASE_SHIFT -+ add.d B, B, TEMP -+ mul.w TEMP, N, LDC -+ add.d C, C, TEMP -+ sub.d KK, N, OFFSET -+#endif 
-+ srai.d J, N, 3 -+nop -+ bge $r0, J, .L30 -+.L10: -+#ifdef RT -+ slli.d TEMP, K, 3 + BASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 3 -+ sub.d C, C, TEMP -+#endif -+ move CO1, C -+MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ addi.d J, J, -1 -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ add.d CO5, CO4, LDC -+ MOV c31, c11 -+ add.d CO6, CO5, LDC -+ MOV c41, c11 -+ add.d CO7, CO6, LDC -+ MOV c51, c11 -+ add.d CO8, CO7, LDC -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO8, LDC -+#endif -+ andi I, M, 1 -+ MOV c61, c11 -+MOV c71, c11 -+ bge $r0, I, .L20 -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, KK, 2 -+ MOV c81, c11 -+move BO, B -+ bge $r0, L, .L25 -+#else -+#ifdef LN -+ slli.d TEMP, K, 0 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ srai.d L, TEMP, 2 -+ MOV c81, c11 -+ bge $r0, L, .L25 -+#endif -+ .align 3 -+.L22: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, BO, 20 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 9 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 10 * SIZE -+ MADD c81, b4, a1, c81 -+ LD 
b4, BO, 11 * SIZE -+ LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a2, c51 -+ LD b7, BO, 28 * SIZE -+ MADD c61, b2, a2, c61 -+ LD b2, BO, 17 * SIZE -+ MADD c71, b3, a2, c71 -+ LD b3, BO, 18 * SIZE -+ MADD c81, b4, a2, c81 -+ LD b4, BO, 19 * SIZE -+ LD a2, AO, 5 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 32 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 21 * SIZE -+ MADD c31, b3, a3, c31 -+ LD b3, BO, 22 * SIZE -+ MADD c41, b4, a3, c41 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ LD b5, BO, 36 * SIZE -+ MADD c61, b2, a3, c61 -+ LD b2, BO, 25 * SIZE -+ MADD c71, b3, a3, c71 -+ LD b3, BO, 26 * SIZE -+ MADD c81, b4, a3, c81 -+ LD b4, BO, 27 * SIZE -+ LD a3, AO, 2 * SIZE -+ addi.d BO, BO, 32 * SIZE -+ MADD c11, b6, a4, c11 -+ LD b6, BO, 8 * SIZE -+ MADD c21, b2, a4, c21 -+ LD b2, BO, -3 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, -2 * SIZE -+ MADD c41, b4, a4, c41 -+ LD b4, BO, -1 * SIZE -+ MADD c51, b7, a4, c51 -+ LD b7, BO, 12 * SIZE -+ MADD c61, b2, a4, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a4, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a4, c81 -+ LD b4, BO, 3 * SIZE -+ LD a4, AO, 3 * SIZE -+ blt $r0, L, .L22 -+ .align 3 -+ -+.L25: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L28 -+ .align 3 -+.L26: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ MOV a2, a2 -+ addi.d AO, AO, 1 * SIZE -+ addi.d BO, BO, 8 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, BO, 4 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ 
LD b4, BO, 3 * SIZE -+ blt $r0, L, .L26 -+.L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -8 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+ SUB c51, b5, c51 -+ SUB c61, b6, c61 -+ SUB c71, b7, c71 -+ SUB c81, b8, c81 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ LD b5, AO, 4 * SIZE -+ LD b6, AO, 5 * SIZE -+ LD b7, AO, 6 * SIZE -+ LD b8, AO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+ SUB c51, b5, c51 -+ SUB c61, b6, c61 -+ SUB c71, b7, c71 -+ SUB c81, b8, c81 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+ MUL c51, b1, c51 -+ MUL c61, b1, c61 -+ MUL c71, b1, c71 -+ MUL c81, b1, c81 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c31, c11, b3, c31 -+ NMSUB c41, c11, b4, c41 -+ NMSUB c51, c11, b5, c51 -+ NMSUB c61, c11, b6, c61 -+ NMSUB c71, c11, b7, c71 -+ NMSUB c81, c11, b8, c81 -+ LD b2, BO, 9 * SIZE -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ LD b5, BO, 12 * SIZE -+ LD b6, BO, 13 * SIZE -+ LD b7, BO, 14 * SIZE -+ LD b8, BO, 15 * SIZE -+ MUL c21, b2, c21 -+ NMSUB c31, c21, b3, c31 -+ NMSUB c41, c21, b4, c41 -+ NMSUB c51, c21, b5, c51 -+ NMSUB c61, c21, b6, c61 -+ NMSUB c71, c21, b7, c71 -+ NMSUB c81, c21, b8, c81 -+ 
LD b3, BO, 18 * SIZE -+ LD b4, BO, 19 * SIZE -+ LD b5, BO, 20 * SIZE -+ LD b6, BO, 21 * SIZE -+ LD b7, BO, 22 * SIZE -+ LD b8, BO, 23 * SIZE -+ MUL c31, b3, c31 -+ NMSUB c41, c31, b4, c41 -+ NMSUB c51, c31, b5, c51 -+ NMSUB c61, c31, b6, c61 -+ NMSUB c71, c31, b7, c71 -+ NMSUB c81, c31, b8, c81 -+ LD b4, BO, 27 * SIZE -+ LD b5, BO, 28 * SIZE -+ LD b6, BO, 29 * SIZE -+ LD b7, BO, 30 * SIZE -+ LD b8, BO, 31 * SIZE -+ MUL c41, b4, c41 -+ NMSUB c51, c41, b5, c51 -+ NMSUB c61, c41, b6, c61 -+ NMSUB c71, c41, b7, c71 -+ NMSUB c81, c41, b8, c81 -+ LD b5, BO, 36 * SIZE -+ LD b6, BO, 37 * SIZE -+ LD b7, BO, 38 * SIZE -+ LD b8, BO, 39 * SIZE -+ MUL c51, b5, c51 -+ NMSUB c61, c51, b6, c61 -+ NMSUB c71, c51, b7, c71 -+ NMSUB c81, c51, b8, c81 -+ LD b6, BO, 45 * SIZE -+ LD b7, BO, 46 * SIZE -+ LD b8, BO, 47 * SIZE -+ MUL c61, b6, c61 -+ NMSUB c71, c61, b7, c71 -+ NMSUB c81, c61, b8, c81 -+ LD b7, BO, 54 * SIZE -+ LD b8, BO, 55 * SIZE -+ MUL c71, b7, c71 -+ NMSUB c81, c71, b8, c81 -+ LD b8, BO, 63 * SIZE -+ MUL c81, b8, c81 -+#endif -+#ifdef RT -+ LD b1, BO, 63 * SIZE -+ LD b2, BO, 62 * SIZE -+ LD b3, BO, 61 * SIZE -+ LD b4, BO, 60 * SIZE -+ LD b5, BO, 59 * SIZE -+ LD b6, BO, 58 * SIZE -+ LD b7, BO, 57 * SIZE -+ LD b8, BO, 56 * SIZE -+ MUL c81, b1, c81 -+ NMSUB c71, c81, b2, c71 -+ NMSUB c61, c81, b3, c61 -+ NMSUB c51, c81, b4, c51 -+ NMSUB c41, c81, b5, c41 -+ NMSUB c31, c81, b6, c31 -+ NMSUB c21, c81, b7, c21 -+ NMSUB c11, c81, b8, c11 -+ LD b2, BO, 54 * SIZE -+ LD b3, BO, 53 * SIZE -+ LD b4, BO, 52 * SIZE -+ LD b5, BO, 51 * SIZE -+ LD b6, BO, 50 * SIZE -+ LD b7, BO, 49 * SIZE -+ LD b8, BO, 48 * SIZE -+ MUL c71, b2, c71 -+ NMSUB c61, c71, b3, c61 -+ NMSUB c51, c71, b4, c51 -+ NMSUB c41, c71, b5, c41 -+ NMSUB c31, c71, b6, c31 -+ NMSUB c21, c71, b7, c21 -+ NMSUB c11, c71, b8, c11 -+ LD b3, BO, 45 * SIZE -+ LD b4, BO, 44 * SIZE -+ LD b5, BO, 43 * SIZE -+ LD b6, BO, 42 * SIZE -+ LD b7, BO, 41 * SIZE -+ LD b8, BO, 40 * SIZE -+ MUL c61, b3, c61 -+ NMSUB c51, c61, b4, c51 -+ NMSUB 
c41, c61, b5, c41 -+ NMSUB c31, c61, b6, c31 -+ NMSUB c21, c61, b7, c21 -+ NMSUB c11, c61, b8, c11 -+ LD b4, BO, 36 * SIZE -+ LD b5, BO, 35 * SIZE -+ LD b6, BO, 34 * SIZE -+ LD b7, BO, 33 * SIZE -+ LD b8, BO, 32 * SIZE -+ MUL c51, b4, c51 -+ NMSUB c41, c51, b5, c41 -+ NMSUB c31, c51, b6, c31 -+ NMSUB c21, c51, b7, c21 -+ NMSUB c11, c51, b8, c11 -+ LD b5, BO, 27 * SIZE -+ LD b6, BO, 26 * SIZE -+ LD b7, BO, 25 * SIZE -+ LD b8, BO, 24 * SIZE -+ MUL c41, b5, c41 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c21, c41, b7, c21 -+ NMSUB c11, c41, b8, c11 -+ LD b6, BO, 18 * SIZE -+ LD b7, BO, 17 * SIZE -+ LD b8, BO, 16 * SIZE -+ MUL c31, b6, c31 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c11, c31, b8, c11 -+ LD b7, BO, 9 * SIZE -+ LD b8, BO, 8 * SIZE -+ MUL c21, b7, c21 -+ NMSUB c11, c21, b8, c11 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+ addi.d CO2, CO2, -1 * SIZE -+ addi.d CO3, CO3, -1 * SIZE -+ addi.d CO4, CO4, -1 * SIZE -+ addi.d CO5, CO5, -1 * SIZE -+ addi.d CO6, CO6, -1 * SIZE -+ addi.d CO7, CO7, -1 * SIZE -+ addi.d CO8, CO8, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+ ST c51, BO, 4 * SIZE -+ ST c61, BO, 5 * SIZE -+ ST c71, BO, 6 * SIZE -+ ST c81, BO, 7 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c21, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c41, AO, 3 * SIZE -+ ST c51, AO, 4 * SIZE -+ ST c61, AO, 5 * SIZE -+ ST c71, AO, 6 * SIZE -+ ST c81, AO, 7 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c41, CO4, 0 * SIZE -+ ST c51, CO5, 0 * SIZE -+ ST c61, CO6, 0 * SIZE -+ ST c71, CO7, 0 * SIZE -+ ST c81, CO8, 0 * SIZE -+MTC c11, $r0 -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+ addi.d CO2, CO2, 1 * SIZE -+ addi.d CO3, CO3, 1 * SIZE -+ addi.d CO4, CO4, 1 * SIZE -+ addi.d CO5, CO5, 1 * SIZE -+ addi.d CO6, CO6, 1 * SIZE -+ addi.d CO7, CO7, 1 * SIZE -+ addi.d CO8, CO8, 1 * SIZE -+#endif 
-+ MOV c21, c11 -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+ MOV c31, c11 -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+ MOV c41, c11 -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L20: -+ srai.d I, M, 1 -+ MOV c51, c11 -+MOV c61, c11 -+ bge $r0, I, .L29 -+.L11: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, B, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, KK, 2 -+ MOV c32, c11 -+ LD b3, B, 2 * SIZE -+ MOV c42, c11 -+ LD b4, B, 3 * SIZE -+ MOV c52, c11 -+ LD b5, B, 4 * SIZE -+ MOV c62, c11 -+ LD b6, B, 8 * SIZE -+ MOV c72, c11 -+ LD b7, B, 12 * SIZE -+ MOV c82, c11 -+move BO, B -+ bge $r0, L, .L15 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ MOV c32, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c42, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c52, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c62, c11 -+ LD b6, BO, 8 * SIZE -+ MOV c72, c11 -+ LD b7, BO, 12 * SIZE -+ MOV c82, c11 -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L15 -+#endif -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ bge $r0, L, .L13 -+ .align 3 -+.L12: -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ MADD c61, b2, a1, c61 -+ 
LD a4, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ MADD c61, b2, a4, c61 -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ MADD c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD c71, b3, a3, c71 -+ MADD c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ addi.d L, L, -1 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 
8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ blt $r0, L, .L12 -+ .align 3 -+ -+.L13: -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ MADD c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ MADD c61, b2, a4, c61 -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ MADD c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD c71, b3, a3, c71 -+ MADD c81, 
b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ .align 3 -+ -+.L15: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L18 -+ .align 3 -+.L16: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ addi.d L, L, -1 -+ MADD c61, b2, a1, c61 -+ addi.d AO, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ addi.d BO, BO, 8 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 4 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L16 -+.L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -8 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, 
TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ LD b5, BO, 4 * SIZE -+ SUB c21, b2, c21 -+ LD b6, BO, 5 * SIZE -+ SUB c31, b3, c31 -+ LD b7, BO, 6 * SIZE -+ SUB c41, b4, c41 -+ LD b8, BO, 7 * SIZE -+ SUB c51, b5, c51 -+ LD b1, BO, 8 * SIZE -+ SUB c61, b6, c61 -+ LD b2, BO, 9 * SIZE -+ SUB c71, b7, c71 -+ LD b3, BO, 10 * SIZE -+ SUB c81, b8, c81 -+ LD b4, BO, 11 * SIZE -+ SUB c12, b1, c12 -+ LD b5, BO, 12 * SIZE -+ SUB c22, b2, c22 -+ LD b6, BO, 13 * SIZE -+ SUB c32, b3, c32 -+ LD b7, BO, 14 * SIZE -+ SUB c42, b4, c42 -+ LD b8, BO, 15 * SIZE -+ SUB c52, b5, c52 -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+#else -+ LD b1, AO, 0 * SIZE -+#endif -+ SUB c62, b6, c62 -+ SUB c72, b7, c72 -+ SUB c82, b8, c82 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ LD b5, AO, 4 * SIZE -+ SUB c12, b2, c12 -+ LD b6, AO, 5 * SIZE -+ SUB c21, b3, c21 -+ LD b7, AO, 6 * SIZE -+ SUB c22, b4, c22 -+ LD b8, AO, 7 * SIZE -+ SUB c31, b5, c31 -+ LD b1, AO, 8 * SIZE -+ SUB c32, b6, c32 -+ LD b2, AO, 9 * SIZE -+ SUB c41, b7, c41 -+ LD b3, AO, 10 * SIZE -+ SUB c42, b8, c42 -+ LD b4, AO, 11 * SIZE -+ LD b5, AO, 12 * SIZE -+ SUB c51, b1, c51 -+ LD b6, AO, 13 * SIZE -+ SUB c52, b2, c52 -+ LD b7, AO, 14 * SIZE -+ SUB c61, b3, c61 -+ LD b8, AO, 15 * SIZE -+ SUB c62, b4, c62 -+ SUB c71, b5, c71 -+ SUB c72, b6, c72 -+ SUB c81, b7, c81 -+ SUB c82, b8, c82 -+#endif -+#ifdef LN -+ MUL c12, b1, c12 -+ LD b2, AO, 2 * SIZE -+ MUL c22, b1, c22 -+ MUL c32, b1, c32 -+ MUL c42, b1, c42 -+ MUL c52, b1, c52 -+ MUL c62, b1, c62 -+ MUL c72, b1, c72 -+ MUL c82, b1, c82 -+ NMSUB c11, c12, b2, c11 -+ LD b3, AO, 0 * SIZE -+ NMSUB c21, c22, b2, c21 -+ NMSUB c31, c32, b2, c31 -+ NMSUB c41, c42, b2, c41 -+ NMSUB c51, c52, b2, c51 -+ NMSUB c61, c62, b2, c61 -+ NMSUB c71, c72, b2, c71 -+ NMSUB c81, c82, b2, c81 -+ MUL c11, b3, c11 -+ addi.d CO1, 
CO1, -2 * SIZE -+ MUL c21, b3, c21 -+ addi.d CO2, CO2, -2 * SIZE -+ MUL c31, b3, c31 -+ addi.d CO3, CO3, -2 * SIZE -+ MUL c41, b3, c41 -+ addi.d CO4, CO4, -2 * SIZE -+ MUL c51, b3, c51 -+ addi.d CO5, CO5, -2 * SIZE -+ MUL c61, b3, c61 -+ addi.d CO6, CO6, -2 * SIZE -+ MUL c71, b3, c71 -+ addi.d CO7, CO7, -2 * SIZE -+ MUL c81, b3, c81 -+ addi.d CO8, CO8, -2 * SIZE -+#endif -+#ifdef LT -+ MUL c11, b1, c11 -+ LD b2, AO, 1 * SIZE -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+ MUL c51, b1, c51 -+ MUL c61, b1, c61 -+ MUL c71, b1, c71 -+ MUL c81, b1, c81 -+ NMSUB c12, c11, b2, c12 -+ LD b3, AO, 3 * SIZE -+ NMSUB c22, c21, b2, c22 -+ NMSUB c32, c31, b2, c32 -+ NMSUB c42, c41, b2, c42 -+ NMSUB c52, c51, b2, c52 -+ NMSUB c62, c61, b2, c62 -+ NMSUB c72, c71, b2, c72 -+ NMSUB c82, c81, b2, c82 -+ MUL c12, b3, c12 -+ MUL c22, b3, c22 -+ MUL c32, b3, c32 -+ MUL c42, b3, c42 -+ MUL c52, b3, c52 -+ MUL c62, b3, c62 -+ MUL c72, b3, c72 -+ MUL c82, b3, c82 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+ LD b5, BO, 4 * SIZE -+ NMSUB c21, c11, b2, c21 -+ NMSUB c22, c12, b2, c22 -+ LD b6, BO, 5 * SIZE -+ NMSUB c31, c11, b3, c31 -+ NMSUB c32, c12, b3, c32 -+ LD b7, BO, 6 * SIZE -+ NMSUB c41, c11, b4, c41 -+ NMSUB c42, c12, b4, c42 -+ LD b8, BO, 7 * SIZE -+ NMSUB c51, c11, b5, c51 -+ NMSUB c52, c12, b5, c52 -+ LD b2, BO, 9 * SIZE -+ NMSUB c61, c11, b6, c61 -+ NMSUB c62, c12, b6, c62 -+ LD b3, BO, 10 * SIZE -+ NMSUB c71, c11, b7, c71 -+ NMSUB c72, c12, b7, c72 -+ LD b4, BO, 11 * SIZE -+ NMSUB c81, c11, b8, c81 -+ NMSUB c82, c12, b8, c82 -+ LD b5, BO, 12 * SIZE -+ MUL c21, b2, c21 -+ MUL c22, b2, c22 -+ LD b6, BO, 13 * SIZE -+ NMSUB c31, c21, b3, c31 -+ NMSUB c32, c22, b3, c32 -+ LD b7, BO, 14 * SIZE -+ NMSUB c41, c21, b4, c41 -+ NMSUB c42, c22, b4, c42 -+ LD b8, BO, 15 * SIZE -+ NMSUB c51, c21, b5, c51 -+ NMSUB c52, c22, b5, c52 -+ LD b3, BO, 18 * SIZE -+ NMSUB c61, 
c21, b6, c61 -+ NMSUB c62, c22, b6, c62 -+ LD b4, BO, 19 * SIZE -+ NMSUB c71, c21, b7, c71 -+ NMSUB c72, c22, b7, c72 -+ LD b5, BO, 20 * SIZE -+ NMSUB c81, c21, b8, c81 -+ NMSUB c82, c22, b8, c82 -+ LD b6, BO, 21 * SIZE -+ MUL c31, b3, c31 -+ MUL c32, b3, c32 -+ LD b7, BO, 22 * SIZE -+ NMSUB c41, c31, b4, c41 -+ NMSUB c42, c32, b4, c42 -+ LD b8, BO, 23 * SIZE -+ NMSUB c51, c31, b5, c51 -+ NMSUB c52, c32, b5, c52 -+ LD b4, BO, 27 * SIZE -+ NMSUB c61, c31, b6, c61 -+ NMSUB c62, c32, b6, c62 -+ LD b5, BO, 28 * SIZE -+ NMSUB c71, c31, b7, c71 -+ NMSUB c72, c32, b7, c72 -+ LD b6, BO, 29 * SIZE -+ NMSUB c81, c31, b8, c81 -+ NMSUB c82, c32, b8, c82 -+ LD b7, BO, 30 * SIZE -+ MUL c41, b4, c41 -+ MUL c42, b4, c42 -+ LD b8, BO, 31 * SIZE -+ NMSUB c51, c41, b5, c51 -+ NMSUB c52, c42, b5, c52 -+ LD b5, BO, 36 * SIZE -+ NMSUB c61, c41, b6, c61 -+ NMSUB c62, c42, b6, c62 -+ LD b6, BO, 37 * SIZE -+ NMSUB c71, c41, b7, c71 -+ NMSUB c72, c42, b7, c72 -+ LD b7, BO, 38 * SIZE -+ NMSUB c81, c41, b8, c81 -+ NMSUB c82, c42, b8, c82 -+ LD b8, BO, 39 * SIZE -+ MUL c51, b5, c51 -+ MUL c52, b5, c52 -+ NMSUB c61, c51, b6, c61 -+ NMSUB c62, c52, b6, c62 -+ LD b6, BO, 45 * SIZE -+ NMSUB c71, c51, b7, c71 -+ NMSUB c72, c52, b7, c72 -+ LD b7, BO, 46 * SIZE -+ NMSUB c81, c51, b8, c81 -+ NMSUB c82, c52, b8, c82 -+ LD b8, BO, 47 * SIZE -+ MUL c61, b6, c61 -+ MUL c62, b6, c62 -+ NMSUB c71, c61, b7, c71 -+ NMSUB c72, c62, b7, c72 -+ LD b7, BO, 54 * SIZE -+ NMSUB c81, c61, b8, c81 -+ NMSUB c82, c62, b8, c82 -+ LD b8, BO, 55 * SIZE -+ MUL c71, b7, c71 -+ MUL c72, b7, c72 -+ NMSUB c81, c71, b8, c81 -+ NMSUB c82, c72, b8, c82 -+ LD b8, BO, 63 * SIZE -+ MUL c81, b8, c81 -+ MUL c82, b8, c82 -+#endif -+#ifdef RT -+ LD b1, BO, 63 * SIZE -+ LD b2, BO, 62 * SIZE -+ LD b3, BO, 61 * SIZE -+ LD b4, BO, 60 * SIZE -+ MUL c81, b1, c81 -+ MUL c82, b1, c82 -+ LD b5, BO, 59 * SIZE -+ NMSUB c71, c81, b2, c71 -+ NMSUB c72, c82, b2, c72 -+ LD b6, BO, 58 * SIZE -+ NMSUB c61, c81, b3, c61 -+ NMSUB c62, c82, b3, c62 -+ LD 
b7, BO, 57 * SIZE -+ NMSUB c51, c81, b4, c51 -+ NMSUB c52, c82, b4, c52 -+ LD b8, BO, 56 * SIZE -+ NMSUB c41, c81, b5, c41 -+ NMSUB c42, c82, b5, c42 -+ LD b2, BO, 54 * SIZE -+ NMSUB c31, c81, b6, c31 -+ NMSUB c32, c82, b6, c32 -+ LD b3, BO, 53 * SIZE -+ NMSUB c21, c81, b7, c21 -+ NMSUB c22, c82, b7, c22 -+ LD b4, BO, 52 * SIZE -+ NMSUB c11, c81, b8, c11 -+ NMSUB c12, c82, b8, c12 -+ LD b5, BO, 51 * SIZE -+ MUL c71, b2, c71 -+ MUL c72, b2, c72 -+ LD b6, BO, 50 * SIZE -+ NMSUB c61, c71, b3, c61 -+ NMSUB c62, c72, b3, c62 -+ LD b7, BO, 49 * SIZE -+ NMSUB c51, c71, b4, c51 -+ NMSUB c52, c72, b4, c52 -+ LD b8, BO, 48 * SIZE -+ NMSUB c41, c71, b5, c41 -+ NMSUB c42, c72, b5, c42 -+ LD b3, BO, 45 * SIZE -+ NMSUB c31, c71, b6, c31 -+ NMSUB c32, c72, b6, c32 -+ LD b4, BO, 44 * SIZE -+ NMSUB c21, c71, b7, c21 -+ NMSUB c22, c72, b7, c22 -+ LD b5, BO, 43 * SIZE -+ NMSUB c11, c71, b8, c11 -+ NMSUB c12, c72, b8, c12 -+ LD b6, BO, 42 * SIZE -+ MUL c61, b3, c61 -+ MUL c62, b3, c62 -+ LD b7, BO, 41 * SIZE -+ NMSUB c51, c61, b4, c51 -+ NMSUB c52, c62, b4, c52 -+ LD b8, BO, 40 * SIZE -+ NMSUB c41, c61, b5, c41 -+ NMSUB c42, c62, b5, c42 -+ LD b4, BO, 36 * SIZE -+ NMSUB c31, c61, b6, c31 -+ NMSUB c32, c62, b6, c32 -+ LD b5, BO, 35 * SIZE -+ NMSUB c21, c61, b7, c21 -+ NMSUB c22, c62, b7, c22 -+ LD b6, BO, 34 * SIZE -+ NMSUB c11, c61, b8, c11 -+ NMSUB c12, c62, b8, c12 -+ LD b7, BO, 33 * SIZE -+ MUL c51, b4, c51 -+ MUL c52, b4, c52 -+ LD b8, BO, 32 * SIZE -+ NMSUB c41, c51, b5, c41 -+ NMSUB c42, c52, b5, c42 -+ LD b5, BO, 27 * SIZE -+ NMSUB c31, c51, b6, c31 -+ NMSUB c32, c52, b6, c32 -+ LD b6, BO, 26 * SIZE -+ NMSUB c21, c51, b7, c21 -+ NMSUB c22, c52, b7, c22 -+ LD b7, BO, 25 * SIZE -+ NMSUB c11, c51, b8, c11 -+ NMSUB c12, c52, b8, c12 -+ LD b8, BO, 24 * SIZE -+ MUL c41, b5, c41 -+ MUL c42, b5, c42 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c32, c42, b6, c32 -+ LD b6, BO, 18 * SIZE -+ NMSUB c21, c41, b7, c21 -+ NMSUB c22, c42, b7, c22 -+ LD b7, BO, 17 * SIZE -+ NMSUB c11, c41, b8, c11 -+ 
NMSUB c12, c42, b8, c12 -+ LD b8, BO, 16 * SIZE -+ MUL c31, b6, c31 -+ MUL c32, b6, c32 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c22, c32, b7, c22 -+ LD b7, BO, 9 * SIZE -+ NMSUB c11, c31, b8, c11 -+ NMSUB c12, c32, b8, c12 -+ LD b8, BO, 8 * SIZE -+ MUL c21, b7, c21 -+ MUL c22, b7, c22 -+ NMSUB c11, c21, b8, c11 -+ NMSUB c12, c22, b8, c12 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+ MUL c12, b8, c12 -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+ ST c51, BO, 4 * SIZE -+ ST c61, BO, 5 * SIZE -+ ST c71, BO, 6 * SIZE -+ ST c81, BO, 7 * SIZE -+ ST c12, BO, 8 * SIZE -+ ST c22, BO, 9 * SIZE -+ ST c32, BO, 10 * SIZE -+ ST c42, BO, 11 * SIZE -+ ST c52, BO, 12 * SIZE -+ ST c62, BO, 13 * SIZE -+ ST c72, BO, 14 * SIZE -+ ST c82, BO, 15 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c21, AO, 2 * SIZE -+ ST c22, AO, 3 * SIZE -+ ST c31, AO, 4 * SIZE -+ ST c32, AO, 5 * SIZE -+ ST c41, AO, 6 * SIZE -+ ST c42, AO, 7 * SIZE -+ ST c51, AO, 8 * SIZE -+ ST c52, AO, 9 * SIZE -+ ST c61, AO, 10 * SIZE -+ ST c62, AO, 11 * SIZE -+ ST c71, AO, 12 * SIZE -+ ST c72, AO, 13 * SIZE -+ ST c81, AO, 14 * SIZE -+ ST c82, AO, 15 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c22, CO2, 1 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c32, CO3, 1 * SIZE -+ ST c41, CO4, 0 * SIZE -+ ST c42, CO4, 1 * SIZE -+ ST c51, CO5, 0 * SIZE -+ ST c52, CO5, 1 * SIZE -+ ST c61, CO6, 0 * SIZE -+ ST c62, CO6, 1 * SIZE -+ ST c71, CO7, 0 * SIZE -+ ST c72, CO7, 1 * SIZE -+ ST c81, CO8, 0 * SIZE -+ ST c82, CO8, 1 * SIZE -+MTC a1, $r0 -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+ addi.d CO2, CO2, 2 * SIZE -+ addi.d CO3, CO3, 2 * SIZE -+ addi.d CO4, CO4, 2 * SIZE -+ addi.d CO5, CO5, 2 * SIZE -+ addi.d CO6, CO6, 2 * SIZE -+ addi.d CO7, CO7, 2 * SIZE -+ addi.d CO8, CO8, 2 * SIZE -+#endif -+ MOV c11, a1 -+ MOV c21, a1 -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, 
AORIG, TEMP -+#endif -+ MOV c31, a1 -+ MOV c41, a1 -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+ addi.d I, I, -1 -+ MOV c51, a1 -+MOV c61, a1 -+ blt $r0, I, .L11 -+ .align 3 -+ -+.L29: -+#ifdef LN -+ slli.d TEMP, K, 3 + BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 8 -+#endif -+#ifdef RT -+ addi.d KK, KK, -8 -+#endif -+ blt $r0, J, .L10 -+ .align 3 -+ -+.L30: -+ andi J, N, 4 -+move AO, A -+ bge $r0, J, .L50 -+#ifdef RT -+ slli.d TEMP, K, 2 + BASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 2 -+ sub.d C, C, TEMP -+#endif -+ move CO1, C -+MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ MOV c21, c11 -+ add.d CO4, CO3, LDC -+ MOV c31, c11 -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO4, LDC -+#endif -+ andi I, M, 1 -+MOV c41, c11 -+ bge $r0, I, .L40 -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c81, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, KK, 2 -+move BO, B -+ bge $r0, L, .L45 -+#else -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c81, c11 -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ srai.d 
L, TEMP, 2 -+ bge $r0, L, .L45 -+#endif -+ .align 3 -+.L42: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD c11, b5, a2, c11 -+ LD b5, BO, 20 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 10 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 11 * SIZE -+ LD a2, AO, 2 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * SIZE -+ LD a2, AO, -1 * SIZE -+ addi.d BO, BO, 16 * SIZE -+ MADD c11, b7, a2, c11 -+ LD b7, BO, 12 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 1 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 2 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 3 * SIZE -+ LD a2, AO, 1 * SIZE -+ blt $r0, L, .L42 -+ .align 3 -+ -+.L45: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L48 -+ .align 3 -+.L46: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 1 * SIZE -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+ MOV a2, a2 -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L46 -+.L48: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -4 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD 
b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c31, c11, b3, c31 -+ NMSUB c41, c11, b4, c41 -+ LD b2, BO, 5 * SIZE -+ LD b3, BO, 6 * SIZE -+ LD b4, BO, 7 * SIZE -+ MUL c21, b2, c21 -+ NMSUB c31, c21, b3, c31 -+ NMSUB c41, c21, b4, c41 -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ MUL c31, b3, c31 -+ NMSUB c41, c31, b4, c41 -+ LD b4, BO, 15 * SIZE -+ MUL c41, b4, c41 -+#endif -+#ifdef RT -+ LD b5, BO, 15 * SIZE -+ LD b6, BO, 14 * SIZE -+ LD b7, BO, 13 * SIZE -+ LD b8, BO, 12 * SIZE -+ MUL c41, b5, c41 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c21, c41, b7, c21 -+ NMSUB c11, c41, b8, c11 -+ LD b6, BO, 10 * SIZE -+ LD b7, BO, 9 * SIZE -+ LD b8, BO, 8 * SIZE -+ MUL c31, b6, c31 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c11, c31, b8, c11 -+ LD b7, BO, 5 * SIZE -+ LD b8, BO, 4 * SIZE -+ MUL c21, b7, c21 -+ NMSUB c11, c21, b8, c11 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+ addi.d CO2, CO2, -1 * SIZE -+ addi.d CO3, CO3, -1 * SIZE -+ addi.d CO4, CO4, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c21, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c41, AO, 3 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c41, CO4, 0 * SIZE -+MTC c11, $r0 -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+ addi.d CO2, CO2, 1 * SIZE -+ addi.d CO3, CO3, 1 * SIZE -+ addi.d CO4, CO4, 1 * SIZE -+#endif -+ MOV c21, c11 -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif 
-+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+ MOV c31, c11 -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L40: -+ srai.d I, M, 1 -+ MOV c61, c11 -+MOV c41, c11 -+ bge $r0, I, .L49 -+.L31: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ LD a3, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ MOV c32, c11 -+ LD b4, B, 3 * SIZE -+ MOV c42, c11 -+ LD b5, B, 4 * SIZE -+ srai.d L, KK, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L35 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ LD a3, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c32, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c42, c11 -+ LD b5, BO, 4 * SIZE -+ srai.d L, TEMP, 2 -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L35 -+#endif -+ .align 3 -+.L32: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c11, b5, a1, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 8 * SIZE -+ MADD c12, b5, a2, c12 -+ LD b5, BO, 20 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 9 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 10 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 11 * SIZE -+ MADD c11, 
b6, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ LD a3, AO, 6 * SIZE -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c11, b7, a3, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a3, c21 -+ addi.d AO, AO, 8 * SIZE -+ MADD c31, b3, a3, c31 -+ addi.d BO, BO, 16 * SIZE -+ MADD c41, b4, a3, c41 -+ LD a3, AO, 4 * SIZE -+ MADD c12, b7, a2, c12 -+ LD b7, BO, 12 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 1 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 2 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L32 -+ .align 3 -+ -+.L35: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L38 -+ .align 3 -+.L36: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ addi.d AO, AO, 2 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 0 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 4 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L36 -+.L38: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -4 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+ SUB c12, b5, c12 -+ SUB c22, b6, c22 -+ SUB c32, b7, c32 -+ SUB c42, b8, c42 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD 
b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ LD b5, AO, 4 * SIZE -+ LD b6, AO, 5 * SIZE -+ LD b7, AO, 6 * SIZE -+ LD b8, AO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c21, b3, c21 -+ SUB c22, b4, c22 -+ SUB c31, b5, c31 -+ SUB c32, b6, c32 -+ SUB c41, b7, c41 -+ SUB c42, b8, c42 -+#endif -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+ LD b2, AO, 2 * SIZE -+ LD b3, AO, 0 * SIZE -+ MUL c12, b1, c12 -+ MUL c22, b1, c22 -+ MUL c32, b1, c32 -+ MUL c42, b1, c42 -+ NMSUB c11, c12, b2, c11 -+ NMSUB c21, c22, b2, c21 -+ NMSUB c31, c32, b2, c31 -+ NMSUB c41, c42, b2, c41 -+ MUL c11, b3, c11 -+ MUL c21, b3, c21 -+ MUL c31, b3, c31 -+ MUL c41, b3, c41 -+#endif -+#ifdef LT -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+ NMSUB c12, c11, b2, c12 -+ NMSUB c22, c21, b2, c22 -+ NMSUB c32, c31, b2, c32 -+ NMSUB c42, c41, b2, c42 -+ MUL c12, b3, c12 -+ MUL c22, b3, c22 -+ MUL c32, b3, c32 -+ MUL c42, b3, c42 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c22, c12, b2, c22 -+ NMSUB c31, c11, b3, c31 -+ NMSUB c32, c12, b3, c32 -+ NMSUB c41, c11, b4, c41 -+ NMSUB c42, c12, b4, c42 -+ LD b2, BO, 5 * SIZE -+ LD b3, BO, 6 * SIZE -+ LD b4, BO, 7 * SIZE -+ MUL c21, b2, c21 -+ MUL c22, b2, c22 -+ NMSUB c31, c21, b3, c31 -+ NMSUB c32, c22, b3, c32 -+ NMSUB c41, c21, b4, c41 -+ NMSUB c42, c22, b4, c42 -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ MUL c31, b3, c31 -+ MUL c32, b3, c32 -+ NMSUB c41, c31, b4, c41 -+ NMSUB c42, c32, b4, c42 -+ LD b4, BO, 15 * SIZE -+ MUL c41, b4, c41 -+ MUL c42, b4, c42 -+#endif -+#ifdef RT -+ LD b5, BO, 15 * SIZE -+ LD b6, BO, 14 * SIZE -+ LD b7, BO, 13 * SIZE -+ LD b8, BO, 12 * SIZE -+ MUL c41, b5, c41 -+ MUL c42, b5, c42 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c32, c42, b6, c32 -+ NMSUB c21, c41, b7, c21 -+ NMSUB c22, 
c42, b7, c22 -+ NMSUB c11, c41, b8, c11 -+ NMSUB c12, c42, b8, c12 -+ LD b6, BO, 10 * SIZE -+ LD b7, BO, 9 * SIZE -+ LD b8, BO, 8 * SIZE -+ MUL c31, b6, c31 -+ MUL c32, b6, c32 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c22, c32, b7, c22 -+ NMSUB c11, c31, b8, c11 -+ NMSUB c12, c32, b8, c12 -+ LD b7, BO, 5 * SIZE -+ LD b8, BO, 4 * SIZE -+ MUL c21, b7, c21 -+ MUL c22, b7, c22 -+ NMSUB c11, c21, b8, c11 -+ NMSUB c12, c22, b8, c12 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+ MUL c12, b8, c12 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -2 * SIZE -+ addi.d CO2, CO2, -2 * SIZE -+ addi.d CO3, CO3, -2 * SIZE -+ addi.d CO4, CO4, -2 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+ ST c12, BO, 4 * SIZE -+ ST c22, BO, 5 * SIZE -+ ST c32, BO, 6 * SIZE -+ ST c42, BO, 7 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c21, AO, 2 * SIZE -+ ST c22, AO, 3 * SIZE -+ ST c31, AO, 4 * SIZE -+ ST c32, AO, 5 * SIZE -+ ST c41, AO, 6 * SIZE -+ ST c42, AO, 7 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c22, CO2, 1 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c32, CO3, 1 * SIZE -+ ST c41, CO4, 0 * SIZE -+ ST c42, CO4, 1 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+ addi.d CO2, CO2, 2 * SIZE -+ addi.d CO3, CO3, 2 * SIZE -+ addi.d CO4, CO4, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+MTC a1, $r0 -+ MOV c11, a1 -+ MOV c21, a1 -+ MOV c31, a1 -+ addi.d I, I, -1 -+MOV c41, c11 -+ blt $r0, I, .L31 -+ .align 3 -+ -+.L49: -+#ifdef LN -+ slli.d TEMP, K, 2 + BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ 
move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 4 -+#endif -+#ifdef RT -+ addi.d KK, KK, -4 -+#endif -+ .align 3 -+ -+.L50: -+ andi J, N, 2 -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+#else -+ move AO, A -+#endif -+ bge $r0, J, .L70 -+#ifdef RT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 1 -+ sub.d C, C, TEMP -+#endif -+ move AO, A -+ move CO1, C -+ add.d CO2, C, LDC -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO2, LDC -+#endif -+ andi I, M, 1 -+ bge $r0, I, .L60 -+#if defined(LT) || defined(RN) -+ srai.d L, KK, 2 -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ MOV c31, c11 -+ LD a4, AO, 3 * SIZE -+ MOV c41, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L65 -+#else -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ srai.d L, TEMP, 2 -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ MOV c31, c11 -+ LD a4, AO, 3 * SIZE -+ MOV c41, c11 -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L65 -+#endif -+ .align 3 -+.L62: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 7 * SIZE -+ LD a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, 10 * SIZE 
-+ MADD c41, b4, a4, c41 -+ LD b4, BO, 11 * SIZE -+ LD a3, AO, 6 * SIZE -+ LD a4, AO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L62 -+ .align 3 -+ -+.L65: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L68 -+ .align 3 -+.L66: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 2 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 3 * SIZE -+ LD a1, AO, 1 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L66 -+.L68: -+ ADD c11, c11, c31 -+ ADD c21, c21, c41 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -2 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+#endif -+#if defined(LN) || defined(LT) -+ LD b3, AO, 0 * SIZE -+ MUL c11, b3, c11 -+ MUL c21, b3, c21 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c21, c11, b2, c21 -+ MUL c21, b3, c21 -+#endif -+#ifdef RT -+ LD b1, BO, 3 * SIZE -+ LD b2, BO, 2 * SIZE -+ LD b3, BO, 0 * SIZE -+ MUL c21, b1, c21 -+ NMSUB c11, c21, b2, c11 -+ MUL c11, b3, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+ addi.d CO2, CO2, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c21, AO, 1 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+ addi.d CO2, CO2, 1 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 0 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ 
slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L60: -+ srai.d I, M, 1 -+ bge $r0, I, .L69 -+.L51: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, KK, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L55 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ LD b5, BO, 4 * SIZE -+ srai.d L, TEMP, 2 -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L55 -+#endif -+ .align 3 -+.L52: -+ MADD c11, b1, a1, c11 -+ LD a3, AO, 2 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b4, BO, 3 * SIZE -+ MADD c12, b1, a2, c12 -+ LD a4, AO, 3 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b1, BO, 8 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a1, AO, 8 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 5 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 6 * SIZE -+ MADD c11, b5, a5, c11 -+ LD a3, AO, 6 * SIZE -+ MADD c21, b2, a5, c21 -+ LD b4, BO, 7 * SIZE -+ MADD c12, b5, a2, c12 -+ LD a4, AO, 7 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b5, BO, 12 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a5, AO, 12 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 9 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 10 * SIZE -+ addi.d AO, AO, 8 * SIZE -+ addi.d L, L, -1 -+addi.d BO, BO, 8 * SIZE -+ blt 
$r0, L, .L52 -+ .align 3 -+ -+.L55: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L58 -+ .align 3 -+.L56: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 3 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L56 -+.L58: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -2 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c12, b3, c12 -+ SUB c22, b4, c22 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c21, b3, c21 -+ SUB c22, b4, c22 -+#endif -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+ LD b2, AO, 2 * SIZE -+ LD b3, AO, 0 * SIZE -+ MUL c12, b1, c12 -+ MUL c22, b1, c22 -+ NMSUB c11, c12, b2, c11 -+ NMSUB c21, c22, b2, c21 -+ MUL c11, b3, c11 -+ MUL c21, b3, c21 -+#endif -+#ifdef LT -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ NMSUB c12, c11, b2, c12 -+ NMSUB c22, c21, b2, c22 -+ MUL c12, b3, c12 -+ MUL c22, b3, c22 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c22, c12, b2, c22 -+ MUL c21, b3, c21 -+ MUL c22, b3, c22 -+#endif -+#ifdef RT -+ LD b1, BO, 3 * SIZE -+ LD b2, BO, 2 * SIZE -+ LD b3, BO, 0 * SIZE -+ MUL c21, b1, c21 -+ MUL c22, b1, c22 -+ NMSUB c11, c21, b2, c11 -+ NMSUB c12, c22, b2, c12 -+ MUL c11, b3, c11 -+ MUL c12, b3, c12 -+#endif -+#ifdef LN -+ addi.d 
CO1, CO1, -2 * SIZE -+ addi.d CO2, CO2, -2 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c12, BO, 2 * SIZE -+ ST c22, BO, 3 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c21, AO, 2 * SIZE -+ ST c22, AO, 3 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c22, CO2, 1 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+ addi.d CO2, CO2, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AO, TEMP -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+MTC a1, $r0 -+ MOV c11, a1 -+ MOV c21, a1 -+ MOV c31, a1 -+ addi.d I, I, -1 -+MOV c41, c11 -+ blt $r0, I, .L51 -+ .align 3 -+ -+.L69: -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 2 -+#endif -+#ifdef RT -+ addi.d KK, KK, -2 -+#endif -+ .align 3 -+ -+.L70: -+ andi J, N, 1 -+ bge $r0, J, .L999 -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d B, B, TEMP -+ sub.d C, C, LDC -+#endif -+ move AO, A -+ move CO1, C -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO1, LDC -+#endif -+ andi I, M, 1 -+ bge $r0, I, .L80 -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, KK, 2 -+move BO, B -+ bge $r0, L, .L85 -+#else -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d AORIG, AORIG, 
TEMP -+#endif -+ slli.d TEMP, KK, BASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L85 -+#endif -+ .align 3 -+.L82: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 1 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c21, b1, a1, c21 -+ LD a1, AO, 2 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 3 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c21, b1, a1, c21 -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L82 -+ .align 3 -+ -+.L85: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L88 -+ .align 3 -+.L86: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L86 -+.L88: -+ ADD c11, c11, c21 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -1 -+#endif -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ SUB c11, b1, c11 -+#else -+ LD b1, AO, 0 * SIZE -+ SUB c11, b1, c11 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ MUL c11, b1, c11 -+#endif -+#if defined(RN) || defined(RT) -+ LD b1, BO, 0 * SIZE -+ MUL c11, b1, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if 
defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AO, TEMP -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L80: -+ srai.d I, M, 1 -+ bge $r0, I, .L89 -+.L71: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, KK, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L75 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 0 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ LD b5, BO, 4 * SIZE -+ srai.d L, TEMP, 2 -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L75 -+#endif -+ .align 3 -+.L72: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 2 * SIZE -+ LD a2, AO, 3 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 6 * SIZE -+ LD a2, AO, 7 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 8 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L72 -+ .align 3 -+ -+.L75: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L78 -+ .align 3 -+.L76: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD 
c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L76 -+.L78: -+ ADD c11, c11, c21 -+ ADD c12, c12, c22 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -1 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#endif -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+ LD b2, AO, 2 * SIZE -+ LD b3, AO, 0 * SIZE -+ MUL c12, b1, c12 -+ NMSUB c11, c12, b2, c11 -+ MUL c11, b3, c11 -+#endif -+#ifdef LT -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 3 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c12, c11, b2, c12 -+ MUL c12, b3, c12 -+#endif -+#if defined(RN) || defined(RT) -+ LD b1, BO, 0 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -2 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c12, BO, 1 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+ addi.d I, I, -1 -+ blt $r0, I, .L71 -+ .align 3 -+ -+.L89: -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 1 -+#endif -+#ifdef RT -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L999: -+ LDARG 
$r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 16 -+ LDARG $r26, $sp, 24 -+ LDARG $r27, $sp, 32 -+ LDARG $r28, $sp, 40 -+ fld.d $f24, $sp, 48 -+ fld.d $f25, $sp, 56 -+ fld.d $f26, $sp, 64 -+ fld.d $f27, $sp, 72 -+ fld.d $f28, $sp, 80 -+ LDARG $r29, $sp, 88 -+ LDARG $r30, $sp, 96 -+ LDARG $r20, $sp, 104 -+ LDARG $r16, $sp, 112 -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 112 -+ fld.d $f19, $sp, 120 -+ fld.d $f20, $sp, 128 -+ fld.d $f21, $sp, 136 -+#endif -+ addi.d $sp, $sp, 144 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/trsm_kernel_LT.S b/kernel/loongarch64/trsm_kernel_LT.S -new file mode 100644 -index 0000000..aa6822c ---- /dev/null -+++ b/kernel/loongarch64/trsm_kernel_LT.S -@@ -0,0 +1,2854 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define K $r6 -+#define A $r7 -+#define B $r8 -+#define C $r9 -+#define LDC $r10 -+#define OFFSET $r11 -+#define AO $r12 -+#define BO $r13 -+#define I $r17 -+#define J $r18 -+#define L $r29 -+#define CO1 $r14 -+#define CO2 $r15 -+#define CO3 $r23 -+#define CO4 $r24 -+#define CO5 $r25 -+#define CO6 $r26 -+#define CO7 $r27 -+#define CO8 $r28 -+#define KK $r30 -+#define TEMP $r20 -+#define AORIG $r16 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f27 -+#define a4 $f28 -+#define b1 $f23 -+#define b2 $f9 -+#define b3 $f10 -+#define b4 $f11 -+#define b5 $f12 -+#define b6 $f13 -+#define b7 $f14 -+#define b8 $f15 -+#define a5 b8 -+#define c11 $f16 -+#define c12 $f17 -+#define c21 $f3 -+#define c22 $f1 -+#define c31 $f2 -+#define c32 $f4 -+#define c41 $f5 -+#define c42 $f6 -+#define c51 $f7 -+#define c52 $f18 -+#define c61 $f19 -+#define c62 $f20 -+#define c71 $f21 -+#define c72 $f24 -+#define c81 $f25 -+#define c82 $f26 -+#define ALPHA $f0 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -144 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 -+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ SDARG $r28, $sp, 40 -+ fst.d $f24, $sp, 48 -+ fst.d $f25, $sp, 56 -+ fst.d $f26, $sp, 64 -+ fst.d $f27, $sp, 72 -+ fst.d $f28, $sp, 80 -+ SDARG $r29, $sp, 88 -+ SDARG $r30, $sp, 96 -+ SDARG $r20, 
$sp, 104 -+ SDARG $r16, $sp, 112 -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 112 -+ fst.d $f19, $sp, 120 -+ fst.d $f20, $sp, 128 -+ fst.d $f21, $sp, 136 -+#endif -+ slli.d LDC, LDC, BASE_SHIFT -+#ifdef LN -+ mul.w TEMP, M, K -+ slli.d TEMP, TEMP, BASE_SHIFT -+ add.d A, A, TEMP -+ slli.d TEMP, M, BASE_SHIFT -+ add.d C, C, TEMP -+#endif -+#ifdef RN -+ sub.d KK, $r0, OFFSET -+#endif -+#ifdef RT -+ mul.w TEMP, N, K -+ slli.d TEMP, TEMP, BASE_SHIFT -+ add.d B, B, TEMP -+ mul.w TEMP, N, LDC -+ add.d C, C, TEMP -+ sub.d KK, N, OFFSET -+#endif -+ srai.d J, N, 3 -+nop -+ bge $r0, J, .L30 -+.L10: -+#ifdef RT -+ slli.d TEMP, K, 3 + BASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 3 -+ sub.d C, C, TEMP -+#endif -+ move CO1, C -+MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ addi.d J, J, -1 -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ add.d CO5, CO4, LDC -+ MOV c31, c11 -+ add.d CO6, CO5, LDC -+ MOV c41, c11 -+ add.d CO7, CO6, LDC -+ MOV c51, c11 -+ add.d CO8, CO7, LDC -+ srai.d I, M, 1 -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO8, LDC -+#endif -+MOV c61, c11 -+ bge $r0, I, .L20 -+.L11: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, B, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, KK, 2 -+ MOV c32, c11 -+ LD b3, B, 2 * SIZE -+ MOV c42, c11 -+ LD b4, B, 3 * SIZE -+ MOV c52, c11 -+ LD b5, B, 4 * SIZE -+ MOV c62, c11 -+ LD b6, B, 8 * SIZE -+ MOV c72, c11 -+ LD b7, B, 12 * SIZE -+ MOV c82, c11 -+move BO, B -+ bge $r0, L, .L15 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c81, c11 -+ 
LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, TEMP, 2 -+ MOV c32, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c42, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c52, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c62, c11 -+ LD b6, BO, 8 * SIZE -+ MOV c72, c11 -+ LD b7, BO, 12 * SIZE -+ MOV c82, c11 -+ bge $r0, L, .L15 -+#endif -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ bge $r0, L, .L13 -+ .align 3 -+.L12: -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ MADD c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ MADD c61, b2, a4, c61 -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, 
b5, a3, c51 -+ MADD c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD c71, b3, a3, c71 -+ MADD c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ addi.d L, L, -1 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ blt $r0, L, .L12 -+ .align 3 -+ -+.L13: -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ MADD c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, 
b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ MADD c61, b2, a4, c61 -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ MADD c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD c71, b3, a3, c71 -+ MADD c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ .align 3 -+ -+.L15: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L18 -+ .align 3 -+.L16: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD c22, b2, 
a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ addi.d L, L, -1 -+ MADD c61, b2, a1, c61 -+ addi.d AO, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ addi.d BO, BO, 8 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 4 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L16 -+.L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -8 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ LD b5, BO, 4 * SIZE -+ SUB c21, b2, c21 -+ LD b6, BO, 5 * SIZE -+ SUB c31, b3, c31 -+ LD b7, BO, 6 * SIZE -+ SUB c41, b4, c41 -+ LD b8, BO, 7 * SIZE -+ SUB c51, b5, c51 -+ LD b1, BO, 8 * SIZE -+ SUB c61, b6, c61 -+ LD b2, BO, 9 * SIZE -+ SUB c71, b7, c71 -+ LD b3, BO, 10 * SIZE -+ SUB c81, b8, c81 -+ LD b4, BO, 11 * SIZE -+ SUB c12, b1, c12 -+ LD b5, BO, 12 * SIZE -+ SUB c22, b2, c22 -+ LD b6, BO, 13 * SIZE -+ SUB c32, b3, c32 -+ LD b7, BO, 14 * SIZE -+ SUB c42, b4, c42 -+ LD b8, BO, 15 * SIZE -+ SUB c52, b5, c52 -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+#else -+ LD b1, AO, 0 * SIZE -+#endif -+ SUB c62, b6, c62 -+ SUB c72, b7, c72 -+ SUB c82, b8, c82 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ LD b5, AO, 4 * SIZE -+ SUB c12, b2, c12 -+ LD b6, AO, 5 * SIZE -+ SUB c21, b3, c21 -+ LD b7, AO, 6 * SIZE -+ SUB c22, b4, c22 -+ LD b8, AO, 7 * SIZE -+ SUB c31, b5, c31 -+ LD b1, AO, 8 * SIZE -+ SUB c32, b6, c32 -+ LD b2, AO, 9 * SIZE -+ SUB c41, b7, c41 -+ LD b3, AO, 10 * SIZE -+ SUB c42, b8, c42 
-+ LD b4, AO, 11 * SIZE -+ LD b5, AO, 12 * SIZE -+ SUB c51, b1, c51 -+ LD b6, AO, 13 * SIZE -+ SUB c52, b2, c52 -+ LD b7, AO, 14 * SIZE -+ SUB c61, b3, c61 -+ LD b8, AO, 15 * SIZE -+ SUB c62, b4, c62 -+ SUB c71, b5, c71 -+ SUB c72, b6, c72 -+ SUB c81, b7, c81 -+ SUB c82, b8, c82 -+#endif -+#ifdef LN -+ MUL c12, b1, c12 -+ LD b2, AO, 2 * SIZE -+ MUL c22, b1, c22 -+ MUL c32, b1, c32 -+ MUL c42, b1, c42 -+ MUL c52, b1, c52 -+ MUL c62, b1, c62 -+ MUL c72, b1, c72 -+ MUL c82, b1, c82 -+ NMSUB c11, c12, b2, c11 -+ LD b3, AO, 0 * SIZE -+ NMSUB c21, c22, b2, c21 -+ NMSUB c31, c32, b2, c31 -+ NMSUB c41, c42, b2, c41 -+ NMSUB c51, c52, b2, c51 -+ NMSUB c61, c62, b2, c61 -+ NMSUB c71, c72, b2, c71 -+ NMSUB c81, c82, b2, c81 -+ MUL c11, b3, c11 -+ addi.d CO1, CO1, -2 * SIZE -+ MUL c21, b3, c21 -+ addi.d CO2, CO2, -2 * SIZE -+ MUL c31, b3, c31 -+ addi.d CO3, CO3, -2 * SIZE -+ MUL c41, b3, c41 -+ addi.d CO4, CO4, -2 * SIZE -+ MUL c51, b3, c51 -+ addi.d CO5, CO5, -2 * SIZE -+ MUL c61, b3, c61 -+ addi.d CO6, CO6, -2 * SIZE -+ MUL c71, b3, c71 -+ addi.d CO7, CO7, -2 * SIZE -+ MUL c81, b3, c81 -+ addi.d CO8, CO8, -2 * SIZE -+#endif -+#ifdef LT -+ MUL c11, b1, c11 -+ LD b2, AO, 1 * SIZE -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+ MUL c51, b1, c51 -+ MUL c61, b1, c61 -+ MUL c71, b1, c71 -+ MUL c81, b1, c81 -+ NMSUB c12, c11, b2, c12 -+ LD b3, AO, 3 * SIZE -+ NMSUB c22, c21, b2, c22 -+ NMSUB c32, c31, b2, c32 -+ NMSUB c42, c41, b2, c42 -+ NMSUB c52, c51, b2, c52 -+ NMSUB c62, c61, b2, c62 -+ NMSUB c72, c71, b2, c72 -+ NMSUB c82, c81, b2, c82 -+ MUL c12, b3, c12 -+ MUL c22, b3, c22 -+ MUL c32, b3, c32 -+ MUL c42, b3, c42 -+ MUL c52, b3, c52 -+ MUL c62, b3, c62 -+ MUL c72, b3, c72 -+ MUL c82, b3, c82 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+ LD b5, BO, 4 * SIZE -+ NMSUB c21, c11, b2, c21 -+ NMSUB c22, c12, b2, c22 -+ LD b6, BO, 5 * SIZE -+ NMSUB c31, c11, 
b3, c31 -+ NMSUB c32, c12, b3, c32 -+ LD b7, BO, 6 * SIZE -+ NMSUB c41, c11, b4, c41 -+ NMSUB c42, c12, b4, c42 -+ LD b8, BO, 7 * SIZE -+ NMSUB c51, c11, b5, c51 -+ NMSUB c52, c12, b5, c52 -+ LD b2, BO, 9 * SIZE -+ NMSUB c61, c11, b6, c61 -+ NMSUB c62, c12, b6, c62 -+ LD b3, BO, 10 * SIZE -+ NMSUB c71, c11, b7, c71 -+ NMSUB c72, c12, b7, c72 -+ LD b4, BO, 11 * SIZE -+ NMSUB c81, c11, b8, c81 -+ NMSUB c82, c12, b8, c82 -+ LD b5, BO, 12 * SIZE -+ MUL c21, b2, c21 -+ MUL c22, b2, c22 -+ LD b6, BO, 13 * SIZE -+ NMSUB c31, c21, b3, c31 -+ NMSUB c32, c22, b3, c32 -+ LD b7, BO, 14 * SIZE -+ NMSUB c41, c21, b4, c41 -+ NMSUB c42, c22, b4, c42 -+ LD b8, BO, 15 * SIZE -+ NMSUB c51, c21, b5, c51 -+ NMSUB c52, c22, b5, c52 -+ LD b3, BO, 18 * SIZE -+ NMSUB c61, c21, b6, c61 -+ NMSUB c62, c22, b6, c62 -+ LD b4, BO, 19 * SIZE -+ NMSUB c71, c21, b7, c71 -+ NMSUB c72, c22, b7, c72 -+ LD b5, BO, 20 * SIZE -+ NMSUB c81, c21, b8, c81 -+ NMSUB c82, c22, b8, c82 -+ LD b6, BO, 21 * SIZE -+ MUL c31, b3, c31 -+ MUL c32, b3, c32 -+ LD b7, BO, 22 * SIZE -+ NMSUB c41, c31, b4, c41 -+ NMSUB c42, c32, b4, c42 -+ LD b8, BO, 23 * SIZE -+ NMSUB c51, c31, b5, c51 -+ NMSUB c52, c32, b5, c52 -+ LD b4, BO, 27 * SIZE -+ NMSUB c61, c31, b6, c61 -+ NMSUB c62, c32, b6, c62 -+ LD b5, BO, 28 * SIZE -+ NMSUB c71, c31, b7, c71 -+ NMSUB c72, c32, b7, c72 -+ LD b6, BO, 29 * SIZE -+ NMSUB c81, c31, b8, c81 -+ NMSUB c82, c32, b8, c82 -+ LD b7, BO, 30 * SIZE -+ MUL c41, b4, c41 -+ MUL c42, b4, c42 -+ LD b8, BO, 31 * SIZE -+ NMSUB c51, c41, b5, c51 -+ NMSUB c52, c42, b5, c52 -+ LD b5, BO, 36 * SIZE -+ NMSUB c61, c41, b6, c61 -+ NMSUB c62, c42, b6, c62 -+ LD b6, BO, 37 * SIZE -+ NMSUB c71, c41, b7, c71 -+ NMSUB c72, c42, b7, c72 -+ LD b7, BO, 38 * SIZE -+ NMSUB c81, c41, b8, c81 -+ NMSUB c82, c42, b8, c82 -+ LD b8, BO, 39 * SIZE -+ MUL c51, b5, c51 -+ MUL c52, b5, c52 -+ NMSUB c61, c51, b6, c61 -+ NMSUB c62, c52, b6, c62 -+ LD b6, BO, 45 * SIZE -+ NMSUB c71, c51, b7, c71 -+ NMSUB c72, c52, b7, c72 -+ LD b7, BO, 46 * 
SIZE -+ NMSUB c81, c51, b8, c81 -+ NMSUB c82, c52, b8, c82 -+ LD b8, BO, 47 * SIZE -+ MUL c61, b6, c61 -+ MUL c62, b6, c62 -+ NMSUB c71, c61, b7, c71 -+ NMSUB c72, c62, b7, c72 -+ LD b7, BO, 54 * SIZE -+ NMSUB c81, c61, b8, c81 -+ NMSUB c82, c62, b8, c82 -+ LD b8, BO, 55 * SIZE -+ MUL c71, b7, c71 -+ MUL c72, b7, c72 -+ NMSUB c81, c71, b8, c81 -+ NMSUB c82, c72, b8, c82 -+ LD b8, BO, 63 * SIZE -+ MUL c81, b8, c81 -+ MUL c82, b8, c82 -+#endif -+#ifdef RT -+ LD b1, BO, 63 * SIZE -+ LD b2, BO, 62 * SIZE -+ LD b3, BO, 61 * SIZE -+ LD b4, BO, 60 * SIZE -+ MUL c81, b1, c81 -+ MUL c82, b1, c82 -+ LD b5, BO, 59 * SIZE -+ NMSUB c71, c81, b2, c71 -+ NMSUB c72, c82, b2, c72 -+ LD b6, BO, 58 * SIZE -+ NMSUB c61, c81, b3, c61 -+ NMSUB c62, c82, b3, c62 -+ LD b7, BO, 57 * SIZE -+ NMSUB c51, c81, b4, c51 -+ NMSUB c52, c82, b4, c52 -+ LD b8, BO, 56 * SIZE -+ NMSUB c41, c81, b5, c41 -+ NMSUB c42, c82, b5, c42 -+ LD b2, BO, 54 * SIZE -+ NMSUB c31, c81, b6, c31 -+ NMSUB c32, c82, b6, c32 -+ LD b3, BO, 53 * SIZE -+ NMSUB c21, c81, b7, c21 -+ NMSUB c22, c82, b7, c22 -+ LD b4, BO, 52 * SIZE -+ NMSUB c11, c81, b8, c11 -+ NMSUB c12, c82, b8, c12 -+ LD b5, BO, 51 * SIZE -+ MUL c71, b2, c71 -+ MUL c72, b2, c72 -+ LD b6, BO, 50 * SIZE -+ NMSUB c61, c71, b3, c61 -+ NMSUB c62, c72, b3, c62 -+ LD b7, BO, 49 * SIZE -+ NMSUB c51, c71, b4, c51 -+ NMSUB c52, c72, b4, c52 -+ LD b8, BO, 48 * SIZE -+ NMSUB c41, c71, b5, c41 -+ NMSUB c42, c72, b5, c42 -+ LD b3, BO, 45 * SIZE -+ NMSUB c31, c71, b6, c31 -+ NMSUB c32, c72, b6, c32 -+ LD b4, BO, 44 * SIZE -+ NMSUB c21, c71, b7, c21 -+ NMSUB c22, c72, b7, c22 -+ LD b5, BO, 43 * SIZE -+ NMSUB c11, c71, b8, c11 -+ NMSUB c12, c72, b8, c12 -+ LD b6, BO, 42 * SIZE -+ MUL c61, b3, c61 -+ MUL c62, b3, c62 -+ LD b7, BO, 41 * SIZE -+ NMSUB c51, c61, b4, c51 -+ NMSUB c52, c62, b4, c52 -+ LD b8, BO, 40 * SIZE -+ NMSUB c41, c61, b5, c41 -+ NMSUB c42, c62, b5, c42 -+ LD b4, BO, 36 * SIZE -+ NMSUB c31, c61, b6, c31 -+ NMSUB c32, c62, b6, c32 -+ LD b5, BO, 35 * SIZE -+ 
NMSUB c21, c61, b7, c21 -+ NMSUB c22, c62, b7, c22 -+ LD b6, BO, 34 * SIZE -+ NMSUB c11, c61, b8, c11 -+ NMSUB c12, c62, b8, c12 -+ LD b7, BO, 33 * SIZE -+ MUL c51, b4, c51 -+ MUL c52, b4, c52 -+ LD b8, BO, 32 * SIZE -+ NMSUB c41, c51, b5, c41 -+ NMSUB c42, c52, b5, c42 -+ LD b5, BO, 27 * SIZE -+ NMSUB c31, c51, b6, c31 -+ NMSUB c32, c52, b6, c32 -+ LD b6, BO, 26 * SIZE -+ NMSUB c21, c51, b7, c21 -+ NMSUB c22, c52, b7, c22 -+ LD b7, BO, 25 * SIZE -+ NMSUB c11, c51, b8, c11 -+ NMSUB c12, c52, b8, c12 -+ LD b8, BO, 24 * SIZE -+ MUL c41, b5, c41 -+ MUL c42, b5, c42 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c32, c42, b6, c32 -+ LD b6, BO, 18 * SIZE -+ NMSUB c21, c41, b7, c21 -+ NMSUB c22, c42, b7, c22 -+ LD b7, BO, 17 * SIZE -+ NMSUB c11, c41, b8, c11 -+ NMSUB c12, c42, b8, c12 -+ LD b8, BO, 16 * SIZE -+ MUL c31, b6, c31 -+ MUL c32, b6, c32 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c22, c32, b7, c22 -+ LD b7, BO, 9 * SIZE -+ NMSUB c11, c31, b8, c11 -+ NMSUB c12, c32, b8, c12 -+ LD b8, BO, 8 * SIZE -+ MUL c21, b7, c21 -+ MUL c22, b7, c22 -+ NMSUB c11, c21, b8, c11 -+ NMSUB c12, c22, b8, c12 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+ MUL c12, b8, c12 -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+ ST c51, BO, 4 * SIZE -+ ST c61, BO, 5 * SIZE -+ ST c71, BO, 6 * SIZE -+ ST c81, BO, 7 * SIZE -+ ST c12, BO, 8 * SIZE -+ ST c22, BO, 9 * SIZE -+ ST c32, BO, 10 * SIZE -+ ST c42, BO, 11 * SIZE -+ ST c52, BO, 12 * SIZE -+ ST c62, BO, 13 * SIZE -+ ST c72, BO, 14 * SIZE -+ ST c82, BO, 15 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c21, AO, 2 * SIZE -+ ST c22, AO, 3 * SIZE -+ ST c31, AO, 4 * SIZE -+ ST c32, AO, 5 * SIZE -+ ST c41, AO, 6 * SIZE -+ ST c42, AO, 7 * SIZE -+ ST c51, AO, 8 * SIZE -+ ST c52, AO, 9 * SIZE -+ ST c61, AO, 10 * SIZE -+ ST c62, AO, 11 * SIZE -+ ST c71, AO, 12 * SIZE -+ ST c72, AO, 13 * SIZE -+ ST c81, AO, 14 * SIZE -+ ST c82, AO, 15 * SIZE -+#endif -+ ST 
c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c22, CO2, 1 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c32, CO3, 1 * SIZE -+ ST c41, CO4, 0 * SIZE -+ ST c42, CO4, 1 * SIZE -+ ST c51, CO5, 0 * SIZE -+ ST c52, CO5, 1 * SIZE -+ ST c61, CO6, 0 * SIZE -+ ST c62, CO6, 1 * SIZE -+ ST c71, CO7, 0 * SIZE -+ ST c72, CO7, 1 * SIZE -+ ST c81, CO8, 0 * SIZE -+ ST c82, CO8, 1 * SIZE -+MTC a1, $r0 -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+ addi.d CO2, CO2, 2 * SIZE -+ addi.d CO3, CO3, 2 * SIZE -+ addi.d CO4, CO4, 2 * SIZE -+ addi.d CO5, CO5, 2 * SIZE -+ addi.d CO6, CO6, 2 * SIZE -+ addi.d CO7, CO7, 2 * SIZE -+ addi.d CO8, CO8, 2 * SIZE -+#endif -+ MOV c11, a1 -+ MOV c21, a1 -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+ MOV c31, a1 -+ MOV c41, a1 -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+ addi.d I, I, -1 -+ MOV c51, a1 -+MOV c61, a1 -+ blt $r0, I, .L11 -+ .align 3 -+ -+.L20: -+ andi I, M, 1 -+ MOV c61, c11 -+MOV c71, c11 -+ bge $r0, I, .L29 -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, KK, 2 -+ MOV c81, c11 -+move BO, B -+ bge $r0, L, .L25 -+#else -+#ifdef LN -+ slli.d TEMP, K, 0 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE 
-+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ srai.d L, TEMP, 2 -+ MOV c81, c11 -+ bge $r0, L, .L25 -+#endif -+ .align 3 -+.L22: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, BO, 20 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 9 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 10 * SIZE -+ MADD c81, b4, a1, c81 -+ LD b4, BO, 11 * SIZE -+ LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a2, c51 -+ LD b7, BO, 28 * SIZE -+ MADD c61, b2, a2, c61 -+ LD b2, BO, 17 * SIZE -+ MADD c71, b3, a2, c71 -+ LD b3, BO, 18 * SIZE -+ MADD c81, b4, a2, c81 -+ LD b4, BO, 19 * SIZE -+ LD a2, AO, 5 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 32 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 21 * SIZE -+ MADD c31, b3, a3, c31 -+ LD b3, BO, 22 * SIZE -+ MADD c41, b4, a3, c41 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ LD b5, BO, 36 * SIZE -+ MADD c61, b2, a3, c61 -+ LD b2, BO, 25 * SIZE -+ MADD c71, b3, a3, c71 -+ LD b3, BO, 26 * SIZE -+ MADD c81, b4, a3, c81 -+ LD b4, BO, 27 * SIZE -+ LD a3, AO, 2 * SIZE -+ addi.d BO, BO, 32 * SIZE -+ MADD c11, b6, a4, c11 -+ LD b6, BO, 8 * SIZE -+ MADD c21, b2, a4, c21 -+ LD b2, BO, -3 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, -2 * SIZE -+ MADD c41, b4, a4, c41 -+ LD b4, BO, -1 * SIZE -+ MADD c51, b7, a4, c51 -+ LD b7, BO, 12 * SIZE -+ MADD c61, b2, a4, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a4, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a4, c81 -+ LD b4, BO, 3 * SIZE -+ LD a4, AO, 3 * SIZE -+ blt $r0, L, .L22 -+ .align 3 -+ -+.L25: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, 
L, .L28 -+ .align 3 -+.L26: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ MOV a2, a2 -+ addi.d AO, AO, 1 * SIZE -+ addi.d BO, BO, 8 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, BO, 4 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L26 -+.L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -8 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+ SUB c51, b5, c51 -+ SUB c61, b6, c61 -+ SUB c71, b7, c71 -+ SUB c81, b8, c81 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ LD b5, AO, 4 * SIZE -+ LD b6, AO, 5 * SIZE -+ LD b7, AO, 6 * SIZE -+ LD b8, AO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+ SUB c51, b5, c51 -+ SUB c61, b6, c61 -+ SUB c71, b7, c71 -+ SUB c81, b8, c81 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+ MUL c51, b1, c51 -+ MUL c61, b1, c61 -+ MUL c71, b1, c71 -+ MUL c81, b1, c81 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c21, c11, b2, c21 -+ NMSUB 
c31, c11, b3, c31 -+ NMSUB c41, c11, b4, c41 -+ NMSUB c51, c11, b5, c51 -+ NMSUB c61, c11, b6, c61 -+ NMSUB c71, c11, b7, c71 -+ NMSUB c81, c11, b8, c81 -+ LD b2, BO, 9 * SIZE -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ LD b5, BO, 12 * SIZE -+ LD b6, BO, 13 * SIZE -+ LD b7, BO, 14 * SIZE -+ LD b8, BO, 15 * SIZE -+ MUL c21, b2, c21 -+ NMSUB c31, c21, b3, c31 -+ NMSUB c41, c21, b4, c41 -+ NMSUB c51, c21, b5, c51 -+ NMSUB c61, c21, b6, c61 -+ NMSUB c71, c21, b7, c71 -+ NMSUB c81, c21, b8, c81 -+ LD b3, BO, 18 * SIZE -+ LD b4, BO, 19 * SIZE -+ LD b5, BO, 20 * SIZE -+ LD b6, BO, 21 * SIZE -+ LD b7, BO, 22 * SIZE -+ LD b8, BO, 23 * SIZE -+ MUL c31, b3, c31 -+ NMSUB c41, c31, b4, c41 -+ NMSUB c51, c31, b5, c51 -+ NMSUB c61, c31, b6, c61 -+ NMSUB c71, c31, b7, c71 -+ NMSUB c81, c31, b8, c81 -+ LD b4, BO, 27 * SIZE -+ LD b5, BO, 28 * SIZE -+ LD b6, BO, 29 * SIZE -+ LD b7, BO, 30 * SIZE -+ LD b8, BO, 31 * SIZE -+ MUL c41, b4, c41 -+ NMSUB c51, c41, b5, c51 -+ NMSUB c61, c41, b6, c61 -+ NMSUB c71, c41, b7, c71 -+ NMSUB c81, c41, b8, c81 -+ LD b5, BO, 36 * SIZE -+ LD b6, BO, 37 * SIZE -+ LD b7, BO, 38 * SIZE -+ LD b8, BO, 39 * SIZE -+ MUL c51, b5, c51 -+ NMSUB c61, c51, b6, c61 -+ NMSUB c71, c51, b7, c71 -+ NMSUB c81, c51, b8, c81 -+ LD b6, BO, 45 * SIZE -+ LD b7, BO, 46 * SIZE -+ LD b8, BO, 47 * SIZE -+ MUL c61, b6, c61 -+ NMSUB c71, c61, b7, c71 -+ NMSUB c81, c61, b8, c81 -+ LD b7, BO, 54 * SIZE -+ LD b8, BO, 55 * SIZE -+ MUL c71, b7, c71 -+ NMSUB c81, c71, b8, c81 -+ LD b8, BO, 63 * SIZE -+ MUL c81, b8, c81 -+#endif -+#ifdef RT -+ LD b1, BO, 63 * SIZE -+ LD b2, BO, 62 * SIZE -+ LD b3, BO, 61 * SIZE -+ LD b4, BO, 60 * SIZE -+ LD b5, BO, 59 * SIZE -+ LD b6, BO, 58 * SIZE -+ LD b7, BO, 57 * SIZE -+ LD b8, BO, 56 * SIZE -+ MUL c81, b1, c81 -+ NMSUB c71, c81, b2, c71 -+ NMSUB c61, c81, b3, c61 -+ NMSUB c51, c81, b4, c51 -+ NMSUB c41, c81, b5, c41 -+ NMSUB c31, c81, b6, c31 -+ NMSUB c21, c81, b7, c21 -+ NMSUB c11, c81, b8, c11 -+ LD b2, BO, 54 * SIZE -+ LD b3, BO, 53 * 
SIZE -+ LD b4, BO, 52 * SIZE -+ LD b5, BO, 51 * SIZE -+ LD b6, BO, 50 * SIZE -+ LD b7, BO, 49 * SIZE -+ LD b8, BO, 48 * SIZE -+ MUL c71, b2, c71 -+ NMSUB c61, c71, b3, c61 -+ NMSUB c51, c71, b4, c51 -+ NMSUB c41, c71, b5, c41 -+ NMSUB c31, c71, b6, c31 -+ NMSUB c21, c71, b7, c21 -+ NMSUB c11, c71, b8, c11 -+ LD b3, BO, 45 * SIZE -+ LD b4, BO, 44 * SIZE -+ LD b5, BO, 43 * SIZE -+ LD b6, BO, 42 * SIZE -+ LD b7, BO, 41 * SIZE -+ LD b8, BO, 40 * SIZE -+ MUL c61, b3, c61 -+ NMSUB c51, c61, b4, c51 -+ NMSUB c41, c61, b5, c41 -+ NMSUB c31, c61, b6, c31 -+ NMSUB c21, c61, b7, c21 -+ NMSUB c11, c61, b8, c11 -+ LD b4, BO, 36 * SIZE -+ LD b5, BO, 35 * SIZE -+ LD b6, BO, 34 * SIZE -+ LD b7, BO, 33 * SIZE -+ LD b8, BO, 32 * SIZE -+ MUL c51, b4, c51 -+ NMSUB c41, c51, b5, c41 -+ NMSUB c31, c51, b6, c31 -+ NMSUB c21, c51, b7, c21 -+ NMSUB c11, c51, b8, c11 -+ LD b5, BO, 27 * SIZE -+ LD b6, BO, 26 * SIZE -+ LD b7, BO, 25 * SIZE -+ LD b8, BO, 24 * SIZE -+ MUL c41, b5, c41 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c21, c41, b7, c21 -+ NMSUB c11, c41, b8, c11 -+ LD b6, BO, 18 * SIZE -+ LD b7, BO, 17 * SIZE -+ LD b8, BO, 16 * SIZE -+ MUL c31, b6, c31 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c11, c31, b8, c11 -+ LD b7, BO, 9 * SIZE -+ LD b8, BO, 8 * SIZE -+ MUL c21, b7, c21 -+ NMSUB c11, c21, b8, c11 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+ addi.d CO2, CO2, -1 * SIZE -+ addi.d CO3, CO3, -1 * SIZE -+ addi.d CO4, CO4, -1 * SIZE -+ addi.d CO5, CO5, -1 * SIZE -+ addi.d CO6, CO6, -1 * SIZE -+ addi.d CO7, CO7, -1 * SIZE -+ addi.d CO8, CO8, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+ ST c51, BO, 4 * SIZE -+ ST c61, BO, 5 * SIZE -+ ST c71, BO, 6 * SIZE -+ ST c81, BO, 7 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c21, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c41, AO, 3 * SIZE -+ ST c51, AO, 4 * SIZE -+ ST c61, AO, 5 * SIZE -+ ST c71, AO, 6 * 
SIZE -+ ST c81, AO, 7 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c41, CO4, 0 * SIZE -+ ST c51, CO5, 0 * SIZE -+ ST c61, CO6, 0 * SIZE -+ ST c71, CO7, 0 * SIZE -+ ST c81, CO8, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+ addi.d CO2, CO2, 1 * SIZE -+ addi.d CO3, CO3, 1 * SIZE -+ addi.d CO4, CO4, 1 * SIZE -+ addi.d CO5, CO5, 1 * SIZE -+ addi.d CO6, CO6, 1 * SIZE -+ addi.d CO7, CO7, 1 * SIZE -+ addi.d CO8, CO8, 1 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L29: -+#ifdef LN -+ slli.d TEMP, K, 3 + BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 8 -+#endif -+#ifdef RT -+ addi.d KK, KK, -8 -+#endif -+ blt $r0, J, .L10 -+ .align 3 -+ -+.L30: -+ andi J, N, 4 -+move AO, A -+ bge $r0, J, .L50 -+#ifdef RT -+ slli.d TEMP, K, 2 + BASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 2 -+ sub.d C, C, TEMP -+#endif -+ move CO1, C -+MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ srai.d I, M, 1 -+ MOV c31, c11 -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO4, LDC -+#endif -+MOV c41, c11 -+ bge $r0, I, .L40 -+.L31: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ LD a3, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ MOV c32, c11 -+ LD b4, B, 3 * SIZE -+ MOV c42, c11 -+ LD b5, B, 4 * SIZE -+ srai.d L, KK, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move 
BO, B -+ bge $r0, L, .L35 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ LD a3, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c32, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c42, c11 -+ LD b5, BO, 4 * SIZE -+ srai.d L, TEMP, 2 -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L35 -+#endif -+ .align 3 -+.L32: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c11, b5, a1, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 8 * SIZE -+ MADD c12, b5, a2, c12 -+ LD b5, BO, 20 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 9 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 10 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ LD a3, AO, 6 * SIZE -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c11, b7, a3, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a3, c21 -+ addi.d AO, AO, 8 * SIZE -+ MADD c31, b3, a3, c31 -+ addi.d BO, BO, 16 * SIZE -+ MADD c41, b4, a3, c41 -+ LD a3, AO, 4 * SIZE -+ MADD c12, b7, a2, c12 -+ LD b7, BO, 12 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 1 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 2 * SIZE -+ MADD c42, b4, a2, 
c42 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L32 -+ .align 3 -+ -+.L35: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L38 -+ .align 3 -+.L36: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ addi.d AO, AO, 2 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 0 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 4 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L36 -+.L38: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -4 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+ SUB c12, b5, c12 -+ SUB c22, b6, c22 -+ SUB c32, b7, c32 -+ SUB c42, b8, c42 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ LD b5, AO, 4 * SIZE -+ LD b6, AO, 5 * SIZE -+ LD b7, AO, 6 * SIZE -+ LD b8, AO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c21, b3, c21 -+ SUB c22, b4, c22 -+ SUB c31, b5, c31 -+ SUB c32, b6, c32 -+ SUB c41, b7, c41 -+ SUB c42, b8, c42 -+#endif -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+ LD b2, AO, 2 * SIZE -+ LD b3, AO, 0 * SIZE -+ MUL c12, b1, c12 -+ MUL c22, b1, c22 -+ MUL c32, b1, c32 -+ MUL c42, b1, c42 -+ NMSUB c11, c12, b2, c11 -+ NMSUB c21, c22, b2, c21 -+ NMSUB c31, c32, b2, c31 -+ NMSUB c41, c42, b2, c41 -+ MUL c11, b3, c11 -+ MUL c21, b3, c21 -+ MUL c31, b3, c31 -+ MUL c41, b3, c41 -+#endif -+#ifdef LT -+ LD b1, AO, 0 * SIZE -+ 
LD b2, AO, 1 * SIZE -+ LD b3, AO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+ NMSUB c12, c11, b2, c12 -+ NMSUB c22, c21, b2, c22 -+ NMSUB c32, c31, b2, c32 -+ NMSUB c42, c41, b2, c42 -+ MUL c12, b3, c12 -+ MUL c22, b3, c22 -+ MUL c32, b3, c32 -+ MUL c42, b3, c42 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c22, c12, b2, c22 -+ NMSUB c31, c11, b3, c31 -+ NMSUB c32, c12, b3, c32 -+ NMSUB c41, c11, b4, c41 -+ NMSUB c42, c12, b4, c42 -+ LD b2, BO, 5 * SIZE -+ LD b3, BO, 6 * SIZE -+ LD b4, BO, 7 * SIZE -+ MUL c21, b2, c21 -+ MUL c22, b2, c22 -+ NMSUB c31, c21, b3, c31 -+ NMSUB c32, c22, b3, c32 -+ NMSUB c41, c21, b4, c41 -+ NMSUB c42, c22, b4, c42 -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ MUL c31, b3, c31 -+ MUL c32, b3, c32 -+ NMSUB c41, c31, b4, c41 -+ NMSUB c42, c32, b4, c42 -+ LD b4, BO, 15 * SIZE -+ MUL c41, b4, c41 -+ MUL c42, b4, c42 -+#endif -+#ifdef RT -+ LD b5, BO, 15 * SIZE -+ LD b6, BO, 14 * SIZE -+ LD b7, BO, 13 * SIZE -+ LD b8, BO, 12 * SIZE -+ MUL c41, b5, c41 -+ MUL c42, b5, c42 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c32, c42, b6, c32 -+ NMSUB c21, c41, b7, c21 -+ NMSUB c22, c42, b7, c22 -+ NMSUB c11, c41, b8, c11 -+ NMSUB c12, c42, b8, c12 -+ LD b6, BO, 10 * SIZE -+ LD b7, BO, 9 * SIZE -+ LD b8, BO, 8 * SIZE -+ MUL c31, b6, c31 -+ MUL c32, b6, c32 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c22, c32, b7, c22 -+ NMSUB c11, c31, b8, c11 -+ NMSUB c12, c32, b8, c12 -+ LD b7, BO, 5 * SIZE -+ LD b8, BO, 4 * SIZE -+ MUL c21, b7, c21 -+ MUL c22, b7, c22 -+ NMSUB c11, c21, b8, c11 -+ NMSUB c12, c22, b8, c12 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+ MUL c12, b8, c12 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -2 * SIZE -+ addi.d CO2, CO2, -2 * SIZE -+ addi.d CO3, CO3, -2 * SIZE -+ addi.d CO4, CO4, -2 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ 
ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+ ST c12, BO, 4 * SIZE -+ ST c22, BO, 5 * SIZE -+ ST c32, BO, 6 * SIZE -+ ST c42, BO, 7 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c21, AO, 2 * SIZE -+ ST c22, AO, 3 * SIZE -+ ST c31, AO, 4 * SIZE -+ ST c32, AO, 5 * SIZE -+ ST c41, AO, 6 * SIZE -+ ST c42, AO, 7 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c22, CO2, 1 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c32, CO3, 1 * SIZE -+ ST c41, CO4, 0 * SIZE -+ ST c42, CO4, 1 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+ addi.d CO2, CO2, 2 * SIZE -+ addi.d CO3, CO3, 2 * SIZE -+ addi.d CO4, CO4, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+MTC a1, $r0 -+ MOV c11, a1 -+ MOV c21, a1 -+ MOV c31, a1 -+ addi.d I, I, -1 -+MOV c41, c11 -+ blt $r0, I, .L31 -+ .align 3 -+ -+.L40: -+ andi I, M, 1 -+MOV c61, c11 -+ bge $r0, I, .L49 -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c81, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, KK, 2 -+move BO, B -+ bge $r0, L, .L45 -+#else -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c81, c11 -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD 
b7, BO, 12 * SIZE -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L45 -+#endif -+ .align 3 -+.L42: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD c11, b5, a2, c11 -+ LD b5, BO, 20 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 10 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 11 * SIZE -+ LD a2, AO, 2 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * SIZE -+ LD a2, AO, -1 * SIZE -+ addi.d BO, BO, 16 * SIZE -+ MADD c11, b7, a2, c11 -+ LD b7, BO, 12 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 1 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 2 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 3 * SIZE -+ LD a2, AO, 1 * SIZE -+ blt $r0, L, .L42 -+ .align 3 -+ -+.L45: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L48 -+ .align 3 -+.L46: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 1 * SIZE -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+ MOV a2, a2 -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L46 -+.L48: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -4 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+#else -+ LD b1, AO, 0 * SIZE -+ 
LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c31, c11, b3, c31 -+ NMSUB c41, c11, b4, c41 -+ LD b2, BO, 5 * SIZE -+ LD b3, BO, 6 * SIZE -+ LD b4, BO, 7 * SIZE -+ MUL c21, b2, c21 -+ NMSUB c31, c21, b3, c31 -+ NMSUB c41, c21, b4, c41 -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ MUL c31, b3, c31 -+ NMSUB c41, c31, b4, c41 -+ LD b4, BO, 15 * SIZE -+ MUL c41, b4, c41 -+#endif -+#ifdef RT -+ LD b5, BO, 15 * SIZE -+ LD b6, BO, 14 * SIZE -+ LD b7, BO, 13 * SIZE -+ LD b8, BO, 12 * SIZE -+ MUL c41, b5, c41 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c21, c41, b7, c21 -+ NMSUB c11, c41, b8, c11 -+ LD b6, BO, 10 * SIZE -+ LD b7, BO, 9 * SIZE -+ LD b8, BO, 8 * SIZE -+ MUL c31, b6, c31 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c11, c31, b8, c11 -+ LD b7, BO, 5 * SIZE -+ LD b8, BO, 4 * SIZE -+ MUL c21, b7, c21 -+ NMSUB c11, c21, b8, c11 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+ addi.d CO2, CO2, -1 * SIZE -+ addi.d CO3, CO3, -1 * SIZE -+ addi.d CO4, CO4, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c21, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c41, AO, 3 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c41, CO4, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+ addi.d CO2, CO2, 1 * SIZE -+ addi.d CO3, CO3, 1 * SIZE -+ addi.d CO4, CO4, 1 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if 
defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L49: -+#ifdef LN -+ slli.d TEMP, K, 2 + BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 4 -+#endif -+#ifdef RT -+ addi.d KK, KK, -4 -+#endif -+ .align 3 -+ -+.L50: -+ andi J, N, 2 -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+#else -+ move AO, A -+#endif -+ bge $r0, J, .L70 -+#ifdef RT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 1 -+ sub.d C, C, TEMP -+#endif -+ move AO, A -+ move CO1, C -+ add.d CO2, C, LDC -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO2, LDC -+#endif -+ srai.d I, M, 1 -+ bge $r0, I, .L60 -+.L51: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, KK, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L55 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ LD b5, BO, 4 * SIZE -+ srai.d L, TEMP, 2 -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L55 -+#endif -+ .align 3 -+.L52: -+ MADD c11, b1, a1, c11 -+ LD a3, AO, 2 * SIZE -+ MADD c21, b2, a1, c21 
-+ LD b4, BO, 3 * SIZE -+ MADD c12, b1, a2, c12 -+ LD a4, AO, 3 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b1, BO, 8 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a1, AO, 8 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 5 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 6 * SIZE -+ MADD c11, b5, a5, c11 -+ LD a3, AO, 6 * SIZE -+ MADD c21, b2, a5, c21 -+ LD b4, BO, 7 * SIZE -+ MADD c12, b5, a2, c12 -+ LD a4, AO, 7 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b5, BO, 12 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a5, AO, 12 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 9 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 10 * SIZE -+ addi.d AO, AO, 8 * SIZE -+ addi.d L, L, -1 -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L52 -+ .align 3 -+ -+.L55: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L58 -+ .align 3 -+.L56: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 3 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L56 -+.L58: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -2 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c12, b3, c12 -+ SUB c22, b4, c22 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c21, b3, c21 -+ SUB c22, b4, c22 -+#endif -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+ LD b2, AO, 2 * SIZE -+ LD b3, AO, 0 * SIZE -+ MUL c12, b1, c12 -+ MUL c22, b1, c22 -+ NMSUB c11, c12, b2, 
c11 -+ NMSUB c21, c22, b2, c21 -+ MUL c11, b3, c11 -+ MUL c21, b3, c21 -+#endif -+#ifdef LT -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ NMSUB c12, c11, b2, c12 -+ NMSUB c22, c21, b2, c22 -+ MUL c12, b3, c12 -+ MUL c22, b3, c22 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c22, c12, b2, c22 -+ MUL c21, b3, c21 -+ MUL c22, b3, c22 -+#endif -+#ifdef RT -+ LD b1, BO, 3 * SIZE -+ LD b2, BO, 2 * SIZE -+ LD b3, BO, 0 * SIZE -+ MUL c21, b1, c21 -+ MUL c22, b1, c22 -+ NMSUB c11, c21, b2, c11 -+ NMSUB c12, c22, b2, c12 -+ MUL c11, b3, c11 -+ MUL c12, b3, c12 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -2 * SIZE -+ addi.d CO2, CO2, -2 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c12, BO, 2 * SIZE -+ ST c22, BO, 3 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c21, AO, 2 * SIZE -+ ST c22, AO, 3 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c22, CO2, 1 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+ addi.d CO2, CO2, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AO, TEMP -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+MTC a1, $r0 -+ MOV c11, a1 -+ MOV c21, a1 -+ MOV c31, a1 -+ addi.d I, I, -1 -+MOV c41, c11 -+ blt $r0, I, .L51 -+ .align 3 -+ -+.L60: -+ andi I, M, 1 -+ bge $r0, I, .L69 -+#if defined(LT) || defined(RN) -+ srai.d L, KK, 2 -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ MOV c31, c11 -+ LD a4, AO, 3 * SIZE -+ MOV c41, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, 
B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L65 -+#else -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ srai.d L, TEMP, 2 -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ MOV c31, c11 -+ LD a4, AO, 3 * SIZE -+ MOV c41, c11 -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L65 -+#endif -+ .align 3 -+.L62: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 7 * SIZE -+ LD a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, 10 * SIZE -+ MADD c41, b4, a4, c41 -+ LD b4, BO, 11 * SIZE -+ LD a3, AO, 6 * SIZE -+ LD a4, AO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L62 -+ .align 3 -+ -+.L65: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L68 -+ .align 3 -+.L66: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 2 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 3 * SIZE -+ LD a1, AO, 1 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L66 -+.L68: -+ ADD c11, c11, c31 -+ ADD c21, c21, c41 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -2 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 
* SIZE -+ LD b2, BO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+#endif -+#if defined(LN) || defined(LT) -+ LD b3, AO, 0 * SIZE -+ MUL c11, b3, c11 -+ MUL c21, b3, c21 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c21, c11, b2, c21 -+ MUL c21, b3, c21 -+#endif -+#ifdef RT -+ LD b1, BO, 3 * SIZE -+ LD b2, BO, 2 * SIZE -+ LD b3, BO, 0 * SIZE -+ MUL c21, b1, c21 -+ NMSUB c11, c21, b2, c11 -+ MUL c11, b3, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+ addi.d CO2, CO2, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c21, AO, 1 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+ addi.d CO2, CO2, 1 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 0 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L69: -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 2 -+#endif -+#ifdef RT -+ addi.d KK, KK, -2 -+#endif -+ .align 3 -+ -+.L70: -+ andi J, N, 1 -+ bge $r0, J, .L999 -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d B, B, TEMP -+ sub.d C, C, LDC -+#endif -+ move AO, A -+ move CO1, C -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO1, LDC -+#endif -+ srai.d I, M, 1 -+ bge $r0, I, .L80 -+.L71: -+#if defined(LT) || defined(RN) -+ LD 
a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, KK, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L75 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 0 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ LD b5, BO, 4 * SIZE -+ srai.d L, TEMP, 2 -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L75 -+#endif -+ .align 3 -+.L72: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 2 * SIZE -+ LD a2, AO, 3 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 6 * SIZE -+ LD a2, AO, 7 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 8 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L72 -+ .align 3 -+ -+.L75: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L78 -+ .align 3 -+.L76: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L76 -+.L78: -+ ADD c11, c11, c21 -+ ADD c12, c12, c22 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -1 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ 
slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#endif -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+ LD b2, AO, 2 * SIZE -+ LD b3, AO, 0 * SIZE -+ MUL c12, b1, c12 -+ NMSUB c11, c12, b2, c11 -+ MUL c11, b3, c11 -+#endif -+#ifdef LT -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 3 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c12, c11, b2, c12 -+ MUL c12, b3, c12 -+#endif -+#if defined(RN) || defined(RT) -+ LD b1, BO, 0 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -2 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c12, BO, 1 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+ addi.d I, I, -1 -+ blt $r0, I, .L71 -+ .align 3 -+ -+.L80: -+ andi I, M, 1 -+ bge $r0, I, .L89 -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, KK, 2 -+move BO, B -+ bge $r0, L, .L85 -+#else -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d TEMP, KK, BASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, TEMP -+ sub.d 
TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L85 -+#endif -+ .align 3 -+.L82: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 1 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c21, b1, a1, c21 -+ LD a1, AO, 2 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 3 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c21, b1, a1, c21 -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L82 -+ .align 3 -+ -+.L85: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L88 -+ .align 3 -+.L86: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L86 -+.L88: -+ ADD c11, c11, c21 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -1 -+#endif -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ SUB c11, b1, c11 -+#else -+ LD b1, AO, 0 * SIZE -+ SUB c11, b1, c11 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ MUL c11, b1, c11 -+#endif -+#if defined(RN) || defined(RT) -+ LD b1, BO, 0 * SIZE -+ MUL c11, b1, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, 
AO, TEMP -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L89: -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 1 -+#endif -+#ifdef RT -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 16 -+ LDARG $r26, $sp, 24 -+ LDARG $r27, $sp, 32 -+ LDARG $r28, $sp, 40 -+ fld.d $f24, $sp, 48 -+ fld.d $f25, $sp, 56 -+ fld.d $f26, $sp, 64 -+ fld.d $f27, $sp, 72 -+ fld.d $f28, $sp, 80 -+ LDARG $r29, $sp, 88 -+ LDARG $r30, $sp, 96 -+ LDARG $r20, $sp, 104 -+ LDARG $r16, $sp, 112 -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 112 -+ fld.d $f19, $sp, 120 -+ fld.d $f20, $sp, 128 -+ fld.d $f21, $sp, 136 -+#endif -+ addi.d $sp, $sp, 144 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/trsm_kernel_RT.S b/kernel/loongarch64/trsm_kernel_RT.S -new file mode 100644 -index 0000000..c86d9c1 ---- /dev/null -+++ b/kernel/loongarch64/trsm_kernel_RT.S -@@ -0,0 +1,2850 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define K $r6 -+#define A $r7 -+#define B $r8 -+#define C $r9 -+#define LDC $r10 -+#define OFFSET $r11 -+#define AO $r12 -+#define BO $r13 -+#define I $r17 -+#define J $r18 -+#define L $r29 -+#define CO1 $r14 -+#define CO2 $r15 -+#define CO3 $r23 -+#define CO4 $r24 -+#define CO5 $r25 -+#define CO6 $r26 -+#define CO7 $r27 -+#define CO8 $r28 -+#define KK $r30 -+#define TEMP $r20 -+#define AORIG $r16 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f27 -+#define a4 $f28 -+#define b1 $f23 -+#define b2 $f9 -+#define b3 $f10 -+#define b4 $f11 -+#define b5 $f12 -+#define b6 $f13 -+#define b7 $f14 -+#define b8 $f15 -+#define a5 b8 -+#define c11 $f16 -+#define c12 $f17 -+#define c21 $f3 -+#define c22 $f1 -+#define c31 $f2 -+#define c32 $f4 -+#define c41 $f5 -+#define c42 $f6 -+#define c51 $f7 -+#define c52 $f18 -+#define c61 $f19 -+#define c62 $f20 -+#define c71 $f21 -+#define c72 $f24 -+#define c81 $f25 -+#define c82 $f26 -+#define ALPHA $f0 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -144 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 
-+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ SDARG $r28, $sp, 40 -+ fst.d $f24, $sp, 48 -+ fst.d $f25, $sp, 56 -+ fst.d $f26, $sp, 64 -+ fst.d $f27, $sp, 72 -+ fst.d $f28, $sp, 80 -+ SDARG $r29, $sp, 88 -+ SDARG $r30, $sp, 96 -+ SDARG $r20, $sp, 104 -+ SDARG $r16, $sp, 112 -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 112 -+ fst.d $f19, $sp, 120 -+ fst.d $f20, $sp, 128 -+ fst.d $f21, $sp, 136 -+#endif -+ slli.d LDC, LDC, BASE_SHIFT -+#ifdef LN -+ mul.w TEMP, M, K -+ slli.d TEMP, TEMP, BASE_SHIFT -+ add.d A, A, TEMP -+ slli.d TEMP, M, BASE_SHIFT -+ add.d C, C, TEMP -+#endif -+#ifdef RN -+ sub.d KK, $r0, OFFSET -+#endif -+#ifdef RT -+ mul.w TEMP, N, K -+ slli.d TEMP, TEMP, BASE_SHIFT -+ add.d B, B, TEMP -+ mul.w TEMP, N, LDC -+ add.d C, C, TEMP -+ sub.d KK, N, OFFSET -+#endif -+ andi J, N, 1 -+ bge $r0, J, .L30 -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d B, B, TEMP -+ sub.d C, C, LDC -+#endif -+ move AO, A -+ move CO1, C -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO1, LDC -+#endif -+ srai.d I, M, 1 -+ bge $r0, I, .L80 -+.L71: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, KK, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L75 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 0 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ LD b5, BO, 4 * SIZE -+ srai.d 
L, TEMP, 2 -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L75 -+#endif -+ .align 3 -+.L72: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 2 * SIZE -+ LD a2, AO, 3 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 6 * SIZE -+ LD a2, AO, 7 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 8 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L72 -+ .align 3 -+ -+.L75: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L78 -+ .align 3 -+.L76: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L76 -+.L78: -+ ADD c11, c11, c21 -+ ADD c12, c12, c22 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -1 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#endif -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+ LD b2, AO, 2 * SIZE -+ LD b3, AO, 0 * SIZE -+ MUL c12, b1, c12 -+ NMSUB c11, c12, b2, c11 -+ MUL c11, b3, c11 -+#endif -+#ifdef LT -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 3 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c12, c11, b2, c12 -+ MUL c12, b3, c12 -+#endif -+#if defined(RN) || defined(RT) -+ LD b1, BO, 0 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -2 * SIZE 
-+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c12, BO, 1 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+ addi.d I, I, -1 -+ blt $r0, I, .L71 -+ .align 3 -+ -+.L80: -+ andi I, M, 1 -+ bge $r0, I, .L89 -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ MOV c21, c11 -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, KK, 2 -+move BO, B -+ bge $r0, L, .L85 -+#else -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d TEMP, KK, BASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MOV c21, c11 -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L85 -+#endif -+ .align 3 -+.L82: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 1 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c21, b1, a1, c21 -+ LD a1, AO, 2 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 3 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c21, b1, a1, c21 -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L82 -+ .align 3 -+ -+.L85: 
-+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L88 -+ .align 3 -+.L86: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L86 -+.L88: -+ ADD c11, c11, c21 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -1 -+#endif -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ SUB c11, b1, c11 -+#else -+ LD b1, AO, 0 * SIZE -+ SUB c11, b1, c11 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ MUL c11, b1, c11 -+#endif -+#if defined(RN) || defined(RT) -+ LD b1, BO, 0 * SIZE -+ MUL c11, b1, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d TEMP, TEMP, 0 + BASE_SHIFT -+ add.d AO, AO, TEMP -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L89: -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 1 -+#endif -+#ifdef RT -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L30: -+ andi J, N, 2 -+ bge $r0, J, .L50 -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 1 -+ sub.d C, C, TEMP -+#endif -+ move AO, A -+ move CO1, C -+ add.d CO2, C, LDC -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ 
add.d C, CO2, LDC -+#endif -+ srai.d I, M, 1 -+ bge $r0, I, .L60 -+.L51: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, KK, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L55 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ LD b5, BO, 4 * SIZE -+ srai.d L, TEMP, 2 -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L55 -+#endif -+ .align 3 -+.L52: -+ MADD c11, b1, a1, c11 -+ LD a3, AO, 2 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b4, BO, 3 * SIZE -+ MADD c12, b1, a2, c12 -+ LD a4, AO, 3 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b1, BO, 8 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a1, AO, 8 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 5 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 6 * SIZE -+ MADD c11, b5, a5, c11 -+ LD a3, AO, 6 * SIZE -+ MADD c21, b2, a5, c21 -+ LD b4, BO, 7 * SIZE -+ MADD c12, b5, a2, c12 -+ LD a4, AO, 7 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b5, BO, 12 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a5, AO, 12 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 9 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 10 * SIZE -+ addi.d AO, AO, 8 * SIZE -+ addi.d L, L, -1 -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L52 -+ .align 3 -+ -+.L55: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L58 -+ .align 3 -+.L56: 
-+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 3 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L56 -+.L58: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -2 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c12, b3, c12 -+ SUB c22, b4, c22 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c21, b3, c21 -+ SUB c22, b4, c22 -+#endif -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+ LD b2, AO, 2 * SIZE -+ LD b3, AO, 0 * SIZE -+ MUL c12, b1, c12 -+ MUL c22, b1, c22 -+ NMSUB c11, c12, b2, c11 -+ NMSUB c21, c22, b2, c21 -+ MUL c11, b3, c11 -+ MUL c21, b3, c21 -+#endif -+#ifdef LT -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ NMSUB c12, c11, b2, c12 -+ NMSUB c22, c21, b2, c22 -+ MUL c12, b3, c12 -+ MUL c22, b3, c22 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c22, c12, b2, c22 -+ MUL c21, b3, c21 -+ MUL c22, b3, c22 -+#endif -+#ifdef RT -+ LD b1, BO, 3 * SIZE -+ LD b2, BO, 2 * SIZE -+ LD b3, BO, 0 * SIZE -+ MUL c21, b1, c21 -+ MUL c22, b1, c22 -+ NMSUB c11, c21, b2, c11 -+ NMSUB c12, c22, b2, c12 -+ MUL c11, b3, c11 -+ MUL c12, b3, c12 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -2 * SIZE -+ addi.d CO2, CO2, -2 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c12, BO, 2 * SIZE 
-+ ST c22, BO, 3 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c21, AO, 2 * SIZE -+ ST c22, AO, 3 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c22, CO2, 1 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+ addi.d CO2, CO2, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AO, TEMP -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+MTC a1, $r0 -+ MOV c11, a1 -+ MOV c21, a1 -+ MOV c31, a1 -+ addi.d I, I, -1 -+MOV c41, c11 -+ blt $r0, I, .L51 -+ .align 3 -+ -+.L60: -+ andi I, M, 1 -+ bge $r0, I, .L69 -+#if defined(LT) || defined(RN) -+ srai.d L, KK, 2 -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ MOV c31, c11 -+ LD a4, AO, 3 * SIZE -+ MOV c41, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L65 -+#else -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ srai.d L, TEMP, 2 -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ MOV c31, c11 -+ LD a4, AO, 3 * SIZE -+ MOV c41, c11 -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L65 -+#endif -+ .align 3 -+.L62: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 7 * SIZE -+ LD 
a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, 10 * SIZE -+ MADD c41, b4, a4, c41 -+ LD b4, BO, 11 * SIZE -+ LD a3, AO, 6 * SIZE -+ LD a4, AO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L62 -+ .align 3 -+ -+.L65: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L68 -+ .align 3 -+.L66: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 2 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 3 * SIZE -+ LD a1, AO, 1 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L66 -+.L68: -+ ADD c11, c11, c31 -+ ADD c21, c21, c41 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -2 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+#endif -+#if defined(LN) || defined(LT) -+ LD b3, AO, 0 * SIZE -+ MUL c11, b3, c11 -+ MUL c21, b3, c21 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c21, c11, b2, c21 -+ MUL c21, b3, c21 -+#endif -+#ifdef RT -+ LD b1, BO, 3 * SIZE -+ LD b2, BO, 2 * SIZE -+ LD b3, BO, 0 * SIZE -+ MUL c21, b1, c21 -+ NMSUB c11, c21, b2, c11 -+ MUL c11, b3, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+ addi.d CO2, CO2, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c21, AO, 1 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+ addi.d CO2, CO2, 1 * 
SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 0 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 1 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L69: -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 2 -+#endif -+#ifdef RT -+ addi.d KK, KK, -2 -+#endif -+ .align 3 -+ -+.L50: -+ andi J, N, 4 -+move AO, A -+ bge $r0, J, .L70 -+#ifdef RT -+ slli.d TEMP, K, 2 + BASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 2 -+ sub.d C, C, TEMP -+#endif -+ move CO1, C -+MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ srai.d I, M, 1 -+ MOV c31, c11 -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO4, LDC -+#endif -+MOV c41, c11 -+ bge $r0, I, .L40 -+.L31: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ LD a3, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ MOV c32, c11 -+ LD b4, B, 3 * SIZE -+ MOV c42, c11 -+ LD b5, B, 4 * SIZE -+ srai.d L, KK, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L35 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ LD a3, AO, 4 * SIZE -+ LD b1, BO, 0 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c32, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c42, c11 -+ LD b5, BO, 4 * SIZE -+ srai.d L, TEMP, 2 
-+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ bge $r0, L, .L35 -+#endif -+ .align 3 -+.L32: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c11, b5, a1, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 8 * SIZE -+ MADD c12, b5, a2, c12 -+ LD b5, BO, 20 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 9 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 10 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ LD a3, AO, 6 * SIZE -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c11, b7, a3, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a3, c21 -+ addi.d AO, AO, 8 * SIZE -+ MADD c31, b3, a3, c31 -+ addi.d BO, BO, 16 * SIZE -+ MADD c41, b4, a3, c41 -+ LD a3, AO, 4 * SIZE -+ MADD c12, b7, a2, c12 -+ LD b7, BO, 12 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 1 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 2 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L32 -+ .align 3 -+ -+.L35: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L38 -+ .align 3 -+.L36: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ addi.d AO, AO, 2 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 0 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 4 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD 
c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L36 -+.L38: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -4 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+ SUB c12, b5, c12 -+ SUB c22, b6, c22 -+ SUB c32, b7, c32 -+ SUB c42, b8, c42 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ LD b5, AO, 4 * SIZE -+ LD b6, AO, 5 * SIZE -+ LD b7, AO, 6 * SIZE -+ LD b8, AO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c21, b3, c21 -+ SUB c22, b4, c22 -+ SUB c31, b5, c31 -+ SUB c32, b6, c32 -+ SUB c41, b7, c41 -+ SUB c42, b8, c42 -+#endif -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+ LD b2, AO, 2 * SIZE -+ LD b3, AO, 0 * SIZE -+ MUL c12, b1, c12 -+ MUL c22, b1, c22 -+ MUL c32, b1, c32 -+ MUL c42, b1, c42 -+ NMSUB c11, c12, b2, c11 -+ NMSUB c21, c22, b2, c21 -+ NMSUB c31, c32, b2, c31 -+ NMSUB c41, c42, b2, c41 -+ MUL c11, b3, c11 -+ MUL c21, b3, c21 -+ MUL c31, b3, c31 -+ MUL c41, b3, c41 -+#endif -+#ifdef LT -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+ NMSUB c12, c11, b2, c12 -+ NMSUB c22, c21, b2, c22 -+ NMSUB c32, c31, b2, c32 -+ NMSUB c42, c41, b2, c42 -+ MUL c12, b3, c12 -+ MUL c22, b3, c22 -+ MUL c32, b3, c32 -+ MUL c42, b3, c42 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+ NMSUB c21, c11, b2, c21 
-+ NMSUB c22, c12, b2, c22 -+ NMSUB c31, c11, b3, c31 -+ NMSUB c32, c12, b3, c32 -+ NMSUB c41, c11, b4, c41 -+ NMSUB c42, c12, b4, c42 -+ LD b2, BO, 5 * SIZE -+ LD b3, BO, 6 * SIZE -+ LD b4, BO, 7 * SIZE -+ MUL c21, b2, c21 -+ MUL c22, b2, c22 -+ NMSUB c31, c21, b3, c31 -+ NMSUB c32, c22, b3, c32 -+ NMSUB c41, c21, b4, c41 -+ NMSUB c42, c22, b4, c42 -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ MUL c31, b3, c31 -+ MUL c32, b3, c32 -+ NMSUB c41, c31, b4, c41 -+ NMSUB c42, c32, b4, c42 -+ LD b4, BO, 15 * SIZE -+ MUL c41, b4, c41 -+ MUL c42, b4, c42 -+#endif -+#ifdef RT -+ LD b5, BO, 15 * SIZE -+ LD b6, BO, 14 * SIZE -+ LD b7, BO, 13 * SIZE -+ LD b8, BO, 12 * SIZE -+ MUL c41, b5, c41 -+ MUL c42, b5, c42 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c32, c42, b6, c32 -+ NMSUB c21, c41, b7, c21 -+ NMSUB c22, c42, b7, c22 -+ NMSUB c11, c41, b8, c11 -+ NMSUB c12, c42, b8, c12 -+ LD b6, BO, 10 * SIZE -+ LD b7, BO, 9 * SIZE -+ LD b8, BO, 8 * SIZE -+ MUL c31, b6, c31 -+ MUL c32, b6, c32 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c22, c32, b7, c22 -+ NMSUB c11, c31, b8, c11 -+ NMSUB c12, c32, b8, c12 -+ LD b7, BO, 5 * SIZE -+ LD b8, BO, 4 * SIZE -+ MUL c21, b7, c21 -+ MUL c22, b7, c22 -+ NMSUB c11, c21, b8, c11 -+ NMSUB c12, c22, b8, c12 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+ MUL c12, b8, c12 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -2 * SIZE -+ addi.d CO2, CO2, -2 * SIZE -+ addi.d CO3, CO3, -2 * SIZE -+ addi.d CO4, CO4, -2 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+ ST c12, BO, 4 * SIZE -+ ST c22, BO, 5 * SIZE -+ ST c32, BO, 6 * SIZE -+ ST c42, BO, 7 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c21, AO, 2 * SIZE -+ ST c22, AO, 3 * SIZE -+ ST c31, AO, 4 * SIZE -+ ST c32, AO, 5 * SIZE -+ ST c41, AO, 6 * SIZE -+ ST c42, AO, 7 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c22, CO2, 1 * SIZE -+ ST c31, CO3, 0 
* SIZE -+ ST c32, CO3, 1 * SIZE -+ ST c41, CO4, 0 * SIZE -+ ST c42, CO4, 1 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+ addi.d CO2, CO2, 2 * SIZE -+ addi.d CO3, CO3, 2 * SIZE -+ addi.d CO4, CO4, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+MTC a1, $r0 -+ MOV c11, a1 -+ MOV c21, a1 -+ MOV c31, a1 -+ addi.d I, I, -1 -+MOV c41, c11 -+ blt $r0, I, .L31 -+ .align 3 -+ -+.L40: -+ andi I, M, 1 -+MOV c61, c11 -+ bge $r0, I, .L49 -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c81, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, KK, 2 -+move BO, B -+ bge $r0, L, .L45 -+#else -+#ifdef LN -+ slli.d TEMP, K, BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c81, c11 -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L45 -+#endif -+ .align 3 -+.L42: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD c11, b5, a2, c11 -+ LD b5, BO, 20 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 10 * SIZE -+ MADD c41, b4, a2, c41 
-+ LD b4, BO, 11 * SIZE -+ LD a2, AO, 2 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * SIZE -+ LD a2, AO, -1 * SIZE -+ addi.d BO, BO, 16 * SIZE -+ MADD c11, b7, a2, c11 -+ LD b7, BO, 12 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 1 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 2 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 3 * SIZE -+ LD a2, AO, 1 * SIZE -+ blt $r0, L, .L42 -+ .align 3 -+ -+.L45: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L48 -+ .align 3 -+.L46: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 1 * SIZE -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+ MOV a2, a2 -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L46 -+.L48: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -4 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c31, c11, b3, c31 -+ NMSUB c41, 
c11, b4, c41 -+ LD b2, BO, 5 * SIZE -+ LD b3, BO, 6 * SIZE -+ LD b4, BO, 7 * SIZE -+ MUL c21, b2, c21 -+ NMSUB c31, c21, b3, c31 -+ NMSUB c41, c21, b4, c41 -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ MUL c31, b3, c31 -+ NMSUB c41, c31, b4, c41 -+ LD b4, BO, 15 * SIZE -+ MUL c41, b4, c41 -+#endif -+#ifdef RT -+ LD b5, BO, 15 * SIZE -+ LD b6, BO, 14 * SIZE -+ LD b7, BO, 13 * SIZE -+ LD b8, BO, 12 * SIZE -+ MUL c41, b5, c41 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c21, c41, b7, c21 -+ NMSUB c11, c41, b8, c11 -+ LD b6, BO, 10 * SIZE -+ LD b7, BO, 9 * SIZE -+ LD b8, BO, 8 * SIZE -+ MUL c31, b6, c31 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c11, c31, b8, c11 -+ LD b7, BO, 5 * SIZE -+ LD b8, BO, 4 * SIZE -+ MUL c21, b7, c21 -+ NMSUB c11, c21, b8, c11 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+ addi.d CO2, CO2, -1 * SIZE -+ addi.d CO3, CO3, -1 * SIZE -+ addi.d CO4, CO4, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c21, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c41, AO, 3 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c41, CO4, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+ addi.d CO2, CO2, 1 * SIZE -+ addi.d CO3, CO3, 1 * SIZE -+ addi.d CO4, CO4, 1 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 2 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L49: -+#ifdef LN -+ slli.d TEMP, K, 2 + BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 4 -+#endif -+#ifdef RT -+ addi.d KK, KK, -4 -+#endif -+ 
.align 3 -+ -+.L70: -+ srai.d J, N, 3 -+nop -+ bge $r0, J, .L999 -+.L10: -+#ifdef RT -+ slli.d TEMP, K, 3 + BASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 3 -+ sub.d C, C, TEMP -+#endif -+ move CO1, C -+MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ addi.d J, J, -1 -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ add.d CO5, CO4, LDC -+ MOV c31, c11 -+ add.d CO6, CO5, LDC -+ MOV c41, c11 -+ add.d CO7, CO6, LDC -+ MOV c51, c11 -+ add.d CO8, CO7, LDC -+ srai.d I, M, 1 -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO8, LDC -+#endif -+MOV c61, c11 -+ bge $r0, I, .L20 -+.L11: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, B, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, KK, 2 -+ MOV c32, c11 -+ LD b3, B, 2 * SIZE -+ MOV c42, c11 -+ LD b4, B, 3 * SIZE -+ MOV c52, c11 -+ LD b5, B, 4 * SIZE -+ MOV c62, c11 -+ LD b6, B, 8 * SIZE -+ MOV c72, c11 -+ LD b7, B, 12 * SIZE -+ MOV c82, c11 -+move BO, B -+ bge $r0, L, .L15 -+#else -+#ifdef LN -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 1 + BASE_SHIFT -+ slli.d TEMP, KK, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ MOV c32, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c42, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c52, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c62, c11 -+ LD b6, BO, 8 * SIZE -+ MOV c72, c11 -+ LD b7, BO, 12 * SIZE -+ MOV c82, c11 -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L15 -+#endif -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ bge $r0, L, .L13 -+ .align 3 -+.L12: 
-+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ MADD c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ MADD c61, b2, a4, c61 -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ MADD c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD c71, b3, a3, c71 -+ MADD c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ addi.d L, L, -1 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * 
SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ blt $r0, L, .L12 -+ .align 3 -+ -+.L13: -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ MADD c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ MADD c61, b2, a4, c61 -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD c22, 
b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ MADD c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD c71, b3, a3, c71 -+ MADD c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ .align 3 -+ -+.L15: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L18 -+ .align 3 -+.L16: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ addi.d L, L, -1 -+ MADD c61, b2, a1, c61 -+ addi.d AO, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ addi.d BO, BO, 8 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 4 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ 
blt $r0, L, .L16 -+.L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -2 -+#else -+ addi.d TEMP, KK, -8 -+#endif -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ LD b5, BO, 4 * SIZE -+ SUB c21, b2, c21 -+ LD b6, BO, 5 * SIZE -+ SUB c31, b3, c31 -+ LD b7, BO, 6 * SIZE -+ SUB c41, b4, c41 -+ LD b8, BO, 7 * SIZE -+ SUB c51, b5, c51 -+ LD b1, BO, 8 * SIZE -+ SUB c61, b6, c61 -+ LD b2, BO, 9 * SIZE -+ SUB c71, b7, c71 -+ LD b3, BO, 10 * SIZE -+ SUB c81, b8, c81 -+ LD b4, BO, 11 * SIZE -+ SUB c12, b1, c12 -+ LD b5, BO, 12 * SIZE -+ SUB c22, b2, c22 -+ LD b6, BO, 13 * SIZE -+ SUB c32, b3, c32 -+ LD b7, BO, 14 * SIZE -+ SUB c42, b4, c42 -+ LD b8, BO, 15 * SIZE -+ SUB c52, b5, c52 -+#ifdef LN -+ LD b1, AO, 3 * SIZE -+#else -+ LD b1, AO, 0 * SIZE -+#endif -+ SUB c62, b6, c62 -+ SUB c72, b7, c72 -+ SUB c82, b8, c82 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ LD b5, AO, 4 * SIZE -+ SUB c12, b2, c12 -+ LD b6, AO, 5 * SIZE -+ SUB c21, b3, c21 -+ LD b7, AO, 6 * SIZE -+ SUB c22, b4, c22 -+ LD b8, AO, 7 * SIZE -+ SUB c31, b5, c31 -+ LD b1, AO, 8 * SIZE -+ SUB c32, b6, c32 -+ LD b2, AO, 9 * SIZE -+ SUB c41, b7, c41 -+ LD b3, AO, 10 * SIZE -+ SUB c42, b8, c42 -+ LD b4, AO, 11 * SIZE -+ LD b5, AO, 12 * SIZE -+ SUB c51, b1, c51 -+ LD b6, AO, 13 * SIZE -+ SUB c52, b2, c52 -+ LD b7, AO, 14 * SIZE -+ SUB c61, b3, c61 -+ LD b8, AO, 15 * SIZE -+ SUB c62, b4, c62 -+ SUB c71, b5, c71 -+ SUB c72, b6, c72 -+ SUB c81, b7, c81 -+ SUB c82, b8, c82 -+#endif -+#ifdef LN -+ MUL c12, b1, c12 -+ LD b2, AO, 2 * SIZE -+ MUL c22, b1, c22 -+ MUL c32, b1, c32 -+ MUL c42, b1, c42 -+ MUL c52, b1, c52 -+ MUL c62, b1, c62 -+ MUL c72, b1, c72 -+ MUL c82, b1, c82 -+ NMSUB c11, c12, b2, c11 -+ LD 
b3, AO, 0 * SIZE -+ NMSUB c21, c22, b2, c21 -+ NMSUB c31, c32, b2, c31 -+ NMSUB c41, c42, b2, c41 -+ NMSUB c51, c52, b2, c51 -+ NMSUB c61, c62, b2, c61 -+ NMSUB c71, c72, b2, c71 -+ NMSUB c81, c82, b2, c81 -+ MUL c11, b3, c11 -+ addi.d CO1, CO1, -2 * SIZE -+ MUL c21, b3, c21 -+ addi.d CO2, CO2, -2 * SIZE -+ MUL c31, b3, c31 -+ addi.d CO3, CO3, -2 * SIZE -+ MUL c41, b3, c41 -+ addi.d CO4, CO4, -2 * SIZE -+ MUL c51, b3, c51 -+ addi.d CO5, CO5, -2 * SIZE -+ MUL c61, b3, c61 -+ addi.d CO6, CO6, -2 * SIZE -+ MUL c71, b3, c71 -+ addi.d CO7, CO7, -2 * SIZE -+ MUL c81, b3, c81 -+ addi.d CO8, CO8, -2 * SIZE -+#endif -+#ifdef LT -+ MUL c11, b1, c11 -+ LD b2, AO, 1 * SIZE -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+ MUL c51, b1, c51 -+ MUL c61, b1, c61 -+ MUL c71, b1, c71 -+ MUL c81, b1, c81 -+ NMSUB c12, c11, b2, c12 -+ LD b3, AO, 3 * SIZE -+ NMSUB c22, c21, b2, c22 -+ NMSUB c32, c31, b2, c32 -+ NMSUB c42, c41, b2, c42 -+ NMSUB c52, c51, b2, c52 -+ NMSUB c62, c61, b2, c62 -+ NMSUB c72, c71, b2, c72 -+ NMSUB c82, c81, b2, c82 -+ MUL c12, b3, c12 -+ MUL c22, b3, c22 -+ MUL c32, b3, c32 -+ MUL c42, b3, c42 -+ MUL c52, b3, c52 -+ MUL c62, b3, c62 -+ MUL c72, b3, c72 -+ MUL c82, b3, c82 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL c11, b1, c11 -+ MUL c12, b1, c12 -+ LD b5, BO, 4 * SIZE -+ NMSUB c21, c11, b2, c21 -+ NMSUB c22, c12, b2, c22 -+ LD b6, BO, 5 * SIZE -+ NMSUB c31, c11, b3, c31 -+ NMSUB c32, c12, b3, c32 -+ LD b7, BO, 6 * SIZE -+ NMSUB c41, c11, b4, c41 -+ NMSUB c42, c12, b4, c42 -+ LD b8, BO, 7 * SIZE -+ NMSUB c51, c11, b5, c51 -+ NMSUB c52, c12, b5, c52 -+ LD b2, BO, 9 * SIZE -+ NMSUB c61, c11, b6, c61 -+ NMSUB c62, c12, b6, c62 -+ LD b3, BO, 10 * SIZE -+ NMSUB c71, c11, b7, c71 -+ NMSUB c72, c12, b7, c72 -+ LD b4, BO, 11 * SIZE -+ NMSUB c81, c11, b8, c81 -+ NMSUB c82, c12, b8, c82 -+ LD b5, BO, 12 * SIZE -+ MUL c21, b2, c21 -+ MUL c22, b2, c22 -+ LD b6, BO, 13 * SIZE -+ NMSUB 
c31, c21, b3, c31 -+ NMSUB c32, c22, b3, c32 -+ LD b7, BO, 14 * SIZE -+ NMSUB c41, c21, b4, c41 -+ NMSUB c42, c22, b4, c42 -+ LD b8, BO, 15 * SIZE -+ NMSUB c51, c21, b5, c51 -+ NMSUB c52, c22, b5, c52 -+ LD b3, BO, 18 * SIZE -+ NMSUB c61, c21, b6, c61 -+ NMSUB c62, c22, b6, c62 -+ LD b4, BO, 19 * SIZE -+ NMSUB c71, c21, b7, c71 -+ NMSUB c72, c22, b7, c72 -+ LD b5, BO, 20 * SIZE -+ NMSUB c81, c21, b8, c81 -+ NMSUB c82, c22, b8, c82 -+ LD b6, BO, 21 * SIZE -+ MUL c31, b3, c31 -+ MUL c32, b3, c32 -+ LD b7, BO, 22 * SIZE -+ NMSUB c41, c31, b4, c41 -+ NMSUB c42, c32, b4, c42 -+ LD b8, BO, 23 * SIZE -+ NMSUB c51, c31, b5, c51 -+ NMSUB c52, c32, b5, c52 -+ LD b4, BO, 27 * SIZE -+ NMSUB c61, c31, b6, c61 -+ NMSUB c62, c32, b6, c62 -+ LD b5, BO, 28 * SIZE -+ NMSUB c71, c31, b7, c71 -+ NMSUB c72, c32, b7, c72 -+ LD b6, BO, 29 * SIZE -+ NMSUB c81, c31, b8, c81 -+ NMSUB c82, c32, b8, c82 -+ LD b7, BO, 30 * SIZE -+ MUL c41, b4, c41 -+ MUL c42, b4, c42 -+ LD b8, BO, 31 * SIZE -+ NMSUB c51, c41, b5, c51 -+ NMSUB c52, c42, b5, c52 -+ LD b5, BO, 36 * SIZE -+ NMSUB c61, c41, b6, c61 -+ NMSUB c62, c42, b6, c62 -+ LD b6, BO, 37 * SIZE -+ NMSUB c71, c41, b7, c71 -+ NMSUB c72, c42, b7, c72 -+ LD b7, BO, 38 * SIZE -+ NMSUB c81, c41, b8, c81 -+ NMSUB c82, c42, b8, c82 -+ LD b8, BO, 39 * SIZE -+ MUL c51, b5, c51 -+ MUL c52, b5, c52 -+ NMSUB c61, c51, b6, c61 -+ NMSUB c62, c52, b6, c62 -+ LD b6, BO, 45 * SIZE -+ NMSUB c71, c51, b7, c71 -+ NMSUB c72, c52, b7, c72 -+ LD b7, BO, 46 * SIZE -+ NMSUB c81, c51, b8, c81 -+ NMSUB c82, c52, b8, c82 -+ LD b8, BO, 47 * SIZE -+ MUL c61, b6, c61 -+ MUL c62, b6, c62 -+ NMSUB c71, c61, b7, c71 -+ NMSUB c72, c62, b7, c72 -+ LD b7, BO, 54 * SIZE -+ NMSUB c81, c61, b8, c81 -+ NMSUB c82, c62, b8, c82 -+ LD b8, BO, 55 * SIZE -+ MUL c71, b7, c71 -+ MUL c72, b7, c72 -+ NMSUB c81, c71, b8, c81 -+ NMSUB c82, c72, b8, c82 -+ LD b8, BO, 63 * SIZE -+ MUL c81, b8, c81 -+ MUL c82, b8, c82 -+#endif -+#ifdef RT -+ LD b1, BO, 63 * SIZE -+ LD b2, BO, 62 * SIZE -+ LD b3, BO, 
61 * SIZE -+ LD b4, BO, 60 * SIZE -+ MUL c81, b1, c81 -+ MUL c82, b1, c82 -+ LD b5, BO, 59 * SIZE -+ NMSUB c71, c81, b2, c71 -+ NMSUB c72, c82, b2, c72 -+ LD b6, BO, 58 * SIZE -+ NMSUB c61, c81, b3, c61 -+ NMSUB c62, c82, b3, c62 -+ LD b7, BO, 57 * SIZE -+ NMSUB c51, c81, b4, c51 -+ NMSUB c52, c82, b4, c52 -+ LD b8, BO, 56 * SIZE -+ NMSUB c41, c81, b5, c41 -+ NMSUB c42, c82, b5, c42 -+ LD b2, BO, 54 * SIZE -+ NMSUB c31, c81, b6, c31 -+ NMSUB c32, c82, b6, c32 -+ LD b3, BO, 53 * SIZE -+ NMSUB c21, c81, b7, c21 -+ NMSUB c22, c82, b7, c22 -+ LD b4, BO, 52 * SIZE -+ NMSUB c11, c81, b8, c11 -+ NMSUB c12, c82, b8, c12 -+ LD b5, BO, 51 * SIZE -+ MUL c71, b2, c71 -+ MUL c72, b2, c72 -+ LD b6, BO, 50 * SIZE -+ NMSUB c61, c71, b3, c61 -+ NMSUB c62, c72, b3, c62 -+ LD b7, BO, 49 * SIZE -+ NMSUB c51, c71, b4, c51 -+ NMSUB c52, c72, b4, c52 -+ LD b8, BO, 48 * SIZE -+ NMSUB c41, c71, b5, c41 -+ NMSUB c42, c72, b5, c42 -+ LD b3, BO, 45 * SIZE -+ NMSUB c31, c71, b6, c31 -+ NMSUB c32, c72, b6, c32 -+ LD b4, BO, 44 * SIZE -+ NMSUB c21, c71, b7, c21 -+ NMSUB c22, c72, b7, c22 -+ LD b5, BO, 43 * SIZE -+ NMSUB c11, c71, b8, c11 -+ NMSUB c12, c72, b8, c12 -+ LD b6, BO, 42 * SIZE -+ MUL c61, b3, c61 -+ MUL c62, b3, c62 -+ LD b7, BO, 41 * SIZE -+ NMSUB c51, c61, b4, c51 -+ NMSUB c52, c62, b4, c52 -+ LD b8, BO, 40 * SIZE -+ NMSUB c41, c61, b5, c41 -+ NMSUB c42, c62, b5, c42 -+ LD b4, BO, 36 * SIZE -+ NMSUB c31, c61, b6, c31 -+ NMSUB c32, c62, b6, c32 -+ LD b5, BO, 35 * SIZE -+ NMSUB c21, c61, b7, c21 -+ NMSUB c22, c62, b7, c22 -+ LD b6, BO, 34 * SIZE -+ NMSUB c11, c61, b8, c11 -+ NMSUB c12, c62, b8, c12 -+ LD b7, BO, 33 * SIZE -+ MUL c51, b4, c51 -+ MUL c52, b4, c52 -+ LD b8, BO, 32 * SIZE -+ NMSUB c41, c51, b5, c41 -+ NMSUB c42, c52, b5, c42 -+ LD b5, BO, 27 * SIZE -+ NMSUB c31, c51, b6, c31 -+ NMSUB c32, c52, b6, c32 -+ LD b6, BO, 26 * SIZE -+ NMSUB c21, c51, b7, c21 -+ NMSUB c22, c52, b7, c22 -+ LD b7, BO, 25 * SIZE -+ NMSUB c11, c51, b8, c11 -+ NMSUB c12, c52, b8, c12 -+ LD b8, BO, 24 
* SIZE -+ MUL c41, b5, c41 -+ MUL c42, b5, c42 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c32, c42, b6, c32 -+ LD b6, BO, 18 * SIZE -+ NMSUB c21, c41, b7, c21 -+ NMSUB c22, c42, b7, c22 -+ LD b7, BO, 17 * SIZE -+ NMSUB c11, c41, b8, c11 -+ NMSUB c12, c42, b8, c12 -+ LD b8, BO, 16 * SIZE -+ MUL c31, b6, c31 -+ MUL c32, b6, c32 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c22, c32, b7, c22 -+ LD b7, BO, 9 * SIZE -+ NMSUB c11, c31, b8, c11 -+ NMSUB c12, c32, b8, c12 -+ LD b8, BO, 8 * SIZE -+ MUL c21, b7, c21 -+ MUL c22, b7, c22 -+ NMSUB c11, c21, b8, c11 -+ NMSUB c12, c22, b8, c12 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+ MUL c12, b8, c12 -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+ ST c51, BO, 4 * SIZE -+ ST c61, BO, 5 * SIZE -+ ST c71, BO, 6 * SIZE -+ ST c81, BO, 7 * SIZE -+ ST c12, BO, 8 * SIZE -+ ST c22, BO, 9 * SIZE -+ ST c32, BO, 10 * SIZE -+ ST c42, BO, 11 * SIZE -+ ST c52, BO, 12 * SIZE -+ ST c62, BO, 13 * SIZE -+ ST c72, BO, 14 * SIZE -+ ST c82, BO, 15 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c21, AO, 2 * SIZE -+ ST c22, AO, 3 * SIZE -+ ST c31, AO, 4 * SIZE -+ ST c32, AO, 5 * SIZE -+ ST c41, AO, 6 * SIZE -+ ST c42, AO, 7 * SIZE -+ ST c51, AO, 8 * SIZE -+ ST c52, AO, 9 * SIZE -+ ST c61, AO, 10 * SIZE -+ ST c62, AO, 11 * SIZE -+ ST c71, AO, 12 * SIZE -+ ST c72, AO, 13 * SIZE -+ ST c81, AO, 14 * SIZE -+ ST c82, AO, 15 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c22, CO2, 1 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c32, CO3, 1 * SIZE -+ ST c41, CO4, 0 * SIZE -+ ST c42, CO4, 1 * SIZE -+ ST c51, CO5, 0 * SIZE -+ ST c52, CO5, 1 * SIZE -+ ST c61, CO6, 0 * SIZE -+ ST c62, CO6, 1 * SIZE -+ ST c71, CO7, 0 * SIZE -+ ST c72, CO7, 1 * SIZE -+ ST c81, CO8, 0 * SIZE -+ ST c82, CO8, 1 * SIZE -+MTC a1, $r0 -+#ifndef LN -+ addi.d CO1, CO1, 2 * SIZE -+ addi.d CO2, CO2, 2 * SIZE -+ addi.d CO3, CO3, 2 * SIZE -+ addi.d CO4, 
CO4, 2 * SIZE -+ addi.d CO5, CO5, 2 * SIZE -+ addi.d CO6, CO6, 2 * SIZE -+ addi.d CO7, CO7, 2 * SIZE -+ addi.d CO8, CO8, 2 * SIZE -+#endif -+ MOV c11, a1 -+ MOV c21, a1 -+#ifdef RT -+ slli.d TEMP, K, 1 + BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+ MOV c31, a1 -+ MOV c41, a1 -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 1 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 2 -+#endif -+#ifdef LN -+ addi.d KK, KK, -2 -+#endif -+ addi.d I, I, -1 -+ MOV c51, a1 -+MOV c61, a1 -+ blt $r0, I, .L11 -+ .align 3 -+ -+.L20: -+ andi I, M, 1 -+ MOV c61, c11 -+MOV c71, c11 -+ bge $r0, I, .L29 -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, KK, 2 -+ MOV c81, c11 -+move BO, B -+ bge $r0, L, .L25 -+#else -+#ifdef LN -+ slli.d TEMP, K, 0 + BASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, 0 + BASE_SHIFT -+ slli.d TEMP, KK, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 8 * SIZE -+ LD b7, BO, 12 * SIZE -+ srai.d L, TEMP, 2 -+ MOV c81, c11 -+ bge $r0, L, .L25 -+#endif -+ .align 3 -+.L22: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, BO, 20 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 9 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 10 * SIZE -+ MADD c81, b4, a1, c81 -+ LD b4, BO, 11 * SIZE -+ 
LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a2, c51 -+ LD b7, BO, 28 * SIZE -+ MADD c61, b2, a2, c61 -+ LD b2, BO, 17 * SIZE -+ MADD c71, b3, a2, c71 -+ LD b3, BO, 18 * SIZE -+ MADD c81, b4, a2, c81 -+ LD b4, BO, 19 * SIZE -+ LD a2, AO, 5 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 32 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 21 * SIZE -+ MADD c31, b3, a3, c31 -+ LD b3, BO, 22 * SIZE -+ MADD c41, b4, a3, c41 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ LD b5, BO, 36 * SIZE -+ MADD c61, b2, a3, c61 -+ LD b2, BO, 25 * SIZE -+ MADD c71, b3, a3, c71 -+ LD b3, BO, 26 * SIZE -+ MADD c81, b4, a3, c81 -+ LD b4, BO, 27 * SIZE -+ LD a3, AO, 2 * SIZE -+ addi.d BO, BO, 32 * SIZE -+ MADD c11, b6, a4, c11 -+ LD b6, BO, 8 * SIZE -+ MADD c21, b2, a4, c21 -+ LD b2, BO, -3 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, -2 * SIZE -+ MADD c41, b4, a4, c41 -+ LD b4, BO, -1 * SIZE -+ MADD c51, b7, a4, c51 -+ LD b7, BO, 12 * SIZE -+ MADD c61, b2, a4, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a4, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a4, c81 -+ LD b4, BO, 3 * SIZE -+ LD a4, AO, 3 * SIZE -+ blt $r0, L, .L22 -+ .align 3 -+ -+.L25: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L28 -+ .align 3 -+.L26: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ MOV a2, a2 -+ addi.d AO, AO, 1 * SIZE -+ addi.d BO, BO, 8 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, BO, 4 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ LD b4, BO, 3 * SIZE 
-+ blt $r0, L, .L26 -+.L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -8 -+#endif -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+ SUB c51, b5, c51 -+ SUB c61, b6, c61 -+ SUB c71, b7, c71 -+ SUB c81, b8, c81 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ LD b5, AO, 4 * SIZE -+ LD b6, AO, 5 * SIZE -+ LD b7, AO, 6 * SIZE -+ LD b8, AO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c21, b2, c21 -+ SUB c31, b3, c31 -+ SUB c41, b4, c41 -+ SUB c51, b5, c51 -+ SUB c61, b6, c61 -+ SUB c71, b7, c71 -+ SUB c81, b8, c81 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ MUL c11, b1, c11 -+ MUL c21, b1, c21 -+ MUL c31, b1, c31 -+ MUL c41, b1, c41 -+ MUL c51, b1, c51 -+ MUL c61, b1, c61 -+ MUL c71, b1, c71 -+ MUL c81, b1, c81 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ MUL c11, b1, c11 -+ NMSUB c21, c11, b2, c21 -+ NMSUB c31, c11, b3, c31 -+ NMSUB c41, c11, b4, c41 -+ NMSUB c51, c11, b5, c51 -+ NMSUB c61, c11, b6, c61 -+ NMSUB c71, c11, b7, c71 -+ NMSUB c81, c11, b8, c81 -+ LD b2, BO, 9 * SIZE -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ LD b5, BO, 12 * SIZE -+ LD b6, BO, 13 * SIZE -+ LD b7, BO, 14 * SIZE -+ LD b8, BO, 15 * SIZE -+ MUL c21, b2, c21 -+ NMSUB c31, c21, b3, c31 -+ NMSUB c41, c21, b4, c41 -+ NMSUB c51, c21, b5, c51 -+ NMSUB c61, c21, b6, c61 -+ NMSUB c71, c21, b7, c71 -+ NMSUB c81, c21, b8, c81 -+ LD b3, BO, 18 * SIZE 
-+ LD b4, BO, 19 * SIZE -+ LD b5, BO, 20 * SIZE -+ LD b6, BO, 21 * SIZE -+ LD b7, BO, 22 * SIZE -+ LD b8, BO, 23 * SIZE -+ MUL c31, b3, c31 -+ NMSUB c41, c31, b4, c41 -+ NMSUB c51, c31, b5, c51 -+ NMSUB c61, c31, b6, c61 -+ NMSUB c71, c31, b7, c71 -+ NMSUB c81, c31, b8, c81 -+ LD b4, BO, 27 * SIZE -+ LD b5, BO, 28 * SIZE -+ LD b6, BO, 29 * SIZE -+ LD b7, BO, 30 * SIZE -+ LD b8, BO, 31 * SIZE -+ MUL c41, b4, c41 -+ NMSUB c51, c41, b5, c51 -+ NMSUB c61, c41, b6, c61 -+ NMSUB c71, c41, b7, c71 -+ NMSUB c81, c41, b8, c81 -+ LD b5, BO, 36 * SIZE -+ LD b6, BO, 37 * SIZE -+ LD b7, BO, 38 * SIZE -+ LD b8, BO, 39 * SIZE -+ MUL c51, b5, c51 -+ NMSUB c61, c51, b6, c61 -+ NMSUB c71, c51, b7, c71 -+ NMSUB c81, c51, b8, c81 -+ LD b6, BO, 45 * SIZE -+ LD b7, BO, 46 * SIZE -+ LD b8, BO, 47 * SIZE -+ MUL c61, b6, c61 -+ NMSUB c71, c61, b7, c71 -+ NMSUB c81, c61, b8, c81 -+ LD b7, BO, 54 * SIZE -+ LD b8, BO, 55 * SIZE -+ MUL c71, b7, c71 -+ NMSUB c81, c71, b8, c81 -+ LD b8, BO, 63 * SIZE -+ MUL c81, b8, c81 -+#endif -+#ifdef RT -+ LD b1, BO, 63 * SIZE -+ LD b2, BO, 62 * SIZE -+ LD b3, BO, 61 * SIZE -+ LD b4, BO, 60 * SIZE -+ LD b5, BO, 59 * SIZE -+ LD b6, BO, 58 * SIZE -+ LD b7, BO, 57 * SIZE -+ LD b8, BO, 56 * SIZE -+ MUL c81, b1, c81 -+ NMSUB c71, c81, b2, c71 -+ NMSUB c61, c81, b3, c61 -+ NMSUB c51, c81, b4, c51 -+ NMSUB c41, c81, b5, c41 -+ NMSUB c31, c81, b6, c31 -+ NMSUB c21, c81, b7, c21 -+ NMSUB c11, c81, b8, c11 -+ LD b2, BO, 54 * SIZE -+ LD b3, BO, 53 * SIZE -+ LD b4, BO, 52 * SIZE -+ LD b5, BO, 51 * SIZE -+ LD b6, BO, 50 * SIZE -+ LD b7, BO, 49 * SIZE -+ LD b8, BO, 48 * SIZE -+ MUL c71, b2, c71 -+ NMSUB c61, c71, b3, c61 -+ NMSUB c51, c71, b4, c51 -+ NMSUB c41, c71, b5, c41 -+ NMSUB c31, c71, b6, c31 -+ NMSUB c21, c71, b7, c21 -+ NMSUB c11, c71, b8, c11 -+ LD b3, BO, 45 * SIZE -+ LD b4, BO, 44 * SIZE -+ LD b5, BO, 43 * SIZE -+ LD b6, BO, 42 * SIZE -+ LD b7, BO, 41 * SIZE -+ LD b8, BO, 40 * SIZE -+ MUL c61, b3, c61 -+ NMSUB c51, c61, b4, c51 -+ NMSUB c41, c61, b5, c41 -+ 
NMSUB c31, c61, b6, c31 -+ NMSUB c21, c61, b7, c21 -+ NMSUB c11, c61, b8, c11 -+ LD b4, BO, 36 * SIZE -+ LD b5, BO, 35 * SIZE -+ LD b6, BO, 34 * SIZE -+ LD b7, BO, 33 * SIZE -+ LD b8, BO, 32 * SIZE -+ MUL c51, b4, c51 -+ NMSUB c41, c51, b5, c41 -+ NMSUB c31, c51, b6, c31 -+ NMSUB c21, c51, b7, c21 -+ NMSUB c11, c51, b8, c11 -+ LD b5, BO, 27 * SIZE -+ LD b6, BO, 26 * SIZE -+ LD b7, BO, 25 * SIZE -+ LD b8, BO, 24 * SIZE -+ MUL c41, b5, c41 -+ NMSUB c31, c41, b6, c31 -+ NMSUB c21, c41, b7, c21 -+ NMSUB c11, c41, b8, c11 -+ LD b6, BO, 18 * SIZE -+ LD b7, BO, 17 * SIZE -+ LD b8, BO, 16 * SIZE -+ MUL c31, b6, c31 -+ NMSUB c21, c31, b7, c21 -+ NMSUB c11, c31, b8, c11 -+ LD b7, BO, 9 * SIZE -+ LD b8, BO, 8 * SIZE -+ MUL c21, b7, c21 -+ NMSUB c11, c21, b8, c11 -+ LD b8, BO, 0 * SIZE -+ MUL c11, b8, c11 -+#endif -+#ifdef LN -+ addi.d CO1, CO1, -1 * SIZE -+ addi.d CO2, CO2, -1 * SIZE -+ addi.d CO3, CO3, -1 * SIZE -+ addi.d CO4, CO4, -1 * SIZE -+ addi.d CO5, CO5, -1 * SIZE -+ addi.d CO6, CO6, -1 * SIZE -+ addi.d CO7, CO7, -1 * SIZE -+ addi.d CO8, CO8, -1 * SIZE -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c21, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c41, BO, 3 * SIZE -+ ST c51, BO, 4 * SIZE -+ ST c61, BO, 5 * SIZE -+ ST c71, BO, 6 * SIZE -+ ST c81, BO, 7 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c21, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c41, AO, 3 * SIZE -+ ST c51, AO, 4 * SIZE -+ ST c61, AO, 5 * SIZE -+ ST c71, AO, 6 * SIZE -+ ST c81, AO, 7 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c21, CO2, 0 * SIZE -+ ST c31, CO3, 0 * SIZE -+ ST c41, CO4, 0 * SIZE -+ ST c51, CO5, 0 * SIZE -+ ST c61, CO6, 0 * SIZE -+ ST c71, CO7, 0 * SIZE -+ ST c81, CO8, 0 * SIZE -+#ifndef LN -+ addi.d CO1, CO1, 1 * SIZE -+ addi.d CO2, CO2, 1 * SIZE -+ addi.d CO3, CO3, 1 * SIZE -+ addi.d CO4, CO4, 1 * SIZE -+ addi.d CO5, CO5, 1 * SIZE -+ addi.d CO6, CO6, 1 * SIZE -+ addi.d CO7, CO7, 1 * SIZE -+ addi.d CO8, CO8, 1 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, 
BASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, 0 + BASE_SHIFT -+ slli.d TEMP, TEMP, 3 + BASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L29: -+#ifdef LN -+ slli.d TEMP, K, 3 + BASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 8 -+#endif -+#ifdef RT -+ addi.d KK, KK, -8 -+#endif -+ blt $r0, J, .L10 -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 16 -+ LDARG $r26, $sp, 24 -+ LDARG $r27, $sp, 32 -+ LDARG $r28, $sp, 40 -+ fld.d $f24, $sp, 48 -+ fld.d $f25, $sp, 56 -+ fld.d $f26, $sp, 64 -+ fld.d $f27, $sp, 72 -+ fld.d $f28, $sp, 80 -+ LDARG $r29, $sp, 88 -+ LDARG $r30, $sp, 96 -+ LDARG $r20, $sp, 104 -+ LDARG $r16, $sp, 112 -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 112 -+ fld.d $f19, $sp, 120 -+ fld.d $f20, $sp, 128 -+ fld.d $f21, $sp, 136 -+#endif -+ addi.d $sp, $sp, 144 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/zamax.S b/kernel/loongarch64/zamax.S -new file mode 100644 -index 0000000..f998bdc ---- /dev/null -+++ b/kernel/loongarch64/zamax.S -@@ -0,0 +1,190 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. 
Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+#define t5 $f4 -+#define t6 $f5 -+#define t7 $f6 -+#define t8 $f7 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ MTC s1, $r0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, ZBASE_SHIFT -+ bge $r0, INCX, .L999 -+ LD a1, X, 0 * SIZE -+ addi.d N, N, -1 -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ FABS t1, a1 -+ FABS t2, a2 -+ ADD s1, t1, t2 -+ bge $r0, N, .L999 -+ ADD s2, t1, t2 -+ srai.d I, N, 2 -+ ADD s3, t1, t2 -+ ADD s4, t1, t2 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ LD a2, 
X, 1 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ LD a8, X, 1 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ FABS t1, a1 -+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ LD a2, X, 1 * SIZE -+ FABS t3, a3 -+ add.d X, X, INCX -+ FABS t4, a4 -+ FABS t5, a5 -+ LD a3, X, 0 * SIZE -+ FABS t6, a6 -+ LD a4, X, 1 * SIZE -+ FABS t7, a7 -+ add.d X, X, INCX -+ FABS t8, a8 -+ ADD t1, t1, t2 -+ LD a5, X, 0 * SIZE -+ ADD t3, t3, t4 -+ LD a6, X, 1 * SIZE -+ ADD t5, t5, t6 -+ add.d X, X, INCX -+ ADD t7, t7, t8 -+ CMPLT $fcc0, s1, t1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, s2, t3 -+ LD a8, X, 1 * SIZE -+ CMPLT $fcc2, s3, t5 -+ add.d X, X, INCX -+ CMPLT $fcc3, s4, t7 -+ CMOVT s1, s1, t1, $fcc0 -+ addi.d I, I, -1 -+ CMOVT s2, s2, t3, $fcc1 -+ CMOVT s3, s3, t5, $fcc2 -+ CMOVT s4, s4, t7, $fcc3 -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ FABS t5, a5 -+ FABS t6, a6 -+ FABS t7, a7 -+ FABS t8, a8 -+ ADD t1, t1, t2 -+ ADD t3, t3, t4 -+ ADD t5, t5, t6 -+ ADD t7, t7, t8 -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t3 -+ CMPLT $fcc2, s3, t5 -+ CMPLT $fcc3, s4, t7 -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t3, $fcc1 -+ CMOVT s3, s3, t5, $fcc2 -+ CMOVT s4, s4, t7, $fcc3 -+ .align 3 -+ -+.L15: -+ andi I, N, 3 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ addi.d I, I, -1 -+ FABS t1, a1 -+ FABS t2, a2 -+ ADD t1, t1, t2 -+ CMPLT $fcc0, s1, t1 -+ CMOVT s1, s1, t1, $fcc0 -+ add.d X, X, INCX -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L998: -+ CMPLT $fcc0, s1, s2 -+ CMPLT $fcc1, s3, s4 -+ CMOVT s1, s1, s2, $fcc0 -+ CMOVT s3, s3, s4, $fcc1 -+ CMPLT $fcc0, s1, s3 -+ CMOVT s1, s1, s3, $fcc0 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/zamin.S 
b/kernel/loongarch64/zamin.S -new file mode 100644 -index 0000000..bde9aeb ---- /dev/null -+++ b/kernel/loongarch64/zamin.S -@@ -0,0 +1,198 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+#define t5 $f4 -+#define t6 $f5 -+#define t7 $f6 -+#define t8 $f7 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ MTC s1, $r0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, ZBASE_SHIFT -+ bge $r0, INCX, .L999 -+ LD a1, X, 0 * SIZE -+ addi.d N, N, -1 -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ FABS t1, a1 -+ FABS t2, a2 -+ ADD s1, t1, t2 -+ bge $r0, N, .L999 -+ NOP -+ ADD s2, t1, t2 -+ srai.d I, N, 2 -+ ADD s3, t1, t2 -+ ADD s4, t1, t2 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ LD a8, X, 1 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ FABS t1, a1 -+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ LD a2, X, 1 * SIZE -+ FABS t3, a3 -+ add.d X, X, INCX -+ FABS t4, a4 -+ NOP -+ FABS t5, a5 -+ LD a3, X, 0 * SIZE -+ FABS t6, a6 -+ LD a4, X, 1 * SIZE -+ FABS t7, a7 -+ add.d X, X, INCX -+ FABS t8, a8 -+ NOP -+ ADD t1, t1, t2 -+ LD a5, X, 0 * SIZE -+ ADD t3, t3, t4 -+ LD a6, X, 1 * SIZE -+ ADD t5, t5, t6 -+ add.d X, X, INCX -+ ADD t7, t7, t8 -+ NOP -+ CMPLT $fcc0, t1, s1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, t3, s2 -+ LD a8, X, 1 * SIZE -+ CMPLT $fcc2, t5, s3 -+ add.d X, X, INCX -+ CMPLT $fcc3, t7, s4 -+ NOP -+ CMOVT s1, s1, t1, $fcc0 -+ addi.d I, I, -1 -+ CMOVT s2, s2, t3, $fcc1 -+ NOP -+ CMOVT s3, s3, t5, $fcc2 
-+ CMOVT s4, s4, t7, $fcc3 -+ blt $r0, I, .L12 -+ NOP -+ .align 3 -+ -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ FABS t5, a5 -+ FABS t6, a6 -+ FABS t7, a7 -+ FABS t8, a8 -+ ADD t1, t1, t2 -+ ADD t3, t3, t4 -+ ADD t5, t5, t6 -+ ADD t7, t7, t8 -+ CMPLT $fcc0, t1, s1 -+ CMPLT $fcc1, t3, s2 -+ CMPLT $fcc2, t5, s3 -+ CMPLT $fcc3, t7, s4 -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t3, $fcc1 -+ CMOVT s3, s3, t5, $fcc2 -+ CMOVT s4, s4, t7, $fcc3 -+ .align 3 -+ -+.L15: -+ andi I, N, 3 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ addi.d I, I, -1 -+ FABS t1, a1 -+ FABS t2, a2 -+ ADD t1, t1, t2 -+ CMPLT $fcc0, t1, s1 -+ CMOVT s1, s1, t1, $fcc0 -+ add.d X, X, INCX -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L998: -+ CMPLT $fcc0, s2, s1 -+ CMPLT $fcc1, s4, s3 -+ CMOVT s1, s1, s2, $fcc0 -+ CMOVT s3, s3, s4, $fcc1 -+ CMPLT $fcc0, s3, s1 -+ CMOVT s1, s1, s3, $fcc0 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ NOP -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/zasum.S b/kernel/loongarch64/zasum.S -new file mode 100644 -index 0000000..d1a1a73 ---- /dev/null -+++ b/kernel/loongarch64/zasum.S -@@ -0,0 +1,158 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. 
Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f23 -+#define a2 $f9 -+#define a3 $f10 -+#define a4 $f11 -+#define a5 $f12 -+#define a6 $f13 -+#define a7 $f14 -+#define a8 $f15 -+#define t1 $f16 -+#define t2 $f17 -+#define t3 $f0 -+#define t4 $f1 -+#define s1 $f22 -+#define s2 $f8 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ MTC s1, $r0 -+ MTC s2, $r0 -+ slli.d INCX, INCX, ZBASE_SHIFT -+ srai.d I, N, 2 -+ bge $r0, N, .L999 -+ bge $r0, I, .L25 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ FABS t1, a1 -+ FABS t2, a2 -+ LD a7, X, 0 * SIZE -+ LD a8, X, 1 * SIZE -+ FABS t3, a3 -+ FABS t4, a4 -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, 
.L24 -+ .align 3 -+ -+.L23: -+ ADD s1, s1, t1 -+ LD a1, X, 0 * SIZE -+ FABS t1, a5 -+ addi.d I, I, -1 -+ ADD s2, s2, t2 -+ LD a2, X, 1 * SIZE -+ FABS t2, a6 -+ add.d X, X, INCX -+ ADD s1, s1, t3 -+ LD a3, X, 0 * SIZE -+ FABS t3, a7 -+ NOP -+ ADD s2, s2, t4 -+ LD a4, X, 1 * SIZE -+ FABS t4, a8 -+ add.d X, X, INCX -+ ADD s1, s1, t1 -+ LD a5, X, 0 * SIZE -+ FABS t1, a1 -+ NOP -+ ADD s2, s2, t2 -+ LD a6, X, 1 * SIZE -+ FABS t2, a2 -+ add.d X, X, INCX -+ ADD s1, s1, t3 -+ LD a7, X, 0 * SIZE -+ FABS t3, a3 -+ LD a8, X, 1 * SIZE -+ ADD s2, s2, t4 -+ add.d X, X, INCX -+ FABS t4, a4 -+ blt $r0, I, .L23 -+ .align 3 -+ -+.L24: -+ ADD s1, s1, t1 -+ FABS t1, a5 -+ ADD s2, s2, t2 -+ FABS t2, a6 -+ ADD s1, s1, t3 -+ FABS t3, a7 -+ ADD s2, s2, t4 -+ FABS t4, a8 -+ ADD s1, s1, t1 -+ ADD s2, s2, t2 -+ ADD s1, s1, t3 -+ ADD s2, s2, t4 -+ .align 3 -+ -+.L25: -+ andi I, N, 3 -+ bge $r0, I, .L999 -+ .align 3 -+ -+.L26: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ FABS t1, a1 -+ addi.d I, I, -1 -+ FABS t2, a2 -+ add.d X, X, INCX -+ ADD s1, s1, t1 -+ ADD s2, s2, t2 -+ blt $r0, I, .L26 -+ .align 3 -+ -+.L999: -+ ADD s1, s1, s2 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S -new file mode 100644 -index 0000000..0f480ca ---- /dev/null -+++ b/kernel/loongarch64/zcopy.S -@@ -0,0 +1,217 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. 
Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/
-+/* Complex vector copy kernel: copies N elements, each a pair of SIZE-wide scalars, from X to Y. PROLOGUE/EPILOGUE, LD/ST, NOP, LDINT, SIZE and ZBASE_SHIFT are macros supplied by common.h. */
-+#define ASSEMBLER
-+
-+#include "common.h"
-+
-+#define N $r4 /* element count */
-+#define X $r5 /* source pointer */
-+#define INCX $r6 /* source increment, scaled by ZBASE_SHIFT below */
-+#define Y $r7 /* destination pointer */
-+#define INCY $r8 /* destination increment, scaled by ZBASE_SHIFT below */
-+#define I $r17 /* loop counter */
-+#define TEMP $r18 /* scaled increment of contiguous elements (2 * SIZE) */
-+#define a1 $f22
-+#define a2 $f8
-+#define a3 $f23
-+#define a4 $f9
-+#define a5 $f10
-+#define a6 $f11
-+#define a7 $f12
-+#define a8 $f13
-+
-+ PROLOGUE
-+/* F_INTERFACE: N, INCX and INCY hold addresses and are replaced by the values they point to. Operands use the rd, rj, imm form used by every other load in this file (assumes LDINT expands to a plain integer load mnemonic - TODO confirm in common.h). */
-+#ifdef F_INTERFACE
-+ LDINT N, N, 0
-+ LDINT INCX, INCX, 0
-+ LDINT INCY, INCY, 0
-+#endif
-+/* Scale both increments; if both then equal 2 * SIZE take the contiguous path below, otherwise branch to the strided path at .L20. */
-+ li.d TEMP, 2 * SIZE
-+ NOP
-+ slli.d INCX, INCX, ZBASE_SHIFT
-+ bge $r0, N, .L999 /* N <= 0: nothing to copy */
-+ slli.d INCY, INCY, ZBASE_SHIFT
-+ bne INCX, TEMP, .L20
-+ srai.d I, N, 2 /* I = N / 4: groups of four elements */
-+ bne INCY, TEMP, .L20
-+ addi.d I, I, -1
-+ blt I, $r0, .L15 /* fewer than four elements: tail loop only */
-+ LD a1, X, 0 * SIZE
-+ LD a2, X, 1 * SIZE
-+ LD a3, X, 2 * SIZE
-+ LD a4, X, 3 * SIZE
-+ LD a5, X, 4 * SIZE
-+ LD a6, X, 5 * SIZE
-+ LD a7, X, 6 * SIZE
-+ LD a8, X, 7 * SIZE
-+ bge $r0, I, .L13 /* exactly one group: it is already loaded, just store it */
-+ .align 3
-+/* .L12: store the group held in a1..a8 while loading the next group from X + 8 * SIZE onwards. */
-+.L12:
-+ ST a1, Y, 0 * SIZE
-+ LD a1, X, 8 * SIZE
-+ ST a2, Y, 1 * SIZE
-+ LD a2, X, 9 * SIZE
-+ ST a3, Y, 2 * SIZE
-+ LD a3, X, 10 * SIZE
-+ ST a4, Y, 3 * SIZE
-+ LD a4, X, 11 * SIZE
-+ ST a5, Y, 4 * SIZE
-+ LD a5, X, 12 * SIZE
-+ ST a6, Y, 5 * SIZE
-+ LD a6, X, 13 * SIZE
-+ ST a7, Y, 6 * SIZE
-+ LD a7, X, 14 * SIZE
-+ ST a8, Y, 7 * SIZE
-+ LD a8, X, 15 * SIZE
-+ addi.d I, I, -1
-+ addi.d X, X, 8 * SIZE
-+ addi.d Y, Y, 8 * SIZE
-+ blt $r0, I, .L12
-+ .align 3
-+/* .L13: store the last loaded group; no further loads are issued. */
-+.L13:
-+ ST a1, Y, 0 * SIZE
-+ ST a2, Y, 1 * SIZE
-+ ST a3, Y, 2 * SIZE
-+ ST a4, Y, 3 * SIZE
-+ ST a5, Y, 4 * SIZE
-+ ST a6, Y, 5 * SIZE
-+ ST a7, Y, 6 * SIZE
-+ ST a8, Y, 7 * SIZE
-+ addi.d X, X, 8 * SIZE
-+ addi.d Y, Y, 8 * SIZE
-+ .align 3
-+/* .L15/.L16: copy the remaining N & 3 elements one at a time. */
-+.L15:
-+ andi I, N, 3
-+ bge $r0, I, .L999
-+ .align 3
-+
-+.L16:
-+ LD a1, X, 0 * SIZE
-+ LD a2, X, 1 * SIZE
-+ addi.d X, X, 2 * SIZE
-+ addi.d Y, Y, 2 * SIZE
-+ ST a1, Y, -2 * SIZE /* Y was already advanced, hence the negative offsets */
-+ addi.d I, I, -1
-+ ST a2, Y, -1 * SIZE
-+ blt $r0, I, .L16
-+ move $r4, $r17 /* NOTE(review): $r4 and $f0 are written before returning; no use of them is visible here */
-+ fmov.d $f0, $f22
-+ jirl $r0, $r1, 0x0 /* return to the address in $r1 */
-+ NOP
-+ .align 3
-+/* .L20: strided path; same four-element grouping, stepping X by INCX and Y by INCY after each element. */
-+.L20:
-+ srai.d I, N, 2
-+ addi.d I, I, -1
-+ blt I, $r0, .L25 /* fewer than four elements: tail loop only */
-+ LD a1, X, 0 * SIZE
-+ LD a2, X, 1 * SIZE
-+ add.d X, X, INCX
-+ LD a3, X, 0 * SIZE
-+ LD a4, X, 1 * SIZE
-+ add.d X, X, INCX
-+ LD a5, X, 0 * SIZE
-+ LD a6, X, 1 * SIZE
-+ add.d X, X, INCX
-+ LD a7, X, 0 * SIZE
-+ LD a8, X, 1 * SIZE
-+ add.d X, X, INCX
-+ bge $r0, I, .L23 /* exactly one group: it is already loaded, just store it */
-+ .align 3
-+/* .L22: store the held group to Y while loading the next group from X. */
-+.L22:
-+ ST a1, Y, 0 * SIZE
-+ LD a1, X, 0 * SIZE
-+ ST a2, Y, 1 * SIZE
-+ add.d Y, Y, INCY
-+ LD a2, X, 1 * SIZE
-+ add.d X, X, INCX
-+ ST a3, Y, 0 * SIZE
-+ LD a3, X, 0 * SIZE
-+ ST a4, Y, 1 * SIZE
-+ add.d Y, Y, INCY
-+ LD a4, X, 1 * SIZE
-+ add.d X, X, INCX
-+ ST a5, Y, 0 * SIZE
-+ LD a5, X, 0 * SIZE
-+ ST a6, Y, 1 * SIZE
-+ add.d Y, Y, INCY
-+ LD a6, X, 1 * SIZE
-+ add.d X, X, INCX
-+ ST a7, Y, 0 * SIZE
-+ LD a7, X, 0 * SIZE
-+ ST a8, Y, 1 * SIZE
-+ add.d Y, Y, INCY
-+ LD a8, X, 1 * SIZE
-+ addi.d I, I, -1
-+ add.d X, X, INCX
-+ blt $r0, I, .L22
-+ .align 3
-+/* .L23: store the last loaded group; no further loads are issued. */
-+.L23:
-+ ST a1, Y, 0 * SIZE
-+ ST a2, Y, 1 * SIZE
-+ add.d Y, Y, INCY
-+ ST a3, Y, 0 * SIZE
-+ ST a4, Y, 1 * SIZE
-+ add.d Y, Y, INCY
-+ ST a5, Y, 0 * SIZE
-+ ST a6, Y, 1 * SIZE
-+ add.d Y, Y, INCY
-+ ST a7, Y, 0 * SIZE
-+ ST a8, Y, 1 * SIZE
-+ add.d Y, Y, INCY
-+ .align 3
-+/* .L25/.L26: copy the remaining N & 3 elements one at a time. */
-+.L25:
-+ andi I, N, 3
-+ bge $r0, I, .L999
-+ .align 3
-+
-+.L26:
-+ LD a1, X, 0 * SIZE
-+ LD a2, X, 1 * SIZE
-+ add.d X, X, INCX
-+ addi.d I, I, -1
-+ ST a1, Y, 0 * SIZE
-+ ST a2, Y, 1 * SIZE
-+ add.d Y, Y, INCY
-+ blt $r0, I, .L26
-+ .align 3
-+/* .L999: common exit, same register writes as the early return after .L16. */
-+.L999:
-+ move $r4, $r17
-+ fmov.d $f0, $f22
-+ jirl $r0, $r1, 0x0
-+
-+ EPILOGUE
-diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S
-new file mode 100644
-index 0000000..81ac19f
---- /dev/null
-+++ b/kernel/loongarch64/zdot.S
-@@ -0,0 +1,330 @@
-+/***************************************************************************
-+Copyright (c) 2020, The OpenBLAS Project
-+All rights reserved.
-+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/
-+/* Complex dot-product kernel over N element pairs of X and Y; the two result scalars are returned in $f0 and $f1. Building with CONJ defined flips the sign of the s4 and s2 terms at .L999. PROLOGUE/EPILOGUE, LD, MADD, MTC, MOV, ADD, SUB, LDINT, SIZE and ZBASE_SHIFT are macros supplied by common.h. */
-+#define ASSEMBLER
-+
-+#include "common.h"
-+
-+#define N $r4 /* element count */
-+#define X $r5 /* first vector pointer */
-+#define INCX $r6 /* X increment, scaled by ZBASE_SHIFT below */
-+#define Y $r7 /* second vector pointer */
-+#define INCY $r8 /* Y increment, scaled by ZBASE_SHIFT below */
-+#define I $r17 /* loop counter */
-+#define TEMP $r18 /* scratch: 2 * SIZE, later (N - 1) * increment */
-+#define a1 $f10 /* a1/a3: first scalar of an X element, a2/a4: second (presumably real/imaginary parts) */
-+#define a2 $f11
-+#define a3 $f12
-+#define a4 $f13
-+#define b1 $f14 /* b1/b3: first scalar of a Y element, b2/b4: second */
-+#define b2 $f15
-+#define b3 $f16
-+#define b4 $f17
-+#define s1 $f22 /* sum of Y first * X first */
-+#define s2 $f8 /* sum of Y first * X second */
-+#define s3 $f23 /* sum of Y second * X first */
-+#define s4 $f9 /* sum of Y second * X second */
-+
-+ PROLOGUE
-+/* F_INTERFACE: N, INCX and INCY hold addresses and are replaced by the values they point to. Operands use the rd, rj, imm form used by every other load in this file (assumes LDINT expands to a plain integer load mnemonic - TODO confirm in common.h). */
-+#ifdef F_INTERFACE
-+ LDINT N, N, 0
-+ LDINT INCX, INCX, 0
-+ LDINT INCY, INCY, 0
-+#endif
-+/* Clear the four accumulators, scale the increments, and pick the contiguous path (both scaled increments equal 2 * SIZE) or the strided path at .L20. */
-+ MTC s1, $r0
-+ MOV s2, s1
-+ MOV s3, s2
-+ MOV s4, s3
-+ slli.d INCX, INCX, ZBASE_SHIFT
-+ li.d TEMP, 2 * SIZE
-+ slli.d INCY, INCY, ZBASE_SHIFT
-+ bge $r0, N, .L999 /* N <= 0: return the cleared sums */
-+ srai.d I, N, 2 /* I = N / 4: groups of four elements */
-+ bne INCX, TEMP, .L20
-+ bne INCY, TEMP, .L20
-+ bge $r0, I, .L15 /* fewer than four elements: tail only */
-+ LD a1, X, 0 * SIZE
-+ LD a2, X, 1 * SIZE
-+ LD b1, Y, 0 * SIZE
-+ addi.d I, I, -1
-+ LD b2, Y, 1 * SIZE
-+ bge $r0, I, .L14 /* exactly one group: skip the loop that preloads a following group */
-+ .align 3
-+/* .L13: accumulate four elements per pass, alternating the a1/a2/b1/b2 and a3/a4/b3/b4 register sets so loads overlap the multiply-adds; the final loads fetch the first element of the next group. MADD d, x, y, c is assumed to compute x * y + c - confirm in common.h. */
-+.L13:
-+ MADD s1, b1, a1, s1
-+ LD a3, X, 2 * SIZE
-+ MADD s2, b1, a2, s2
-+ LD a4, X, 3 * SIZE
-+ MADD s3, b2, a1, s3
-+ LD b3, Y, 2 * SIZE
-+ MADD s4, b2, a2, s4
-+ LD b4, Y, 3 * SIZE
-+ MADD s1, b3, a3, s1
-+ LD a1, X, 4 * SIZE
-+ MADD s2, b3, a4, s2
-+ LD a2, X, 5 * SIZE
-+ MADD s3, b4, a3, s3
-+ LD b1, Y, 4 * SIZE
-+ MADD s4, b4, a4, s4
-+ LD b2, Y, 5 * SIZE
-+ MADD s1, b1, a1, s1
-+ LD a3, X, 6 * SIZE
-+ MADD s2, b1, a2, s2
-+ LD a4, X, 7 * SIZE
-+ MADD s3, b2, a1, s3
-+ LD b3, Y, 6 * SIZE
-+ MADD s4, b2, a2, s4
-+ LD b4, Y, 7 * SIZE
-+ MADD s1, b3, a3, s1
-+ LD a1, X, 8 * SIZE
-+ MADD s2, b3, a4, s2
-+ LD a2, X, 9 * SIZE
-+ MADD s3, b4, a3, s3
-+ LD b1, Y, 8 * SIZE
-+ MADD s4, b4, a4, s4
-+ LD b2, Y, 9 * SIZE
-+ addi.d I, I, -1
-+ addi.d X, X, 8 * SIZE
-+ addi.d Y, Y, 8 * SIZE
-+ blt $r0, I, .L13
-+ .align 3
-+/* .L14: last full group; same arithmetic as .L13 but nothing is loaded beyond offset 7 * SIZE. */
-+.L14:
-+ MADD s1, b1, a1, s1
-+ LD a3, X, 2 * SIZE
-+ MADD s2, b1, a2, s2
-+ LD a4, X, 3 * SIZE
-+ MADD s3, b2, a1, s3
-+ LD b3, Y, 2 * SIZE
-+ MADD s4, b2, a2, s4
-+ LD b4, Y, 3 * SIZE
-+ MADD s1, b3, a3, s1
-+ LD a1, X, 4 * SIZE
-+ MADD s2, b3, a4, s2
-+ LD a2, X, 5 * SIZE
-+ MADD s3, b4, a3, s3
-+ LD b1, Y, 4 * SIZE
-+ MADD s4, b4, a4, s4
-+ LD b2, Y, 5 * SIZE
-+ MADD s1, b1, a1, s1
-+ LD a3, X, 6 * SIZE
-+ MADD s2, b1, a2, s2
-+ LD a4, X, 7 * SIZE
-+ MADD s3, b2, a1, s3
-+ LD b3, Y, 6 * SIZE
-+ MADD s4, b2, a2, s4
-+ LD b4, Y, 7 * SIZE
-+ MADD s1, b3, a3, s1
-+ addi.d X, X, 8 * SIZE
-+ MADD s2, b3, a4, s2
-+ addi.d Y, Y, 8 * SIZE
-+ MADD s3, b4, a3, s3
-+ MADD s4, b4, a4, s4
-+ .align 3
-+/* .L15: contiguous tail of N & 3 elements; the first one is loaded here, .L16 accumulates it while loading the next, .L17 accumulates the last. */
-+.L15:
-+ andi I, N, 3
-+ bge $r0, I, .L999
-+ LD a1, X, 0 * SIZE
-+ LD a2, X, 1 * SIZE
-+ LD b1, Y, 0 * SIZE
-+ addi.d I, I, -1
-+ LD b2, Y, 1 * SIZE
-+ bge $r0, I, .L17 /* single leftover element */
-+ .align 3
-+
-+.L16:
-+ MADD s1, b1, a1, s1
-+ addi.d I, I, -1
-+ MADD s2, b1, a2, s2
-+ LD b1, Y, 2 * SIZE
-+ MADD s3, b2, a1, s3
-+ LD a1, X, 2 * SIZE
-+ MADD s4, b2, a2, s4
-+ LD a2, X, 3 * SIZE
-+ LD b2, Y, 3 * SIZE
-+ addi.d X, X, 2 * SIZE
-+ addi.d Y, Y, 2 * SIZE
-+ blt $r0, I, .L16
-+ .align 3
-+
-+.L17:
-+ MADD s1, b1, a1, s1
-+ MADD s2, b1, a2, s2
-+ MADD s3, b2, a1, s3
-+ MADD s4, b2, a2, s4
-+ b .L999
-+ .align 3
-+/* .L20: strided path. Under F_INTERFACE a negative increment first moves the pointer back by (N - 1) * increment. */
-+.L20:
-+#ifdef F_INTERFACE
-+ bge INCX, $r0, .L21 /* INCX >= 0: X needs no adjustment */
-+ addi.d TEMP, N, -1
-+ mul.d TEMP, TEMP, INCX /* TEMP = (N - 1) * INCX */
-+/* INCX is negative here, so the subtraction below moves X forward by (N - 1) * |INCX| */
-+ sub.d X, X, TEMP
-+ .align 3
-+
-+.L21:
-+ bge INCY, $r0, .L22 /* INCY >= 0: Y needs no adjustment */
-+ addi.d TEMP, N, -1
-+ mul.d TEMP, TEMP, INCY /* TEMP = (N - 1) * INCY */
-+/* INCY is negative here, so the subtraction below moves Y forward by (N - 1) * |INCY| */
-+ sub.d Y, Y, TEMP
-+ .align 3
-+
-+.L22:
-+#endif
-+ bge $r0, I, .L25 /* fewer than four elements: tail only */
-+ LD a1, X, 0 * SIZE
-+ LD a2, X, 1 * SIZE
-+ LD b1, Y, 0 * SIZE
-+ LD b2, Y, 1 * SIZE
-+ add.d X, X, INCX
-+ addi.d I, I, -1
-+ add.d Y, Y, INCY
-+ bge $r0, I, .L24 /* exactly one group */
-+ .align 3
-+/* .L23: strided version of .L13; X and Y advance by INCX/INCY after each element is loaded. */
-+.L23:
-+ MADD s1, b1, a1, s1
-+ LD a3, X, 0 * SIZE
-+ MADD s2, b1, a2, s2
-+ LD a4, X, 1 * SIZE
-+ MADD s3, b2, a1, s3
-+ LD b3, Y, 0 * SIZE
-+ MADD s4, b2, a2, s4
-+ LD b4, Y, 1 * SIZE
-+ add.d X, X, INCX
-+ add.d Y, Y, INCY
-+ MADD s1, b3, a3, s1
-+ LD a1, X, 0 * SIZE
-+ MADD s2, b3, a4, s2
-+ LD a2, X, 1 * SIZE
-+ MADD s3, b4, a3, s3
-+ LD b1, Y, 0 * SIZE
-+ MADD s4, b4, a4, s4
-+ LD b2, Y, 1 * SIZE
-+ add.d X, X, INCX
-+ add.d Y, Y, INCY
-+ MADD s1, b1, a1, s1
-+ LD a3, X, 0 * SIZE
-+ MADD s2, b1, a2, s2
-+ LD a4, X, 1 * SIZE
-+ MADD s3, b2, a1, s3
-+ LD b3, Y, 0 * SIZE
-+ MADD s4, b2, a2, s4
-+ LD b4, Y, 1 * SIZE
-+ add.d X, X, INCX
-+ add.d Y, Y, INCY
-+ MADD s1, b3, a3, s1
-+ LD a1, X, 0 * SIZE
-+ MADD s2, b3, a4, s2
-+ LD a2, X, 1 * SIZE
-+ MADD s3, b4, a3, s3
-+ LD b1, Y, 0 * SIZE
-+ MADD s4, b4, a4, s4
-+ LD b2, Y, 1 * SIZE
-+ add.d X, X, INCX
-+ addi.d I, I, -1
-+ add.d Y, Y, INCY
-+ blt $r0, I, .L23
-+ .align 3
-+/* .L24: last full strided group; nothing is loaded past its fourth element. */
-+.L24:
-+ MADD s1, b1, a1, s1
-+ LD a3, X, 0 * SIZE
-+ MADD s2, b1, a2, s2
-+ LD a4, X, 1 * SIZE
-+ MADD s3, b2, a1, s3
-+ LD b3, Y, 0 * SIZE
-+ MADD s4, b2, a2, s4
-+ LD b4, Y, 1 * SIZE
-+ add.d X, X, INCX
-+ add.d Y, Y, INCY
-+ MADD s1, b3, a3, s1
-+ LD a1, X, 0 * SIZE
-+ MADD s2, b3, a4, s2
-+ LD a2, X, 1 * SIZE
-+ MADD s3, b4, a3, s3
-+ LD b1, Y, 0 * SIZE
-+ MADD s4, b4, a4, s4
-+ LD b2, Y, 1 * SIZE
-+ add.d X, X, INCX
-+ add.d Y, Y, INCY
-+ MADD s1, b1, a1, s1
-+ LD a3, X, 0 * SIZE
-+ MADD s2, b1, a2, s2
-+ LD a4, X, 1 * SIZE
-+ MADD s3, b2, a1, s3
-+ LD b3, Y, 0 * SIZE
-+ MADD s4, b2, a2, s4
-+ LD b4, Y, 1 * SIZE
-+ MADD s1, b3, a3, s1
-+ add.d X, X, INCX
-+ MADD s2, b3, a4, s2
-+ add.d Y, Y, INCY
-+ MADD s3, b4, a3, s3
-+ MADD s4, b4, a4, s4
-+ .align 3
-+/* .L25/.L26: strided tail of N & 3 elements, one load-and-accumulate per pass. */
-+.L25:
-+ andi I, N, 3
-+ bge $r0, I, .L999
-+ .align 3
-+.L26:
-+ LD a1, X, 0 * SIZE
-+ LD a2, X, 1 * SIZE
-+ LD b1, Y, 0 * SIZE
-+ LD b2, Y, 1 * SIZE
-+ MADD s1, b1, a1, s1
-+ MADD s2, b1, a2, s2
-+ MADD s3, b2, a1, s3
-+ MADD s4, b2, a2, s4
-+ add.d X, X, INCX
-+ add.d Y, Y, INCY
-+ addi.d I, I, -1
-+ blt $r0, I, .L26
-+ .align 3
-+/* .L999: combine the partial sums. Without CONJ: $f0 = s1 - s4, $f1 = s3 + s2. With CONJ: $f0 = s1 + s4, $f1 = s3 - s2. */
-+.L999:
-+#ifndef CONJ
-+ SUB $f0, s1, s4
-+#else
-+ ADD $f0, s1, s4
-+#endif
-+#ifndef CONJ
-+ ADD $f1, s3, s2
-+#else
-+ SUB $f1, s3, s2
-+#endif
-+ jirl $r0, $r1, 0x0 /* return to the address in $r1 */
-+
-+ EPILOGUE
-diff --git a/kernel/loongarch64/zgemm3m_kernel.S b/kernel/loongarch64/zgemm3m_kernel.S
-new file mode 100644
-index 0000000..f9acb6c
---- /dev/null
-+++ 
b/kernel/loongarch64/zgemm3m_kernel.S -@@ -0,0 +1,1359 @@ -+/*************************************************************************** -+Copyright (c) 2020, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define K $r6 -+#define A $r7 -+#define B $r8 -+#define C $r9 -+#define LDC $r10 -+ -+#define AO $r12 -+#define BO $r13 -+#define I $r17 -+#define J $r18 -+#define L $r11 -+#define CO1 $r14 -+#define CO2 $r15 -+#define CO3 $r23 -+#define CO4 $r24 -+#define CO5 $r25 -+#define CO6 $r26 -+#define CO7 $r27 -+#define CO8 $r28 -+ -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f28 -+#define a4 $f29 -+#define b1 $f23 -+#define b2 $f9 -+#define b3 $f10 -+#define b4 $f11 -+#define b5 $f12 -+#define b6 $f13 -+#define b7 $f14 -+#define b8 $f15 -+#define a5 b8 -+#define c11 $f16 -+#define c12 $f17 -+#define c21 $f3 -+#define c22 $f4 -+#define c31 $f2 -+#define c32 $f5 -+#define c41 $f6 -+#define c42 $f7 -+#define c51 $f18 -+#define c52 $f19 -+#define c61 $f20 -+#define c62 $f21 -+#define c71 $f24 -+#define c72 $f25 -+#define c81 $f26 -+#define c82 $f27 -+#define ALPHA_R $f0 -+#define ALPHA_I $f1 -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -128 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 -+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ SDARG $r28, $sp, 40 -+ fst.d $f24, $sp, 48 -+ fst.d $f25, $sp, 56 -+ fst.d $f26, $sp, 64 -+ fst.d $f27, $sp, 72 -+ fst.d $f28, $sp, 80 -+ fst.d $f29, $sp, 88 -+ slli.d LDC, LDC, ZBASE_SHIFT -+ srai.d J, N, 3 -+ bge $r0, J, .L30 -+.L10: -+ move CO1, C -+ MTC c11, $r0 -+ add.d CO2, C, LDC -+ move AO, A -+ add.d CO3, CO2, LDC -+ addi.d J, J, -1 -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ add.d CO5, CO4, LDC -+ MOV c31, c11 -+ add.d CO6, CO5, LDC -+ MOV c41, c11 -+ add.d CO7, CO6, LDC -+ MOV c51, c11 -+ add.d CO8, CO7, LDC -+ srai.d I, M, 1 -+ add.d C, CO8, LDC -+MOV c61, c11 -+ bge $r0, I, .L20 -+.L11: -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, B, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, K, 2 
-+ MOV c32, c11 -+ LD b3, B, 2 * SIZE -+ MOV c42, c11 -+ LD b4, B, 3 * SIZE -+ MOV c52, c11 -+ LD b5, B, 4 * SIZE -+ MOV c62, c11 -+ LD b6, B, 8 * SIZE -+ MOV c72, c11 -+ LD b7, B, 12 * SIZE -+ MOV c82, c11 -+move BO, B -+ bge $r0, L, .L15 -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ bge $r0, L, .L13 -+ .align 3 -+.L12: -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ LD a4, AO, 2 * SIZE -+ MADD c61, b2, a1, c61 -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ MADD c61, b2, a4, c61 -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ LD a4, AO, 6 * SIZE -+ MADD c61, b2, a3, c61 -+ MADD c71, b3, a3, c71 -+ MADD 
c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ addi.d L, L, -1 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ blt $r0, L, .L12 -+ .align 3 -+ -+.L13: -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ MADD c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a4, c51 -+ MADD c61, b2, a4, c61 -+ MADD 
c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ MADD c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD c71, b3, a3, c71 -+ MADD c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a4, c21 -+ MADD c31, b3, a4, c31 -+ MADD c41, b4, a4, c41 -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD c71, b3, a4, c71 -+ MADD c81, b4, a4, c81 -+ MADD c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ .align 3 -+ -+.L15: -+ andi L, K, 3 -+ bge $r0, L, .L18 -+ .align 3 -+.L16: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ addi.d L, 
L, -1 -+ MADD c61, b2, a1, c61 -+ addi.d AO, AO, 2 * SIZE -+ MADD c71, b3, a1, c71 -+ addi.d BO, BO, 8 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ MADD c52, b5, a2, c52 -+ LD b5, BO, 4 * SIZE -+ MADD c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L16 -+.L18: -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO1, 1 * SIZE -+ LD $f23, CO1, 2 * SIZE -+ LD $f9, CO1, 3 * SIZE -+ LD $f10, CO2, 0 * SIZE -+ MADD $f22, c11, ALPHA_R, $f22 -+ LD $f11, CO2, 1 * SIZE -+ MADD $f8, c11, ALPHA_I, $f8 -+ LD $f12, CO2, 2 * SIZE -+ MADD $f23, c12, ALPHA_R, $f23 -+ LD $f13, CO2, 3 * SIZE -+ MADD $f9, c12, ALPHA_I, $f9 -+ MADD $f10, c21, ALPHA_R, $f10 -+ ST $f22, CO1, 0 * SIZE -+ MADD $f11, c21, ALPHA_I, $f11 -+ ST $f8, CO1, 1 * SIZE -+ MADD $f12, c22, ALPHA_R, $f12 -+ ST $f23, CO1, 2 * SIZE -+ MADD $f13, c22, ALPHA_I, $f13 -+ ST $f9, CO1, 3 * SIZE -+ LD $f22, CO3, 0 * SIZE -+ LD $f8, CO3, 1 * SIZE -+ LD $f23, CO3, 2 * SIZE -+ LD $f9, CO3, 3 * SIZE -+ ST $f10, CO2, 0 * SIZE -+ ST $f11, CO2, 1 * SIZE -+ ST $f12, CO2, 2 * SIZE -+ ST $f13, CO2, 3 * SIZE -+ LD $f10, CO4, 0 * SIZE -+ LD $f11, CO4, 1 * SIZE -+ LD $f12, CO4, 2 * SIZE -+ LD $f13, CO4, 3 * SIZE -+ MADD $f22, c31, ALPHA_R, $f22 -+ MADD $f8, c31, ALPHA_I, $f8 -+ MADD $f23, c32, ALPHA_R, $f23 -+ MADD $f9, c32, ALPHA_I, $f9 -+ MADD $f10, c41, ALPHA_R, $f10 -+ ST $f22, CO3, 0 * SIZE -+ MADD $f11, c41, ALPHA_I, $f11 -+ ST $f8, CO3, 1 * SIZE -+ MADD $f12, c42, ALPHA_R, $f12 -+ ST $f23, CO3, 2 * SIZE -+ MADD $f13, c42, ALPHA_I, $f13 -+ ST $f9, CO3, 3 * SIZE -+ LD $f22, CO5, 0 * SIZE -+ LD $f8, CO5, 1 * SIZE -+ LD $f23, CO5, 2 * SIZE -+ LD $f9, CO5, 3 * SIZE -+ ST $f10, CO4, 0 * SIZE -+ ST $f11, CO4, 1 * SIZE -+ ST $f12, CO4, 2 * SIZE -+ ST $f13, CO4, 3 * SIZE -+ LD $f10, CO6, 0 * SIZE -+ LD $f11, CO6, 1 * SIZE -+ LD $f12, CO6, 2 * SIZE -+ LD $f13, CO6, 3 * SIZE -+ MADD $f22, c51, ALPHA_R, $f22 -+ addi.d CO1,CO1, 4 * SIZE -+ MADD 
$f8, c51, ALPHA_I, $f8 -+ addi.d CO2,CO2, 4 * SIZE -+ MADD $f23, c52, ALPHA_R, $f23 -+ addi.d CO3,CO3, 4 * SIZE -+ MADD $f9, c52, ALPHA_I, $f9 -+ addi.d CO4,CO4, 4 * SIZE -+ MADD $f10, c61, ALPHA_R, $f10 -+ ST $f22, CO5, 0 * SIZE -+ MADD $f11, c61, ALPHA_I, $f11 -+ ST $f8, CO5, 1 * SIZE -+ MADD $f12, c62, ALPHA_R, $f12 -+ ST $f23, CO5, 2 * SIZE -+ MADD $f13, c62, ALPHA_I, $f13 -+ ST $f9, CO5, 3 * SIZE -+ LD $f22, CO7, 0 * SIZE -+ LD $f8, CO7, 1 * SIZE -+ LD $f23, CO7, 2 * SIZE -+ LD $f9, CO7, 3 * SIZE -+ ST $f10, CO6, 0 * SIZE -+ ST $f11, CO6, 1 * SIZE -+ ST $f12, CO6, 2 * SIZE -+ ST $f13, CO6, 3 * SIZE -+ LD $f10, CO8, 0 * SIZE -+ addi.d I, I, -1 -+ LD $f11, CO8, 1 * SIZE -+MTC c11, $r0 -+ LD $f12, CO8, 2 * SIZE -+ LD $f13, CO8, 3 * SIZE -+ MADD $f22, c71, ALPHA_R, $f22 -+ addi.d CO5,CO5, 4 * SIZE -+ MADD $f8, c71, ALPHA_I, $f8 -+ addi.d CO6,CO6, 4 * SIZE -+ MADD $f23, c72, ALPHA_R, $f23 -+ addi.d CO7,CO7, 4 * SIZE -+ MADD $f9, c72, ALPHA_I, $f9 -+ addi.d CO8,CO8, 4 * SIZE -+ MADD $f10, c81, ALPHA_R, $f10 -+ ST $f22, CO7, -4 * SIZE -+ MADD $f11, c81, ALPHA_I, $f11 -+ ST $f8, CO7, -3 * SIZE -+ MADD $f12, c82, ALPHA_R, $f12 -+ ST $f23, CO7, -2 * SIZE -+ MADD $f13, c82, ALPHA_I, $f13 -+ ST $f9, CO7, -1 * SIZE -+ ST $f10, CO8, -4 * SIZE -+ MOV c21, c11 -+ ST $f11, CO8, -3 * SIZE -+ MOV c31, c11 -+ ST $f12, CO8, -2 * SIZE -+ MOV c41, c11 -+ ST $f13, CO8, -1 * SIZE -+ MOV c51, c11 -+MOV c61, c11 -+ blt $r0, I, .L11 -+ .align 3 -+ -+.L20: -+ andi I, M, 1 -+ MOV c61, c11 -+MOV c71, c11 -+ bge $r0, I, .L29 -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, K, 2 -+ MOV c81, c11 -+move BO, B -+ bge $r0, L, .L25 -+ .align 3 -+.L22: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * 
SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, BO, 20 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 9 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 10 * SIZE -+ MADD c81, b4, a1, c81 -+ LD b4, BO, 11 * SIZE -+ LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * SIZE -+ MADD c51, b7, a2, c51 -+ LD b7, BO, 28 * SIZE -+ MADD c61, b2, a2, c61 -+ LD b2, BO, 17 * SIZE -+ MADD c71, b3, a2, c71 -+ LD b3, BO, 18 * SIZE -+ MADD c81, b4, a2, c81 -+ LD b4, BO, 19 * SIZE -+ LD a2, AO, 5 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 32 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 21 * SIZE -+ MADD c31, b3, a3, c31 -+ LD b3, BO, 22 * SIZE -+ MADD c41, b4, a3, c41 -+ LD b4, BO, 23 * SIZE -+ MADD c51, b5, a3, c51 -+ LD b5, BO, 36 * SIZE -+ MADD c61, b2, a3, c61 -+ LD b2, BO, 25 * SIZE -+ MADD c71, b3, a3, c71 -+ LD b3, BO, 26 * SIZE -+ MADD c81, b4, a3, c81 -+ LD b4, BO, 27 * SIZE -+ LD a3, AO, 2 * SIZE -+ addi.d BO, BO, 32 * SIZE -+ MADD c11, b6, a4, c11 -+ LD b6, BO, 8 * SIZE -+ MADD c21, b2, a4, c21 -+ LD b2, BO, -3 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, -2 * SIZE -+ MADD c41, b4, a4, c41 -+ LD b4, BO, -1 * SIZE -+ MADD c51, b7, a4, c51 -+ LD b7, BO, 12 * SIZE -+ MADD c61, b2, a4, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a4, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a4, c81 -+ LD b4, BO, 3 * SIZE -+ LD a4, AO, 3 * SIZE -+ blt $r0, L, .L22 -+ .align 3 -+ -+.L25: -+ andi L, K, 3 -+ bge $r0, L, .L28 -+ .align 3 -+.L26: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ MOV a2, a2 -+ addi.d AO, AO, 1 * SIZE -+ addi.d BO, BO, 8 * SIZE -+ MADD c51, b5, a1, c51 -+ LD b5, 
BO, 4 * SIZE -+ MADD c61, b2, a1, c61 -+ LD b2, BO, 1 * SIZE -+ MADD c71, b3, a1, c71 -+ LD b3, BO, 2 * SIZE -+ MADD c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L26 -+.L28: -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO1, 1 * SIZE -+ LD $f23, CO2, 0 * SIZE -+ LD $f9, CO2, 1 * SIZE -+ LD $f10, CO3, 0 * SIZE -+ MADD $f22, c11, ALPHA_R, $f22 -+ LD $f11, CO3, 1 * SIZE -+ MADD $f8, c11, ALPHA_I, $f8 -+ LD $f12, CO4, 0 * SIZE -+ MADD $f23, c21, ALPHA_R, $f23 -+ LD $f13, CO4, 1 * SIZE -+ MADD $f9, c21, ALPHA_I, $f9 -+ MADD $f10, c31, ALPHA_R, $f10 -+ ST $f22, CO1, 0 * SIZE -+ MADD $f11, c31, ALPHA_I, $f11 -+ ST $f8, CO1, 1 * SIZE -+ MADD $f12, c41, ALPHA_R, $f12 -+ ST $f23, CO2, 0 * SIZE -+ MADD $f13, c41, ALPHA_I, $f13 -+ ST $f9, CO2, 1 * SIZE -+ LD $f22, CO5, 0 * SIZE -+ LD $f8, CO5, 1 * SIZE -+ LD $f23, CO6, 0 * SIZE -+ LD $f9, CO6, 1 * SIZE -+ ST $f10, CO3, 0 * SIZE -+ ST $f11, CO3, 1 * SIZE -+ ST $f12, CO4, 0 * SIZE -+ ST $f13, CO4, 1 * SIZE -+ LD $f10, CO7, 0 * SIZE -+ MADD $f22, c51, ALPHA_R, $f22 -+ LD $f11, CO7, 1 * SIZE -+ MADD $f8, c51, ALPHA_I, $f8 -+ LD $f12, CO8, 0 * SIZE -+ MADD $f23, c61, ALPHA_R, $f23 -+ LD $f13, CO8, 1 * SIZE -+ MADD $f9, c61, ALPHA_I, $f9 -+ MADD $f10, c71, ALPHA_R, $f10 -+ ST $f22, CO5, 0 * SIZE -+ MADD $f11, c71, ALPHA_I, $f11 -+ ST $f8, CO5, 1 * SIZE -+ MADD $f12, c81, ALPHA_R, $f12 -+ ST $f23, CO6, 0 * SIZE -+ MADD $f13, c81, ALPHA_I, $f13 -+ ST $f9, CO6, 1 * SIZE -+ ST $f10, CO7, 0 * SIZE -+ ST $f11, CO7, 1 * SIZE -+ ST $f12, CO8, 0 * SIZE -+ ST $f13, CO8, 1 * SIZE -+ .align 3 -+ -+.L29: -+move B, BO -+ blt $r0, J, .L10 -+ .align 3 -+ -+.L30: -+ andi J, N, 4 -+move AO, A -+ bge $r0, J, .L50 -+ move CO1, C -+MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ add.d C, CO4, LDC -+ MOV c31, c11 -+ srai.d I, M, 1 -+MOV c41, c11 -+ bge $r0, I, .L40 -+.L31: -+ LD a1, AO, 0 * SIZE -+ LD a3, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ 
MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ MOV c32, c11 -+ LD b4, B, 3 * SIZE -+ MOV c42, c11 -+ LD b5, B, 4 * SIZE -+ srai.d L, K, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L35 -+ .align 3 -+.L32: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD c11, b5, a1, c11 -+ LD a2, AO, 3 * SIZE -+ MADD c21, b2, a1, c21 -+ MADD c31, b3, a1, c31 -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 8 * SIZE -+ MADD c12, b5, a2, c12 -+ LD b5, BO, 20 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 9 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 10 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 11 * SIZE -+ MADD c11, b6, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD c21, b2, a3, c21 -+ MADD c31, b3, a3, c31 -+ MADD c41, b4, a3, c41 -+ LD a3, AO, 6 * SIZE -+ MADD c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD c11, b7, a3, c11 -+ LD a2, AO, 7 * SIZE -+ MADD c21, b2, a3, c21 -+ addi.d AO, AO, 8 * SIZE -+ MADD c31, b3, a3, c31 -+ addi.d BO, BO, 16 * SIZE -+ MADD c41, b4, a3, c41 -+ LD a3, AO, 4 * SIZE -+ MADD c12, b7, a2, c12 -+ LD b7, BO, 12 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 1 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 2 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L32 -+ .align 3 -+ -+.L35: -+ andi L, K, 3 -+ bge $r0, L, .L38 -+ .align 3 -+.L36: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD c31, b3, a1, c31 -+ addi.d AO, AO, 2 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 0 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 4 * 
SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L36 -+.L38: -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO1, 1 * SIZE -+ LD $f23, CO1, 2 * SIZE -+ LD $f9, CO1, 3 * SIZE -+ LD $f10, CO2, 0 * SIZE -+ LD $f11, CO2, 1 * SIZE -+ LD $f12, CO2, 2 * SIZE -+ LD $f13, CO2, 3 * SIZE -+ MADD $f22, c11, ALPHA_R, $f22 -+ MADD $f8, c11, ALPHA_I, $f8 -+ MADD $f23, c12, ALPHA_R, $f23 -+ MADD $f9, c12, ALPHA_I, $f9 -+ MADD $f10, c21, ALPHA_R, $f10 -+ ST $f22, CO1, 0 * SIZE -+ MADD $f11, c21, ALPHA_I, $f11 -+ ST $f8, CO1, 1 * SIZE -+ MADD $f12, c22, ALPHA_R, $f12 -+ ST $f23, CO1, 2 * SIZE -+ MADD $f13, c22, ALPHA_I, $f13 -+ ST $f9, CO1, 3 * SIZE -+ LD $f22, CO3, 0 * SIZE -+ LD $f8, CO3, 1 * SIZE -+ LD $f23, CO3, 2 * SIZE -+ LD $f9, CO3, 3 * SIZE -+ ST $f10, CO2, 0 * SIZE -+ MADD $f22, c31, ALPHA_R, $f22 -+ ST $f11, CO2, 1 * SIZE -+ MADD $f8, c31, ALPHA_I, $f8 -+ ST $f12, CO2, 2 * SIZE -+ MADD $f23, c32, ALPHA_R, $f23 -+ ST $f13, CO2, 3 * SIZE -+ MADD $f9, c32, ALPHA_I, $f9 -+ LD $f10, CO4, 0 * SIZE -+ LD $f11, CO4, 1 * SIZE -+ LD $f12, CO4, 2 * SIZE -+ LD $f13, CO4, 3 * SIZE -+ MADD $f10, c41, ALPHA_R, $f10 -+ addi.d CO1,CO1, 4 * SIZE -+ MADD $f11, c41, ALPHA_I, $f11 -+ addi.d CO2,CO2, 4 * SIZE -+ MADD $f12, c42, ALPHA_R, $f12 -+ addi.d CO3,CO3, 4 * SIZE -+ MADD $f13, c42, ALPHA_I, $f13 -+ addi.d CO4,CO4, 4 * SIZE -+ ST $f22, CO3, -4 * SIZE -+ addi.d I, I, -1 -+ ST $f8, CO3, -3 * SIZE -+ ST $f23, CO3, -2 * SIZE -+ ST $f9, CO3, -1 * SIZE -+ ST $f10, CO4, -4 * SIZE -+MTC c11, $r0 -+ ST $f11, CO4, -3 * SIZE -+ MOV c21, c11 -+ ST $f12, CO4, -2 * SIZE -+ MOV c31, c11 -+ ST $f13, CO4, -1 * SIZE -+MOV c41, c11 -+ blt $r0, I, .L31 -+ .align 3 -+ -+.L40: -+ andi I, M, 1 -+MOV c61, c11 -+ bge $r0, I, .L49 -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c81, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 
3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, K, 2 -+move BO, B -+ bge $r0, L, .L45 -+ .align 3 -+.L42: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 16 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD b4, BO, 7 * SIZE -+ LD a1, AO, 4 * SIZE -+ addi.d L, L, -1 -+ MADD c11, b5, a2, c11 -+ LD b5, BO, 20 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 10 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 11 * SIZE -+ LD a2, AO, 2 * SIZE -+ addi.d AO, AO, 4 * SIZE -+ MADD c11, b6, a2, c11 -+ LD b6, BO, 24 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 13 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 14 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 15 * SIZE -+ LD a2, AO, -1 * SIZE -+ addi.d BO, BO, 16 * SIZE -+ MADD c11, b7, a2, c11 -+ LD b7, BO, 12 * SIZE -+ MADD c21, b2, a2, c21 -+ LD b2, BO, 1 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 2 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 3 * SIZE -+ LD a2, AO, 1 * SIZE -+ blt $r0, L, .L42 -+ .align 3 -+ -+.L45: -+ andi L, K, 3 -+ bge $r0, L, .L48 -+ .align 3 -+.L46: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a1, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a1, c41 -+ LD a1, AO, 1 * SIZE -+ LD b4, BO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+ MOV a2, a2 -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L46 -+.L48: -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO1, 1 * SIZE -+ LD $f23, CO2, 0 * SIZE -+ LD $f9, CO2, 1 * SIZE -+ LD $f10, CO3, 0 * SIZE -+ MADD $f22, c11, ALPHA_R, $f22 -+ LD $f11, CO3, 1 * SIZE -+ MADD $f8, c11, ALPHA_I, $f8 -+ LD $f12, CO4, 0 * SIZE -+ MADD $f23, c21, ALPHA_R, $f23 -+ LD $f13, CO4, 1 * SIZE -+ MADD $f9, c21, ALPHA_I, $f9 -+ MADD $f10, c31, ALPHA_R, $f10 -+ ST $f22, CO1, 0 * SIZE -+ MADD $f11, c31, ALPHA_I, $f11 -+ ST $f8, CO1, 1 * SIZE -+ MADD $f12, c41, ALPHA_R, 
$f12 -+ ST $f23, CO2, 0 * SIZE -+ MADD $f13, c41, ALPHA_I, $f13 -+ ST $f9, CO2, 1 * SIZE -+ ST $f10, CO3, 0 * SIZE -+ ST $f11, CO3, 1 * SIZE -+ ST $f12, CO4, 0 * SIZE -+ ST $f13, CO4, 1 * SIZE -+ .align 3 -+ -+.L49: -+ move B, BO -+ .align 3 -+ -+.L50: -+ andi J, N, 2 -+move AO, A -+ bge $r0, J, .L70 -+ move CO1, C -+ add.d CO2, C, LDC -+ srai.d I, M, 1 -+add.d C, CO2, LDC -+ bge $r0, I, .L60 -+.L51: -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, K, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L55 -+ .align 3 -+.L52: -+ MADD c11, b1, a1, c11 -+ LD a3, AO, 2 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b4, BO, 3 * SIZE -+ MADD c12, b1, a2, c12 -+ LD a4, AO, 3 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b1, BO, 8 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a1, AO, 8 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 5 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 6 * SIZE -+ MADD c11, b5, a5, c11 -+ LD a3, AO, 6 * SIZE -+ MADD c21, b2, a5, c21 -+ LD b4, BO, 7 * SIZE -+ MADD c12, b5, a2, c12 -+ LD a4, AO, 7 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b5, BO, 12 * SIZE -+ MADD c11, b3, a3, c11 -+ LD a5, AO, 12 * SIZE -+ MADD c21, b4, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c12, b3, a4, c12 -+ LD a2, AO, 9 * SIZE -+ MADD c22, b4, a4, c22 -+ LD b3, BO, 10 * SIZE -+ addi.d AO, AO, 8 * SIZE -+ addi.d L, L, -1 -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L52 -+ .align 3 -+ -+.L55: -+ andi L, K, 3 -+ bge $r0, L, .L58 -+ .align 3 -+.L56: -+ MADD c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD c22, b2, a2, c22 -+ LD b2, BO, 3 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L56 -+.L58: -+ LD $f22, CO1, 0 * SIZE 
-+ LD $f8, CO1, 1 * SIZE -+ LD $f23, CO1, 2 * SIZE -+ LD $f9, CO1, 3 * SIZE -+ LD $f10, CO2, 0 * SIZE -+ LD $f11, CO2, 1 * SIZE -+ LD $f12, CO2, 2 * SIZE -+ LD $f13, CO2, 3 * SIZE -+ MADD $f22, c11, ALPHA_R, $f22 -+ addi.d I, I, -1 -+ MADD $f8, c11, ALPHA_I, $f8 -+ addi.d CO1,CO1, 4 * SIZE -+ MADD $f23, c12, ALPHA_R, $f23 -+ addi.d CO2,CO2, 4 * SIZE -+ MADD $f9, c12, ALPHA_I, $f9 -+ MADD $f10, c21, ALPHA_R, $f10 -+ MADD $f11, c21, ALPHA_I, $f11 -+ MADD $f12, c22, ALPHA_R, $f12 -+ MADD $f13, c22, ALPHA_I, $f13 -+ ST $f22, CO1, -4 * SIZE -+ ST $f8, CO1, -3 * SIZE -+ ST $f23, CO1, -2 * SIZE -+ ST $f9, CO1, -1 * SIZE -+ ST $f10, CO2, -4 * SIZE -+ ST $f11, CO2, -3 * SIZE -+ ST $f12, CO2, -2 * SIZE -+ ST $f13, CO2, -1 * SIZE -+ blt $r0, I, .L51 -+ .align 3 -+ -+.L60: -+ andi I, M, 1 -+ bge $r0, I, .L69 -+ srai.d L, K, 2 -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ MOV c31, c11 -+ LD a4, AO, 3 * SIZE -+ MOV c41, c11 -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L65 -+ .align 3 -+.L62: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 4 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 5 * SIZE -+ MADD c31, b3, a2, c31 -+ LD b3, BO, 6 * SIZE -+ MADD c41, b4, a2, c41 -+ LD b4, BO, 7 * SIZE -+ LD a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ MADD c11, b1, a3, c11 -+ LD b1, BO, 8 * SIZE -+ MADD c21, b2, a3, c21 -+ LD b2, BO, 9 * SIZE -+ MADD c31, b3, a4, c31 -+ LD b3, BO, 10 * SIZE -+ MADD c41, b4, a4, c41 -+ LD b4, BO, 11 * SIZE -+ LD a3, AO, 6 * SIZE -+ LD a4, AO, 7 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L62 -+ .align 3 -+ -+.L65: -+ andi L, K, 3 -+ bge $r0, L, .L68 -+ .align 3 -+.L66: -+ MADD c11, b1, a1, c11 -+ LD b1, BO, 2 * SIZE -+ MADD c21, b2, a1, c21 -+ LD b2, BO, 3 * SIZE -+ LD a1, AO, 1 * SIZE -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * 
SIZE -+addi.d BO, BO, 2 * SIZE -+ blt $r0, L, .L66 -+.L68: -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO1, 1 * SIZE -+ LD $f23, CO2, 0 * SIZE -+ LD $f9, CO2, 1 * SIZE -+ ADD c11, c11, c31 -+ ADD c21, c21, c41 -+ MADD $f22, c11, ALPHA_R, $f22 -+ MADD $f8, c11, ALPHA_I, $f8 -+ MADD $f23, c21, ALPHA_R, $f23 -+ MADD $f9, c21, ALPHA_I, $f9 -+ ST $f22, CO1, 0 * SIZE -+ ST $f8, CO1, 1 * SIZE -+ ST $f23, CO2, 0 * SIZE -+ ST $f9, CO2, 1 * SIZE -+ .align 3 -+ -+.L69: -+ move B, BO -+ .align 3 -+ -+.L70: -+ andi J, N, 1 -+move AO, A -+ bge $r0, J, .L999 -+ move CO1, C -+ srai.d I, M, 1 -+add.d C, CO1, LDC -+ bge $r0, I, .L80 -+.L71: -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a5, AO, 4 * SIZE -+ LD b1, B, 0 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ LD b3, B, 2 * SIZE -+ LD b5, B, 4 * SIZE -+ srai.d L, K, 2 -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+move BO, B -+ bge $r0, L, .L75 -+ .align 3 -+.L72: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 2 * SIZE -+ LD a2, AO, 3 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 4 * SIZE -+ LD a2, AO, 5 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ LD a1, AO, 6 * SIZE -+ LD a2, AO, 7 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 8 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L72 -+ .align 3 -+ -+.L75: -+ andi L, K, 3 -+ bge $r0, L, .L78 -+ .align 3 -+.L76: -+ LD a1, AO, 0 * SIZE -+ LD a2, AO, 1 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ MADD c12, b1, a2, c12 -+ addi.d L, L, -1 -+ addi.d AO, AO, 2 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L76 -+.L78: -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO1, 1 * SIZE -+ LD $f23, CO1, 2 * SIZE -+ LD $f9, CO1, 3 * SIZE -+ ADD c11, c11, c21 -+ addi.d I, I, -1 -+ ADD c12, c12, c22 -+ addi.d 
CO1,CO1, 4 * SIZE -+ MADD $f22, c11, ALPHA_R, $f22 -+ MADD $f8, c11, ALPHA_I, $f8 -+ MADD $f23, c12, ALPHA_R, $f23 -+ MADD $f9, c12, ALPHA_I, $f9 -+ ST $f22, CO1, -4 * SIZE -+ ST $f8, CO1, -3 * SIZE -+ ST $f23, CO1, -2 * SIZE -+ ST $f9, CO1, -1 * SIZE -+ blt $r0, I, .L71 -+ .align 3 -+ -+.L80: -+ andi I, M, 1 -+ bge $r0, I, .L89 -+ LD a1, AO, 0 * SIZE -+MTC c11, $r0 -+ LD a2, AO, 1 * SIZE -+ MOV c21, c11 -+ LD a3, AO, 2 * SIZE -+ LD a4, AO, 3 * SIZE -+ LD b1, B, 0 * SIZE -+ LD b2, B, 1 * SIZE -+ LD b3, B, 2 * SIZE -+ LD b4, B, 3 * SIZE -+ LD b5, B, 4 * SIZE -+ LD b6, B, 8 * SIZE -+ LD b7, B, 12 * SIZE -+ srai.d L, K, 2 -+move BO, B -+ bge $r0, L, .L85 -+ .align 3 -+.L82: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 1 * SIZE -+ LD b1, BO, 1 * SIZE -+ MADD c21, b1, a1, c21 -+ LD a1, AO, 2 * SIZE -+ LD b1, BO, 2 * SIZE -+ MADD c11, b1, a1, c11 -+ LD a1, AO, 3 * SIZE -+ LD b1, BO, 3 * SIZE -+ MADD c21, b1, a1, c21 -+ addi.d L, L, -1 -+ addi.d AO, AO, 4 * SIZE -+addi.d BO, BO, 4 * SIZE -+ blt $r0, L, .L82 -+ .align 3 -+ -+.L85: -+ andi L, K, 3 -+ bge $r0, L, .L88 -+ .align 3 -+.L86: -+ LD a1, AO, 0 * SIZE -+ LD b1, BO, 0 * SIZE -+ MADD c11, b1, a1, c11 -+ addi.d L, L, -1 -+ addi.d AO, AO, 1 * SIZE -+addi.d BO, BO, 1 * SIZE -+ blt $r0, L, .L86 -+.L88: -+ LD $f22, CO1, 0 * SIZE -+ LD $f8, CO1, 1 * SIZE -+ ADD c11, c11, c21 -+ MADD $f22, c11, ALPHA_R, $f22 -+ MADD $f8, c11, ALPHA_I, $f8 -+ ST $f22, CO1, 0 * SIZE -+ ST $f8, CO1, 1 * SIZE -+ .align 3 -+ -+.L89: -+ move B, BO -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 16 -+ LDARG $r26, $sp, 24 -+ LDARG $r27, $sp, 32 -+ LDARG $r28, $sp, 40 -+ fld.d $f24, $sp, 48 -+ fld.d $f25, $sp, 56 -+ fld.d $f26, $sp, 64 -+ fld.d $f27, $sp, 72 -+ fld.d $f28, $sp, 80 -+ fld.d $f29, $sp, 88 -+ addi.d $sp, $sp, 128 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/zgemm_kernel.S 
b/kernel/loongarch64/zgemm_kernel.S -new file mode 100644 -index 0000000..2d50d41 ---- /dev/null -+++ b/kernel/loongarch64/zgemm_kernel.S -@@ -0,0 +1,1047 @@ -+/*************************************************************************** -+Copyright (c) 2020, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define K $r6 -+#define A $r7 -+#define B $r8 -+#define C $r9 -+#define LDC $r10 -+ -+#define AO $r12 -+#define BO $r13 -+#define I $r17 -+#define J $r18 -+#define L $r25 -+#define CO1 $r14 -+#define CO2 $r15 -+#define CO3 $r23 -+#define CO4 $r24 -+ -+#if defined(TRMMKERNEL) -+#define OFFSET $r11 -+#define KK $r26 -+#define TEMP $r27 -+#endif -+ -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f28 -+#define a4 $f29 -+#define b1 $f23 -+#define b2 $f9 -+#define b3 $f10 -+#define b4 $f11 -+#define b5 $f12 -+#define b6 $f13 -+#define b7 $f14 -+#define b8 $f15 -+#define a5 b8 -+#define c11 $f16 -+#define c12 $f17 -+#define c21 $f3 -+#define c22 $f4 -+#define c31 $f2 -+#define c32 $f5 -+#define c41 $f6 -+#define c42 $f7 -+#define c51 $f18 -+#define c52 $f19 -+#define c61 $f20 -+#define c62 $f21 -+#define c71 $f24 -+#define c72 $f25 -+#define c81 $f26 -+#define c82 $f27 -+#define ALPHA_R $f0 -+#define ALPHA_I $f1 -+ -+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 MADD -+#define MADD4 NMSUB -+#endif -+ -+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 NMSUB -+#define MADD4 MADD -+#endif -+ -+#if defined(RN) || defined(RT) || defined(CN) || defined(CT) -+#define MADD1 MADD -+#define MADD2 NMSUB -+#define MADD3 MADD -+#define MADD4 MADD -+#endif -+ -+#if defined(RR) || defined(RC) || defined(CR) || defined(CC) -+#define MADD1 MADD -+#define MADD2 NMSUB -+#define MADD3 NMSUB -+#define MADD4 NMSUB -+#endif -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -128 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 64 -+ fst.d $f24, $sp, 16 -+ fst.d $f25, $sp, 24 -+ fst.d $f26, $sp, 32 -+ fst.d $f27, $sp, 40 -+ fst.d $f28, $sp, 48 -+ fst.d $f29, $sp, 56 -+#if 
defined(TRMMKERNEL) -+ SDARG $r26, $sp, 72 -+ SDARG $r27, $sp, 80 -+#endif -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 88 -+ fst.d $f19, $sp, 96 -+ fst.d $f20, $sp, 104 -+ fst.d $f21, $sp, 112 -+#endif -+ slli.d LDC, LDC, ZBASE_SHIFT -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ sub.d KK, $r0, OFFSET -+#endif -+ srai.d J, N, 2 -+nop -+ bge $r0, J, .L20 -+.L10: -+ move CO1, C -+ MTC c11, $r0 -+ add.d CO2, C, LDC -+ move AO, A -+ add.d CO3, CO2, LDC -+ addi.d J, J, -1 -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ MOV c31, c11 -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move KK, OFFSET -+#endif -+ MOV c41, c11 -+ MOV c51, c11 -+ move I, M -+ add.d C, CO4, LDC -+ MOV c61, c11 -+ bge $r0, I, .L19 -+.L11: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, ZBASE_SHIFT -+ slli.d TEMP, KK, 2 + ZBASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ MOV c32, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c42, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c52, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c62, c11 -+ LD b6, BO, 8 * SIZE -+ MOV c72, c11 -+ LD b7, BO, 12 * SIZE -+ MOV c82, c11 -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 1 -+#else -+ addi.d TEMP, KK, 4 -+#endif -+ srai.d L, TEMP, 2 -+ bge $r0, L, .L15 -+#else -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, B, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, K, 2 -+ MOV c32, c11 -+ LD b3, B, 2 * SIZE -+ MOV c42, c11 -+ LD b4, B, 3 * SIZE -+ MOV c52, c11 -+ LD b5, B, 4 * SIZE -+ MOV c62, c11 -+ LD b6, B, 8 * SIZE -+ MOV c72, c11 -+ LD b7, B, 12 * SIZE -+ MOV c82, c11 -+move BO, B -+ bge $r0, L, .L15 -+#endif -+ MADD1 c11, b1, a1, c11 -+ 
LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ bge $r0, L, .L13 -+ .align 3 -+.L12: -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c51, b5, a1, c51 -+ MADD3 c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD1 c71, b3, a1, c71 -+ MADD3 c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD1 c51, b7, a4, c51 -+ MADD3 c61, b2, a4, c61 -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD1 c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD3 c21, b2, a3, c21 -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD1 c51, b5, a3, c51 -+ MADD3 c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD1 c71, b3, a3, c71 -+ MADD3 c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, 
BO, 27 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ addi.d L, L, -1 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD1 c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD3 c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ blt $r0, L, .L12 -+ .align 3 -+ -+.L13: -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c51, b5, a1, c51 -+ MADD3 c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD1 c71, b3, a1, c71 -+ MADD3 c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD1 c51, b7, a4, c51 -+ MADD3 c61, b2, a4, c61 -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, 
BO, 18 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD1 c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD3 c21, b2, a3, c21 -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD1 c51, b5, a3, c51 -+ MADD3 c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD1 c71, b3, a3, c71 -+ MADD3 c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD1 c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD3 c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ .align 3 -+ -+.L15: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L18 -+ .align 3 -+.L16: -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c51, b5, a1, c51 -+ addi.d L, L, -1 -+ MADD3 c61, b2, a1, c61 -+ addi.d AO, AO, 2 * SIZE -+ MADD1 c71, b3, 
a1, c71 -+ addi.d BO, BO, 8 * SIZE -+ MADD3 c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 4 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L16 -+.L18: -+#ifndef TRMMKERNEL -+ LD b1, CO1, 0 * SIZE -+ ADD c11, c11, c22 -+ LD b2, CO1, 1 * SIZE -+ ADD c12, c12, c21 -+ LD b3, CO2, 0 * SIZE -+ ADD c31, c31, c42 -+ LD b4, CO2, 1 * SIZE -+ ADD c32, c32, c41 -+ LD b5, CO3, 0 * SIZE -+ ADD c51, c51, c62 -+ LD b6, CO3, 1 * SIZE -+ ADD c52, c52, c61 -+ LD b7, CO4, 0 * SIZE -+ ADD c71, c71, c82 -+ LD b8, CO4, 1 * SIZE -+ ADD c72, c72, c81 -+ MADD b1, c11, ALPHA_R, b1 -+ addi.d CO1,CO1, 2 * SIZE -+ MADD b2, c12, ALPHA_R, b2 -+ addi.d CO2,CO2, 2 * SIZE -+ MADD b3, c31, ALPHA_R, b3 -+ addi.d CO3,CO3, 2 * SIZE -+ MADD b4, c32, ALPHA_R, b4 -+ addi.d CO4,CO4, 2 * SIZE -+ MADD b5, c51, ALPHA_R, b5 -+ addi.d I, I, -1 -+ MADD b6, c52, ALPHA_R, b6 -+ MADD b7, c71, ALPHA_R, b7 -+ MADD b8, c72, ALPHA_R, b8 -+ NMSUB b1, c12, ALPHA_I, b1 -+ MADD b2, c11, ALPHA_I, b2 -+ MTC c11, $r0 -+ NMSUB b3, c32, ALPHA_I, b3 -+ MADD b4, c31, ALPHA_I, b4 -+ ST b1, CO1, -2 * SIZE -+ NMSUB b5, c52, ALPHA_I, b5 -+ ST b2, CO1, -1 * SIZE -+ MADD b6, c51, ALPHA_I, b6 -+ ST b3, CO2, -2 * SIZE -+ NMSUB b7, c72, ALPHA_I, b7 -+ ST b4, CO2, -1 * SIZE -+ MADD b8, c71, ALPHA_I, b8 -+ ST b5, CO3, -2 * SIZE -+ MOV c21, c11 -+ ST b6, CO3, -1 * SIZE -+ MOV c31, c11 -+ ST b7, CO4, -2 * SIZE -+ MOV c41, c11 -+ ST b8, CO4, -1 * SIZE -+ MOV c51, c11 -+#else -+ ADD c11, c11, c22 -+ addi.d CO1,CO1, 2 * SIZE -+ ADD c12, c12, c21 -+ addi.d CO2,CO2, 2 * SIZE -+ ADD c31, c31, c42 -+ addi.d CO3,CO3, 2 * SIZE -+ ADD c32, c32, c41 -+ addi.d CO4,CO4, 2 * SIZE -+ ADD c51, c51, c62 -+ addi.d I, I, -1 -+ ADD c52, c52, c61 -+ ADD c71, c71, c82 -+ ADD c72, c72, c81 -+ MUL b1, ALPHA_R, c11 -+ MUL b2, ALPHA_R, c12 -+ MUL b3, ALPHA_R, c31 -+ MUL b4, ALPHA_R, c32 -+ MUL b5, ALPHA_R, c51 -+ 
MUL b6, ALPHA_R, c52 -+ MUL b7, ALPHA_R, c71 -+ MUL b8, ALPHA_R, c72 -+ NMSUB b1, c12, ALPHA_I, b1 -+ MADD b2, c11, ALPHA_I, b2 -+ MTC c11, $r0 -+ NMSUB b3, c32, ALPHA_I, b3 -+ MADD b4, c31, ALPHA_I, b4 -+ ST b1, CO1, -2 * SIZE -+ NMSUB b5, c52, ALPHA_I, b5 -+ ST b2, CO1, -1 * SIZE -+ MADD b6, c51, ALPHA_I, b6 -+ ST b3, CO2, -2 * SIZE -+ NMSUB b7, c72, ALPHA_I, b7 -+ ST b4, CO2, -1 * SIZE -+ MADD b8, c71, ALPHA_I, b8 -+ ST b5, CO3, -2 * SIZE -+ MOV c21, c11 -+ ST b6, CO3, -1 * SIZE -+ MOV c31, c11 -+ ST b7, CO4, -2 * SIZE -+ MOV c41, c11 -+ ST b8, CO4, -1 * SIZE -+ MOV c51, c11 -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -1 -+#else -+ addi.d TEMP, TEMP, -4 -+#endif -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LEFT -+ addi.d KK, KK, 1 -+#endif -+#endif -+MOV c61, c11 -+ blt $r0, I, .L11 -+ .align 3 -+ -+.L19: -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addi.d KK, KK, 4 -+#endif -+move B, BO -+ blt $r0, J, .L10 -+ .align 3 -+ -+.L20: -+ andi J, N, 2 -+ MTC c11, $r0 -+move CO1, C -+ bge $r0, J, .L30 -+ add.d CO2, C, LDC -+ add.d C, CO2, LDC -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move KK, OFFSET -+#endif -+ move I, M -+move AO, A -+ bge $r0, I, .L29 -+ .align 3 -+ -+.L21: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d L, KK, ZBASE_SHIFT -+ slli.d TEMP, KK, 1 + ZBASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c31, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c41, c11 -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ MOV c12, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c22, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c32, c11 -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif 
defined(LEFT) -+ addi.d TEMP, KK, 1 -+#else -+ addi.d TEMP, KK, 2 -+#endif -+ srai.d L, TEMP, 2 -+MOV c42, c11 -+ bge $r0, L, .L25 -+#else -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, B, 0 * SIZE -+ MOV c31, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c41, c11 -+ LD b2, B, 1 * SIZE -+ srai.d L, K, 2 -+ LD b3, B, 2 * SIZE -+ MOV c12, c11 -+ LD b4, B, 3 * SIZE -+ MOV c22, c11 -+ LD b5, B, 4 * SIZE -+ MOV c32, c11 -+ MOV c42, c11 -+move BO, B -+ bge $r0, L, .L25 -+#endif -+ .align 3 -+.L22: -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c11, b5, a1, c11 -+ LD a2, AO, 3 * SIZE -+ MADD3 c21, b2, a1, c21 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ LD a1, AO, 8 * SIZE -+ MADD2 c12, b5, a2, c12 -+ LD b5, BO, 12 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 9 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 10 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 11 * SIZE -+ MADD1 c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD3 c21, b2, a3, c21 -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ LD a3, AO, 6 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD1 c11, b5, a3, c11 -+ LD a2, AO, 7 * SIZE -+ MADD3 c21, b2, a3, c21 -+ addi.d AO, AO, 8 * SIZE -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ LD a3, AO, 4 * SIZE -+ MADD2 c12, b5, a2, c12 -+ LD b5, BO, 20 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 17 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 18 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 19 * SIZE -+addi.d BO, BO, 16 * SIZE -+ blt $r0, L, .L22 -+ .align 3 -+ -+.L25: 
-+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L28 -+ .align 3 -+.L26: -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD1 c31, b3, a1, c31 -+ addi.d BO, BO, 4 * SIZE -+ MADD3 c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 0 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 1 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 2 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 3 * SIZE -+addi.d AO, AO, 2 * SIZE -+ blt $r0, L, .L26 -+.L28: -+#ifndef TRMMKERNEL -+ LD b1, CO1, 0 * SIZE -+ ADD c11, c11, c22 -+ LD b2, CO1, 1 * SIZE -+ ADD c12, c12, c21 -+ LD b3, CO2, 0 * SIZE -+ ADD c31, c31, c42 -+ LD b4, CO2, 1 * SIZE -+ ADD c32, c32, c41 -+ MADD b1, c11, ALPHA_R, b1 -+ addi.d CO1,CO1, 2 * SIZE -+ MADD b2, c12, ALPHA_R, b2 -+ addi.d CO2,CO2, 2 * SIZE -+ MADD b3, c31, ALPHA_R, b3 -+ addi.d I, I, -1 -+ MADD b4, c32, ALPHA_R, b4 -+ NMSUB b1, c12, ALPHA_I, b1 -+ MADD b2, c11, ALPHA_I, b2 -+ MTC c11, $r0 -+ NMSUB b3, c32, ALPHA_I, b3 -+ MADD b4, c31, ALPHA_I, b4 -+ ST b1, CO1, -2 * SIZE -+ ST b2, CO1, -1 * SIZE -+ ST b3, CO2, -2 * SIZE -+#else -+ ADD c11, c11, c22 -+ ADD c12, c12, c21 -+ ADD c31, c31, c42 -+ ADD c32, c32, c41 -+ MUL b1, ALPHA_R, c11 -+ addi.d CO1,CO1, 2 * SIZE -+ MUL b2, ALPHA_R, c12 -+ addi.d CO2,CO2, 2 * SIZE -+ MUL b3, ALPHA_R, c31 -+ addi.d I, I, -1 -+ MUL b4, ALPHA_R, c32 -+ NMSUB b1, c12, ALPHA_I, b1 -+ MADD b2, c11, ALPHA_I, b2 -+ MTC c11, $r0 -+ NMSUB b3, c32, ALPHA_I, b3 -+ MADD b4, c31, ALPHA_I, b4 -+ ST b1, CO1, -2 * SIZE -+ ST b2, CO1, -1 * SIZE -+ ST b3, CO2, -2 * SIZE -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -1 -+#else -+ addi.d TEMP, TEMP, -2 -+#endif -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LEFT -+ addi.d KK, KK, 1 -+#endif -+#endif -+ ST 
b4, CO2, -1 * SIZE -+ blt $r0, I, .L21 -+ .align 3 -+ -+.L29: -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addi.d KK, KK, 2 -+#endif -+ move B, BO -+ .align 3 -+ -+.L30: -+ andi J, N, 1 -+ MTC c11, $r0 -+move CO1, C -+ bge $r0, J, .L999 -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move KK, OFFSET -+#endif -+ move I, M -+ add.d C, CO1, LDC -+move AO, A -+ bge $r0, I, .L39 -+ .align 3 -+ -+.L31: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move BO, B -+#else -+ slli.d TEMP, KK, ZBASE_SHIFT -+ add.d AO, AO, TEMP -+ add.d BO, B, TEMP -+#endif -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c31, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c41, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c12, c11 -+ MOV c22, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c32, c11 -+ LD b3, BO, 4 * SIZE -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d TEMP, K, KK -+#elif defined(LEFT) -+ addi.d TEMP, KK, 1 -+#else -+ addi.d TEMP, KK, 1 -+#endif -+ srai.d L, TEMP, 2 -+MOV c42, c11 -+ bge $r0, L, .L35 -+#else -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, B, 0 * SIZE -+ MOV c31, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c41, c11 -+ LD b2, B, 1 * SIZE -+ MOV c12, c11 -+ srai.d L, K, 2 -+ MOV c22, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c32, c11 -+ LD b3, B, 4 * SIZE -+ MOV c42, c11 -+move BO, B -+ bge $r0, L, .L35 -+#endif -+ .align 3 -+.L32: -+ MADD1 c11, b1, a1, c11 -+ LD b4, BO, 3 * SIZE -+ MADD3 c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD a2, AO, 3 * SIZE -+ MADD1 c11, b1, a1, c11 -+ LD b2, BO, 5 * SIZE -+ MADD3 c21, b4, a1, c21 -+ LD a1, AO, 8 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD4 c22, b4, a2, c22 -+ LD a2, AO, 5 * SIZE -+ MADD1 c11, b3, a3, c11 -+ LD b4, BO, 7 * SIZE -+ MADD3 c21, b2, a3, c21 -+ LD a3, AO, 6 * SIZE -+ MADD2 c12, b3, a2, c12 -+ LD b3, BO, 6 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD a2, 
AO, 7 * SIZE -+ MADD1 c11, b3, a3, c11 -+ LD b2, BO, 9 * SIZE -+ MADD3 c21, b4, a3, c21 -+ LD a3, AO, 12 * SIZE -+ MADD2 c12, b3, a2, c12 -+ LD b3, BO, 12 * SIZE -+ MADD4 c22, b4, a2, c22 -+ LD a2, AO, 9 * SIZE -+ addi.d AO, AO, 8 * SIZE -+ addi.d L, L, -1 -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L32 -+ .align 3 -+ -+.L35: -+#ifndef TRMMKERNEL -+ andi L, K, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L38 -+ .align 3 -+.L36: -+ MADD1 c11, b1, a1, c11 -+ addi.d L, L, -1 -+ MADD3 c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD a2, AO, 3 * SIZE -+ LD b2, BO, 3 * SIZE -+ addi.d BO, BO, 2 * SIZE -+addi.d AO, AO, 2 * SIZE -+ blt $r0, L, .L36 -+.L38: -+#ifndef TRMMKERNEL -+ LD b1, CO1, 0 * SIZE -+ ADD c11, c11, c22 -+ LD b2, CO1, 1 * SIZE -+ ADD c12, c12, c21 -+ MADD b1, c11, ALPHA_R, b1 -+ addi.d CO1,CO1, 2 * SIZE -+ MADD b2, c12, ALPHA_R, b2 -+ addi.d I, I, -1 -+ NMSUB b1, c12, ALPHA_I, b1 -+ MADD b2, c11, ALPHA_I, b2 -+ MTC c11, $r0 -+ ST b1, CO1, -2 * SIZE -+ ST b2, CO1, -1 * SIZE -+ blt $r0, I, .L31 -+#else -+ ADD c11, c11, c22 -+ ADD c12, c12, c21 -+ MUL b1, ALPHA_R, c11 -+ addi.d CO1,CO1, 2 * SIZE -+ MUL b2, ALPHA_R, c12 -+ addi.d I, I, -1 -+ NMSUB b1, c12, ALPHA_I, b1 -+ MADD b2, c11, ALPHA_I, b2 -+ MTC c11, $r0 -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ sub.d TEMP, K, KK -+#ifdef LEFT -+ addi.d TEMP, TEMP, -1 -+#else -+ addi.d TEMP, TEMP, -1 -+#endif -+ slli.d TEMP, TEMP, ZBASE_SHIFT -+ add.d AO, AO, TEMP -+ add.d BO, BO, TEMP -+#endif -+#ifdef LEFT -+ addi.d KK, KK, 1 -+#endif -+ ST b1, CO1, -2 * SIZE -+ ST b2, CO1, -1 * SIZE -+ blt $r0, I, .L31 -+#endif -+ .align 3 -+ -+.L39: -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addi.d KK, KK, 1 -+#endif -+ move B, BO -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 64 -+ fld.d $f24, $sp, 16 -+ fld.d $f25, $sp, 24 -+ fld.d $f26, $sp, 32 -+ fld.d $f27, 
$sp, 40 -+ fld.d $f28, $sp, 48 -+ fld.d $f29, $sp, 56 -+#if defined(TRMMKERNEL) -+ LDARG $r26, $sp, 72 -+ LDARG $r27, $sp, 80 -+#endif -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 88 -+ fld.d $f19, $sp, 96 -+ fld.d $f20, $sp, 104 -+ fld.d $f21, $sp, 112 -+#endif -+ addi.d $sp, $sp, 128 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ fmov.d $f1, $f23 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S -new file mode 100644 -index 0000000..d995ce8 ---- /dev/null -+++ b/kernel/loongarch64/zgemv_n.S -@@ -0,0 +1,648 @@ -+/*************************************************************************** -+Copyright (c) 2020, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define A $r7 -+#define LDA $r8 -+#define X $r9 -+#define INCX $r10 -+#define Y $r11 -+#define INCY $r6 -+#define BUFFER $r17 -+ -+#define YORIG $r18 -+#define XX $r12 -+#define YY $r13 -+#define I $r14 -+#define J $r15 -+#define AO1 $r23 -+#define AO2 $r24 -+ -+#define ALPHA_R $f0 -+#define ALPHA_I $f1 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f23 -+#define a4 $f9 -+#define a5 $f10 -+#define a6 $f11 -+#define a7 $f12 -+#define a8 $f13 -+#define x1 $f14 -+#define x2 $f15 -+#define x3 $f16 -+#define x4 $f17 -+#define y1 $f3 -+#define y2 $f4 -+#define y3 $f2 -+#define y4 $f5 -+#define t1 $f6 -+#define t2 $f7 -+#define t3 $f18 -+#define t4 $f19 -+#define t5 $f20 -+#define t6 $f21 -+#define t7 $f24 -+#define t8 $f25 -+ -+#if !defined(CONJ) && !defined(XCONJ) -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 NMSUB -+#define MADD4 MADD -+#endif -+#if defined(CONJ) && !defined(XCONJ) -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 MADD -+#define MADD4 NMSUB -+#endif -+#if !defined(CONJ) && defined(XCONJ) -+#define MADD1 MADD -+#define MADD2 NMSUB -+#define MADD3 MADD -+#define MADD4 MADD -+#endif -+#if defined(CONJ) && defined(XCONJ) -+#define MADD1 MADD -+#define MADD2 NMSUB -+#define MADD3 NMSUB -+#define MADD4 NMSUB -+#endif -+ -+ PROLOGUE -+ -+ LDARG INCY, $sp, 0 
-+ LDARG BUFFER, $sp, 8 -+#ifndef __64BIT__ -+ addi.d $sp, $sp, -64 -+#else -+ addi.d $sp, $sp, -32 -+#endif -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ fst.d $f24, $sp, 16 -+ fst.d $f25, $sp, 24 -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 32 -+ fst.d $f19, $sp, 40 -+ fst.d $f20, $sp, 48 -+ fst.d $f21, $sp, 56 -+#endif -+ slli.d LDA, LDA, ZBASE_SHIFT -+ slli.d INCX, INCX, ZBASE_SHIFT -+ bge $r0, M, .L999 -+ slli.d INCY, INCY, ZBASE_SHIFT -+ bge $r0, N, .L999 -+ li.d I, 2 * SIZE -+ move YORIG, Y -+ beq INCY, I, .L10 -+ srai.d I, M, 2 -+ move YORIG, BUFFER -+ move XX, Y -+ move YY, BUFFER -+ bge $r0, I, .L05 -+ .align 3 -+ -+.L02: -+ LD a1, XX, 0 * SIZE -+ LD a2, XX, 1 * SIZE -+ add.d XX, XX, INCY -+ LD a3, XX, 0 * SIZE -+ LD a4, XX, 1 * SIZE -+ add.d XX, XX, INCY -+ LD a5, XX, 0 * SIZE -+ LD a6, XX, 1 * SIZE -+ add.d XX, XX, INCY -+ LD a7, XX, 0 * SIZE -+ LD a8, XX, 1 * SIZE -+ add.d XX, XX, INCY -+ addi.d I, I, -1 -+ addi.d YY, YY, 8 * SIZE -+ ST a1, YY, -8 * SIZE -+ ST a2, YY, -7 * SIZE -+ ST a3, YY, -6 * SIZE -+ ST a4, YY, -5 * SIZE -+ ST a5, YY, -4 * SIZE -+ ST a6, YY, -3 * SIZE -+ ST a7, YY, -2 * SIZE -+ ST a8, YY, -1 * SIZE -+ blt $r0, I, .L02 -+ .align 3 -+ -+.L05: -+ andi I, M, 3 -+ bge $r0, I, .L10 -+ .align 3 -+ -+.L06: -+ LD a1, XX, 0 * SIZE -+ LD a2, XX, 1 * SIZE -+ add.d XX, XX, INCY -+ addi.d I, I, -1 -+ ST a1, YY, 0 * SIZE -+ ST a2, YY, 1 * SIZE -+ addi.d YY, YY, 2 * SIZE -+ blt $r0, I, .L06 -+ .align 3 -+ -+.L10: -+ srai.d J, N, 1 -+ bge $r0, J, .L20 -+ .align 3 -+ -+.L11: -+ LD x1, X, 0 * SIZE -+ LD x2, X, 1 * SIZE -+ add.d X, X, INCX -+ LD x3, X, 0 * SIZE -+ LD x4, X, 1 * SIZE -+ add.d X, X, INCX -+ MUL a1, ALPHA_R, x1 -+ move AO1, A -+ MUL a2, ALPHA_I, x1 -+ add.d AO2, A, LDA -+ MUL a3, ALPHA_R, x3 -+ add.d A, AO2, LDA -+ MUL a4, ALPHA_I, x3 -+#ifndef XCONJ -+ NMSUB x1, x2, ALPHA_I, a1 -+ MADD x2, x2, ALPHA_R, a2 -+ NMSUB x3, x4, ALPHA_I, a3 -+ MADD x4, x4, ALPHA_R, a4 -+#else -+ MADD x1, x2, ALPHA_I, a1 -+ MSUB x2, x2, ALPHA_R, a2 -+ MADD x3, x4, 
ALPHA_I, a3 -+ MSUB x4, x4, ALPHA_R, a4 -+#endif -+ srai.d I, M, 2 -+ move YY, YORIG -+ bge $r0, I, .L15 -+ LD y1, YY, 0 * SIZE -+ LD a1, AO1, 0 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a3, AO1, 2 * SIZE -+ LD y3, YY, 2 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y4, YY, 3 * SIZE -+ LD a4, AO1, 3 * SIZE -+ LD a5, AO2, 0 * SIZE -+ LD a6, AO2, 1 * SIZE -+ LD a7, AO2, 2 * SIZE -+ LD a8, AO2, 3 * SIZE -+ MADD1 t1, a1, x1, y1 -+ LD y1, YY, 4 * SIZE -+ MADD2 t2, a1, x2, y2 -+ LD a1, AO1, 4 * SIZE -+ MADD1 t3, a3, x1, y3 -+ LD y2, YY, 5 * SIZE -+ MADD2 t4, a3, x2, y4 -+ LD a3, AO1, 6 * SIZE -+ MADD3 t1, a2, x2, t1 -+ LD y3, YY, 6 * SIZE -+ MADD4 t2, a2, x1, t2 -+ LD a2, AO1, 5 * SIZE -+ MADD3 t3, a4, x2, t3 -+ LD y4, YY, 7 * SIZE -+ MADD4 t4, a4, x1, t4 -+ LD a4, AO1, 7 * SIZE -+ MADD1 t1, a5, x3, t1 -+ MADD2 t2, a5, x4, t2 -+ LD a5, AO2, 4 * SIZE -+ MADD1 t3, a7, x3, t3 -+ MADD2 t4, a7, x4, t4 -+ LD a7, AO2, 6 * SIZE -+ MADD3 t1, a6, x4, t1 -+ MADD4 t2, a6, x3, t2 -+ LD a6, AO2, 5 * SIZE -+ MADD3 t3, a8, x4, t3 -+ addi.d I, I, -1 -+ MADD4 t4, a8, x3, t4 -+ LD a8, AO2, 7 * SIZE -+ bge $r0, I, .L13 -+ .align 3 -+.L12: -+ MADD1 t5, a1, x1, y1 -+ LD y1, YY, 8 * SIZE -+ MADD2 t6, a1, x2, y2 -+ LD a1, AO1, 8 * SIZE -+ MADD1 t7, a3, x1, y3 -+ LD y2, YY, 9 * SIZE -+ MADD2 t8, a3, x2, y4 -+ LD a3, AO1, 10 * SIZE -+ MADD3 t5, a2, x2, t5 -+ LD y3, YY, 10 * SIZE -+ MADD4 t6, a2, x1, t6 -+ LD a2, AO1, 9 * SIZE -+ MADD3 t7, a4, x2, t7 -+ LD y4, YY, 11 * SIZE -+ MADD4 t8, a4, x1, t8 -+ LD a4, AO1, 11 * SIZE -+ MADD1 t5, a5, x3, t5 -+ ST t1, YY, 0 * SIZE -+ MADD2 t6, a5, x4, t6 -+ LD a5, AO2, 8 * SIZE -+ MADD1 t7, a7, x3, t7 -+ ST t2, YY, 1 * SIZE -+ MADD2 t8, a7, x4, t8 -+ LD a7, AO2, 10 * SIZE -+ MADD3 t5, a6, x4, t5 -+ ST t3, YY, 2 * SIZE -+ MADD4 t6, a6, x3, t6 -+ LD a6, AO2, 9 * SIZE -+ MADD3 t7, a8, x4, t7 -+ ST t4, YY, 3 * SIZE -+ MADD4 t8, a8, x3, t8 -+ LD a8, AO2, 11 * SIZE -+ MADD1 t1, a1, x1, y1 -+ LD y1, YY, 12 * SIZE -+ MADD2 t2, a1, x2, y2 -+ LD a1, AO1, 12 * SIZE -+ MADD1 t3, a3, x1, 
y3 -+ LD y2, YY, 13 * SIZE -+ MADD2 t4, a3, x2, y4 -+ LD a3, AO1, 14 * SIZE -+ MADD3 t1, a2, x2, t1 -+ LD y3, YY, 14 * SIZE -+ MADD4 t2, a2, x1, t2 -+ LD a2, AO1, 13 * SIZE -+ MADD3 t3, a4, x2, t3 -+ LD y4, YY, 15 * SIZE -+ MADD4 t4, a4, x1, t4 -+ LD a4, AO1, 15 * SIZE -+ MADD1 t1, a5, x3, t1 -+ ST t5, YY, 4 * SIZE -+ MADD2 t2, a5, x4, t2 -+ LD a5, AO2, 12 * SIZE -+ MADD1 t3, a7, x3, t3 -+ ST t6, YY, 5 * SIZE -+ MADD2 t4, a7, x4, t4 -+ LD a7, AO2, 14 * SIZE -+ MADD3 t1, a6, x4, t1 -+ ST t7, YY, 6 * SIZE -+ MADD4 t2, a6, x3, t2 -+ LD a6, AO2, 13 * SIZE -+ MADD3 t3, a8, x4, t3 -+ ST t8, YY, 7 * SIZE -+ MADD4 t4, a8, x3, t4 -+ LD a8, AO2, 15 * SIZE -+ addi.d I, I, -1 -+ addi.d YY, YY, 8 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ addi.d AO2, AO2, 8 * SIZE -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ ST t1, YY, 0 * SIZE -+ MADD1 t1, a1, x1, y1 -+ ST t2, YY, 1 * SIZE -+ MADD2 t2, a1, x2, y2 -+ ST t3, YY, 2 * SIZE -+ MADD1 t3, a3, x1, y3 -+ ST t4, YY, 3 * SIZE -+ MADD2 t4, a3, x2, y4 -+ MADD3 t1, a2, x2, t1 -+ MADD4 t2, a2, x1, t2 -+ MADD3 t3, a4, x2, t3 -+ MADD4 t4, a4, x1, t4 -+ MADD1 t1, a5, x3, t1 -+ MADD2 t2, a5, x4, t2 -+ MADD1 t3, a7, x3, t3 -+ MADD2 t4, a7, x4, t4 -+ MADD3 t1, a6, x4, t1 -+ addi.d AO1, AO1, 8 * SIZE -+ MADD4 t2, a6, x3, t2 -+ addi.d AO2, AO2, 8 * SIZE -+ MADD3 t3, a8, x4, t3 -+ addi.d YY, YY, 8 * SIZE -+ MADD4 t4, a8, x3, t4 -+ ST t1, YY, -4 * SIZE -+ ST t2, YY, -3 * SIZE -+ ST t3, YY, -2 * SIZE -+ ST t4, YY, -1 * SIZE -+ .align 3 -+ -+.L15: -+ andi I, M, 2 -+ bge $r0, I, .L16 -+ LD a1, AO1, 0 * SIZE -+ LD y1, YY, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a3, AO1, 2 * SIZE -+ LD y3, YY, 2 * SIZE -+ LD a4, AO1, 3 * SIZE -+ LD y4, YY, 3 * SIZE -+ MADD1 t1, a1, x1, y1 -+ LD a5, AO2, 0 * SIZE -+ MADD2 t2, a1, x2, y2 -+ LD a6, AO2, 1 * SIZE -+ MADD1 t3, a3, x1, y3 -+ LD a7, AO2, 2 * SIZE -+ MADD2 t4, a3, x2, y4 -+ LD a8, AO2, 3 * SIZE -+ MADD3 t1, a2, x2, t1 -+ MADD4 t2, a2, x1, t2 -+ MADD3 t3, a4, x2, t3 -+ MADD4 t4, a4, x1, t4 -+ 
MADD1 t1, a5, x3, t1 -+ MADD2 t2, a5, x4, t2 -+ MADD1 t3, a7, x3, t3 -+ MADD2 t4, a7, x4, t4 -+ MADD3 t1, a6, x4, t1 -+ addi.d YY, YY, 4 * SIZE -+ MADD4 t2, a6, x3, t2 -+ addi.d AO1, AO1, 4 * SIZE -+ MADD3 t3, a8, x4, t3 -+ addi.d AO2, AO2, 4 * SIZE -+ MADD4 t4, a8, x3, t4 -+ ST t1, YY, -4 * SIZE -+ ST t2, YY, -3 * SIZE -+ ST t3, YY, -2 * SIZE -+ ST t4, YY, -1 * SIZE -+ .align 3 -+ -+.L16: -+ andi I, M, 1 -+ bge $r0, I, .L19 -+ LD y1, YY, 0 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a1, AO1, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ MADD1 t1, a1, x1, y1 -+ LD a5, AO2, 0 * SIZE -+ MADD2 t2, a1, x2, y2 -+ LD a6, AO2, 1 * SIZE -+ MADD3 t1, a2, x2, t1 -+ MADD4 t2, a2, x1, t2 -+ MADD1 t1, a5, x3, t1 -+ MADD2 t2, a5, x4, t2 -+ MADD3 t1, a6, x4, t1 -+ MADD4 t2, a6, x3, t2 -+ ST t1, YY, 0 * SIZE -+ ST t2, YY, 1 * SIZE -+ .align 3 -+ -+.L19: -+ addi.d J, J, -1 -+ blt $r0, J, .L11 -+ .align 3 -+ -+.L20: -+ andi J, N, 1 -+ bge $r0, J, .L900 -+ LD x1, X, 0 * SIZE -+ LD x2, X, 1 * SIZE -+ add.d X, X, INCX -+ MUL a1, ALPHA_R, x1 -+ move AO1, A -+ MUL a2, ALPHA_I, x1 -+#ifndef XCONJ -+ NMSUB x1, x2, ALPHA_I, a1 -+ MADD x2, x2, ALPHA_R, a2 -+#else -+ MADD x1, x2, ALPHA_I, a1 -+ MSUB x2, x2, ALPHA_R, a2 -+#endif -+ srai.d I, M, 2 -+ move YY, YORIG -+ bge $r0, I, .L25 -+ LD y1, YY, 0 * SIZE -+ LD a1, AO1, 0 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a3, AO1, 2 * SIZE -+ LD y3, YY, 2 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y4, YY, 3 * SIZE -+ LD a4, AO1, 3 * SIZE -+ MADD1 t1, a1, x1, y1 -+ LD y1, YY, 4 * SIZE -+ MADD2 t2, a1, x2, y2 -+ LD a1, AO1, 4 * SIZE -+ MADD1 t3, a3, x1, y3 -+ LD y2, YY, 5 * SIZE -+ MADD2 t4, a3, x2, y4 -+ LD a3, AO1, 6 * SIZE -+ MADD3 t1, a2, x2, t1 -+ LD y3, YY, 6 * SIZE -+ MADD4 t2, a2, x1, t2 -+ LD a2, AO1, 5 * SIZE -+ MADD3 t3, a4, x2, t3 -+ LD y4, YY, 7 * SIZE -+ MADD4 t4, a4, x1, t4 -+ addi.d I, I, -1 -+ LD a4, AO1, 7 * SIZE -+ bge $r0, I, .L23 -+ .align 3 -+.L22: -+ MADD1 t5, a1, x1, y1 -+ LD y1, YY, 8 * SIZE -+ MADD2 t6, a1, x2, y2 -+ LD a1, AO1, 8 * SIZE -+ MADD1 t7, a3, 
x1, y3 -+ LD y2, YY, 9 * SIZE -+ MADD2 t8, a3, x2, y4 -+ LD a3, AO1, 10 * SIZE -+ MADD3 t5, a2, x2, t5 -+ LD y3, YY, 10 * SIZE -+ MADD4 t6, a2, x1, t6 -+ LD a2, AO1, 9 * SIZE -+ MADD3 t7, a4, x2, t7 -+ LD y4, YY, 11 * SIZE -+ MADD4 t8, a4, x1, t8 -+ LD a4, AO1, 11 * SIZE -+ ST t1, YY, 0 * SIZE -+ ST t2, YY, 1 * SIZE -+ ST t3, YY, 2 * SIZE -+ ST t4, YY, 3 * SIZE -+ MADD1 t1, a1, x1, y1 -+ LD y1, YY, 12 * SIZE -+ MADD2 t2, a1, x2, y2 -+ LD a1, AO1, 12 * SIZE -+ MADD1 t3, a3, x1, y3 -+ LD y2, YY, 13 * SIZE -+ MADD2 t4, a3, x2, y4 -+ LD a3, AO1, 14 * SIZE -+ MADD3 t1, a2, x2, t1 -+ LD y3, YY, 14 * SIZE -+ MADD4 t2, a2, x1, t2 -+ LD a2, AO1, 13 * SIZE -+ MADD3 t3, a4, x2, t3 -+ LD y4, YY, 15 * SIZE -+ MADD4 t4, a4, x1, t4 -+ LD a4, AO1, 15 * SIZE -+ ST t5, YY, 4 * SIZE -+ ST t6, YY, 5 * SIZE -+ ST t7, YY, 6 * SIZE -+ ST t8, YY, 7 * SIZE -+ addi.d I, I, -1 -+ addi.d YY, YY, 8 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ blt $r0, I, .L22 -+ .align 3 -+ -+.L23: -+ ST t1, YY, 0 * SIZE -+ MADD1 t1, a1, x1, y1 -+ ST t2, YY, 1 * SIZE -+ MADD2 t2, a1, x2, y2 -+ ST t3, YY, 2 * SIZE -+ MADD1 t3, a3, x1, y3 -+ ST t4, YY, 3 * SIZE -+ MADD2 t4, a3, x2, y4 -+ MADD3 t1, a2, x2, t1 -+ addi.d AO1, AO1, 8 * SIZE -+ MADD4 t2, a2, x1, t2 -+ addi.d YY, YY, 8 * SIZE -+ MADD3 t3, a4, x2, t3 -+ MADD4 t4, a4, x1, t4 -+ ST t1, YY, -4 * SIZE -+ ST t2, YY, -3 * SIZE -+ ST t3, YY, -2 * SIZE -+ ST t4, YY, -1 * SIZE -+ .align 3 -+ -+.L25: -+ andi I, M, 2 -+ bge $r0, I, .L26 -+ LD a1, AO1, 0 * SIZE -+ LD y1, YY, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a3, AO1, 2 * SIZE -+ LD y3, YY, 2 * SIZE -+ LD a4, AO1, 3 * SIZE -+ LD y4, YY, 3 * SIZE -+ MADD1 t1, a1, x1, y1 -+ MADD2 t2, a1, x2, y2 -+ MADD1 t3, a3, x1, y3 -+ MADD2 t4, a3, x2, y4 -+ MADD3 t1, a2, x2, t1 -+ addi.d YY, YY, 4 * SIZE -+ MADD4 t2, a2, x1, t2 -+ addi.d AO1, AO1, 4 * SIZE -+ MADD3 t3, a4, x2, t3 -+ MADD4 t4, a4, x1, t4 -+ ST t1, YY, -4 * SIZE -+ ST t2, YY, -3 * SIZE -+ ST t3, YY, -2 * SIZE -+ ST t4, YY, -1 * SIZE -+ .align 
3 -+ -+.L26: -+ andi I, M, 1 -+ bge $r0, I, .L900 -+ LD y1, YY, 0 * SIZE -+ LD y2, YY, 1 * SIZE -+ LD a1, AO1, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ MADD1 t1, a1, x1, y1 -+ MADD2 t2, a1, x2, y2 -+ MADD3 t1, a2, x2, t1 -+ MADD4 t2, a2, x1, t2 -+ ST t1, YY, 0 * SIZE -+ ST t2, YY, 1 * SIZE -+ .align 3 -+ -+.L900: -+ li.d YORIG, 2 * SIZE -+ srai.d I, M, 2 -+ beq INCY, YORIG, .L999 -+ move XX, BUFFER -+ bge $r0, I, .L905 -+ .align 3 -+ -+.L902: -+ LD a1, XX, 0 * SIZE -+ LD a2, XX, 1 * SIZE -+ LD a3, XX, 2 * SIZE -+ LD a4, XX, 3 * SIZE -+ LD a5, XX, 4 * SIZE -+ LD a6, XX, 5 * SIZE -+ LD a7, XX, 6 * SIZE -+ LD a8, XX, 7 * SIZE -+ addi.d I, I, -1 -+ ST a1, Y, 0 * SIZE -+ ST a2, Y, 1 * SIZE -+ add.d Y, Y, INCY -+ ST a3, Y, 0 * SIZE -+ ST a4, Y, 1 * SIZE -+ add.d Y, Y, INCY -+ ST a5, Y, 0 * SIZE -+ ST a6, Y, 1 * SIZE -+ add.d Y, Y, INCY -+ ST a7, Y, 0 * SIZE -+ ST a8, Y, 1 * SIZE -+ add.d Y, Y, INCY -+ addi.d XX, XX, 8 * SIZE -+ blt $r0, I, .L902 -+ .align 3 -+ -+.L905: -+ andi I, M, 3 -+ bge $r0, I, .L999 -+ .align 3 -+ -+.L906: -+ LD a1, XX, 0 * SIZE -+ LD a2, XX, 1 * SIZE -+ addi.d XX, XX, 2 * SIZE -+ addi.d I, I, -1 -+ ST a1, Y, 0 * SIZE -+ ST a2, Y, 1 * SIZE -+ add.d Y, Y, INCY -+ blt $r0, I, .L906 -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ fld.d $f24, $sp, 16 -+ fld.d $f25, $sp, 24 -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 32 -+ fld.d $f19, $sp, 40 -+ fld.d $f20, $sp, 48 -+ fld.d $f21, $sp, 56 -+#endif -+#ifdef __64BIT__ -+ addi.d $sp, $sp, 32 -+#else -+ addi.d $sp, $sp, 64 -+#endif -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S -new file mode 100644 -index 0000000..841823e ---- /dev/null -+++ b/kernel/loongarch64/zgemv_t.S -@@ -0,0 +1,556 @@ -+/*************************************************************************** -+Copyright (c) 2020, The OpenBLAS Project -+All rights reserved. 
-+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define A $r7 -+#define LDA $r8 -+#define X $r9 -+#define INCX $r10 -+#define Y $r11 -+#define INCY $r6 -+#define BUFFER $r17 -+ -+#define XORIG $r18 -+#define XX $r12 -+#define YY $r13 -+#define I $r14 -+#define J $r15 -+#define AO1 $r23 -+#define AO2 $r24 -+ -+#define ALPHA_R $f0 -+#define ALPHA_I $f1 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f23 -+#define a4 $f9 -+#define a5 $f10 -+#define a6 $f11 -+#define a7 $f12 -+#define a8 $f13 -+#define y1 $f14 -+#define y2 $f15 -+#define y3 $f16 -+#define y4 $f17 -+#define x1 $f3 -+#define x2 $f4 -+#define x3 $f2 -+#define x4 $f5 -+#define x5 $f6 -+#define x6 $f7 -+#define x7 $f18 -+#define x8 $f19 -+ -+#if !defined(CONJ) && !defined(XCONJ) -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 NMSUB -+#define MADD4 MADD -+#endif -+#if defined(CONJ) && !defined(XCONJ) -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 MADD -+#define MADD4 NMSUB -+#endif -+#if !defined(CONJ) && defined(XCONJ) -+#define MADD1 MADD -+#define MADD2 NMSUB -+#define MADD3 MADD -+#define MADD4 MADD -+#endif -+#if defined(CONJ) && defined(XCONJ) -+#define MADD1 MADD -+#define MADD2 NMSUB -+#define MADD3 NMSUB -+#define MADD4 NMSUB -+#endif -+ -+ PROLOGUE -+ -+ LDARG INCY, $sp, 0 -+ LDARG BUFFER, $sp, 8 -+#ifdef __64BIT__ -+ addi.d $sp, $sp, -16 -+#else -+ addi.d $sp, $sp, -32 -+#endif -+ MTC y1, $r0 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ slli.d LDA, LDA, ZBASE_SHIFT -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 16 -+ fst.d $f19, $sp, 24 -+#endif -+ slli.d INCX, INCX, ZBASE_SHIFT -+ bge $r0, M, .L999 -+ slli.d INCY, INCY, ZBASE_SHIFT -+ bge $r0, N, .L999 -+ li.d I, 2 * SIZE -+ move XORIG, X -+ beq INCX, I, .L10 -+ srai.d I, M, 2 -+ move XORIG, BUFFER -+ move YY, BUFFER -+ bge $r0, I, .L05 -+ .align 3 -+ -+.L02: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, 
X, INCX -+ LD a3, X, 0 * SIZE -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a7, X, 0 * SIZE -+ LD a8, X, 1 * SIZE -+ add.d X, X, INCX -+ addi.d I, I, -1 -+ addi.d YY, YY, 8 * SIZE -+ ST a1, YY, -8 * SIZE -+ ST a2, YY, -7 * SIZE -+ ST a3, YY, -6 * SIZE -+ ST a4, YY, -5 * SIZE -+ ST a5, YY, -4 * SIZE -+ ST a6, YY, -3 * SIZE -+ ST a7, YY, -2 * SIZE -+ ST a8, YY, -1 * SIZE -+ blt $r0, I, .L02 -+ .align 3 -+ -+.L05: -+ andi I, M, 3 -+ bge $r0, I, .L10 -+ .align 3 -+ -+.L06: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ ST a1, YY, 0 * SIZE -+ ST a2, YY, 1 * SIZE -+ addi.d I, I, -1 -+ addi.d YY, YY, 2 * SIZE -+ blt $r0, I, .L06 -+ .align 3 -+ -+.L10: -+ srai.d J, N, 1 -+ move YY, Y -+ bge $r0, J, .L20 -+ .align 3 -+ -+.L11: -+ move AO1, A -+ MOV y2, y1 -+ add.d AO2, A, LDA -+ MOV y3, y1 -+ add.d A, AO2, LDA -+ MOV y4, y1 -+ srai.d I, M, 2 -+ move XX, XORIG -+ bge $r0, I, .L15 -+ LD x1, XX, 0 * SIZE -+ LD x2, XX, 1 * SIZE -+ LD x4, XX, 3 * SIZE -+ LD a1, AO1, 0 * SIZE -+ LD a3, AO2, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD a4, AO2, 1 * SIZE -+ LD a5, AO1, 2 * SIZE -+ LD a7, AO2, 2 * SIZE -+ LD a6, AO1, 3 * SIZE -+ LD a8, AO2, 3 * SIZE -+ addi.d I, I, -1 -+ bge $r0, I, .L13 -+ .align 3 -+.L12: -+ MADD1 y1, a1, x1, y1 -+ LD x3, XX, 2 * SIZE -+ MADD2 y2, a1, x2, y2 -+ LD a1, AO1, 4 * SIZE -+ MADD1 y3, a3, x1, y3 -+ MADD2 y4, a3, x2, y4 -+ LD a3, AO2, 4 * SIZE -+ MADD3 y1, a2, x2, y1 -+ MADD4 y2, a2, x1, y2 -+ LD a2, AO1, 5 * SIZE -+ MADD3 y3, a4, x2, y3 -+ LD x2, XX, 5 * SIZE -+ MADD4 y4, a4, x1, y4 -+ LD a4, AO2, 5 * SIZE -+ MADD1 y1, a5, x3, y1 -+ LD x1, XX, 4 * SIZE -+ MADD2 y2, a5, x4, y2 -+ LD a5, AO1, 6 * SIZE -+ MADD1 y3, a7, x3, y3 -+ MADD2 y4, a7, x4, y4 -+ LD a7, AO2, 6 * SIZE -+ MADD3 y1, a6, x4, y1 -+ addi.d I, I, -1 -+ MADD4 y2, a6, x3, y2 -+ LD a6, AO1, 7 * SIZE -+ MADD3 y3, a8, x4, y3 -+ LD x4, XX, 7 * SIZE -+ MADD4 y4, a8, x3, y4 -+ LD a8, AO2, 7 * SIZE -+ MADD1 y1, a1, x1, y1 
-+ LD x3, XX, 6 * SIZE -+ MADD2 y2, a1, x2, y2 -+ LD a1, AO1, 8 * SIZE -+ MADD1 y3, a3, x1, y3 -+ MADD2 y4, a3, x2, y4 -+ LD a3, AO2, 8 * SIZE -+ MADD3 y1, a2, x2, y1 -+ MADD4 y2, a2, x1, y2 -+ LD a2, AO1, 9 * SIZE -+ MADD3 y3, a4, x2, y3 -+ LD x2, XX, 9 * SIZE -+ MADD4 y4, a4, x1, y4 -+ LD a4, AO2, 9 * SIZE -+ MADD1 y1, a5, x3, y1 -+ LD x1, XX, 8 * SIZE -+ MADD2 y2, a5, x4, y2 -+ LD a5, AO1, 10 * SIZE -+ MADD1 y3, a7, x3, y3 -+ addi.d XX, XX, 8 * SIZE -+ MADD2 y4, a7, x4, y4 -+ LD a7, AO2, 10 * SIZE -+ MADD3 y1, a6, x4, y1 -+ addi.d AO2, AO2, 8 * SIZE -+ MADD4 y2, a6, x3, y2 -+ LD a6, AO1, 11 * SIZE -+ MADD3 y3, a8, x4, y3 -+ LD x4, XX, 3 * SIZE -+ MADD4 y4, a8, x3, y4 -+ LD a8, AO2, 3 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ MADD1 y1, a1, x1, y1 -+ LD x3, XX, 2 * SIZE -+ MADD2 y2, a1, x2, y2 -+ LD a1, AO1, 4 * SIZE -+ MADD1 y3, a3, x1, y3 -+ MADD2 y4, a3, x2, y4 -+ LD a3, AO2, 4 * SIZE -+ MADD3 y1, a2, x2, y1 -+ MADD4 y2, a2, x1, y2 -+ LD a2, AO1, 5 * SIZE -+ MADD3 y3, a4, x2, y3 -+ LD x2, XX, 5 * SIZE -+ MADD4 y4, a4, x1, y4 -+ LD a4, AO2, 5 * SIZE -+ MADD1 y1, a5, x3, y1 -+ LD x1, XX, 4 * SIZE -+ MADD2 y2, a5, x4, y2 -+ LD a5, AO1, 6 * SIZE -+ MADD1 y3, a7, x3, y3 -+ MADD2 y4, a7, x4, y4 -+ LD a7, AO2, 6 * SIZE -+ MADD3 y1, a6, x4, y1 -+ MADD4 y2, a6, x3, y2 -+ LD a6, AO1, 7 * SIZE -+ MADD3 y3, a8, x4, y3 -+ LD x4, XX, 7 * SIZE -+ MADD4 y4, a8, x3, y4 -+ LD a8, AO2, 7 * SIZE -+ MADD1 y1, a1, x1, y1 -+ LD x3, XX, 6 * SIZE -+ MADD2 y2, a1, x2, y2 -+ MADD1 y3, a3, x1, y3 -+ MADD2 y4, a3, x2, y4 -+ MADD3 y1, a2, x2, y1 -+ MADD4 y2, a2, x1, y2 -+ MADD3 y3, a4, x2, y3 -+ MADD4 y4, a4, x1, y4 -+ MADD1 y1, a5, x3, y1 -+ MADD2 y2, a5, x4, y2 -+ MADD1 y3, a7, x3, y3 -+ MADD2 y4, a7, x4, y4 -+ MADD3 y1, a6, x4, y1 -+ addi.d XX, XX, 8 * SIZE -+ MADD4 y2, a6, x3, y2 -+ addi.d AO1, AO1, 8 * SIZE -+ MADD3 y3, a8, x4, y3 -+ addi.d AO2, AO2, 8 * SIZE -+ MADD4 y4, a8, x3, y4 -+ .align 3 -+ -+.L15: -+ andi I, M, 2 -+ bge $r0, I, .L17 -+ LD 
x1, XX, 0 * SIZE -+ LD x2, XX, 1 * SIZE -+ LD x3, XX, 2 * SIZE -+ LD x4, XX, 3 * SIZE -+ LD a1, AO1, 0 * SIZE -+ LD a3, AO2, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD a4, AO2, 1 * SIZE -+ LD a5, AO1, 2 * SIZE -+ LD a7, AO2, 2 * SIZE -+ LD a6, AO1, 3 * SIZE -+ LD a8, AO2, 3 * SIZE -+ MADD1 y1, a1, x1, y1 -+ MADD2 y2, a1, x2, y2 -+ MADD1 y3, a3, x1, y3 -+ MADD2 y4, a3, x2, y4 -+ MADD3 y1, a2, x2, y1 -+ MADD4 y2, a2, x1, y2 -+ MADD3 y3, a4, x2, y3 -+ MADD4 y4, a4, x1, y4 -+ MADD1 y1, a5, x3, y1 -+ MADD2 y2, a5, x4, y2 -+ MADD1 y3, a7, x3, y3 -+ MADD2 y4, a7, x4, y4 -+ MADD3 y1, a6, x4, y1 -+ addi.d XX, XX, 4 * SIZE -+ MADD4 y2, a6, x3, y2 -+ addi.d AO1, AO1, 4 * SIZE -+ MADD3 y3, a8, x4, y3 -+ addi.d AO2, AO2, 4 * SIZE -+ MADD4 y4, a8, x3, y4 -+ .align 3 -+ -+.L17: -+ andi I, M, 1 -+.align 3 -+ -+ bge $r0, I, .L19 -+.L18: -+ LD x1, XX, 0 * SIZE -+ LD x2, XX, 1 * SIZE -+ LD a1, AO1, 0 * SIZE -+ LD a3, AO2, 0 * SIZE -+ MADD1 y1, a1, x1, y1 -+ LD a2, AO1, 1 * SIZE -+ MADD2 y2, a1, x2, y2 -+ LD a4, AO2, 1 * SIZE -+ MADD1 y3, a3, x1, y3 -+ MADD2 y4, a3, x2, y4 -+ MADD3 y1, a2, x2, y1 -+ MADD4 y2, a2, x1, y2 -+ MADD3 y3, a4, x2, y3 -+ MADD4 y4, a4, x1, y4 -+ .align 3 -+ -+.L19: -+ LD a1, Y, 0 * SIZE -+ LD a2, Y, 1 * SIZE -+ add.d Y, Y, INCY -+ LD a3, Y, 0 * SIZE -+ LD a4, Y, 1 * SIZE -+ add.d Y, Y, INCY -+ MADD a1, y1, ALPHA_R, a1 -+ MADD a2, y1, ALPHA_I, a2 -+ MADD a3, y3, ALPHA_R, a3 -+ MADD a4, y3, ALPHA_I, a4 -+ NMSUB a1, y2, ALPHA_I, a1 -+ MADD a2, y2, ALPHA_R, a2 -+ NMSUB a3, y4, ALPHA_I, a3 -+ MTC y1, $r0 -+ MADD a4, y4, ALPHA_R, a4 -+ addi.d J, J, -1 -+ ST a1, YY, 0 * SIZE -+ ST a2, YY, 1 * SIZE -+ add.d YY, YY, INCY -+ ST a3, YY, 0 * SIZE -+ ST a4, YY, 1 * SIZE -+ add.d YY, YY, INCY -+ blt $r0, J, .L11 -+ .align 3 -+ -+.L20: -+ andi J, N, 1 -+ MOV y2, y1 -+ srai.d I, M, 2 -+ bge $r0, J, .L999 -+ MOV y3, y1 -+ move AO1, A -+ MOV y4, y1 -+ move XX, XORIG -+ bge $r0, I, .L25 -+ LD a1, AO1, 0 * SIZE -+ LD x1, XX, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD x2, XX, 1 * SIZE -+ 
LD a5, AO1, 2 * SIZE -+ LD x4, XX, 3 * SIZE -+ addi.d I, I, -1 -+ LD a6, AO1, 3 * SIZE -+ bge $r0, I, .L23 -+ .align 3 -+.L22: -+ MADD1 y1, a1, x1, y1 -+ LD x3, XX, 2 * SIZE -+ MADD2 y2, a1, x2, y2 -+ LD a1, AO1, 4 * SIZE -+ MADD3 y3, a2, x2, y3 -+ LD x2, XX, 5 * SIZE -+ MADD4 y4, a2, x1, y4 -+ LD a2, AO1, 5 * SIZE -+ MADD1 y1, a5, x3, y1 -+ LD x1, XX, 4 * SIZE -+ MADD2 y2, a5, x4, y2 -+ LD a5, AO1, 6 * SIZE -+ MADD3 y3, a6, x4, y3 -+ LD x4, XX, 7 * SIZE -+ MADD4 y4, a6, x3, y4 -+ LD a6, AO1, 7 * SIZE -+ MADD1 y1, a1, x1, y1 -+ LD x3, XX, 6 * SIZE -+ MADD2 y2, a1, x2, y2 -+ LD a1, AO1, 8 * SIZE -+ MADD3 y3, a2, x2, y3 -+ LD x2, XX, 9 * SIZE -+ MADD4 y4, a2, x1, y4 -+ LD a2, AO1, 9 * SIZE -+ MADD1 y1, a5, x3, y1 -+ LD x1, XX, 8 * SIZE -+ MADD2 y2, a5, x4, y2 -+ LD a5, AO1, 10 * SIZE -+ MADD3 y3, a6, x4, y3 -+ LD x4, XX, 11 * SIZE -+ MADD4 y4, a6, x3, y4 -+ LD a6, AO1, 11 * SIZE -+ addi.d I, I, -1 -+ addi.d XX, XX, 8 * SIZE -+ addi.d AO1, AO1, 8 * SIZE -+ blt $r0, I, .L22 -+ .align 3 -+ -+.L23: -+ MADD1 y1, a1, x1, y1 -+ LD x3, XX, 2 * SIZE -+ MADD2 y2, a1, x2, y2 -+ LD a1, AO1, 4 * SIZE -+ MADD3 y3, a2, x2, y3 -+ LD x2, XX, 5 * SIZE -+ MADD4 y4, a2, x1, y4 -+ LD a2, AO1, 5 * SIZE -+ MADD1 y1, a5, x3, y1 -+ LD x1, XX, 4 * SIZE -+ MADD2 y2, a5, x4, y2 -+ LD a5, AO1, 6 * SIZE -+ MADD3 y3, a6, x4, y3 -+ LD x4, XX, 7 * SIZE -+ MADD4 y4, a6, x3, y4 -+ LD a6, AO1, 7 * SIZE -+ MADD1 y1, a1, x1, y1 -+ LD x3, XX, 6 * SIZE -+ MADD2 y2, a1, x2, y2 -+ MADD3 y3, a2, x2, y3 -+ MADD4 y4, a2, x1, y4 -+ MADD1 y1, a5, x3, y1 -+ MADD2 y2, a5, x4, y2 -+ MADD3 y3, a6, x4, y3 -+ addi.d XX, XX, 8 * SIZE -+ MADD4 y4, a6, x3, y4 -+ addi.d AO1, AO1, 8 * SIZE -+ .align 3 -+ -+.L25: -+ andi I, M, 2 -+ bge $r0, I, .L27 -+ LD a1, AO1, 0 * SIZE -+ LD x1, XX, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD x2, XX, 1 * SIZE -+ LD a5, AO1, 2 * SIZE -+ MADD1 y1, a1, x1, y1 -+ LD x3, XX, 2 * SIZE -+ MADD2 y2, a1, x2, y2 -+ LD a6, AO1, 3 * SIZE -+ MADD3 y3, a2, x2, y3 -+ LD x4, XX, 3 * SIZE -+ MADD4 y4, a2, x1, 
y4 -+ MADD1 y1, a5, x3, y1 -+ MADD2 y2, a5, x4, y2 -+ MADD3 y3, a6, x4, y3 -+ addi.d XX, XX, 4 * SIZE -+ MADD4 y4, a6, x3, y4 -+ addi.d AO1, AO1, 4 * SIZE -+ .align 3 -+ -+.L27: -+ andi I, M, 1 -+.align 3 -+ -+ bge $r0, I, .L29 -+.L28: -+ LD a1, AO1, 0 * SIZE -+ LD x1, XX, 0 * SIZE -+ LD a2, AO1, 1 * SIZE -+ LD x2, XX, 1 * SIZE -+ MADD1 y1, a1, x1, y1 -+ MADD2 y2, a1, x2, y2 -+ MADD3 y3, a2, x2, y3 -+ MADD4 y4, a2, x1, y4 -+ .align 3 -+ -+.L29: -+ LD a1, Y, 0 * SIZE -+ LD a2, Y, 1 * SIZE -+ ADD y1, y1, y3 -+ ADD y2, y2, y4 -+ MADD a1, y1, ALPHA_R, a1 -+ MADD a2, y1, ALPHA_I, a2 -+ NMSUB a1, y2, ALPHA_I, a1 -+ MADD a2, y2, ALPHA_R, a2 -+ ST a1, YY, 0 * SIZE -+ ST a2, YY, 1 * SIZE -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 16 -+ fld.d $f19, $sp, 24 -+#endif -+#ifdef __64BIT__ -+ addi.d $sp, $sp, 16 -+#else -+ addi.d $sp, $sp, 32 -+#endif -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S -new file mode 100644 -index 0000000..49f6402 ---- /dev/null -+++ b/kernel/loongarch64/znrm2.S -@@ -0,0 +1,304 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. 
Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r5 -+#define INCX $r6 -+#define XX $r7 -+#define I $r17 -+#define TEMP $r18 -+#define a1 $f10 -+#define a2 $f11 -+#define a3 $f12 -+#define a4 $f13 -+#define a5 $f14 -+#define a6 $f15 -+#define a7 $f16 -+#define a8 $f17 -+#define t1 $f0 -+#define t2 $f1 -+#define t3 $f2 -+#define t4 $f3 -+#define s1 $f22 -+#define s2 $f8 -+#define s3 $f23 -+#define s4 $f9 -+#define ALPHA $f4 -+#define max $f5 -+ -+ PROLOGUE -+ -+#ifdef F_INTERFACE -+ LDINT N, 0(N) -+ LDINT INCX, 0(INCX) -+#endif -+ -+ MTC s1, $r0 -+ bge $r0, N, .L999 -+ slli.d INCX, INCX, ZBASE_SHIFT -+ bge $r0, INCX, .L999 -+ move XX, X -+ MOV s2, s1 -+ srai.d I, N, 2 -+ MOV s3, s1 -+ MOV s4, s1 -+ bge $r0, I, .L15 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ 
LD a7, X, 0 * SIZE -+ LD a8, X, 1 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ bge $r0, I, .L13 -+ .align 3 -+ -+.L12: -+ FABS t1, a1 -+ LD a1, X, 0 * SIZE -+ FABS t2, a2 -+ NOP -+ FABS t3, a3 -+ LD a2, X, 1 * SIZE -+ FABS t4, a4 -+ add.d X, X, INCX -+ CMPLT $fcc0, s1, t1 -+ LD a3, X, 0 * SIZE -+ CMPLT $fcc1, s2, t2 -+ NOP -+ CMPLT $fcc2, s3, t3 -+ LD a4, X, 1 * SIZE -+ CMPLT $fcc3, s4, t4 -+ add.d X, X, INCX -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ FABS t1, a5 -+ LD a5, X, 0 * SIZE -+ FABS t2, a6 -+ NOP -+ FABS t3, a7 -+ LD a6, X, 1 * SIZE -+ FABS t4, a8 -+ add.d X, X, INCX -+ CMPLT $fcc0, s1, t1 -+ LD a7, X, 0 * SIZE -+ CMPLT $fcc1, s2, t2 -+ NOP -+ CMPLT $fcc2, s3, t3 -+ LD a8, X, 1 * SIZE -+ CMPLT $fcc3, s4, t4 -+ add.d X, X, INCX -+ CMOVT s1, s1, t1, $fcc0 -+ addi.d I, I, -1 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L13: -+ FABS t1, a1 -+ FABS t2, a2 -+ FABS t3, a3 -+ FABS t4, a4 -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t2 -+ CMPLT $fcc2, s3, t3 -+ CMPLT $fcc3, s4, t4 -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ FABS t1, a5 -+ FABS t2, a6 -+ FABS t3, a7 -+ FABS t4, a8 -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t2 -+ CMPLT $fcc2, s3, t3 -+ CMPLT $fcc3, s4, t4 -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ CMOVT s3, s3, t3, $fcc2 -+ CMOVT s4, s4, t4, $fcc3 -+ .align 3 -+ -+.L15: -+ andi I, N, 3 -+ bge $r0, I, .L100 -+ .align 3 -+ -+.L16: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ addi.d I, I, -1 -+ FABS t1, a1 -+ FABS t2, a2 -+ CMPLT $fcc0, s1, t1 -+ CMPLT $fcc1, s2, t2 -+ CMOVT s1, s1, t1, $fcc0 -+ CMOVT s2, s2, t2, $fcc1 -+ add.d X, X, INCX -+ blt $r0, I, .L16 -+ .align 3 -+ -+.L100: -+ CMPLT $fcc0, s1, s2 -+ CMPLT $fcc1, s3, s4 -+ CMOVT s1, s1, s2, $fcc0 -+ CMOVT s3, s3, s4, $fcc1 -+ CMPLT $fcc0, s1, s3 -+ CMOVT s1, s1, s3, $fcc0 
-+ lu12i.w TEMP, 0x3f800 -+ movgr2fr.d a1, $r0 -+ movgr2fr.w ALPHA, TEMP -+ CMPEQ $fcc0, s1, a1 -+ fcvt.d.s ALPHA, ALPHA -+ bcnez $fcc0, .L999 -+ fdiv.d ALPHA, ALPHA, s1 -+ MOV max, s1 -+ MOV s1, a1 -+ MOV s2, a1 -+ MOV s3, a1 -+ MOV s4, a1 -+ srai.d I, N, 2 -+ bge $r0, I, .L105 -+ LD a1, XX, 0 * SIZE -+ LD a2, XX, 1 * SIZE -+ add.d XX, XX, INCX -+ LD a3, XX, 0 * SIZE -+ LD a4, XX, 1 * SIZE -+ add.d XX, XX, INCX -+ LD a5, XX, 0 * SIZE -+ LD a6, XX, 1 * SIZE -+ add.d XX, XX, INCX -+ LD a7, XX, 0 * SIZE -+ LD a8, XX, 1 * SIZE -+ addi.d I, I, -1 -+ add.d XX, XX, INCX -+ bge $r0, I, .L104 -+ .align 3 -+ -+.L103: -+ MUL t1, ALPHA, a1 -+ LD a1, XX, 0 * SIZE -+ MUL t2, ALPHA, a2 -+ addi.d I, I, -1 -+ MUL t3, ALPHA, a3 -+ LD a2, XX, 1 * SIZE -+ MUL t4, ALPHA, a4 -+ add.d XX, XX, INCX -+ MADD s1, t1, t1, s1 -+ LD a3, XX, 0 * SIZE -+ MADD s2, t2, t2, s2 -+ NOP -+ MADD s3, t3, t3, s3 -+ LD a4, XX, 1 * SIZE -+ MADD s4, t4, t4, s4 -+ add.d XX, XX, INCX -+ MUL t1, ALPHA, a5 -+ LD a5, XX, 0 * SIZE -+ MUL t2, ALPHA, a6 -+ NOP -+ MUL t3, ALPHA, a7 -+ LD a6, XX, 1 * SIZE -+ MUL t4, ALPHA, a8 -+ add.d XX, XX, INCX -+ MADD s1, t1, t1, s1 -+ LD a7, XX, 0 * SIZE -+ MADD s2, t2, t2, s2 -+ LD a8, XX, 1 * SIZE -+ MADD s3, t3, t3, s3 -+ add.d XX, XX, INCX -+ MADD s4, t4, t4, s4 -+ blt $r0, I, .L103 -+ .align 3 -+ -+.L104: -+ MUL t1, ALPHA, a1 -+ MUL t2, ALPHA, a2 -+ MUL t3, ALPHA, a3 -+ MUL t4, ALPHA, a4 -+ MADD s1, t1, t1, s1 -+ MADD s2, t2, t2, s2 -+ MADD s3, t3, t3, s3 -+ MADD s4, t4, t4, s4 -+ MUL t1, ALPHA, a5 -+ MUL t2, ALPHA, a6 -+ MUL t3, ALPHA, a7 -+ MUL t4, ALPHA, a8 -+ MADD s1, t1, t1, s1 -+ MADD s2, t2, t2, s2 -+ MADD s3, t3, t3, s3 -+ MADD s4, t4, t4, s4 -+ .align 3 -+ -+.L105: -+ andi I, N, 3 -+ bge $r0, I, .L998 -+ .align 3 -+ -+.L106: -+ LD a1, XX, 0 * SIZE -+ LD a2, XX, 1 * SIZE -+ addi.d I, I, -1 -+ MUL t1, ALPHA, a1 -+ MUL t2, ALPHA, a2 -+ MADD s1, t1, t1, s1 -+ add.d XX, XX, INCX -+ MADD s2, t2, t2, s2 -+ blt $r0, I, .L106 -+ .align 3 -+ -+.L998: -+ ADD s1, s1, s2 -+ ADD 
s3, s3, s4 -+ ADD s1, s1, s3 -+ fsqrt.d s1, s1 -+ move $r4, $r17 -+ MUL $f0, max, s1 -+ jirl $r0, $r1, 0x0 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S -new file mode 100644 -index 0000000..a12e527 ---- /dev/null -+++ b/kernel/loongarch64/zscal.S -@@ -0,0 +1,356 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define N $r4 -+#define X $r7 -+#define INCX $r8 -+#define I $r17 -+#define TEMP $r18 -+#define XX $r5 -+#define ALPHA_R $f0 -+#define ALPHA_I $f1 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f23 -+#define a4 $f9 -+#define a5 $f10 -+#define a6 $f11 -+#define a7 $f12 -+#define a8 $f13 -+#define t1 $f14 -+#define t2 $f15 -+#define t3 $f16 -+#define t4 $f17 -+ -+ PROLOGUE -+ -+ li.d TEMP, 2 * SIZE -+ MTC a1, $r0 -+ slli.d INCX, INCX, ZBASE_SHIFT -+ bge $r0, N, .L999 -+ CMPEQ $fcc0, ALPHA_R, a1 -+ CMPEQ $fcc1, ALPHA_I, a1 -+ bceqz $fcc0, .L50 -+ bceqz $fcc1, .L50 -+ srai.d I, N, 2 -+ bne INCX, TEMP, .L20 -+ bge $r0, I, .L15 -+ .align 3 -+ -+.L12: -+ ST a1, X, 0 * SIZE -+ ST a1, X, 1 * SIZE -+ ST a1, X, 2 * SIZE -+ ST a1, X, 3 * SIZE -+ ST a1, X, 4 * SIZE -+ ST a1, X, 5 * SIZE -+ ST a1, X, 6 * SIZE -+ ST a1, X, 7 * SIZE -+ addi.w I, I, -1 -+ addi.d X, X, 8 * SIZE -+ blt $r0, I, .L12 -+ .align 3 -+ -+.L15: -+ andi I, N, 3 -+ bge $r0, I, .L999 -+ .align 3 -+.L16: -+ ST a1, X, 0 * SIZE -+ ST a1, X, 1 * SIZE -+ addi.d I, I, -1 -+ addi.d X, X, 2 * SIZE -+ blt $r0, I, .L16 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ .align 3 -+ -+.L20: -+ srai.d I, N, 2 -+ bge $r0, I, .L25 -+ .align 3 -+ -+.L22: -+ ST a1, X, 0 * SIZE -+ ST a1, X, 1 * SIZE -+ add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ ST a1, X, 1 * SIZE -+ 
add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ ST a1, X, 1 * SIZE -+ add.d X, X, INCX -+ ST a1, X, 0 * SIZE -+ ST a1, X, 1 * SIZE -+ addi.d I, I, -1 -+ add.d X, X, INCX -+ blt $r0, I, .L22 -+ .align 3 -+ -+.L25: -+ andi I, N, 3 -+ bge $r0, I, .L999 -+ .align 3 -+.L26: -+ ST a1, X, 0 * SIZE -+ addi.d I, I, -1 -+ ST a1, X, 1 * SIZE -+ add.d X, X, INCX -+ blt $r0, I, .L26 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ .align 3 -+ -+.L50: -+ srai.d I, N, 2 -+ bne INCX, TEMP, .L60 -+ addi.d I, I, -1 -+ blt I, $r0, .L55 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ LD a3, X, 2 * SIZE -+ LD a4, X, 3 * SIZE -+ LD a5, X, 4 * SIZE -+ LD a6, X, 5 * SIZE -+ MUL t1, ALPHA_R, a1 -+ LD a7, X, 6 * SIZE -+ MUL t2, ALPHA_I, a1 -+ LD a8, X, 7 * SIZE -+ MUL t3, ALPHA_R, a3 -+ MUL t4, ALPHA_I, a3 -+ bge $r0, I, .L53 -+ .align 3 -+ -+.L52: -+ NMSUB t1, a2, ALPHA_I, t1 -+ LD a1, X, 8 * SIZE -+ MADD t2, a2, ALPHA_R, t2 -+ LD a2, X, 9 * SIZE -+ NMSUB t3, a4, ALPHA_I, t3 -+ LD a3, X, 10 * SIZE -+ MADD t4, a4, ALPHA_R, t4 -+ LD a4, X, 11 * SIZE -+ ST t1, X, 0 * SIZE -+ MUL t1, ALPHA_R, a5 -+ ST t2, X, 1 * SIZE -+ MUL t2, ALPHA_I, a5 -+ ST t3, X, 2 * SIZE -+ MUL t3, ALPHA_R, a7 -+ ST t4, X, 3 * SIZE -+ MUL t4, ALPHA_I, a7 -+ NMSUB t1, a6, ALPHA_I, t1 -+ LD a5, X, 12 * SIZE -+ MADD t2, a6, ALPHA_R, t2 -+ LD a6, X, 13 * SIZE -+ NMSUB t3, a8, ALPHA_I, t3 -+ LD a7, X, 14 * SIZE -+ MADD t4, a8, ALPHA_R, t4 -+ LD a8, X, 15 * SIZE -+ ST t1, X, 4 * SIZE -+ MUL t1, ALPHA_R, a1 -+ ST t2, X, 5 * SIZE -+ MUL t2, ALPHA_I, a1 -+ ST t3, X, 6 * SIZE -+ MUL t3, ALPHA_R, a3 -+ ST t4, X, 7 * SIZE -+ MUL t4, ALPHA_I, a3 -+ addi.d I, I, -1 -+ addi.d X, X, 8 * SIZE -+ blt $r0, I, .L52 -+ .align 3 -+ -+.L53: -+ NMSUB t1, a2, ALPHA_I, t1 -+ MADD t2, a2, ALPHA_R, t2 -+ NMSUB t3, a4, ALPHA_I, t3 -+ MADD t4, a4, ALPHA_R, t4 -+ ST t1, X, 0 * SIZE -+ MUL t1, ALPHA_R, a5 -+ ST t2, X, 1 * SIZE -+ MUL t2, ALPHA_I, a5 -+ ST t3, X, 2 * SIZE -+ MUL t3, ALPHA_R, a7 -+ ST t4, X, 3 * SIZE -+ MUL t4, ALPHA_I, a7 -+ 
NMSUB t1, a6, ALPHA_I, t1 -+ MADD t2, a6, ALPHA_R, t2 -+ NMSUB t3, a8, ALPHA_I, t3 -+ MADD t4, a8, ALPHA_R, t4 -+ ST t1, X, 4 * SIZE -+ ST t2, X, 5 * SIZE -+ ST t3, X, 6 * SIZE -+ ST t4, X, 7 * SIZE -+ addi.d X, X, 8 * SIZE -+ .align 3 -+ -+.L55: -+ andi I, N, 3 -+ bge $r0, I, .L999 -+ .align 3 -+.L56: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ MUL t1, ALPHA_R, a1 -+ MUL t2, ALPHA_I, a1 -+ NMSUB t1, a2, ALPHA_I, t1 -+ MADD t2, a2, ALPHA_R, t2 -+ addi.d X, X, 2 * SIZE -+ addi.d I, I, -1 -+ ST t1, X, -2 * SIZE -+ ST t2, X, -1 * SIZE -+ blt $r0, I, .L56 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ .align 3 -+ -+.L60: -+ srai.d I, N, 2 -+ move XX, X -+ addi.d I, I, -1 -+ blt I, $r0, .L65 -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a3, X, 0 * SIZE -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ LD a5, X, 0 * SIZE -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ MUL t1, ALPHA_R, a1 -+ LD a7, X, 0 * SIZE -+ MUL t2, ALPHA_I, a1 -+ LD a8, X, 1 * SIZE -+ MUL t3, ALPHA_R, a3 -+ add.d X, X, INCX -+ MUL t4, ALPHA_I, a3 -+ bge $r0, I, .L63 -+ .align 3 -+ -+.L62: -+ NMSUB t1, a2, ALPHA_I, t1 -+ LD a1, X, 0 * SIZE -+ MADD t2, a2, ALPHA_R, t2 -+ LD a2, X, 1 * SIZE -+ add.d X, X, INCX -+ NMSUB t3, a4, ALPHA_I, t3 -+ LD a3, X, 0 * SIZE -+ MADD t4, a4, ALPHA_R, t4 -+ LD a4, X, 1 * SIZE -+ add.d X, X, INCX -+ ST t1, XX, 0 * SIZE -+ MUL t1, ALPHA_R, a5 -+ ST t2, XX, 1 * SIZE -+ MUL t2, ALPHA_I, a5 -+ add.d XX, XX, INCX -+ ST t3, XX, 0 * SIZE -+ MUL t3, ALPHA_R, a7 -+ ST t4, XX, 1 * SIZE -+ MUL t4, ALPHA_I, a7 -+ add.d XX, XX, INCX -+ NMSUB t1, a6, ALPHA_I, t1 -+ LD a5, X, 0 * SIZE -+ MADD t2, a6, ALPHA_R, t2 -+ LD a6, X, 1 * SIZE -+ add.d X, X, INCX -+ NMSUB t3, a8, ALPHA_I, t3 -+ LD a7, X, 0 * SIZE -+ MADD t4, a8, ALPHA_R, t4 -+ LD a8, X, 1 * SIZE -+ add.d X, X, INCX -+ ST t1, XX, 0 * SIZE -+ MUL t1, ALPHA_R, a1 -+ ST t2, XX, 1 * SIZE -+ MUL t2, ALPHA_I, a1 -+ add.d XX, XX, INCX -+ ST t3, XX, 0 * SIZE -+ MUL t3, ALPHA_R, a3 -+ ST t4, XX, 1 * 
SIZE -+ MUL t4, ALPHA_I, a3 -+ addi.d I, I, -1 -+ add.d XX, XX, INCX -+ blt $r0, I, .L62 -+ .align 3 -+ -+.L63: -+ NMSUB t1, a2, ALPHA_I, t1 -+ MADD t2, a2, ALPHA_R, t2 -+ NMSUB t3, a4, ALPHA_I, t3 -+ MADD t4, a4, ALPHA_R, t4 -+ ST t1, XX, 0 * SIZE -+ MUL t1, ALPHA_R, a5 -+ ST t2, XX, 1 * SIZE -+ MUL t2, ALPHA_I, a5 -+ add.d XX, XX, INCX -+ ST t3, XX, 0 * SIZE -+ MUL t3, ALPHA_R, a7 -+ ST t4, XX, 1 * SIZE -+ MUL t4, ALPHA_I, a7 -+ add.d XX, XX, INCX -+ NMSUB t1, a6, ALPHA_I, t1 -+ MADD t2, a6, ALPHA_R, t2 -+ NMSUB t3, a8, ALPHA_I, t3 -+ MADD t4, a8, ALPHA_R, t4 -+ ST t1, XX, 0 * SIZE -+ ST t2, XX, 1 * SIZE -+ add.d XX, XX, INCX -+ ST t3, XX, 0 * SIZE -+ ST t4, XX, 1 * SIZE -+ add.d XX, XX, INCX -+ .align 3 -+ -+.L65: -+ andi I, N, 3 -+ bge $r0, I, .L999 -+ .align 3 -+.L66: -+ LD a1, X, 0 * SIZE -+ LD a2, X, 1 * SIZE -+ MUL t1, ALPHA_R, a1 -+ MUL t2, ALPHA_I, a1 -+ NMSUB t1, a2, ALPHA_I, t1 -+ MADD t2, a2, ALPHA_R, t2 -+ addi.d I, I, -1 -+ ST t1, X, 0 * SIZE -+ ST t2, X, 1 * SIZE -+ add.d X, X, INCX -+ blt $r0, I, .L66 -+ .align 3 -+ -+.L999: -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/ztrsm_kernel_LT.S b/kernel/loongarch64/ztrsm_kernel_LT.S -new file mode 100644 -index 0000000..26b1230 ---- /dev/null -+++ b/kernel/loongarch64/ztrsm_kernel_LT.S -@@ -0,0 +1,1344 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. 
Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define K $r6 -+#define A $r7 -+#define B $r8 -+#define C $r9 -+#define LDC $r10 -+#define OFFSET $r11 -+ -+#define AO $r12 -+#define BO $r13 -+#define I $r17 -+#define J $r18 -+#define L $r25 -+#define CO1 $r14 -+#define CO2 $r15 -+#define CO3 $r23 -+#define CO4 $r24 -+#define KK $r26 -+#define TEMP $r27 -+#define AORIG $r28 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f26 -+#define a4 $f27 -+#define b1 $f23 -+#define b2 $f9 -+#define b3 $f10 -+#define b4 $f11 -+#define b5 $f12 -+#define b6 $f13 -+#define b7 $f14 -+#define b8 $f15 -+#define a5 b8 -+#define c11 $f16 -+#define c12 $f17 -+#define c21 $f0 -+#define c22 $f1 -+#define c31 $f2 -+#define c32 $f3 -+#define c41 $f4 -+#define c42 $f5 -+#define c51 $f6 -+#define c52 $f7 -+#define c61 $f18 -+#define c62 $f19 -+#define c71 $f20 -+#define c72 $f21 -+#define c81 $f24 -+#define c82 $f25 -+ 
-+#ifndef CONJ -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 MADD -+#define MADD4 NMSUB -+#define MADD5 MSUB -+#define MADD6 MADD -+#define MADD7 NMSUB -+#define MADD8 MADD -+#else -+#if defined(LN) || defined(LT) -+#define MADD1 MADD -+#define MADD2 NMSUB -+#define MADD3 MADD -+#define MADD4 MADD -+#else -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 NMSUB -+#define MADD4 MADD -+#endif -+#define MADD5 MADD -+#define MADD6 MSUB -+#define MADD7 MADD -+#define MADD8 NMSUB -+#endif -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -128 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 -+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ SDARG $r28, $sp, 40 -+ fst.d $f24, $sp, 48 -+ fst.d $f25, $sp, 56 -+ fst.d $f26, $sp, 64 -+ fst.d $f27, $sp, 72 -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 88 -+ fst.d $f19, $sp, 96 -+ fst.d $f20, $sp, 104 -+ fst.d $f21, $sp, 112 -+#endif -+ slli.d LDC, LDC, ZBASE_SHIFT -+#ifdef LN -+ mul.w TEMP, M, K -+ slli.d TEMP, TEMP, ZBASE_SHIFT -+ add.d A, A, TEMP -+ slli.d TEMP, M, ZBASE_SHIFT -+ add.d C, C, TEMP -+#endif -+#ifdef RN -+ sub.d KK, $r0, OFFSET -+#endif -+#ifdef RT -+ mul.w TEMP, N, K -+ slli.d TEMP, TEMP, ZBASE_SHIFT -+ add.d B, B, TEMP -+ mul.w TEMP, N, LDC -+ add.d C, C, TEMP -+ sub.d KK, N, OFFSET -+#endif -+ srai.d J, N, 2 -+nop -+ bge $r0, J, .L20 -+.L10: -+#ifdef RT -+ slli.d TEMP, K, 2 + ZBASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 2 -+ sub.d C, C, TEMP -+#endif -+ move CO1, C -+MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ addi.d J, J, -1 -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ MOV c31, c11 -+ MOV c41, c11 -+ MOV c51, c11 -+ move I, M -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO4, LDC -+#endif -+MOV c61, c11 -+ bge $r0, I, .L19 -+ .align 3 -+ -+.L11: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, B, 
0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, KK, 2 -+ MOV c32, c11 -+ LD b3, B, 2 * SIZE -+ MOV c42, c11 -+ LD b4, B, 3 * SIZE -+ MOV c52, c11 -+ LD b5, B, 4 * SIZE -+ MOV c62, c11 -+ LD b6, B, 8 * SIZE -+ MOV c72, c11 -+ LD b7, B, 12 * SIZE -+ MOV c82, c11 -+move BO, B -+ bge $r0, L, .L15 -+#else -+#ifdef LN -+ slli.d TEMP, K, ZBASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, ZBASE_SHIFT -+ slli.d TEMP, KK, 2 + ZBASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, TEMP, 2 -+ MOV c32, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c42, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c52, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c62, c11 -+ LD b6, BO, 8 * SIZE -+ MOV c72, c11 -+ LD b7, BO, 12 * SIZE -+ MOV c82, c11 -+ bge $r0, L, .L15 -+#endif -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ bge $r0, L, .L13 -+ .align 3 -+.L12: -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c51, b5, a1, c51 -+ MADD3 c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD1 c71, b3, a1, c71 -+ MADD3 c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 14 * 
SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD1 c51, b7, a4, c51 -+ MADD3 c61, b2, a4, c61 -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD1 c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD3 c21, b2, a3, c21 -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD1 c51, b5, a3, c51 -+ MADD3 c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD1 c71, b3, a3, c71 -+ MADD3 c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ addi.d L, L, -1 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD1 c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD3 c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ blt $r0, L, .L12 -+ .align 3 -+ -+.L13: -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 
5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c51, b5, a1, c51 -+ MADD3 c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD1 c71, b3, a1, c71 -+ MADD3 c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD1 c51, b7, a4, c51 -+ MADD3 c61, b2, a4, c61 -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD1 c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD3 c21, b2, a3, c21 -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD1 c51, b5, a3, c51 -+ MADD3 c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD1 c71, b3, a3, c71 -+ MADD3 c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ 
MADD2 c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD1 c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD3 c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ .align 3 -+ -+.L15: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L18 -+ .align 3 -+.L16: -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c51, b5, a1, c51 -+ addi.d L, L, -1 -+ MADD3 c61, b2, a1, c61 -+ addi.d AO, AO, 2 * SIZE -+ MADD1 c71, b3, a1, c71 -+ addi.d BO, BO, 8 * SIZE -+ MADD3 c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 4 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L16 -+.L18: -+ ADD c11, c11, c22 -+ ADD c12, c12, c21 -+ ADD c31, c31, c42 -+ ADD c32, c32, c41 -+ ADD c51, c51, c62 -+ ADD c52, c52, c61 -+ ADD c71, c71, c82 -+ ADD c72, c72, c81 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -4 -+#endif -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, 
c12 -+ SUB c31, b3, c31 -+ SUB c32, b4, c32 -+ SUB c51, b5, c51 -+ SUB c52, b6, c52 -+ SUB c71, b7, c71 -+ SUB c72, b8, c72 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ LD b5, AO, 4 * SIZE -+ LD b6, AO, 5 * SIZE -+ LD b7, AO, 6 * SIZE -+ LD b8, AO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c31, b3, c31 -+ SUB c32, b4, c32 -+ SUB c51, b5, c51 -+ SUB c52, b6, c52 -+ SUB c71, b7, c71 -+ SUB c72, b8, c72 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MUL a3, b2, c32 -+ MUL a4, b2, c31 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+ MADD5 c31, c31, b1, a3 -+ MADD6 c32, c32, b1, a4 -+ MUL a1, b2, c52 -+ MUL a2, b2, c51 -+ MUL a3, b2, c72 -+ MUL a4, b2, c71 -+ MADD5 c51, c51, b1, a1 -+ MADD6 c52, c52, b1, a2 -+ MADD5 c71, c71, b1, a3 -+ MADD6 c72, c72, b1, a4 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+ NMSUB c31, c11, b3, c31 -+ MADD7 c32, c11, b4, c32 -+ NMSUB c51, c11, b5, c51 -+ MADD7 c52, c11, b6, c52 -+ NMSUB c71, c11, b7, c71 -+ MADD7 c72, c11, b8, c72 -+ MADD8 c31, c12, b4, c31 -+ NMSUB c32, c12, b3, c32 -+ MADD8 c51, c12, b6, c51 -+ NMSUB c52, c12, b5, c52 -+ MADD8 c71, c12, b8, c71 -+ NMSUB c72, c12, b7, c72 -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ LD b5, BO, 12 * SIZE -+ LD b6, BO, 13 * SIZE -+ LD b7, BO, 14 * SIZE -+ LD b8, BO, 15 * SIZE -+ MUL a1, b4, c32 -+ MUL a2, b4, c31 -+ MADD5 c31, c31, b3, a1 -+ MADD6 c32, c32, b3, a2 -+ NMSUB c51, c31, b5, c51 -+ MADD7 c52, c31, b6, c52 -+ NMSUB c71, c31, b7, c71 -+ MADD7 c72, c31, b8, c72 -+ MADD8 c51, c32, b6, c51 -+ NMSUB c52, c32, b5, c52 -+ MADD8 c71, c32, b8, c71 -+ NMSUB c72, c32, b7, c72 -+ LD 
b5, BO, 20 * SIZE -+ LD b6, BO, 21 * SIZE -+ LD b7, BO, 22 * SIZE -+ LD b8, BO, 23 * SIZE -+ MUL a1, b6, c52 -+ MUL a2, b6, c51 -+ MADD5 c51, c51, b5, a1 -+ MADD6 c52, c52, b5, a2 -+ NMSUB c71, c51, b7, c71 -+ MADD7 c72, c51, b8, c72 -+ MADD8 c71, c52, b8, c71 -+ NMSUB c72, c52, b7, c72 -+ LD b7, BO, 30 * SIZE -+ LD b8, BO, 31 * SIZE -+ MUL a1, b8, c72 -+ MUL a2, b8, c71 -+ MADD5 c71, c71, b7, a1 -+ MADD6 c72, c72, b7, a2 -+#endif -+#ifdef RT -+ LD b1, BO, 30 * SIZE -+ LD b2, BO, 31 * SIZE -+ LD b3, BO, 28 * SIZE -+ LD b4, BO, 29 * SIZE -+ LD b5, BO, 26 * SIZE -+ LD b6, BO, 27 * SIZE -+ LD b7, BO, 24 * SIZE -+ LD b8, BO, 25 * SIZE -+ MUL a1, b2, c72 -+ MUL a2, b2, c71 -+ MADD5 c71, c71, b1, a1 -+ MADD6 c72, c72, b1, a2 -+ NMSUB c51, c71, b3, c51 -+ MADD7 c52, c71, b4, c52 -+ NMSUB c31, c71, b5, c31 -+ MADD7 c32, c71, b6, c32 -+ NMSUB c11, c71, b7, c11 -+ MADD7 c12, c71, b8, c12 -+ MADD8 c51, c72, b4, c51 -+ NMSUB c52, c72, b3, c52 -+ MADD8 c31, c72, b6, c31 -+ NMSUB c32, c72, b5, c32 -+ MADD8 c11, c72, b8, c11 -+ NMSUB c12, c72, b7, c12 -+ LD b3, BO, 20 * SIZE -+ LD b4, BO, 21 * SIZE -+ LD b5, BO, 18 * SIZE -+ LD b6, BO, 19 * SIZE -+ LD b7, BO, 16 * SIZE -+ LD b8, BO, 17 * SIZE -+ MUL a1, b4, c52 -+ MUL a2, b4, c51 -+ MADD5 c51, c51, b3, a1 -+ MADD6 c52, c52, b3, a2 -+ NMSUB c31, c51, b5, c31 -+ MADD7 c32, c51, b6, c32 -+ NMSUB c11, c51, b7, c11 -+ MADD7 c12, c51, b8, c12 -+ MADD8 c31, c52, b6, c31 -+ NMSUB c32, c52, b5, c32 -+ MADD8 c11, c52, b8, c11 -+ NMSUB c12, c52, b7, c12 -+ LD b5, BO, 10 * SIZE -+ LD b6, BO, 11 * SIZE -+ LD b7, BO, 8 * SIZE -+ LD b8, BO, 9 * SIZE -+ MUL a1, b6, c32 -+ MUL a2, b6, c31 -+ MADD5 c31, c31, b5, a1 -+ MADD6 c32, c32, b5, a2 -+ NMSUB c11, c31, b7, c11 -+ MADD7 c12, c31, b8, c12 -+ MADD8 c11, c32, b8, c11 -+ NMSUB c12, c32, b7, c12 -+ LD b7, BO, 0 * SIZE -+ LD b8, BO, 1 * SIZE -+ MUL a1, b8, c12 -+ MUL a2, b8, c11 -+ MADD5 c11, c11, b7, a1 -+ MADD6 c12, c12, b7, a2 -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ 
ST c12, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c32, BO, 3 * SIZE -+ ST c51, BO, 4 * SIZE -+ ST c52, BO, 5 * SIZE -+ ST c71, BO, 6 * SIZE -+ ST c72, BO, 7 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c32, AO, 3 * SIZE -+ ST c51, AO, 4 * SIZE -+ ST c52, AO, 5 * SIZE -+ ST c71, AO, 6 * SIZE -+ ST c72, AO, 7 * SIZE -+#endif -+#ifdef LN -+ addi.d CO1,CO1, -2 * SIZE -+ addi.d CO2,CO2, -2 * SIZE -+ addi.d CO3,CO3, -2 * SIZE -+ addi.d CO4,CO4, -2 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c31, CO2, 0 * SIZE -+ ST c32, CO2, 1 * SIZE -+ ST c51, CO3, 0 * SIZE -+ ST c52, CO3, 1 * SIZE -+ ST c71, CO4, 0 * SIZE -+ ST c72, CO4, 1 * SIZE -+#ifndef LN -+ addi.d CO1,CO1, 2 * SIZE -+ addi.d CO2,CO2, 2 * SIZE -+ addi.d CO3,CO3, 2 * SIZE -+ addi.d CO4,CO4, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, ZBASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+MTC c11, $r0 -+ addi.d I, I, -1 -+ MOV c21, c11 -+ MOV c31, c11 -+ MOV c41, c11 -+ MOV c51, c11 -+MOV c61, c11 -+ blt $r0, I, .L11 -+ .align 3 -+ -+.L19: -+#ifdef LN -+ slli.d TEMP, K, 2 + ZBASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 4 -+#endif -+#ifdef RT -+ addi.d KK, KK, -4 -+#endif -+ blt $r0, J, .L10 -+ .align 3 -+ -+.L20: -+ andi J, N, 2 -+ bge $r0, J, .L30 -+#ifdef RT -+ slli.d TEMP, K, 1 + ZBASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 1 -+ sub.d C, C, TEMP -+#endif -+MTC c11, $r0 -+ move CO1, C -+ add.d CO2, C, LDC -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO2, LDC 
-+#endif -+ move I, M -+ bge $r0, I, .L29 -+ .align 3 -+ -+.L21: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, B, 0 * SIZE -+ MOV c31, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c41, c11 -+ LD b2, B, 1 * SIZE -+ srai.d L, KK, 2 -+ LD b3, B, 2 * SIZE -+ MOV c12, c11 -+ LD b4, B, 3 * SIZE -+ MOV c22, c11 -+ LD b5, B, 4 * SIZE -+ MOV c32, c11 -+ MOV c42, c11 -+move BO, B -+ bge $r0, L, .L25 -+#else -+#ifdef LN -+ slli.d TEMP, K, ZBASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, ZBASE_SHIFT -+ slli.d TEMP, KK, 1 + ZBASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c31, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c41, c11 -+ LD b2, BO, 1 * SIZE -+ srai.d L, TEMP, 2 -+ LD b3, BO, 2 * SIZE -+ MOV c12, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c22, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c32, c11 -+MOV c42, c11 -+ bge $r0, L, .L25 -+#endif -+ .align 3 -+.L22: -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c11, b5, a1, c11 -+ LD a2, AO, 3 * SIZE -+ MADD3 c21, b2, a1, c21 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ LD a1, AO, 8 * SIZE -+ MADD2 c12, b5, a2, c12 -+ LD b5, BO, 12 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 9 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 10 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 11 * SIZE -+ MADD1 c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD3 c21, b2, a3, c21 -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ LD a3, AO, 6 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD4 
c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD1 c11, b5, a3, c11 -+ LD a2, AO, 7 * SIZE -+ MADD3 c21, b2, a3, c21 -+ addi.d AO, AO, 8 * SIZE -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ LD a3, AO, 4 * SIZE -+ MADD2 c12, b5, a2, c12 -+ LD b5, BO, 20 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 17 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 18 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 19 * SIZE -+addi.d BO, BO, 16 * SIZE -+ blt $r0, L, .L22 -+ .align 3 -+ -+.L25: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L28 -+ .align 3 -+.L26: -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD1 c31, b3, a1, c31 -+ addi.d BO, BO, 4 * SIZE -+ MADD3 c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 0 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 1 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 2 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 3 * SIZE -+addi.d AO, AO, 2 * SIZE -+ blt $r0, L, .L26 -+.L28: -+ ADD c11, c11, c22 -+ ADD c12, c12, c21 -+ ADD c31, c31, c42 -+ ADD c32, c32, c41 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -2 -+#endif -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c31, b3, c31 -+ SUB c32, b4, c32 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c31, b3, c31 -+ SUB c32, b4, c32 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MUL a3, b2, c32 -+ MUL a4, b2, c31 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+ MADD5 c31, c31, b1, a3 
-+ MADD6 c32, c32, b1, a4 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+ NMSUB c31, c11, b3, c31 -+ MADD7 c32, c11, b4, c32 -+ MADD8 c31, c12, b4, c31 -+ NMSUB c32, c12, b3, c32 -+ LD b3, BO, 6 * SIZE -+ LD b4, BO, 7 * SIZE -+ MUL a1, b4, c32 -+ MUL a2, b4, c31 -+ MADD5 c31, c31, b3, a1 -+ MADD6 c32, c32, b3, a2 -+#endif -+#ifdef RT -+ LD b5, BO, 6 * SIZE -+ LD b6, BO, 7 * SIZE -+ LD b7, BO, 4 * SIZE -+ LD b8, BO, 5 * SIZE -+ MUL a1, b6, c32 -+ MUL a2, b6, c31 -+ MADD5 c31, c31, b5, a1 -+ MADD6 c32, c32, b5, a2 -+ NMSUB c11, c31, b7, c11 -+ MADD7 c12, c31, b8, c12 -+ MADD8 c11, c32, b8, c11 -+ NMSUB c12, c32, b7, c12 -+ LD b7, BO, 0 * SIZE -+ LD b8, BO, 1 * SIZE -+ MUL a1, b8, c12 -+ MUL a2, b8, c11 -+ MADD5 c11, c11, b7, a1 -+ MADD6 c12, c12, b7, a2 -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c12, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c32, BO, 3 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c32, AO, 3 * SIZE -+#endif -+#ifdef LN -+ addi.d CO1,CO1, -2 * SIZE -+ addi.d CO2,CO2, -2 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c31, CO2, 0 * SIZE -+ ST c32, CO2, 1 * SIZE -+#ifndef LN -+ addi.d CO1,CO1, 2 * SIZE -+ addi.d CO2,CO2, 2 * SIZE -+#endif -+MTC c11, $r0 -+#ifdef RT -+ slli.d TEMP, K, ZBASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ addi.d I, I, -1 -+ blt $r0, I, .L21 -+ .align 3 -+ -+.L29: -+#ifdef LN -+ slli.d TEMP, K, 1 + ZBASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, 
KK, 2 -+#endif -+#ifdef RT -+ addi.d KK, KK, -2 -+#endif -+ .align 3 -+ -+.L30: -+ andi J, N, 1 -+ bge $r0, J, .L999 -+#ifdef RT -+ slli.d TEMP, K, ZBASE_SHIFT -+ sub.d B, B, TEMP -+ sub.d C, C, LDC -+#endif -+MTC c11, $r0 -+ move CO1, C -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO1, LDC -+#endif -+ move I, M -+ bge $r0, I, .L39 -+ .align 3 -+ -+.L31: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, B, 0 * SIZE -+ MOV c31, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c41, c11 -+ LD b2, B, 1 * SIZE -+ MOV c12, c11 -+ srai.d L, KK, 2 -+ MOV c22, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c32, c11 -+ LD b3, B, 4 * SIZE -+ MOV c42, c11 -+move BO, B -+ bge $r0, L, .L35 -+#else -+#ifdef LN -+ slli.d TEMP, K, ZBASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d TEMP, KK, ZBASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c31, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c41, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c12, c11 -+ srai.d L, TEMP, 2 -+ MOV c22, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c32, c11 -+ LD b3, BO, 4 * SIZE -+MOV c42, c11 -+ bge $r0, L, .L35 -+#endif -+ .align 3 -+.L32: -+ MADD1 c11, b1, a1, c11 -+ LD b4, BO, 3 * SIZE -+ MADD3 c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD a2, AO, 3 * SIZE -+ MADD1 c11, b1, a1, c11 -+ LD b2, BO, 5 * SIZE -+ MADD3 c21, b4, a1, c21 -+ LD a1, AO, 8 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD4 c22, b4, a2, c22 -+ LD a2, AO, 5 * SIZE -+ MADD1 c11, b3, a3, c11 -+ LD b4, BO, 7 * SIZE -+ MADD3 c21, b2, a3, c21 -+ LD a3, AO, 6 * SIZE -+ MADD2 c12, b3, a2, c12 -+ LD b3, BO, 6 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD a2, AO, 7 * SIZE -+ MADD1 c11, b3, a3, c11 -+ LD b2, BO, 9 * SIZE -+ MADD3 c21, b4, a3, 
c21 -+ LD a3, AO, 12 * SIZE -+ MADD2 c12, b3, a2, c12 -+ LD b3, BO, 12 * SIZE -+ MADD4 c22, b4, a2, c22 -+ LD a2, AO, 9 * SIZE -+ addi.d AO, AO, 8 * SIZE -+ addi.d L, L, -1 -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L32 -+ .align 3 -+ -+.L35: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L38 -+ .align 3 -+.L36: -+ MADD1 c11, b1, a1, c11 -+ addi.d L, L, -1 -+ MADD3 c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD a2, AO, 3 * SIZE -+ LD b2, BO, 3 * SIZE -+ addi.d BO, BO, 2 * SIZE -+addi.d AO, AO, 2 * SIZE -+ blt $r0, L, .L36 -+.L38: -+ ADD c11, c11, c22 -+ ADD c12, c12, c21 -+#if defined(LN) || defined(RT) -+ addi.d TEMP, KK, -1 -+ slli.d TEMP, TEMP, ZBASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+#endif -+#if defined(RN) || defined(RT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c12, BO, 1 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+#endif -+#ifdef LN -+ addi.d CO1,CO1, -2 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+#ifndef LN -+ addi.d CO1,CO1, 2 * SIZE -+#endif -+MTC c11, $r0 -+#ifdef RT -+ slli.d TEMP, K, ZBASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d TEMP, TEMP, ZBASE_SHIFT -+ add.d AO, AO, TEMP -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 
-+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ addi.d I, I, -1 -+ blt $r0, I, .L31 -+ .align 3 -+ -+.L39: -+#ifdef LN -+ slli.d TEMP, K, ZBASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 1 -+#endif -+#ifdef RT -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 16 -+ LDARG $r26, $sp, 24 -+ LDARG $r27, $sp, 32 -+ LDARG $r28, $sp, 40 -+ fld.d $f24, $sp, 48 -+ fld.d $f25, $sp, 56 -+ fld.d $f26, $sp, 64 -+ fld.d $f27, $sp, 72 -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 88 -+ fld.d $f19, $sp, 96 -+ fld.d $f20, $sp, 104 -+ fld.d $f21, $sp, 112 -+#endif -+ addi.d $sp, $sp, 128 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ -+ EPILOGUE -diff --git a/kernel/loongarch64/ztrsm_kernel_RT.S b/kernel/loongarch64/ztrsm_kernel_RT.S -new file mode 100644 -index 0000000..e9f0436 ---- /dev/null -+++ b/kernel/loongarch64/ztrsm_kernel_RT.S -@@ -0,0 +1,1343 @@ -+/*************************************************************************** -+Copyright (c) 2021, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+ -+#define M $r4 -+#define N $r5 -+#define K $r6 -+#define A $r7 -+#define B $r8 -+#define C $r9 -+#define LDC $r10 -+#define OFFSET $r11 -+ -+#define AO $r12 -+#define BO $r13 -+#define I $r17 -+#define J $r18 -+#define L $r25 -+#define CO1 $r14 -+#define CO2 $r15 -+#define CO3 $r23 -+#define CO4 $r24 -+#define KK $r26 -+#define TEMP $r27 -+#define AORIG $r28 -+#define a1 $f22 -+#define a2 $f8 -+#define a3 $f26 -+#define a4 $f27 -+#define b1 $f23 -+#define b2 $f9 -+#define b3 $f10 -+#define b4 $f11 -+#define b5 $f12 -+#define b6 $f13 -+#define b7 $f14 -+#define b8 $f15 -+#define a5 b8 -+#define c11 $f16 -+#define c12 $f17 -+#define c21 $f0 -+#define c22 $f1 -+#define c31 $f2 -+#define c32 $f3 -+#define c41 $f4 -+#define c42 $f5 -+#define c51 $f6 -+#define c52 $f7 -+#define c61 $f18 -+#define c62 $f19 -+#define c71 $f20 -+#define c72 $f21 -+#define c81 $f24 -+#define c82 $f25 -+ -+#ifndef CONJ -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 MADD -+#define MADD4 NMSUB -+#define MADD5 MSUB -+#define MADD6 MADD -+#define MADD7 NMSUB -+#define MADD8 MADD -+#else -+#if 
defined(LN) || defined(LT) -+#define MADD1 MADD -+#define MADD2 NMSUB -+#define MADD3 MADD -+#define MADD4 MADD -+#else -+#define MADD1 MADD -+#define MADD2 MADD -+#define MADD3 NMSUB -+#define MADD4 MADD -+#endif -+#define MADD5 MADD -+#define MADD6 MSUB -+#define MADD7 MADD -+#define MADD8 NMSUB -+#endif -+ -+ PROLOGUE -+ -+ addi.d $sp, $sp, -128 -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 -+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ SDARG $r28, $sp, 40 -+ fst.d $f24, $sp, 48 -+ fst.d $f25, $sp, 56 -+ fst.d $f26, $sp, 64 -+ fst.d $f27, $sp, 72 -+#ifndef __64BIT__ -+ fst.d $f18, $sp, 88 -+ fst.d $f19, $sp, 96 -+ fst.d $f20, $sp, 104 -+ fst.d $f21, $sp, 112 -+#endif -+ slli.d LDC, LDC, ZBASE_SHIFT -+#ifdef LN -+ mul.w TEMP, M, K -+ slli.d TEMP, TEMP, ZBASE_SHIFT -+ add.d A, A, TEMP -+ slli.d TEMP, M, ZBASE_SHIFT -+ add.d C, C, TEMP -+#endif -+#ifdef RN -+ sub.d KK, $r0, OFFSET -+#endif -+#ifdef RT -+ mul.w TEMP, N, K -+ slli.d TEMP, TEMP, ZBASE_SHIFT -+ add.d B, B, TEMP -+ mul.w TEMP, N, LDC -+ add.d C, C, TEMP -+ sub.d KK, N, OFFSET -+#endif -+ andi J, N, 1 -+ bge $r0, J, .L20 -+#ifdef RT -+ slli.d TEMP, K, ZBASE_SHIFT -+ sub.d B, B, TEMP -+ sub.d C, C, LDC -+#endif -+MTC c11, $r0 -+ move CO1, C -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO1, LDC -+#endif -+ move I, M -+ bge $r0, I, .L39 -+ .align 3 -+ -+.L31: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, B, 0 * SIZE -+ MOV c31, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c41, c11 -+ LD b2, B, 1 * SIZE -+ MOV c12, c11 -+ srai.d L, KK, 2 -+ MOV c22, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c32, c11 -+ LD b3, B, 4 * SIZE -+ MOV c42, c11 -+move BO, B -+ bge $r0, L, .L35 -+#else -+#ifdef LN -+ slli.d TEMP, K, ZBASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d TEMP, KK, ZBASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, 
TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c31, c11 -+ LD a2, AO, 1 * SIZE -+ MOV c41, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c12, c11 -+ srai.d L, TEMP, 2 -+ MOV c22, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c32, c11 -+ LD b3, BO, 4 * SIZE -+MOV c42, c11 -+ bge $r0, L, .L35 -+#endif -+ .align 3 -+.L32: -+ MADD1 c11, b1, a1, c11 -+ LD b4, BO, 3 * SIZE -+ MADD3 c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD a2, AO, 3 * SIZE -+ MADD1 c11, b1, a1, c11 -+ LD b2, BO, 5 * SIZE -+ MADD3 c21, b4, a1, c21 -+ LD a1, AO, 8 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD4 c22, b4, a2, c22 -+ LD a2, AO, 5 * SIZE -+ MADD1 c11, b3, a3, c11 -+ LD b4, BO, 7 * SIZE -+ MADD3 c21, b2, a3, c21 -+ LD a3, AO, 6 * SIZE -+ MADD2 c12, b3, a2, c12 -+ LD b3, BO, 6 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD a2, AO, 7 * SIZE -+ MADD1 c11, b3, a3, c11 -+ LD b2, BO, 9 * SIZE -+ MADD3 c21, b4, a3, c21 -+ LD a3, AO, 12 * SIZE -+ MADD2 c12, b3, a2, c12 -+ LD b3, BO, 12 * SIZE -+ MADD4 c22, b4, a2, c22 -+ LD a2, AO, 9 * SIZE -+ addi.d AO, AO, 8 * SIZE -+ addi.d L, L, -1 -+addi.d BO, BO, 8 * SIZE -+ blt $r0, L, .L32 -+ .align 3 -+ -+.L35: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L38 -+ .align 3 -+.L36: -+ MADD1 c11, b1, a1, c11 -+ addi.d L, L, -1 -+ MADD3 c21, b2, a1, c21 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 2 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD a2, AO, 3 * SIZE -+ LD b2, BO, 3 * SIZE -+ addi.d BO, BO, 2 * SIZE -+addi.d AO, AO, 2 * SIZE -+ blt $r0, L, .L36 -+.L38: -+ ADD c11, c11, c22 -+ ADD c12, c12, c21 -+#if defined(LN) || defined(RT) -+ addi.d TEMP, KK, -1 -+ slli.d TEMP, TEMP, ZBASE_SHIFT -+ add.d AO, AORIG, TEMP -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#else -+ LD b1, AO, 0 * 
SIZE -+ LD b2, AO, 1 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+#endif -+#if defined(RN) || defined(RT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c12, BO, 1 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+#endif -+#ifdef LN -+ addi.d CO1,CO1, -2 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+#ifndef LN -+ addi.d CO1,CO1, 2 * SIZE -+#endif -+MTC c11, $r0 -+#ifdef RT -+ slli.d TEMP, K, ZBASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d TEMP, TEMP, ZBASE_SHIFT -+ add.d AO, AO, TEMP -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ addi.d I, I, -1 -+ blt $r0, I, .L31 -+ .align 3 -+ -+.L39: -+#ifdef LN -+ slli.d TEMP, K, ZBASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 1 -+#endif -+#ifdef RT -+ addi.d KK, KK, -1 -+#endif -+ .align 3 -+ -+.L20: -+ andi J, N, 2 -+ bge $r0, J, .L30 -+#ifdef RT -+ slli.d TEMP, K, 1 + ZBASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 1 -+ sub.d C, C, TEMP -+#endif -+MTC c11, $r0 -+ move CO1, C -+ add.d CO2, C, LDC -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO2, LDC -+#endif -+ move I, M -+ bge $r0, I, .L29 -+ .align 3 -+ -+.L21: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, B, 0 * SIZE -+ MOV c31, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c41, c11 -+ LD b2, B, 1 * SIZE -+ srai.d L, 
KK, 2 -+ LD b3, B, 2 * SIZE -+ MOV c12, c11 -+ LD b4, B, 3 * SIZE -+ MOV c22, c11 -+ LD b5, B, 4 * SIZE -+ MOV c32, c11 -+ MOV c42, c11 -+move BO, B -+ bge $r0, L, .L25 -+#else -+#ifdef LN -+ slli.d TEMP, K, ZBASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, ZBASE_SHIFT -+ slli.d TEMP, KK, 1 + ZBASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c21, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c31, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c41, c11 -+ LD b2, BO, 1 * SIZE -+ srai.d L, TEMP, 2 -+ LD b3, BO, 2 * SIZE -+ MOV c12, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c22, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c32, c11 -+MOV c42, c11 -+ bge $r0, L, .L25 -+#endif -+ .align 3 -+.L22: -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c11, b5, a1, c11 -+ LD a2, AO, 3 * SIZE -+ MADD3 c21, b2, a1, c21 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ LD a1, AO, 8 * SIZE -+ MADD2 c12, b5, a2, c12 -+ LD b5, BO, 12 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 9 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 10 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 11 * SIZE -+ MADD1 c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD3 c21, b2, a3, c21 -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ LD a3, AO, 6 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD1 c11, b5, a3, c11 -+ LD a2, AO, 7 * SIZE -+ MADD3 c21, b2, a3, c21 -+ addi.d AO, AO, 8 * SIZE -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ LD a3, AO, 4 * SIZE -+ MADD2 c12, b5, a2, c12 -+ 
LD b5, BO, 20 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 17 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 18 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 19 * SIZE -+addi.d BO, BO, 16 * SIZE -+ blt $r0, L, .L22 -+ .align 3 -+ -+.L25: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L28 -+ .align 3 -+.L26: -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD1 c31, b3, a1, c31 -+ addi.d BO, BO, 4 * SIZE -+ MADD3 c41, b4, a1, c41 -+ LD a1, AO, 2 * SIZE -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 0 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 1 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 2 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 3 * SIZE -+addi.d AO, AO, 2 * SIZE -+ blt $r0, L, .L26 -+.L28: -+ ADD c11, c11, c22 -+ ADD c12, c12, c21 -+ ADD c31, c31, c42 -+ ADD c32, c32, c41 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -2 -+#endif -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c31, b3, c31 -+ SUB c32, b4, c32 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c31, b3, c31 -+ SUB c32, b4, c32 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MUL a3, b2, c32 -+ MUL a4, b2, c31 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+ MADD5 c31, c31, b1, a3 -+ MADD6 c32, c32, b1, a4 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+ NMSUB c31, 
c11, b3, c31 -+ MADD7 c32, c11, b4, c32 -+ MADD8 c31, c12, b4, c31 -+ NMSUB c32, c12, b3, c32 -+ LD b3, BO, 6 * SIZE -+ LD b4, BO, 7 * SIZE -+ MUL a1, b4, c32 -+ MUL a2, b4, c31 -+ MADD5 c31, c31, b3, a1 -+ MADD6 c32, c32, b3, a2 -+#endif -+#ifdef RT -+ LD b5, BO, 6 * SIZE -+ LD b6, BO, 7 * SIZE -+ LD b7, BO, 4 * SIZE -+ LD b8, BO, 5 * SIZE -+ MUL a1, b6, c32 -+ MUL a2, b6, c31 -+ MADD5 c31, c31, b5, a1 -+ MADD6 c32, c32, b5, a2 -+ NMSUB c11, c31, b7, c11 -+ MADD7 c12, c31, b8, c12 -+ MADD8 c11, c32, b8, c11 -+ NMSUB c12, c32, b7, c12 -+ LD b7, BO, 0 * SIZE -+ LD b8, BO, 1 * SIZE -+ MUL a1, b8, c12 -+ MUL a2, b8, c11 -+ MADD5 c11, c11, b7, a1 -+ MADD6 c12, c12, b7, a2 -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c12, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c32, BO, 3 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c32, AO, 3 * SIZE -+#endif -+#ifdef LN -+ addi.d CO1,CO1, -2 * SIZE -+ addi.d CO2,CO2, -2 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c31, CO2, 0 * SIZE -+ ST c32, CO2, 1 * SIZE -+#ifndef LN -+ addi.d CO1,CO1, 2 * SIZE -+ addi.d CO2,CO2, 2 * SIZE -+#endif -+MTC c11, $r0 -+#ifdef RT -+ slli.d TEMP, K, ZBASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+ addi.d I, I, -1 -+ blt $r0, I, .L21 -+ .align 3 -+ -+.L29: -+#ifdef LN -+ slli.d TEMP, K, 1 + ZBASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 2 -+#endif -+#ifdef RT -+ addi.d KK, KK, -2 -+#endif -+ .align 3 -+ -+.L30: -+ srai.d J, N, 2 -+nop -+ bge $r0, J, .L999 -+.L10: -+#ifdef RT -+ slli.d TEMP, K, 2 + ZBASE_SHIFT -+ sub.d B, B, TEMP -+ slli.d TEMP, LDC, 2 -+ sub.d C, C, TEMP 
-+#endif -+ move CO1, C -+MTC c11, $r0 -+ add.d CO2, C, LDC -+ add.d CO3, CO2, LDC -+ addi.d J, J, -1 -+ add.d CO4, CO3, LDC -+ MOV c21, c11 -+ MOV c31, c11 -+ MOV c41, c11 -+ MOV c51, c11 -+ move I, M -+#ifdef LN -+ add.d KK, M, OFFSET -+#endif -+#ifdef LT -+ move KK, OFFSET -+#endif -+#if defined(LN) || defined(RT) -+ move AORIG, A -+#else -+ move AO, A -+#endif -+#ifndef RT -+ add.d C, CO4, LDC -+#endif -+MOV c61, c11 -+ bge $r0, I, .L19 -+ .align 3 -+ -+.L11: -+#if defined(LT) || defined(RN) -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, B, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, B, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, KK, 2 -+ MOV c32, c11 -+ LD b3, B, 2 * SIZE -+ MOV c42, c11 -+ LD b4, B, 3 * SIZE -+ MOV c52, c11 -+ LD b5, B, 4 * SIZE -+ MOV c62, c11 -+ LD b6, B, 8 * SIZE -+ MOV c72, c11 -+ LD b7, B, 12 * SIZE -+ MOV c82, c11 -+move BO, B -+ bge $r0, L, .L15 -+#else -+#ifdef LN -+ slli.d TEMP, K, ZBASE_SHIFT -+ sub.d AORIG, AORIG, TEMP -+#endif -+ slli.d L, KK, ZBASE_SHIFT -+ slli.d TEMP, KK, 2 + ZBASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+ sub.d TEMP, K, KK -+ LD a1, AO, 0 * SIZE -+ MOV c71, c11 -+ LD b1, BO, 0 * SIZE -+ MOV c81, c11 -+ LD a3, AO, 4 * SIZE -+ MOV c12, c11 -+ LD b2, BO, 1 * SIZE -+ MOV c22, c11 -+ srai.d L, TEMP, 2 -+ MOV c32, c11 -+ LD b3, BO, 2 * SIZE -+ MOV c42, c11 -+ LD b4, BO, 3 * SIZE -+ MOV c52, c11 -+ LD b5, BO, 4 * SIZE -+ MOV c62, c11 -+ LD b6, BO, 8 * SIZE -+ MOV c72, c11 -+ LD b7, BO, 12 * SIZE -+ MOV c82, c11 -+ bge $r0, L, .L15 -+#endif -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ addi.d L, L, -1 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ bge $r0, L, .L13 -+ .align 3 -+.L12: -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c51, b5, a1, c51 -+ MADD3 c61, b2, a1, c61 -+ LD a4, AO, 2 
* SIZE -+ MADD1 c71, b3, a1, c71 -+ MADD3 c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD1 c51, b7, a4, c51 -+ MADD3 c61, b2, a4, c61 -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD1 c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD3 c21, b2, a3, c21 -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD1 c51, b5, a3, c51 -+ MADD3 c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD1 c71, b3, a3, c71 -+ MADD3 c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ addi.d L, L, -1 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD1 c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD3 
c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ blt $r0, L, .L12 -+ .align 3 -+ -+.L13: -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 16 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c51, b5, a1, c51 -+ MADD3 c61, b2, a1, c61 -+ LD a4, AO, 2 * SIZE -+ MADD1 c71, b3, a1, c71 -+ MADD3 c81, b4, a1, c81 -+ LD a1, AO, 8 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 20 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 9 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 10 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 11 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 3 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 24 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 13 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 14 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 15 * SIZE -+ MADD1 c51, b7, a4, c51 -+ MADD3 c61, b2, a4, c61 -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 28 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 17 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 18 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 19 * SIZE -+ MADD1 c11, b1, a3, c11 -+ LD a2, AO, 5 * SIZE -+ MADD3 c21, b2, a3, c21 -+ MADD1 c31, b3, a3, c31 -+ MADD3 c41, b4, a3, c41 -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 32 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 21 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 22 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 23 * SIZE -+ MADD1 c51, b5, a3, c51 -+ 
MADD3 c61, b2, a3, c61 -+ LD a4, AO, 6 * SIZE -+ MADD1 c71, b3, a3, c71 -+ MADD3 c81, b4, a3, c81 -+ LD a3, AO, 12 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 36 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 25 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 26 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 27 * SIZE -+ MADD1 c11, b6, a4, c11 -+ LD a2, AO, 7 * SIZE -+ MADD3 c21, b2, a4, c21 -+ MADD1 c31, b3, a4, c31 -+ MADD3 c41, b4, a4, c41 -+ MADD2 c12, b6, a2, c12 -+ LD b6, BO, 40 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 29 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 30 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 31 * SIZE -+ MADD1 c51, b7, a4, c51 -+ addi.d BO, BO, 32 * SIZE -+ MADD3 c61, b2, a4, c61 -+ addi.d AO, AO, 8 * SIZE -+ MADD1 c71, b3, a4, c71 -+ MADD3 c81, b4, a4, c81 -+ MADD2 c52, b7, a2, c52 -+ LD b7, BO, 12 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ .align 3 -+ -+.L15: -+#if defined(LT) || defined(RN) -+ andi L, KK, 3 -+#else -+ andi L, TEMP, 3 -+#endif -+ bge $r0, L, .L18 -+ .align 3 -+.L16: -+ MADD1 c11, b1, a1, c11 -+ LD a2, AO, 1 * SIZE -+ MADD3 c21, b2, a1, c21 -+ MADD1 c31, b3, a1, c31 -+ MADD3 c41, b4, a1, c41 -+ MADD2 c12, b1, a2, c12 -+ LD b1, BO, 8 * SIZE -+ MADD4 c22, b2, a2, c22 -+ LD b2, BO, 5 * SIZE -+ MADD2 c32, b3, a2, c32 -+ LD b3, BO, 6 * SIZE -+ MADD4 c42, b4, a2, c42 -+ LD b4, BO, 7 * SIZE -+ MADD1 c51, b5, a1, c51 -+ addi.d L, L, -1 -+ MADD3 c61, b2, a1, c61 -+ addi.d AO, AO, 2 * SIZE -+ MADD1 c71, b3, a1, c71 -+ addi.d BO, BO, 8 * SIZE -+ MADD3 c81, b4, a1, c81 -+ LD a1, AO, 0 * SIZE -+ MADD2 c52, b5, a2, c52 -+ LD b5, BO, 4 * SIZE -+ MADD4 c62, b2, a2, c62 -+ LD b2, BO, 1 * SIZE -+ MADD2 c72, b3, a2, c72 -+ LD b3, BO, 2 * SIZE -+ MADD4 c82, b4, a2, c82 -+ LD b4, BO, 3 * SIZE -+ blt $r0, L, .L16 -+.L18: -+ ADD c11, c11, c22 -+ ADD c12, c12, c21 -+ ADD c31, c31, c42 -+ ADD c32, c32, c41 -+ ADD c51, 
c51, c62 -+ ADD c52, c52, c61 -+ ADD c71, c71, c82 -+ ADD c72, c72, c81 -+#if defined(LN) || defined(RT) -+#ifdef LN -+ addi.d TEMP, KK, -1 -+#else -+ addi.d TEMP, KK, -4 -+#endif -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT -+ add.d AO, AORIG, L -+ add.d BO, B, TEMP -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c31, b3, c31 -+ SUB c32, b4, c32 -+ SUB c51, b5, c51 -+ SUB c52, b6, c52 -+ SUB c71, b7, c71 -+ SUB c72, b8, c72 -+#else -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ LD b3, AO, 2 * SIZE -+ LD b4, AO, 3 * SIZE -+ LD b5, AO, 4 * SIZE -+ LD b6, AO, 5 * SIZE -+ LD b7, AO, 6 * SIZE -+ LD b8, AO, 7 * SIZE -+ SUB c11, b1, c11 -+ SUB c12, b2, c12 -+ SUB c31, b3, c31 -+ SUB c32, b4, c32 -+ SUB c51, b5, c51 -+ SUB c52, b6, c52 -+ SUB c71, b7, c71 -+ SUB c72, b8, c72 -+#endif -+#if defined(LN) || defined(LT) -+ LD b1, AO, 0 * SIZE -+ LD b2, AO, 1 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MUL a3, b2, c32 -+ MUL a4, b2, c31 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+ MADD5 c31, c31, b1, a3 -+ MADD6 c32, c32, b1, a4 -+ MUL a1, b2, c52 -+ MUL a2, b2, c51 -+ MUL a3, b2, c72 -+ MUL a4, b2, c71 -+ MADD5 c51, c51, b1, a1 -+ MADD6 c52, c52, b1, a2 -+ MADD5 c71, c71, b1, a3 -+ MADD6 c72, c72, b1, a4 -+#endif -+#ifdef RN -+ LD b1, BO, 0 * SIZE -+ LD b2, BO, 1 * SIZE -+ LD b3, BO, 2 * SIZE -+ LD b4, BO, 3 * SIZE -+ LD b5, BO, 4 * SIZE -+ LD b6, BO, 5 * SIZE -+ LD b7, BO, 6 * SIZE -+ LD b8, BO, 7 * SIZE -+ MUL a1, b2, c12 -+ MUL a2, b2, c11 -+ MADD5 c11, c11, b1, a1 -+ MADD6 c12, c12, b1, a2 -+ NMSUB c31, c11, b3, c31 -+ MADD7 c32, c11, b4, c32 -+ NMSUB c51, c11, b5, c51 -+ MADD7 c52, c11, b6, c52 -+ NMSUB c71, c11, b7, c71 -+ MADD7 c72, c11, b8, c72 -+ MADD8 c31, c12, b4, c31 -+ NMSUB c32, c12, b3, c32 -+ MADD8 
c51, c12, b6, c51 -+ NMSUB c52, c12, b5, c52 -+ MADD8 c71, c12, b8, c71 -+ NMSUB c72, c12, b7, c72 -+ LD b3, BO, 10 * SIZE -+ LD b4, BO, 11 * SIZE -+ LD b5, BO, 12 * SIZE -+ LD b6, BO, 13 * SIZE -+ LD b7, BO, 14 * SIZE -+ LD b8, BO, 15 * SIZE -+ MUL a1, b4, c32 -+ MUL a2, b4, c31 -+ MADD5 c31, c31, b3, a1 -+ MADD6 c32, c32, b3, a2 -+ NMSUB c51, c31, b5, c51 -+ MADD7 c52, c31, b6, c52 -+ NMSUB c71, c31, b7, c71 -+ MADD7 c72, c31, b8, c72 -+ MADD8 c51, c32, b6, c51 -+ NMSUB c52, c32, b5, c52 -+ MADD8 c71, c32, b8, c71 -+ NMSUB c72, c32, b7, c72 -+ LD b5, BO, 20 * SIZE -+ LD b6, BO, 21 * SIZE -+ LD b7, BO, 22 * SIZE -+ LD b8, BO, 23 * SIZE -+ MUL a1, b6, c52 -+ MUL a2, b6, c51 -+ MADD5 c51, c51, b5, a1 -+ MADD6 c52, c52, b5, a2 -+ NMSUB c71, c51, b7, c71 -+ MADD7 c72, c51, b8, c72 -+ MADD8 c71, c52, b8, c71 -+ NMSUB c72, c52, b7, c72 -+ LD b7, BO, 30 * SIZE -+ LD b8, BO, 31 * SIZE -+ MUL a1, b8, c72 -+ MUL a2, b8, c71 -+ MADD5 c71, c71, b7, a1 -+ MADD6 c72, c72, b7, a2 -+#endif -+#ifdef RT -+ LD b1, BO, 30 * SIZE -+ LD b2, BO, 31 * SIZE -+ LD b3, BO, 28 * SIZE -+ LD b4, BO, 29 * SIZE -+ LD b5, BO, 26 * SIZE -+ LD b6, BO, 27 * SIZE -+ LD b7, BO, 24 * SIZE -+ LD b8, BO, 25 * SIZE -+ MUL a1, b2, c72 -+ MUL a2, b2, c71 -+ MADD5 c71, c71, b1, a1 -+ MADD6 c72, c72, b1, a2 -+ NMSUB c51, c71, b3, c51 -+ MADD7 c52, c71, b4, c52 -+ NMSUB c31, c71, b5, c31 -+ MADD7 c32, c71, b6, c32 -+ NMSUB c11, c71, b7, c11 -+ MADD7 c12, c71, b8, c12 -+ MADD8 c51, c72, b4, c51 -+ NMSUB c52, c72, b3, c52 -+ MADD8 c31, c72, b6, c31 -+ NMSUB c32, c72, b5, c32 -+ MADD8 c11, c72, b8, c11 -+ NMSUB c12, c72, b7, c12 -+ LD b3, BO, 20 * SIZE -+ LD b4, BO, 21 * SIZE -+ LD b5, BO, 18 * SIZE -+ LD b6, BO, 19 * SIZE -+ LD b7, BO, 16 * SIZE -+ LD b8, BO, 17 * SIZE -+ MUL a1, b4, c52 -+ MUL a2, b4, c51 -+ MADD5 c51, c51, b3, a1 -+ MADD6 c52, c52, b3, a2 -+ NMSUB c31, c51, b5, c31 -+ MADD7 c32, c51, b6, c32 -+ NMSUB c11, c51, b7, c11 -+ MADD7 c12, c51, b8, c12 -+ MADD8 c31, c52, b6, c31 -+ NMSUB c32, c52, b5, 
c32 -+ MADD8 c11, c52, b8, c11 -+ NMSUB c12, c52, b7, c12 -+ LD b5, BO, 10 * SIZE -+ LD b6, BO, 11 * SIZE -+ LD b7, BO, 8 * SIZE -+ LD b8, BO, 9 * SIZE -+ MUL a1, b6, c32 -+ MUL a2, b6, c31 -+ MADD5 c31, c31, b5, a1 -+ MADD6 c32, c32, b5, a2 -+ NMSUB c11, c31, b7, c11 -+ MADD7 c12, c31, b8, c12 -+ MADD8 c11, c32, b8, c11 -+ NMSUB c12, c32, b7, c12 -+ LD b7, BO, 0 * SIZE -+ LD b8, BO, 1 * SIZE -+ MUL a1, b8, c12 -+ MUL a2, b8, c11 -+ MADD5 c11, c11, b7, a1 -+ MADD6 c12, c12, b7, a2 -+#endif -+#if defined(LN) || defined(LT) -+ ST c11, BO, 0 * SIZE -+ ST c12, BO, 1 * SIZE -+ ST c31, BO, 2 * SIZE -+ ST c32, BO, 3 * SIZE -+ ST c51, BO, 4 * SIZE -+ ST c52, BO, 5 * SIZE -+ ST c71, BO, 6 * SIZE -+ ST c72, BO, 7 * SIZE -+#else -+ ST c11, AO, 0 * SIZE -+ ST c12, AO, 1 * SIZE -+ ST c31, AO, 2 * SIZE -+ ST c32, AO, 3 * SIZE -+ ST c51, AO, 4 * SIZE -+ ST c52, AO, 5 * SIZE -+ ST c71, AO, 6 * SIZE -+ ST c72, AO, 7 * SIZE -+#endif -+#ifdef LN -+ addi.d CO1,CO1, -2 * SIZE -+ addi.d CO2,CO2, -2 * SIZE -+ addi.d CO3,CO3, -2 * SIZE -+ addi.d CO4,CO4, -2 * SIZE -+#endif -+ ST c11, CO1, 0 * SIZE -+ ST c12, CO1, 1 * SIZE -+ ST c31, CO2, 0 * SIZE -+ ST c32, CO2, 1 * SIZE -+ ST c51, CO3, 0 * SIZE -+ ST c52, CO3, 1 * SIZE -+ ST c71, CO4, 0 * SIZE -+ ST c72, CO4, 1 * SIZE -+#ifndef LN -+ addi.d CO1,CO1, 2 * SIZE -+ addi.d CO2,CO2, 2 * SIZE -+ addi.d CO3,CO3, 2 * SIZE -+ addi.d CO4,CO4, 2 * SIZE -+#endif -+#ifdef RT -+ slli.d TEMP, K, ZBASE_SHIFT -+ add.d AORIG, AORIG, TEMP -+#endif -+#if defined(LT) || defined(RN) -+ sub.d TEMP, K, KK -+ slli.d L, TEMP, ZBASE_SHIFT -+ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT -+ add.d AO, AO, L -+ add.d BO, BO, TEMP -+#endif -+#ifdef LT -+ addi.d KK, KK, 1 -+#endif -+#ifdef LN -+ addi.d KK, KK, -1 -+#endif -+MTC c11, $r0 -+ addi.d I, I, -1 -+ MOV c21, c11 -+ MOV c31, c11 -+ MOV c41, c11 -+ MOV c51, c11 -+MOV c61, c11 -+ blt $r0, I, .L11 -+ .align 3 -+ -+.L19: -+#ifdef LN -+ slli.d TEMP, K, 2 + ZBASE_SHIFT -+ add.d B, B, TEMP -+#endif -+#if defined(LT) || 
defined(RN) -+ move B, BO -+#endif -+#ifdef RN -+ addi.d KK, KK, 4 -+#endif -+#ifdef RT -+ addi.d KK, KK, -4 -+#endif -+ blt $r0, J, .L10 -+ .align 3 -+ -+.L999: -+ LDARG $r23, $sp, 0 -+ LDARG $r24, $sp, 8 -+ LDARG $r25, $sp, 16 -+ LDARG $r26, $sp, 24 -+ LDARG $r27, $sp, 32 -+ LDARG $r28, $sp, 40 -+ fld.d $f24, $sp, 48 -+ fld.d $f25, $sp, 56 -+ fld.d $f26, $sp, 64 -+ fld.d $f27, $sp, 72 -+#ifndef __64BIT__ -+ fld.d $f18, $sp, 88 -+ fld.d $f19, $sp, 96 -+ fld.d $f20, $sp, 104 -+ fld.d $f21, $sp, 112 -+#endif -+ addi.d $sp, $sp, 128 -+ move $r4, $r17 -+ fmov.d $f0, $f22 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S -index a095e05..0ccc781 100644 ---- a/kernel/mips64/dnrm2.S -+++ b/kernel/mips64/dnrm2.S -@@ -68,6 +68,7 @@ - - #define ALPHA $f16 - #define max $f17 -+#define INF $f18 - - - PROLOGUE -@@ -86,6 +87,11 @@ - move XX, X - NOP - -+ //Init INF -+ lui TEMP, 0x7FF0 -+ dsll TEMP, TEMP, 32 -+ MTC1 TEMP, INF -+ - LD a1, 0 * SIZE(X) - daddiu N, N, -1 - -@@ -255,6 +261,9 @@ - div.d ALPHA, ALPHA, s1 - MOV max, s1 - -+ CMPEQ $fcc0, ALPHA, INF -+ bc1t $fcc0, .L999 -+ - MOV s1, a1 - MOV s2, a1 - MOV s3, a1 -diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c -index 1e846a6..5fbbeaa 100644 ---- a/kernel/setparam-ref.c -+++ b/kernel/setparam-ref.c -@@ -1004,6 +1004,34 @@ static void init_parameter(void) { - #endif - } - #else // (ARCH_MIPS64) -+#if (ARCH_LOONGARCH64) -+static void init_parameter(void) { -+ -+#ifdef BUILD_BFLOAT16 -+ TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; -+#endif -+ TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; -+ TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; -+ TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; -+ TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -+ -+#ifdef BUILD_BFLOAT16 -+ TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; -+#endif -+ TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; -+ TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; -+ TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; -+ TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; -+ -+#ifdef BUILD_BFLOAT16 -+ 
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; -+#endif -+ TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; -+ TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; -+ TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; -+ TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; -+} -+#else // (ARCH_LOONGARCH64) - #if (ARCH_POWER) - static void init_parameter(void) { - -@@ -1851,5 +1879,6 @@ static void init_parameter(void) { - } - #endif //POWER - #endif //ZARCH -+#endif //(ARCH_LOONGARCH64) - #endif //(ARCH_MIPS64) - #endif //(ARCH_ARM64) -diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile -new file mode 100644 -index 0000000..b87a2eb ---- /dev/null -+++ b/lapack/laswp/loongarch64/Makefile -@@ -0,0 +1,12 @@ -+TOPDIR = ../../.. -+include ../../../Makefile.system -+ -+ifndef LASWP -+LASWP = ../generic/laswp_k.c -+endif -+ -+ifndef ZLASWP -+ZLASWP = ../generic/zlaswp_k.c -+endif -+ -+include ../generic/Makefile -diff --git a/param.h b/param.h -index a35ce69..34dce01 100644 ---- a/param.h -+++ b/param.h -@@ -2689,6 +2689,122 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #define SYMV_P 16 - #endif - -+#if defined (LOONGSON3R5) -+#define SNUMOPT 2 -+#define DNUMOPT 2 -+ -+#define GEMM_DEFAULT_OFFSET_A 0 -+#define GEMM_DEFAULT_OFFSET_B 0 -+#define GEMM_DEFAULT_ALIGN 0x0ffffUL -+ -+#define SGEMM_DEFAULT_UNROLL_N 8 -+#define DGEMM_DEFAULT_UNROLL_N 4 -+#define QGEMM_DEFAULT_UNROLL_N 2 -+#define CGEMM_DEFAULT_UNROLL_N 4 -+#define ZGEMM_DEFAULT_UNROLL_N 4 -+#define XGEMM_DEFAULT_UNROLL_N 1 -+ -+#define SGEMM_DEFAULT_UNROLL_M 2 -+#define DGEMM_DEFAULT_UNROLL_M 16 -+#define QGEMM_DEFAULT_UNROLL_M 2 -+#define CGEMM_DEFAULT_UNROLL_M 1 -+#define ZGEMM_DEFAULT_UNROLL_M 1 -+#define XGEMM_DEFAULT_UNROLL_M 1 -+ -+#define SGEMM_DEFAULT_P 512 -+#define DGEMM_DEFAULT_P 32 -+#define QGEMM_DEFAULT_P 128 -+#define CGEMM_DEFAULT_P 128 -+#define ZGEMM_DEFAULT_P 128 -+#define XGEMM_DEFAULT_P 128 -+ -+#define SGEMM_DEFAULT_R 12288 -+#define DGEMM_DEFAULT_R 858 -+#define QGEMM_DEFAULT_R 4096 -+#define CGEMM_DEFAULT_R 4096 -+#define ZGEMM_DEFAULT_R 4096 -+#define XGEMM_DEFAULT_R 4096 -+ -+#define SGEMM_DEFAULT_Q 128 -+#define DGEMM_DEFAULT_Q 152 -+#define CGEMM_DEFAULT_Q 128 -+#define ZGEMM_DEFAULT_Q 128 -+#define ZGEMM_DEFAULT_Q 128 -+#define XGEMM_DEFAULT_Q 128 -+ -+#define SYMV_P 16 -+#endif -+ -+#ifdef LOONGSON2K1000 -+#define GEMM_DEFAULT_OFFSET_A 0 -+#define GEMM_DEFAULT_OFFSET_B 0 -+#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL -+ -+#define SGEMM_DEFAULT_UNROLL_M 2 -+#define SGEMM_DEFAULT_UNROLL_N 8 -+ -+#define DGEMM_DEFAULT_UNROLL_M 2 -+#define DGEMM_DEFAULT_UNROLL_N 8 -+ -+#define CGEMM_DEFAULT_UNROLL_M 1 -+#define CGEMM_DEFAULT_UNROLL_N 4 -+ -+#define ZGEMM_DEFAULT_UNROLL_M 1 -+#define ZGEMM_DEFAULT_UNROLL_N 4 -+ -+#define SGEMM_DEFAULT_P 128 -+#define DGEMM_DEFAULT_P 128 -+#define CGEMM_DEFAULT_P 96 -+#define ZGEMM_DEFAULT_P 64 -+ -+#define SGEMM_DEFAULT_Q 240 -+#define DGEMM_DEFAULT_Q 120 -+#define CGEMM_DEFAULT_Q 120 -+#define ZGEMM_DEFAULT_Q 120 -+ -+#define SGEMM_DEFAULT_R 12288 -+#define DGEMM_DEFAULT_R 8192 -+#define CGEMM_DEFAULT_R 4096 
-+#define ZGEMM_DEFAULT_R 4096 -+ -+#define SYMV_P 16 -+#endif -+ -+#ifdef LOONGSONGENERIC -+#define GEMM_DEFAULT_OFFSET_A 0 -+#define GEMM_DEFAULT_OFFSET_B 0 -+#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL -+ -+#define SGEMM_DEFAULT_UNROLL_M 2 -+#define SGEMM_DEFAULT_UNROLL_N 8 -+ -+#define DGEMM_DEFAULT_UNROLL_M 2 -+#define DGEMM_DEFAULT_UNROLL_N 8 -+ -+#define CGEMM_DEFAULT_UNROLL_M 1 -+#define CGEMM_DEFAULT_UNROLL_N 4 -+ -+#define ZGEMM_DEFAULT_UNROLL_M 1 -+#define ZGEMM_DEFAULT_UNROLL_N 4 -+ -+#define SGEMM_DEFAULT_P 128 -+#define DGEMM_DEFAULT_P 128 -+#define CGEMM_DEFAULT_P 96 -+#define ZGEMM_DEFAULT_P 64 -+ -+#define SGEMM_DEFAULT_Q 240 -+#define DGEMM_DEFAULT_Q 120 -+#define CGEMM_DEFAULT_Q 120 -+#define ZGEMM_DEFAULT_Q 120 -+ -+#define SGEMM_DEFAULT_R 12288 -+#define DGEMM_DEFAULT_R 8192 -+#define CGEMM_DEFAULT_R 4096 -+#define ZGEMM_DEFAULT_R 4096 -+ -+#define SYMV_P 16 -+#endif -+ - #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) - #define SNUMOPT 2 - #define DNUMOPT 2 --- -2.20.1 - diff --git a/openblas-CVE-2021-4048.patch b/openblas-CVE-2021-4048.patch new file mode 100644 index 0000000000000000000000000000000000000000..696c75b4c7c426a89659cdf9a456d52e0e77da7e --- /dev/null +++ b/openblas-CVE-2021-4048.patch @@ -0,0 +1,107 @@ +From 2be5ee3cca97a597f2ee2118808a2d5eacea050c Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 1 Oct 2021 11:17:21 +0200 +Subject: [PATCH 1/4] Fix out of bounds read in ?llarv (Reference-LAPACK PR + 625) + +--- + lapack-netlib/SRC/clarrv.f | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lapack-netlib/SRC/clarrv.f b/lapack-netlib/SRC/clarrv.f +index a45f55ac..26a9febc 100644 +--- a/lapack-netlib/SRC/clarrv.f ++++ b/lapack-netlib/SRC/clarrv.f +@@ -351,7 +351,7 @@ + * + * Quick return if possible + * +- IF( N.LE.0 ) THEN ++ IF( (N.LE.0) .OR. 
(M.LE.0) ) THEN + RETURN + END IF + * +-- +2.34.1 + + +From fe497efa0510466fd93578aaf9da1ad8ed4edbe7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 1 Oct 2021 11:18:20 +0200 +Subject: [PATCH 2/4] Fix out of bounds read in ?llarv (Reference-LAPACK PR + 625) + +--- + lapack-netlib/SRC/dlarrv.f | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lapack-netlib/SRC/dlarrv.f b/lapack-netlib/SRC/dlarrv.f +index 4a59a2bb..a1c6e9c9 100644 +--- a/lapack-netlib/SRC/dlarrv.f ++++ b/lapack-netlib/SRC/dlarrv.f +@@ -353,7 +353,7 @@ + * + * Quick return if possible + * +- IF( N.LE.0 ) THEN ++ IF( (N.LE.0).OR.(M.LE.0) ) THEN + RETURN + END IF + * +-- +2.34.1 + + +From ddb0ff5353637bb5f5ad060c9620e334c143e3d7 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 1 Oct 2021 11:19:07 +0200 +Subject: [PATCH 3/4] Fix out of bounds read in ?llarv (Reference-LAPACK PR + 625) + +--- + lapack-netlib/SRC/slarrv.f | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lapack-netlib/SRC/slarrv.f b/lapack-netlib/SRC/slarrv.f +index 04519fde..9448b2fd 100644 +--- a/lapack-netlib/SRC/slarrv.f ++++ b/lapack-netlib/SRC/slarrv.f +@@ -353,7 +353,7 @@ + * + * Quick return if possible + * +- IF( N.LE.0 ) THEN ++ IF( (N.LE.0).OR.(M.LE.0) ) THEN + RETURN + END IF + * +-- +2.34.1 + + +From 337b65133df174796794871b3988cd03426e6d41 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 1 Oct 2021 11:19:53 +0200 +Subject: [PATCH 4/4] Fix out of bounds read in ?llarv (Reference-LAPACK PR + 625) + +--- + lapack-netlib/SRC/zlarrv.f | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lapack-netlib/SRC/zlarrv.f b/lapack-netlib/SRC/zlarrv.f +index 23976dbe..8d10e3c2 100644 +--- a/lapack-netlib/SRC/zlarrv.f ++++ b/lapack-netlib/SRC/zlarrv.f +@@ -351,7 +351,7 @@ + * + * Quick return if possible + * +- IF( N.LE.0 ) THEN ++ IF( (N.LE.0).OR.(M.LE.0) ) THEN + RETURN + END IF + * +-- +2.34.1 + diff --git a/openblas.spec b/openblas.spec index 
1eda872bcbd9f52477a11288a16cb929e5ed451b..b85cdbf42100a88ca7e6c050e0b5435559302cf7 100644 --- a/openblas.spec +++ b/openblas.spec @@ -1,12 +1,9 @@ -%define anolis_release .0.2 %bcond_with system_lapack # Version of bundled lapack %global lapackver 3.9.1 -%ifnarch loongarch64 # Use strip from gcc-toolset-11 %global __strip "scl enable gcc-toolset-11 -- strip" -%endif # DO NOT "CLEAN UP" OR MODIFY THIS SPEC FILE WITHOUT ASKING THE # MAINTAINER FIRST! @@ -21,7 +18,7 @@ Name: openblas Version: 0.3.15 -Release: 3%{anolis_release}%{?dist} +Release: 4%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 Group: Development/Libraries License: BSD @@ -39,18 +36,13 @@ Patch3: openblas-0.3.15-noopt.patch Patch4: openblas-0.3.15-asmflags.patch # Remove optimization pragmas on ppc64le Patch5: openblas-0.3.15-power-optimize.patch -# Add optimization for LoongArch -Patch6: openblas-0.3.15-opt-loongarch64.patch -%ifarch loongarch64 -BuildRequires: gcc -BuildRequires: gcc-gfortran -%else +Patch6: openblas-CVE-2021-4048.patch + BuildRequires: scl-utils BuildRequires: gcc-toolset-11-gcc BuildRequires: gcc-toolset-11-gcc-gfortran BuildRequires: gcc-toolset-11-annobin-plugin-gcc -%endif BuildRequires: perl-devel BuildRequires: multilib-rpm-config @@ -64,9 +56,6 @@ BuildRequires: multilib-rpm-config %else %global execstack 1 %endif -%ifarch loongarch64 -%global execstack 0 -%endif %if %{execstack} BuildRequires: /usr/bin/execstack %endif @@ -251,7 +240,7 @@ cd OpenBLAS-%{version} %patch3 -p1 -b .noopt %patch4 -p1 -b .asmflags %patch5 -p1 -b .power-optimize -%patch6 -p1 -b .opt-loongarch64 +%patch6 -p1 -b .cve-2021-4048 # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -355,10 +344,8 @@ rm -rf netliblapack64 %endif %build -%ifnarch loongarch64 # Enable gcc-toolset-11 source scl_source enable gcc-toolset-11 -%endif %if !%{lapacke} LAPACKE="NO_LAPACKE=1" @@ -398,13 +385,6 @@ TARGET="TARGET=ARMV8 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" %ifarch s390x 
TARGET="TARGET=ZARCH_GENERIC DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" %endif -%ifarch loongarch64 -TARGET="TARGET=LOONGSONGENERIC DYNAMIC_ARCH=0" -USE_LOCKING="" -%else -USE_LOCKING="USE_LOCKING=1" -%endif - %if 0%{?rhel} == 5 # Gfortran too old to recognize -frecursive @@ -425,7 +405,7 @@ make -C Rblas $TARGET USE_THREAD=0 USEOPENMP=0 FC=gfortran CC=gcc COMMON_OP # Declare some necessary build flags COMMON="%{optflags} -fPIC" FCOMMON="$COMMON -frecursive -cpp" -make -C serial $TARGET USE_THREAD=0 $USE_LOCKING USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas" $AVX $LAPACKE INTERFACE64=0 +make -C serial $TARGET USE_THREAD=0 USE_LOCKING=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas" $AVX $LAPACKE INTERFACE64=0 make -C threaded $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblasp" $AVX $LAPACKE INTERFACE64=0 # USE_THREAD determines use of SMP, not of pthreads @@ -436,7 +416,7 @@ make -C openmp $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_O %if %build64 COMMON="%{optflags} -fPIC" FCOMMON="$COMMON -frecursive -fdefault-integer-8 -cpp" -make -C serial64 $TARGET USE_THREAD=0 $USE_LOCKING USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas64" $AVX $LAPACKE INTERFACE64=1 +make -C serial64 $TARGET USE_THREAD=0 USE_LOCKING=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas64" $AVX $LAPACKE INTERFACE64=1 make -C threaded64 $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblasp64" $AVX $LAPACKE INTERFACE64=1 COMMON="%{optflags} -fPIC -fopenmp -pthread" @@ -445,7 +425,7 @@ make -C openmp64 $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_O COMMON="%{optflags} -fPIC" FCOMMON="$COMMON -frecursive 
-fdefault-integer-8 -cpp" -make -C serial64_ $TARGET USE_THREAD=0 $USE_LOCKING USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ +make -C serial64_ $TARGET USE_THREAD=0 USE_LOCKING=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ make -C threaded64_ $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblasp64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ COMMON="%{optflags} -fPIC -fopenmp -pthread" @@ -454,10 +434,8 @@ make -C openmp64_ $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_ %endif %install -%ifnarch loongarch64 # Enable gcc-toolset-11 source scl_source enable gcc-toolset-11 -%endif rm -rf %{buildroot} # Install serial library and headers @@ -477,9 +455,6 @@ suffix="" %ifarch armv7hl suffix="_armv7" %endif -%ifarch loongarch64 -suffix="_loongsongeneric" -%endif slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a if [[ "$suffix" != "" ]]; then @@ -705,11 +680,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog -* Thu Oct 20 2022 Liwei Ge - 0.3.15-3.0.2 -- Fix build on loongarch64 - -* Fri Sep 30 2022 Liwei Ge - 0.3.15-3.0.1 -- Rebuild for loongarch +* Wed Jun 15 2022 Matej Mužila - 0.3.15-4 +- Fix out-of-bounds read in *larrv +- Resolves: CVE-2021-4048 * Fri Nov 12 2021 Nikola Forró - 0.3.15-3 - Fix missing header files in openblas-devel subpackage by enabling