diff --git a/openblas-0.3.15-opt-loongarch64.patch b/openblas-0.3.15-opt-loongarch64.patch new file mode 100644 index 0000000000000000000000000000000000000000..d83710a33156f4582a0ebecc30557058441ee54e --- /dev/null +++ b/openblas-0.3.15-opt-loongarch64.patch @@ -0,0 +1,31426 @@ +From b844ee9613bf88f20bbd78ff1fbad29740c99ba2 Mon Sep 17 00:00:00 2001 +From: Shiyou Yin +Date: Wed, 7 Sep 2022 18:06:33 +0800 +Subject: [PATCH] Add support for LoongArch64. + +include latest commit(fbfe1daf6ea71a50bc36cb29d0d27e0359926ef7) in branch develop. +--- + Makefile.loongarch64 | 3 + + Makefile.system | 14 + + TargetList.txt | 4 + + c_check | 53 +- + cmake/arch.cmake | 4 + + cmake/cc.cmake | 9 + + cmake/fc.cmake | 7 + + cmake/system_check.cmake | 4 +- + common.h | 6 +- + common_loongarch64.h | 199 ++ + common_macro.h | 3 +- + cpuid_loongarch64.c | 143 + + ctest.c | 4 + + driver/others/Makefile | 8 + + driver/others/dynamic_loongarch64.c | 128 + + getarch.c | 58 +- + kernel/loongarch64/KERNEL | 238 ++ + kernel/loongarch64/KERNEL.LOONGSON3R5 | 14 + + kernel/loongarch64/KERNEL.generic | 167 + + kernel/loongarch64/Makefile | 1 + + kernel/loongarch64/amax.S | 230 ++ + kernel/loongarch64/amin.S | 186 ++ + kernel/loongarch64/asum.S | 232 ++ + kernel/loongarch64/cnrm2.S | 159 + + kernel/loongarch64/copy.S | 225 ++ + kernel/loongarch64/dgemm_kernel_16x4.S | 4250 ++++++++++++++++++++++++ + kernel/loongarch64/dgemm_ncopy_16.S | 691 ++++ + kernel/loongarch64/dgemm_ncopy_4.S | 237 ++ + kernel/loongarch64/dgemm_tcopy_16.S | 710 ++++ + kernel/loongarch64/dgemm_tcopy_4.S | 270 ++ + kernel/loongarch64/dnrm2.S | 324 ++ + kernel/loongarch64/dot.S | 391 +++ + kernel/loongarch64/gemm_kernel.S | 1859 +++++++++++ + kernel/loongarch64/gemv_n.S | 531 +++ + kernel/loongarch64/gemv_t.S | 436 +++ + kernel/loongarch64/iamax.S | 233 ++ + kernel/loongarch64/iamin.S | 233 ++ + kernel/loongarch64/izamax.S | 217 ++ + kernel/loongarch64/izamin.S | 217 ++ + kernel/loongarch64/max.S | 174 + + kernel/loongarch64/min.S | 174 + + kernel/loongarch64/scal.S | 330 ++ + kernel/loongarch64/snrm2.S | 249 ++ + kernel/loongarch64/swap.S | 330 ++ + kernel/loongarch64/trsm_kernel_LN.S | 2863 ++++++++++++++++ + kernel/loongarch64/trsm_kernel_LT.S | 2854 ++++++++++++++++ + kernel/loongarch64/trsm_kernel_RT.S | 2850 ++++++++++++++++ + kernel/loongarch64/zamax.S | 190 ++ + kernel/loongarch64/zamin.S | 198 ++ + kernel/loongarch64/zasum.S | 158 + + kernel/loongarch64/zcopy.S | 217 ++ + kernel/loongarch64/zdot.S | 330 ++ + kernel/loongarch64/zgemm3m_kernel.S | 1359 ++++++++ + kernel/loongarch64/zgemm_kernel.S | 1047 ++++++ + kernel/loongarch64/zgemv_n.S | 648 ++++ + kernel/loongarch64/zgemv_t.S | 556 ++++ + kernel/loongarch64/znrm2.S | 304 ++ + kernel/loongarch64/zscal.S | 356 ++ + kernel/loongarch64/ztrsm_kernel_LT.S | 1344 ++++++++ + kernel/loongarch64/ztrsm_kernel_RT.S | 1343 ++++++++ + kernel/mips64/dnrm2.S | 9 + + kernel/setparam-ref.c | 29 + + lapack/laswp/loongarch64/Makefile | 12 + + param.h | 116 + + 64 files changed, 30708 insertions(+), 30 deletions(-) + create mode 100644 Makefile.loongarch64 + create mode 100644 common_loongarch64.h + create mode 100644 cpuid_loongarch64.c + create mode 100644 driver/others/dynamic_loongarch64.c + create mode 100644 kernel/loongarch64/KERNEL + create mode 100644 kernel/loongarch64/KERNEL.LOONGSON3R5 + create mode 100644 kernel/loongarch64/KERNEL.generic + create mode 100644 kernel/loongarch64/Makefile + create mode 100644 kernel/loongarch64/amax.S + create mode 100644 kernel/loongarch64/amin.S + create mode 100644 
kernel/loongarch64/asum.S + create mode 100644 kernel/loongarch64/cnrm2.S + create mode 100644 kernel/loongarch64/copy.S + create mode 100644 kernel/loongarch64/dgemm_kernel_16x4.S + create mode 100644 kernel/loongarch64/dgemm_ncopy_16.S + create mode 100644 kernel/loongarch64/dgemm_ncopy_4.S + create mode 100644 kernel/loongarch64/dgemm_tcopy_16.S + create mode 100644 kernel/loongarch64/dgemm_tcopy_4.S + create mode 100644 kernel/loongarch64/dnrm2.S + create mode 100644 kernel/loongarch64/dot.S + create mode 100644 kernel/loongarch64/gemm_kernel.S + create mode 100644 kernel/loongarch64/gemv_n.S + create mode 100644 kernel/loongarch64/gemv_t.S + create mode 100644 kernel/loongarch64/iamax.S + create mode 100644 kernel/loongarch64/iamin.S + create mode 100644 kernel/loongarch64/izamax.S + create mode 100644 kernel/loongarch64/izamin.S + create mode 100644 kernel/loongarch64/max.S + create mode 100644 kernel/loongarch64/min.S + create mode 100644 kernel/loongarch64/scal.S + create mode 100644 kernel/loongarch64/snrm2.S + create mode 100644 kernel/loongarch64/swap.S + create mode 100644 kernel/loongarch64/trsm_kernel_LN.S + create mode 100644 kernel/loongarch64/trsm_kernel_LT.S + create mode 100644 kernel/loongarch64/trsm_kernel_RT.S + create mode 100644 kernel/loongarch64/zamax.S + create mode 100644 kernel/loongarch64/zamin.S + create mode 100644 kernel/loongarch64/zasum.S + create mode 100644 kernel/loongarch64/zcopy.S + create mode 100644 kernel/loongarch64/zdot.S + create mode 100644 kernel/loongarch64/zgemm3m_kernel.S + create mode 100644 kernel/loongarch64/zgemm_kernel.S + create mode 100644 kernel/loongarch64/zgemv_n.S + create mode 100644 kernel/loongarch64/zgemv_t.S + create mode 100644 kernel/loongarch64/znrm2.S + create mode 100644 kernel/loongarch64/zscal.S + create mode 100644 kernel/loongarch64/ztrsm_kernel_LT.S + create mode 100644 kernel/loongarch64/ztrsm_kernel_RT.S + create mode 100644 lapack/laswp/loongarch64/Makefile + +diff --git a/Makefile.loongarch64 b/Makefile.loongarch64 +new file mode 100644 +index 0000000..05ea9c6 +--- /dev/null ++++ b/Makefile.loongarch64 +@@ -0,0 +1,3 @@ ++ifdef BINARY64 ++else ++endif +diff --git a/Makefile.system b/Makefile.system +index 80739dc..5aca7c0 100644 +--- a/Makefile.system ++++ b/Makefile.system +@@ -636,6 +636,10 @@ ifeq ($(ARCH), mips64) + DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 + endif + ++ifeq ($(ARCH), loongarch64) ++DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC ++endif ++ + ifeq ($(ARCH), zarch) + DYNAMIC_CORE = ZARCH_GENERIC + +@@ -772,6 +776,11 @@ NO_BINARY_MODE = 1 + BINARY_DEFINED = 1 + endif + ++ifeq ($(ARCH), loongarch64) ++NO_BINARY_MODE = 1 ++BINARY_DEFINED = 1 ++endif ++ + + # + # C Compiler dependent settings +@@ -842,6 +851,11 @@ ifeq ($(OSNAME), AIX) + BINARY_DEFINED = 1 + endif + ++ifeq ($(ARCH), loongarch64) ++CCOMMON_OPT += -march=loongarch64 -mabi=lp64 ++FCOMMON_OPT += -march=loongarch64 -mabi=lp64 ++endif ++ + endif + + ifndef BINARY_DEFINED +diff --git a/TargetList.txt b/TargetList.txt +index d199649..cd3e756 100644 +--- a/TargetList.txt ++++ b/TargetList.txt +@@ -109,3 +109,7 @@ Z14 + RISCV64_GENERIC + C910V + ++11.LOONGARCH64: ++LOONGSONGENERIC ++LOONGSON3R5 ++LOONGSON2K1000 +diff --git a/c_check b/c_check +index e24943a..030f5e6 100644 +--- a/c_check ++++ b/c_check +@@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/); + $os = Android if ($data =~ /OS_ANDROID/); + $os = Haiku if ($data =~ /OS_HAIKU/); + +-$architecture = x86 if ($data =~ /ARCH_X86/); +-$architecture = x86_64 if ($data =~ 
/ARCH_X86_64/); +-$architecture = power if ($data =~ /ARCH_POWER/); +-$architecture = mips if ($data =~ /ARCH_MIPS/); +-$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +-$architecture = alpha if ($data =~ /ARCH_ALPHA/); +-$architecture = sparc if ($data =~ /ARCH_SPARC/); +-$architecture = ia64 if ($data =~ /ARCH_IA64/); +-$architecture = arm if ($data =~ /ARCH_ARM/); +-$architecture = arm64 if ($data =~ /ARCH_ARM64/); +-$architecture = zarch if ($data =~ /ARCH_ZARCH/); +-$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); ++$architecture = x86 if ($data =~ /ARCH_X86/); ++$architecture = x86_64 if ($data =~ /ARCH_X86_64/); ++$architecture = power if ($data =~ /ARCH_POWER/); ++$architecture = mips if ($data =~ /ARCH_MIPS/); ++$architecture = mips64 if ($data =~ /ARCH_MIPS64/); ++$architecture = alpha if ($data =~ /ARCH_ALPHA/); ++$architecture = sparc if ($data =~ /ARCH_SPARC/); ++$architecture = ia64 if ($data =~ /ARCH_IA64/); ++$architecture = arm if ($data =~ /ARCH_ARM/); ++$architecture = arm64 if ($data =~ /ARCH_ARM64/); ++$architecture = zarch if ($data =~ /ARCH_ZARCH/); ++$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); ++$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); + + $defined = 0; + +@@ -143,6 +144,11 @@ if ($architecture eq "riscv64") { + $binary = 64; + } + ++if ($architecture eq "loongarch64") { ++ $defined = 1; ++ $binary = 64; ++} ++ + if ($compiler eq "PGI") { + $compiler_name .= " -tp p7" if ($binary eq "32"); + $compiler_name .= " -tp p7-64" if ($binary eq "64"); +@@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { + } + } + +-$architecture = x86 if ($data =~ /ARCH_X86/); +-$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +-$architecture = power if ($data =~ /ARCH_POWER/); +-$architecture = mips if ($data =~ /ARCH_MIPS/); +-$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +-$architecture = alpha if ($data =~ /ARCH_ALPHA/); +-$architecture = sparc if ($data =~ /ARCH_SPARC/); +-$architecture = ia64 if ($data =~ /ARCH_IA64/); +-$architecture = arm if ($data =~ /ARCH_ARM/); +-$architecture = arm64 if ($data =~ /ARCH_ARM64/); +-$architecture = zarch if ($data =~ /ARCH_ZARCH/); ++$architecture = x86 if ($data =~ /ARCH_X86/); ++$architecture = x86_64 if ($data =~ /ARCH_X86_64/); ++$architecture = power if ($data =~ /ARCH_POWER/); ++$architecture = mips if ($data =~ /ARCH_MIPS/); ++$architecture = mips64 if ($data =~ /ARCH_MIPS64/); ++$architecture = alpha if ($data =~ /ARCH_ALPHA/); ++$architecture = sparc if ($data =~ /ARCH_SPARC/); ++$architecture = ia64 if ($data =~ /ARCH_IA64/); ++$architecture = arm if ($data =~ /ARCH_ARM/); ++$architecture = arm64 if ($data =~ /ARCH_ARM64/); ++$architecture = zarch if ($data =~ /ARCH_ZARCH/); ++$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); + + $binformat = bin32; + $binformat = bin64 if ($data =~ /BINARY_64/); +diff --git a/cmake/arch.cmake b/cmake/arch.cmake +index 4451f9e..b1d18cc 100644 +--- a/cmake/arch.cmake ++++ b/cmake/arch.cmake +@@ -113,6 +113,10 @@ if (MIPS64) + set(NO_BINARY_MODE 1) + endif () + ++if (LOONGARCH64) ++ set(NO_BINARY_MODE 1) ++endif () ++ + if (${ARCH} STREQUAL "alpha") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +diff --git a/cmake/cc.cmake b/cmake/cc.cmake +index 7695215..d0b195c 100644 +--- a/cmake/cc.cmake ++++ b/cmake/cc.cmake +@@ -29,6 +29,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS + set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") + endif () + ++ if (LOONGARCH64) ++ if (BINARY64) 
++ set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") ++ else () ++ set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") ++ endif () ++ set(BINARY_DEFINED 1) ++ endif () ++ + if (CMAKE_SYSTEM_NAME STREQUAL "AIX") + set(BINARY_DEFINED 1) + endif () +diff --git a/cmake/fc.cmake b/cmake/fc.cmake +index fc1f9bb..6316645 100644 +--- a/cmake/fc.cmake ++++ b/cmake/fc.cmake +@@ -61,6 +61,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") + endif () + endif () ++ if (LOONGARCH64) ++ if (BINARY64) ++ set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") ++ else () ++ set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") ++ endif () ++ endif () + else () + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -m64") +diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake +index fdc79c8..8d0558c 100644 +--- a/cmake/system_check.cmake ++++ b/cmake/system_check.cmake +@@ -38,6 +38,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") + set(PPC 1) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") + set(MIPS64 1) ++elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") ++ set(LOONGARCH64 1) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + if (NOT BINARY) + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") +@@ -95,7 +97,7 @@ else() + endif () + + if (NOT BINARY) +- if (X86_64 OR ARM64 OR PPC OR MIPS64) ++ if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) + set(BINARY 64) + else () + set(BINARY 32) +diff --git a/common.h b/common.h +index ac79593..ff5254a 100644 +--- a/common.h ++++ b/common.h +@@ -449,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 + #include "common_mips.h" + #endif + +- ++ + #ifdef ARCH_RISCV64 + #include "common_riscv64.h" + #endif +@@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 + #include "common_zarch.h" + #endif + ++#ifdef ARCH_LOONGARCH64 ++#include "common_loongarch64.h" ++#endif ++ + #ifndef ASSEMBLER + #ifdef OS_WINDOWSSTORE + typedef char env_var_t[MAX_PATH]; +diff --git a/common_loongarch64.h b/common_loongarch64.h +new file mode 100644 +index 0000000..e15539b +--- /dev/null ++++ b/common_loongarch64.h +@@ -0,0 +1,199 @@ ++/***************************************************************************** ++Copyright (c) 2011-2020, The OpenBLAS Project ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++ ++ 1. Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ ++ 2. Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ 3. Neither the name of the OpenBLAS project nor the names of ++ its contributors may be used to endorse or promote products ++ derived from this software without specific prior written ++ permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++**********************************************************************************/ ++ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#ifndef COMMON_LOONGARCH64 ++#define COMMON_LOONGARCH64 ++ ++#define MB __sync_synchronize() ++#define WMB __sync_synchronize() ++#define RMB __sync_synchronize() ++ ++#define INLINE inline ++ ++#ifndef ASSEMBLER ++ ++static inline int blas_quickdivide(blasint x, blasint y){ ++ return x / y; ++} ++ ++#ifdef DOUBLE ++#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") ++#else ++#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") ++#endif ++ ++#define GET_IMAGE_CANCEL ++ ++#else ++ ++#ifdef DOUBLE ++#define LD fld.d ++#define ST fst.d ++#define MADD fmadd.d ++#define NMADD fnmadd.d ++#define MSUB fmsub.d ++#define NMSUB fnmsub.d ++#define ADD fadd.d ++#define SUB fsub.d ++#define MUL fmul.d ++#define MOV fmov.d ++#define CMOVT fsel ++#define MTC movgr2fr.d ++#define FABS fabs.d ++#define CMPEQ fcmp.ceq.d ++#define CMPLE fcmp.cle.d ++#define CMPLT fcmp.clt.d ++#define NEG fneg.d ++#else ++#define LD fld.s ++#define ST fst.s ++#define MADD fmadd.s ++#define NMADD fnmadd.s ++#define MSUB fmsub.s ++#define NMSUB fnmsub.s ++#define ADD fadd.s ++#define SUB fsub.s ++#define MUL fmul.s ++#define MOV fmov.s ++#define CMOVT fsel ++#define MTC movgr2fr.w ++#define FABS fabs.s ++#define CMPEQ fcmp.ceq.s ++#define CMPLE fcmp.cle.s ++#define CMPLT fcmp.clt.s ++#define NEG fneg.s ++#endif /* defined(DOUBLE) */ ++ ++#if defined(__64BIT__) && defined(USE64BITINT) ++#define LDINT ld.d ++#define LDARG ld.d ++#define SDARG st.d ++#elif defined(__64BIT__) && !defined(USE64BITINT) ++#define LDINT ld.w ++#define LDARG ld.d ++#define SDARG st.d ++#else ++#define LDINT ld.w ++#define LDARG ld.w ++#define SDARG st.w ++#endif ++ ++ ++#ifndef F_INTERFACE ++#define REALNAME ASMNAME ++#else ++#define REALNAME ASMFNAME ++#endif /* defined(F_INTERFACE) */ ++ ++#if defined(ASSEMBLER) && !defined(NEEDPARAM) ++ ++#define PROLOGUE \ ++ .text ;\ ++ .align 5 ;\ ++ .globl REALNAME ;\ ++ .type REALNAME, @function ;\ ++REALNAME: ;\ ++ ++#if defined(__linux__) && defined(__ELF__) ++#define GNUSTACK .section .note.GNU-stack,"",@progbits ++#else ++#define GNUSTACK ++#endif /* defined(__linux__) && defined(__ELF__) */ ++ ++#define EPILOGUE \ ++ .end REALNAME ;\ ++ GNUSTACK ++ ++#define PROFCODE ++ ++#define MOVT(dst, src, cc) \ ++ bceqz cc, 1f; \ ++ add.d dst, src, $r0; \ ++ 1: ++ ++#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ ++ ++#endif /* defined(ASSEMBLER) */ ++ ++#define SEEK_ADDRESS ++ ++#define BUFFER_SIZE ( 32 << 20) ++ ++#define PAGESIZE (16UL << 10) ++#define FIXED_PAGESIZE (16UL << 10) ++#define HUGE_PAGESIZE ( 2 << 20) ++ ++#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) ++ ++#ifndef MAP_ANONYMOUS ++#define MAP_ANONYMOUS MAP_ANON ++#endif ++ ++#endif +diff --git a/common_macro.h b/common_macro.h +index c6ea1bf..0136f18 100644 +--- a/common_macro.h ++++ b/common_macro.h +@@ -2490,7 +2490,8 @@ + #endif + + #ifndef ASSEMBLER +-#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) ++#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ ++|| defined(ARCH_LOONGARCH64) + extern BLASLONG gemm_offset_a; + extern BLASLONG gemm_offset_b; + extern BLASLONG sbgemm_p; +diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c +new file mode 100644 +index 0000000..ca07c7f +--- /dev/null ++++ b/cpuid_loongarch64.c +@@ -0,0 +1,143 @@ 
++/***************************************************************************** ++Copyright (c) 2011-2020, The OpenBLAS Project ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++ ++ 1. Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ ++ 2. Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ 3. Neither the name of the OpenBLAS project nor the names of ++ its contributors may be used to endorse or promote products ++ derived from this software without specific prior written ++ permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++**********************************************************************************/ ++ ++#include <stdint.h> ++ ++/* If LASX extension instructions supported, ++ * using core LOONGSON3R5 ++ * If only LSX extension instructions supported, ++ * using core LOONGSON2K1000 ++ * If neither LASX nor LSX extension instructions supported, ++ * using core LOONGSONGENERIC (As far as I know, there is no such ++ * CPU yet) ++ */ ++ ++#define CPU_GENERIC 0 ++#define CPU_LOONGSON3R5 1 ++#define CPU_LOONGSON2K1000 2 ++ ++#define LOONGARCH_CFG2 0x02 ++#define LOONGARCH_LASX 1<<7 ++#define LOONGARCH_LSX 1<<6 ++ ++static char *cpuname[] = { ++ "LOONGSONGENERIC", ++ "LOONGSON3R5", ++ "LOONGSON2K1000" ++}; ++ ++static char *cpuname_lower[] = { ++ "loongsongeneric", ++ "loongson3r5", ++ "loongson2k1000" ++}; ++ ++int detect(void) { ++#ifdef __linux ++ uint32_t reg = 0; ++ ++ __asm__ volatile ( ++ "cpucfg %0, %1 \n\t" ++ : "+&r"(reg) ++ : "r"(LOONGARCH_CFG2) ++ ); ++ ++ if (reg & LOONGARCH_LASX) ++ return CPU_LOONGSON3R5; ++ else if (reg & LOONGARCH_LSX) ++ return CPU_LOONGSON2K1000; ++ else ++ return CPU_GENERIC; ++#endif ++ return CPU_GENERIC; ++} ++ ++char *get_corename(void) { ++ return cpuname[detect()]; ++} ++ ++void get_architecture(void) { ++ printf("LOONGARCH64"); ++} ++ ++void get_subarchitecture(void) { ++ int d = detect(); ++ printf("%s", cpuname[d]); ++} ++ ++void get_subdirname(void) { ++ printf("loongarch64"); ++} ++ ++void get_cpuconfig(void) { ++ int d = detect(); ++ switch (d) { ++ case CPU_LOONGSON3R5: ++ printf("#define LOONGSON3R5\n"); ++ printf("#define L1_DATA_SIZE 65536\n"); ++ printf("#define L1_DATA_LINESIZE 64\n"); ++ printf("#define L2_SIZE 1048576\n"); ++ printf("#define L2_LINESIZE 64\n"); ++ printf("#define DTB_DEFAULT_ENTRIES 64\n"); ++ printf("#define DTB_SIZE 4096\n"); ++ printf("#define L2_ASSOCIATIVE 16\n"); ++ break; ++ ++ case CPU_LOONGSON2K1000: ++ printf("#define 
LOONGSON2K1000\n"); ++ printf("#define L1_DATA_SIZE 65536\n"); ++ printf("#define L1_DATA_LINESIZE 64\n"); ++ printf("#define L2_SIZE 262144\n"); ++ printf("#define L2_LINESIZE 64\n"); ++ printf("#define DTB_DEFAULT_ENTRIES 64\n"); ++ printf("#define DTB_SIZE 4096\n"); ++ printf("#define L2_ASSOCIATIVE 16\n"); ++ break; ++ ++ default: ++ printf("#define LOONGSONGENERIC\n"); ++ printf("#define L1_DATA_SIZE 65536\n"); ++ printf("#define L1_DATA_LINESIZE 64\n"); ++ printf("#define L2_SIZE 262144\n"); ++ printf("#define L2_LINESIZE 64\n"); ++ printf("#define DTB_DEFAULT_ENTRIES 64\n"); ++ printf("#define DTB_SIZE 4096\n"); ++ printf("#define L2_ASSOCIATIVE 16\n"); ++ break; ++ } ++} ++ ++void get_libname(void){ ++ int d = detect(); ++ printf("%s", cpuname_lower[d]); ++} +diff --git a/ctest.c b/ctest.c +index d674a8c..4f18918 100644 +--- a/ctest.c ++++ b/ctest.c +@@ -157,6 +157,10 @@ ARCH_ARM64 + ARCH_RISCV64 + #endif + ++#ifdef __loongarch64 ++ARCH_LOONGARCH64 ++#endif ++ + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) + HAVE_C11 + #endif +diff --git a/driver/others/Makefile b/driver/others/Makefile +index 4a421ef..e4e9ee1 100644 +--- a/driver/others/Makefile ++++ b/driver/others/Makefile +@@ -27,11 +27,15 @@ else + ifeq ($(ARCH),mips64) + COMMONOBJS += dynamic_mips64.$(SUFFIX) + else ++ifeq ($(ARCH),loongarch64) ++COMMONOBJS += dynamic_loongarch64.$(SUFFIX) ++else + COMMONOBJS += dynamic.$(SUFFIX) + endif + endif + endif + endif ++endif + else + COMMONOBJS += parameter.$(SUFFIX) + endif +@@ -99,11 +103,15 @@ else + ifeq ($(ARCH),mips64) + HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) + else ++ifeq ($(ARCH),loongarch64) ++HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX) ++else + HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) + endif + endif + endif + endif ++endif + else + HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) + endif +diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c +new file mode 100644 +index 0000000..52f8bcb +--- /dev/null ++++ b/driver/others/dynamic_loongarch64.c +@@ -0,0 +1,128 @@ ++/******************************************************************************* ++Copyright (c) 2022, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++ ++#include "common.h" ++ ++extern gotoblas_t gotoblas_LOONGSON3R5; ++extern gotoblas_t gotoblas_LOONGSON2K1000; ++extern gotoblas_t gotoblas_LOONGSONGENERIC; ++ ++extern void openblas_warning(int verbose, const char * msg); ++ ++#define NUM_CORETYPES 3 ++ ++static char *corename[] = { ++ "loongson3r5", ++ "loongson2k1000", ++ "loongsongeneric", ++ "unknown" ++}; ++ ++char *gotoblas_corename(void) { ++ if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0]; ++ if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1]; ++ if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2]; ++ return corename[NUM_CORETYPES]; ++} ++ ++static gotoblas_t *force_coretype(char *coretype) { ++ int i; ++ int found = -1; ++ char message[128]; ++ ++ for ( i=0 ; i < NUM_CORETYPES; i++) ++ { ++ if (!strncasecmp(coretype, corename[i], 20)) ++ { ++ found = i; ++ break; ++ } ++ } ++ ++ switch (found) ++ { ++ case 0: return (&gotoblas_LOONGSON3R5); ++ case 1: return (&gotoblas_LOONGSON2K1000); ++ case 2: return (&gotoblas_LOONGSONGENERIC); ++ } ++ snprintf(message, 128, "Core not found: %s\n", coretype); ++ openblas_warning(1, message); ++ return NULL; ++} ++ ++#define LASX_MASK 1<<7 ++#define LSX_MASK 1<<6 ++#define LOONGARCH_CFG2 0x02 ++ ++static gotoblas_t *get_coretype(void) { ++ int ret = 0; ++ __asm__ volatile ( ++ "cpucfg %0, %1 \n\t" ++ : "+&r"(ret) ++ : "r"(LOONGARCH_CFG2) ++ ); ++ ++ if (ret & LASX_MASK) ++ return &gotoblas_LOONGSON3R5; ++ else if (ret & LSX_MASK) ++ return &gotoblas_LOONGSON2K1000; ++ else ++ return &gotoblas_LOONGSONGENERIC; ++} ++ ++void gotoblas_dynamic_init(void) { ++ char coremsg[128]; ++ char coren[22]; ++ char *p; ++ ++ if (gotoblas) return; ++ ++ p = getenv("OPENBLAS_CORETYPE"); ++ if ( p ) ++ { ++ gotoblas = force_coretype(p); ++ } ++ else ++ { ++ gotoblas = get_coretype(); ++ } ++ ++ if (gotoblas && gotoblas->init) { ++ strncpy(coren, gotoblas_corename(), 20); ++ sprintf(coremsg, "Core: %s\n", coren); ++ openblas_warning(2, coremsg); ++ gotoblas -> init(); ++ } else { ++ openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); ++ exit(1); ++ } ++ ++} ++ ++void gotoblas_dynamic_quit(void) { ++ gotoblas = NULL; ++} +diff --git a/getarch.c b/getarch.c +index f48944f..5906458 100644 +--- a/getarch.c ++++ b/getarch.c +@@ -140,8 +140,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + /* #define FORCE_PPC440FP2 */ + /* #define FORCE_CELL */ + /* #define FORCE_SICORTEX */ +-/* #define FORCE_LOONGSON3R3 */ +-/* #define FORCE_LOONGSON3R4 */ ++/* #define FORCE_LOONGSON3R3 */ ++/* #define FORCE_LOONGSON3R4 */ ++/* #define FORCE_LOONGSON3R5 */ ++/* #define FORCE_LOONGSON2K1000 */ ++/* #define FORCE_LOONGSONGENERIC */ + /* #define FORCE_I6400 */ + /* #define FORCE_P6600 */ + /* #define FORCE_P5600 */ +@@ -842,6 +845,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #else + #endif + ++#ifdef FORCE_LOONGSON3R5 ++#define FORCE ++#define ARCHITECTURE "LOONGARCH" ++#define SUBARCHITECTURE "LOONGSON3R5" ++#define SUBDIRNAME "loongarch64" ++#define ARCHCONFIG "-DLOONGSON3R5 " \ ++ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ ++ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ ++ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " ++#define LIBNAME "loongson3r5" ++#define CORENAME "LOONGSON3R5" ++#else ++#endif ++ ++#ifdef FORCE_LOONGSON2K1000 ++#define FORCE ++#define ARCHITECTURE "LOONGARCH" ++#define SUBARCHITECTURE "LOONGSON2K1000" ++#define SUBDIRNAME "loongarch64" ++#define ARCHCONFIG "-DLOONGSON2K1000 " \ ++ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ ++ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ ++ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " ++#define LIBNAME "loongson2k1000" ++#define CORENAME "LOONGSON2K1000" ++#else ++#endif ++ ++#ifdef FORCE_LOONGSONGENERIC ++#define FORCE ++#define ARCHITECTURE "LOONGARCH" ++#define SUBARCHITECTURE "LOONGSONGENERIC" ++#define SUBDIRNAME "loongarch64" ++#define ARCHCONFIG "-DLOONGSONGENERIC " \ ++ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ ++ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ ++ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " ++#define LIBNAME "loongsongeneric" ++#define CORENAME "LOONGSONGENERIC" ++#else ++#endif ++ + #ifdef FORCE_I6400 + #define FORCE + #define ARCHITECTURE "MIPS" +@@ -1373,6 +1418,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #define OPENBLAS_SUPPORTED + #endif + ++#ifdef __loongarch64 ++#include "cpuid_loongarch64.c" ++#define OPENBLAS_SUPPORTED ++#endif ++ + #ifdef __riscv + #include "cpuid_riscv64.c" + #define OPENBLAS_SUPPORTED +@@ -1448,7 +1498,7 @@ int main(int argc, char *argv[]){ + #ifdef FORCE + printf("CORE=%s\n", CORENAME); + #else +-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) ++#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) + printf("CORE=%s\n", get_corename()); + #endif + #endif +@@ -1596,7 +1646,7 @@ printf("ELF_VERSION=2\n"); + #ifdef FORCE + printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); + #else +-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) ++#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) + printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); + #endif + #endif +diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL +new file mode 100644 +index 0000000..e5d145a +--- /dev/null ++++ b/kernel/loongarch64/KERNEL +@@ -0,0 +1,238 @@ ++ifndef SAXPYKERNEL ++SAXPYKERNEL = ../arm/axpy.c ++endif ++ ++ifndef DAXPYKERNEL ++DAXPYKERNEL = ../arm/axpy.c ++endif ++ ++ifndef CAXPYKERNEL ++CAXPYKERNEL = ../arm/zaxpy.c ++endif ++ ++ifndef ZAXPYKERNEL ++ZAXPYKERNEL = ../arm/zaxpy.c ++endif ++ ++ifndef SROTKERNEL ++SROTKERNEL = ../arm/rot.c ++endif ++ ++ifndef DROTKERNEL ++DROTKERNEL = ../arm/rot.c ++endif ++ ++ifndef CROTKERNEL ++CROTKERNEL = ../arm/zrot.c ++endif ++ ++ifndef ZROTKERNEL ++ZROTKERNEL = ../arm/zrot.c ++endif ++ ++ifndef CSWAPKERNEL ++CSWAPKERNEL = ../arm/zswap.c ++endif ++ ++ifndef ZSWAPKERNEL ++ZSWAPKERNEL = ../arm/zswap.c ++endif ++ ++ifndef 
SSUMKERNEL ++SSUMKERNEL = ../arm/sum.c ++endif ++ ++ifndef DSUMKERNEL ++DSUMKERNEL = ../arm/sum.c ++endif ++ ++ifndef CSUMKERNEL ++CSUMKERNEL = ../arm/zsum.c ++endif ++ ++ifndef ZSUMKERNEL ++ZSUMKERNEL = ../arm/zsum.c ++endif ++ ++ifndef ISMAXKERNEL ++ISMAXKERNEL = ../arm/imax.c ++endif ++ ++ifndef IDMAXKERNEL ++IDMAXKERNEL = ../arm/imax.c ++endif ++ ++ifndef ISMINKERNEL ++ISMINKERNEL = ../arm/imin.c ++endif ++ ++ifndef IDMINKERNEL ++IDMINKERNEL = ../arm/imin.c ++endif ++ ++ifndef SNRM2KERNEL ++SNRM2KERNEL = snrm2.S ++endif ++ ++ifndef DNRM2KERNEL ++DNRM2KERNEL = dnrm2.S ++endif ++ ++ifndef CNRM2KERNEL ++CNRM2KERNEL = cnrm2.S ++endif ++ ++ifndef ZNRM2KERNEL ++ZNRM2KERNEL = znrm2.S ++endif ++ ++ifndef SCABS_KERNEL ++SCABS_KERNEL = ../generic/cabs.c ++endif ++ ++ifndef DCABS_KERNEL ++DCABS_KERNEL = ../generic/cabs.c ++endif ++ ++ifndef QCABS_KERNEL ++QCABS_KERNEL = ../generic/cabs.c ++endif ++ ++ifndef LSAME_KERNEL ++LSAME_KERNEL = ../generic/lsame.c ++endif ++ ++ifndef SGEMMKERNEL ++SGEMMKERNEL = gemm_kernel.S ++SGEMMINCOPY = ../generic/gemm_ncopy_2.c ++SGEMMITCOPY = ../generic/gemm_tcopy_2.c ++SGEMMONCOPY = ../generic/gemm_ncopy_8.c ++SGEMMOTCOPY = ../generic/gemm_tcopy_8.c ++SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) ++SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) ++SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) ++SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ++endif ++ ++ifndef DGEMMKERNEL ++DGEMMKERNEL = gemm_kernel.S ++DGEMMINCOPY = ../generic/gemm_ncopy_2.c ++DGEMMITCOPY = ../generic/gemm_tcopy_2.c ++DGEMMONCOPY = ../generic/gemm_ncopy_8.c ++DGEMMOTCOPY = ../generic/gemm_tcopy_8.c ++DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) ++DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) ++DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) ++DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ++endif ++ ++ifndef CGEMMKERNEL ++CGEMMKERNEL = zgemm_kernel.S ++CGEMMINCOPY = ../generic/zgemm_ncopy_1.c ++CGEMMITCOPY = ../generic/zgemm_tcopy_1.c ++CGEMMONCOPY = ../generic/zgemm_ncopy_4.c ++CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c ++CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) ++CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ++CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) ++CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ++endif ++ ++ifndef ZGEMMKERNEL ++ZGEMMKERNEL = zgemm_kernel.S ++ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ++ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ++ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ++ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c ++ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ++ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ++ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ++ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ++endif ++ ++ifndef SGEMM_BETA ++SGEMM_BETA = ../generic/gemm_beta.c ++endif ++ifndef DGEMM_BETA ++DGEMM_BETA = ../generic/gemm_beta.c ++endif ++ifndef CGEMM_BETA ++CGEMM_BETA = ../generic/zgemm_beta.c ++endif ++ifndef ZGEMM_BETA ++ZGEMM_BETA = ../generic/zgemm_beta.c ++endif ++ ++ifndef STRSMKERNEL_LN ++STRSMKERNEL_LN = trsm_kernel_LN.S ++endif ++ ++ifndef STRSMKERNEL_LT ++STRSMKERNEL_LT = trsm_kernel_LT.S ++endif ++ ++ifndef STRSMKERNEL_RN ++STRSMKERNEL_RN = trsm_kernel_LT.S ++endif ++ ++ifndef STRSMKERNEL_RT ++STRSMKERNEL_RT = trsm_kernel_RT.S ++endif ++ ++ifndef DTRSMKERNEL_LN ++DTRSMKERNEL_LN = trsm_kernel_LN.S ++endif ++ ++ifndef DTRSMKERNEL_LT ++DTRSMKERNEL_LT = trsm_kernel_LT.S ++endif ++ ++ifndef DTRSMKERNEL_RN ++DTRSMKERNEL_RN = trsm_kernel_LT.S ++endif ++ ++ifndef DTRSMKERNEL_RT ++DTRSMKERNEL_RT = 
trsm_kernel_RT.S ++endif ++ ++ifndef CTRSMKERNEL_LN ++CTRSMKERNEL_LN = ztrsm_kernel_LT.S ++endif ++ ++ifndef CTRSMKERNEL_LT ++CTRSMKERNEL_LT = ztrsm_kernel_LT.S ++endif ++ ++ifndef CTRSMKERNEL_RN ++CTRSMKERNEL_RN = ztrsm_kernel_LT.S ++endif ++ ++ifndef CTRSMKERNEL_RT ++CTRSMKERNEL_RT = ztrsm_kernel_RT.S ++endif ++ ++ifndef ZTRSMKERNEL_LN ++ZTRSMKERNEL_LN = ztrsm_kernel_LT.S ++endif ++ ++ifndef ZTRSMKERNEL_LT ++ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ++endif ++ ++ifndef ZTRSMKERNEL_RN ++ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ++endif ++ ++ifndef ZTRSMKERNEL_RT ++ZTRSMKERNEL_RT = ztrsm_kernel_RT.S ++endif ++ ++ifndef CGEMM3MKERNEL ++CGEMM3MKERNEL = zgemm3m_kernel.S ++endif ++ ++ifndef ZGEMM3MKERNEL ++ZGEMM3MKERNEL = zgemm3m_kernel.S ++endif ++ ++DSDOTKERNEL = dot.S +diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 +new file mode 100644 +index 0000000..cda3590 +--- /dev/null ++++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 +@@ -0,0 +1,14 @@ ++DGEMMKERNEL = dgemm_kernel_16x4.S ++DGEMMINCOPY = dgemm_ncopy_16.S ++DGEMMITCOPY = dgemm_tcopy_16.S ++DGEMMONCOPY = dgemm_ncopy_4.S ++DGEMMOTCOPY = dgemm_tcopy_4.S ++DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) ++DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) ++DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) ++DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ++ ++DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ++DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ++DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ++DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic +new file mode 100644 +index 0000000..b772a6f +--- /dev/null ++++ b/kernel/loongarch64/KERNEL.generic +@@ -0,0 +1,167 @@ ++SGEMM_BETA = ../generic/gemm_beta.c ++DGEMM_BETA = ../generic/gemm_beta.c ++CGEMM_BETA = ../generic/zgemm_beta.c ++ZGEMM_BETA = ../generic/zgemm_beta.c ++ ++STRMMKERNEL = ../generic/trmmkernel_2x2.c ++DTRMMKERNEL = ../generic/trmmkernel_2x2.c ++CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ++ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ++ ++SGEMMKERNEL = ../generic/gemmkernel_2x2.c ++SGEMMONCOPY = ../generic/gemm_ncopy_2.c ++SGEMMOTCOPY = ../generic/gemm_tcopy_2.c ++SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) ++SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ++ ++DGEMMKERNEL = ../generic/gemmkernel_2x2.c ++DGEMMONCOPY = ../generic/gemm_ncopy_2.c ++DGEMMOTCOPY = ../generic/gemm_tcopy_2.c ++DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) ++DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ++ ++CGEMMKERNEL = ../generic/zgemmkernel_2x2.c ++CGEMMONCOPY = ../generic/zgemm_ncopy_2.c ++CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ++CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) ++CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ++ ++ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ++ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ++ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ++ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ++ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ++ ++STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ++STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ++STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ++STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ++ ++DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ++DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ++DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ++DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ++ ++CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ++CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ++CTRSMKERNEL_RN = 
../generic/trsm_kernel_RN.c ++CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ++ ++ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ++ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ++ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ++ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ++ ++#Pure C for other kernels ++SAMAXKERNEL = ../arm/amax.c ++DAMAXKERNEL = ../arm/amax.c ++CAMAXKERNEL = ../arm/zamax.c ++ZAMAXKERNEL = ../arm/zamax.c ++ ++SAMINKERNEL = ../arm/amin.c ++DAMINKERNEL = ../arm/amin.c ++CAMINKERNEL = ../arm/zamin.c ++ZAMINKERNEL = ../arm/zamin.c ++ ++SMAXKERNEL = ../arm/max.c ++DMAXKERNEL = ../arm/max.c ++ ++SMINKERNEL = ../arm/min.c ++DMINKERNEL = ../arm/min.c ++ ++ISAMAXKERNEL = ../arm/iamax.c ++IDAMAXKERNEL = ../arm/iamax.c ++ICAMAXKERNEL = ../arm/izamax.c ++IZAMAXKERNEL = ../arm/izamax.c ++ ++ISAMINKERNEL = ../arm/iamin.c ++IDAMINKERNEL = ../arm/iamin.c ++ICAMINKERNEL = ../arm/izamin.c ++IZAMINKERNEL = ../arm/izamin.c ++ ++ISMAXKERNEL = ../arm/imax.c ++IDMAXKERNEL = ../arm/imax.c ++ ++ISMINKERNEL = ../arm/imin.c ++IDMINKERNEL = ../arm/imin.c ++ ++SASUMKERNEL = ../arm/asum.c ++DASUMKERNEL = ../arm/asum.c ++CASUMKERNEL = ../arm/zasum.c ++ZASUMKERNEL = ../arm/zasum.c ++ ++SSUMKERNEL = ../arm/sum.c ++DSUMKERNEL = ../arm/sum.c ++CSUMKERNEL = ../arm/zsum.c ++ZSUMKERNEL = ../arm/zsum.c ++ ++ ++SAXPYKERNEL = ../arm/axpy.c ++DAXPYKERNEL = ../arm/axpy.c ++CAXPYKERNEL = ../arm/zaxpy.c ++ZAXPYKERNEL = ../arm/zaxpy.c ++ ++SCOPYKERNEL = ../arm/copy.c ++DCOPYKERNEL = ../arm/copy.c ++CCOPYKERNEL = ../arm/zcopy.c ++ZCOPYKERNEL = ../arm/zcopy.c ++ ++SDOTKERNEL = ../generic/dot.c ++DDOTKERNEL = ../arm/dot.c ++CDOTKERNEL = ../arm/zdot.c ++ZDOTKERNEL = ../arm/zdot.c ++ ++SNRM2KERNEL = ../arm/nrm2.c ++DNRM2KERNEL = ../arm/nrm2.c ++CNRM2KERNEL = ../arm/znrm2.c ++ZNRM2KERNEL = ../arm/znrm2.c ++ ++SROTKERNEL = ../arm/rot.c ++DROTKERNEL = ../arm/rot.c ++CROTKERNEL = ../arm/zrot.c ++ZROTKERNEL = ../arm/zrot.c ++ ++SSCALKERNEL = ../arm/scal.c ++DSCALKERNEL = ../arm/scal.c ++CSCALKERNEL = ../arm/zscal.c ++ZSCALKERNEL = ../arm/zscal.c ++ ++SSWAPKERNEL = ../arm/swap.c ++DSWAPKERNEL = ../arm/swap.c ++CSWAPKERNEL = ../arm/zswap.c ++ZSWAPKERNEL = ../arm/zswap.c ++ ++SGEMVNKERNEL = ../arm/gemv_n.c ++DGEMVNKERNEL = ../arm/gemv_n.c ++CGEMVNKERNEL = ../arm/zgemv_n.c ++ZGEMVNKERNEL = ../arm/zgemv_n.c ++ ++SGEMVTKERNEL = ../arm/gemv_t.c ++DGEMVTKERNEL = ../arm/gemv_t.c ++CGEMVTKERNEL = ../arm/zgemv_t.c ++ZGEMVTKERNEL = ../arm/zgemv_t.c ++ ++SSYMV_U_KERNEL = ../generic/symv_k.c ++SSYMV_L_KERNEL = ../generic/symv_k.c ++DSYMV_U_KERNEL = ../generic/symv_k.c ++DSYMV_L_KERNEL = ../generic/symv_k.c ++QSYMV_U_KERNEL = ../generic/symv_k.c ++QSYMV_L_KERNEL = ../generic/symv_k.c ++CSYMV_U_KERNEL = ../generic/zsymv_k.c ++CSYMV_L_KERNEL = ../generic/zsymv_k.c ++ZSYMV_U_KERNEL = ../generic/zsymv_k.c ++ZSYMV_L_KERNEL = ../generic/zsymv_k.c ++XSYMV_U_KERNEL = ../generic/zsymv_k.c ++XSYMV_L_KERNEL = ../generic/zsymv_k.c ++ ++ZHEMV_U_KERNEL = ../generic/zhemv_k.c ++ZHEMV_L_KERNEL = ../generic/zhemv_k.c ++ ++LSAME_KERNEL = ../generic/lsame.c ++SCABS_KERNEL = ../generic/cabs.c ++DCABS_KERNEL = ../generic/cabs.c ++QCABS_KERNEL = ../generic/cabs.c ++ ++#Dump kernel ++CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ++ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +diff --git a/kernel/loongarch64/Makefile b/kernel/loongarch64/Makefile +new file mode 100644 +index 0000000..520349b +--- /dev/null ++++ b/kernel/loongarch64/Makefile +@@ -0,0 +1 @@ ++clean :: +diff --git a/kernel/loongarch64/amax.S b/kernel/loongarch64/amax.S +new file mode 100644 +index 
0000000..4b135c5 +--- /dev/null ++++ b/kernel/loongarch64/amax.S +@@ -0,0 +1,230 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++ ++#define I $r17 ++#define TEMP $r18 ++ ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++ ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++ ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ MTC s1, $r0 ++ bge $r0, N, .L999 ++ ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, INCX, .L999 ++ ++ LD a1, X, 0 * SIZE ++ addi.d N, N, -1 ++ ++ add.d X, X, INCX ++ FABS s1, a1 ++ ++ FABS s2, a1 ++ bge $r0, N, .L999 ++ ++ FABS s3, a1 ++ srai.d I, N, 3 ++ ++ FABS s4, a1 ++ bge $r0, I, .L15 ++ ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a8, X, 0 * SIZE ++ addi.d I, I, -1 ++ ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ add.d X, X, INCX ++ ++ FABS t3, a3 ++ LD a2, X, 0 * SIZE ++ FABS t4, a4 ++ add.d X, X, INCX ++ ++ CMPLT $fcc0, s1, t1 ++ LD a3, X, 0 * SIZE ++ CMPLT $fcc1, s2, t2 ++ add.d X, X, INCX ++ ++ CMPLT $fcc2, s3, t3 ++ LD a4, X, 0 * SIZE ++ CMPLT $fcc3, s4, t4 ++ add.d X, X, INCX ++ ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ ++ FABS t1, a5 ++ LD a5, X, 0 * SIZE ++ FABS t2, a6 ++ add.d X, X, INCX ++ ++ FABS t3, a7 ++ LD a6, X, 0 * SIZE ++ 
FABS t4, a8 ++ add.d X, X, INCX ++ ++ CMPLT $fcc0, s1, t1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, s2, t2 ++ add.d X, X, INCX ++ ++ CMPLT $fcc2, s3, t3 ++ LD a8, X, 0 * SIZE ++ CMPLT $fcc3, s4, t4 ++ add.d X, X, INCX ++ ++ CMOVT s1, s1, t1, $fcc0 ++ addi.d I, I, -1 ++ ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ ++ CMOVT s4, s4, t4, $fcc3 ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t2 ++ CMPLT $fcc2, s3, t3 ++ CMPLT $fcc3, s4, t4 ++ ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ ++ FABS t1, a5 ++ FABS t2, a6 ++ FABS t3, a7 ++ FABS t4, a8 ++ ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t2 ++ CMPLT $fcc2, s3, t3 ++ CMPLT $fcc3, s4, t4 ++ ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ ++ FABS t1, a1 ++ ++ CMPLT $fcc0, s1, t1 ++ ++ CMOVT s1, s1, t1, $fcc0 ++ ++ add.d X, X, INCX ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L998: ++ CMPLT $fcc0, s1, s2 ++ CMPLT $fcc1, s3, s4 ++ ++ CMOVT s1, s1, s2, $fcc0 ++ CMOVT s3, s3, s4, $fcc1 ++ ++ CMPLT $fcc0, s1, s3 ++ CMOVT s1, s1, s3, $fcc0 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/amin.S b/kernel/loongarch64/amin.S +new file mode 100644 +index 0000000..ff9978f +--- /dev/null ++++ b/kernel/loongarch64/amin.S +@@ -0,0 +1,186 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++ ++ PROLOGUE ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ MTC s1, $r0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, INCX, .L999 ++ LD a1, X, 0 * SIZE ++ addi.d N, N, -1 ++ add.d X, X, INCX ++ FABS s1, a1 ++ FABS s2, a1 ++ bge $r0, N, .L999 ++ FABS s3, a1 ++ srai.d I, N, 3 ++ FABS s4, a1 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a8, X, 0 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ add.d X, X, INCX ++ FABS t3, a3 ++ LD a2, X, 0 * SIZE ++ FABS t4, a4 ++ add.d X, X, INCX ++ CMPLT $fcc0, t1, s1 ++ LD a3, X, 0 * SIZE ++ CMPLT $fcc1, t2, s2 ++ add.d X, X, INCX ++ CMPLT $fcc2, t3, s3 ++ LD a4, X, 0 * SIZE ++ CMPLT $fcc3, t4, s4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ FABS t1, a5 ++ LD a5, X, 0 * SIZE ++ FABS t2, a6 ++ add.d X, X, INCX ++ FABS t3, a7 ++ LD a6, X, 0 * SIZE ++ FABS t4, a8 ++ add.d X, X, INCX ++ CMPLT $fcc0, t1, s1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, t2, s2 ++ add.d X, X, INCX ++ CMPLT $fcc2, t3, s3 ++ LD a8, X, 0 * SIZE ++ CMPLT $fcc3, t4, s4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ addi.d I, I, -1 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ blt $r0, I, .L12 ++ .align 3 ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ CMPLT $fcc0, t1, s1 ++ CMPLT $fcc1, t2, s2 ++ CMPLT $fcc2, t3, s3 ++ CMPLT $fcc3, t4, s4 ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ FABS t1, a5 ++ FABS t2, a6 ++ FABS t3, a7 ++ FABS t4, a8 ++ CMPLT $fcc0, t1, s1 ++ CMPLT $fcc1, t2, s2 ++ CMPLT $fcc2, t3, s3 ++ CMPLT $fcc3, t4, s4 ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ .align 3 ++.L15: ++ andi I, N, 7 ++NOP ++ bge $r0, I, .L998 ++ .align 3 ++.L16: ++ LD a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ FABS t1, a1 ++ CMPLT $fcc0, t1, s1 ++ CMOVT s1, s1, t1, $fcc0 ++ add.d X, X, INCX ++ blt $r0, I, .L16 ++ .align 3 ++.L998: ++ CMPLT $fcc0, s2, s1 ++ CMPLT $fcc1, s4, s3 ++ CMOVT s1, s1, s2, $fcc0 ++ CMOVT s3, s3, s4, $fcc1 ++ CMPLT $fcc0, s3, s1 ++ CMOVT s1, s1, s3, $fcc0 ++ .align 3 ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S +new file mode 100644 +index 0000000..7d21ce0 +--- /dev/null ++++ b/kernel/loongarch64/asum.S +@@ -0,0 +1,232 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. 
++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f23 ++#define a2 $f9 ++#define a3 $f10 ++#define a4 $f11 ++#define a5 $f12 ++#define a6 $f13 ++#define a7 $f14 ++#define a8 $f15 ++#define t1 $f16 ++#define t2 $f17 ++#define t3 $f0 ++#define t4 $f1 ++#define s1 $f22 ++#define s2 $f8 ++ PROLOGUE ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ MTC s1, $r0 ++ MTC s2, $r0 ++ slli.d INCX, INCX, BASE_SHIFT ++ li.d TEMP, SIZE ++ bge $r0, N, .L999 ++ srai.d I, N, 3 ++ bne INCX, TEMP, .L20 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD a3, X, 2 * SIZE ++ LD a4, X, 3 * SIZE ++ LD a5, X, 4 * SIZE ++ FABS t1, a1 ++ LD a6, X, 5 * SIZE ++ FABS t2, a2 ++ LD a7, X, 6 * SIZE ++ FABS t3, a3 ++ FABS t4, a4 ++ addi.d I, I, -1 ++ LD a8, X, 7 * SIZE ++ bge $r0, I, .L13 ++ .align 3 ++.L12: ++ ADD s1, s1, t1 ++ LD a1, X, 8 * SIZE ++ FABS t1, a5 ++ addi.d I, I, -1 ++ ADD s2, s2, t2 ++ LD a2, X, 9 * SIZE ++ FABS t2, a6 ++ NOP ++ ADD s1, s1, t3 ++ LD a3, X, 10 * SIZE ++ FABS t3, a7 ++ NOP ++ ADD s2, s2, t4 ++ LD a4, X, 11 * SIZE ++ FABS t4, a8 ++ addi.d X, X, 8 * SIZE ++ ADD s1, s1, t1 ++ LD a5, X, 4 * SIZE ++ FABS t1, a1 ++ NOP ++ ADD s2, s2, t2 ++ LD a6, X, 5 * SIZE ++ FABS t2, a2 ++ NOP ++ ADD s1, s1, t3 ++ LD a7, X, 6 * SIZE ++ FABS t3, a3 ++ NOP ++ ADD s2, s2, t4 ++ LD a8, X, 7 * SIZE ++ FABS t4, a4 ++ blt $r0, I, .L12 ++ .align 3 ++.L13: ++ ADD s1, s1, t1 ++ addi.d X, X, 8 * SIZE ++ FABS t1, a5 ++ NOP ++ ADD s2, s2, t2 ++ FABS t2, a6 ++ ADD s1, s1, t3 ++ FABS t3, a7 ++ ADD s2, s2, t4 ++ FABS t4, a8 ++ ADD s1, s1, t1 ++ ADD s2, s2, t2 ++ ADD s1, s1, t3 ++ ADD s2, s2, t4 ++ .align 3 ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++.L16: ++ LD a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ FABS t1, a1 ++ ADD s1, s1, t1 ++ addi.d X, X, SIZE ++ blt $r0, I, .L16 ++ b .L999 ++ .align 3 ++.L20: ++ bge $r0, I, .L25 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX 
++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ FABS t1, a1 ++ LD a7, X, 0 * SIZE ++ FABS t2, a2 ++ add.d X, X, INCX ++ FABS t3, a3 ++ LD a8, X, 0 * SIZE ++ FABS t4, a4 ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L24 ++ .align 3 ++.L23: ++ ADD s1, s1, t1 ++ LD a1, X, 0 * SIZE ++ FABS t1, a5 ++ add.d X, X, INCX ++ ADD s2, s2, t2 ++ LD a2, X, 0 * SIZE ++ FABS t2, a6 ++ add.d X, X, INCX ++ ADD s1, s1, t3 ++ LD a3, X, 0 * SIZE ++ FABS t3, a7 ++ add.d X, X, INCX ++ ADD s2, s2, t4 ++ LD a4, X, 0 * SIZE ++ FABS t4, a8 ++ add.d X, X, INCX ++ ADD s1, s1, t1 ++ LD a5, X, 0 * SIZE ++ FABS t1, a1 ++ add.d X, X, INCX ++ ADD s2, s2, t2 ++ LD a6, X, 0 * SIZE ++ FABS t2, a2 ++ add.d X, X, INCX ++ ADD s1, s1, t3 ++ LD a7, X, 0 * SIZE ++ FABS t3, a3 ++ add.d X, X, INCX ++ ADD s2, s2, t4 ++ LD a8, X, 0 * SIZE ++ FABS t4, a4 ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ blt $r0, I, .L23 ++ .align 3 ++.L24: ++ ADD s1, s1, t1 ++ FABS t1, a5 ++ ADD s2, s2, t2 ++ FABS t2, a6 ++ ADD s1, s1, t3 ++ FABS t3, a7 ++ ADD s2, s2, t4 ++ FABS t4, a8 ++ ADD s1, s1, t1 ++ ADD s2, s2, t2 ++ ADD s1, s1, t3 ++ ADD s2, s2, t4 ++ .align 3 ++.L25: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++.L26: ++ LD a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ FABS t1, a1 ++ add.d X, X, INCX ++ ADD s1, s1, t1 ++ blt $r0, I, .L26 ++ .align 3 ++.L999: ++ ADD s1, s1, s2 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S +new file mode 100644 +index 0000000..9d27987 +--- /dev/null ++++ b/kernel/loongarch64/cnrm2.S +@@ -0,0 +1,159 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f12 ++#define a2 $f13 ++#define a3 $f14 ++#define a4 $f15 ++#define a5 $f16 ++#define a6 $f17 ++#define a7 $f0 ++#define a8 $f1 ++#define s1 $f22 ++#define s2 $f8 ++#define t1 $f23 ++#define t2 $f9 ++#define t3 $f10 ++#define t4 $f11 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ movgr2fr.d s1, $r0 ++ li.d TEMP, 2 * SIZE ++ fmov.d s2, s1 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, INCX, .L999 ++ srai.d I, N, 2 ++ bge $r0, I, .L25 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ fcvt.d.s t1, a1 ++ LD a7, X, 0 * SIZE ++ fcvt.d.s t2, a2 ++ LD a8, X, 1 * SIZE ++ fcvt.d.s t3, a3 ++ addi.d I, I, -1 ++ fcvt.d.s t4, a4 ++ add.d X, X, INCX ++ bge $r0, I, .L24 ++ .align 3 ++ ++.L23: ++ fmadd.d s1, t1, t1, s1 ++ LD a1, X, 0 * SIZE ++ fcvt.d.s t1, a5 ++ fmadd.d s2, t2, t2, s2 ++ LD a2, X, 1 * SIZE ++ fcvt.d.s t2, a6 ++ add.d X, X, INCX ++ fmadd.d s1, t3, t3, s1 ++ LD a3, X, 0 * SIZE ++ fcvt.d.s t3, a7 ++ fmadd.d s2, t4, t4, s2 ++ LD a4, X, 1 * SIZE ++ fcvt.d.s t4, a8 ++ add.d X, X, INCX ++ fmadd.d s1, t1, t1, s1 ++ LD a5, X, 0 * SIZE ++ fcvt.d.s t1, a1 ++ addi.d I, I, -1 ++ fmadd.d s2, t2, t2, s2 ++ LD a6, X, 1 * SIZE ++ fcvt.d.s t2, a2 ++ add.d X, X, INCX ++ fmadd.d s1, t3, t3, s1 ++ LD a7, X, 0 * SIZE ++ fcvt.d.s t3, a3 ++ LD a8, X, 1 * SIZE ++ fmadd.d s2, t4, t4, s2 ++ add.d X, X, INCX ++ fcvt.d.s t4, a4 ++ blt $r0, I, .L23 ++ .align 3 ++ ++.L24: ++ fmadd.d s1, t1, t1, s1 ++ fcvt.d.s t1, a5 ++ fmadd.d s2, t2, t2, s2 ++ fcvt.d.s t2, a6 ++ fmadd.d s1, t3, t3, s1 ++ fcvt.d.s t3, a7 ++ fmadd.d s2, t4, t4, s2 ++ fcvt.d.s t4, a8 ++ fmadd.d s1, t1, t1, s1 ++ fmadd.d s2, t2, t2, s2 ++ fmadd.d s1, t3, t3, s1 ++ fmadd.d s2, t4, t4, s2 ++ .align 3 ++ ++.L25: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L26: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ addi.d I, I, -1 ++ fcvt.d.s t1, a1 ++ fcvt.d.s t2, a2 ++ fmadd.d s1, t1, t1, s1 ++ add.d X, X, INCX ++ fmadd.d s2, t2, t2, s2 ++ blt $r0, I, .L26 ++ .align 3 ++ ++.L999: ++ fadd.d s1, s1, s2 ++ fsqrt.d s1, s1 ++ move $r4, $r17 ++ fcvt.s.d $f0, s1 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S +new file mode 100644 +index 0000000..3156f60 +--- /dev/null ++++ b/kernel/loongarch64/copy.S +@@ -0,0 +1,225 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define Y $r7 ++#define INCY $r8 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f23 ++#define a4 $f9 ++#define a5 $f10 ++#define a6 $f11 ++#define a7 $f12 ++#define a8 $f13 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++ LDINT INCY, 0(INCY) ++#endif ++ ++ li.d TEMP, SIZE ++ NOP ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, N, .L999 ++ slli.d INCY, INCY, BASE_SHIFT ++ bne INCX, TEMP, .L20 ++ srai.d I, N, 3 ++ bne INCY, TEMP, .L20 ++ addi.d I, I, -1 ++ blt I, $r0, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD a3, X, 2 * SIZE ++ LD a4, X, 3 * SIZE ++ LD a5, X, 4 * SIZE ++ LD a6, X, 5 * SIZE ++ LD a7, X, 6 * SIZE ++ LD a8, X, 7 * SIZE ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ ST a1, Y, 0 * SIZE ++ LD a1, X, 8 * SIZE ++ ST a2, Y, 1 * SIZE ++ LD a2, X, 9 * SIZE ++ ST a3, Y, 2 * SIZE ++ LD a3, X, 10 * SIZE ++ ST a4, Y, 3 * SIZE ++ LD a4, X, 11 * SIZE ++ ST a5, Y, 4 * SIZE ++ LD a5, X, 12 * SIZE ++ ST a6, Y, 5 * SIZE ++ LD a6, X, 13 * SIZE ++ ST a7, Y, 6 * SIZE ++ LD a7, X, 14 * SIZE ++ ST a8, Y, 7 * SIZE ++ LD a8, X, 15 * SIZE ++ addi.d I, I, -1 ++ addi.d X, X, 8 * SIZE ++ addi.d Y, Y, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ ST a1, Y, 0 * SIZE ++ ST a2, Y, 1 * SIZE ++ ST a3, Y, 2 * SIZE ++ ST a4, Y, 3 * SIZE ++ ST a5, Y, 4 * SIZE ++ ST a6, Y, 5 * SIZE ++ ST a7, Y, 6 * SIZE ++ ST a8, Y, 7 * SIZE ++ addi.d X, X, 8 * SIZE ++ addi.d Y, Y, 8 * SIZE ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ addi.d X, X, SIZE ++ addi.d I, I, -1 ++ addi.d Y, Y, SIZE ++ ST a1, Y, -1 * SIZE ++ blt $r0, I, .L16 ++ b .L999 ++ .align 3 ++ ++.L20: ++ srai.d I, N, 3 ++ addi.d I, I, -1 ++ blt I, $r0, .L25 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a8, X, 0 * SIZE ++ add.d X, X, INCX ++ bge $r0, I, .L23 ++ .align 3 ++ ++.L22: ++ ST a1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a2, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a3, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a4, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a5, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a6, Y, 0 * SIZE ++ add.d Y, Y, 
INCY ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a7, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a8, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a8, X, 0 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ blt $r0, I, .L22 ++ .align 3 ++ ++.L23: ++ ST a1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a2, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a3, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a4, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a5, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a6, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a7, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a8, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ .align 3 ++ ++.L25: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L26: ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ addi.d I, I, -1 ++ ST a1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ blt $r0, I, .L26 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S +new file mode 100644 +index 0000000..13faa97 +--- /dev/null ++++ b/kernel/loongarch64/dgemm_kernel_16x4.S +@@ -0,0 +1,4250 @@ ++/******************************************************************************* ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++/* Function parameters */ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define ALPHA $f0 // param 4: alpha ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++ ++#ifdef TRMMKERNEL ++#define OFFSET $r11 // param 9: offset ++#endif ++#define OFF $r12 ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! 
*/ ++#define T1 $r26 ++#define T2 $r27 ++#define ZERO $r0 ++ ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define D0 $xr7 ++#define D1 $xr8 ++#define D2 $xr9 ++#define D3 $xr10 ++#define D4 $xr11 ++#define D5 $xr12 ++#define D6 $xr13 ++#define D7 $xr14 ++#define D8 $xr15 ++#define D9 $xr16 ++#define D10 $xr17 ++#define D11 $xr18 ++#define D12 $xr19 ++#define D13 $xr20 ++#define D14 $xr21 ++#define D15 $xr22 ++#define VALPHA $xr23 ++ ++/* Prefetch interval */ ++#define A_PRE 0x200 ++#define B_PRE 0x100 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -56 ++ /* Store regs */ ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ ST $f23, $sp, 40 ++ ST ALPHA, $sp, 48 ++ ++ /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ ++ xvld VALPHA, $sp, 48 ++ xvreplve0.d VALPHA, VALPHA ++ ++#if defined (TRMMKERNEL) && !defined(LEFT) ++ sub.d OFF, ZERO, OFFSET ++#else ++ xor OFF, OFF, OFF ++#endif ++ ++ /* if (!(N >> 2)) goto L_N3 */ ++ srai.d J, N, 2 /* J = bn >> 2 */ ++ andi N, N, 0x03 ++ beq ZERO, J, .L_N3 ++ ++.L_J1: /* J-- && This loop include Condition 1 */ ++ ++/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! ************************* ++* dgemm_core_16x4 */ ++ move C0, C ++ move A0, A ++ slli.d T0, LDC, 3 ++ add.d C1, C0, T0 ++ addi.d J, J, -1 /* J-- */ ++ add.d C2, C1, T0 ++ add.d C3, C2, T0 ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ ++ /* if (!(M >> 4)) goto L_M8 */ ++ srai.d I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_M8 ++ ++.L_I1: /* I-- */ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x07 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 16 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ /* Calculate the first set of D0~D15, ++ * avoidig set 0 operation ++ * Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ preld 0, C0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ preld 0, C0, 0x40 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ preld 0, C1, 0x00 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ xvfmul.d D5, U1, U4 ++ preld 0, C1, 0x40 ++ xvfmul.d D6, U2, U4 ++ xvfmul.d D7, U3, U4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ preld 0, C2, 0x00 ++ /* line 3 */ ++ xvfmul.d D8, U0, U4 ++ xvfmul.d D9, U1, U4 ++ preld 0, C2, 0x40 ++ xvfmul.d D10, U2, U4 ++ xvfmul.d D11, U3, U4 ++ ++ xvldrepl.d U4, B0, 0x18 ++ preld 0, C3, 0x00 ++ /* line 4 */ ++ xvfmul.d D12, U0, U4 ++ xvfmul.d D13, U1, U4 ++ preld 0, C3, 0x40 ++ xvfmul.d D14, U2, U4 ++ xvfmul.d D15, U3, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_L7 */ ++ beq ZERO,TL, .L_L7 ++ ++ /* Calculate 8 sets of D0~D15 */ ++.L_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 16 
* 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ preld 0, B0, B_PRE ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ preld 0, A0, A_PRE ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D10, U2, U4, D10 ++ xvfmadd.d D11, U3, U4, D11 ++ preld 0, A0, A_PRE + 0x40 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D14, U2, U4, D14 ++ xvfmadd.d D15, U3, U4, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ /***8-2***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ preld 0, B0, B_PRE ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ preld 0, A0, A_PRE ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D10, U2, U4, D10 ++ xvfmadd.d D11, U3, U4, D11 ++ preld 0, A0, A_PRE + 0x40 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D14, U2, U4, D14 ++ xvfmadd.d D15, U3, U4, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ /***8-3***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ preld 0, B0, B_PRE ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ preld 0, A0, A_PRE ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D10, U2, U4, D10 ++ xvfmadd.d D11, U3, U4, D11 ++ preld 0, A0, A_PRE + 0x40 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D14, U2, U4, D14 ++ xvfmadd.d D15, U3, U4, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ /***8-4***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ preld 0, B0, B_PRE ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ preld 0, A0, A_PRE ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D10, U2, U4, D10 ++ xvfmadd.d D11, U3, U4, D11 ++ preld 0, A0, A_PRE + 0x40 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D14, U2, U4, D14 ++ xvfmadd.d D15, U3, U4, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ /***8-5***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ /* 
Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ preld 0, B0, B_PRE ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ preld 0, A0, A_PRE ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D10, U2, U4, D10 ++ xvfmadd.d D11, U3, U4, D11 ++ preld 0, A0, A_PRE + 0x40 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D14, U2, U4, D14 ++ xvfmadd.d D15, U3, U4, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ /***8-6***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ preld 0, B0, B_PRE ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ preld 0, A0, A_PRE ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D10, U2, U4, D10 ++ xvfmadd.d D11, U3, U4, D11 ++ preld 0, A0, A_PRE + 0x40 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D14, U2, U4, D14 ++ xvfmadd.d D15, U3, U4, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ /***8-7***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ preld 0, B0, B_PRE ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ preld 0, A0, A_PRE ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D10, U2, U4, D10 ++ xvfmadd.d D11, U3, U4, D11 ++ preld 0, A0, A_PRE + 0x40 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D14, U2, U4, D14 ++ xvfmadd.d D15, U3, U4, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ /***8-8***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ preld 0, B0, B_PRE ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ preld 0, A0, A_PRE ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D10, U2, U4, D10 ++ xvfmadd.d D11, U3, U4, D11 ++ preld 0, A0, A_PRE + 0x40 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D14, U2, U4, D14 ++ xvfmadd.d D15, U3, U4, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_TL1 ++ ++ /* Maybe we need calculate the last ++ * 7 sets of D0~D15? 
++ */ ++.L_L7: ++ /* if (!(L & 7)) goto L_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_L0 ++ ++.L_L71: ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D10, U2, U4, D10 ++ xvfmadd.d D11, U3, U4, D11 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D14, U2, U4, D14 ++ xvfmadd.d D15, U3, U4, D15 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_L71 ++ ++.L_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA ++ xvfmul.d D2, D2, VALPHA ++ xvfmul.d D3, D3, VALPHA ++ xvfmul.d D4, D4, VALPHA ++ xvfmul.d D5, D5, VALPHA ++ xvfmul.d D6, D6, VALPHA ++ xvfmul.d D7, D7, VALPHA ++ xvfmul.d D8, D8, VALPHA ++ xvfmul.d D9, D9, VALPHA ++ xvfmul.d D10, D10, VALPHA ++ xvfmul.d D11, D11, VALPHA ++ xvfmul.d D12, D12, VALPHA ++ xvfmul.d D13, D13, VALPHA ++ xvfmul.d D14, D14, VALPHA ++ xvfmul.d D15, D15, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 ++ xvfmadd.d D2, D2, VALPHA, U2 ++ xvfmadd.d D3, D3, VALPHA, U3 ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvld U1, C1, 0x20 ++ xvld U2, C1, 0x40 ++ xvld U3, C1, 0x60 ++ xvfmadd.d D4, D4, VALPHA, U0 ++ xvfmadd.d D5, D5, VALPHA, U1 ++ xvfmadd.d D6, D6, VALPHA, U2 ++ xvfmadd.d D7, D7, VALPHA, U3 ++ ++ /* Load C2 */ ++ xvld U0, C2, 0x00 ++ xvld U1, C2, 0x20 ++ xvld U2, C2, 0x40 ++ xvld U3, C2, 0x60 ++ xvfmadd.d D8, D8, VALPHA, U0 ++ xvfmadd.d D9, D9, VALPHA, U1 ++ xvfmadd.d D10, D10, VALPHA, U2 ++ xvfmadd.d D11, D11, VALPHA, U3 ++ ++ /* Load C3 */ ++ xvld U0, C3, 0x00 ++ xvld U1, C3, 0x20 ++ xvld U2, C3, 0x40 ++ xvld U3, C3, 0x60 ++ xvfmadd.d D12, D12, VALPHA, U0 ++ xvfmadd.d D13, D13, VALPHA, U1 ++ xvfmadd.d D14, D14, VALPHA, U2 ++ xvfmadd.d D15, D15, VALPHA, U3 ++#endif // #if defined(TRMMKERNEL) ++ ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ xvst D2, C0, 0x40 ++ xvst D3, C0, 0x60 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ xvst D5, C1, 0x20 ++ xvst D6, C1, 0x40 ++ xvst D7, C1, 0x60 ++ /* Store C2 */ ++ xvst D8, C2, 0x00 ++ xvst D9, C2, 0x20 ++ xvst D10, C2, 0x40 ++ xvst D11, C2, 0x60 ++ /* Store C3 */ ++ xvst D12, C3, 0x00 ++ xvst D13, C3, 0x20 ++ xvst D14, C3, 0x40 ++ xvst D15, C3, 0x60 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x80 ++ addi.d C1, C1, 0x80 ++ addi.d C2, C2, 0x80 ++ addi.d C3, C3, 0x80 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d L, L, -16 ++#else ++ /* number of values in B */ ++ addi.d L, L, -4 ++#endif ++ slli.d T0, L, 0x07 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x05 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ addi.d I, I, -1 /* I-- */ ++ blt ZERO,I, .L_I1 ++ ++.L_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 
++ beq ZERO,I, .L_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_M4 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x06 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 8 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif // #if defined(TRMMKERNEL) ++ ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ xvfmul.d D5, U1, U4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U4 ++ xvfmul.d D9, U1, U4 ++ ++ xvldrepl.d U4, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U4 ++ xvfmul.d D13, U1, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M8_L7 */ ++ beq ZERO,TL, .L_M8_L7 ++ ++.L_M8_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d 
D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M8_TL1 ++ ++.L_M8_L7: ++ /* if (!(L & 7)) goto L_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M8_L0 ++ ++.L_M8_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D9, U1, U4, D9 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D13, U1, U4, D13 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M8_L71 ++ ++.L_M8_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA ++ xvfmul.d D4, D4, VALPHA ++ xvfmul.d D5, D5, VALPHA ++ xvfmul.d D8, D8, VALPHA ++ xvfmul.d D9, D9, VALPHA ++ xvfmul.d D12, D12, VALPHA ++ xvfmul.d D13, D13, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvld U1, C1, 0x20 ++ xvfmadd.d D4, D4, VALPHA, U0 ++ xvfmadd.d D5, D5, VALPHA, U1 ++ ++ /* Load C2 */ ++ xvld U0, C2, 0x00 ++ xvld U1, C2, 0x20 ++ xvfmadd.d D8, D8, VALPHA, U0 ++ xvfmadd.d D9, D9, VALPHA, U1 ++ ++ /* Load C3 */ ++ xvld U0, C3, 0x00 ++ xvld U1, C3, 0x20 ++ xvfmadd.d D12, D12, VALPHA, U0 ++ xvfmadd.d D13, D13, VALPHA, U1 ++#endif // #if defined(TRMMKERNEL) ++ ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ xvst D5, C1, 0x20 ++ /* Store C2 */ ++ xvst D8, C2, 0x00 ++ xvst D9, C2, 0x20 ++ /* Store C3 */ ++ xvst D12, C3, 0x00 ++ xvst D13, C3, 0x20 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x40 ++ addi.d C1, C1, 0x40 ++ addi.d C2, C2, 0x40 ++ addi.d C3, C3, 0x40 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d L, L, 
-8 ++#else ++ /* number of values in B */ ++ addi.d L, L, -4 ++#endif ++ slli.d T0, L, 0x06 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x05 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N >> 2 ) && (M & 8)) End************/ ++ ++.L_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_M2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x05 ++ add.d A0, A0, T0 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 4 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 4 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M4_L7 */ ++ beq ZERO,TL, .L_M4_L7 ++ ++.L_M4_TL1: /* TL-- */ ++ /***8-1***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 
0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M4_TL1 ++ ++.L_M4_L7: ++ /* if (!(L & 7)) goto L_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M4_L0 ++ ++.L_M4_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M4_L71 ++ ++.L_M4_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D4, D4, VALPHA ++ xvfmul.d D8, D8, VALPHA ++ xvfmul.d D12, D12, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U0 ++ ++ /* Load C2 */ ++ xvld U0, C2, 0x00 ++ xvfmadd.d D8, D8, VALPHA, U0 ++ ++ /* Load C3 */ ++ xvld U0, C3, 0x00 ++ xvfmadd.d D12, D12, VALPHA, U0 ++#endif // #if defined(TRMMKERNEL) ++ ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ /* Store C2 */ ++ xvst D8, C2, 0x00 ++ /* Store C3 */ ++ xvst D12, C3, 0x00 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x20 ++ addi.d C1, C1, 0x20 ++ addi.d C2, C2, 0x20 ++ addi.d C3, C3, 0x20 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d L, L, -4 ++#else ++ /* number of values in B */ ++ addi.d L, L, -4 ++#endif ++ slli.d T0, L, 0x05 ++ add.d A0, A0, T0 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ ++ ++.L_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_M1 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x04 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 2 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M2_L7 */ ++ beq ZERO,TL, .L_M2_L7 ++ 
++.L_M2_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M2_TL1 ++ ++.L_M2_L7: ++ /* if (!(L & 7)) goto L_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M2_L0 ++ ++.L_M2_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M2_L71 ++ ++.L_M2_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D4, D4, VALPHA ++ xvfmul.d D8, D8, VALPHA ++ xvfmul.d D12, D12, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U0 ++ ++ /* Load C2 */ ++ xvld U0, C2, 0x00 
++ xvfmadd.d D8, D8, VALPHA, U0 ++ ++ /* Load C3 */ ++ xvld U0, C3, 0x00 ++ xvfmadd.d D12, D12, VALPHA, U0 ++#endif // #if defined(TRMMKERNEL) ++ ++ xvstelm.d D0, C0, 0x00, 0x00 ++ xvstelm.d D4, C1, 0x00, 0x00 ++ xvstelm.d D8, C2, 0x00, 0x00 ++ xvstelm.d D12, C3, 0x00, 0x00 ++ xvstelm.d D0, C0, 0x08, 0x01 ++ xvstelm.d D4, C1, 0x08, 0x01 ++ xvstelm.d D8, C2, 0x08, 0x01 ++ xvstelm.d D12, C3, 0x08, 0x01 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x10 ++ addi.d C1, C1, 0x10 ++ addi.d C2, C2, 0x10 ++ addi.d C3, C3, 0x10 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d L, L, -2 ++#else ++ /* number of values in B */ ++ addi.d L, L, -4 ++#endif ++ slli.d T0, L, 0x04 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x05 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ ++ ++.L_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x03 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 1 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M1_L7 */ ++ beq ZERO,TL, .L_M1_L7 ++ ++.L_M1_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 
++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M1_TL1 ++ ++.L_M1_L7: ++ /* if (!(L & 7)) goto L_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M1_L0 ++ ++.L_M1_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M1_L71 ++ ++.L_M1_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D4, D4, VALPHA ++ xvfmul.d D8, D8, VALPHA ++ xvfmul.d D12, D12, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U0 ++ ++ /* Load C2 */ ++ xvld U0, C2, 0x00 ++ xvfmadd.d D8, D8, VALPHA, U0 ++ ++ /* Load C3 */ ++ xvld U0, C3, 0x00 ++ xvfmadd.d D12, D12, VALPHA, U0 ++#endif // #if defined(TRMMKERNEL) ++ ++ xvstelm.d D0, C0, 0x00, 0x00 ++ xvstelm.d D4, C1, 0x00, 0x00 ++ xvstelm.d D8, C2, 0x00, 0x00 ++ xvstelm.d D12, C3, 0x00, 0x00 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x08 ++ addi.d C1, C1, 0x08 ++ addi.d C2, C2, 0x08 ++ addi.d C3, C3, 0x08 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d L, L, -1 ++#else ++ /* number of values in B */ ++ addi.d L, L, -4 ++#endif ++ slli.d T0, L, 0x03 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x05 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ ++ ++.L_M0: ++ /* Add stride for B and C ++ * B += (K * 32) ++ * C += (LDC * 32) ++ */ ++ /* since the array type is double, ++ * so we must mul 32 ++ */ ++ slli.d T0, K, 5 ++ slli.d T1, LDC, 5 ++ add.d B, B, T0 ++ add.d C, C, T1 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addi.d OFF, OFF, 0x04 ++#endif ++ ++ blt ZERO, J, .L_J1 ++ ++//////////////// go back to L_J1 ///////////////// 
++///////////////////////////////////////////////// ++/************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ ++ ++.L_N3: ++ andi J, N, 2 ++ beq ZERO, J, .L_N1 ++ ++/************************* Condition 2 if((N & 2) && (M >> 4)) START !!! ************************* ++* dgemm_core_16x2 */ ++ ++ move C0, C ++ move A0, A ++ slli.d T0, LDC, 3 ++ add.d C1, C0, T0 ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ ++ /* if (!(M >> 4)) goto L_N3_M8 */ ++ srai.d I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N3_M8 ++ ++.L_N3_I1: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x07 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x04 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 16 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ xvfmul.d D5, U1, U4 ++ xvfmul.d D6, U2, U4 ++ xvfmul.d D7, U3, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_L7 */ ++ beq ZERO,TL, .L_N3_L7 ++ ++.L_N3_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ /***8-2***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ /***8-3***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ /***8-4***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d 
D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ /***8-5***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ /***8-6***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ /***8-7***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ /***8-8***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N3_TL1 ++ ++.L_N3_L7: ++ /* if (!(L & 7)) goto L_N3_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N3_L0 ++ ++.L_N3_L71: ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D6, U2, U4, D6 ++ xvfmadd.d D7, U3, U4, D7 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N3_L71 ++ ++.L_N3_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA ++ xvfmul.d D2, D2, VALPHA ++ xvfmul.d D3, D3, VALPHA ++ xvfmul.d D4, D4, VALPHA ++ xvfmul.d D5, D5, VALPHA ++ xvfmul.d D6, D6, VALPHA ++ xvfmul.d D7, D7, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 ++ xvfmadd.d D2, D2, VALPHA, U2 ++ xvfmadd.d D3, D3, VALPHA, U3 ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvld U1, C1, 0x20 ++ xvld U2, C1, 0x40 ++ xvld U3, C1, 0x60 ++ xvfmadd.d D4, D4, VALPHA, U0 ++ xvfmadd.d D5, D5, VALPHA, U1 ++ xvfmadd.d D6, D6, VALPHA, U2 ++ xvfmadd.d D7, D7, VALPHA, U3 ++#endif // #if 
defined(TRMMKERNEL) ++ ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ xvst D2, C0, 0x40 ++ xvst D3, C0, 0x60 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ xvst D5, C1, 0x20 ++ xvst D6, C1, 0x40 ++ xvst D7, C1, 0x60 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x80 ++ addi.d C1, C1, 0x80 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -16 ++#else ++ addi.d L, L, -2 ++#endif ++ slli.d T0, L, 0x07 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x04 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ addi.d I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N3_I1 ++ ++.L_N3_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N3_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_N3_M4 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x06 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x04 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 8 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ xvfmul.d D5, U1, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M8_L7 */ ++ beq ZERO,TL, .L_N3_M8_L7 ++ ++.L_N3_M8_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, 
U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N3_M8_TL1 ++ ++.L_N3_M8_L7: ++ /* if (!(L & 7)) goto L_N3_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N3_M8_L0 ++ ++.L_N3_M8_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ xvfmadd.d D5, U1, U4, D5 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N3_M8_L71 ++ ++.L_N3_M8_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA ++ xvfmul.d D4, D4, VALPHA ++ xvfmul.d D5, D5, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvld U1, C1, 0x20 ++ xvfmadd.d D4, D4, VALPHA, U0 ++ xvfmadd.d D5, D5, VALPHA, U1 ++#endif // #if defined(TRMMKERNEL) ++ ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ xvst D5, C1, 0x20 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x40 ++ addi.d C1, C1, 0x40 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -8 ++#else ++ addi.d L, L, -2 ++#endif ++ slli.d T0, L, 0x06 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x04 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N & 2) && (M & 8) ) End************/ ++ ++.L_N3_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N3_M2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x05 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x04 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 4 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 4 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M4_L7 */ ++ beq ZERO,TL, .L_N3_M4_L7 ++ ++.L_N3_M4_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, 
B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ ++ /* Cumulative D0~D15 */ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N3_M4_TL1 ++ ++.L_N3_M4_L7: ++ /* if (!(L & 7)) goto L_N3_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N3_M4_L0 ++ ++.L_N3_M4_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N3_M4_L71 ++ ++.L_N3_M4_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D4, D4, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U0 ++#endif // #if defined(TRMMKERNEL) ++ ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x20 ++ addi.d C1, C1, 0x20 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -4 ++#else ++ addi.d L, L, -2 ++#endif ++ slli.d T0, L, 0x05 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x04 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N & 2 ) && (M & 4) ) End************/ ++ ++.L_N3_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_N3_M1 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x04 ++ add.d A0, A0, T0 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 2 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ 
xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M2_L7 */ ++ beq ZERO,TL, .L_N3_M2_L7 ++ ++.L_N3_M2_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N3_M2_TL1 ++ ++.L_N3_M2_L7: ++ /* if (!(L & 7)) goto L_N3_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N3_M2_L0 ++ ++.L_N3_M2_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N3_M2_L71 ++ ++.L_N3_M2_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D4, D4, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U0 ++#endif // #if defined(TRMMKERNEL) ++ ++ xvstelm.d D0, C0, 0x00, 0x00 ++ xvstelm.d D4, C1, 0x00, 0x00 ++ xvstelm.d D0, C0, 0x08, 0x01 ++ xvstelm.d D4, C1, 0x08, 0x01 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x10 ++ addi.d C1, C1, 0x10 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -2 ++#else ++ addi.d L, L, -2 ++#endif ++ slli.d T0, L, 0x04 ++ add.d A0, A0, T0 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N & 2 ) && (M & 2) ) End************/ ++ ++.L_N3_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_N3_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x03 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x04 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 1 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M1_L7 */ ++ beq ZERO,TL, .L_N3_M1_L7 ++ ++.L_N3_M1_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N3_M1_TL1 ++ ++.L_N3_M1_L7: ++ /* if (!(L & 7)) goto L_N3_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N3_M1_L0 ++ ++.L_N3_M1_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N3_M1_L71 ++ ++.L_N3_M1_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D4, D4, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ ++ /* Load C1 */ ++ xvld U0, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U0 ++#endif // #if defined(TRMMKERNEL) ++ ++ xvstelm.d D0, C0, 0x00, 0x00 ++ xvstelm.d D4, C1, 0x00, 0x00 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x08 ++ addi.d C1, C1, 0x08 ++ ++#if defined(TRMMKERNEL) 
++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -1 ++#else ++ addi.d L, L, -2 ++#endif ++ slli.d T0, L, 0x03 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x04 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N & 2 ) && (M & 1) ) End************/ ++ ++.L_N3_M0: ++ /* Add stride for B and C ++ * B += (K * 16) ++ * C += (LDC * 16) ++ */ ++ /* since the array type is double, ++ * so we must mul 16 ++ */ ++ slli.d T0, K, 4 ++ slli.d T1, LDC, 4 ++ add.d B, B, T0 ++ add.d C, C, T1 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addi.d OFF, OFF, 0x02 ++#endif ++ ++ /* We must reinit I */ ++ srai.d I, M, 4 /* I = bm >> 4 */ ++ ++/************************* Condition 2 if((N & 2) && (M >> 4)) End !!! ************************* ++* dgemm_core_16x2 */ ++ ++.L_N1: ++ andi J, N, 1 ++ beq ZERO, J, .L_N0 ++ ++/************************* Condition 3 if((N & 1) && (M >> 4)) START !!! ************************* ++* dgemm_core_16x1 */ ++ ++ move C0, C ++ move A0, A ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ ++ /* if (!(M >> 4)) goto L_N1_M8 */ ++ srai.d I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N1_M8 ++ ++.L_N1_I1: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x07 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x03 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 16 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_L7 */ ++ beq ZERO,TL, .L_N1_L7 ++ ++.L_N1_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ /***8-2***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ /***8-3***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ /***8-4***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, 
A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ /***8-5***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ /***8-6***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ /***8-7***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ /***8-8***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_TL1 ++ ++.L_N1_L7: ++ /* if (!(L & 7)) goto L_N1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_L0 ++ ++.L_N1_L71: ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N1_L71 ++ ++.L_N1_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA ++ xvfmul.d D2, D2, VALPHA ++ xvfmul.d D3, D3, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 ++ xvfmadd.d D2, D2, VALPHA, U2 ++ xvfmadd.d D3, D3, VALPHA, U3 ++#endif // #if defined(TRMMKERNEL) ++ ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ xvst D2, C0, 0x40 ++ xvst D3, C0, 0x60 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x80 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -16 ++#else ++ addi.d L, L, -1 ++#endif ++ slli.d T0, L, 0x07 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x03 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ addi.d I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N1_I1 ++ ++.L_N1_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N1_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_N1_M4 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x06 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x03 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 8 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M8_L7 */ ++ beq ZERO,TL, .L_N1_M8_L7 ++ ++.L_N1_M8_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M8_TL1 ++ ++.L_N1_M8_L7: ++ /* if (!(L & 7)) goto L_N1_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M8_L0 ++ ++.L_N1_M8_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N1_M8_L71 ++ ++.L_N1_M8_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 ++#endif // #if defined(TRMMKERNEL) ++ ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x40 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -8 ++#else ++ addi.d L, L, -1 ++#endif ++ slli.d T0, L, 0x06 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x03 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N & 1) 
&& (M & 8) ) End************/ ++ ++.L_N1_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N1_M2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x05 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x03 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 4 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 4 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M4_L7 */ ++ beq ZERO,TL, .L_N1_M4_L7 ++ ++.L_N1_M4_TL1: /* TL-- */ ++ /***8-1***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M4_TL1 ++ ++.L_N1_M4_L7: ++ /* if (!(L & 7)) goto L_N1_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M4_L0 ++ ++.L_N1_M4_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N1_M4_L71 ++ ++.L_N1_M4_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++#endif // #if defined(TRMMKERNEL) ++ ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x20 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -4 ++#else ++ addi.d L, L, -1 ++#endif ++ slli.d T0, L, 0x05 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x03 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N & 1) && (M & 4) ) End************/ ++ ++.L_N1_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_N1_M1 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x04 ++ add.d A0, 
A0, T0 ++ slli.d T0, OFF, 0x03 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 2 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M2_L7 */ ++ beq ZERO,TL, .L_N1_M2_L7 ++ ++.L_N1_M2_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M2_TL1 ++ ++.L_N1_M2_L7: ++ /* if (!(L & 7)) goto L_N1_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M2_L0 ++ ++.L_N1_M2_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N1_M2_L71 ++ ++.L_N1_M2_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++#endif // #if defined(TRMMKERNEL) ++ ++ xvstelm.d D0, C0, 0x00, 0x00 ++ xvstelm.d D0, C0, 0x08, 0x01 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x10 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -2 ++#else ++ addi.d L, L, -1 ++#endif ++ slli.d T0, L, 0x04 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x03 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N & 1 ) && (M & 2) ) End************/ ++ ++.L_N1_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_N1_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x03 ++ add.d A0, A0, T0 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d 
L, OFF, 1 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M1_L7 */ ++ beq ZERO,TL, .L_N1_M1_L7 ++ ++.L_N1_M1_TL1: /* TL-- */ ++ /***8-1***/ ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ /***8-2***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ /***8-3***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ /***8-4***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ /***8-5***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ /***8-6***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ /***8-7***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ /***8-8***/ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M1_TL1 ++ ++.L_N1_M1_L7: ++ /* if (!(L & 7)) goto L_N1_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M1_L0 ++ ++.L_N1_M1_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N1_M1_L71 ++ ++.L_N1_M1_L0: ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++#endif // #if defined(TRMMKERNEL) ++ ++ xvstelm.d D0, C0, 0x00, 0x00 ++ ++ /* Add stride for C */ ++ addi.d C0, C0, 0x08 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ addi.d L, L, -1 ++#else ++ addi.d L, L, -1 ++#endif ++ slli.d T0, L, 0x03 ++ add.d A0, A0, T0 ++ add.d B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ addi.d OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++/********LOOP (if(N & 1 ) && (M & 1) ) End************/ ++ ++.L_N1_M0: ++ ++/************************* Condition 3 if((N & 1) && (M >> 4)) End !!! 
************************* ++* dgemm_core_16x1 */ ++ ++.L_N0: ++ /* Restore regs */ ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 16 ++ LDARG $r26, $sp, 24 ++ LDARG $r27, $sp, 32 ++ LD $f23, $sp, 40 ++ addi.d $sp, $sp, 56 ++ ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/dgemm_ncopy_16.S b/kernel/loongarch64/dgemm_ncopy_16.S +new file mode 100644 +index 0000000..95c8790 +--- /dev/null ++++ b/kernel/loongarch64/dgemm_ncopy_16.S +@@ -0,0 +1,691 @@ ++/******************************************************************************* ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define S9 $r20 ++#define S10 $r23 ++#define S11 $r24 ++#define S12 $r25 ++#define S13 $r26 ++#define S14 $r27 ++#define S15 $r28 ++#define S16 $r29 ++#define TD $r30 ++#define TS $r31 ++#define TL $r7 ++#define T0 $r6 ++#define ZERO $r0 ++ ++#define F0 $f0 ++#define F1 $f1 ++#define F2 $f2 ++#define F3 $f3 ++#define F4 $f4 ++#define F5 $f5 ++#define F6 $f6 ++#define F7 $f7 ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -0x90 ++ SDARG $r23, $sp, 0x00 ++ SDARG $r24, $sp, 0x08 ++ SDARG $r25, $sp, 0x10 ++ SDARG $r26, $sp, 0x18 ++ SDARG $r27, $sp, 0x20 ++ SDARG $r28, $sp, 0x28 ++ SDARG $r29, $sp, 0x30 ++ SDARG $r30, $sp, 0x38 ++ SDARG $r31, $sp, 0x40 ++ ST $f23, $sp, 0x48 ++ ST $f24, $sp, 0x50 ++ ST $f25, $sp, 0x58 ++ ST $f26, $sp, 0x60 ++ ST $f27, $sp, 0x68 ++ ST $f28, $sp, 0x70 ++ ST $f29, $sp, 0x78 ++ ST $f30, $sp, 0x80 ++ ST $f31, $sp, 0x88 ++ ++ move TD, DST ++ move TS, SRC ++ slli.d TL, LDA, 0x03 ++ slli.d T0, TL, 0x01 ++ srai.d J, N, 0x04 ++ beq J, ZERO, .L_N8 ++ ++.L_J1: /* J-- */ ++ move S1, TS ++ add.d S2, TS, TL ++ srai.d I, M, 0x03 ++ add.d S3, S2, TL ++ addi.d J, J, -1 ++ add.d S4, S3, TL ++ add.d S5, S3, T0 ++ add.d S6, S4, T0 ++ add.d S7, S5, T0 ++ add.d S8, S6, T0 ++ add.d S9, S7, T0 ++ add.d S10, S8, T0 ++ add.d S11, S9, T0 ++ add.d S12, S10, T0 ++ add.d S13, S11, T0 ++ add.d S14, S12, T0 ++ add.d S15, S13, T0 ++ add.d S16, S14, T0 ++ add.d TS, S15, T0 ++ beq I, ZERO, .L_I7 ++ ++.L_I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ xvld U8, S9, 0x00 ++ xvld U9, S10, 0x00 ++ xvld U10, S11, 0x00 ++ xvld U11, S12, 0x00 ++ xvld U12, S13, 0x00 ++ xvld U13, S14, 0x00 ++ xvld U14, S15, 0x00 ++ xvld U15, S16, 0x00 ++ ++ xvpackev.d D0, U1, U0 ++ xvpackod.d D1, U1, U0 ++ xvpackev.d D2, U3, U2 ++ xvpackod.d D3, U3, U2 ++ xvpackev.d D4, U5, U4 ++ xvpackod.d D5, U5, U4 ++ xvpackev.d D6, U7, U6 ++ xvpackod.d D7, U7, U6 ++ ++ xvpackev.d D8, U9, U8 ++ xvpackod.d D9, U9, U8 ++ xvpackev.d D10, U11, U10 ++ xvpackod.d D11, U11, U10 ++ xvpackev.d D12, U13, U12 ++ xvpackod.d D13, U13, U12 ++ xvpackev.d D14, U15, U14 ++ xvpackod.d D15, U15, U14 ++ ++ xvand.v U0, D0, D0 ++ xvpermi.q D0, D2, 0x02 // 0 ++ xvand.v U4, D4, D4 ++ xvpermi.q D4, D6, 0x02 // 1 ++ xvand.v U1, D1, D1 ++ xvpermi.q D1, D3, 0x02 // 4 ++ xvand.v U5, D5, D5 ++ xvpermi.q D5, D7, 0x02 // 5 ++ 
xvpermi.q D2, U0, 0x31 // 8 ++ xvpermi.q D6, U4, 0x31 // 9 ++ xvpermi.q D3, U1, 0x31 // 12 ++ xvpermi.q D7, U5, 0x31 // 13 ++ ++ xvand.v U8, D8, D8 ++ xvpermi.q D8, D10, 0x02 // 2 ++ xvand.v U12, D12, D12 ++ xvpermi.q D12, D14, 0x02 // 3 ++ xvand.v U9, D9, D9 ++ xvpermi.q D9, D11, 0x02 // 6 ++ xvand.v U13, D13, D13 ++ xvpermi.q D13, D15, 0x02 // 7 ++ xvpermi.q D10, U8, 0x31 // 10 ++ xvpermi.q D14, U12, 0x31 // 11 ++ xvpermi.q D11, U9, 0x31 // 14 ++ xvpermi.q D15, U13, 0x31 // 15 ++ ++ xvst D0, TD, 0x00 // 0 ++ xvst D4, TD, 0x20 // 1 ++ xvst D8, TD, 0x40 // 2 ++ xvst D12, TD, 0x60 // 3 ++ xvst D1, TD, 0x80 // 4 ++ xvst D5, TD, 0xA0 // 5 ++ xvst D9, TD, 0xC0 // 6 ++ xvst D13, TD, 0xE0 // 7 ++ addi.d TD, TD, 0x100 ++ xvst D2, TD, 0x00 // 8 ++ xvst D6, TD, 0x20 // 9 ++ xvst D10, TD, 0x40 // 10 ++ xvst D14, TD, 0x60 // 11 ++ xvst D3, TD, 0x80 // 12 ++ xvst D7, TD, 0xA0 // 13 ++ xvst D11, TD, 0xC0 // 14 ++ xvst D15, TD, 0xE0 // 15 ++ addi.d TD, TD, 0x100 ++ ++ xvld U0, S1, 0x20 ++ xvld U1, S2, 0x20 ++ xvld U2, S3, 0x20 ++ xvld U3, S4, 0x20 ++ xvld U4, S5, 0x20 ++ xvld U5, S6, 0x20 ++ xvld U6, S7, 0x20 ++ xvld U7, S8, 0x20 ++ xvld U8, S9, 0x20 ++ xvld U9, S10, 0x20 ++ xvld U10, S11, 0x20 ++ xvld U11, S12, 0x20 ++ xvld U12, S13, 0x20 ++ xvld U13, S14, 0x20 ++ xvld U14, S15, 0x20 ++ xvld U15, S16, 0x20 ++ ++ xvpackev.d D0, U1, U0 ++ xvpackod.d D1, U1, U0 ++ xvpackev.d D2, U3, U2 ++ xvpackod.d D3, U3, U2 ++ xvpackev.d D4, U5, U4 ++ xvpackod.d D5, U5, U4 ++ xvpackev.d D6, U7, U6 ++ xvpackod.d D7, U7, U6 ++ ++ xvpackev.d D8, U9, U8 ++ xvpackod.d D9, U9, U8 ++ xvpackev.d D10, U11, U10 ++ xvpackod.d D11, U11, U10 ++ xvpackev.d D12, U13, U12 ++ xvpackod.d D13, U13, U12 ++ xvpackev.d D14, U15, U14 ++ xvpackod.d D15, U15, U14 ++ ++ xvand.v U0, D0, D0 ++ xvpermi.q D0, D2, 0x02 // 0 ++ xvand.v U4, D4, D4 ++ xvpermi.q D4, D6, 0x02 // 1 ++ xvand.v U1, D1, D1 ++ xvpermi.q D1, D3, 0x02 // 4 ++ xvand.v U5, D5, D5 ++ xvpermi.q D5, D7, 0x02 // 5 ++ xvpermi.q D2, U0, 0x31 // 8 ++ xvpermi.q D6, U4, 0x31 // 9 ++ xvpermi.q D3, U1, 0x31 // 12 ++ xvpermi.q D7, U5, 0x31 // 13 ++ ++ xvand.v U8, D8, D8 ++ xvpermi.q D8, D10, 0x02 // 2 ++ xvand.v U12, D12, D12 ++ xvpermi.q D12, D14, 0x02 // 3 ++ xvand.v U9, D9, D9 ++ xvpermi.q D9, D11, 0x02 // 6 ++ xvand.v U13, D13, D13 ++ xvpermi.q D13, D15, 0x02 // 7 ++ xvpermi.q D10, U8, 0x31 // 10 ++ xvpermi.q D14, U12, 0x31 // 11 ++ xvpermi.q D11, U9, 0x31 // 14 ++ xvpermi.q D15, U13, 0x31 // 15 ++ ++ xvst D0, TD, 0x00 // 0 ++ xvst D4, TD, 0x20 // 1 ++ xvst D8, TD, 0x40 // 2 ++ xvst D12, TD, 0x60 // 3 ++ xvst D1, TD, 0x80 // 4 ++ xvst D5, TD, 0xA0 // 5 ++ xvst D9, TD, 0xC0 // 6 ++ xvst D13, TD, 0xE0 // 7 ++ addi.d TD, TD, 0x100 ++ xvst D2, TD, 0x00 // 8 ++ xvst D6, TD, 0x20 // 9 ++ xvst D10, TD, 0x40 // 10 ++ xvst D14, TD, 0x60 // 11 ++ xvst D3, TD, 0x80 // 12 ++ xvst D7, TD, 0xA0 // 13 ++ xvst D11, TD, 0xC0 // 14 ++ xvst D15, TD, 0xE0 // 15 ++ addi.d TD, TD, 0x100 ++ ++ ++ addi.d S1, S1, 0x40 ++ addi.d S2, S2, 0x40 ++ addi.d S3, S3, 0x40 ++ addi.d S4, S4, 0x40 ++ addi.d S5, S5, 0x40 ++ addi.d S6, S6, 0x40 ++ addi.d S7, S7, 0x40 ++ addi.d S8, S8, 0x40 ++ addi.d S9, S9, 0x40 ++ addi.d S10, S10, 0x40 ++ addi.d S11, S11, 0x40 ++ addi.d S12, S12, 0x40 ++ addi.d S13, S13, 0x40 ++ addi.d S14, S14, 0x40 ++ addi.d S15, S15, 0x40 ++ addi.d S16, S16, 0x40 ++ ++ addi.d I, I, -1 ++ blt ZERO, I, .L_I1 ++ ++.L_I7: ++ andi I, M, 0x07 ++ beq I, ZERO, .L_I0 ++ ++.L_II1: /* I-- */ ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 ++ fld.d F2, S3, 0x00 ++ fld.d F3, S4, 0x00 ++ fld.d F4, S5, 0x00 ++ fld.d F5, S6, 
0x00 ++ fld.d F6, S7, 0x00 ++ fld.d F7, S8, 0x00 ++ ++ fst.d F0, TD, 0x00 ++ addi.d S1, S1, 0x08 ++ fst.d F1, TD, 0x08 ++ addi.d S2, S2, 0x08 ++ fst.d F2, TD, 0x10 ++ addi.d S3, S3, 0x08 ++ fst.d F3, TD, 0x18 ++ addi.d S4, S4, 0x08 ++ fst.d F4, TD, 0x20 ++ addi.d S5, S5, 0x08 ++ fst.d F5, TD, 0x28 ++ addi.d S6, S6, 0x08 ++ fst.d F6, TD, 0x30 ++ addi.d S7, S7, 0x08 ++ fst.d F7, TD, 0x38 ++ addi.d S8, S8, 0x08 ++ addi.d TD, TD, 0x40 ++ ++ fld.d F0, S9, 0x00 ++ fld.d F1, S10, 0x00 ++ fld.d F2, S11, 0x00 ++ fld.d F3, S12, 0x00 ++ fld.d F4, S13, 0x00 ++ fld.d F5, S14, 0x00 ++ fld.d F6, S15, 0x00 ++ fld.d F7, S16, 0x00 ++ ++ fst.d F0, TD, 0x00 ++ addi.d S9, S9, 0x08 ++ fst.d F1, TD, 0x08 ++ addi.d S10, S10, 0x08 ++ fst.d F2, TD, 0x10 ++ addi.d S11, S11, 0x08 ++ fst.d F3, TD, 0x18 ++ addi.d S12, S12, 0x08 ++ fst.d F4, TD, 0x20 ++ addi.d S13, S13, 0x08 ++ fst.d F5, TD, 0x28 ++ addi.d S14, S14, 0x08 ++ fst.d F6, TD, 0x30 ++ addi.d S15, S15, 0x08 ++ fst.d F7, TD, 0x38 ++ addi.d S16, S16, 0x08 ++ addi.d TD, TD, 0x40 ++ ++ addi.d I, I, -1 ++ blt ZERO, I, .L_II1 ++ ++.L_I0: ++ blt ZERO, J, .L_J1 ++ ++.L_N8: ++ andi J, N, 0x08 ++ beq ZERO, J, .L_N4 ++ ++ move S1, TS ++ add.d S2, TS, TL ++ srai.d I, M, 0x03 ++ add.d S3, S2, TL ++ add.d S4, S2, T0 ++ add.d S5, S3, T0 ++ add.d S6, S4, T0 ++ add.d S7, S5, T0 ++ add.d S8, S6, T0 ++ add.d TS, S7, T0 ++ beq I, ZERO, .L_8I3 ++ ++.L_8I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ xvpackev.d D0, U1, U0 ++ xvpackod.d D1, U1, U0 ++ xvpackev.d D2, U3, U2 ++ xvpackod.d D3, U3, U2 ++ xvpackev.d D4, U5, U4 ++ xvpackod.d D5, U5, U4 ++ xvpackev.d D6, U7, U6 ++ xvpackod.d D7, U7, U6 ++ ++ xvand.v U0, D0, D0 ++ xvpermi.q D0, D2, 0x02 // 0 ++ xvand.v U4, D4, D4 ++ xvpermi.q D4, D6, 0x02 // 1 ++ xvand.v U1, D1, D1 ++ xvpermi.q D1, D3, 0x02 // 2 ++ xvand.v U5, D5, D5 ++ xvpermi.q D5, D7, 0x02 // 3 ++ xvpermi.q D2, U0, 0x31 // 4 ++ xvpermi.q D6, U4, 0x31 // 5 ++ xvpermi.q D3, U1, 0x31 // 6 ++ xvpermi.q D7, U5, 0x31 // 7 ++ ++ xvst D0, TD, 0x00 ++ xvst D4, TD, 0x20 ++ xvst D1, TD, 0x40 ++ xvst D5, TD, 0x60 ++ xvst D2, TD, 0x80 ++ xvst D6, TD, 0xA0 ++ xvst D3, TD, 0xC0 ++ xvst D7, TD, 0xE0 ++ addi.d TD, TD, 0x100 ++ ++ xvld U0, S1, 0x20 ++ xvld U1, S2, 0x20 ++ xvld U2, S3, 0x20 ++ xvld U3, S4, 0x20 ++ xvld U4, S5, 0x20 ++ xvld U5, S6, 0x20 ++ xvld U6, S7, 0x20 ++ xvld U7, S8, 0x20 ++ ++ xvpackev.d D0, U1, U0 ++ xvpackod.d D1, U1, U0 ++ xvpackev.d D2, U3, U2 ++ xvpackod.d D3, U3, U2 ++ xvpackev.d D4, U5, U4 ++ xvpackod.d D5, U5, U4 ++ xvpackev.d D6, U7, U6 ++ xvpackod.d D7, U7, U6 ++ ++ xvand.v U0, D0, D0 ++ xvpermi.q D0, D2, 0x02 // 0 ++ xvand.v U4, D4, D4 ++ xvpermi.q D4, D6, 0x02 // 1 ++ xvand.v U1, D1, D1 ++ xvpermi.q D1, D3, 0x02 // 2 ++ xvand.v U5, D5, D5 ++ xvpermi.q D5, D7, 0x02 // 3 ++ xvpermi.q D2, U0, 0x31 // 4 ++ xvpermi.q D6, U4, 0x31 // 5 ++ xvpermi.q D3, U1, 0x31 // 6 ++ xvpermi.q D7, U5, 0x31 // 7 ++ ++ xvst D0, TD, 0x00 ++ xvst D4, TD, 0x20 ++ xvst D1, TD, 0x40 ++ xvst D5, TD, 0x60 ++ xvst D2, TD, 0x80 ++ xvst D6, TD, 0xA0 ++ xvst D3, TD, 0xC0 ++ xvst D7, TD, 0xE0 ++ addi.d TD, TD, 0x100 ++ ++ addi.d S1, S1, 0x40 ++ addi.d S2, S2, 0x40 ++ addi.d S3, S3, 0x40 ++ addi.d S4, S4, 0x40 ++ addi.d S5, S5, 0x40 ++ addi.d S6, S6, 0x40 ++ addi.d S7, S7, 0x40 ++ addi.d S8, S8, 0x40 ++ ++ addi.d I, I, -1 ++ blt ZERO, I, .L_8I1 ++ ++.L_8I3: ++ andi I, M, 0x07 ++ beq I, ZERO, .L_N4 ++ ++.L_8I11: ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 
++ fld.d F2, S3, 0x00 ++ fld.d F3, S4, 0x00 ++ fld.d F4, S5, 0x00 ++ fld.d F5, S6, 0x00 ++ fld.d F6, S7, 0x00 ++ fld.d F7, S8, 0x00 ++ ++ fst.d F0, TD, 0x00 ++ addi.d S1, S1, 0x08 ++ fst.d F1, TD, 0x08 ++ addi.d S2, S2, 0x08 ++ fst.d F2, TD, 0x10 ++ addi.d S3, S3, 0x08 ++ fst.d F3, TD, 0x18 ++ addi.d S4, S4, 0x08 ++ fst.d F4, TD, 0x20 ++ addi.d S5, S5, 0x08 ++ fst.d F5, TD, 0x28 ++ addi.d S6, S6, 0x08 ++ fst.d F6, TD, 0x30 ++ addi.d S7, S7, 0x08 ++ fst.d F7, TD, 0x38 ++ addi.d S8, S8, 0x08 ++ ++ addi.d TD, TD, 0x40 ++ addi.d I, I, -1 ++ blt ZERO, I, .L_8I11 ++ ++.L_N4: ++ andi J, N, 0x04 ++ beq ZERO, J, .L_N2 ++ ++ move S1, TS ++ add.d S2, TS, TL ++ srai.d I, M, 0x02 ++ add.d S3, S2, TL ++ add.d S4, S2, T0 ++ add.d TS, S3, T0 ++ beq I, ZERO, .L_I3 ++ ++.L_4I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ xvpackev.d D0, U1, U0 ++ xvpackod.d D1, U1, U0 ++ xvpackev.d D2, U3, U2 ++ xvpackod.d D3, U3, U2 ++ ++ xvand.v U0, D0, D0 ++ xvpermi.q D0, D2, 0x02 // 0 ++ xvand.v U1, D1, D1 ++ xvpermi.q D1, D3, 0x02 // 1 ++ xvpermi.q D2, U0, 0x31 // 2 ++ xvpermi.q D3, U1, 0x31 // 3 ++ ++ xvst D0, TD, 0x00 ++ xvst D1, TD, 0x20 ++ xvst D2, TD, 0x40 ++ xvst D3, TD, 0x60 ++ ++ addi.d S1, S1, 0x20 ++ addi.d S2, S2, 0x20 ++ addi.d S3, S3, 0x20 ++ addi.d S4, S4, 0x20 ++ addi.d TD, TD, 0x80 ++ ++ addi.d I, I, -1 ++ blt ZERO, I, .L_4I1 ++ ++.L_I3: ++ andi I, M, 0x03 ++ beq I, ZERO, .L_N2 ++ ++.L_4II1: ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 ++ fld.d F2, S3, 0x00 ++ fld.d F3, S4, 0x00 ++ ++ fst.d F0, TD, 0x00 ++ addi.d S1, S1, 0x08 ++ fst.d F1, TD, 0x08 ++ addi.d S2, S2, 0x08 ++ fst.d F2, TD, 0x10 ++ addi.d S3, S3, 0x08 ++ fst.d F3, TD, 0x18 ++ addi.d S4, S4, 0x08 ++ ++ addi.d TD, TD, 0x20 ++ addi.d I, I, -1 ++ blt ZERO, I, .L_4II1 ++ ++.L_N2: ++ andi J, N, 0x02 ++ beq ZERO, J, .L_N1 ++ ++ move S1, TS ++ add.d S2, TS, TL ++ srai.d I, M, 0x01 ++ add.d TS, S2, TL ++ beq I, ZERO, .L_NI1 ++ ++.L_2I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ ++ xvpackev.d D0, U1, U0 ++ xvpackod.d D1, U1, U0 ++ ++ xvpermi.q D0, D1, 0x02 // 0 ++ ++ xvst D0, TD, 0x00 ++ ++ addi.d S1, S1, 0x10 ++ addi.d S2, S2, 0x10 ++ addi.d TD, TD, 0x20 ++ ++ addi.d I, I, -1 ++ blt ZERO, I, .L_2I1 ++ ++.L_NI1: ++ andi I, M, 0x01 ++ beq I, ZERO, .L_N1 ++ ++ ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 ++ ++ fst.d F0, TD, 0x00 ++ addi.d S1, S1, 0x08 ++ fst.d F1, TD, 0x08 ++ addi.d S2, S2, 0x08 ++ addi.d TD, TD, 0x10 ++ ++.L_N1: ++ move S1, TS ++ beq ZERO, M, .L_N0 ++ ++.L_M1: ++ fld.d F0, S1, 0x00 ++ addi.d S1, S1, 0x08 ++ fst.d F0, TD, 0x00 ++ addi.d TD, TD, 0x08 ++ addi.d M, M, -1 ++ blt ZERO, M, .L_M1 ++ ++.L_N0: ++ LDARG $r23, $sp, 0x00 ++ LDARG $r24, $sp, 0x08 ++ LDARG $r25, $sp, 0x10 ++ LDARG $r26, $sp, 0x18 ++ LDARG $r27, $sp, 0x20 ++ LDARG $r28, $sp, 0x28 ++ LDARG $r29, $sp, 0x30 ++ LDARG $r30, $sp, 0x38 ++ LDARG $r31, $sp, 0x40 ++ LD $f23, $sp, 0x48 ++ LD $f24, $sp, 0x50 ++ LD $f25, $sp, 0x58 ++ LD $f26, $sp, 0x60 ++ LD $f27, $sp, 0x68 ++ LD $f28, $sp, 0x70 ++ LD $f29, $sp, 0x78 ++ LD $f30, $sp, 0x80 ++ LD $f31, $sp, 0x88 ++ addi.d $sp, $sp, 0x90 ++ jirl $r0, $r1, 0x00 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/dgemm_ncopy_4.S b/kernel/loongarch64/dgemm_ncopy_4.S +new file mode 100644 +index 0000000..b1f322a +--- /dev/null ++++ b/kernel/loongarch64/dgemm_ncopy_4.S +@@ -0,0 +1,237 @@ ++/******************************************************************************* ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. 
++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define TD $r20 ++#define TS $r11 ++#define TL $r7 ++#define T0 $r23 ++#define ZERO $r0 ++ ++#define F0 $f0 ++#define F1 $f1 ++#define F2 $f2 ++#define F3 $f3 ++#define F4 $f4 ++#define F5 $f5 ++#define F6 $f6 ++#define F7 $f7 ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define D0 $xr14 ++#define D1 $xr8 ++#define D2 $xr9 ++#define D3 $xr10 ++#define D4 $xr11 ++#define D5 $xr12 ++#define D6 $xr13 ++#define D7 $xr15 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -8 ++ SDARG $r23, $sp, 0 ++ ++ move TD, DST ++ move TS, SRC ++ slli.d TL, LDA, 0x03 ++ slli.d T0, TL, 0x01 ++ srai.d J, N, 0x02 ++ beq J, ZERO, .L_N2 ++ ++.L_J1: /* J-- */ ++ move S1, TS ++ add.d S2, TS, TL ++ srai.d I, M, 0x02 ++ add.d S3, S2, TL ++ add.d S4, S2, T0 ++ add.d TS, S3, T0 ++ addi.d J, J, -1 ++ beq I, ZERO, .L_I3 ++ ++.L_I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ xvpackev.d D0, U1, U0 ++ xvpackod.d D1, U1, U0 ++ xvpackev.d D2, U3, U2 ++ xvpackod.d D3, U3, U2 ++ ++ xvand.v U0, D0, D0 ++ xvpermi.q D0, D2, 0x02 // 0 ++ xvand.v U1, D1, D1 ++ xvpermi.q D1, D3, 0x02 // 1 ++ xvpermi.q D2, U0, 0x31 // 2 ++ xvpermi.q D3, U1, 0x31 // 3 ++ ++ xvst D0, TD, 0x00 ++ xvst D1, TD, 0x20 ++ xvst D2, TD, 0x40 ++ xvst D3, TD, 0x60 ++ ++ addi.d S1, S1, 0x20 ++ addi.d S2, S2, 0x20 ++ addi.d S3, S3, 0x20 ++ addi.d S4, S4, 0x20 ++ addi.d TD, TD, 0x80 ++ ++ addi.d I, I, -1 ++ blt ZERO, I, .L_I1 ++ ++.L_I3: ++ andi I, M, 0x03 ++ beq I, 
ZERO, .L_I0 ++ ++.L_II1: ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 ++ fld.d F2, S3, 0x00 ++ fld.d F3, S4, 0x00 ++ ++ fst.d F0, TD, 0x00 ++ addi.d S1, S1, 0x08 ++ fst.d F1, TD, 0x08 ++ addi.d S2, S2, 0x08 ++ fst.d F2, TD, 0x10 ++ addi.d S3, S3, 0x08 ++ fst.d F3, TD, 0x18 ++ addi.d S4, S4, 0x08 ++ ++ addi.d TD, TD, 0x20 ++ addi.d I, I, -1 ++ blt ZERO, I, .L_II1 ++ ++.L_I0: ++ blt ZERO, J, .L_J1 ++ ++.L_N2: ++ andi J, N, 0x02 ++ beq ZERO, J, .L_N1 ++ ++ move S1, TS ++ add.d S2, TS, TL ++ srai.d I, M, 0x02 ++ add.d TS, S2, TL ++ beq I, ZERO, .L_2I3 ++ ++.L_2I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ ++ xvpackev.d D0, U1, U0 ++ xvpackod.d D1, U1, U0 ++ ++ xvand.v U0, D0, D0 ++ xvpermi.q D0, D1, 0x02 // 0 ++ xvpermi.q D1, U0, 0x31 // 1 ++ ++ xvst D0, TD, 0x00 ++ xvst D1, TD, 0x20 ++ addi.d S1, S1, 0x20 ++ addi.d S2, S2, 0x20 ++ addi.d TD, TD, 0x40 ++ addi.d I, I, -1 ++ blt ZERO, I, .L_2I1 ++ ++.L_2I3: ++ andi I, M, 0x03 ++ beq ZERO, I, .L_N1 ++ ++.L_2II1: /* I-- */ ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 ++ fst.d F0, TD, 0x00 ++ addi.d I, I, -1 ++ fst.d F1, TD, 0x08 ++ addi.d S1, S1, 0x08 ++ addi.d S2, S2, 0x08 ++ addi.d TD, TD, 0x10 ++ blt ZERO, I, .L_2II1 ++ ++.L_N1: ++ andi J, N, 0x01 ++ beq ZERO, J, .L_N0 ++ ++ move S1, TS ++ srai.d I, M, 0x02 ++ beq ZERO, I, .L_1I3 ++ ++.L_1I1: ++ xvld U0, S1, 0x00 ++ addi.d S1, S1, 0x20 ++ xvst U0, TD, 0x00 ++ addi.d I, I, -1 ++ addi.d TD, TD, 0x20 ++ blt ZERO, I, .L_1I1 ++ ++.L_1I3: ++ andi I, M, 0x03 ++ beq ZERO, I, .L_N0 ++ ++.L_1II1: ++ fld.d F0, S1, 0x00 ++ addi.d S1, S1, 0x08 ++ fst.d F0, TD, 0x00 ++ addi.d I, I, -1 ++ addi.d TD, TD, 0x08 ++ blt ZERO, I, .L_1II1 ++ ++.L_N0: ++ LDARG $r23, $sp, 0 ++ addi.d $sp, $sp, 8 ++ jirl $r0, $r1, 0x00 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/dgemm_tcopy_16.S b/kernel/loongarch64/dgemm_tcopy_16.S +new file mode 100644 +index 0000000..afafe5b +--- /dev/null ++++ b/kernel/loongarch64/dgemm_tcopy_16.S +@@ -0,0 +1,710 @@ ++/******************************************************************************* ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S0 $r11 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define P0 $r20 ++#define P1 $r23 ++#define P2 $r24 ++#define P3 $r25 ++#define P4 $r26 ++#define P5 $r27 ++#define T0 $r28 ++#define T1 $r29 ++#define TL $r7 ++#define ZERO $r0 ++ ++#define F0 $f0 ++#define F1 $f1 ++#define F2 $f2 ++#define F3 $f3 ++#define F4 $f4 ++#define F5 $f5 ++#define F6 $f6 ++#define F7 $f7 ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -56 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ SDARG $r28, $sp, 40 ++ SDARG $r29, $sp, 48 ++ ++ move S0, SRC ++ move P0, DST ++ ++ srai.d T0, N, 0x04 ++ srai.d T1, N, 0x03 ++ slli.d T0, T0, 0x04 ++ slli.d T1, T1, 0x03 ++ mul.d P2, M, T0 ++ mul.d P3, M, T1 ++ slli.d P2, P2, 0x03 ++ slli.d P3, P3, 0x03 ++ add.d P2, DST, P2 ++ add.d P3, DST, P3 ++ ++ srai.d T0, N, 0x02 ++ srai.d T1, N, 0x01 ++ slli.d T0, T0, 0x02 ++ slli.d T1, T1, 0x01 ++ mul.d P4, M, T0 ++ mul.d P5, M, T1 ++ slli.d P4, P4, 0x03 ++ slli.d P5, P5, 0x03 ++ add.d P4, DST, P4 ++ add.d P5, DST, P5 ++ ++ slli.d TL, LDA, 0x03 ++ srai.d J, M, 0x03 ++ slli.d T0, TL, 0x01 ++ slli.d T1, M, 0x07 ++ beq ZERO, J, .L_M7 ++ ++.L_J1: /* J-- */ ++ move S1, S0 ++ add.d S2, S0, TL ++ add.d S3, S1, T0 ++ add.d S4, S2, T0 ++ add.d S5, S3, T0 ++ add.d S6, S4, T0 ++ add.d S7, S5, T0 ++ add.d S8, S6, T0 ++ add.d S0, S7, T0 ++ ++ move P1, P0 ++ addi.d P0, P0, 0x400 ++ ++ srai.d I, N, 0x04 ++ addi.d J, J, -1 ++ beq ZERO, I, .L_N15 ++ ++.L_I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S1, 0x40 ++ xvld U3, S1, 0x60 ++ xvld U4, S2, 0x00 ++ xvld U5, S2, 0x20 ++ xvld U6, S2, 0x40 ++ xvld U7, S2, 0x60 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ xvst U4, P1, 0x80 ++ xvst U5, P1, 0xA0 ++ xvst U6, P1, 0xC0 ++ xvst U7, P1, 0xE0 ++ ++ xvld U0, S3, 0x00 ++ xvld U1, S3, 0x20 ++ xvld U2, S3, 0x40 ++ xvld U3, S3, 0x60 ++ xvld U4, S4, 0x00 ++ xvld U5, S4, 0x20 ++ xvld U6, S4, 0x40 ++ xvld U7, S4, 0x60 ++ ++ xvst U0, P1, 0x100 ++ xvst U1, P1, 0x120 ++ xvst U2, P1, 0x140 ++ xvst U3, P1, 0x160 ++ xvst U4, P1, 0x180 ++ xvst U5, P1, 0x1A0 ++ xvst U6, P1, 0x1C0 ++ xvst U7, P1, 0x1E0 ++ ++ xvld U0, S5, 0x00 ++ xvld U1, S5, 0x20 ++ xvld U2, S5, 0x40 ++ xvld U3, S5, 0x60 ++ xvld U4, S6, 0x00 ++ xvld U5, S6, 0x20 ++ xvld U6, S6, 0x40 ++ xvld U7, S6, 0x60 ++ ++ xvst U0, P1, 0x200 ++ xvst U1, P1, 0x220 ++ 
xvst U2, P1, 0x240 ++ xvst U3, P1, 0x260 ++ xvst U4, P1, 0x280 ++ xvst U5, P1, 0x2A0 ++ xvst U6, P1, 0x2C0 ++ xvst U7, P1, 0x2E0 ++ ++ xvld U0, S7, 0x00 ++ xvld U1, S7, 0x20 ++ xvld U2, S7, 0x40 ++ xvld U3, S7, 0x60 ++ xvld U4, S8, 0x00 ++ xvld U5, S8, 0x20 ++ xvld U6, S8, 0x40 ++ xvld U7, S8, 0x60 ++ ++ xvst U0, P1, 0x300 ++ xvst U1, P1, 0x320 ++ xvst U2, P1, 0x340 ++ xvst U3, P1, 0x360 ++ xvst U4, P1, 0x380 ++ xvst U5, P1, 0x3A0 ++ xvst U6, P1, 0x3C0 ++ xvst U7, P1, 0x3E0 ++ ++ addi.d S1, S1, 0x80 ++ addi.d S2, S2, 0x80 ++ addi.d S3, S3, 0x80 ++ addi.d S4, S4, 0x80 ++ addi.d S5, S5, 0x80 ++ addi.d S6, S6, 0x80 ++ addi.d S7, S7, 0x80 ++ addi.d S8, S8, 0x80 ++ addi.d I, I, -1 ++ add.d P1, P1, T1 ++ blt ZERO, I, .L_I1 ++ ++.L_N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_N7 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S2, 0x00 ++ xvld U3, S2, 0x20 ++ xvld U4, S3, 0x00 ++ xvld U5, S3, 0x20 ++ xvld U6, S4, 0x00 ++ xvld U7, S4, 0x20 ++ ++ xvst U0, P2, 0x00 ++ xvst U1, P2, 0x20 ++ xvst U2, P2, 0x40 ++ xvst U3, P2, 0x60 ++ xvst U4, P2, 0x80 ++ xvst U5, P2, 0xA0 ++ xvst U6, P2, 0xC0 ++ xvst U7, P2, 0xE0 ++ ++ xvld U0, S5, 0x00 ++ xvld U1, S5, 0x20 ++ xvld U2, S6, 0x00 ++ xvld U3, S6, 0x20 ++ xvld U4, S7, 0x00 ++ xvld U5, S7, 0x20 ++ xvld U6, S8, 0x00 ++ xvld U7, S8, 0x20 ++ ++ xvst U0, P2, 0x100 ++ xvst U1, P2, 0x120 ++ xvst U2, P2, 0x140 ++ xvst U3, P2, 0x160 ++ xvst U4, P2, 0x180 ++ xvst U5, P2, 0x1A0 ++ xvst U6, P2, 0x1C0 ++ xvst U7, P2, 0x1E0 ++ ++ addi.d S1, S1, 0x40 ++ addi.d S2, S2, 0x40 ++ addi.d S3, S3, 0x40 ++ addi.d S4, S4, 0x40 ++ addi.d S5, S5, 0x40 ++ addi.d S6, S6, 0x40 ++ addi.d S7, S7, 0x40 ++ addi.d S8, S8, 0x40 ++ addi.d P2, P2, 0x200 ++ ++.L_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_N3 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ xvst U0, P3, 0x00 ++ xvst U1, P3, 0x20 ++ xvst U2, P3, 0x40 ++ xvst U3, P3, 0x60 ++ xvst U4, P3, 0x80 ++ xvst U5, P3, 0xA0 ++ xvst U6, P3, 0xC0 ++ xvst U7, P3, 0xE0 ++ ++ addi.d S1, S1, 0x20 ++ addi.d S2, S2, 0x20 ++ addi.d S3, S3, 0x20 ++ addi.d S4, S4, 0x20 ++ addi.d S5, S5, 0x20 ++ addi.d S6, S6, 0x20 ++ addi.d S7, S7, 0x20 ++ addi.d S8, S8, 0x20 ++ addi.d P3, P3, 0x100 ++ ++.L_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_N1 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ xvpermi.q U0, U1, 0x02 ++ xvpermi.q U2, U3, 0x02 ++ xvpermi.q U4, U5, 0x02 ++ xvpermi.q U6, U7, 0x02 ++ ++ xvst U0, P4, 0x00 ++ xvst U2, P4, 0x20 ++ xvst U4, P4, 0x40 ++ xvst U6, P4, 0x60 ++ ++ addi.d S1, S1, 0x10 ++ addi.d S2, S2, 0x10 ++ addi.d S3, S3, 0x10 ++ addi.d S4, S4, 0x10 ++ addi.d S5, S5, 0x10 ++ addi.d S6, S6, 0x10 ++ addi.d S7, S7, 0x10 ++ addi.d S8, S8, 0x10 ++ addi.d P4, P4, 0x80 ++ ++.L_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_N0 ++ ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 ++ fld.d F2, S3, 0x00 ++ fld.d F3, S4, 0x00 ++ fld.d F4, S5, 0x00 ++ fld.d F5, S6, 0x00 ++ fld.d F6, S7, 0x00 ++ fld.d F7, S8, 0x00 ++ ++ fst.d F0, P5, 0x00 ++ fst.d F1, P5, 0x08 ++ fst.d F2, P5, 0x10 ++ fst.d F3, P5, 0x18 ++ fst.d F4, P5, 0x20 ++ fst.d F5, P5, 0x28 ++ fst.d F6, P5, 0x30 ++ fst.d F7, P5, 0x38 ++ ++ addi.d S1, S1, 0x08 ++ addi.d S2, S2, 0x08 ++ addi.d S3, S3, 0x08 ++ addi.d S4, S4, 0x08 ++ addi.d S5, S5, 0x08 ++ addi.d S6, S6, 0x08 ++ addi.d S7, S7, 0x08 ++ addi.d S8, S8, 0x08 ++ addi.d P5, P5, 0x40 ++ ++.L_N0: ++ 
blt ZERO, J, .L_J1 ++ ++.L_M7: ++ andi J, M, 0x04 ++ beq ZERO, J, .L_M3 ++ ++ move S1, S0 ++ add.d S2, S0, TL ++ add.d S3, S1, T0 ++ add.d S4, S2, T0 ++ add.d S0, S3, T0 ++ ++ move P1, P0 ++ addi.d P0, P0, 0x200 ++ ++ srai.d I, N, 0x04 ++ beq ZERO, I, .L_4N15 ++ ++.L_4I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S1, 0x40 ++ xvld U3, S1, 0x60 ++ xvld U4, S2, 0x00 ++ xvld U5, S2, 0x20 ++ xvld U6, S2, 0x40 ++ xvld U7, S2, 0x60 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ xvst U4, P1, 0x80 ++ xvst U5, P1, 0xA0 ++ xvst U6, P1, 0xC0 ++ xvst U7, P1, 0xE0 ++ ++ xvld U0, S3, 0x00 ++ xvld U1, S3, 0x20 ++ xvld U2, S3, 0x40 ++ xvld U3, S3, 0x60 ++ xvld U4, S4, 0x00 ++ xvld U5, S4, 0x20 ++ xvld U6, S4, 0x40 ++ xvld U7, S4, 0x60 ++ ++ xvst U0, P1, 0x100 ++ xvst U1, P1, 0x120 ++ xvst U2, P1, 0x140 ++ xvst U3, P1, 0x160 ++ xvst U4, P1, 0x180 ++ xvst U5, P1, 0x1A0 ++ xvst U6, P1, 0x1C0 ++ xvst U7, P1, 0x1E0 ++ ++ addi.d S1, S1, 0x80 ++ addi.d S2, S2, 0x80 ++ addi.d S3, S3, 0x80 ++ addi.d S4, S4, 0x80 ++ addi.d I, I, -1 ++ add.d P1, P1, T1 ++ blt ZERO, I, .L_4I1 ++ ++.L_4N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_4N7 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S2, 0x00 ++ xvld U3, S2, 0x20 ++ xvld U4, S3, 0x00 ++ xvld U5, S3, 0x20 ++ xvld U6, S4, 0x00 ++ xvld U7, S4, 0x20 ++ ++ xvst U0, P2, 0x00 ++ xvst U1, P2, 0x20 ++ xvst U2, P2, 0x40 ++ xvst U3, P2, 0x60 ++ xvst U4, P2, 0x80 ++ xvst U5, P2, 0xA0 ++ xvst U6, P2, 0xC0 ++ xvst U7, P2, 0xE0 ++ ++ addi.d S1, S1, 0x40 ++ addi.d S2, S2, 0x40 ++ addi.d S3, S3, 0x40 ++ addi.d S4, S4, 0x40 ++ addi.d P2, P2, 0x100 ++ ++.L_4N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_4N3 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ xvst U0, P3, 0x00 ++ xvst U1, P3, 0x20 ++ xvst U2, P3, 0x40 ++ xvst U3, P3, 0x60 ++ ++ addi.d S1, S1, 0x20 ++ addi.d S2, S2, 0x20 ++ addi.d S3, S3, 0x20 ++ addi.d S4, S4, 0x20 ++ addi.d P3, P3, 0x80 ++ ++.L_4N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_4N1 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ xvpermi.q U0, U1, 0x02 ++ xvpermi.q U2, U3, 0x02 ++ ++ xvst U0, P4, 0x00 ++ xvst U2, P4, 0x20 ++ ++ addi.d S1, S1, 0x10 ++ addi.d S2, S2, 0x10 ++ addi.d S3, S3, 0x10 ++ addi.d S4, S4, 0x10 ++ addi.d P4, P4, 0x40 ++ ++.L_4N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M3 ++ ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 ++ fld.d F2, S3, 0x00 ++ fld.d F3, S4, 0x00 ++ ++ fst.d F0, P5, 0x00 ++ fst.d F1, P5, 0x08 ++ fst.d F2, P5, 0x10 ++ fst.d F3, P5, 0x18 ++ ++ addi.d S1, S1, 0x08 ++ addi.d S2, S2, 0x08 ++ addi.d S3, S3, 0x08 ++ addi.d S4, S4, 0x08 ++ addi.d P5, P5, 0x20 ++ ++.L_M3: ++ andi J, M, 0x02 ++ beq ZERO, J, .L_M1 ++ ++ move S1, S0 ++ add.d S2, S0, TL ++ add.d S0, S0, T0 ++ ++ move P1, P0 ++ addi.d P0, P0, 0x100 ++ ++ srai.d I, N, 0x04 ++ beq ZERO, I, .L_2N15 ++ ++.L_2I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S1, 0x40 ++ xvld U3, S1, 0x60 ++ xvld U4, S2, 0x00 ++ xvld U5, S2, 0x20 ++ xvld U6, S2, 0x40 ++ xvld U7, S2, 0x60 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ xvst U4, P1, 0x80 ++ xvst U5, P1, 0xA0 ++ xvst U6, P1, 0xC0 ++ xvst U7, P1, 0xE0 ++ ++ addi.d S1, S1, 0x80 ++ addi.d S2, S2, 0x80 ++ addi.d I, I, -1 ++ add.d P1, P1, T1 ++ blt ZERO, I, .L_2I1 ++ ++.L_2N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_2N7 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S2, 0x00 ++ xvld U3, S2, 0x20 ++ ++ xvst U0, P2, 0x00 ++ 
xvst U1, P2, 0x20 ++ xvst U2, P2, 0x40 ++ xvst U3, P2, 0x60 ++ ++ addi.d S1, S1, 0x40 ++ addi.d S2, S2, 0x40 ++ addi.d P2, P2, 0x80 ++ ++.L_2N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_2N3 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ ++ xvst U0, P3, 0x00 ++ xvst U1, P3, 0x20 ++ ++ addi.d S1, S1, 0x20 ++ addi.d S2, S2, 0x20 ++ addi.d P3, P3, 0x40 ++ ++.L_2N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_2N1 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ ++ xvpermi.q U0, U1, 0x02 ++ ++ xvst U0, P4, 0x00 ++ ++ addi.d S1, S1, 0x10 ++ addi.d S2, S2, 0x10 ++ addi.d P4, P4, 0x20 ++ ++.L_2N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M1 ++ ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 ++ ++ fst.d F0, P5, 0x00 ++ fst.d F1, P5, 0x08 ++ ++ addi.d S1, S1, 0x08 ++ addi.d S2, S2, 0x08 ++ addi.d P5, P5, 0x10 ++ ++.L_M1: ++ andi J, M, 0x01 ++ beq ZERO, J, .L_M0 ++ ++ move S1, S0 ++ add.d S2, S0, TL ++ ++ move P1, P0 ++ addi.d P0, P0, 0x80 ++ ++ srai.d I, N, 0x04 ++ beq ZERO, I, .L_1N15 ++ ++.L_1I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S1, 0x40 ++ xvld U3, S1, 0x60 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ ++ addi.d S1, S1, 0x80 ++ addi.d I, I, -1 ++ add.d P1, P1, T1 ++ blt ZERO, I, .L_1I1 ++ ++.L_1N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_1N7 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ ++ xvst U0, P2, 0x00 ++ xvst U1, P2, 0x20 ++ ++ addi.d S1, S1, 0x40 ++ addi.d P2, P2, 0x40 ++ ++.L_1N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_1N3 ++ ++ xvld U0, S1, 0x00 ++ ++ xvst U0, P3, 0x00 ++ ++ addi.d S1, S1, 0x20 ++ addi.d P3, P3, 0x20 ++ ++.L_1N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_1N1 ++ ++ fld.d F0, S1, 0x00 ++ fld.d F1, S1, 0x08 ++ ++ fst.d F0, P4, 0x00 ++ fst.d F1, P4, 0x08 ++ ++ addi.d S1, S1, 0x10 ++ addi.d P4, P4, 0x10 ++ ++.L_1N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M0 ++ ++ fld.d F0, S1, 0x00 ++ ++ fst.d F0, P5, 0x00 ++ ++ addi.d S1, S1, 0x08 ++ addi.d P5, P5, 0x08 ++ ++.L_M0: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 16 ++ LDARG $r26, $sp, 24 ++ LDARG $r27, $sp, 32 ++ LDARG $r28, $sp, 40 ++ LDARG $r29, $sp, 48 ++ addi.d $sp, $sp, 56 ++ jirl $r0, $r1, 0x00 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/dgemm_tcopy_4.S b/kernel/loongarch64/dgemm_tcopy_4.S +new file mode 100644 +index 0000000..700989c +--- /dev/null ++++ b/kernel/loongarch64/dgemm_tcopy_4.S +@@ -0,0 +1,270 @@ ++/******************************************************************************* ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S0 $r11 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define P0 $r16 ++#define P1 $r17 ++#define P2 $r18 ++#define P3 $r19 ++#define T0 $r20 ++#define T1 $r23 ++#define TL $r7 ++#define ZERO $r0 ++ ++#define F0 $f0 ++#define F1 $f1 ++#define F2 $f2 ++#define F3 $f3 ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -8 ++ SDARG $r23, $sp, 0 ++ ++ move S0, SRC ++ move P0, DST ++ ++ srai.d T0, N, 0x02 ++ slli.d T0, T0, 0x02 ++ srai.d T1, N, 0x01 ++ slli.d T1, T1, 0x01 ++ mul.d T0, M, T0 ++ mul.d T1, M, T1 ++ slli.d T0, T0, 0x03 ++ slli.d T1, T1, 0x03 ++ add.d P2, DST, T0 ++ add.d P3, DST, T1 ++ ++ slli.d TL, LDA, 0x03 ++ srai.d J, M, 0x02 ++ slli.d T0, TL, 0x01 ++ slli.d T1, M, 0x05 ++ beq ZERO, J, .L_M3 ++ ++.L_J1: /* J-- */ ++ move S1, S0 ++ add.d S2, S0, TL ++ add.d S3, S1, T0 ++ add.d S4, S2, T0 ++ add.d S0, S3, T0 ++ ++ move P1, P0 ++ addi.d P0, P0, 0x80 ++ ++ srai.d I, N, 0x02 ++ addi.d J, J, -1 ++ beq ZERO, I, .L_N3 ++ ++.L_I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ ++ addi.d S1, S1, 0x20 ++ addi.d S2, S2, 0x20 ++ addi.d S3, S3, 0x20 ++ addi.d S4, S4, 0x20 ++ add.d P1, P1, T1 ++ ++ addi.d I, I, -1 ++ blt ZERO, I, .L_I1 ++ ++.L_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_N1 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ xvpermi.q U0, U1, 0x02 ++ xvpermi.q U2, U3, 0x02 ++ ++ xvst U0, P2, 0x00 ++ xvst U2, P2, 0x20 ++ ++ addi.d S1, S1, 0x10 ++ addi.d S2, S2, 0x10 ++ addi.d S3, S3, 0x10 ++ addi.d S4, S4, 0x10 ++ addi.d P2, P2, 0x40 ++ ++.L_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_N0 ++ ++ fld.d F0, S1, 0x00 ++ fld.d F1, S2, 0x00 ++ fld.d F2, S3, 0x00 ++ fld.d F3, S4, 0x00 ++ ++ fst.d F0, P3, 0x00 ++ fst.d F1, P3, 0x08 ++ fst.d F2, P3, 0x10 ++ fst.d F3, P3, 0x18 ++ ++ addi.d S1, S1, 0x08 ++ addi.d S2, S2, 0x08 ++ addi.d S3, S3, 0x08 ++ addi.d S4, S4, 0x08 ++ addi.d P3, P3, 0x20 ++ ++.L_N0: ++ blt ZERO, J, .L_J1 ++ ++.L_M3: ++ andi J, M, 0x02 ++ beq ZERO, J, .L_M1 ++ ++ move S1, S0 ++ add.d S2, S0, TL ++ add.d S0, S0, T0 ++ ++ move P1, P0 ++ addi.d P0, P0, 0x40 ++ ++ srai.d I, N, 0x02 ++ beq ZERO, I, .L_2N3 ++ ++.L_2I1: /* I-- */ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ ++ addi.d S1, S1, 0x20 ++ addi.d S2, S2, 0x20 ++ addi.d I, I, -1 ++ add.d P1, P1, T1 ++ ++ blt ZERO, I, .L_2I1 ++ ++.L_2N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_2N1 ++ ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 
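++ /* The two xvld above fetch four doubles from each of the two source rows,
++    but only the low pair of each is kept; the xvpermi.q below concatenates
++    those low halves so a single 32-byte store writes the 2x2 block in
++    row-major order.  Scalar sketch with illustrative names only (s1/s2 =
++    current row pointers, b = pack buffer):
++      b[0] = s1[0]; b[1] = s1[1];
++      b[2] = s2[0]; b[3] = s2[1];
++ */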
++
++ xvpermi.q U0, U1, 0x02
++
++ xvst U0, P2, 0x00
++
++ addi.d S1, S1, 0x10
++ addi.d S2, S2, 0x10
++ addi.d P2, P2, 0x20
++
++.L_2N1:
++ andi I, N, 0x01
++ beq ZERO, I, .L_M1
++
++ fld.d F0, S1, 0x00
++ fld.d F1, S2, 0x00
++
++ fst.d F0, P3, 0x00
++ fst.d F1, P3, 0x08
++
++ addi.d S1, S1, 0x08
++ addi.d S2, S2, 0x08
++ addi.d P3, P3, 0x10
++
++.L_M1:
++ andi J, M, 0x01
++ beq ZERO, J, .L_M0
++
++ move S1, S0
++ move P1, P0
++
++ srai.d I, N, 0x02
++ beq ZERO, I, .L_1N3
++
++.L_1I1:
++ xvld U0, S1, 0x00
++
++ xvst U0, P1, 0x00
++
++ addi.d S1, S1, 0x20
++ addi.d I, I, -1
++ add.d P1, P1, T1
++
++ blt ZERO, I, .L_1I1
++
++.L_1N3:
++ andi I, N, 0x02
++ beq I, ZERO, .L_1N1
++
++ fld.d F0, S1, 0x00
++ fld.d F1, S1, 0x08
++
++ fst.d F0, P2, 0x00
++ fst.d F1, P2, 0x08
++
++ addi.d S1, S1, 0x10
++ addi.d P2, P2, 0x10
++
++.L_1N1:
++ andi I, N, 0x01
++ beq I, ZERO, .L_M0
++
++ fld.d F0, S1, 0x00
++
++ fst.d F0, P3, 0x00
++
++.L_M0:
++ LDARG $r23, $sp, 0
++ addi.d $sp, $sp, 8
++ jirl $r0, $r1, 0x00
++
++ EPILOGUE
+diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S
+new file mode 100644
+index 0000000..ff937ae
+--- /dev/null
++++ b/kernel/loongarch64/dnrm2.S
+@@ -0,0 +1,324 @@
++/***************************************************************************
++Copyright (c) 2021, The OpenBLAS Project
++All rights reserved.
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are
++met:
++1. Redistributions of source code must retain the above copyright
++notice, this list of conditions and the following disclaimer.
++2. Redistributions in binary form must reproduce the above copyright
++notice, this list of conditions and the following disclaimer in
++the documentation and/or other materials provided with the
++distribution.
++3. Neither the name of the OpenBLAS project nor the names of
++its contributors may be used to endorse or promote products
++derived from this software without specific prior written permission.
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define XX $r7 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++#define ALPHA $f4 ++#define max $f5 ++#define INF $f6 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ // Init INF ++ addi.d TEMP, $r0, 0x7FF ++ slli.d TEMP, TEMP, 52 ++ MTC INF, TEMP ++ ++ MTC s1, $r0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, INCX, .L999 ++ move XX, X ++ NOP ++ LD a1, X, 0 * SIZE ++ addi.d N, N, -1 ++ add.d X, X, INCX ++ FABS s1, a1 ++ FABS s2, a1 ++ bge $r0, N, .L999 ++ FABS s3, a1 ++ srai.d I, N, 3 ++ FABS s4, a1 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a8, X, 0 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ add.d X, X, INCX ++ FABS t3, a3 ++ LD a2, X, 0 * SIZE ++ FABS t4, a4 ++ add.d X, X, INCX ++ CMPLT $fcc0, s1, t1 ++ LD a3, X, 0 * SIZE ++ CMPLT $fcc1, s2, t2 ++ add.d X, X, INCX ++ CMPLT $fcc2, s3, t3 ++ LD a4, X, 0 * SIZE ++ CMPLT $fcc3, s4, t4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ FABS t1, a5 ++ LD a5, X, 0 * SIZE ++ FABS t2, a6 ++ add.d X, X, INCX ++ FABS t3, a7 ++ LD a6, X, 0 * SIZE ++ FABS t4, a8 ++ add.d X, X, INCX ++ CMPLT $fcc0, s1, t1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, s2, t2 ++ add.d X, X, INCX ++ CMPLT $fcc2, s3, t3 ++ LD a8, X, 0 * SIZE ++ CMPLT $fcc3, s4, t4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ addi.d I, I, -1 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t2 ++ CMPLT $fcc2, s3, t3 ++ CMPLT $fcc3, s4, t4 ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ FABS t1, a5 ++ FABS t2, a6 ++ FABS t3, a7 ++ FABS t4, a8 ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t2 ++ CMPLT $fcc2, s3, t3 ++ CMPLT $fcc3, s4, t4 ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L100 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ FABS t1, a1 ++ CMPLT $fcc0, s1, t1 ++ CMOVT s1, s1, t1, $fcc0 ++ add.d X, X, INCX ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L100: ++ CMPLT $fcc0, s1, s2 ++ CMPLT $fcc1, s3, s4 ++ CMOVT s1, s1, s2, $fcc0 ++ CMOVT s3, s3, s4, $fcc1 ++ CMPLT $fcc0, s1, s3 ++ CMOVT s1, s1, s3, $fcc0 ++ addi.d N, N, 1 ++ lu12i.w TEMP, 0x3f800 ++ movgr2fr.d a1, $r0 ++ movgr2fr.w ALPHA, TEMP ++ CMPEQ $fcc0, s1, a1 ++ fcvt.d.s ALPHA, ALPHA ++ bcnez $fcc0, .L999 ++ ++ fdiv.d ALPHA, ALPHA, s1 ++ CMPEQ $fcc0, INF, ALPHA ++ bcnez $fcc0, .L999 ++ ++ MOV max, s1 ++ MOV s1, 
a1 ++ MOV s2, a1 ++ MOV s3, a1 ++ MOV s4, a1 ++ srai.d I, N, 3 ++ bge $r0, I, .L105 ++ LD a1, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD a2, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD a3, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD a4, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD a5, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD a6, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD a7, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD a8, XX, 0 * SIZE ++ addi.d I, I, -1 ++ add.d XX, XX, INCX ++ bge $r0, I, .L104 ++ .align 3 ++ ++.L103: ++ MUL t1, ALPHA, a1 ++ LD a1, XX, 0 * SIZE ++ MUL t2, ALPHA, a2 ++ add.d XX, XX, INCX ++ MUL t3, ALPHA, a3 ++ LD a2, XX, 0 * SIZE ++ MUL t4, ALPHA, a4 ++ add.d XX, XX, INCX ++ MADD s1, t1, t1, s1 ++ LD a3, XX, 0 * SIZE ++ MADD s2, t2, t2, s2 ++ add.d XX, XX, INCX ++ MADD s3, t3, t3, s3 ++ LD a4, XX, 0 * SIZE ++ MADD s4, t4, t4, s4 ++ add.d XX, XX, INCX ++ MUL t1, ALPHA, a5 ++ LD a5, XX, 0 * SIZE ++ MUL t2, ALPHA, a6 ++ add.d XX, XX, INCX ++ MUL t3, ALPHA, a7 ++ LD a6, XX, 0 * SIZE ++ MUL t4, ALPHA, a8 ++ add.d XX, XX, INCX ++ MADD s1, t1, t1, s1 ++ LD a7, XX, 0 * SIZE ++ MADD s2, t2, t2, s2 ++ add.d XX, XX, INCX ++ MADD s3, t3, t3, s3 ++ LD a8, XX, 0 * SIZE ++ MADD s4, t4, t4, s4 ++ addi.d I, I, -1 ++ add.d XX, XX, INCX ++ blt $r0, I, .L103 ++ .align 3 ++ ++.L104: ++ MUL t1, ALPHA, a1 ++ MUL t2, ALPHA, a2 ++ MUL t3, ALPHA, a3 ++ MUL t4, ALPHA, a4 ++ MADD s1, t1, t1, s1 ++ MADD s2, t2, t2, s2 ++ MADD s3, t3, t3, s3 ++ MADD s4, t4, t4, s4 ++ MUL t1, ALPHA, a5 ++ MUL t2, ALPHA, a6 ++ MUL t3, ALPHA, a7 ++ MUL t4, ALPHA, a8 ++ MADD s1, t1, t1, s1 ++ MADD s2, t2, t2, s2 ++ MADD s3, t3, t3, s3 ++ MADD s4, t4, t4, s4 ++ .align 3 ++ ++.L105: ++ andi I, N, 7 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L106: ++ LD a1, XX, 0 * SIZE ++ addi.d I, I, -1 ++ MUL t1, ALPHA, a1 ++ add.d XX, XX, INCX ++ MADD s1, t1, t1, s1 ++ blt $r0, I, .L106 ++ .align 3 ++ ++.L998: ++ ADD s1, s1, s2 ++ ADD s3, s3, s4 ++ ADD s1, s1, s3 ++ fsqrt.d s1, s1 ++ move $r4, $r17 ++ MUL $f0, max, s1 ++ jirl $r0, $r1, 0x0 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S +new file mode 100644 +index 0000000..1e4c81a +--- /dev/null ++++ b/kernel/loongarch64/dot.S +@@ -0,0 +1,391 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define Y $r7 ++#define INCY $r8 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f23 ++#define a2 $f9 ++#define a3 $f10 ++#define a4 $f11 ++#define b1 $f12 ++#define b2 $f13 ++#define b3 $f14 ++#define b4 $f15 ++#define s1 $f22 ++#define s2 $f8 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++ LDINT INCY, 0(INCY) ++#endif ++ ++ MTC s1, $r0 ++ MTC s2, $r0 ++ slli.d INCX, INCX, BASE_SHIFT ++ li.d TEMP, SIZE ++ slli.d INCY, INCY, BASE_SHIFT ++ bge $r0, N, .L999 ++ srai.d I, N, 3 ++ bne INCX, TEMP, .L20 ++ bne INCY, TEMP, .L20 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ LD b1, Y, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD b2, Y, 1 * SIZE ++ LD a3, X, 2 * SIZE ++ LD b3, Y, 2 * SIZE ++ LD a4, X, 3 * SIZE ++ addi.d I, I, -1 ++ LD b4, Y, 3 * SIZE ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ LD a1, X, 4 * SIZE ++ LD b1, Y, 4 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a2, a2 ++ fcvt.d.s b2, b2 ++ fmadd.d s2, b2, a2, s2 ++#else ++ MADD s2, b2, a2, s2 ++#endif ++ LD a2, X, 5 * SIZE ++ LD b2, Y, 5 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a3, a3 ++ fcvt.d.s b3, b3 ++ fmadd.d s1, b3, a3, s1 ++#else ++ MADD s1, b3, a3, s1 ++#endif ++ LD a3, X, 6 * SIZE ++ LD b3, Y, 6 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a4, a4 ++ fcvt.d.s b4, b4 ++ fmadd.d s2, b4, a4, s2 ++#else ++ MADD s2, b4, a4, s2 ++#endif ++ LD a4, X, 7 * SIZE ++ LD b4, Y, 7 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ LD a1, X, 8 * SIZE ++ LD b1, Y, 8 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a2, a2 ++ fcvt.d.s b2, b2 ++ fmadd.d s2, b2, a2, s2 ++#else ++ MADD s2, b2, a2, s2 ++#endif ++ LD a2, X, 9 * SIZE ++ LD b2, Y, 9 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a3, a3 ++ fcvt.d.s b3, b3 ++ fmadd.d s1, b3, a3, s1 ++#else ++ MADD s1, b3, a3, s1 ++#endif ++ LD a3, X, 10 * SIZE ++ LD b3, Y, 10 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a4, a4 ++ fcvt.d.s b4, b4 ++ fmadd.d s2, b4, a4, s2 ++#else ++ MADD s2, b4, a4, s2 ++#endif ++ LD a4, X, 11 * SIZE ++ LD b4, Y, 11 * SIZE ++ addi.d I, I, -1 ++ addi.d X, X, 8 * SIZE ++addi.d Y, Y, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++.L13: ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ LD a1, X, 4 * SIZE ++ LD b1, Y, 4 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a2, a2 ++ fcvt.d.s b2, b2 ++ fmadd.d s2, b2, a2, s2 ++#else ++ MADD s2, b2, a2, s2 ++#endif ++ LD a2, X, 5 * SIZE ++ LD b2, Y, 5 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a3, a3 ++ fcvt.d.s b3, b3 ++ fmadd.d s1, b3, a3, s1 ++#else ++ MADD s1, b3, a3, s1 ++#endif ++ LD a3, X, 6 * SIZE ++ LD b3, Y, 6 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a4, a4 ++ fcvt.d.s b4, b4 ++ fmadd.d s2, b4, a4, s2 ++#else ++ MADD s2, b4, a4, s2 ++#endif 
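++/* Under DSDOT the single-precision elements are widened to double before the
++   multiply-add, so s1/s2 accumulate in double precision throughout.  Scalar
++   sketch of the split accumulation used in the unrolled loop above
++   (illustrative only; x, y and n stand for the X, Y and N arguments):
++     double s1 = 0.0, s2 = 0.0;
++     for (BLASLONG i = 0; i + 1 < n; i += 2) {
++       s1 += (double)x[i]     * (double)y[i];
++       s2 += (double)x[i + 1] * (double)y[i + 1];
++     }
++     return s1 + s2;
++*/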
++ LD a4, X, 7 * SIZE ++ LD b4, Y, 7 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ addi.d X, X, 8 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a2, a2 ++ fcvt.d.s b2, b2 ++ fmadd.d s2, b2, a2, s2 ++#else ++ MADD s2, b2, a2, s2 ++#endif ++ addi.d Y, Y, 8 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a3, a3 ++ fcvt.d.s b3, b3 ++ fmadd.d s1, b3, a3, s1 ++#else ++ MADD s1, b3, a3, s1 ++#endif ++#ifdef DSDOT ++ fcvt.d.s a4, a4 ++ fcvt.d.s b4, b4 ++ fmadd.d s2, b4, a4, s2 ++#else ++ MADD s2, b4, a4, s2 ++#endif ++ .align 3 ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++.L16: ++ LD a1, X, 0 * SIZE ++ LD b1, Y, 0 * SIZE ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ addi.d I, I, -1 ++ addi.d X, X, SIZE ++ addi.d Y, Y, SIZE ++ blt $r0, I, .L16 ++ b .L999 ++ .align 3 ++ ++.L20: ++#ifdef F_INTERFACE ++ bgez INCX, .L21 ++ addi.d TEMP, N, -1 ++ mult TEMP, INCX ++ mflo TEMP ++ dsub X, X, TEMP ++ .align 3 ++ ++.L21: ++ bgez INCY, .L22 ++ addi.d TEMP, N, -1 ++ mult TEMP, INCY ++ mflo TEMP ++ dsub Y, Y, TEMP ++ .align 3 ++ ++.L22: ++#endif ++ bge $r0, I, .L25 ++ .align 3 ++ ++.L23: ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s2, b1, a1, s2 ++#else ++ MADD s2, b1, a1, s2 ++#endif ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s2, b1, a1, s2 ++#else ++ MADD s2, b1, a1, s2 ++#endif ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s2, b1, a1, s2 ++#else ++ MADD s2, b1, a1, s2 ++#endif ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ addi.d I, I, -1 ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s2, b1, a1, s2 ++#else ++ MADD s2, b1, a1, s2 ++#endif ++ blt $r0, I, .L23 ++ .align 3 ++ ++.L25: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L26: ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ addi.d I, I, -1 ++#ifdef DSDOT ++ fcvt.d.s a1, a1 ++ fcvt.d.s b1, b1 ++ fmadd.d s1, b1, a1, s1 ++#else ++ MADD s1, b1, a1, s1 ++#endif ++ blt $r0, I, .L26 ++ .align 3 ++ ++.L999: ++#ifdef DSDOT ++ fadd.d $f0, s1, s2 ++#else ++ ADD $f0, s1, s2 ++#endif ++ move $r4, $r17 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/gemm_kernel.S b/kernel/loongarch64/gemm_kernel.S +new file mode 100644 +index 0000000..8926bf1 +--- /dev/null ++++ b/kernel/loongarch64/gemm_kernel.S +@@ 
-0,0 +1,1859 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define K $r6 ++#define A $r7 ++#define B $r8 ++#define C $r9 ++#define LDC $r10 ++#define AO $r12 ++#define BO $r13 ++#define I $r17 ++#define J $r18 ++#define L $r30 ++#define PREFETCHSIZE (4 * 10) ++#define CO1 $r14 ++#define CO2 $r15 ++#define CO3 $r23 ++#define CO4 $r24 ++#define CO5 $r25 ++#define CO6 $r26 ++#define CO7 $r27 ++#define CO8 $r28 ++#define BB $r29 ++ ++#if defined(TRMMKERNEL) ++#define OFFSET $r11 ++#define KK $r20 ++#define TEMP $r16 ++#endif ++ ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f27 ++#define a4 $f28 ++#define b1 $f23 ++#define b2 $f9 ++#define b3 $f10 ++#define b4 $f11 ++#define b5 $f12 ++#define b6 $f13 ++#define b7 $f14 ++#define b8 $f15 ++#define a5 b8 ++#define c11 $f16 ++#define c12 $f17 ++#define c21 $f3 ++#define c22 $f1 ++#define c31 $f2 ++#define c32 $f4 ++#define c41 $f5 ++#define c42 $f6 ++#define c51 $f7 ++#define c52 $f18 ++#define c61 $f19 ++#define c62 $f20 ++#define c71 $f21 ++#define c72 $f24 ++#define c81 $f25 ++#define c82 $f26 ++#define ALPHA $f0 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -160 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ SDARG $r28, $sp, 40 ++ SDARG $r29, $sp, 48 ++ SDARG $r30, $sp, 96 ++ fst.d $f24, $sp, 56 ++ fst.d $f25, $sp, 64 ++ fst.d $f26, $sp, 72 ++ fst.d $f27, $sp, 80 ++ fst.d $f28, $sp, 88 ++#if defined(TRMMKERNEL) ++ SDARG $r20, $sp, 104 ++ SDARG $r16, $sp, 112 ++#endif ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 120 ++ fst.d $f19, $sp, 128 ++ fst.d $f20, $sp, 136 ++ fst.d $f21, $sp, 144 ++#endif ++ slli.d LDC, LDC, BASE_SHIFT ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ sub.d KK, $r0, OFFSET ++#endif ++ srai.d J, N, 3 ++nop ++ bge $r0, J, .L30 ++.L10: ++ move CO1, C ++ MTC c11, $r0 
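++ /* One pass of this J loop covers eight columns of C (CO1..CO8); the inner
++    I loop (.L11) accumulates a 2x8 tile in c11..c82, and the .L18 tail
++    applies C += alpha * tile (C = alpha * tile in the TRMM build).  Scalar
++    sketch of one tile, with illustrative names (a/b are the packed A/B
++    panels for this tile, ldc counted in elements):
++      for (BLASLONG l = 0; l < k; l++)
++        for (int r = 0; r < 2; r++)
++          for (int c = 0; c < 8; c++)
++            tile[r][c] += a[2 * l + r] * b[8 * l + c];
++      // then C[(i + r) + (j + c) * ldc] += alpha * tile[r][c]
++ */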
++ add.d CO2, C, LDC ++ move AO, A ++ add.d CO3, CO2, LDC ++ addi.d J, J, -1 ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ add.d CO5, CO4, LDC ++ MOV c31, c11 ++ add.d CO6, CO5, LDC ++ MOV c41, c11 ++ add.d CO7, CO6, LDC ++ MOV c51, c11 ++ add.d CO8, CO7, LDC ++ srai.d I, M, 1 ++ add.d C, CO8, LDC ++ slli.d BB, K, 2 + BASE_SHIFT ++ add.d BB, B, BB ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move KK, OFFSET ++#endif ++MOV c61, c11 ++ bge $r0, I, .L20 ++.L11: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ MOV c32, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c42, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c52, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c62, c11 ++ LD b6, BO, 8 * SIZE ++ MOV c72, c11 ++ LD b7, BO, 12 * SIZE ++ MOV c82, c11 ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 2 ++#else ++ addi.d TEMP, KK, 8 ++#endif ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L15 ++#else ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, B, 0 * SIZE ++ MOV c81, c11 ++ preld 1, CO1, 3 * SIZE ++ preld 1, CO2, 3 * SIZE ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, K, 2 ++ MOV c32, c11 ++ LD b3, B, 2 * SIZE ++ MOV c42, c11 ++ LD b4, B, 3 * SIZE ++ MOV c52, c11 ++ LD b5, B, 4 * SIZE ++ MOV c62, c11 ++ LD b6, B, 8 * SIZE ++ MOV c72, c11 ++ LD b7, B, 12 * SIZE ++ MOV c82, c11 ++move BO, B ++ bge $r0, L, .L15 ++#endif ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ bge $r0, L, .L13 ++ preld 1, CO3, 2 * SIZE ++ .align 3 ++.L12: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ LD a4, AO, 2 * SIZE ++ MADD c61, b2, a1, c61 ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ LD a4, AO, 6 * SIZE ++ MADD c61, b2, a3, c61 ++ MADD c71, b3, a3, c71 
++ MADD c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ addi.d L, L, -1 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ blt $r0, L, .L12 ++ .align 3 ++ ++.L13: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ MADD c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ preld 1, CO4, 3 * SIZE ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ preld 1, CO5, 3 * SIZE ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ preld 1, CO6, 3 * SIZE ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ preld 1, CO7, 3 * SIZE ++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ MADD c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD c71, b3, a3, c71 ++ MADD c81, b4, a3, c81 ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * 
SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ .align 3 ++ ++.L15: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ preld 1, CO8, 3 * SIZE ++ bge $r0, L, .L18 ++ .align 3 ++.L16: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ addi.d L, L, -1 ++ MADD c61, b2, a1, c61 ++ addi.d AO, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ addi.d BO, BO, 8 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 4 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L16 ++.L18: ++#ifndef TRMMKERNEL ++ LD $f22, CO1, 0 * SIZE ++ addi.d CO3,CO3, 2 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ addi.d CO1,CO1, 2 * SIZE ++ LD $f23, CO2, 0 * SIZE ++ addi.d CO4,CO4, 2 * SIZE ++ LD $f9, CO2, 1 * SIZE ++ addi.d CO2,CO2, 2 * SIZE ++ LD $f10, CO3, -2 * SIZE ++ addi.d CO5,CO5, 2 * SIZE ++ LD $f11, CO3, -1 * SIZE ++ addi.d CO6,CO6, 2 * SIZE ++ LD $f12, CO4, -2 * SIZE ++ addi.d CO7,CO7, 2 * SIZE ++ LD $f13, CO4, -1 * SIZE ++ addi.d I, I, -1 ++ MADD c11, c11, ALPHA, $f22 ++ LD $f22, CO5, -2 * SIZE ++ MADD c12, c12, ALPHA, $f8 ++ LD $f8, CO5, -1 * SIZE ++ MADD c21, c21, ALPHA, $f23 ++ LD $f23, CO6, -2 * SIZE ++ MADD c22, c22, ALPHA, $f9 ++ LD $f9, CO6, -1 * SIZE ++ MADD c31, c31, ALPHA, $f10 ++ LD $f10, CO7, -2 * SIZE ++ MADD c32, c32, ALPHA, $f11 ++ LD $f11, CO7, -1 * SIZE ++ MADD c41, c41, ALPHA, $f12 ++ LD $f12, CO8, 0 * SIZE ++ MADD c42, c42, ALPHA, $f13 ++ LD $f13, CO8, 1 * SIZE ++ preld 0, BB, 0 * SIZE ++ preld 0, BB, 8 * SIZE ++ ST c11, CO1, -2 * SIZE ++ MTC c11, $r0 ++ ST c12, CO1, -1 * SIZE ++ addi.d CO8,CO8, 2 * SIZE ++ ST c21, CO2, -2 * SIZE ++ MOV c21, c11 ++ ST c22, CO2, -1 * SIZE ++ addi.d BB, BB, 16 * SIZE ++ MADD c51, c51, ALPHA, $f22 ++ ST c31, CO3, -2 * SIZE ++ MADD c52, c52, ALPHA, $f8 ++ ST c32, CO3, -1 * SIZE ++ MADD c61, c61, ALPHA, $f23 ++ ST c41, CO4, -2 * SIZE ++ MADD c62, c62, ALPHA, $f9 ++ ST c42, CO4, -1 * SIZE ++ MADD c71, c71, ALPHA, $f10 ++ ST c51, CO5, -2 * SIZE ++ MADD c72, c72, ALPHA, $f11 ++ ST c52, CO5, -1 * SIZE ++ MADD c81, c81, ALPHA, $f12 ++ ST c61, CO6, -2 * SIZE ++ MADD c82, c82, ALPHA, $f13 ++ ST c62, CO6, -1 * SIZE ++ ST c71, CO7, -2 * SIZE ++ MOV c31, c11 ++ ST c72, CO7, -1 * SIZE ++ MOV c41, c11 ++ ST c81, CO8, -2 * SIZE ++ MOV c51, c11 ++ ST c82, CO8, -1 * SIZE ++MOV c61, c11 ++ blt $r0, I, .L11 ++#else ++ addi.d CO4,CO4, 2 * SIZE ++ addi.d CO5,CO5, 2 * SIZE ++ addi.d CO6,CO6, 2 * SIZE ++ addi.d CO7,CO7, 2 * SIZE ++ preld 0, BB, 0 * SIZE ++ preld 0, BB, 8 * SIZE ++ MUL c11, ALPHA, c11 ++ addi.d CO1,CO1, 2 * SIZE ++ MUL c12, ALPHA, c12 ++ MTC a1, $r0 ++ MUL c21, ALPHA, c21 ++ addi.d CO2,CO2, 2 * SIZE ++ MUL c22, ALPHA, c22 ++ addi.d CO3,CO3, 2 * SIZE ++ ST c11, CO1, -2 * SIZE ++ MUL c31, ALPHA, c31 ++ ST c12, CO1, -1 * SIZE ++ MUL c32, ALPHA, c32 ++ ST c21, CO2, -2 * SIZE ++ MUL c41, ALPHA, c41 ++ ST c22, CO2, -1 * SIZE ++ MUL c42, ALPHA, c42 ++ ST c31, CO3, -2 * SIZE ++ MUL c51, ALPHA, c51 ++ ST c32, CO3, -1 * SIZE ++ MUL c52, ALPHA, c52 ++ ST c41, CO4, -2 * SIZE ++ MUL c61, ALPHA, c61 ++ ST c42, CO4, -1 * SIZE ++ MUL c62, ALPHA, c62 ++ ST c51, CO5, -2 * SIZE ++ MUL 
c71, ALPHA, c71 ++ ST c52, CO5, -1 * SIZE ++ MUL c72, ALPHA, c72 ++ ST c61, CO6, -2 * SIZE ++ MUL c81, ALPHA, c81 ++ ST c62, CO6, -1 * SIZE ++ MUL c82, ALPHA, c82 ++ ST c71, CO7, -2 * SIZE ++ MOV c11, a1 ++ ST c72, CO7, -1 * SIZE ++ MOV c21, a1 ++ addi.d CO8,CO8, 2 * SIZE ++ addi.d BB, BB, 16 * SIZE ++ ST c81, CO8, -2 * SIZE ++ MOV c31, a1 ++ ST c82, CO8, -1 * SIZE ++ MOV c41, a1 ++ addi.d I, I, -1 ++ MOV c51, a1 ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -2 ++#else ++ addi.d TEMP, TEMP, -8 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 2 ++#endif ++MOV c61, a1 ++ blt $r0, I, .L11 ++#endif ++ .align 3 ++ ++.L20: ++ andi I, M, 1 ++ MOV c61, c11 ++MOV c71, c11 ++ bge $r0, I, .L29 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 1 ++#else ++ addi.d TEMP, KK, 8 ++#endif ++ srai.d L, TEMP, 2 ++MOV c81, c11 ++ bge $r0, L, .L25 ++#else ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, K, 2 ++ MOV c81, c11 ++move BO, B ++ bge $r0, L, .L25 ++#endif ++ .align 3 ++.L22: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 20 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 9 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 10 * SIZE ++ MADD c81, b4, a1, c81 ++ LD b4, BO, 11 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a2, c51 ++ LD b7, BO, 28 * SIZE ++ MADD c61, b2, a2, c61 ++ LD b2, BO, 17 * SIZE ++ MADD c71, b3, a2, c71 ++ LD b3, BO, 18 * SIZE ++ MADD c81, b4, a2, c81 ++ LD b4, BO, 19 * SIZE ++ LD a2, AO, 5 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 32 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 21 * SIZE ++ MADD c31, b3, a3, c31 ++ LD b3, BO, 22 * SIZE ++ MADD c41, b4, a3, c41 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ LD b5, BO, 36 * SIZE ++ MADD c61, b2, a3, c61 ++ LD b2, BO, 25 * SIZE ++ MADD c71, b3, a3, c71 ++ LD b3, BO, 26 * SIZE ++ MADD c81, b4, a3, c81 ++ LD b4, BO, 27 * SIZE ++ LD a3, AO, 2 * SIZE ++ addi.d BO, BO, 32 * SIZE ++ MADD c11, b6, a4, c11 ++ LD b6, BO, 8 * SIZE ++ MADD c21, b2, a4, c21 ++ LD b2, BO, -3 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, -2 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, -1 * SIZE ++ MADD c51, b7, a4, c51 ++ LD b7, BO, 12 * SIZE 
++ MADD c61, b2, a4, c61 ++ LD b2, BO, 1 * SIZE ++ MADD c71, b3, a4, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a4, c81 ++ LD b4, BO, 3 * SIZE ++ LD a4, AO, 3 * SIZE ++ blt $r0, L, .L22 ++ .align 3 ++ ++.L25: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L28 ++ .align 3 ++.L26: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ MOV a2, a2 ++ addi.d AO, AO, 1 * SIZE ++ addi.d BO, BO, 8 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 4 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 1 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L26 ++.L28: ++#ifndef TRMMKERNEL ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO2, 0 * SIZE ++ LD $f23, CO3, 0 * SIZE ++ LD $f9, CO4, 0 * SIZE ++ MADD c11, c11, ALPHA, $f22 ++ LD $f10, CO5, 0 * SIZE ++ MADD c21, c21, ALPHA, $f8 ++ LD $f11, CO6, 0 * SIZE ++ MADD c31, c31, ALPHA, $f23 ++ LD $f12, CO7, 0 * SIZE ++ MADD c41, c41, ALPHA, $f9 ++ LD $f13, CO8, 0 * SIZE ++ MADD c51, c51, ALPHA, $f10 ++ ST c11, CO1, 0 * SIZE ++ MADD c61, c61, ALPHA, $f11 ++ ST c21, CO2, 0 * SIZE ++ MADD c71, c71, ALPHA, $f12 ++ ST c31, CO3, 0 * SIZE ++ MADD c81, c81, ALPHA, $f13 ++ ST c41, CO4, 0 * SIZE ++ ST c51, CO5, 0 * SIZE ++ ST c61, CO6, 0 * SIZE ++ ST c71, CO7, 0 * SIZE ++ ST c81, CO8, 0 * SIZE ++#else ++ MUL c11, ALPHA, c11 ++ MUL c21, ALPHA, c21 ++ MUL c31, ALPHA, c31 ++ MUL c41, ALPHA, c41 ++ ST c11, CO1, 0 * SIZE ++ MUL c51, ALPHA, c51 ++ ST c21, CO2, 0 * SIZE ++ MUL c61, ALPHA, c61 ++ ST c31, CO3, 0 * SIZE ++ MUL c71, ALPHA, c71 ++ ST c41, CO4, 0 * SIZE ++ MUL c81, ALPHA, c81 ++ ST c51, CO5, 0 * SIZE ++ ST c61, CO6, 0 * SIZE ++ ST c71, CO7, 0 * SIZE ++ ST c81, CO8, 0 * SIZE ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -1 ++#else ++ addi.d TEMP, TEMP, -8 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 1 ++#endif ++#endif ++ .align 3 ++ ++.L29: ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addi.d KK, KK, 8 ++#endif ++move B, BO ++ blt $r0, J, .L10 ++ .align 3 ++ ++.L30: ++ andi J, N, 4 ++move AO, A ++ bge $r0, J, .L50 ++ move CO1, C ++ MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ add.d C, CO4, LDC ++ MOV c31, c11 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move KK, OFFSET ++#endif ++ srai.d I, M, 1 ++MOV c41, c11 ++ bge $r0, I, .L40 ++.L31: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ LD a3, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c32, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c42, c11 ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 2 ++#else ++ addi.d TEMP, KK, 4 ++#endif ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L35 ++#else ++ LD a1, AO, 0 * SIZE ++ LD a3, AO, 4 * SIZE ++ LD b1, 
B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ MOV c32, c11 ++ LD b4, B, 3 * SIZE ++ MOV c42, c11 ++ LD b5, B, 4 * SIZE ++ srai.d L, K, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L35 ++#endif ++ .align 3 ++.L32: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c11, b5, a1, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 8 * SIZE ++ MADD c12, b5, a2, c12 ++ LD b5, BO, 20 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 9 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 10 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 6 * SIZE ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c11, b7, a3, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a3, c21 ++ addi.d AO, AO, 8 * SIZE ++ MADD c31, b3, a3, c31 ++ addi.d BO, BO, 16 * SIZE ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 4 * SIZE ++ MADD c12, b7, a2, c12 ++ LD b7, BO, 12 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 1 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 2 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L32 ++ .align 3 ++ ++.L35: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L38 ++ .align 3 ++.L36: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ addi.d AO, AO, 2 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 0 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 4 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L36 ++.L38: ++#ifndef TRMMKERNEL ++ LD $f22, CO1, 0 * SIZE ++ addi.d CO3,CO3, 2 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ addi.d CO1,CO1, 2 * SIZE ++ LD $f23, CO2, 0 * SIZE ++ addi.d CO4,CO4, 2 * SIZE ++ LD $f9, CO2, 1 * SIZE ++ addi.d CO2,CO2, 2 * SIZE ++ LD $f10, CO3, -2 * SIZE ++ MADD c11, c11, ALPHA, $f22 ++ LD $f11, CO3, -1 * SIZE ++ MADD c12, c12, ALPHA, $f8 ++ LD $f12, CO4, -2 * SIZE ++ MADD c21, c21, ALPHA, $f23 ++ LD $f13, CO4, -1 * SIZE ++ MADD c22, c22, ALPHA, $f9 ++ MADD c31, c31, ALPHA, $f10 ++ ST c11, CO1, -2 * SIZE ++ MADD c32, c32, ALPHA, $f11 ++ ST c12, CO1, -1 * SIZE ++ MADD c41, c41, ALPHA, $f12 ++ ST c21, CO2, -2 * SIZE ++ MADD c42, c42, ALPHA, $f13 ++ ST c22, CO2, -1 * SIZE ++ ST c31, CO3, -2 * SIZE ++ MTC c11, $r0 ++ ST c32, CO3, -1 * SIZE ++ addi.d I, I, -1 ++ ST c41, CO4, -2 * SIZE ++ MOV c21, c11 ++ ST c42, CO4, -1 * SIZE ++ MOV c31, c11 ++#else ++ MUL c11, ALPHA, c11 ++ addi.d CO3,CO3, 2 * SIZE ++ MUL c12, ALPHA, c12 ++ addi.d CO1,CO1, 2 * SIZE ++ MUL c21, ALPHA, c21 ++ addi.d CO4,CO4, 2 * SIZE ++ MUL c22, ALPHA, c22 ++ addi.d CO2,CO2, 2 * SIZE ++ ST c11, CO1, -2 * SIZE ++ MUL c31, ALPHA, c31 ++ ST c12, CO1, -1 * SIZE ++ MUL c32, ALPHA, c32 ++ ST c21, CO2, -2 * SIZE ++ MUL c41, ALPHA, c41 ++ ST c22, CO2, -1 * 
SIZE ++ MUL c42, ALPHA, c42 ++ ST c31, CO3, -2 * SIZE ++ MTC c11, $r0 ++ ST c32, CO3, -1 * SIZE ++ addi.d I, I, -1 ++ ST c41, CO4, -2 * SIZE ++ MOV c21, c11 ++ ST c42, CO4, -1 * SIZE ++ MOV c31, c11 ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -2 ++#else ++ addi.d TEMP, TEMP, -4 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 2 ++#endif ++#endif ++MOV c41, c11 ++ blt $r0, I, .L31 ++ .align 3 ++ ++.L40: ++ andi I, M, 1 ++MOV c61, c11 ++ bge $r0, I, .L49 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c81, c11 ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 1 ++#else ++ addi.d TEMP, KK, 4 ++#endif ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L45 ++#else ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c81, c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, K, 2 ++move BO, B ++ bge $r0, L, .L45 ++#endif ++ .align 3 ++.L42: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b5, a2, c11 ++ LD b5, BO, 20 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 11 * SIZE ++ LD a2, AO, 2 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ LD a2, AO, -1 * SIZE ++ addi.d BO, BO, 16 * SIZE ++ MADD c11, b7, a2, c11 ++ LD b7, BO, 12 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 1 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 2 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 3 * SIZE ++ LD a2, AO, 1 * SIZE ++ blt $r0, L, .L42 ++ .align 3 ++ ++.L45: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L48 ++ .align 3 ++.L46: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 1 * SIZE ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++ MOV a2, a2 ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L46 ++.L48: ++#ifndef TRMMKERNEL ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO2, 0 * SIZE ++ LD $f23, CO3, 0 * SIZE ++ LD $f9, CO4, 0 * SIZE ++ MADD c11, c11, ALPHA, $f22 ++ MADD c21, c21, ALPHA, $f8 ++ MADD c31, c31, ALPHA, $f23 ++ MADD c41, c41, ALPHA, $f9 ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c41, CO4, 0 * SIZE ++#else ++ MUL c11, ALPHA, c11 ++ MUL c21, ALPHA, c21 ++ MUL c31, ALPHA, c31 ++ 
MUL c41, ALPHA, c41 ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c41, CO4, 0 * SIZE ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -1 ++#else ++ addi.d TEMP, TEMP, -4 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 1 ++#endif ++#endif ++ .align 3 ++ ++.L49: ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addi.d KK, KK, 4 ++#endif ++ move B, BO ++ .align 3 ++ ++.L50: ++ andi J, N, 2 ++move AO, A ++ bge $r0, J, .L70 ++ move CO1, C ++ add.d CO2, C, LDC ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move KK, OFFSET ++#endif ++ srai.d I, M, 1 ++add.d C, CO2, LDC ++ bge $r0, I, .L60 ++.L51: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 1 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 2 ++#else ++ addi.d TEMP, KK, 2 ++#endif ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L55 ++#else ++ LD a1, AO, 0 * SIZE ++ MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, K, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L55 ++#endif ++ .align 3 ++.L52: ++ MADD c11, b1, a1, c11 ++ LD a3, AO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b4, BO, 3 * SIZE ++ MADD c12, b1, a2, c12 ++ LD a4, AO, 3 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b1, BO, 8 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a1, AO, 8 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 5 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 6 * SIZE ++ MADD c11, b5, a5, c11 ++ LD a3, AO, 6 * SIZE ++ MADD c21, b2, a5, c21 ++ LD b4, BO, 7 * SIZE ++ MADD c12, b5, a2, c12 ++ LD a4, AO, 7 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b5, BO, 12 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a5, AO, 12 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 9 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 10 * SIZE ++ addi.d AO, AO, 8 * SIZE ++ addi.d L, L, -1 ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L52 ++ .align 3 ++ ++.L55: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L58 ++ .align 3 ++.L56: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 3 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L56 ++.L58: ++#ifndef TRMMKERNEL ++ LD $f22, CO1, 0 * SIZE ++ addi.d I, I, -1 ++ LD $f8, CO1, 1 * SIZE ++ addi.d CO1,CO1, 2 * SIZE ++ LD $f23, CO2, 0 * SIZE ++ LD $f9, CO2, 1 * SIZE ++ addi.d CO2,CO2, 2 * SIZE ++ MADD c11, c11, ALPHA, $f22 ++ MADD c12, c12, ALPHA, $f8 ++ MADD c21, c21, ALPHA, $f23 ++ MADD c22, c22, ALPHA, $f9 ++ ST c11, 
CO1, -2 * SIZE ++ ST c12, CO1, -1 * SIZE ++ ST c21, CO2, -2 * SIZE ++ ST c22, CO2, -1 * SIZE ++ blt $r0, I, .L51 ++#else ++ addi.d I, I, -1 ++ addi.d CO1,CO1, 2 * SIZE ++ addi.d CO2,CO2, 2 * SIZE ++ MUL c11, ALPHA, c11 ++ MUL c12, ALPHA, c12 ++ MUL c21, ALPHA, c21 ++ MUL c22, ALPHA, c22 ++ ST c11, CO1, -2 * SIZE ++ ST c12, CO1, -1 * SIZE ++ ST c21, CO2, -2 * SIZE ++ ST c22, CO2, -1 * SIZE ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -2 ++#else ++ addi.d TEMP, TEMP, -2 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 2 ++#endif ++ blt $r0, I, .L51 ++#endif ++ .align 3 ++ ++.L60: ++ andi I, M, 1 ++ bge $r0, I, .L69 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 1 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ MOV c31, c11 ++ LD a4, AO, 3 * SIZE ++ MOV c41, c11 ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 1 ++#else ++ addi.d TEMP, KK, 2 ++#endif ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L65 ++#else ++ srai.d L, K, 2 ++ LD a1, AO, 0 * SIZE ++ MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ MOV c31, c11 ++ LD a4, AO, 3 * SIZE ++ MOV c41, c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L65 ++#endif ++ .align 3 ++.L62: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, 11 * SIZE ++ LD a3, AO, 6 * SIZE ++ LD a4, AO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L62 ++ .align 3 ++ ++.L65: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L68 ++ .align 3 ++.L66: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 3 * SIZE ++ LD a1, AO, 1 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L66 ++.L68: ++#ifndef TRMMKERNEL ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO2, 0 * SIZE ++ ADD c11, c11, c31 ++ ADD c21, c21, c41 ++ MADD c11, c11, ALPHA, $f22 ++ MADD c21, c21, ALPHA, $f8 ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++#else ++ ADD c11, c11, c31 ++ ADD c21, c21, c41 ++ MUL c11, ALPHA, c11 ++ MUL c21, ALPHA, c21 ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -1 ++#else ++ addi.d TEMP, TEMP, -2 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, 
TEMP, 1 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 1 ++#endif ++#endif ++ .align 3 ++ ++.L69: ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addi.d KK, KK, 2 ++#endif ++ move B, BO ++ .align 3 ++ ++.L70: ++ andi J, N, 1 ++move AO, A ++ bge $r0, J, .L999 ++ move CO1, C ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move KK, OFFSET ++#endif ++ srai.d I, M, 1 ++add.d C, CO1, LDC ++ bge $r0, I, .L80 ++.L71: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 0 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 2 ++#else ++ addi.d TEMP, KK, 1 ++#endif ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L75 ++#else ++ LD a1, AO, 0 * SIZE ++ MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, K, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L75 ++#endif ++ .align 3 ++.L72: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 2 * SIZE ++ LD a2, AO, 3 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 6 * SIZE ++ LD a2, AO, 7 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 8 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L72 ++ .align 3 ++ ++.L75: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L78 ++ .align 3 ++.L76: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L76 ++.L78: ++#ifndef TRMMKERNEL ++ LD $f22, CO1, 0 * SIZE ++ addi.d I, I, -1 ++ LD $f8, CO1, 1 * SIZE ++ addi.d CO1,CO1, 2 * SIZE ++ ADD c11, c11, c21 ++ ADD c12, c12, c22 ++ MADD c11, c11, ALPHA, $f22 ++ MADD c12, c12, ALPHA, $f8 ++ ST c11, CO1, -2 * SIZE ++ ST c12, CO1, -1 * SIZE ++ blt $r0, I, .L71 ++#else ++ ADD c11, c11, c21 ++ addi.d I, I, -1 ++ ADD c12, c12, c22 ++ addi.d CO1,CO1, 2 * SIZE ++ MUL c11, ALPHA, c11 ++ MUL c12, ALPHA, c12 ++ ST c11, CO1, -2 * SIZE ++ ST c12, CO1, -1 * SIZE ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -2 ++#else ++ addi.d TEMP, TEMP, -1 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 2 ++#endif ++ blt $r0, I, .L71 ++#endif ++ .align 3 ++ ++.L80: ++ andi I, M, 1 ++ bge $r0, I, .L89 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, 0 
+ BASE_SHIFT ++ slli.d TEMP, KK, 0 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 1 ++#else ++ addi.d TEMP, KK, 1 ++#endif ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L85 ++#else ++ LD a1, AO, 0 * SIZE ++ MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, K, 2 ++move BO, B ++ bge $r0, L, .L85 ++#endif ++ .align 3 ++.L82: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 1 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c21, b1, a1, c21 ++ LD a1, AO, 2 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 3 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c21, b1, a1, c21 ++ addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L82 ++ .align 3 ++ ++.L85: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L88 ++ .align 3 ++.L86: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L86 ++.L88: ++#ifndef TRMMKERNEL ++ LD $f22, CO1, 0 * SIZE ++ ADD c11, c11, c21 ++ MADD c11, c11, ALPHA, $f22 ++ ST c11, CO1, 0 * SIZE ++#else ++ ADD c11, c11, c21 ++ MUL c11, ALPHA, c11 ++ ST c11, CO1, 0 * SIZE ++#endif ++ .align 3 ++ ++.L89: ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addi.d KK, KK, 1 ++#endif ++ move B, BO ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 16 ++ LDARG $r26, $sp, 24 ++ LDARG $r27, $sp, 32 ++ LDARG $r28, $sp, 40 ++ LDARG $r29, $sp, 48 ++ LDARG $r30, $sp, 96 ++ fld.d $f24, $sp, 56 ++ fld.d $f25, $sp, 64 ++ fld.d $f26, $sp, 72 ++ fld.d $f27, $sp, 80 ++ fld.d $f28, $sp, 88 ++#if defined(TRMMKERNEL) ++ LDARG $r20, $sp, 104 ++ LDARG $r16, $sp, 112 ++#endif ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 120 ++ fld.d $f19, $sp, 128 ++ fld.d $f20, $sp, 136 ++ fld.d $f21, $sp, 144 ++#endif ++ addi.d $sp, $sp, 160 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S +new file mode 100644 +index 0000000..9ab43ae +--- /dev/null ++++ b/kernel/loongarch64/gemv_n.S +@@ -0,0 +1,531 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. 
Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++/* Unused param dummy1 */ ++#define M $r4 ++#define N $r5 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INCX $r10 ++#define Y $r11 ++#define INCY $r6 ++#define BUFFER $r16 ++#define YORIG $r18 ++#define XX $r12 ++#define YY $r13 ++#define I $r14 ++#define J $r15 ++#define AO1 $r23 ++#define AO2 $r24 ++#define ALPHA $f0 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f23 ++#define a4 $f9 ++#define a5 $f10 ++#define a6 $f11 ++#define a7 $f12 ++#define a8 $f13 ++#define x1 $f14 ++#define x2 $f15 ++#define y1 $f16 ++#define y2 $f17 ++#define y3 $f3 ++#define y4 $f1 ++#define y5 $f2 ++#define y6 $f4 ++#define y7 $f5 ++#define y8 $f6 ++#define t1 $f7 ++#define t2 $f18 ++#define t3 $f19 ++#define t4 $f20 ++ ++ PROLOGUE ++ ++ LDARG INCY, $sp, 0 ++ LDARG BUFFER, $sp, 8 ++#ifdef __64BIT__ ++ addi.d $sp, $sp, -16 ++#else ++ addi.d $sp, $sp, -48 ++#endif ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ slli.d LDA, LDA, BASE_SHIFT ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 16 ++ fst.d $f19, $sp, 24 ++ fst.d $f20, $sp, 32 ++#endif ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, M, .L999 ++ slli.d INCY, INCY, BASE_SHIFT ++ bge $r0, N, .L999 ++ li.d I, SIZE ++ move YORIG, Y ++ beq INCY, I, .L10 ++ srai.d I, M, 2 ++ move YORIG, BUFFER ++ move XX, Y ++ move YY, BUFFER ++ bge $r0, I, .L05 ++ .align 3 ++ ++.L02: ++ LD a1, XX, 0 * SIZE ++ add.d XX, XX, INCY ++ LD a2, XX, 0 * SIZE ++ add.d XX, XX, INCY ++ LD a3, XX, 0 * SIZE ++ add.d XX, XX, INCY ++ LD a4, XX, 0 * SIZE ++ add.d XX, XX, INCY ++ ST a1, YY, 0 * SIZE ++ ST a2, YY, 1 * SIZE ++ ST a3, YY, 2 * SIZE ++ ST a4, YY, 3 * SIZE ++ addi.d I, I, -1 ++ addi.d YY, YY, 4 * SIZE ++ blt $r0, I, .L02 ++ .align 3 ++ ++.L05: ++ andi I, M, 3 ++ bge $r0, I, .L10 ++ .align 3 ++ ++.L06: ++ LD a1, XX, 0 * SIZE ++ add.d XX, XX, INCY ++ ST a1, YY, 0 * SIZE ++ addi.d I, I, -1 ++ addi.d YY, YY, 1 * SIZE ++ blt $r0, I, .L06 ++ .align 3 ++ ++.L10: ++ srai.d J, N, 1 ++ bge $r0, J, .L20 ++ .align 3 ++ ++.L11: ++ LD x1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD x2, X, 0 * SIZE ++ add.d X, X, INCX ++ move AO1, A ++ add.d AO2, A, LDA ++ add.d A, AO2, LDA ++ move YY, YORIG ++ MUL x1, ALPHA, x1 ++ srai.d I, M, 3 ++ MUL x2, ALPHA, x2 ++ bge $r0, I, .L15 ++ LD a1, AO1, 0 * SIZE ++ LD y1, YY, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a3, AO1, 2 * SIZE ++ LD y3, YY, 2 * SIZE ++ LD a4, AO1, 3 * SIZE ++ LD y4, YY, 3 * SIZE ++ LD a5, AO2, 0 * SIZE ++ LD y5, YY, 4 * SIZE ++ LD a6, AO2, 1 * 
SIZE ++ LD y6, YY, 5 * SIZE ++ LD a7, AO2, 2 * SIZE ++ LD y7, YY, 6 * SIZE ++ LD a8, AO2, 3 * SIZE ++ addi.d I, I, -1 ++ LD y8, YY, 7 * SIZE ++ bge $r0, I, .L13 ++ .align 3 ++.L12: ++ MADD t1, a1, x1, y1 ++ LD a1, AO1, 4 * SIZE ++ MADD t2, a2, x1, y2 ++ LD a2, AO1, 5 * SIZE ++ LD y1, YY, 8 * SIZE ++ LD y2, YY, 9 * SIZE ++ MADD t3, a3, x1, y3 ++ LD a3, AO1, 6 * SIZE ++ MADD t4, a4, x1, y4 ++ LD a4, AO1, 7 * SIZE ++ LD y3, YY, 10 * SIZE ++ LD y4, YY, 11 * SIZE ++ MADD t1, a5, x2, t1 ++ LD a5, AO2, 4 * SIZE ++ MADD t2, a6, x2, t2 ++ LD a6, AO2, 5 * SIZE ++ MADD t3, a7, x2, t3 ++ LD a7, AO2, 6 * SIZE ++ MADD t4, a8, x2, t4 ++ LD a8, AO2, 7 * SIZE ++ ST t1, YY, 0 * SIZE ++ ST t2, YY, 1 * SIZE ++ ST t3, YY, 2 * SIZE ++ ST t4, YY, 3 * SIZE ++ MADD t1, a1, x1, y5 ++ LD a1, AO1, 8 * SIZE ++ MADD t2, a2, x1, y6 ++ LD a2, AO1, 9 * SIZE ++ LD y5, YY, 12 * SIZE ++ LD y6, YY, 13 * SIZE ++ MADD t3, a3, x1, y7 ++ LD a3, AO1, 10 * SIZE ++ MADD t4, a4, x1, y8 ++ LD a4, AO1, 11 * SIZE ++ LD y7, YY, 14 * SIZE ++ LD y8, YY, 15 * SIZE ++ MADD t1, a5, x2, t1 ++ LD a5, AO2, 8 * SIZE ++ MADD t2, a6, x2, t2 ++ LD a6, AO2, 9 * SIZE ++ MADD t3, a7, x2, t3 ++ LD a7, AO2, 10 * SIZE ++ MADD t4, a8, x2, t4 ++ LD a8, AO2, 11 * SIZE ++ ST t1, YY, 4 * SIZE ++ ST t2, YY, 5 * SIZE ++ ST t3, YY, 6 * SIZE ++ ST t4, YY, 7 * SIZE ++ addi.d I, I, -1 ++ addi.d YY, YY, 8 * SIZE ++ addi.d AO1, AO1, 8 * SIZE ++ addi.d AO2, AO2, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ MADD t1, a1, x1, y1 ++ LD a1, AO1, 4 * SIZE ++ MADD t2, a2, x1, y2 ++ LD a2, AO1, 5 * SIZE ++ MADD t3, a3, x1, y3 ++ LD a3, AO1, 6 * SIZE ++ MADD t4, a4, x1, y4 ++ LD a4, AO1, 7 * SIZE ++ MADD t1, a5, x2, t1 ++ LD a5, AO2, 4 * SIZE ++ MADD t2, a6, x2, t2 ++ LD a6, AO2, 5 * SIZE ++ MADD t3, a7, x2, t3 ++ LD a7, AO2, 6 * SIZE ++ MADD t4, a8, x2, t4 ++ LD a8, AO2, 7 * SIZE ++ ST t1, YY, 0 * SIZE ++ MADD t1, a1, x1, y5 ++ ST t2, YY, 1 * SIZE ++ MADD t2, a2, x1, y6 ++ ST t3, YY, 2 * SIZE ++ MADD t3, a3, x1, y7 ++ ST t4, YY, 3 * SIZE ++ MADD t4, a4, x1, y8 ++ MADD t1, a5, x2, t1 ++ addi.d AO1, AO1, 8 * SIZE ++ MADD t2, a6, x2, t2 ++ addi.d AO2, AO2, 8 * SIZE ++ MADD t3, a7, x2, t3 ++ addi.d YY, YY, 8 * SIZE ++ MADD t4, a8, x2, t4 ++ ST t1, YY, -4 * SIZE ++ ST t2, YY, -3 * SIZE ++ ST t3, YY, -2 * SIZE ++ ST t4, YY, -1 * SIZE ++ .align 3 ++ ++.L15: ++ andi I, M, 4 ++ bge $r0, I, .L16 ++ LD a1, AO1, 0 * SIZE ++ LD y1, YY, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a3, AO1, 2 * SIZE ++ LD y3, YY, 2 * SIZE ++ LD a4, AO1, 3 * SIZE ++ LD y4, YY, 3 * SIZE ++ LD a5, AO2, 0 * SIZE ++ MADD y1, a1, x1, y1 ++ LD a6, AO2, 1 * SIZE ++ MADD y2, a2, x1, y2 ++ LD a7, AO2, 2 * SIZE ++ MADD y3, a3, x1, y3 ++ LD a8, AO2, 3 * SIZE ++ MADD y4, a4, x1, y4 ++ MADD y1, a5, x2, y1 ++ addi.d YY, YY, 4 * SIZE ++ MADD y2, a6, x2, y2 ++ addi.d AO1, AO1, 4 * SIZE ++ MADD y3, a7, x2, y3 ++ addi.d AO2, AO2, 4 * SIZE ++ MADD y4, a8, x2, y4 ++ ST y1, YY, -4 * SIZE ++ ST y2, YY, -3 * SIZE ++ ST y3, YY, -2 * SIZE ++ ST y4, YY, -1 * SIZE ++ .align 3 ++ ++.L16: ++ andi I, M, 2 ++ bge $r0, I, .L17 ++ LD a1, AO1, 0 * SIZE ++ LD y1, YY, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a5, AO2, 0 * SIZE ++ LD a6, AO2, 1 * SIZE ++ MADD y1, a1, x1, y1 ++ MADD y2, a2, x1, y2 ++ addi.d YY, YY, 2 * SIZE ++ MADD y1, a5, x2, y1 ++ addi.d AO1, AO1, 2 * SIZE ++ MADD y2, a6, x2, y2 ++ addi.d AO2, AO2, 2 * SIZE ++ ST y1, YY, -2 * SIZE ++ ST y2, YY, -1 * SIZE ++ .align 3 ++ ++.L17: ++ andi I, M, 1 ++ bge $r0, I, .L19 ++ LD y1, YY, 0 * SIZE ++ LD a1, AO1, 0 * SIZE ++ LD a5, AO2, 0 * SIZE 
++ MADD y1, a1, x1, y1 ++ MADD y1, a5, x2, y1 ++ ST y1, YY, 0 * SIZE ++ .align 3 ++ ++.L19: ++ addi.d J, J, -1 ++ blt $r0, J, .L11 ++ .align 3 ++ ++.L20: ++ andi J, N, 1 ++ bge $r0, J, .L900 ++ .align 3 ++ ++.L21: ++ LD x1, X, 0 * SIZE ++ add.d X, X, INCX ++ move YY, YORIG ++ move AO1, A ++ srai.d I, M, 3 ++ MUL x1, ALPHA, x1 ++ bge $r0, I, .L25 ++ LD a1, AO1, 0 * SIZE ++ LD y1, YY, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a3, AO1, 2 * SIZE ++ LD y3, YY, 2 * SIZE ++ LD a4, AO1, 3 * SIZE ++ LD y4, YY, 3 * SIZE ++ LD y5, YY, 4 * SIZE ++ LD y6, YY, 5 * SIZE ++ LD y7, YY, 6 * SIZE ++ addi.d I, I, -1 ++ LD y8, YY, 7 * SIZE ++ bge $r0, I, .L23 ++ .align 3 ++.L22: ++ MADD t1, a1, x1, y1 ++ LD a1, AO1, 4 * SIZE ++ MADD t2, a2, x1, y2 ++ LD a2, AO1, 5 * SIZE ++ LD y1, YY, 8 * SIZE ++ LD y2, YY, 9 * SIZE ++ MADD t3, a3, x1, y3 ++ LD a3, AO1, 6 * SIZE ++ MADD t4, a4, x1, y4 ++ LD a4, AO1, 7 * SIZE ++ LD y3, YY, 10 * SIZE ++ LD y4, YY, 11 * SIZE ++ ST t1, YY, 0 * SIZE ++ ST t2, YY, 1 * SIZE ++ ST t3, YY, 2 * SIZE ++ ST t4, YY, 3 * SIZE ++ MADD t1, a1, x1, y5 ++ LD a1, AO1, 8 * SIZE ++ MADD t2, a2, x1, y6 ++ LD a2, AO1, 9 * SIZE ++ LD y5, YY, 12 * SIZE ++ LD y6, YY, 13 * SIZE ++ MADD t3, a3, x1, y7 ++ LD a3, AO1, 10 * SIZE ++ MADD t4, a4, x1, y8 ++ LD a4, AO1, 11 * SIZE ++ LD y7, YY, 14 * SIZE ++ LD y8, YY, 15 * SIZE ++ ST t1, YY, 4 * SIZE ++ ST t2, YY, 5 * SIZE ++ ST t3, YY, 6 * SIZE ++ ST t4, YY, 7 * SIZE ++ addi.d I, I, -1 ++ addi.d YY, YY, 8 * SIZE ++ addi.d AO1, AO1, 8 * SIZE ++ blt $r0, I, .L22 ++ .align 3 ++ ++.L23: ++ MADD t1, a1, x1, y1 ++ LD a1, AO1, 4 * SIZE ++ MADD t2, a2, x1, y2 ++ LD a2, AO1, 5 * SIZE ++ MADD t3, a3, x1, y3 ++ LD a3, AO1, 6 * SIZE ++ MADD t4, a4, x1, y4 ++ LD a4, AO1, 7 * SIZE ++ ST t1, YY, 0 * SIZE ++ MADD t1, a1, x1, y5 ++ ST t2, YY, 1 * SIZE ++ MADD t2, a2, x1, y6 ++ ST t3, YY, 2 * SIZE ++ MADD t3, a3, x1, y7 ++ ST t4, YY, 3 * SIZE ++ MADD t4, a4, x1, y8 ++ ST t1, YY, 4 * SIZE ++ ST t2, YY, 5 * SIZE ++ ST t3, YY, 6 * SIZE ++ ST t4, YY, 7 * SIZE ++ addi.d AO1, AO1, 8 * SIZE ++ addi.d YY, YY, 8 * SIZE ++ .align 3 ++ ++.L25: ++ andi I, M, 4 ++ bge $r0, I, .L26 ++ LD a1, AO1, 0 * SIZE ++ LD y1, YY, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a3, AO1, 2 * SIZE ++ LD y3, YY, 2 * SIZE ++ LD a4, AO1, 3 * SIZE ++ LD y4, YY, 3 * SIZE ++ MADD y1, a1, x1, y1 ++ MADD y2, a2, x1, y2 ++ MADD y3, a3, x1, y3 ++ addi.d YY, YY, 4 * SIZE ++ MADD y4, a4, x1, y4 ++ addi.d AO1, AO1, 4 * SIZE ++ ST y1, YY, -4 * SIZE ++ ST y2, YY, -3 * SIZE ++ ST y3, YY, -2 * SIZE ++ ST y4, YY, -1 * SIZE ++ .align 3 ++ ++.L26: ++ andi I, M, 2 ++ bge $r0, I, .L27 ++ LD a1, AO1, 0 * SIZE ++ LD y1, YY, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y2, YY, 1 * SIZE ++ MADD y1, a1, x1, y1 ++ addi.d YY, YY, 2 * SIZE ++ MADD y2, a2, x1, y2 ++ addi.d AO1, AO1, 2 * SIZE ++ ST y1, YY, -2 * SIZE ++ ST y2, YY, -1 * SIZE ++ .align 3 ++ ++.L27: ++ andi I, M, 1 ++ bge $r0, I, .L900 ++ LD y1, YY, 0 * SIZE ++ LD a1, AO1, 0 * SIZE ++ MADD y1, a1, x1, y1 ++ ST y1, YY, 0 * SIZE ++ .align 3 ++ ++.L900: ++ li.d YORIG, SIZE ++ srai.d I, M, 2 ++ beq INCY, YORIG, .L999 ++ move XX, BUFFER ++ bge $r0, I, .L905 ++ .align 3 ++ ++.L902: ++ LD a1, XX, 0 * SIZE ++ LD a2, XX, 1 * SIZE ++ LD a3, XX, 2 * SIZE ++ LD a4, XX, 3 * SIZE ++ ST a1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a2, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a3, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a4, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ addi.d I, I, -1 ++ addi.d XX, XX, 4 * SIZE ++ blt $r0, I, .L902 ++ .align 3 ++ ++.L905: ++ andi I, M, 3 ++ bge $r0, 
I, .L999 ++ .align 3 ++ ++.L906: ++ LD a1, XX, 0 * SIZE ++ addi.d XX, XX, 1 * SIZE ++ ST a1, Y, 0 * SIZE ++ addi.d I, I, -1 ++ add.d Y, Y, INCY ++ blt $r0, I, .L906 ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 16 ++ fld.d $f19, $sp, 24 ++ fld.d $f20, $sp, 32 ++#endif ++#ifdef __64BIT__ ++ addi.d $sp, $sp, 16 ++#else ++ addi.d $sp, $sp, 48 ++#endif ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S +new file mode 100644 +index 0000000..af42327 +--- /dev/null ++++ b/kernel/loongarch64/gemv_t.S +@@ -0,0 +1,436 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++/* Unused param dummy1 */ ++#define M $r4 ++#define N $r5 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INCX $r10 ++#define Y $r11 ++#define INCY $r6 ++#define BUFFER $r16 ++#define XORIG $r18 ++#define XX $r12 ++#define YY $r13 ++#define I $r14 ++#define J $r15 ++#define AO1 $r23 ++#define AO2 $r24 ++#define ALPHA $f0 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f23 ++#define a4 $f9 ++#define a5 $f10 ++#define a6 $f11 ++#define a7 $f12 ++#define a8 $f13 ++#define y1 $f14 ++#define y2 $f15 ++#define y3 $f16 ++#define y4 $f17 ++#define x1 $f3 ++#define x2 $f1 ++#define x3 $f2 ++#define x4 $f4 ++#define x5 $f5 ++#define x6 $f6 ++#define x7 $f7 ++#define x8 $f18 ++ ++ PROLOGUE ++ ++ LDARG INCY, $sp, 0 ++ LDARG BUFFER, $sp, 8 ++#ifdef __64BIT__ ++ addi.d $sp, $sp, -16 ++#else ++ addi.d $sp, $sp, -32 ++#endif ++ MTC y1, $r0 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ slli.d LDA, LDA, BASE_SHIFT ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 16 ++#endif ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, M, .L999 ++ slli.d INCY, INCY, BASE_SHIFT ++ bge $r0, N, .L999 ++ li.d I, SIZE ++ move XORIG, X ++ beq INCX, I, .L10 ++ srai.d I, M, 2 ++ move XORIG, BUFFER ++ move YY, BUFFER ++ bge $r0, I, .L05 ++ .align 3 ++ ++.L02: ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a1, YY, 0 * SIZE ++ ST a2, YY, 1 * SIZE ++ ST a3, YY, 2 * SIZE ++ ST a4, YY, 3 * SIZE ++ addi.d I, I, -1 ++ addi.d YY, YY, 4 * SIZE ++ blt $r0, I, .L02 ++ .align 3 ++ ++.L05: ++ andi I, M, 3 ++ bge $r0, I, .L10 ++ .align 3 ++ ++.L06: ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a1, YY, 0 * SIZE ++ addi.d I, I, -1 ++ addi.d YY, YY, 1 * SIZE ++ blt $r0, I, .L06 ++ .align 3 ++ ++.L10: ++ srai.d J, N, 1 ++ move YY, Y ++ bge $r0, J, .L20 ++ .align 3 ++ ++.L11: ++ move AO1, A ++ MOV y2, y1 ++ add.d AO2, A, LDA ++ MOV y3, y1 ++ add.d A, AO2, LDA ++ MOV y4, y1 ++ srai.d I, M, 3 ++ move XX, XORIG ++ bge $r0, I, .L15 ++ LD a1, AO1, 0 * SIZE ++ LD x1, XX, 0 * SIZE ++ LD a2, AO2, 0 * SIZE ++ LD x2, XX, 1 * SIZE ++ LD a3, AO1, 1 * SIZE ++ LD x3, XX, 2 * SIZE ++ LD a4, AO2, 1 * SIZE ++ LD x4, XX, 3 * SIZE ++ LD a5, AO1, 2 * SIZE ++ LD x5, XX, 4 * SIZE ++ LD a6, AO2, 2 * SIZE ++ LD x6, XX, 5 * SIZE ++ LD a7, AO1, 3 * SIZE ++ LD x7, XX, 6 * SIZE ++ LD a8, AO2, 3 * SIZE ++ addi.d I, I, -1 ++ LD x8, XX, 7 * SIZE ++ bge $r0, I, .L13 ++ .align 3 ++.L12: ++ MADD y1, a1, x1, y1 ++ LD a1, AO1, 4 * SIZE ++ MADD y2, a2, x1, y2 ++ LD a2, AO2, 4 * SIZE ++ MADD y3, a3, x2, y3 ++ LD a3, AO1, 5 * SIZE ++ MADD y4, a4, x2, y4 ++ LD a4, AO2, 5 * SIZE ++ LD x1, XX, 8 * SIZE ++ LD x2, XX, 9 * SIZE ++ MADD y1, a5, x3, y1 ++ LD a5, AO1, 6 * SIZE ++ MADD y2, a6, x3, y2 ++ LD a6, AO2, 6 * SIZE ++ MADD y3, a7, x4, y3 ++ LD a7, AO1, 7 * SIZE ++ MADD y4, a8, x4, y4 ++ LD a8, AO2, 7 * SIZE ++ LD x3, XX, 10 * SIZE ++ LD x4, XX, 11 * SIZE ++ MADD y1, a1, x5, y1 ++ LD a1, AO1, 8 * SIZE ++ MADD y2, a2, x5, y2 ++ LD a2, AO2, 8 * SIZE ++ MADD y3, a3, x6, y3 ++ LD a3, AO1, 9 * SIZE ++ MADD y4, a4, x6, y4 ++ LD a4, AO2, 9 * SIZE ++ LD x5, XX, 12 * SIZE ++ LD x6, XX, 13 * SIZE ++ MADD y1, a5, x7, y1 ++ LD a5, AO1, 10 * SIZE ++ MADD y2, a6, x7, y2 ++ LD a6, AO2, 10 * SIZE ++ MADD y3, a7, x8, y3 ++ LD a7, AO1, 11 * SIZE ++ MADD y4, a8, x8, y4 ++ LD a8, AO2, 11 * SIZE ++ LD x7, XX, 14 * SIZE ++ LD x8, XX, 15 * SIZE ++ 
addi.d I, I, -1 ++ addi.d XX, XX, 8 * SIZE ++ addi.d AO1, AO1, 8 * SIZE ++ addi.d AO2, AO2, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ MADD y1, a1, x1, y1 ++ LD a1, AO1, 4 * SIZE ++ MADD y2, a2, x1, y2 ++ LD a2, AO2, 4 * SIZE ++ MADD y3, a3, x2, y3 ++ LD a3, AO1, 5 * SIZE ++ MADD y4, a4, x2, y4 ++ LD a4, AO2, 5 * SIZE ++ MADD y1, a5, x3, y1 ++ LD a5, AO1, 6 * SIZE ++ MADD y2, a6, x3, y2 ++ LD a6, AO2, 6 * SIZE ++ MADD y3, a7, x4, y3 ++ LD a7, AO1, 7 * SIZE ++ MADD y4, a8, x4, y4 ++ LD a8, AO2, 7 * SIZE ++ MADD y1, a1, x5, y1 ++ MADD y2, a2, x5, y2 ++ MADD y3, a3, x6, y3 ++ MADD y4, a4, x6, y4 ++ MADD y1, a5, x7, y1 ++ addi.d XX, XX, 8 * SIZE ++ MADD y2, a6, x7, y2 ++ addi.d AO1, AO1, 8 * SIZE ++ MADD y3, a7, x8, y3 ++ addi.d AO2, AO2, 8 * SIZE ++ MADD y4, a8, x8, y4 ++ .align 3 ++ ++.L15: ++ andi I, M, 4 ++ bge $r0, I, .L17 ++ LD a1, AO1, 0 * SIZE ++ LD x1, XX, 0 * SIZE ++ LD a2, AO2, 0 * SIZE ++ LD a3, AO1, 1 * SIZE ++ LD x2, XX, 1 * SIZE ++ LD a4, AO2, 1 * SIZE ++ LD a5, AO1, 2 * SIZE ++ LD x3, XX, 2 * SIZE ++ MADD y1, a1, x1, y1 ++ LD a6, AO2, 2 * SIZE ++ MADD y2, a2, x1, y2 ++ LD a7, AO1, 3 * SIZE ++ MADD y3, a3, x2, y3 ++ LD x4, XX, 3 * SIZE ++ MADD y4, a4, x2, y4 ++ LD a8, AO2, 3 * SIZE ++ MADD y1, a5, x3, y1 ++ MADD y2, a6, x3, y2 ++ addi.d XX, XX, 4 * SIZE ++ MADD y3, a7, x4, y3 ++ addi.d AO1, AO1, 4 * SIZE ++ MADD y4, a8, x4, y4 ++ addi.d AO2, AO2, 4 * SIZE ++ .align 3 ++ ++.L17: ++ andi I, M, 3 ++ ADD y1, y1, y3 ++ ADD y2, y2, y4 ++ bge $r0, I, .L19 ++ .align 3 ++.L18: ++ LD x1, XX, 0 * SIZE ++ LD a1, AO1, 0 * SIZE ++ LD a2, AO2, 0 * SIZE ++ addi.d I, I, -1 ++ addi.d XX, XX, 1 * SIZE ++ addi.d AO1, AO1, 1 * SIZE ++ addi.d AO2, AO2, 1 * SIZE ++ MADD y1, a1, x1, y1 ++ MADD y2, a2, x1, y2 ++ blt $r0, I, .L18 ++ .align 3 ++ ++.L19: ++ LD a1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a2, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ MADD a1, y1, ALPHA, a1 ++ addi.d J, J, -1 ++ MADD a2, y2, ALPHA, a2 ++ MTC y1, $r0 ++ ST a1, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ ST a2, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ blt $r0, J, .L11 ++ .align 3 ++ ++.L20: ++ andi J, N, 1 ++ MOV y3, y1 ++ move AO1, A ++ bge $r0, J, .L999 ++ srai.d I, M, 3 ++ move XX, XORIG ++ bge $r0, I, .L25 ++ LD a1, AO1, 0 * SIZE ++ LD x1, XX, 0 * SIZE ++ LD a3, AO1, 1 * SIZE ++ LD x2, XX, 1 * SIZE ++ LD a5, AO1, 2 * SIZE ++ LD x3, XX, 2 * SIZE ++ LD a7, AO1, 3 * SIZE ++ LD x4, XX, 3 * SIZE ++ LD x5, XX, 4 * SIZE ++ LD x6, XX, 5 * SIZE ++ LD x7, XX, 6 * SIZE ++ addi.d I, I, -1 ++ LD x8, XX, 7 * SIZE ++ bge $r0, I, .L23 ++ .align 3 ++.L22: ++ MADD y1, a1, x1, y1 ++ LD a1, AO1, 4 * SIZE ++ MADD y3, a3, x2, y3 ++ LD a3, AO1, 5 * SIZE ++ LD x1, XX, 8 * SIZE ++ LD x2, XX, 9 * SIZE ++ MADD y1, a5, x3, y1 ++ LD a5, AO1, 6 * SIZE ++ MADD y3, a7, x4, y3 ++ LD a7, AO1, 7 * SIZE ++ LD x3, XX, 10 * SIZE ++ LD x4, XX, 11 * SIZE ++ MADD y1, a1, x5, y1 ++ LD a1, AO1, 8 * SIZE ++ MADD y3, a3, x6, y3 ++ LD a3, AO1, 9 * SIZE ++ LD x5, XX, 12 * SIZE ++ LD x6, XX, 13 * SIZE ++ MADD y1, a5, x7, y1 ++ LD a5, AO1, 10 * SIZE ++ MADD y3, a7, x8, y3 ++ LD a7, AO1, 11 * SIZE ++ LD x7, XX, 14 * SIZE ++ LD x8, XX, 15 * SIZE ++ addi.d I, I, -1 ++ addi.d XX, XX, 8 * SIZE ++ addi.d AO1, AO1, 8 * SIZE ++ blt $r0, I, .L22 ++ .align 3 ++ ++.L23: ++ MADD y1, a1, x1, y1 ++ LD a1, AO1, 4 * SIZE ++ MADD y3, a3, x2, y3 ++ LD a3, AO1, 5 * SIZE ++ MADD y1, a5, x3, y1 ++ LD a5, AO1, 6 * SIZE ++ MADD y3, a7, x4, y3 ++ LD a7, AO1, 7 * SIZE ++ MADD y1, a1, x5, y1 ++ MADD y3, a3, x6, y3 ++ MADD y1, a5, x7, y1 ++ MADD y3, a7, x8, y3 ++ addi.d XX, XX, 8 * SIZE ++ addi.d AO1, 
AO1, 8 * SIZE ++ .align 3 ++ ++.L25: ++ andi I, M, 4 ++ bge $r0, I, .L27 ++ LD a1, AO1, 0 * SIZE ++ LD x1, XX, 0 * SIZE ++ LD a3, AO1, 1 * SIZE ++ LD x2, XX, 1 * SIZE ++ LD a5, AO1, 2 * SIZE ++ LD x3, XX, 2 * SIZE ++ MADD y1, a1, x1, y1 ++ LD a7, AO1, 3 * SIZE ++ MADD y3, a3, x2, y3 ++ LD x4, XX, 3 * SIZE ++ MADD y1, a5, x3, y1 ++ addi.d XX, XX, 4 * SIZE ++ MADD y3, a7, x4, y3 ++ addi.d AO1, AO1, 4 * SIZE ++ .align 3 ++ ++.L27: ++ andi I, M, 3 ++ ADD y1, y1, y3 ++ bge $r0, I, .L29 ++ .align 3 ++.L28: ++ LD x1, XX, 0 * SIZE ++ LD a1, AO1, 0 * SIZE ++ addi.d I, I, -1 ++ addi.d XX, XX, 1 * SIZE ++ addi.d AO1, AO1, 1 * SIZE ++ MADD y1, a1, x1, y1 ++ blt $r0, I, .L28 ++ .align 3 ++ ++.L29: ++ LD a1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ MADD a1, y1, ALPHA, a1 ++ ST a1, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 16 ++#endif ++#ifdef __64BIT__ ++ addi.d $sp, $sp, 16 ++#else ++ addi.d $sp, $sp, 32 ++#endif ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S +new file mode 100644 +index 0000000..31b1a9e +--- /dev/null ++++ b/kernel/loongarch64/iamax.S +@@ -0,0 +1,233 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r18 ++#define TEMP $r7 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++#define x1 $r17 ++#define x2 $r8 ++#define x3 $r9 ++#define x4 $r10 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ li.d x1, 0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, INCX, .L999 ++ LD a1, X, 0 * SIZE ++ addi.d N, N, -1 ++ li.d x1, 1 ++ bge $r0, N, .L999 ++ FABS s1, a1 ++ add.d X, X, INCX ++ FABS s2, a1 ++ li.d x2, 1 ++ FABS s3, a1 ++ srai.d I, N, 3 ++ FABS s4, a1 ++ li.d x3, 1 ++ li.d TEMP, 2 ++ li.d x4, 1 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a8, X, 0 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ add.d X, X, INCX ++ FABS t3, a3 ++ LD a2, X, 0 * SIZE ++ FABS t4, a4 ++ add.d X, X, INCX ++ CMPLT $fcc0, s1, t1 ++ LD a3, X, 0 * SIZE ++ CMPLT $fcc1, s2, t2 ++ add.d X, X, INCX ++ CMPLT $fcc2, s3, t3 ++ LD a4, X, 0 * SIZE ++ CMPLT $fcc3, s4, t4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t2, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t3, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t4, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ addi.d I, I, -1 ++ FABS t1, a5 ++ LD a5, X, 0 * SIZE ++ FABS t2, a6 ++ add.d X, X, INCX ++ FABS t3, a7 ++ LD a6, X, 0 * SIZE ++ FABS t4, a8 ++ add.d X, X, INCX ++ CMPLT $fcc0, s1, t1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, s2, t2 ++ add.d X, X, INCX ++ CMPLT $fcc2, s3, t3 ++ LD a8, X, 0 * SIZE ++ CMPLT $fcc3, s4, t4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t2, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t3, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t4, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t2 ++ CMPLT $fcc2, s3, t3 ++ CMPLT $fcc3, s4, t4 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t2, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t3, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t4, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ FABS t1, a5 ++ addi.d TEMP, TEMP, 4 ++ FABS t2, a6 ++ FABS t3, a7 ++ FABS t4, a8 ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t2 ++ CMPLT $fcc2, s3, t3 ++ CMPLT $fcc3, s4, t4 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t2, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t3, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t4, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ addi.d x2, x2, 1 ++ addi.d x3, x3, 2 ++ addi.d x4, x4, 3 ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ FABS t1, a1 ++ addi.d I, I, 
-1 ++ CMPLT $fcc0, s1, t1 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ addi.d TEMP, TEMP, 1 ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L998: ++ CMPLT $fcc0, s1, s2 ++ CMPLT $fcc1, s3, s4 ++ CMOVT s1, s1, s2, $fcc0 ++ MOVT(x1, x2, $fcc0) ++ CMOVT s3, s3, s4, $fcc1 ++ MOVT(x3, x4, $fcc1) ++ CMPLT $fcc0, s1, s3 ++ CMOVT s1, s1, s3, $fcc0 ++ MOVT(x1, x3, $fcc0) ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S +new file mode 100644 +index 0000000..9364b97 +--- /dev/null ++++ b/kernel/loongarch64/iamin.S +@@ -0,0 +1,233 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r18 ++#define TEMP $r7 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++#define x1 $r17 ++#define x2 $r8 ++#define x3 $r9 ++#define x4 $r10 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ li.d x1, 0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, INCX, .L999 ++ LD a1, X, 0 * SIZE ++ addi.d N, N, -1 ++ li.d x1, 1 ++ bge $r0, N, .L999 ++ FABS s1, a1 ++ add.d X, X, INCX ++ FABS s2, a1 ++ li.d x2, 1 ++ FABS s3, a1 ++ srai.d I, N, 3 ++ FABS s4, a1 ++ li.d x3, 1 ++ li.d TEMP, 2 ++ li.d x4, 1 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a8, X, 0 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ add.d X, X, INCX ++ FABS t3, a3 ++ LD a2, X, 0 * SIZE ++ FABS t4, a4 ++ add.d X, X, INCX ++ CMPLT $fcc0, t1, s1 ++ LD a3, X, 0 * SIZE ++ CMPLT $fcc1, t2, s2 ++ add.d X, X, INCX ++ CMPLT $fcc2, t3, s3 ++ LD a4, X, 0 * SIZE ++ CMPLT $fcc3, t4, s4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t2, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t3, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t4, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ addi.d I, I, -1 ++ FABS t1, a5 ++ LD a5, X, 0 * SIZE ++ FABS t2, a6 ++ add.d X, X, INCX ++ FABS t3, a7 ++ LD a6, X, 0 * SIZE ++ FABS t4, a8 ++ add.d X, X, INCX ++ CMPLT $fcc0, t1, s1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, t2, s2 ++ add.d X, X, INCX ++ CMPLT $fcc2, t3, s3 ++ LD a8, X, 0 * SIZE ++ CMPLT $fcc3, t4, s4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t2, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t3, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t4, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ CMPLT $fcc0, t1, s1 ++ CMPLT $fcc1, t2, s2 ++ CMPLT $fcc2, t3, s3 ++ CMPLT $fcc3, t4, s4 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t2, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t3, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t4, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ FABS t1, a5 ++ addi.d TEMP, TEMP, 4 ++ FABS t2, a6 ++ FABS t3, a7 ++ FABS t4, a8 ++ CMPLT $fcc0, t1, s1 ++ CMPLT $fcc1, t2, s2 ++ CMPLT $fcc2, t3, s3 ++ CMPLT $fcc3, t4, s4 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t2, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t3, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t4, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ addi.d x2, x2, 1 ++ addi.d x3, x3, 2 ++ addi.d x4, x4, 3 ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ FABS t1, a1 ++ addi.d I, I, 
-1 ++ CMPLT $fcc0, t1, s1 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ addi.d TEMP, TEMP, 1 ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L998: ++ CMPLT $fcc0, s2, s1 ++ CMPLT $fcc1, s4, s3 ++ CMOVT s1, s1, s2, $fcc0 ++ MOVT(x1, x2, $fcc0) ++ CMOVT s3, s3, s4, $fcc1 ++ MOVT(x3, x4, $fcc1) ++ CMPLT $fcc0, s3, s1 ++ CMOVT s1, s1, s3, $fcc0 ++ MOVT(x1, x3, $fcc0) ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S +new file mode 100644 +index 0000000..8d3ae52 +--- /dev/null ++++ b/kernel/loongarch64/izamax.S +@@ -0,0 +1,217 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r18 ++#define TEMP $r7 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++#define t5 $f4 ++#define t6 $f5 ++#define t7 $f6 ++#define t8 $f7 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++#define x1 $r17 ++#define x2 $r8 ++#define x3 $r9 ++#define x4 $r10 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ li.d x1, 0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, INCX, .L999 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ FABS t1, a1 ++ FABS t2, a2 ++ ADD s1, t1, t2 ++ ADD s2, t1, t2 ++ ADD s3, t1, t2 ++ ADD s4, t1, t2 ++ addi.d N, N, -1 ++ li.d x1, 1 ++ bge $r0, N, .L999 ++ add.d X, X, INCX ++ li.d x2, 1 ++ srai.d I, N, 2 ++ li.d x3, 1 ++ li.d TEMP, 2 ++ li.d x4, 1 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ LD a8, X, 1 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ LD a2, X, 1 * SIZE ++ FABS t3, a3 ++ add.d X, X, INCX ++ FABS t4, a4 ++ FABS t5, a5 ++ LD a3, X, 0 * SIZE ++ FABS t6, a6 ++ LD a4, X, 1 * SIZE ++ FABS t7, a7 ++ add.d X, X, INCX ++ FABS t8, a8 ++ ADD t1, t1, t2 ++ LD a5, X, 0 * SIZE ++ ADD t3, t3, t4 ++ LD a6, X, 1 * SIZE ++ ADD t5, t5, t6 ++ add.d X, X, INCX ++ ADD t7, t7, t8 ++ CMPLT $fcc0, s1, t1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, s2, t3 ++ LD a8, X, 1 * SIZE ++ CMPLT $fcc2, s3, t5 ++ add.d X, X, INCX ++ CMPLT $fcc3, s4, t7 ++ addi.d I, I, -1 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t3, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t5, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t7, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ FABS t5, a5 ++ FABS t6, a6 ++ FABS t7, a7 ++ FABS t8, a8 ++ ADD t1, t1, t2 ++ ADD t3, t3, t4 ++ ADD t5, t5, t6 ++ ADD t7, t7, t8 ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t3 ++ CMPLT $fcc2, s3, t5 ++ CMPLT $fcc3, s4, t7 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t3, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t5, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t7, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ addi.d x2, x2, 1 ++ addi.d x3, x3, 2 ++ addi.d x4, x4, 3 ++ .align 3 ++ ++.L15: ++ andi I, N, 3 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ FABS t1, a1 ++ FABS t2, a2 ++ ADD t1, t1, t2 ++ addi.d I, I, -1 ++ CMPLT $fcc0, s1, t1 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ addi.d TEMP, TEMP, 1 ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L998: ++ CMPLT $fcc0, s1, s2 ++ CMPLT $fcc1, s3, s4 ++ CMOVT s1, s1, s2, $fcc0 ++ MOVT(x1, x2, $fcc0) ++ CMOVT s3, s3, s4, $fcc1 ++ MOVT(x3, x4, $fcc1) ++ CMPLT $fcc0, s1, s3 ++ CMOVT s1, s1, s3, $fcc0 ++ MOVT(x1, x3, $fcc0) ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git 
a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S +new file mode 100644 +index 0000000..38a109c +--- /dev/null ++++ b/kernel/loongarch64/izamin.S +@@ -0,0 +1,217 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r18 ++#define TEMP $r7 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++#define t5 $f4 ++#define t6 $f5 ++#define t7 $f6 ++#define t8 $f7 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++#define x1 $r17 ++#define x2 $r8 ++#define x3 $r9 ++#define x4 $r10 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ li.d x1, 0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, INCX, .L999 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ FABS t1, a1 ++ FABS t2, a2 ++ ADD s1, t1, t2 ++ ADD s2, t1, t2 ++ ADD s3, t1, t2 ++ ADD s4, t1, t2 ++ addi.d N, N, -1 ++ li.d x1, 1 ++ bge $r0, N, .L999 ++ add.d X, X, INCX ++ li.d x2, 1 ++ srai.d I, N, 2 ++ li.d x3, 1 ++ li.d TEMP, 2 ++ li.d x4, 1 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ LD a8, X, 1 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ LD a2, X, 1 * SIZE ++ FABS t3, a3 ++ add.d X, X, INCX ++ FABS t4, a4 ++ FABS t5, a5 ++ LD a3, X, 0 * SIZE ++ FABS t6, a6 ++ LD a4, X, 1 * SIZE ++ FABS t7, a7 ++ add.d X, X, INCX ++ FABS t8, a8 ++ ADD t1, t1, t2 ++ LD a5, X, 0 * SIZE ++ ADD t3, t3, t4 ++ LD a6, X, 1 * SIZE ++ ADD t5, t5, t6 ++ add.d X, X, INCX ++ ADD t7, t7, t8 ++ CMPLT $fcc0, t1, s1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, t3, s2 ++ LD a8, X, 1 * SIZE ++ CMPLT $fcc2, t5, s3 ++ add.d X, X, INCX ++ CMPLT $fcc3, t7, s4 ++ addi.d I, I, -1 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t3, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t5, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t7, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ FABS t5, a5 ++ FABS t6, a6 ++ FABS t7, a7 ++ FABS t8, a8 ++ ADD t1, t1, t2 ++ ADD t3, t3, t4 ++ ADD t5, t5, t6 ++ ADD t7, t7, t8 ++ CMPLT $fcc0, t1, s1 ++ CMPLT $fcc1, t3, s2 ++ CMPLT $fcc2, t5, s3 ++ CMPLT $fcc3, t7, s4 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ CMOVT s2, s2, t3, $fcc1 ++ MOVT(x2, TEMP, $fcc1) ++ CMOVT s3, s3, t5, $fcc2 ++ MOVT(x3, TEMP, $fcc2) ++ CMOVT s4, s4, t7, $fcc3 ++ MOVT(x4, TEMP, $fcc3) ++ addi.d TEMP, TEMP, 4 ++ addi.d x2, x2, 1 ++ addi.d x3, x3, 2 ++ addi.d x4, x4, 3 ++ .align 3 ++ ++.L15: ++ andi I, N, 3 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ FABS t1, a1 ++ FABS t2, a2 ++ ADD t1, t1, t2 ++ addi.d I, I, -1 ++ CMPLT $fcc0, t1, s1 ++ CMOVT s1, s1, t1, $fcc0 ++ MOVT(x1, TEMP, $fcc0) ++ addi.d TEMP, TEMP, 1 ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L998: ++ CMPLT $fcc0, s2, s1 ++ CMPLT $fcc1, s4, s3 ++ CMOVT s1, s1, s2, $fcc0 ++ MOVT(x1, x2, $fcc0) ++ CMOVT s3, s3, s4, $fcc1 ++ MOVT(x3, x4, $fcc1) ++ CMPLT $fcc0, s3, s1 ++ CMOVT s1, s1, s3, $fcc0 ++ MOVT(x1, x3, $fcc0) ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git 
a/kernel/loongarch64/max.S b/kernel/loongarch64/max.S +new file mode 100644 +index 0000000..56c3f99 +--- /dev/null ++++ b/kernel/loongarch64/max.S +@@ -0,0 +1,174 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ MTC s1, $r0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, INCX, .L999 ++ LD s1, X, 0 * SIZE ++ addi.d N, N, -1 ++ add.d X, X, INCX ++ MOV s2, s1 ++ bge $r0, N, .L999 ++ MOV s3, s1 ++ srai.d I, N, 3 ++ MOV s4, s1 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ CMPLT $fcc0, s1, a1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, s2, a2 ++ add.d X, X, INCX ++ CMPLT $fcc2, s3, a3 ++ LD a8, X, 0 * SIZE ++ CMPLT $fcc3, s4, a4 ++ add.d X, X, INCX ++ CMOVT s1, s1, a1, $fcc0 ++ LD a1, X, 0 * SIZE ++ CMOVT s2, s2, a2, $fcc1 ++ add.d X, X, INCX ++ CMOVT s3, s3, a3, $fcc2 ++ LD a2, X, 0 * SIZE ++ CMOVT s4, s4, a4, $fcc3 ++ add.d X, X, INCX ++ CMPLT $fcc0, s1, a5 ++ LD a3, X, 0 * SIZE ++ CMPLT $fcc1, s2, a6 ++ add.d X, X, INCX ++ CMPLT $fcc2, s3, a7 ++ LD a4, X, 0 * SIZE ++ CMPLT $fcc3, s4, a8 ++ add.d X, X, INCX ++ CMOVT s1, s1, a5, $fcc0 ++ LD a5, X, 0 * SIZE ++ CMOVT s2, s2, a6, $fcc1 ++ add.d X, X, INCX ++ CMOVT s3, s3, a7, $fcc2 ++ LD a6, X, 0 * SIZE ++ CMOVT s4, s4, a8, $fcc3 ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ CMPLT $fcc0, s1, a1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, s2, a2 ++ add.d X, X, INCX ++ CMPLT $fcc2, s3, a3 ++ LD a8, X, 0 * SIZE ++ CMPLT $fcc3, s4, a4 ++ add.d X, X, INCX ++ CMOVT s1, s1, a1, $fcc0 ++ CMOVT s2, s2, a2, $fcc1 ++ CMOVT s3, s3, a3, $fcc2 ++ CMOVT s4, s4, a4, $fcc3 ++ CMPLT $fcc0, s1, a5 ++ CMPLT $fcc1, s2, a6 ++ CMPLT $fcc2, s3, a7 ++ CMPLT $fcc3, s4, a8 ++ CMOVT s1, s1, a5, $fcc0 ++ CMOVT s2, s2, a6, $fcc1 ++ CMOVT s3, s3, a7, $fcc2 ++ CMOVT s4, s4, a8, $fcc3 ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ CMPLT $fcc0, s1, a1 ++ CMOVT s1, s1, a1, $fcc0 ++ add.d X, X, INCX ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L998: ++ CMPLT $fcc0, s1, s2 ++ CMPLT $fcc1, s3, s4 ++ CMOVT s1, s1, s2, $fcc0 ++ CMOVT s3, s3, s4, $fcc1 ++ CMPLT $fcc0, s1, s3 ++ CMOVT s1, s1, s3, $fcc0 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/min.S b/kernel/loongarch64/min.S +new file mode 100644 +index 0000000..bb2fcfb +--- /dev/null ++++ b/kernel/loongarch64/min.S +@@ -0,0 +1,174 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. 
Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ MTC s1, $r0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, INCX, .L999 ++ LD s1, X, 0 * SIZE ++ addi.d N, N, -1 ++ add.d X, X, INCX ++ MOV s2, s1 ++ bge $r0, N, .L999 ++ MOV s3, s1 ++ srai.d I, N, 3 ++ MOV s4, s1 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ CMPLT $fcc0, a1, s1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, a2, s2 ++ add.d X, X, INCX ++ CMPLT $fcc2, a3, s3 ++ LD a8, X, 0 * SIZE ++ CMPLT $fcc3, a4, s4 ++ add.d X, X, INCX ++ CMOVT s1, s1, a1, $fcc0 ++ LD a1, X, 0 * SIZE ++ CMOVT s2, s2, a2, $fcc1 ++ add.d X, X, INCX ++ CMOVT s3, s3, a3, $fcc2 ++ LD a2, X, 0 * SIZE ++ CMOVT s4, s4, a4, $fcc3 ++ add.d X, X, INCX ++ CMPLT $fcc0, a5, s1 ++ LD a3, X, 0 * SIZE ++ CMPLT $fcc1, a6, s2 ++ add.d X, X, INCX ++ CMPLT $fcc2, a7, s3 ++ LD a4, X, 0 * SIZE ++ CMPLT $fcc3, a8, s4 ++ add.d X, X, INCX ++ CMOVT s1, s1, a5, $fcc0 ++ LD a5, X, 0 * SIZE ++ CMOVT s2, s2, a6, $fcc1 ++ add.d X, X, INCX ++ CMOVT s3, s3, a7, $fcc2 ++ LD a6, X, 0 * SIZE ++ CMOVT s4, s4, a8, $fcc3 ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ CMPLT $fcc0, a1, s1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, a2, s2 ++ add.d X, X, INCX ++ CMPLT $fcc2, a3, s3 ++ LD a8, X, 0 * SIZE ++ CMPLT $fcc3, a4, s4 ++ add.d X, X, INCX ++ CMOVT s1, s1, a1, $fcc0 ++ CMOVT s2, s2, a2, $fcc1 ++ CMOVT s3, s3, a3, $fcc2 ++ CMOVT s4, s4, a4, $fcc3 ++ CMPLT $fcc0, a5, s1 ++ CMPLT $fcc1, a6, s2 ++ CMPLT $fcc2, a7, s3 ++ CMPLT $fcc3, a8, s4 ++ CMOVT s1, s1, a5, $fcc0 ++ CMOVT s2, s2, a6, $fcc1 ++ CMOVT s3, s3, a7, $fcc2 ++ CMOVT s4, s4, a8, 
$fcc3 ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ CMPLT $fcc0, a1, s1 ++ CMOVT s1, s1, a1, $fcc0 ++ add.d X, X, INCX ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L998: ++ CMPLT $fcc0, s2, s1 ++ CMPLT $fcc1, s4, s3 ++ CMOVT s1, s1, s2, $fcc0 ++ CMOVT s3, s3, s4, $fcc1 ++ CMPLT $fcc0, s3, s1 ++ CMOVT s1, s1, s3, $fcc0 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S +new file mode 100644 +index 0000000..566bce6 +--- /dev/null ++++ b/kernel/loongarch64/scal.S +@@ -0,0 +1,330 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r7 ++#define INCX $r8 ++ ++#define I $r17 ++#define TEMP $r18 ++#define XX $r5 ++#define ALPHA $f0 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f23 ++#define a4 $f9 ++#define a5 $f10 ++#define a6 $f11 ++#define a7 $f12 ++#define a8 $f13 ++#define t1 $f14 ++#define t2 $f15 ++#define t3 $f16 ++#define t4 $f17 ++ ++ PROLOGUE ++ ++ li.d TEMP, SIZE ++ MTC a1, $r0 ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, N, .L999 ++ CMPEQ $fcc0, ALPHA, a1 ++ bceqz $fcc0, .L50 ++ srai.d I, N, 3 ++ bne INCX, TEMP, .L20 ++ bge $r0, I, .L15 ++ .align 3 ++ ++.L12: ++ ST a1, X, 0 * SIZE ++ ST a1, X, 1 * SIZE ++ ST a1, X, 2 * SIZE ++ ST a1, X, 3 * SIZE ++ ST a1, X, 4 * SIZE ++ ST a1, X, 5 * SIZE ++ ST a1, X, 6 * SIZE ++ ST a1, X, 7 * SIZE ++ addi.w I, I, -1 ++ addi.d X, X, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++.L16: ++ ST a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ addi.d X, X, SIZE ++ blt $r0, I, .L16 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ .align 3 ++ ++.L20: ++ srai.d I, N, 3 ++ bge $r0, I, .L25 ++ .align 3 ++ ++.L22: ++ ST a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ blt $r0, I, .L22 ++ .align 3 ++ ++.L25: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++.L26: ++ addi.d I, I, -1 ++ ST a1, X, 0 * SIZE ++ add.d X, X, INCX ++ blt $r0, I, .L26 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ .align 3 ++ ++.L50: ++ srai.d I, N, 3 ++ bne INCX, TEMP, .L60 ++ addi.d I, I, -1 ++ blt I, $r0, .L55 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD a3, X, 2 * SIZE ++ LD a4, X, 3 * SIZE ++ LD a5, X, 4 * SIZE ++ LD a6, X, 5 * SIZE ++ LD a7, X, 6 * SIZE ++ LD a8, X, 7 * SIZE ++ bge $r0, I, .L53 ++ .align 3 ++ ++.L52: ++ MUL t1, ALPHA, a1 ++ LD a1, X, 8 * SIZE ++ MUL t2, ALPHA, a2 ++ LD a2, X, 9 * SIZE ++ MUL t3, ALPHA, a3 ++ LD a3, X, 10 * SIZE ++ MUL t4, ALPHA, a4 ++ LD a4, X, 11 * SIZE ++ ST t1, X, 0 * SIZE ++ MUL t1, ALPHA, a5 ++ LD a5, X, 12 * SIZE ++ ST t2, X, 1 * SIZE ++ MUL t2, ALPHA, a6 ++ LD a6, X, 13 * SIZE ++ ST t3, X, 2 * SIZE ++ MUL t3, ALPHA, a7 ++ LD a7, X, 14 * SIZE ++ ST t4, X, 3 * SIZE ++ MUL t4, ALPHA, a8 ++ LD a8, X, 15 * SIZE ++ addi.d I, I, -1 ++ ST t1, X, 4 * SIZE ++ ST t2, X, 5 * SIZE ++ ST t3, X, 6 * SIZE ++ ST t4, X, 7 * SIZE ++ addi.d X, X, 8 * SIZE ++ blt $r0, I, .L52 ++ .align 3 ++ ++.L53: ++ MUL t1, ALPHA, a1 ++ MUL t2, ALPHA, a2 ++ MUL t3, ALPHA, a3 ++ MUL t4, ALPHA, a4 ++ ST t1, X, 0 * SIZE ++ MUL t1, ALPHA, a5 ++ ST t2, X, 1 * SIZE ++ MUL t2, ALPHA, a6 ++ ST t3, X, 2 * SIZE ++ MUL t3, ALPHA, a7 ++ ST t4, X, 3 * SIZE ++ MUL t4, ALPHA, a8 ++ ST t1, X, 4 * SIZE ++ ST t2, X, 5 * SIZE ++ ST t3, X, 6 * SIZE ++ ST t4, X, 7 * SIZE ++ addi.d X, X, 8 * SIZE ++ .align 3 ++ ++.L55: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++.L56: ++ LD a1, X, 0 * SIZE ++ MUL t1, ALPHA, a1 ++ addi.d X, X, SIZE ++ addi.d I, I, -1 ++ ST t1, X, -1 * SIZE ++ blt $r0, I, .L56 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ .align 3 ++ ++.L60: ++ srai.d I, N, 3 ++ move XX, X ++ addi.d I, I, -1 ++ blt I, $r0, .L65 ++ LD a1, X, 0 * SIZE ++ add.d X, 
X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a8, X, 0 * SIZE ++ add.d X, X, INCX ++ bge $r0, I, .L63 ++ .align 3 ++ ++.L62: ++ MUL t1, ALPHA, a1 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ MUL t2, ALPHA, a2 ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ MUL t3, ALPHA, a3 ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ MUL t4, ALPHA, a4 ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ ST t1, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t2, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t3, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t4, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ MUL t1, ALPHA, a5 ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ MUL t2, ALPHA, a6 ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ MUL t3, ALPHA, a7 ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ MUL t4, ALPHA, a8 ++ LD a8, X, 0 * SIZE ++ add.d X, X, INCX ++ ST t1, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t2, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t3, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t4, XX, 0 * SIZE ++ addi.d I, I, -1 ++ add.d XX, XX, INCX ++ blt $r0, I, .L62 ++ .align 3 ++ ++.L63: ++ MUL t1, ALPHA, a1 ++ MUL t2, ALPHA, a2 ++ MUL t3, ALPHA, a3 ++ MUL t4, ALPHA, a4 ++ ST t1, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t2, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t3, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t4, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ MUL t1, ALPHA, a5 ++ MUL t2, ALPHA, a6 ++ MUL t3, ALPHA, a7 ++ MUL t4, ALPHA, a8 ++ ST t1, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t2, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t3, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST t4, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ .align 3 ++ ++.L65: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++.L66: ++ LD a1, X, 0 * SIZE ++ MUL t1, ALPHA, a1 ++ addi.d I, I, -1 ++ ST t1, X, 0 * SIZE ++ add.d X, X, INCX ++ blt $r0, I, .L66 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S +new file mode 100644 +index 0000000..57c21a0 +--- /dev/null ++++ b/kernel/loongarch64/snrm2.S +@@ -0,0 +1,249 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f12 ++#define a2 $f13 ++#define a3 $f14 ++#define a4 $f15 ++#define a5 $f16 ++#define a6 $f17 ++#define a7 $f0 ++#define a8 $f1 ++#define s1 $f22 ++#define s2 $f8 ++#define t1 $f23 ++#define t2 $f9 ++#define t3 $f10 ++#define t4 $f11 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ movgr2fr.d s1, $r0 ++ li.d TEMP, SIZE ++ fmov.d s2, s1 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, INCX, .L999 ++ srai.d I, N, 3 ++ bne INCX, TEMP, .L20 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD a3, X, 2 * SIZE ++ LD a4, X, 3 * SIZE ++ LD a5, X, 4 * SIZE ++ addi.d I, I, -1 ++ fcvt.d.s t1, a1 ++ LD a6, X, 5 * SIZE ++ fcvt.d.s t2, a2 ++ LD a7, X, 6 * SIZE ++ fcvt.d.s t3, a3 ++ LD a8, X, 7 * SIZE ++ fcvt.d.s t4, a4 ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ fmadd.d s1, t1, t1, s1 ++ LD a1, X, 8 * SIZE ++ fcvt.d.s t1, a5 ++ NOP ++ fmadd.d s2, t2, t2, s2 ++ LD a2, X, 9 * SIZE ++ fcvt.d.s t2, a6 ++ NOP ++ fmadd.d s1, t3, t3, s1 ++ LD a3, X, 10 * SIZE ++ fcvt.d.s t3, a7 ++ NOP ++ fmadd.d s2, t4, t4, s2 ++ LD a4, X, 11 * SIZE ++ fcvt.d.s t4, a8 ++ NOP ++ fmadd.d s1, t1, t1, s1 ++ LD a5, X, 12 * SIZE ++ fcvt.d.s t1, a1 ++ NOP ++ fmadd.d s2, t2, t2, s2 ++ LD a6, X, 13 * SIZE ++ fcvt.d.s t2, a2 ++ addi.d I, I, -1 ++ fmadd.d s1, t3, t3, s1 ++ LD a7, X, 14 * SIZE ++ fcvt.d.s t3, a3 ++ addi.d X, X, 8 * SIZE ++ fmadd.d s2, t4, t4, s2 ++ LD a8, X, 7 * SIZE ++ fcvt.d.s t4, a4 ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ fmadd.d s1, t1, t1, s1 ++ fcvt.d.s t1, a5 ++ fmadd.d s2, t2, t2, s2 ++ fcvt.d.s t2, a6 ++ fmadd.d s1, t3, t3, s1 ++ fcvt.d.s t3, a7 ++ fmadd.d s2, t4, t4, s2 ++ fcvt.d.s t4, a8 ++ fmadd.d s1, t1, t1, s1 ++ fmadd.d s2, t2, t2, s2 ++ fmadd.d s1, t3, t3, s1 ++ fmadd.d s2, t4, t4, s2 ++ addi.d X, X, 8 * SIZE ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ fcvt.d.s t1, a1 ++ fmadd.d s1, t1, t1, s1 ++ addi.d X, X, SIZE ++ blt $r0, I, .L16 ++ b .L999 ++ .align 3 ++ ++.L20: ++ bge $r0, I, .L25 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ LD a8, X, 0 * SIZE ++ addi.d I, I, -1 ++ fcvt.d.s t1, a1 ++ fcvt.d.s t2, a2 ++ fcvt.d.s t3, a3 ++ fcvt.d.s t4, a4 ++ add.d X, X, INCX ++ bge $r0, I, .L24 ++ .align 3 ++ ++.L23: ++ fmadd.d s1, t1, t1, s1 ++ LD a1, X, 0 * SIZE ++ fcvt.d.s t1, a5 ++ add.d X, X, INCX ++ fmadd.d s2, t2, t2, s2 ++ LD a2, X, 0 * SIZE ++ fcvt.d.s t2, a6 ++ add.d X, X, INCX ++ fmadd.d s1, t3, t3, s1 ++ LD a3, X, 0 * SIZE ++ fcvt.d.s t3, a7 ++ 
add.d X, X, INCX ++ fmadd.d s2, t4, t4, s2 ++ LD a4, X, 0 * SIZE ++ fcvt.d.s t4, a8 ++ add.d X, X, INCX ++ fmadd.d s1, t1, t1, s1 ++ LD a5, X, 0 * SIZE ++ fcvt.d.s t1, a1 ++ add.d X, X, INCX ++ fmadd.d s2, t2, t2, s2 ++ LD a6, X, 0 * SIZE ++ fcvt.d.s t2, a2 ++ add.d X, X, INCX ++ fmadd.d s1, t3, t3, s1 ++ LD a7, X, 0 * SIZE ++ fcvt.d.s t3, a3 ++ add.d X, X, INCX ++ fmadd.d s2, t4, t4, s2 ++ LD a8, X, 0 * SIZE ++ fcvt.d.s t4, a4 ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ blt $r0, I, .L23 ++ .align 3 ++ ++.L24: ++ fmadd.d s1, t1, t1, s1 ++ fcvt.d.s t1, a5 ++ fmadd.d s2, t2, t2, s2 ++ fcvt.d.s t2, a6 ++ fmadd.d s1, t3, t3, s1 ++ fcvt.d.s t3, a7 ++ fmadd.d s2, t4, t4, s2 ++ fcvt.d.s t4, a8 ++ fmadd.d s1, t1, t1, s1 ++ fmadd.d s2, t2, t2, s2 ++ fmadd.d s1, t3, t3, s1 ++ fmadd.d s2, t4, t4, s2 ++ .align 3 ++ ++.L25: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L26: ++ LD a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ fcvt.d.s t1, a1 ++ add.d X, X, INCX ++ fmadd.d s1, t1, t1, s1 ++ blt $r0, I, .L26 ++ .align 3 ++ ++.L999: ++ fadd.d s1, s1, s2 ++ fsqrt.d s1, s1 ++ move $r4, $r17 ++ fcvt.s.d $f0, s1 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S +new file mode 100644 +index 0000000..4578a8d +--- /dev/null ++++ b/kernel/loongarch64/swap.S +@@ -0,0 +1,330 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r7 ++#define INCX $r8 ++#define Y $r9 ++#define INCY $r10 ++ ++#define I $r17 ++#define TEMP $r18 ++#define XX $r5 ++#define YY $r6 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f23 ++#define a4 $f9 ++#define a5 $f10 ++#define a6 $f11 ++#define a7 $f12 ++#define a8 $f13 ++#define b1 $f14 ++#define b2 $f15 ++#define b3 $f16 ++#define b4 $f17 ++#define b5 $f0 ++#define b6 $f1 ++#define b7 $f2 ++#define b8 $f3 ++ ++ PROLOGUE ++ ++ li.d TEMP, SIZE ++ slli.d INCX, INCX, BASE_SHIFT ++ bge $r0, N, .L999 ++ slli.d INCY, INCY, BASE_SHIFT ++ bne INCX, TEMP, .L20 ++ srai.d I, N, 3 ++ bne INCY, TEMP, .L20 ++ addi.d I, I, -1 ++ blt I, $r0, .L15 ++ LD a1, X, 0 * SIZE ++ LD b1, Y, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD b2, Y, 1 * SIZE ++ LD a3, X, 2 * SIZE ++ LD b3, Y, 2 * SIZE ++ LD a4, X, 3 * SIZE ++ LD b4, Y, 3 * SIZE ++ LD a5, X, 4 * SIZE ++ LD b5, Y, 4 * SIZE ++ LD a6, X, 5 * SIZE ++ LD b6, Y, 5 * SIZE ++ LD a7, X, 6 * SIZE ++ LD b7, Y, 6 * SIZE ++ LD a8, X, 7 * SIZE ++ LD b8, Y, 7 * SIZE ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ ST a1, Y, 0 * SIZE ++ LD a1, X, 8 * SIZE ++ ST b1, X, 0 * SIZE ++ LD b1, Y, 8 * SIZE ++ ST a2, Y, 1 * SIZE ++ LD a2, X, 9 * SIZE ++ ST b2, X, 1 * SIZE ++ LD b2, Y, 9 * SIZE ++ ST a3, Y, 2 * SIZE ++ LD a3, X, 10 * SIZE ++ ST b3, X, 2 * SIZE ++ LD b3, Y, 10 * SIZE ++ ST a4, Y, 3 * SIZE ++ LD a4, X, 11 * SIZE ++ ST b4, X, 3 * SIZE ++ LD b4, Y, 11 * SIZE ++ ST a5, Y, 4 * SIZE ++ LD a5, X, 12 * SIZE ++ ST b5, X, 4 * SIZE ++ LD b5, Y, 12 * SIZE ++ ST a6, Y, 5 * SIZE ++ LD a6, X, 13 * SIZE ++ ST b6, X, 5 * SIZE ++ LD b6, Y, 13 * SIZE ++ ST a7, Y, 6 * SIZE ++ LD a7, X, 14 * SIZE ++ ST b7, X, 6 * SIZE ++ LD b7, Y, 14 * SIZE ++ ST a8, Y, 7 * SIZE ++ LD a8, X, 15 * SIZE ++ ST b8, X, 7 * SIZE ++ LD b8, Y, 15 * SIZE ++ addi.d I, I, -1 ++ addi.d X, X, 8 * SIZE ++ addi.d Y, Y, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ ST a1, Y, 0 * SIZE ++ ST b1, X, 0 * SIZE ++ ST a2, Y, 1 * SIZE ++ ST b2, X, 1 * SIZE ++ ST a3, Y, 2 * SIZE ++ ST b3, X, 2 * SIZE ++ ST a4, Y, 3 * SIZE ++ ST b4, X, 3 * SIZE ++ ST a5, Y, 4 * SIZE ++ ST b5, X, 4 * SIZE ++ ST a6, Y, 5 * SIZE ++ ST b6, X, 5 * SIZE ++ ST a7, Y, 6 * SIZE ++ ST b7, X, 6 * SIZE ++ ST a8, Y, 7 * SIZE ++ ST b8, X, 7 * SIZE ++ addi.d X, X, 8 * SIZE ++ addi.d Y, Y, 8 * SIZE ++ .align 3 ++ ++.L15: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++.L16: ++ LD a1, X, 0 * SIZE ++ LD b1, Y, 0 * SIZE ++ addi.d X, X, SIZE ++ addi.d I, I, -1 ++ addi.d Y, Y, SIZE ++ ST b1, X, -1 * SIZE ++ ST a1, Y, -1 * SIZE ++ blt $r0, I, .L16 ++ b .L999 ++ .align 3 ++ ++.L20: ++ srai.d I, N, 3 ++ move XX, X ++ move YY, Y ++ addi.d I, I, -1 ++ blt I, $r0, .L25 ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b2, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b3, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b4, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b5, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b6, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b7, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ LD a8, X, 0 * SIZE ++ add.d X, X, INCX ++ LD b8, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ bge $r0, I, .L23 ++ .align 3 ++ ++.L22: ++ ST a1, YY, 0 * SIZE ++ add.d YY, YY, 
INCY ++ LD a1, X, 0 * SIZE ++ add.d X, X, INCX ++ ST b1, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD b1, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a2, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ LD a2, X, 0 * SIZE ++ add.d X, X, INCX ++ ST b2, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD b2, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a3, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ LD a3, X, 0 * SIZE ++ add.d X, X, INCX ++ ST b3, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD b3, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a4, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ LD a4, X, 0 * SIZE ++ add.d X, X, INCX ++ ST b4, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD b4, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a5, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ LD a5, X, 0 * SIZE ++ add.d X, X, INCX ++ ST b5, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD b5, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a6, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ LD a6, X, 0 * SIZE ++ add.d X, X, INCX ++ ST b6, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD b6, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a7, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ LD a7, X, 0 * SIZE ++ add.d X, X, INCX ++ ST b7, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD b7, Y, 0 * SIZE ++ add.d Y, Y, INCY ++ ST a8, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ LD a8, X, 0 * SIZE ++ add.d X, X, INCX ++ ST b8, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ LD b8, Y, 0 * SIZE ++ addi.d I, I, -1 ++ add.d Y, Y, INCY ++ blt $r0, I, .L22 ++ .align 3 ++ ++.L23: ++ ST a1, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ ST b1, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST a2, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ ST b2, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST a3, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ ST b3, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST a4, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ ST b4, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST a5, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ ST b5, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST a6, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ ST b6, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST a7, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ ST b7, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ ST a8, YY, 0 * SIZE ++ add.d YY, YY, INCY ++ ST b8, XX, 0 * SIZE ++ add.d XX, XX, INCX ++ .align 3 ++ ++.L25: ++ andi I, N, 7 ++ bge $r0, I, .L999 ++ .align 3 ++.L26: ++ LD a1, X, 0 * SIZE ++ LD b1, Y, 0 * SIZE ++ addi.d I, I, -1 ++ ST a1, Y, 0 * SIZE ++ ST b1, X, 0 * SIZE ++ add.d X, X, INCX ++ add.d Y, Y, INCY ++ blt $r0, I, .L26 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/trsm_kernel_LN.S b/kernel/loongarch64/trsm_kernel_LN.S +new file mode 100644 +index 0000000..a0bd29f +--- /dev/null ++++ b/kernel/loongarch64/trsm_kernel_LN.S +@@ -0,0 +1,2863 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define K $r6 ++#define A $r7 ++#define B $r8 ++#define C $r9 ++#define LDC $r10 ++#define OFFSET $r11 ++#define AO $r12 ++#define BO $r13 ++#define I $r17 ++#define J $r18 ++#define L $r29 ++#define CO1 $r14 ++#define CO2 $r15 ++#define CO3 $r23 ++#define CO4 $r24 ++#define CO5 $r25 ++#define CO6 $r26 ++#define CO7 $r27 ++#define CO8 $r28 ++#define KK $r30 ++#define TEMP $r20 ++#define AORIG $r16 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f27 ++#define a4 $f28 ++#define b1 $f23 ++#define b2 $f9 ++#define b3 $f10 ++#define b4 $f11 ++#define b5 $f12 ++#define b6 $f13 ++#define b7 $f14 ++#define b8 $f15 ++#define a5 b8 ++#define c11 $f16 ++#define c12 $f17 ++#define c21 $f3 ++#define c22 $f1 ++#define c31 $f2 ++#define c32 $f4 ++#define c41 $f5 ++#define c42 $f6 ++#define c51 $f7 ++#define c52 $f18 ++#define c61 $f19 ++#define c62 $f20 ++#define c71 $f21 ++#define c72 $f24 ++#define c81 $f25 ++#define c82 $f26 ++#define ALPHA $f0 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -144 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ SDARG $r28, $sp, 40 ++ fst.d $f24, $sp, 48 ++ fst.d $f25, $sp, 56 ++ fst.d $f26, $sp, 64 ++ fst.d $f27, $sp, 72 ++ fst.d $f28, $sp, 80 ++ SDARG $r29, $sp, 88 ++ SDARG $r30, $sp, 96 ++ SDARG $r20, $sp, 104 ++ SDARG $r16, $sp, 112 ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 112 ++ fst.d $f19, $sp, 120 ++ fst.d $f20, $sp, 128 ++ fst.d $f21, $sp, 136 ++#endif ++ slli.d LDC, LDC, BASE_SHIFT ++#ifdef LN ++ mul.w TEMP, M, K ++ slli.d TEMP, TEMP, BASE_SHIFT ++ add.d A, A, TEMP ++ slli.d TEMP, M, BASE_SHIFT ++ add.d C, C, TEMP ++#endif ++#ifdef RN ++ neg KK, OFFSET ++#endif ++#ifdef RT ++ mul.w TEMP, N, K ++ slli.d TEMP, TEMP, BASE_SHIFT ++ add.d B, B, TEMP ++ mul.w TEMP, N, LDC ++ add.d C, C, TEMP ++ sub.d KK, N, OFFSET ++#endif ++ srai.d J, N, 3 ++nop ++ bge $r0, J, .L30 ++.L10: ++#ifdef RT ++ slli.d TEMP, K, 3 + BASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 3 ++ sub.d C, C, TEMP ++#endif ++ move CO1, C ++MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ addi.d J, J, -1 ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ add.d CO5, CO4, LDC ++ MOV c31, c11 ++ add.d CO6, CO5, LDC ++ MOV c41, c11 ++ add.d CO7, CO6, LDC ++ MOV c51, c11 ++ add.d CO8, CO7, LDC ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO8, LDC ++#endif ++ andi I, M, 1 ++ MOV c61, c11 ++MOV c71, c11 ++ bge $r0, I, .L20 ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ LD 
a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, KK, 2 ++ MOV c81, c11 ++move BO, B ++ bge $r0, L, .L25 ++#else ++#ifdef LN ++ slli.d TEMP, K, 0 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ srai.d L, TEMP, 2 ++ MOV c81, c11 ++ bge $r0, L, .L25 ++#endif ++ .align 3 ++.L22: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 20 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 9 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 10 * SIZE ++ MADD c81, b4, a1, c81 ++ LD b4, BO, 11 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a2, c51 ++ LD b7, BO, 28 * SIZE ++ MADD c61, b2, a2, c61 ++ LD b2, BO, 17 * SIZE ++ MADD c71, b3, a2, c71 ++ LD b3, BO, 18 * SIZE ++ MADD c81, b4, a2, c81 ++ LD b4, BO, 19 * SIZE ++ LD a2, AO, 5 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 32 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 21 * SIZE ++ MADD c31, b3, a3, c31 ++ LD b3, BO, 22 * SIZE ++ MADD c41, b4, a3, c41 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ LD b5, BO, 36 * SIZE ++ MADD c61, b2, a3, c61 ++ LD b2, BO, 25 * SIZE ++ MADD c71, b3, a3, c71 ++ LD b3, BO, 26 * SIZE ++ MADD c81, b4, a3, c81 ++ LD b4, BO, 27 * SIZE ++ LD a3, AO, 2 * SIZE ++ addi.d BO, BO, 32 * SIZE ++ MADD c11, b6, a4, c11 ++ LD b6, BO, 8 * SIZE ++ MADD c21, b2, a4, c21 ++ LD b2, BO, -3 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, -2 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, -1 * SIZE ++ MADD c51, b7, a4, c51 ++ LD b7, BO, 12 * SIZE ++ MADD c61, b2, a4, c61 ++ LD b2, BO, 1 * SIZE ++ MADD c71, b3, a4, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a4, c81 ++ LD b4, BO, 3 * SIZE ++ LD a4, AO, 3 * SIZE ++ blt $r0, L, .L22 ++ .align 3 ++ ++.L25: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L28 ++ .align 3 ++.L26: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ MOV a2, a2 ++ addi.d AO, AO, 1 * SIZE ++ addi.d BO, BO, 8 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 4 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 1 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L26 ++.L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -8 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ 
LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++ SUB c51, b5, c51 ++ SUB c61, b6, c61 ++ SUB c71, b7, c71 ++ SUB c81, b8, c81 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ LD b5, AO, 4 * SIZE ++ LD b6, AO, 5 * SIZE ++ LD b7, AO, 6 * SIZE ++ LD b8, AO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++ SUB c51, b5, c51 ++ SUB c61, b6, c61 ++ SUB c71, b7, c71 ++ SUB c81, b8, c81 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++ MUL c51, b1, c51 ++ MUL c61, b1, c61 ++ MUL c71, b1, c71 ++ MUL c81, b1, c81 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c31, c11, b3, c31 ++ NMSUB c41, c11, b4, c41 ++ NMSUB c51, c11, b5, c51 ++ NMSUB c61, c11, b6, c61 ++ NMSUB c71, c11, b7, c71 ++ NMSUB c81, c11, b8, c81 ++ LD b2, BO, 9 * SIZE ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ LD b5, BO, 12 * SIZE ++ LD b6, BO, 13 * SIZE ++ LD b7, BO, 14 * SIZE ++ LD b8, BO, 15 * SIZE ++ MUL c21, b2, c21 ++ NMSUB c31, c21, b3, c31 ++ NMSUB c41, c21, b4, c41 ++ NMSUB c51, c21, b5, c51 ++ NMSUB c61, c21, b6, c61 ++ NMSUB c71, c21, b7, c71 ++ NMSUB c81, c21, b8, c81 ++ LD b3, BO, 18 * SIZE ++ LD b4, BO, 19 * SIZE ++ LD b5, BO, 20 * SIZE ++ LD b6, BO, 21 * SIZE ++ LD b7, BO, 22 * SIZE ++ LD b8, BO, 23 * SIZE ++ MUL c31, b3, c31 ++ NMSUB c41, c31, b4, c41 ++ NMSUB c51, c31, b5, c51 ++ NMSUB c61, c31, b6, c61 ++ NMSUB c71, c31, b7, c71 ++ NMSUB c81, c31, b8, c81 ++ LD b4, BO, 27 * SIZE ++ LD b5, BO, 28 * SIZE ++ LD b6, BO, 29 * SIZE ++ LD b7, BO, 30 * SIZE ++ LD b8, BO, 31 * SIZE ++ MUL c41, b4, c41 ++ NMSUB c51, c41, b5, c51 ++ NMSUB c61, c41, b6, c61 ++ NMSUB c71, c41, b7, c71 ++ NMSUB c81, c41, b8, c81 ++ LD b5, BO, 36 * SIZE ++ LD b6, BO, 37 * SIZE ++ LD b7, BO, 38 * SIZE ++ LD b8, BO, 39 * SIZE ++ MUL c51, b5, c51 ++ NMSUB c61, c51, b6, c61 ++ NMSUB c71, c51, b7, c71 ++ NMSUB c81, c51, b8, c81 ++ LD b6, BO, 45 * SIZE ++ LD b7, BO, 46 * SIZE ++ LD b8, BO, 47 * SIZE ++ MUL c61, b6, c61 ++ NMSUB c71, c61, b7, c71 ++ NMSUB c81, c61, b8, c81 ++ LD b7, BO, 54 * SIZE ++ LD b8, BO, 55 * SIZE ++ MUL c71, b7, c71 ++ NMSUB c81, c71, b8, c81 ++ LD b8, BO, 63 * SIZE ++ MUL c81, b8, c81 ++#endif ++#ifdef RT ++ LD b1, BO, 63 * SIZE ++ LD b2, BO, 62 * SIZE ++ LD b3, BO, 61 * SIZE ++ LD b4, BO, 60 * SIZE ++ LD b5, BO, 59 * SIZE ++ LD b6, BO, 58 * SIZE ++ LD b7, BO, 57 * SIZE ++ LD b8, BO, 56 * SIZE ++ MUL c81, b1, c81 ++ NMSUB c71, c81, b2, c71 ++ NMSUB c61, c81, b3, c61 ++ NMSUB c51, c81, b4, c51 ++ NMSUB c41, c81, b5, c41 ++ NMSUB c31, c81, b6, c31 ++ NMSUB c21, c81, b7, c21 ++ NMSUB c11, c81, b8, c11 ++ LD b2, BO, 54 * SIZE ++ LD b3, BO, 53 * SIZE ++ LD b4, BO, 52 * SIZE ++ LD b5, BO, 51 * SIZE ++ LD b6, BO, 50 * SIZE ++ LD b7, BO, 49 * SIZE ++ LD b8, BO, 48 * SIZE ++ MUL c71, b2, c71 ++ NMSUB c61, c71, b3, c61 ++ NMSUB c51, c71, b4, c51 ++ NMSUB c41, c71, b5, c41 ++ NMSUB c31, c71, b6, c31 ++ NMSUB c21, c71, b7, c21 ++ NMSUB c11, c71, b8, c11 ++ LD b3, BO, 45 * SIZE ++ LD b4, BO, 44 * SIZE ++ LD b5, BO, 43 * SIZE ++ LD b6, BO, 42 * SIZE 
++ LD b7, BO, 41 * SIZE ++ LD b8, BO, 40 * SIZE ++ MUL c61, b3, c61 ++ NMSUB c51, c61, b4, c51 ++ NMSUB c41, c61, b5, c41 ++ NMSUB c31, c61, b6, c31 ++ NMSUB c21, c61, b7, c21 ++ NMSUB c11, c61, b8, c11 ++ LD b4, BO, 36 * SIZE ++ LD b5, BO, 35 * SIZE ++ LD b6, BO, 34 * SIZE ++ LD b7, BO, 33 * SIZE ++ LD b8, BO, 32 * SIZE ++ MUL c51, b4, c51 ++ NMSUB c41, c51, b5, c41 ++ NMSUB c31, c51, b6, c31 ++ NMSUB c21, c51, b7, c21 ++ NMSUB c11, c51, b8, c11 ++ LD b5, BO, 27 * SIZE ++ LD b6, BO, 26 * SIZE ++ LD b7, BO, 25 * SIZE ++ LD b8, BO, 24 * SIZE ++ MUL c41, b5, c41 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c21, c41, b7, c21 ++ NMSUB c11, c41, b8, c11 ++ LD b6, BO, 18 * SIZE ++ LD b7, BO, 17 * SIZE ++ LD b8, BO, 16 * SIZE ++ MUL c31, b6, c31 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c11, c31, b8, c11 ++ LD b7, BO, 9 * SIZE ++ LD b8, BO, 8 * SIZE ++ MUL c21, b7, c21 ++ NMSUB c11, c21, b8, c11 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++ addi.d CO2, CO2, -1 * SIZE ++ addi.d CO3, CO3, -1 * SIZE ++ addi.d CO4, CO4, -1 * SIZE ++ addi.d CO5, CO5, -1 * SIZE ++ addi.d CO6, CO6, -1 * SIZE ++ addi.d CO7, CO7, -1 * SIZE ++ addi.d CO8, CO8, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++ ST c51, BO, 4 * SIZE ++ ST c61, BO, 5 * SIZE ++ ST c71, BO, 6 * SIZE ++ ST c81, BO, 7 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c21, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c41, AO, 3 * SIZE ++ ST c51, AO, 4 * SIZE ++ ST c61, AO, 5 * SIZE ++ ST c71, AO, 6 * SIZE ++ ST c81, AO, 7 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c41, CO4, 0 * SIZE ++ ST c51, CO5, 0 * SIZE ++ ST c61, CO6, 0 * SIZE ++ ST c71, CO7, 0 * SIZE ++ ST c81, CO8, 0 * SIZE ++MTC c11, $r0 ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++ addi.d CO2, CO2, 1 * SIZE ++ addi.d CO3, CO3, 1 * SIZE ++ addi.d CO4, CO4, 1 * SIZE ++ addi.d CO5, CO5, 1 * SIZE ++ addi.d CO6, CO6, 1 * SIZE ++ addi.d CO7, CO7, 1 * SIZE ++ addi.d CO8, CO8, 1 * SIZE ++#endif ++ MOV c21, c11 ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++ MOV c31, c11 ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++ MOV c41, c11 ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L20: ++ srai.d I, M, 1 ++ MOV c51, c11 ++MOV c61, c11 ++ bge $r0, I, .L29 ++.L11: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, B, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, KK, 2 ++ MOV c32, c11 ++ LD b3, B, 2 * SIZE ++ MOV c42, c11 ++ LD b4, B, 3 * SIZE ++ MOV c52, c11 ++ LD b5, B, 4 * SIZE ++ MOV c62, c11 ++ LD b6, B, 8 * SIZE ++ MOV c72, c11 ++ LD b7, B, 12 * SIZE ++ MOV c82, c11 ++move BO, B ++ bge $r0, L, .L15 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ MOV c32, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c42, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c52, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c62, c11 ++ LD b6, 
BO, 8 * SIZE ++ MOV c72, c11 ++ LD b7, BO, 12 * SIZE ++ MOV c82, c11 ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L15 ++#endif ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ bge $r0, L, .L13 ++ .align 3 ++.L12: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ MADD c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ MADD c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD c71, b3, a3, c71 ++ MADD c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ addi.d L, L, -1 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ blt $r0, L, .L12 ++ .align 3 ++ ++.L13: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ MADD c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD 
c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ MADD c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD c71, b3, a3, c71 ++ MADD c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ .align 3 ++ ++.L15: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L18 ++ .align 3 ++.L16: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ addi.d L, L, -1 ++ MADD c61, b2, a1, c61 ++ addi.d AO, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ addi.d BO, BO, 8 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 4 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L16 ++.L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -8 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ LD b5, BO, 4 * SIZE ++ SUB c21, b2, c21 ++ LD b6, BO, 5 * SIZE ++ SUB c31, b3, c31 ++ LD b7, BO, 6 * SIZE ++ SUB c41, b4, c41 ++ LD b8, BO, 7 * SIZE ++ SUB c51, b5, c51 ++ LD b1, BO, 8 * SIZE ++ SUB c61, b6, c61 ++ LD b2, BO, 9 * SIZE ++ SUB c71, b7, c71 ++ LD b3, BO, 10 * SIZE ++ SUB c81, b8, c81 ++ LD b4, BO, 11 * SIZE ++ SUB c12, b1, c12 ++ LD b5, BO, 12 * SIZE ++ SUB c22, b2, c22 ++ LD b6, BO, 13 * 
SIZE ++ SUB c32, b3, c32 ++ LD b7, BO, 14 * SIZE ++ SUB c42, b4, c42 ++ LD b8, BO, 15 * SIZE ++ SUB c52, b5, c52 ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++#else ++ LD b1, AO, 0 * SIZE ++#endif ++ SUB c62, b6, c62 ++ SUB c72, b7, c72 ++ SUB c82, b8, c82 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ LD b5, AO, 4 * SIZE ++ SUB c12, b2, c12 ++ LD b6, AO, 5 * SIZE ++ SUB c21, b3, c21 ++ LD b7, AO, 6 * SIZE ++ SUB c22, b4, c22 ++ LD b8, AO, 7 * SIZE ++ SUB c31, b5, c31 ++ LD b1, AO, 8 * SIZE ++ SUB c32, b6, c32 ++ LD b2, AO, 9 * SIZE ++ SUB c41, b7, c41 ++ LD b3, AO, 10 * SIZE ++ SUB c42, b8, c42 ++ LD b4, AO, 11 * SIZE ++ LD b5, AO, 12 * SIZE ++ SUB c51, b1, c51 ++ LD b6, AO, 13 * SIZE ++ SUB c52, b2, c52 ++ LD b7, AO, 14 * SIZE ++ SUB c61, b3, c61 ++ LD b8, AO, 15 * SIZE ++ SUB c62, b4, c62 ++ SUB c71, b5, c71 ++ SUB c72, b6, c72 ++ SUB c81, b7, c81 ++ SUB c82, b8, c82 ++#endif ++#ifdef LN ++ MUL c12, b1, c12 ++ LD b2, AO, 2 * SIZE ++ MUL c22, b1, c22 ++ MUL c32, b1, c32 ++ MUL c42, b1, c42 ++ MUL c52, b1, c52 ++ MUL c62, b1, c62 ++ MUL c72, b1, c72 ++ MUL c82, b1, c82 ++ NMSUB c11, c12, b2, c11 ++ LD b3, AO, 0 * SIZE ++ NMSUB c21, c22, b2, c21 ++ NMSUB c31, c32, b2, c31 ++ NMSUB c41, c42, b2, c41 ++ NMSUB c51, c52, b2, c51 ++ NMSUB c61, c62, b2, c61 ++ NMSUB c71, c72, b2, c71 ++ NMSUB c81, c82, b2, c81 ++ MUL c11, b3, c11 ++ addi.d CO1, CO1, -2 * SIZE ++ MUL c21, b3, c21 ++ addi.d CO2, CO2, -2 * SIZE ++ MUL c31, b3, c31 ++ addi.d CO3, CO3, -2 * SIZE ++ MUL c41, b3, c41 ++ addi.d CO4, CO4, -2 * SIZE ++ MUL c51, b3, c51 ++ addi.d CO5, CO5, -2 * SIZE ++ MUL c61, b3, c61 ++ addi.d CO6, CO6, -2 * SIZE ++ MUL c71, b3, c71 ++ addi.d CO7, CO7, -2 * SIZE ++ MUL c81, b3, c81 ++ addi.d CO8, CO8, -2 * SIZE ++#endif ++#ifdef LT ++ MUL c11, b1, c11 ++ LD b2, AO, 1 * SIZE ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++ MUL c51, b1, c51 ++ MUL c61, b1, c61 ++ MUL c71, b1, c71 ++ MUL c81, b1, c81 ++ NMSUB c12, c11, b2, c12 ++ LD b3, AO, 3 * SIZE ++ NMSUB c22, c21, b2, c22 ++ NMSUB c32, c31, b2, c32 ++ NMSUB c42, c41, b2, c42 ++ NMSUB c52, c51, b2, c52 ++ NMSUB c62, c61, b2, c62 ++ NMSUB c72, c71, b2, c72 ++ NMSUB c82, c81, b2, c82 ++ MUL c12, b3, c12 ++ MUL c22, b3, c22 ++ MUL c32, b3, c32 ++ MUL c42, b3, c42 ++ MUL c52, b3, c52 ++ MUL c62, b3, c62 ++ MUL c72, b3, c72 ++ MUL c82, b3, c82 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++ LD b5, BO, 4 * SIZE ++ NMSUB c21, c11, b2, c21 ++ NMSUB c22, c12, b2, c22 ++ LD b6, BO, 5 * SIZE ++ NMSUB c31, c11, b3, c31 ++ NMSUB c32, c12, b3, c32 ++ LD b7, BO, 6 * SIZE ++ NMSUB c41, c11, b4, c41 ++ NMSUB c42, c12, b4, c42 ++ LD b8, BO, 7 * SIZE ++ NMSUB c51, c11, b5, c51 ++ NMSUB c52, c12, b5, c52 ++ LD b2, BO, 9 * SIZE ++ NMSUB c61, c11, b6, c61 ++ NMSUB c62, c12, b6, c62 ++ LD b3, BO, 10 * SIZE ++ NMSUB c71, c11, b7, c71 ++ NMSUB c72, c12, b7, c72 ++ LD b4, BO, 11 * SIZE ++ NMSUB c81, c11, b8, c81 ++ NMSUB c82, c12, b8, c82 ++ LD b5, BO, 12 * SIZE ++ MUL c21, b2, c21 ++ MUL c22, b2, c22 ++ LD b6, BO, 13 * SIZE ++ NMSUB c31, c21, b3, c31 ++ NMSUB c32, c22, b3, c32 ++ LD b7, BO, 14 * SIZE ++ NMSUB c41, c21, b4, c41 ++ NMSUB c42, c22, b4, c42 ++ LD b8, BO, 15 * SIZE ++ NMSUB c51, c21, b5, c51 ++ NMSUB c52, c22, b5, c52 ++ LD b3, BO, 18 * SIZE ++ NMSUB c61, c21, b6, c61 ++ NMSUB c62, c22, b6, c62 ++ LD b4, BO, 19 * SIZE ++ NMSUB c71, c21, b7, c71 ++ NMSUB c72, c22, b7, c72 ++ LD b5, 
BO, 20 * SIZE ++ NMSUB c81, c21, b8, c81 ++ NMSUB c82, c22, b8, c82 ++ LD b6, BO, 21 * SIZE ++ MUL c31, b3, c31 ++ MUL c32, b3, c32 ++ LD b7, BO, 22 * SIZE ++ NMSUB c41, c31, b4, c41 ++ NMSUB c42, c32, b4, c42 ++ LD b8, BO, 23 * SIZE ++ NMSUB c51, c31, b5, c51 ++ NMSUB c52, c32, b5, c52 ++ LD b4, BO, 27 * SIZE ++ NMSUB c61, c31, b6, c61 ++ NMSUB c62, c32, b6, c62 ++ LD b5, BO, 28 * SIZE ++ NMSUB c71, c31, b7, c71 ++ NMSUB c72, c32, b7, c72 ++ LD b6, BO, 29 * SIZE ++ NMSUB c81, c31, b8, c81 ++ NMSUB c82, c32, b8, c82 ++ LD b7, BO, 30 * SIZE ++ MUL c41, b4, c41 ++ MUL c42, b4, c42 ++ LD b8, BO, 31 * SIZE ++ NMSUB c51, c41, b5, c51 ++ NMSUB c52, c42, b5, c52 ++ LD b5, BO, 36 * SIZE ++ NMSUB c61, c41, b6, c61 ++ NMSUB c62, c42, b6, c62 ++ LD b6, BO, 37 * SIZE ++ NMSUB c71, c41, b7, c71 ++ NMSUB c72, c42, b7, c72 ++ LD b7, BO, 38 * SIZE ++ NMSUB c81, c41, b8, c81 ++ NMSUB c82, c42, b8, c82 ++ LD b8, BO, 39 * SIZE ++ MUL c51, b5, c51 ++ MUL c52, b5, c52 ++ NMSUB c61, c51, b6, c61 ++ NMSUB c62, c52, b6, c62 ++ LD b6, BO, 45 * SIZE ++ NMSUB c71, c51, b7, c71 ++ NMSUB c72, c52, b7, c72 ++ LD b7, BO, 46 * SIZE ++ NMSUB c81, c51, b8, c81 ++ NMSUB c82, c52, b8, c82 ++ LD b8, BO, 47 * SIZE ++ MUL c61, b6, c61 ++ MUL c62, b6, c62 ++ NMSUB c71, c61, b7, c71 ++ NMSUB c72, c62, b7, c72 ++ LD b7, BO, 54 * SIZE ++ NMSUB c81, c61, b8, c81 ++ NMSUB c82, c62, b8, c82 ++ LD b8, BO, 55 * SIZE ++ MUL c71, b7, c71 ++ MUL c72, b7, c72 ++ NMSUB c81, c71, b8, c81 ++ NMSUB c82, c72, b8, c82 ++ LD b8, BO, 63 * SIZE ++ MUL c81, b8, c81 ++ MUL c82, b8, c82 ++#endif ++#ifdef RT ++ LD b1, BO, 63 * SIZE ++ LD b2, BO, 62 * SIZE ++ LD b3, BO, 61 * SIZE ++ LD b4, BO, 60 * SIZE ++ MUL c81, b1, c81 ++ MUL c82, b1, c82 ++ LD b5, BO, 59 * SIZE ++ NMSUB c71, c81, b2, c71 ++ NMSUB c72, c82, b2, c72 ++ LD b6, BO, 58 * SIZE ++ NMSUB c61, c81, b3, c61 ++ NMSUB c62, c82, b3, c62 ++ LD b7, BO, 57 * SIZE ++ NMSUB c51, c81, b4, c51 ++ NMSUB c52, c82, b4, c52 ++ LD b8, BO, 56 * SIZE ++ NMSUB c41, c81, b5, c41 ++ NMSUB c42, c82, b5, c42 ++ LD b2, BO, 54 * SIZE ++ NMSUB c31, c81, b6, c31 ++ NMSUB c32, c82, b6, c32 ++ LD b3, BO, 53 * SIZE ++ NMSUB c21, c81, b7, c21 ++ NMSUB c22, c82, b7, c22 ++ LD b4, BO, 52 * SIZE ++ NMSUB c11, c81, b8, c11 ++ NMSUB c12, c82, b8, c12 ++ LD b5, BO, 51 * SIZE ++ MUL c71, b2, c71 ++ MUL c72, b2, c72 ++ LD b6, BO, 50 * SIZE ++ NMSUB c61, c71, b3, c61 ++ NMSUB c62, c72, b3, c62 ++ LD b7, BO, 49 * SIZE ++ NMSUB c51, c71, b4, c51 ++ NMSUB c52, c72, b4, c52 ++ LD b8, BO, 48 * SIZE ++ NMSUB c41, c71, b5, c41 ++ NMSUB c42, c72, b5, c42 ++ LD b3, BO, 45 * SIZE ++ NMSUB c31, c71, b6, c31 ++ NMSUB c32, c72, b6, c32 ++ LD b4, BO, 44 * SIZE ++ NMSUB c21, c71, b7, c21 ++ NMSUB c22, c72, b7, c22 ++ LD b5, BO, 43 * SIZE ++ NMSUB c11, c71, b8, c11 ++ NMSUB c12, c72, b8, c12 ++ LD b6, BO, 42 * SIZE ++ MUL c61, b3, c61 ++ MUL c62, b3, c62 ++ LD b7, BO, 41 * SIZE ++ NMSUB c51, c61, b4, c51 ++ NMSUB c52, c62, b4, c52 ++ LD b8, BO, 40 * SIZE ++ NMSUB c41, c61, b5, c41 ++ NMSUB c42, c62, b5, c42 ++ LD b4, BO, 36 * SIZE ++ NMSUB c31, c61, b6, c31 ++ NMSUB c32, c62, b6, c32 ++ LD b5, BO, 35 * SIZE ++ NMSUB c21, c61, b7, c21 ++ NMSUB c22, c62, b7, c22 ++ LD b6, BO, 34 * SIZE ++ NMSUB c11, c61, b8, c11 ++ NMSUB c12, c62, b8, c12 ++ LD b7, BO, 33 * SIZE ++ MUL c51, b4, c51 ++ MUL c52, b4, c52 ++ LD b8, BO, 32 * SIZE ++ NMSUB c41, c51, b5, c41 ++ NMSUB c42, c52, b5, c42 ++ LD b5, BO, 27 * SIZE ++ NMSUB c31, c51, b6, c31 ++ NMSUB c32, c52, b6, c32 ++ LD b6, BO, 26 * SIZE ++ NMSUB c21, c51, b7, c21 ++ NMSUB c22, c52, b7, c22 ++ LD b7, BO, 25 
* SIZE ++ NMSUB c11, c51, b8, c11 ++ NMSUB c12, c52, b8, c12 ++ LD b8, BO, 24 * SIZE ++ MUL c41, b5, c41 ++ MUL c42, b5, c42 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c32, c42, b6, c32 ++ LD b6, BO, 18 * SIZE ++ NMSUB c21, c41, b7, c21 ++ NMSUB c22, c42, b7, c22 ++ LD b7, BO, 17 * SIZE ++ NMSUB c11, c41, b8, c11 ++ NMSUB c12, c42, b8, c12 ++ LD b8, BO, 16 * SIZE ++ MUL c31, b6, c31 ++ MUL c32, b6, c32 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c22, c32, b7, c22 ++ LD b7, BO, 9 * SIZE ++ NMSUB c11, c31, b8, c11 ++ NMSUB c12, c32, b8, c12 ++ LD b8, BO, 8 * SIZE ++ MUL c21, b7, c21 ++ MUL c22, b7, c22 ++ NMSUB c11, c21, b8, c11 ++ NMSUB c12, c22, b8, c12 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++ MUL c12, b8, c12 ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++ ST c51, BO, 4 * SIZE ++ ST c61, BO, 5 * SIZE ++ ST c71, BO, 6 * SIZE ++ ST c81, BO, 7 * SIZE ++ ST c12, BO, 8 * SIZE ++ ST c22, BO, 9 * SIZE ++ ST c32, BO, 10 * SIZE ++ ST c42, BO, 11 * SIZE ++ ST c52, BO, 12 * SIZE ++ ST c62, BO, 13 * SIZE ++ ST c72, BO, 14 * SIZE ++ ST c82, BO, 15 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c21, AO, 2 * SIZE ++ ST c22, AO, 3 * SIZE ++ ST c31, AO, 4 * SIZE ++ ST c32, AO, 5 * SIZE ++ ST c41, AO, 6 * SIZE ++ ST c42, AO, 7 * SIZE ++ ST c51, AO, 8 * SIZE ++ ST c52, AO, 9 * SIZE ++ ST c61, AO, 10 * SIZE ++ ST c62, AO, 11 * SIZE ++ ST c71, AO, 12 * SIZE ++ ST c72, AO, 13 * SIZE ++ ST c81, AO, 14 * SIZE ++ ST c82, AO, 15 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c22, CO2, 1 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c32, CO3, 1 * SIZE ++ ST c41, CO4, 0 * SIZE ++ ST c42, CO4, 1 * SIZE ++ ST c51, CO5, 0 * SIZE ++ ST c52, CO5, 1 * SIZE ++ ST c61, CO6, 0 * SIZE ++ ST c62, CO6, 1 * SIZE ++ ST c71, CO7, 0 * SIZE ++ ST c72, CO7, 1 * SIZE ++ ST c81, CO8, 0 * SIZE ++ ST c82, CO8, 1 * SIZE ++MTC a1, $r0 ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++ addi.d CO2, CO2, 2 * SIZE ++ addi.d CO3, CO3, 2 * SIZE ++ addi.d CO4, CO4, 2 * SIZE ++ addi.d CO5, CO5, 2 * SIZE ++ addi.d CO6, CO6, 2 * SIZE ++ addi.d CO7, CO7, 2 * SIZE ++ addi.d CO8, CO8, 2 * SIZE ++#endif ++ MOV c11, a1 ++ MOV c21, a1 ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++ MOV c31, a1 ++ MOV c41, a1 ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++ addi.d I, I, -1 ++ MOV c51, a1 ++MOV c61, a1 ++ blt $r0, I, .L11 ++ .align 3 ++ ++.L29: ++#ifdef LN ++ slli.d TEMP, K, 3 + BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 8 ++#endif ++#ifdef RT ++ addi.d KK, KK, -8 ++#endif ++ blt $r0, J, .L10 ++ .align 3 ++ ++.L30: ++ andi J, N, 4 ++move AO, A ++ bge $r0, J, .L50 ++#ifdef RT ++ slli.d TEMP, K, 2 + BASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 2 ++ sub.d C, C, TEMP ++#endif ++ move CO1, C ++MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ MOV c21, c11 ++ add.d CO4, CO3, LDC ++ MOV c31, c11 ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO4, LDC ++#endif ++ andi I, M, 1 ++MOV c41, c11 ++ bge $r0, I, .L40 ++#if defined(LT) || defined(RN) 
++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c81, c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, KK, 2 ++move BO, B ++ bge $r0, L, .L45 ++#else ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c81, c11 ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L45 ++#endif ++ .align 3 ++.L42: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b5, a2, c11 ++ LD b5, BO, 20 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 11 * SIZE ++ LD a2, AO, 2 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ LD a2, AO, -1 * SIZE ++ addi.d BO, BO, 16 * SIZE ++ MADD c11, b7, a2, c11 ++ LD b7, BO, 12 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 1 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 2 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 3 * SIZE ++ LD a2, AO, 1 * SIZE ++ blt $r0, L, .L42 ++ .align 3 ++ ++.L45: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L48 ++ .align 3 ++.L46: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 1 * SIZE ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++ MOV a2, a2 ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L46 ++.L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -4 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c31, c11, b3, c31 ++ NMSUB c41, c11, b4, c41 ++ LD b2, BO, 5 * SIZE ++ LD b3, BO, 6 * SIZE ++ LD b4, BO, 7 * SIZE ++ MUL c21, b2, c21 ++ NMSUB c31, c21, b3, c31 ++ NMSUB c41, c21, b4, c41 ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ MUL c31, b3, c31 ++ NMSUB c41, c31, b4, c41 ++ LD b4, BO, 15 * SIZE ++ MUL c41, b4, c41 ++#endif ++#ifdef RT ++ LD b5, BO, 15 * 
SIZE ++ LD b6, BO, 14 * SIZE ++ LD b7, BO, 13 * SIZE ++ LD b8, BO, 12 * SIZE ++ MUL c41, b5, c41 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c21, c41, b7, c21 ++ NMSUB c11, c41, b8, c11 ++ LD b6, BO, 10 * SIZE ++ LD b7, BO, 9 * SIZE ++ LD b8, BO, 8 * SIZE ++ MUL c31, b6, c31 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c11, c31, b8, c11 ++ LD b7, BO, 5 * SIZE ++ LD b8, BO, 4 * SIZE ++ MUL c21, b7, c21 ++ NMSUB c11, c21, b8, c11 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++ addi.d CO2, CO2, -1 * SIZE ++ addi.d CO3, CO3, -1 * SIZE ++ addi.d CO4, CO4, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c21, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c41, AO, 3 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c41, CO4, 0 * SIZE ++MTC c11, $r0 ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++ addi.d CO2, CO2, 1 * SIZE ++ addi.d CO3, CO3, 1 * SIZE ++ addi.d CO4, CO4, 1 * SIZE ++#endif ++ MOV c21, c11 ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++ MOV c31, c11 ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L40: ++ srai.d I, M, 1 ++ MOV c61, c11 ++MOV c41, c11 ++ bge $r0, I, .L49 ++.L31: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ LD a3, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ MOV c32, c11 ++ LD b4, B, 3 * SIZE ++ MOV c42, c11 ++ LD b5, B, 4 * SIZE ++ srai.d L, KK, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L35 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ LD a3, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c32, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c42, c11 ++ LD b5, BO, 4 * SIZE ++ srai.d L, TEMP, 2 ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L35 ++#endif ++ .align 3 ++.L32: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c11, b5, a1, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 8 * SIZE ++ MADD c12, b5, a2, c12 ++ LD b5, BO, 20 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 9 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 10 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 6 * SIZE ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c11, b7, a3, c11 ++ 
LD a2, AO, 7 * SIZE ++ MADD c21, b2, a3, c21 ++ addi.d AO, AO, 8 * SIZE ++ MADD c31, b3, a3, c31 ++ addi.d BO, BO, 16 * SIZE ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 4 * SIZE ++ MADD c12, b7, a2, c12 ++ LD b7, BO, 12 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 1 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 2 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L32 ++ .align 3 ++ ++.L35: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L38 ++ .align 3 ++.L36: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ addi.d AO, AO, 2 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 0 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 4 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L36 ++.L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -4 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++ SUB c12, b5, c12 ++ SUB c22, b6, c22 ++ SUB c32, b7, c32 ++ SUB c42, b8, c42 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ LD b5, AO, 4 * SIZE ++ LD b6, AO, 5 * SIZE ++ LD b7, AO, 6 * SIZE ++ LD b8, AO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c21, b3, c21 ++ SUB c22, b4, c22 ++ SUB c31, b5, c31 ++ SUB c32, b6, c32 ++ SUB c41, b7, c41 ++ SUB c42, b8, c42 ++#endif ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++ LD b2, AO, 2 * SIZE ++ LD b3, AO, 0 * SIZE ++ MUL c12, b1, c12 ++ MUL c22, b1, c22 ++ MUL c32, b1, c32 ++ MUL c42, b1, c42 ++ NMSUB c11, c12, b2, c11 ++ NMSUB c21, c22, b2, c21 ++ NMSUB c31, c32, b2, c31 ++ NMSUB c41, c42, b2, c41 ++ MUL c11, b3, c11 ++ MUL c21, b3, c21 ++ MUL c31, b3, c31 ++ MUL c41, b3, c41 ++#endif ++#ifdef LT ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++ NMSUB c12, c11, b2, c12 ++ NMSUB c22, c21, b2, c22 ++ NMSUB c32, c31, b2, c32 ++ NMSUB c42, c41, b2, c42 ++ MUL c12, b3, c12 ++ MUL c22, b3, c22 ++ MUL c32, b3, c32 ++ MUL c42, b3, c42 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c22, c12, b2, c22 ++ NMSUB c31, c11, b3, c31 ++ NMSUB c32, c12, b3, c32 ++ NMSUB c41, c11, b4, c41 ++ NMSUB c42, c12, b4, c42 ++ LD b2, BO, 5 * SIZE ++ LD b3, BO, 6 * SIZE ++ LD b4, BO, 7 * SIZE ++ MUL c21, b2, c21 ++ MUL c22, b2, c22 ++ NMSUB c31, c21, b3, c31 ++ NMSUB c32, c22, b3, c32 ++ NMSUB c41, c21, b4, c41 ++ NMSUB c42, c22, b4, c42 ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ MUL c31, b3, c31 ++ MUL c32, b3, c32 ++ NMSUB c41, c31, b4, c41 ++ NMSUB c42, c32, b4, c42 ++ LD b4, BO, 15 * SIZE ++ MUL c41, b4, c41 ++ MUL c42, b4, c42 ++#endif ++#ifdef RT ++ LD b5, BO, 15 * SIZE ++ LD b6, BO, 14 * SIZE ++ LD b7, BO, 13 * SIZE ++ LD b8, BO, 12 * SIZE ++ MUL c41, b5, c41 ++ MUL c42, b5, c42 ++ NMSUB 
c31, c41, b6, c31 ++ NMSUB c32, c42, b6, c32 ++ NMSUB c21, c41, b7, c21 ++ NMSUB c22, c42, b7, c22 ++ NMSUB c11, c41, b8, c11 ++ NMSUB c12, c42, b8, c12 ++ LD b6, BO, 10 * SIZE ++ LD b7, BO, 9 * SIZE ++ LD b8, BO, 8 * SIZE ++ MUL c31, b6, c31 ++ MUL c32, b6, c32 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c22, c32, b7, c22 ++ NMSUB c11, c31, b8, c11 ++ NMSUB c12, c32, b8, c12 ++ LD b7, BO, 5 * SIZE ++ LD b8, BO, 4 * SIZE ++ MUL c21, b7, c21 ++ MUL c22, b7, c22 ++ NMSUB c11, c21, b8, c11 ++ NMSUB c12, c22, b8, c12 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++ MUL c12, b8, c12 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -2 * SIZE ++ addi.d CO2, CO2, -2 * SIZE ++ addi.d CO3, CO3, -2 * SIZE ++ addi.d CO4, CO4, -2 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++ ST c12, BO, 4 * SIZE ++ ST c22, BO, 5 * SIZE ++ ST c32, BO, 6 * SIZE ++ ST c42, BO, 7 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c21, AO, 2 * SIZE ++ ST c22, AO, 3 * SIZE ++ ST c31, AO, 4 * SIZE ++ ST c32, AO, 5 * SIZE ++ ST c41, AO, 6 * SIZE ++ ST c42, AO, 7 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c22, CO2, 1 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c32, CO3, 1 * SIZE ++ ST c41, CO4, 0 * SIZE ++ ST c42, CO4, 1 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++ addi.d CO2, CO2, 2 * SIZE ++ addi.d CO3, CO3, 2 * SIZE ++ addi.d CO4, CO4, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++MTC a1, $r0 ++ MOV c11, a1 ++ MOV c21, a1 ++ MOV c31, a1 ++ addi.d I, I, -1 ++MOV c41, c11 ++ blt $r0, I, .L31 ++ .align 3 ++ ++.L49: ++#ifdef LN ++ slli.d TEMP, K, 2 + BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 4 ++#endif ++#ifdef RT ++ addi.d KK, KK, -4 ++#endif ++ .align 3 ++ ++.L50: ++ andi J, N, 2 ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++#else ++ move AO, A ++#endif ++ bge $r0, J, .L70 ++#ifdef RT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 1 ++ sub.d C, C, TEMP ++#endif ++ move AO, A ++ move CO1, C ++ add.d CO2, C, LDC ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO2, LDC ++#endif ++ andi I, M, 1 ++ bge $r0, I, .L60 ++#if defined(LT) || defined(RN) ++ srai.d L, KK, 2 ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ MOV c31, c11 ++ LD a4, AO, 3 * SIZE ++ MOV c41, c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L65 ++#else ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ srai.d L, TEMP, 2 ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ MOV c31, c11 ++ LD a4, AO, 3 * SIZE ++ MOV c41, c11 ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * 
SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L65 ++#endif ++ .align 3 ++.L62: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, 11 * SIZE ++ LD a3, AO, 6 * SIZE ++ LD a4, AO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L62 ++ .align 3 ++ ++.L65: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L68 ++ .align 3 ++.L66: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 3 * SIZE ++ LD a1, AO, 1 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L66 ++.L68: ++ ADD c11, c11, c31 ++ ADD c21, c21, c41 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -2 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++#endif ++#if defined(LN) || defined(LT) ++ LD b3, AO, 0 * SIZE ++ MUL c11, b3, c11 ++ MUL c21, b3, c21 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c21, c11, b2, c21 ++ MUL c21, b3, c21 ++#endif ++#ifdef RT ++ LD b1, BO, 3 * SIZE ++ LD b2, BO, 2 * SIZE ++ LD b3, BO, 0 * SIZE ++ MUL c21, b1, c21 ++ NMSUB c11, c21, b2, c11 ++ MUL c11, b3, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++ addi.d CO2, CO2, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c21, AO, 1 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++ addi.d CO2, CO2, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 0 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L60: ++ srai.d I, M, 1 ++ bge $r0, I, .L69 ++.L51: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, KK, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L55 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ LD b5, BO, 4 
* SIZE ++ srai.d L, TEMP, 2 ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L55 ++#endif ++ .align 3 ++.L52: ++ MADD c11, b1, a1, c11 ++ LD a3, AO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b4, BO, 3 * SIZE ++ MADD c12, b1, a2, c12 ++ LD a4, AO, 3 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b1, BO, 8 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a1, AO, 8 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 5 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 6 * SIZE ++ MADD c11, b5, a5, c11 ++ LD a3, AO, 6 * SIZE ++ MADD c21, b2, a5, c21 ++ LD b4, BO, 7 * SIZE ++ MADD c12, b5, a2, c12 ++ LD a4, AO, 7 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b5, BO, 12 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a5, AO, 12 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 9 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 10 * SIZE ++ addi.d AO, AO, 8 * SIZE ++ addi.d L, L, -1 ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L52 ++ .align 3 ++ ++.L55: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L58 ++ .align 3 ++.L56: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 3 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L56 ++.L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -2 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c12, b3, c12 ++ SUB c22, b4, c22 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c21, b3, c21 ++ SUB c22, b4, c22 ++#endif ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++ LD b2, AO, 2 * SIZE ++ LD b3, AO, 0 * SIZE ++ MUL c12, b1, c12 ++ MUL c22, b1, c22 ++ NMSUB c11, c12, b2, c11 ++ NMSUB c21, c22, b2, c21 ++ MUL c11, b3, c11 ++ MUL c21, b3, c21 ++#endif ++#ifdef LT ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ NMSUB c12, c11, b2, c12 ++ NMSUB c22, c21, b2, c22 ++ MUL c12, b3, c12 ++ MUL c22, b3, c22 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c22, c12, b2, c22 ++ MUL c21, b3, c21 ++ MUL c22, b3, c22 ++#endif ++#ifdef RT ++ LD b1, BO, 3 * SIZE ++ LD b2, BO, 2 * SIZE ++ LD b3, BO, 0 * SIZE ++ MUL c21, b1, c21 ++ MUL c22, b1, c22 ++ NMSUB c11, c21, b2, c11 ++ NMSUB c12, c22, b2, c12 ++ MUL c11, b3, c11 ++ MUL c12, b3, c12 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -2 * SIZE ++ addi.d CO2, CO2, -2 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c12, BO, 2 * SIZE ++ ST c22, BO, 3 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c21, AO, 2 * SIZE ++ ST c22, AO, 3 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c22, CO2, 1 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++ addi.d CO2, CO2, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) 
|| defined(RN) ++ sub.d TEMP, K, KK ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++MTC a1, $r0 ++ MOV c11, a1 ++ MOV c21, a1 ++ MOV c31, a1 ++ addi.d I, I, -1 ++MOV c41, c11 ++ blt $r0, I, .L51 ++ .align 3 ++ ++.L69: ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 2 ++#endif ++#ifdef RT ++ addi.d KK, KK, -2 ++#endif ++ .align 3 ++ ++.L70: ++ andi J, N, 1 ++ bge $r0, J, .L999 ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d B, B, TEMP ++ sub.d C, C, LDC ++#endif ++ move AO, A ++ move CO1, C ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO1, LDC ++#endif ++ andi I, M, 1 ++ bge $r0, I, .L80 ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, KK, 2 ++move BO, B ++ bge $r0, L, .L85 ++#else ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d TEMP, KK, BASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L85 ++#endif ++ .align 3 ++.L82: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 1 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c21, b1, a1, c21 ++ LD a1, AO, 2 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 3 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c21, b1, a1, c21 ++ addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L82 ++ .align 3 ++ ++.L85: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L88 ++ .align 3 ++.L86: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L86 ++.L88: ++ ADD c11, c11, c21 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -1 ++#endif ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ SUB c11, b1, c11 ++#else ++ LD b1, AO, 0 * SIZE ++ SUB c11, b1, c11 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ MUL c11, b1, c11 ++#endif ++#if defined(RN) || defined(RT) ++ LD b1, BO, 0 * SIZE ++ MUL c11, b1, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, BO, TEMP 
++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L80: ++ srai.d I, M, 1 ++ bge $r0, I, .L89 ++.L71: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, KK, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L75 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 0 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ LD b5, BO, 4 * SIZE ++ srai.d L, TEMP, 2 ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L75 ++#endif ++ .align 3 ++.L72: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 2 * SIZE ++ LD a2, AO, 3 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 6 * SIZE ++ LD a2, AO, 7 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 8 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L72 ++ .align 3 ++ ++.L75: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L78 ++ .align 3 ++.L76: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L76 ++.L78: ++ ADD c11, c11, c21 ++ ADD c12, c12, c22 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -1 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#endif ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++ LD b2, AO, 2 * SIZE ++ LD b3, AO, 0 * SIZE ++ MUL c12, b1, c12 ++ NMSUB c11, c12, b2, c11 ++ MUL c11, b3, c11 ++#endif ++#ifdef LT ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 3 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c12, c11, b2, c12 ++ MUL c12, b3, c12 ++#endif ++#if defined(RN) || defined(RT) ++ LD b1, BO, 0 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -2 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c12, BO, 1 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 
++#endif ++ addi.d I, I, -1 ++ blt $r0, I, .L71 ++ .align 3 ++ ++.L89: ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 1 ++#endif ++#ifdef RT ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 16 ++ LDARG $r26, $sp, 24 ++ LDARG $r27, $sp, 32 ++ LDARG $r28, $sp, 40 ++ fld.d $f24, $sp, 48 ++ fld.d $f25, $sp, 56 ++ fld.d $f26, $sp, 64 ++ fld.d $f27, $sp, 72 ++ fld.d $f28, $sp, 80 ++ LDARG $r29, $sp, 88 ++ LDARG $r30, $sp, 96 ++ LDARG $r20, $sp, 104 ++ LDARG $r16, $sp, 112 ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 112 ++ fld.d $f19, $sp, 120 ++ fld.d $f20, $sp, 128 ++ fld.d $f21, $sp, 136 ++#endif ++ addi.d $sp, $sp, 144 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/trsm_kernel_LT.S b/kernel/loongarch64/trsm_kernel_LT.S +new file mode 100644 +index 0000000..aa6822c +--- /dev/null ++++ b/kernel/loongarch64/trsm_kernel_LT.S +@@ -0,0 +1,2854 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define K $r6 ++#define A $r7 ++#define B $r8 ++#define C $r9 ++#define LDC $r10 ++#define OFFSET $r11 ++#define AO $r12 ++#define BO $r13 ++#define I $r17 ++#define J $r18 ++#define L $r29 ++#define CO1 $r14 ++#define CO2 $r15 ++#define CO3 $r23 ++#define CO4 $r24 ++#define CO5 $r25 ++#define CO6 $r26 ++#define CO7 $r27 ++#define CO8 $r28 ++#define KK $r30 ++#define TEMP $r20 ++#define AORIG $r16 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f27 ++#define a4 $f28 ++#define b1 $f23 ++#define b2 $f9 ++#define b3 $f10 ++#define b4 $f11 ++#define b5 $f12 ++#define b6 $f13 ++#define b7 $f14 ++#define b8 $f15 ++#define a5 b8 ++#define c11 $f16 ++#define c12 $f17 ++#define c21 $f3 ++#define c22 $f1 ++#define c31 $f2 ++#define c32 $f4 ++#define c41 $f5 ++#define c42 $f6 ++#define c51 $f7 ++#define c52 $f18 ++#define c61 $f19 ++#define c62 $f20 ++#define c71 $f21 ++#define c72 $f24 ++#define c81 $f25 ++#define c82 $f26 ++#define ALPHA $f0 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -144 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ SDARG $r28, $sp, 40 ++ fst.d $f24, $sp, 48 ++ fst.d $f25, $sp, 56 ++ fst.d $f26, $sp, 64 ++ fst.d $f27, $sp, 72 ++ fst.d $f28, $sp, 80 ++ SDARG $r29, $sp, 88 ++ SDARG $r30, $sp, 96 ++ SDARG $r20, $sp, 104 ++ SDARG $r16, $sp, 112 ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 112 ++ fst.d $f19, $sp, 120 ++ fst.d $f20, $sp, 128 ++ fst.d $f21, $sp, 136 ++#endif ++ slli.d LDC, LDC, BASE_SHIFT ++#ifdef LN ++ mul.w TEMP, M, K ++ slli.d TEMP, TEMP, BASE_SHIFT ++ add.d A, A, TEMP ++ slli.d TEMP, M, BASE_SHIFT ++ add.d C, C, TEMP ++#endif ++#ifdef RN ++ sub.d KK, $r0, OFFSET ++#endif ++#ifdef RT ++ mul.w TEMP, N, K ++ slli.d TEMP, TEMP, BASE_SHIFT ++ add.d B, B, TEMP ++ mul.w TEMP, N, LDC ++ add.d C, C, TEMP ++ sub.d KK, N, OFFSET ++#endif ++ srai.d J, N, 3 ++nop ++ bge $r0, J, .L30 ++.L10: ++#ifdef RT ++ slli.d TEMP, K, 3 + BASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 3 ++ sub.d C, C, TEMP ++#endif ++ move CO1, C ++MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ addi.d J, J, -1 ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ add.d CO5, CO4, LDC ++ MOV c31, c11 ++ add.d CO6, CO5, LDC ++ MOV c41, c11 ++ add.d CO7, CO6, LDC ++ MOV c51, c11 ++ add.d CO8, CO7, LDC ++ srai.d I, M, 1 ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO8, LDC ++#endif ++MOV c61, c11 ++ bge $r0, I, .L20 ++.L11: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, B, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, KK, 2 ++ MOV c32, c11 ++ LD b3, B, 2 * SIZE ++ MOV c42, c11 ++ LD b4, B, 3 * SIZE ++ MOV c52, c11 ++ LD b5, B, 4 * SIZE ++ MOV c62, c11 ++ LD b6, B, 8 * SIZE ++ MOV c72, c11 ++ LD b7, B, 12 * SIZE ++ MOV c82, c11 ++move BO, B ++ bge $r0, L, .L15 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, 
TEMP, 2 ++ MOV c32, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c42, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c52, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c62, c11 ++ LD b6, BO, 8 * SIZE ++ MOV c72, c11 ++ LD b7, BO, 12 * SIZE ++ MOV c82, c11 ++ bge $r0, L, .L15 ++#endif ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ bge $r0, L, .L13 ++ .align 3 ++.L12: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ MADD c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ MADD c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD c71, b3, a3, c71 ++ MADD c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ addi.d L, L, -1 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ blt $r0, L, .L12 ++ .align 3 ++ ++.L13: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ MADD c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 
++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ MADD c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD c71, b3, a3, c71 ++ MADD c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ .align 3 ++ ++.L15: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L18 ++ .align 3 ++.L16: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ addi.d L, L, -1 ++ MADD c61, b2, a1, c61 ++ addi.d AO, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ addi.d BO, BO, 8 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 4 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L16 ++.L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -8 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ LD b5, BO, 4 * SIZE ++ SUB c21, b2, c21 ++ LD b6, BO, 5 * SIZE ++ SUB c31, b3, c31 ++ LD b7, BO, 6 * SIZE ++ SUB c41, b4, c41 ++ LD b8, BO, 7 * SIZE ++ SUB c51, b5, c51 ++ LD b1, BO, 8 * SIZE ++ SUB c61, b6, c61 ++ LD b2, BO, 9 * SIZE ++ SUB c71, b7, c71 ++ LD b3, BO, 10 * SIZE 
++ SUB c81, b8, c81 ++ LD b4, BO, 11 * SIZE ++ SUB c12, b1, c12 ++ LD b5, BO, 12 * SIZE ++ SUB c22, b2, c22 ++ LD b6, BO, 13 * SIZE ++ SUB c32, b3, c32 ++ LD b7, BO, 14 * SIZE ++ SUB c42, b4, c42 ++ LD b8, BO, 15 * SIZE ++ SUB c52, b5, c52 ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++#else ++ LD b1, AO, 0 * SIZE ++#endif ++ SUB c62, b6, c62 ++ SUB c72, b7, c72 ++ SUB c82, b8, c82 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ LD b5, AO, 4 * SIZE ++ SUB c12, b2, c12 ++ LD b6, AO, 5 * SIZE ++ SUB c21, b3, c21 ++ LD b7, AO, 6 * SIZE ++ SUB c22, b4, c22 ++ LD b8, AO, 7 * SIZE ++ SUB c31, b5, c31 ++ LD b1, AO, 8 * SIZE ++ SUB c32, b6, c32 ++ LD b2, AO, 9 * SIZE ++ SUB c41, b7, c41 ++ LD b3, AO, 10 * SIZE ++ SUB c42, b8, c42 ++ LD b4, AO, 11 * SIZE ++ LD b5, AO, 12 * SIZE ++ SUB c51, b1, c51 ++ LD b6, AO, 13 * SIZE ++ SUB c52, b2, c52 ++ LD b7, AO, 14 * SIZE ++ SUB c61, b3, c61 ++ LD b8, AO, 15 * SIZE ++ SUB c62, b4, c62 ++ SUB c71, b5, c71 ++ SUB c72, b6, c72 ++ SUB c81, b7, c81 ++ SUB c82, b8, c82 ++#endif ++#ifdef LN ++ MUL c12, b1, c12 ++ LD b2, AO, 2 * SIZE ++ MUL c22, b1, c22 ++ MUL c32, b1, c32 ++ MUL c42, b1, c42 ++ MUL c52, b1, c52 ++ MUL c62, b1, c62 ++ MUL c72, b1, c72 ++ MUL c82, b1, c82 ++ NMSUB c11, c12, b2, c11 ++ LD b3, AO, 0 * SIZE ++ NMSUB c21, c22, b2, c21 ++ NMSUB c31, c32, b2, c31 ++ NMSUB c41, c42, b2, c41 ++ NMSUB c51, c52, b2, c51 ++ NMSUB c61, c62, b2, c61 ++ NMSUB c71, c72, b2, c71 ++ NMSUB c81, c82, b2, c81 ++ MUL c11, b3, c11 ++ addi.d CO1, CO1, -2 * SIZE ++ MUL c21, b3, c21 ++ addi.d CO2, CO2, -2 * SIZE ++ MUL c31, b3, c31 ++ addi.d CO3, CO3, -2 * SIZE ++ MUL c41, b3, c41 ++ addi.d CO4, CO4, -2 * SIZE ++ MUL c51, b3, c51 ++ addi.d CO5, CO5, -2 * SIZE ++ MUL c61, b3, c61 ++ addi.d CO6, CO6, -2 * SIZE ++ MUL c71, b3, c71 ++ addi.d CO7, CO7, -2 * SIZE ++ MUL c81, b3, c81 ++ addi.d CO8, CO8, -2 * SIZE ++#endif ++#ifdef LT ++ MUL c11, b1, c11 ++ LD b2, AO, 1 * SIZE ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++ MUL c51, b1, c51 ++ MUL c61, b1, c61 ++ MUL c71, b1, c71 ++ MUL c81, b1, c81 ++ NMSUB c12, c11, b2, c12 ++ LD b3, AO, 3 * SIZE ++ NMSUB c22, c21, b2, c22 ++ NMSUB c32, c31, b2, c32 ++ NMSUB c42, c41, b2, c42 ++ NMSUB c52, c51, b2, c52 ++ NMSUB c62, c61, b2, c62 ++ NMSUB c72, c71, b2, c72 ++ NMSUB c82, c81, b2, c82 ++ MUL c12, b3, c12 ++ MUL c22, b3, c22 ++ MUL c32, b3, c32 ++ MUL c42, b3, c42 ++ MUL c52, b3, c52 ++ MUL c62, b3, c62 ++ MUL c72, b3, c72 ++ MUL c82, b3, c82 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++ LD b5, BO, 4 * SIZE ++ NMSUB c21, c11, b2, c21 ++ NMSUB c22, c12, b2, c22 ++ LD b6, BO, 5 * SIZE ++ NMSUB c31, c11, b3, c31 ++ NMSUB c32, c12, b3, c32 ++ LD b7, BO, 6 * SIZE ++ NMSUB c41, c11, b4, c41 ++ NMSUB c42, c12, b4, c42 ++ LD b8, BO, 7 * SIZE ++ NMSUB c51, c11, b5, c51 ++ NMSUB c52, c12, b5, c52 ++ LD b2, BO, 9 * SIZE ++ NMSUB c61, c11, b6, c61 ++ NMSUB c62, c12, b6, c62 ++ LD b3, BO, 10 * SIZE ++ NMSUB c71, c11, b7, c71 ++ NMSUB c72, c12, b7, c72 ++ LD b4, BO, 11 * SIZE ++ NMSUB c81, c11, b8, c81 ++ NMSUB c82, c12, b8, c82 ++ LD b5, BO, 12 * SIZE ++ MUL c21, b2, c21 ++ MUL c22, b2, c22 ++ LD b6, BO, 13 * SIZE ++ NMSUB c31, c21, b3, c31 ++ NMSUB c32, c22, b3, c32 ++ LD b7, BO, 14 * SIZE ++ NMSUB c41, c21, b4, c41 ++ NMSUB c42, c22, b4, c42 ++ LD b8, BO, 15 * SIZE ++ NMSUB c51, c21, b5, c51 ++ NMSUB c52, c22, b5, c52 ++ LD b3, BO, 18 * SIZE ++ NMSUB c61, 
c21, b6, c61 ++ NMSUB c62, c22, b6, c62 ++ LD b4, BO, 19 * SIZE ++ NMSUB c71, c21, b7, c71 ++ NMSUB c72, c22, b7, c72 ++ LD b5, BO, 20 * SIZE ++ NMSUB c81, c21, b8, c81 ++ NMSUB c82, c22, b8, c82 ++ LD b6, BO, 21 * SIZE ++ MUL c31, b3, c31 ++ MUL c32, b3, c32 ++ LD b7, BO, 22 * SIZE ++ NMSUB c41, c31, b4, c41 ++ NMSUB c42, c32, b4, c42 ++ LD b8, BO, 23 * SIZE ++ NMSUB c51, c31, b5, c51 ++ NMSUB c52, c32, b5, c52 ++ LD b4, BO, 27 * SIZE ++ NMSUB c61, c31, b6, c61 ++ NMSUB c62, c32, b6, c62 ++ LD b5, BO, 28 * SIZE ++ NMSUB c71, c31, b7, c71 ++ NMSUB c72, c32, b7, c72 ++ LD b6, BO, 29 * SIZE ++ NMSUB c81, c31, b8, c81 ++ NMSUB c82, c32, b8, c82 ++ LD b7, BO, 30 * SIZE ++ MUL c41, b4, c41 ++ MUL c42, b4, c42 ++ LD b8, BO, 31 * SIZE ++ NMSUB c51, c41, b5, c51 ++ NMSUB c52, c42, b5, c52 ++ LD b5, BO, 36 * SIZE ++ NMSUB c61, c41, b6, c61 ++ NMSUB c62, c42, b6, c62 ++ LD b6, BO, 37 * SIZE ++ NMSUB c71, c41, b7, c71 ++ NMSUB c72, c42, b7, c72 ++ LD b7, BO, 38 * SIZE ++ NMSUB c81, c41, b8, c81 ++ NMSUB c82, c42, b8, c82 ++ LD b8, BO, 39 * SIZE ++ MUL c51, b5, c51 ++ MUL c52, b5, c52 ++ NMSUB c61, c51, b6, c61 ++ NMSUB c62, c52, b6, c62 ++ LD b6, BO, 45 * SIZE ++ NMSUB c71, c51, b7, c71 ++ NMSUB c72, c52, b7, c72 ++ LD b7, BO, 46 * SIZE ++ NMSUB c81, c51, b8, c81 ++ NMSUB c82, c52, b8, c82 ++ LD b8, BO, 47 * SIZE ++ MUL c61, b6, c61 ++ MUL c62, b6, c62 ++ NMSUB c71, c61, b7, c71 ++ NMSUB c72, c62, b7, c72 ++ LD b7, BO, 54 * SIZE ++ NMSUB c81, c61, b8, c81 ++ NMSUB c82, c62, b8, c82 ++ LD b8, BO, 55 * SIZE ++ MUL c71, b7, c71 ++ MUL c72, b7, c72 ++ NMSUB c81, c71, b8, c81 ++ NMSUB c82, c72, b8, c82 ++ LD b8, BO, 63 * SIZE ++ MUL c81, b8, c81 ++ MUL c82, b8, c82 ++#endif ++#ifdef RT ++ LD b1, BO, 63 * SIZE ++ LD b2, BO, 62 * SIZE ++ LD b3, BO, 61 * SIZE ++ LD b4, BO, 60 * SIZE ++ MUL c81, b1, c81 ++ MUL c82, b1, c82 ++ LD b5, BO, 59 * SIZE ++ NMSUB c71, c81, b2, c71 ++ NMSUB c72, c82, b2, c72 ++ LD b6, BO, 58 * SIZE ++ NMSUB c61, c81, b3, c61 ++ NMSUB c62, c82, b3, c62 ++ LD b7, BO, 57 * SIZE ++ NMSUB c51, c81, b4, c51 ++ NMSUB c52, c82, b4, c52 ++ LD b8, BO, 56 * SIZE ++ NMSUB c41, c81, b5, c41 ++ NMSUB c42, c82, b5, c42 ++ LD b2, BO, 54 * SIZE ++ NMSUB c31, c81, b6, c31 ++ NMSUB c32, c82, b6, c32 ++ LD b3, BO, 53 * SIZE ++ NMSUB c21, c81, b7, c21 ++ NMSUB c22, c82, b7, c22 ++ LD b4, BO, 52 * SIZE ++ NMSUB c11, c81, b8, c11 ++ NMSUB c12, c82, b8, c12 ++ LD b5, BO, 51 * SIZE ++ MUL c71, b2, c71 ++ MUL c72, b2, c72 ++ LD b6, BO, 50 * SIZE ++ NMSUB c61, c71, b3, c61 ++ NMSUB c62, c72, b3, c62 ++ LD b7, BO, 49 * SIZE ++ NMSUB c51, c71, b4, c51 ++ NMSUB c52, c72, b4, c52 ++ LD b8, BO, 48 * SIZE ++ NMSUB c41, c71, b5, c41 ++ NMSUB c42, c72, b5, c42 ++ LD b3, BO, 45 * SIZE ++ NMSUB c31, c71, b6, c31 ++ NMSUB c32, c72, b6, c32 ++ LD b4, BO, 44 * SIZE ++ NMSUB c21, c71, b7, c21 ++ NMSUB c22, c72, b7, c22 ++ LD b5, BO, 43 * SIZE ++ NMSUB c11, c71, b8, c11 ++ NMSUB c12, c72, b8, c12 ++ LD b6, BO, 42 * SIZE ++ MUL c61, b3, c61 ++ MUL c62, b3, c62 ++ LD b7, BO, 41 * SIZE ++ NMSUB c51, c61, b4, c51 ++ NMSUB c52, c62, b4, c52 ++ LD b8, BO, 40 * SIZE ++ NMSUB c41, c61, b5, c41 ++ NMSUB c42, c62, b5, c42 ++ LD b4, BO, 36 * SIZE ++ NMSUB c31, c61, b6, c31 ++ NMSUB c32, c62, b6, c32 ++ LD b5, BO, 35 * SIZE ++ NMSUB c21, c61, b7, c21 ++ NMSUB c22, c62, b7, c22 ++ LD b6, BO, 34 * SIZE ++ NMSUB c11, c61, b8, c11 ++ NMSUB c12, c62, b8, c12 ++ LD b7, BO, 33 * SIZE ++ MUL c51, b4, c51 ++ MUL c52, b4, c52 ++ LD b8, BO, 32 * SIZE ++ NMSUB c41, c51, b5, c41 ++ NMSUB c42, c52, b5, c42 ++ LD b5, BO, 27 * SIZE ++ NMSUB c31, c51, 
b6, c31 ++ NMSUB c32, c52, b6, c32 ++ LD b6, BO, 26 * SIZE ++ NMSUB c21, c51, b7, c21 ++ NMSUB c22, c52, b7, c22 ++ LD b7, BO, 25 * SIZE ++ NMSUB c11, c51, b8, c11 ++ NMSUB c12, c52, b8, c12 ++ LD b8, BO, 24 * SIZE ++ MUL c41, b5, c41 ++ MUL c42, b5, c42 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c32, c42, b6, c32 ++ LD b6, BO, 18 * SIZE ++ NMSUB c21, c41, b7, c21 ++ NMSUB c22, c42, b7, c22 ++ LD b7, BO, 17 * SIZE ++ NMSUB c11, c41, b8, c11 ++ NMSUB c12, c42, b8, c12 ++ LD b8, BO, 16 * SIZE ++ MUL c31, b6, c31 ++ MUL c32, b6, c32 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c22, c32, b7, c22 ++ LD b7, BO, 9 * SIZE ++ NMSUB c11, c31, b8, c11 ++ NMSUB c12, c32, b8, c12 ++ LD b8, BO, 8 * SIZE ++ MUL c21, b7, c21 ++ MUL c22, b7, c22 ++ NMSUB c11, c21, b8, c11 ++ NMSUB c12, c22, b8, c12 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++ MUL c12, b8, c12 ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++ ST c51, BO, 4 * SIZE ++ ST c61, BO, 5 * SIZE ++ ST c71, BO, 6 * SIZE ++ ST c81, BO, 7 * SIZE ++ ST c12, BO, 8 * SIZE ++ ST c22, BO, 9 * SIZE ++ ST c32, BO, 10 * SIZE ++ ST c42, BO, 11 * SIZE ++ ST c52, BO, 12 * SIZE ++ ST c62, BO, 13 * SIZE ++ ST c72, BO, 14 * SIZE ++ ST c82, BO, 15 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c21, AO, 2 * SIZE ++ ST c22, AO, 3 * SIZE ++ ST c31, AO, 4 * SIZE ++ ST c32, AO, 5 * SIZE ++ ST c41, AO, 6 * SIZE ++ ST c42, AO, 7 * SIZE ++ ST c51, AO, 8 * SIZE ++ ST c52, AO, 9 * SIZE ++ ST c61, AO, 10 * SIZE ++ ST c62, AO, 11 * SIZE ++ ST c71, AO, 12 * SIZE ++ ST c72, AO, 13 * SIZE ++ ST c81, AO, 14 * SIZE ++ ST c82, AO, 15 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c22, CO2, 1 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c32, CO3, 1 * SIZE ++ ST c41, CO4, 0 * SIZE ++ ST c42, CO4, 1 * SIZE ++ ST c51, CO5, 0 * SIZE ++ ST c52, CO5, 1 * SIZE ++ ST c61, CO6, 0 * SIZE ++ ST c62, CO6, 1 * SIZE ++ ST c71, CO7, 0 * SIZE ++ ST c72, CO7, 1 * SIZE ++ ST c81, CO8, 0 * SIZE ++ ST c82, CO8, 1 * SIZE ++MTC a1, $r0 ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++ addi.d CO2, CO2, 2 * SIZE ++ addi.d CO3, CO3, 2 * SIZE ++ addi.d CO4, CO4, 2 * SIZE ++ addi.d CO5, CO5, 2 * SIZE ++ addi.d CO6, CO6, 2 * SIZE ++ addi.d CO7, CO7, 2 * SIZE ++ addi.d CO8, CO8, 2 * SIZE ++#endif ++ MOV c11, a1 ++ MOV c21, a1 ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++ MOV c31, a1 ++ MOV c41, a1 ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++ addi.d I, I, -1 ++ MOV c51, a1 ++MOV c61, a1 ++ blt $r0, I, .L11 ++ .align 3 ++ ++.L20: ++ andi I, M, 1 ++ MOV c61, c11 ++MOV c71, c11 ++ bge $r0, I, .L29 ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, KK, 2 ++ MOV c81, c11 ++move BO, B ++ bge $r0, L, .L25 ++#else ++#ifdef LN ++ slli.d TEMP, K, 0 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, 
AO, 3 * SIZE ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ srai.d L, TEMP, 2 ++ MOV c81, c11 ++ bge $r0, L, .L25 ++#endif ++ .align 3 ++.L22: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 20 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 9 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 10 * SIZE ++ MADD c81, b4, a1, c81 ++ LD b4, BO, 11 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a2, c51 ++ LD b7, BO, 28 * SIZE ++ MADD c61, b2, a2, c61 ++ LD b2, BO, 17 * SIZE ++ MADD c71, b3, a2, c71 ++ LD b3, BO, 18 * SIZE ++ MADD c81, b4, a2, c81 ++ LD b4, BO, 19 * SIZE ++ LD a2, AO, 5 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 32 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 21 * SIZE ++ MADD c31, b3, a3, c31 ++ LD b3, BO, 22 * SIZE ++ MADD c41, b4, a3, c41 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ LD b5, BO, 36 * SIZE ++ MADD c61, b2, a3, c61 ++ LD b2, BO, 25 * SIZE ++ MADD c71, b3, a3, c71 ++ LD b3, BO, 26 * SIZE ++ MADD c81, b4, a3, c81 ++ LD b4, BO, 27 * SIZE ++ LD a3, AO, 2 * SIZE ++ addi.d BO, BO, 32 * SIZE ++ MADD c11, b6, a4, c11 ++ LD b6, BO, 8 * SIZE ++ MADD c21, b2, a4, c21 ++ LD b2, BO, -3 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, -2 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, -1 * SIZE ++ MADD c51, b7, a4, c51 ++ LD b7, BO, 12 * SIZE ++ MADD c61, b2, a4, c61 ++ LD b2, BO, 1 * SIZE ++ MADD c71, b3, a4, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a4, c81 ++ LD b4, BO, 3 * SIZE ++ LD a4, AO, 3 * SIZE ++ blt $r0, L, .L22 ++ .align 3 ++ ++.L25: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L28 ++ .align 3 ++.L26: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ MOV a2, a2 ++ addi.d AO, AO, 1 * SIZE ++ addi.d BO, BO, 8 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 4 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 1 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L26 ++.L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -8 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++ SUB c51, b5, c51 ++ SUB c61, b6, c61 ++ SUB c71, b7, c71 ++ SUB c81, b8, c81 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ LD b5, AO, 4 * SIZE ++ LD b6, AO, 5 * SIZE ++ LD b7, AO, 6 * SIZE ++ LD b8, AO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, 
c41 ++ SUB c51, b5, c51 ++ SUB c61, b6, c61 ++ SUB c71, b7, c71 ++ SUB c81, b8, c81 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++ MUL c51, b1, c51 ++ MUL c61, b1, c61 ++ MUL c71, b1, c71 ++ MUL c81, b1, c81 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c31, c11, b3, c31 ++ NMSUB c41, c11, b4, c41 ++ NMSUB c51, c11, b5, c51 ++ NMSUB c61, c11, b6, c61 ++ NMSUB c71, c11, b7, c71 ++ NMSUB c81, c11, b8, c81 ++ LD b2, BO, 9 * SIZE ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ LD b5, BO, 12 * SIZE ++ LD b6, BO, 13 * SIZE ++ LD b7, BO, 14 * SIZE ++ LD b8, BO, 15 * SIZE ++ MUL c21, b2, c21 ++ NMSUB c31, c21, b3, c31 ++ NMSUB c41, c21, b4, c41 ++ NMSUB c51, c21, b5, c51 ++ NMSUB c61, c21, b6, c61 ++ NMSUB c71, c21, b7, c71 ++ NMSUB c81, c21, b8, c81 ++ LD b3, BO, 18 * SIZE ++ LD b4, BO, 19 * SIZE ++ LD b5, BO, 20 * SIZE ++ LD b6, BO, 21 * SIZE ++ LD b7, BO, 22 * SIZE ++ LD b8, BO, 23 * SIZE ++ MUL c31, b3, c31 ++ NMSUB c41, c31, b4, c41 ++ NMSUB c51, c31, b5, c51 ++ NMSUB c61, c31, b6, c61 ++ NMSUB c71, c31, b7, c71 ++ NMSUB c81, c31, b8, c81 ++ LD b4, BO, 27 * SIZE ++ LD b5, BO, 28 * SIZE ++ LD b6, BO, 29 * SIZE ++ LD b7, BO, 30 * SIZE ++ LD b8, BO, 31 * SIZE ++ MUL c41, b4, c41 ++ NMSUB c51, c41, b5, c51 ++ NMSUB c61, c41, b6, c61 ++ NMSUB c71, c41, b7, c71 ++ NMSUB c81, c41, b8, c81 ++ LD b5, BO, 36 * SIZE ++ LD b6, BO, 37 * SIZE ++ LD b7, BO, 38 * SIZE ++ LD b8, BO, 39 * SIZE ++ MUL c51, b5, c51 ++ NMSUB c61, c51, b6, c61 ++ NMSUB c71, c51, b7, c71 ++ NMSUB c81, c51, b8, c81 ++ LD b6, BO, 45 * SIZE ++ LD b7, BO, 46 * SIZE ++ LD b8, BO, 47 * SIZE ++ MUL c61, b6, c61 ++ NMSUB c71, c61, b7, c71 ++ NMSUB c81, c61, b8, c81 ++ LD b7, BO, 54 * SIZE ++ LD b8, BO, 55 * SIZE ++ MUL c71, b7, c71 ++ NMSUB c81, c71, b8, c81 ++ LD b8, BO, 63 * SIZE ++ MUL c81, b8, c81 ++#endif ++#ifdef RT ++ LD b1, BO, 63 * SIZE ++ LD b2, BO, 62 * SIZE ++ LD b3, BO, 61 * SIZE ++ LD b4, BO, 60 * SIZE ++ LD b5, BO, 59 * SIZE ++ LD b6, BO, 58 * SIZE ++ LD b7, BO, 57 * SIZE ++ LD b8, BO, 56 * SIZE ++ MUL c81, b1, c81 ++ NMSUB c71, c81, b2, c71 ++ NMSUB c61, c81, b3, c61 ++ NMSUB c51, c81, b4, c51 ++ NMSUB c41, c81, b5, c41 ++ NMSUB c31, c81, b6, c31 ++ NMSUB c21, c81, b7, c21 ++ NMSUB c11, c81, b8, c11 ++ LD b2, BO, 54 * SIZE ++ LD b3, BO, 53 * SIZE ++ LD b4, BO, 52 * SIZE ++ LD b5, BO, 51 * SIZE ++ LD b6, BO, 50 * SIZE ++ LD b7, BO, 49 * SIZE ++ LD b8, BO, 48 * SIZE ++ MUL c71, b2, c71 ++ NMSUB c61, c71, b3, c61 ++ NMSUB c51, c71, b4, c51 ++ NMSUB c41, c71, b5, c41 ++ NMSUB c31, c71, b6, c31 ++ NMSUB c21, c71, b7, c21 ++ NMSUB c11, c71, b8, c11 ++ LD b3, BO, 45 * SIZE ++ LD b4, BO, 44 * SIZE ++ LD b5, BO, 43 * SIZE ++ LD b6, BO, 42 * SIZE ++ LD b7, BO, 41 * SIZE ++ LD b8, BO, 40 * SIZE ++ MUL c61, b3, c61 ++ NMSUB c51, c61, b4, c51 ++ NMSUB c41, c61, b5, c41 ++ NMSUB c31, c61, b6, c31 ++ NMSUB c21, c61, b7, c21 ++ NMSUB c11, c61, b8, c11 ++ LD b4, BO, 36 * SIZE ++ LD b5, BO, 35 * SIZE ++ LD b6, BO, 34 * SIZE ++ LD b7, BO, 33 * SIZE ++ LD b8, BO, 32 * SIZE ++ MUL c51, b4, c51 ++ NMSUB c41, c51, b5, c41 ++ NMSUB c31, c51, b6, c31 ++ NMSUB c21, c51, b7, c21 ++ NMSUB c11, c51, b8, c11 ++ LD b5, BO, 27 * SIZE ++ LD b6, BO, 26 * SIZE ++ LD b7, BO, 25 * SIZE ++ LD b8, BO, 24 * SIZE ++ MUL c41, b5, c41 ++ NMSUB c31, c41, 
b6, c31 ++ NMSUB c21, c41, b7, c21 ++ NMSUB c11, c41, b8, c11 ++ LD b6, BO, 18 * SIZE ++ LD b7, BO, 17 * SIZE ++ LD b8, BO, 16 * SIZE ++ MUL c31, b6, c31 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c11, c31, b8, c11 ++ LD b7, BO, 9 * SIZE ++ LD b8, BO, 8 * SIZE ++ MUL c21, b7, c21 ++ NMSUB c11, c21, b8, c11 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++ addi.d CO2, CO2, -1 * SIZE ++ addi.d CO3, CO3, -1 * SIZE ++ addi.d CO4, CO4, -1 * SIZE ++ addi.d CO5, CO5, -1 * SIZE ++ addi.d CO6, CO6, -1 * SIZE ++ addi.d CO7, CO7, -1 * SIZE ++ addi.d CO8, CO8, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++ ST c51, BO, 4 * SIZE ++ ST c61, BO, 5 * SIZE ++ ST c71, BO, 6 * SIZE ++ ST c81, BO, 7 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c21, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c41, AO, 3 * SIZE ++ ST c51, AO, 4 * SIZE ++ ST c61, AO, 5 * SIZE ++ ST c71, AO, 6 * SIZE ++ ST c81, AO, 7 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c41, CO4, 0 * SIZE ++ ST c51, CO5, 0 * SIZE ++ ST c61, CO6, 0 * SIZE ++ ST c71, CO7, 0 * SIZE ++ ST c81, CO8, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++ addi.d CO2, CO2, 1 * SIZE ++ addi.d CO3, CO3, 1 * SIZE ++ addi.d CO4, CO4, 1 * SIZE ++ addi.d CO5, CO5, 1 * SIZE ++ addi.d CO6, CO6, 1 * SIZE ++ addi.d CO7, CO7, 1 * SIZE ++ addi.d CO8, CO8, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L29: ++#ifdef LN ++ slli.d TEMP, K, 3 + BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 8 ++#endif ++#ifdef RT ++ addi.d KK, KK, -8 ++#endif ++ blt $r0, J, .L10 ++ .align 3 ++ ++.L30: ++ andi J, N, 4 ++move AO, A ++ bge $r0, J, .L50 ++#ifdef RT ++ slli.d TEMP, K, 2 + BASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 2 ++ sub.d C, C, TEMP ++#endif ++ move CO1, C ++MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ srai.d I, M, 1 ++ MOV c31, c11 ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO4, LDC ++#endif ++MOV c41, c11 ++ bge $r0, I, .L40 ++.L31: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ LD a3, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ MOV c32, c11 ++ LD b4, B, 3 * SIZE ++ MOV c42, c11 ++ LD b5, B, 4 * SIZE ++ srai.d L, KK, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L35 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ LD a3, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c32, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c42, c11 ++ LD b5, BO, 4 * SIZE ++ srai.d L, TEMP, 2 ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ 
bge $r0, L, .L35 ++#endif ++ .align 3 ++.L32: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c11, b5, a1, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 8 * SIZE ++ MADD c12, b5, a2, c12 ++ LD b5, BO, 20 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 9 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 10 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 6 * SIZE ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c11, b7, a3, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a3, c21 ++ addi.d AO, AO, 8 * SIZE ++ MADD c31, b3, a3, c31 ++ addi.d BO, BO, 16 * SIZE ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 4 * SIZE ++ MADD c12, b7, a2, c12 ++ LD b7, BO, 12 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 1 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 2 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L32 ++ .align 3 ++ ++.L35: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L38 ++ .align 3 ++.L36: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ addi.d AO, AO, 2 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 0 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 4 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L36 ++.L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -4 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++ SUB c12, b5, c12 ++ SUB c22, b6, c22 ++ SUB c32, b7, c32 ++ SUB c42, b8, c42 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ LD b5, AO, 4 * SIZE ++ LD b6, AO, 5 * SIZE ++ LD b7, AO, 6 * SIZE ++ LD b8, AO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c21, b3, c21 ++ SUB c22, b4, c22 ++ SUB c31, b5, c31 ++ SUB c32, b6, c32 ++ SUB c41, b7, c41 ++ SUB c42, b8, c42 ++#endif ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++ LD b2, AO, 2 * SIZE ++ LD b3, AO, 0 * SIZE ++ MUL c12, b1, c12 ++ MUL c22, b1, c22 ++ MUL c32, b1, c32 ++ MUL c42, b1, c42 ++ NMSUB c11, c12, b2, c11 ++ NMSUB c21, c22, b2, c21 ++ NMSUB c31, c32, b2, c31 ++ NMSUB c41, c42, b2, c41 ++ MUL c11, b3, c11 ++ MUL c21, b3, c21 ++ MUL c31, b3, c31 ++ MUL c41, b3, c41 ++#endif ++#ifdef LT ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ MUL c31, b1, 
c31 ++ MUL c41, b1, c41 ++ NMSUB c12, c11, b2, c12 ++ NMSUB c22, c21, b2, c22 ++ NMSUB c32, c31, b2, c32 ++ NMSUB c42, c41, b2, c42 ++ MUL c12, b3, c12 ++ MUL c22, b3, c22 ++ MUL c32, b3, c32 ++ MUL c42, b3, c42 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c22, c12, b2, c22 ++ NMSUB c31, c11, b3, c31 ++ NMSUB c32, c12, b3, c32 ++ NMSUB c41, c11, b4, c41 ++ NMSUB c42, c12, b4, c42 ++ LD b2, BO, 5 * SIZE ++ LD b3, BO, 6 * SIZE ++ LD b4, BO, 7 * SIZE ++ MUL c21, b2, c21 ++ MUL c22, b2, c22 ++ NMSUB c31, c21, b3, c31 ++ NMSUB c32, c22, b3, c32 ++ NMSUB c41, c21, b4, c41 ++ NMSUB c42, c22, b4, c42 ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ MUL c31, b3, c31 ++ MUL c32, b3, c32 ++ NMSUB c41, c31, b4, c41 ++ NMSUB c42, c32, b4, c42 ++ LD b4, BO, 15 * SIZE ++ MUL c41, b4, c41 ++ MUL c42, b4, c42 ++#endif ++#ifdef RT ++ LD b5, BO, 15 * SIZE ++ LD b6, BO, 14 * SIZE ++ LD b7, BO, 13 * SIZE ++ LD b8, BO, 12 * SIZE ++ MUL c41, b5, c41 ++ MUL c42, b5, c42 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c32, c42, b6, c32 ++ NMSUB c21, c41, b7, c21 ++ NMSUB c22, c42, b7, c22 ++ NMSUB c11, c41, b8, c11 ++ NMSUB c12, c42, b8, c12 ++ LD b6, BO, 10 * SIZE ++ LD b7, BO, 9 * SIZE ++ LD b8, BO, 8 * SIZE ++ MUL c31, b6, c31 ++ MUL c32, b6, c32 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c22, c32, b7, c22 ++ NMSUB c11, c31, b8, c11 ++ NMSUB c12, c32, b8, c12 ++ LD b7, BO, 5 * SIZE ++ LD b8, BO, 4 * SIZE ++ MUL c21, b7, c21 ++ MUL c22, b7, c22 ++ NMSUB c11, c21, b8, c11 ++ NMSUB c12, c22, b8, c12 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++ MUL c12, b8, c12 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -2 * SIZE ++ addi.d CO2, CO2, -2 * SIZE ++ addi.d CO3, CO3, -2 * SIZE ++ addi.d CO4, CO4, -2 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++ ST c12, BO, 4 * SIZE ++ ST c22, BO, 5 * SIZE ++ ST c32, BO, 6 * SIZE ++ ST c42, BO, 7 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c21, AO, 2 * SIZE ++ ST c22, AO, 3 * SIZE ++ ST c31, AO, 4 * SIZE ++ ST c32, AO, 5 * SIZE ++ ST c41, AO, 6 * SIZE ++ ST c42, AO, 7 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c22, CO2, 1 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c32, CO3, 1 * SIZE ++ ST c41, CO4, 0 * SIZE ++ ST c42, CO4, 1 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++ addi.d CO2, CO2, 2 * SIZE ++ addi.d CO3, CO3, 2 * SIZE ++ addi.d CO4, CO4, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++MTC a1, $r0 ++ MOV c11, a1 ++ MOV c21, a1 ++ MOV c31, a1 ++ addi.d I, I, -1 ++MOV c41, c11 ++ blt $r0, I, .L31 ++ .align 3 ++ ++.L40: ++ andi I, M, 1 ++MOV c61, c11 ++ bge $r0, I, .L49 ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c81, c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, KK, 2 ++move BO, B ++ bge $r0, L, .L45 ++#else ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d 
L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c81, c11 ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L45 ++#endif ++ .align 3 ++.L42: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b5, a2, c11 ++ LD b5, BO, 20 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 11 * SIZE ++ LD a2, AO, 2 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ LD a2, AO, -1 * SIZE ++ addi.d BO, BO, 16 * SIZE ++ MADD c11, b7, a2, c11 ++ LD b7, BO, 12 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 1 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 2 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 3 * SIZE ++ LD a2, AO, 1 * SIZE ++ blt $r0, L, .L42 ++ .align 3 ++ ++.L45: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L48 ++ .align 3 ++.L46: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 1 * SIZE ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++ MOV a2, a2 ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L46 ++.L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -4 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c31, c11, b3, c31 ++ NMSUB c41, c11, b4, c41 ++ LD b2, BO, 5 * SIZE ++ LD b3, BO, 6 * SIZE ++ LD b4, BO, 7 * SIZE ++ MUL c21, b2, c21 ++ NMSUB c31, c21, b3, c31 ++ NMSUB c41, c21, b4, c41 ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ MUL c31, b3, c31 ++ NMSUB c41, c31, b4, c41 ++ LD b4, BO, 15 * SIZE ++ MUL c41, b4, c41 ++#endif ++#ifdef RT ++ LD b5, BO, 15 * SIZE ++ LD b6, BO, 14 * SIZE ++ LD b7, BO, 13 * SIZE ++ LD b8, BO, 12 * SIZE ++ MUL c41, b5, c41 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c21, c41, b7, c21 ++ NMSUB c11, c41, b8, c11 ++ LD b6, BO, 10 * SIZE ++ LD b7, BO, 9 * SIZE ++ LD b8, BO, 8 * SIZE ++ MUL c31, b6, c31 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c11, c31, b8, c11 ++ LD b7, BO, 5 * SIZE ++ LD b8, BO, 4 * SIZE ++ MUL c21, b7, 
c21 ++ NMSUB c11, c21, b8, c11 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++ addi.d CO2, CO2, -1 * SIZE ++ addi.d CO3, CO3, -1 * SIZE ++ addi.d CO4, CO4, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c21, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c41, AO, 3 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c41, CO4, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++ addi.d CO2, CO2, 1 * SIZE ++ addi.d CO3, CO3, 1 * SIZE ++ addi.d CO4, CO4, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L49: ++#ifdef LN ++ slli.d TEMP, K, 2 + BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 4 ++#endif ++#ifdef RT ++ addi.d KK, KK, -4 ++#endif ++ .align 3 ++ ++.L50: ++ andi J, N, 2 ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++#else ++ move AO, A ++#endif ++ bge $r0, J, .L70 ++#ifdef RT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 1 ++ sub.d C, C, TEMP ++#endif ++ move AO, A ++ move CO1, C ++ add.d CO2, C, LDC ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO2, LDC ++#endif ++ srai.d I, M, 1 ++ bge $r0, I, .L60 ++.L51: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, KK, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L55 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ LD b5, BO, 4 * SIZE ++ srai.d L, TEMP, 2 ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L55 ++#endif ++ .align 3 ++.L52: ++ MADD c11, b1, a1, c11 ++ LD a3, AO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b4, BO, 3 * SIZE ++ MADD c12, b1, a2, c12 ++ LD a4, AO, 3 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b1, BO, 8 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a1, AO, 8 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 5 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 6 * SIZE ++ MADD c11, b5, a5, c11 ++ LD a3, AO, 6 * SIZE ++ MADD c21, b2, a5, c21 ++ LD b4, BO, 7 * SIZE ++ MADD c12, b5, a2, c12 ++ LD a4, AO, 7 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b5, BO, 12 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a5, AO, 12 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 9 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 10 * SIZE ++ addi.d AO, AO, 8 * SIZE ++ addi.d L, L, -1 ++addi.d BO, BO, 8 * 
SIZE ++ blt $r0, L, .L52 ++ .align 3 ++ ++.L55: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L58 ++ .align 3 ++.L56: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 3 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L56 ++.L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -2 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c12, b3, c12 ++ SUB c22, b4, c22 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c21, b3, c21 ++ SUB c22, b4, c22 ++#endif ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++ LD b2, AO, 2 * SIZE ++ LD b3, AO, 0 * SIZE ++ MUL c12, b1, c12 ++ MUL c22, b1, c22 ++ NMSUB c11, c12, b2, c11 ++ NMSUB c21, c22, b2, c21 ++ MUL c11, b3, c11 ++ MUL c21, b3, c21 ++#endif ++#ifdef LT ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ NMSUB c12, c11, b2, c12 ++ NMSUB c22, c21, b2, c22 ++ MUL c12, b3, c12 ++ MUL c22, b3, c22 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c22, c12, b2, c22 ++ MUL c21, b3, c21 ++ MUL c22, b3, c22 ++#endif ++#ifdef RT ++ LD b1, BO, 3 * SIZE ++ LD b2, BO, 2 * SIZE ++ LD b3, BO, 0 * SIZE ++ MUL c21, b1, c21 ++ MUL c22, b1, c22 ++ NMSUB c11, c21, b2, c11 ++ NMSUB c12, c22, b2, c12 ++ MUL c11, b3, c11 ++ MUL c12, b3, c12 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -2 * SIZE ++ addi.d CO2, CO2, -2 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c12, BO, 2 * SIZE ++ ST c22, BO, 3 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c21, AO, 2 * SIZE ++ ST c22, AO, 3 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c22, CO2, 1 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++ addi.d CO2, CO2, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++MTC a1, $r0 ++ MOV c11, a1 ++ MOV c21, a1 ++ MOV c31, a1 ++ addi.d I, I, -1 ++MOV c41, c11 ++ blt $r0, I, .L51 ++ .align 3 ++ ++.L60: ++ andi I, M, 1 ++ bge $r0, I, .L69 ++#if defined(LT) || defined(RN) ++ srai.d L, KK, 2 ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ MOV c31, c11 ++ LD a4, AO, 3 * SIZE ++ MOV c41, c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L65 ++#else ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d 
BO, B, TEMP ++ sub.d TEMP, K, KK ++ srai.d L, TEMP, 2 ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ MOV c31, c11 ++ LD a4, AO, 3 * SIZE ++ MOV c41, c11 ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L65 ++#endif ++ .align 3 ++.L62: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, 11 * SIZE ++ LD a3, AO, 6 * SIZE ++ LD a4, AO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L62 ++ .align 3 ++ ++.L65: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L68 ++ .align 3 ++.L66: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 3 * SIZE ++ LD a1, AO, 1 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L66 ++.L68: ++ ADD c11, c11, c31 ++ ADD c21, c21, c41 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -2 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++#endif ++#if defined(LN) || defined(LT) ++ LD b3, AO, 0 * SIZE ++ MUL c11, b3, c11 ++ MUL c21, b3, c21 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c21, c11, b2, c21 ++ MUL c21, b3, c21 ++#endif ++#ifdef RT ++ LD b1, BO, 3 * SIZE ++ LD b2, BO, 2 * SIZE ++ LD b3, BO, 0 * SIZE ++ MUL c21, b1, c21 ++ NMSUB c11, c21, b2, c11 ++ MUL c11, b3, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++ addi.d CO2, CO2, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c21, AO, 1 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++ addi.d CO2, CO2, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 0 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L69: ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 2 ++#endif ++#ifdef RT ++ addi.d KK, KK, -2 ++#endif ++ .align 3 ++ ++.L70: ++ andi J, N, 1 ++ bge $r0, J, .L999 ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d B, B, TEMP ++ sub.d C, C, LDC ++#endif ++ move AO, A ++ move CO1, C ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif 
++#ifndef RT ++ add.d C, CO1, LDC ++#endif ++ srai.d I, M, 1 ++ bge $r0, I, .L80 ++.L71: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, KK, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L75 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 0 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ LD b5, BO, 4 * SIZE ++ srai.d L, TEMP, 2 ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L75 ++#endif ++ .align 3 ++.L72: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 2 * SIZE ++ LD a2, AO, 3 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 6 * SIZE ++ LD a2, AO, 7 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 8 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L72 ++ .align 3 ++ ++.L75: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L78 ++ .align 3 ++.L76: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L76 ++.L78: ++ ADD c11, c11, c21 ++ ADD c12, c12, c22 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -1 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#endif ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++ LD b2, AO, 2 * SIZE ++ LD b3, AO, 0 * SIZE ++ MUL c12, b1, c12 ++ NMSUB c11, c12, b2, c11 ++ MUL c11, b3, c11 ++#endif ++#ifdef LT ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 3 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c12, c11, b2, c12 ++ MUL c12, b3, c12 ++#endif ++#if defined(RN) || defined(RT) ++ LD b1, BO, 0 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -2 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c12, BO, 1 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++ addi.d I, I, -1 ++ blt $r0, I, .L71 ++ .align 3 ++ ++.L80: 
++ andi I, M, 1 ++ bge $r0, I, .L89 ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, KK, 2 ++move BO, B ++ bge $r0, L, .L85 ++#else ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d TEMP, KK, BASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L85 ++#endif ++ .align 3 ++.L82: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 1 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c21, b1, a1, c21 ++ LD a1, AO, 2 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 3 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c21, b1, a1, c21 ++ addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L82 ++ .align 3 ++ ++.L85: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L88 ++ .align 3 ++.L86: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L86 ++.L88: ++ ADD c11, c11, c21 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -1 ++#endif ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ SUB c11, b1, c11 ++#else ++ LD b1, AO, 0 * SIZE ++ SUB c11, b1, c11 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ MUL c11, b1, c11 ++#endif ++#if defined(RN) || defined(RT) ++ LD b1, BO, 0 * SIZE ++ MUL c11, b1, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L89: ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 1 ++#endif ++#ifdef RT ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 16 ++ LDARG $r26, $sp, 24 ++ LDARG $r27, $sp, 32 ++ LDARG $r28, $sp, 40 ++ fld.d $f24, $sp, 48 ++ fld.d $f25, $sp, 56 ++ fld.d $f26, $sp, 64 ++ fld.d $f27, $sp, 72 ++ fld.d $f28, $sp, 80 ++ LDARG $r29, $sp, 88 ++ LDARG $r30, $sp, 96 ++ LDARG $r20, $sp, 104 ++ LDARG $r16, $sp, 112 ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 112 ++ fld.d $f19, $sp, 120 ++ fld.d $f20, $sp, 128 ++ fld.d $f21, $sp, 136 ++#endif ++ addi.d $sp, $sp, 144 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git 
a/kernel/loongarch64/trsm_kernel_RT.S b/kernel/loongarch64/trsm_kernel_RT.S
+new file mode 100644
+index 0000000..c86d9c1
+--- /dev/null
++++ b/kernel/loongarch64/trsm_kernel_RT.S
+@@ -0,0 +1,2850 @@
++/***************************************************************************
++Copyright (c) 2021, The OpenBLAS Project
++All rights reserved.
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are
++met:
++1. Redistributions of source code must retain the above copyright
++notice, this list of conditions and the following disclaimer.
++2. Redistributions in binary form must reproduce the above copyright
++notice, this list of conditions and the following disclaimer in
++the documentation and/or other materials provided with the
++distribution.
++3. Neither the name of the OpenBLAS project nor the names of
++its contributors may be used to endorse or promote products
++derived from this software without specific prior written permission.
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define K $r6 ++#define A $r7 ++#define B $r8 ++#define C $r9 ++#define LDC $r10 ++#define OFFSET $r11 ++#define AO $r12 ++#define BO $r13 ++#define I $r17 ++#define J $r18 ++#define L $r29 ++#define CO1 $r14 ++#define CO2 $r15 ++#define CO3 $r23 ++#define CO4 $r24 ++#define CO5 $r25 ++#define CO6 $r26 ++#define CO7 $r27 ++#define CO8 $r28 ++#define KK $r30 ++#define TEMP $r20 ++#define AORIG $r16 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f27 ++#define a4 $f28 ++#define b1 $f23 ++#define b2 $f9 ++#define b3 $f10 ++#define b4 $f11 ++#define b5 $f12 ++#define b6 $f13 ++#define b7 $f14 ++#define b8 $f15 ++#define a5 b8 ++#define c11 $f16 ++#define c12 $f17 ++#define c21 $f3 ++#define c22 $f1 ++#define c31 $f2 ++#define c32 $f4 ++#define c41 $f5 ++#define c42 $f6 ++#define c51 $f7 ++#define c52 $f18 ++#define c61 $f19 ++#define c62 $f20 ++#define c71 $f21 ++#define c72 $f24 ++#define c81 $f25 ++#define c82 $f26 ++#define ALPHA $f0 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -144 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ SDARG $r28, $sp, 40 ++ fst.d $f24, $sp, 48 ++ fst.d $f25, $sp, 56 ++ fst.d $f26, $sp, 64 ++ fst.d $f27, $sp, 72 ++ fst.d $f28, $sp, 80 ++ SDARG $r29, $sp, 88 ++ SDARG $r30, $sp, 96 ++ SDARG $r20, $sp, 104 ++ SDARG $r16, $sp, 112 ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 112 ++ fst.d $f19, $sp, 120 ++ fst.d $f20, $sp, 128 ++ fst.d $f21, $sp, 136 ++#endif ++ slli.d LDC, LDC, BASE_SHIFT ++#ifdef LN ++ mul.w TEMP, M, K ++ slli.d TEMP, TEMP, BASE_SHIFT ++ add.d A, A, TEMP ++ slli.d TEMP, M, BASE_SHIFT ++ add.d C, C, TEMP ++#endif ++#ifdef RN ++ sub.d KK, $r0, OFFSET ++#endif ++#ifdef RT ++ mul.w TEMP, N, K ++ slli.d TEMP, TEMP, BASE_SHIFT ++ add.d B, B, TEMP ++ mul.w TEMP, N, LDC ++ add.d C, C, TEMP ++ sub.d KK, N, OFFSET ++#endif ++ andi J, N, 1 ++ bge $r0, J, .L30 ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d B, B, TEMP ++ sub.d C, C, LDC ++#endif ++ move AO, A ++ move CO1, C ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO1, LDC ++#endif ++ srai.d I, M, 1 ++ bge $r0, I, .L80 ++.L71: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, KK, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L75 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 0 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ LD b5, BO, 4 * SIZE ++ srai.d L, TEMP, 2 ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L75 ++#endif ++ .align 3 ++.L72: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 2 * SIZE ++ LD a2, AO, 3 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, 
b1, a2, c12 ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 6 * SIZE ++ LD a2, AO, 7 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 8 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L72 ++ .align 3 ++ ++.L75: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L78 ++ .align 3 ++.L76: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L76 ++.L78: ++ ADD c11, c11, c21 ++ ADD c12, c12, c22 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -1 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#endif ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++ LD b2, AO, 2 * SIZE ++ LD b3, AO, 0 * SIZE ++ MUL c12, b1, c12 ++ NMSUB c11, c12, b2, c11 ++ MUL c11, b3, c11 ++#endif ++#ifdef LT ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 3 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c12, c11, b2, c12 ++ MUL c12, b3, c12 ++#endif ++#if defined(RN) || defined(RT) ++ LD b1, BO, 0 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -2 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c12, BO, 1 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++ addi.d I, I, -1 ++ blt $r0, I, .L71 ++ .align 3 ++ ++.L80: ++ andi I, M, 1 ++ bge $r0, I, .L89 ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ MOV c21, c11 ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, KK, 2 ++move BO, B ++ bge $r0, L, .L85 ++#else ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d TEMP, KK, BASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MOV c21, c11 ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L85 ++#endif ++ .align 3 ++.L82: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 1 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c21, b1, a1, c21 ++ LD a1, AO, 2 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 3 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c21, b1, a1, c21 ++ 
addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L82 ++ .align 3 ++ ++.L85: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L88 ++ .align 3 ++.L86: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L86 ++.L88: ++ ADD c11, c11, c21 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -1 ++#endif ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ SUB c11, b1, c11 ++#else ++ LD b1, AO, 0 * SIZE ++ SUB c11, b1, c11 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ MUL c11, b1, c11 ++#endif ++#if defined(RN) || defined(RT) ++ LD b1, BO, 0 * SIZE ++ MUL c11, b1, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d TEMP, TEMP, 0 + BASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L89: ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 1 ++#endif ++#ifdef RT ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L30: ++ andi J, N, 2 ++ bge $r0, J, .L50 ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 1 ++ sub.d C, C, TEMP ++#endif ++ move AO, A ++ move CO1, C ++ add.d CO2, C, LDC ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO2, LDC ++#endif ++ srai.d I, M, 1 ++ bge $r0, I, .L60 ++.L51: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, KK, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L55 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ LD b5, BO, 4 * SIZE ++ srai.d L, TEMP, 2 ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L55 ++#endif ++ .align 3 ++.L52: ++ MADD c11, b1, a1, c11 ++ LD a3, AO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b4, BO, 3 * SIZE ++ MADD c12, b1, a2, c12 ++ LD a4, AO, 3 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b1, BO, 8 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a1, AO, 8 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 5 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 6 * SIZE ++ MADD c11, b5, a5, c11 ++ LD a3, AO, 6 * SIZE ++ MADD c21, b2, a5, c21 ++ 
LD b4, BO, 7 * SIZE ++ MADD c12, b5, a2, c12 ++ LD a4, AO, 7 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b5, BO, 12 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a5, AO, 12 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 9 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 10 * SIZE ++ addi.d AO, AO, 8 * SIZE ++ addi.d L, L, -1 ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L52 ++ .align 3 ++ ++.L55: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L58 ++ .align 3 ++.L56: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 3 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L56 ++.L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -2 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c12, b3, c12 ++ SUB c22, b4, c22 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c21, b3, c21 ++ SUB c22, b4, c22 ++#endif ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++ LD b2, AO, 2 * SIZE ++ LD b3, AO, 0 * SIZE ++ MUL c12, b1, c12 ++ MUL c22, b1, c22 ++ NMSUB c11, c12, b2, c11 ++ NMSUB c21, c22, b2, c21 ++ MUL c11, b3, c11 ++ MUL c21, b3, c21 ++#endif ++#ifdef LT ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ NMSUB c12, c11, b2, c12 ++ NMSUB c22, c21, b2, c22 ++ MUL c12, b3, c12 ++ MUL c22, b3, c22 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c22, c12, b2, c22 ++ MUL c21, b3, c21 ++ MUL c22, b3, c22 ++#endif ++#ifdef RT ++ LD b1, BO, 3 * SIZE ++ LD b2, BO, 2 * SIZE ++ LD b3, BO, 0 * SIZE ++ MUL c21, b1, c21 ++ MUL c22, b1, c22 ++ NMSUB c11, c21, b2, c11 ++ NMSUB c12, c22, b2, c12 ++ MUL c11, b3, c11 ++ MUL c12, b3, c12 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -2 * SIZE ++ addi.d CO2, CO2, -2 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c12, BO, 2 * SIZE ++ ST c22, BO, 3 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c21, AO, 2 * SIZE ++ ST c22, AO, 3 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c22, CO2, 1 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++ addi.d CO2, CO2, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++MTC a1, $r0 ++ MOV c11, a1 ++ MOV c21, a1 ++ MOV c31, a1 ++ addi.d I, I, -1 ++MOV c41, c11 ++ blt $r0, I, .L51 ++ .align 3 ++ ++.L60: ++ andi I, M, 1 ++ bge $r0, I, .L69 ++#if defined(LT) || defined(RN) ++ srai.d L, KK, 2 ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ MOV c31, c11 ++ LD a4, AO, 3 * SIZE ++ MOV c41, 
c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L65 ++#else ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ srai.d L, TEMP, 2 ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ MOV c31, c11 ++ LD a4, AO, 3 * SIZE ++ MOV c41, c11 ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L65 ++#endif ++ .align 3 ++.L62: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, 11 * SIZE ++ LD a3, AO, 6 * SIZE ++ LD a4, AO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L62 ++ .align 3 ++ ++.L65: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L68 ++ .align 3 ++.L66: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 3 * SIZE ++ LD a1, AO, 1 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L66 ++.L68: ++ ADD c11, c11, c31 ++ ADD c21, c21, c41 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -2 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++#endif ++#if defined(LN) || defined(LT) ++ LD b3, AO, 0 * SIZE ++ MUL c11, b3, c11 ++ MUL c21, b3, c21 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c21, c11, b2, c21 ++ MUL c21, b3, c21 ++#endif ++#ifdef RT ++ LD b1, BO, 3 * SIZE ++ LD b2, BO, 2 * SIZE ++ LD b3, BO, 0 * SIZE ++ MUL c21, b1, c21 ++ NMSUB c11, c21, b2, c11 ++ MUL c11, b3, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++ addi.d CO2, CO2, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c21, AO, 1 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++ addi.d CO2, CO2, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 0 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 1 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L69: ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 2 ++#endif ++#ifdef RT 
++ addi.d KK, KK, -2 ++#endif ++ .align 3 ++ ++.L50: ++ andi J, N, 4 ++move AO, A ++ bge $r0, J, .L70 ++#ifdef RT ++ slli.d TEMP, K, 2 + BASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 2 ++ sub.d C, C, TEMP ++#endif ++ move CO1, C ++MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ srai.d I, M, 1 ++ MOV c31, c11 ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO4, LDC ++#endif ++MOV c41, c11 ++ bge $r0, I, .L40 ++.L31: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ LD a3, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ MOV c32, c11 ++ LD b4, B, 3 * SIZE ++ MOV c42, c11 ++ LD b5, B, 4 * SIZE ++ srai.d L, KK, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L35 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ LD a3, AO, 4 * SIZE ++ LD b1, BO, 0 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c32, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c42, c11 ++ LD b5, BO, 4 * SIZE ++ srai.d L, TEMP, 2 ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ bge $r0, L, .L35 ++#endif ++ .align 3 ++.L32: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c11, b5, a1, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 8 * SIZE ++ MADD c12, b5, a2, c12 ++ LD b5, BO, 20 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 9 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 10 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 6 * SIZE ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c11, b7, a3, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a3, c21 ++ addi.d AO, AO, 8 * SIZE ++ MADD c31, b3, a3, c31 ++ addi.d BO, BO, 16 * SIZE ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 4 * SIZE ++ MADD c12, b7, a2, c12 ++ LD b7, BO, 12 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 1 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 2 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L32 ++ .align 3 ++ ++.L35: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L38 ++ .align 3 ++.L36: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ addi.d AO, AO, 2 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 0 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 4 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt 
$r0, L, .L36 ++.L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -4 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++ SUB c12, b5, c12 ++ SUB c22, b6, c22 ++ SUB c32, b7, c32 ++ SUB c42, b8, c42 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ LD b5, AO, 4 * SIZE ++ LD b6, AO, 5 * SIZE ++ LD b7, AO, 6 * SIZE ++ LD b8, AO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c21, b3, c21 ++ SUB c22, b4, c22 ++ SUB c31, b5, c31 ++ SUB c32, b6, c32 ++ SUB c41, b7, c41 ++ SUB c42, b8, c42 ++#endif ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++ LD b2, AO, 2 * SIZE ++ LD b3, AO, 0 * SIZE ++ MUL c12, b1, c12 ++ MUL c22, b1, c22 ++ MUL c32, b1, c32 ++ MUL c42, b1, c42 ++ NMSUB c11, c12, b2, c11 ++ NMSUB c21, c22, b2, c21 ++ NMSUB c31, c32, b2, c31 ++ NMSUB c41, c42, b2, c41 ++ MUL c11, b3, c11 ++ MUL c21, b3, c21 ++ MUL c31, b3, c31 ++ MUL c41, b3, c41 ++#endif ++#ifdef LT ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++ NMSUB c12, c11, b2, c12 ++ NMSUB c22, c21, b2, c22 ++ NMSUB c32, c31, b2, c32 ++ NMSUB c42, c41, b2, c42 ++ MUL c12, b3, c12 ++ MUL c22, b3, c22 ++ MUL c32, b3, c32 ++ MUL c42, b3, c42 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c22, c12, b2, c22 ++ NMSUB c31, c11, b3, c31 ++ NMSUB c32, c12, b3, c32 ++ NMSUB c41, c11, b4, c41 ++ NMSUB c42, c12, b4, c42 ++ LD b2, BO, 5 * SIZE ++ LD b3, BO, 6 * SIZE ++ LD b4, BO, 7 * SIZE ++ MUL c21, b2, c21 ++ MUL c22, b2, c22 ++ NMSUB c31, c21, b3, c31 ++ NMSUB c32, c22, b3, c32 ++ NMSUB c41, c21, b4, c41 ++ NMSUB c42, c22, b4, c42 ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ MUL c31, b3, c31 ++ MUL c32, b3, c32 ++ NMSUB c41, c31, b4, c41 ++ NMSUB c42, c32, b4, c42 ++ LD b4, BO, 15 * SIZE ++ MUL c41, b4, c41 ++ MUL c42, b4, c42 ++#endif ++#ifdef RT ++ LD b5, BO, 15 * SIZE ++ LD b6, BO, 14 * SIZE ++ LD b7, BO, 13 * SIZE ++ LD b8, BO, 12 * SIZE ++ MUL c41, b5, c41 ++ MUL c42, b5, c42 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c32, c42, b6, c32 ++ NMSUB c21, c41, b7, c21 ++ NMSUB c22, c42, b7, c22 ++ NMSUB c11, c41, b8, c11 ++ NMSUB c12, c42, b8, c12 ++ LD b6, BO, 10 * SIZE ++ LD b7, BO, 9 * SIZE ++ LD b8, BO, 8 * SIZE ++ MUL c31, b6, c31 ++ MUL c32, b6, c32 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c22, c32, b7, c22 ++ NMSUB c11, c31, b8, c11 ++ NMSUB c12, c32, b8, c12 ++ LD b7, BO, 5 * SIZE ++ LD b8, BO, 4 * SIZE ++ MUL c21, b7, c21 ++ MUL c22, b7, c22 ++ NMSUB c11, c21, b8, c11 ++ NMSUB c12, c22, b8, c12 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++ MUL c12, b8, c12 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -2 * SIZE ++ addi.d CO2, CO2, -2 * SIZE ++ addi.d CO3, CO3, -2 * SIZE ++ addi.d CO4, CO4, -2 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++ ST c12, BO, 4 * SIZE ++ ST c22, BO, 5 * SIZE ++ ST c32, BO, 6 * SIZE ++ ST c42, BO, 7 * SIZE 
++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c21, AO, 2 * SIZE ++ ST c22, AO, 3 * SIZE ++ ST c31, AO, 4 * SIZE ++ ST c32, AO, 5 * SIZE ++ ST c41, AO, 6 * SIZE ++ ST c42, AO, 7 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c22, CO2, 1 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c32, CO3, 1 * SIZE ++ ST c41, CO4, 0 * SIZE ++ ST c42, CO4, 1 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++ addi.d CO2, CO2, 2 * SIZE ++ addi.d CO3, CO3, 2 * SIZE ++ addi.d CO4, CO4, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++MTC a1, $r0 ++ MOV c11, a1 ++ MOV c21, a1 ++ MOV c31, a1 ++ addi.d I, I, -1 ++MOV c41, c11 ++ blt $r0, I, .L31 ++ .align 3 ++ ++.L40: ++ andi I, M, 1 ++MOV c61, c11 ++ bge $r0, I, .L49 ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c81, c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, KK, 2 ++move BO, B ++ bge $r0, L, .L45 ++#else ++#ifdef LN ++ slli.d TEMP, K, BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 2 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c81, c11 ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L45 ++#endif ++ .align 3 ++.L42: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b5, a2, c11 ++ LD b5, BO, 20 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 11 * SIZE ++ LD a2, AO, 2 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ LD a2, AO, -1 * SIZE ++ addi.d BO, BO, 16 * SIZE ++ MADD c11, b7, a2, c11 ++ LD b7, BO, 12 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 1 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 2 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 3 * SIZE ++ LD a2, AO, 1 * SIZE ++ blt $r0, L, .L42 ++ .align 3 ++ ++.L45: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L48 ++ .align 3 ++.L46: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 1 * SIZE ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++ MOV a2, a2 ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L46 ++.L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -4 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, 
AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c31, c11, b3, c31 ++ NMSUB c41, c11, b4, c41 ++ LD b2, BO, 5 * SIZE ++ LD b3, BO, 6 * SIZE ++ LD b4, BO, 7 * SIZE ++ MUL c21, b2, c21 ++ NMSUB c31, c21, b3, c31 ++ NMSUB c41, c21, b4, c41 ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ MUL c31, b3, c31 ++ NMSUB c41, c31, b4, c41 ++ LD b4, BO, 15 * SIZE ++ MUL c41, b4, c41 ++#endif ++#ifdef RT ++ LD b5, BO, 15 * SIZE ++ LD b6, BO, 14 * SIZE ++ LD b7, BO, 13 * SIZE ++ LD b8, BO, 12 * SIZE ++ MUL c41, b5, c41 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c21, c41, b7, c21 ++ NMSUB c11, c41, b8, c11 ++ LD b6, BO, 10 * SIZE ++ LD b7, BO, 9 * SIZE ++ LD b8, BO, 8 * SIZE ++ MUL c31, b6, c31 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c11, c31, b8, c11 ++ LD b7, BO, 5 * SIZE ++ LD b8, BO, 4 * SIZE ++ MUL c21, b7, c21 ++ NMSUB c11, c21, b8, c11 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++ addi.d CO2, CO2, -1 * SIZE ++ addi.d CO3, CO3, -1 * SIZE ++ addi.d CO4, CO4, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c21, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c41, AO, 3 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c41, CO4, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++ addi.d CO2, CO2, 1 * SIZE ++ addi.d CO3, CO3, 1 * SIZE ++ addi.d CO4, CO4, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 2 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L49: ++#ifdef LN ++ slli.d TEMP, K, 2 + BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 4 ++#endif ++#ifdef RT ++ addi.d KK, KK, -4 ++#endif ++ .align 3 ++ ++.L70: ++ srai.d J, N, 3 ++nop ++ bge $r0, J, .L999 ++.L10: ++#ifdef RT ++ slli.d TEMP, K, 3 + BASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 3 ++ sub.d C, C, TEMP ++#endif ++ move CO1, C ++MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ addi.d J, J, -1 ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ add.d CO5, CO4, LDC ++ MOV c31, c11 ++ add.d CO6, CO5, LDC ++ MOV c41, c11 ++ add.d CO7, CO6, LDC ++ MOV c51, c11 ++ add.d CO8, CO7, LDC ++ srai.d I, M, 1 ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO8, LDC ++#endif ++MOV c61, c11 ++ bge $r0, I, .L20 ++.L11: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * 
SIZE ++ MOV c71, c11 ++ LD b1, B, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, KK, 2 ++ MOV c32, c11 ++ LD b3, B, 2 * SIZE ++ MOV c42, c11 ++ LD b4, B, 3 * SIZE ++ MOV c52, c11 ++ LD b5, B, 4 * SIZE ++ MOV c62, c11 ++ LD b6, B, 8 * SIZE ++ MOV c72, c11 ++ LD b7, B, 12 * SIZE ++ MOV c82, c11 ++move BO, B ++ bge $r0, L, .L15 ++#else ++#ifdef LN ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 1 + BASE_SHIFT ++ slli.d TEMP, KK, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ MOV c32, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c42, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c52, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c62, c11 ++ LD b6, BO, 8 * SIZE ++ MOV c72, c11 ++ LD b7, BO, 12 * SIZE ++ MOV c82, c11 ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L15 ++#endif ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ bge $r0, L, .L13 ++ .align 3 ++.L12: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ MADD c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ MADD c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD c71, b3, a3, c71 ++ MADD c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ addi.d L, L, -1 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD 
b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ blt $r0, L, .L12 ++ .align 3 ++ ++.L13: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ MADD c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ MADD c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD c71, b3, a3, c71 ++ MADD c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ .align 3 ++ ++.L15: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L18 ++ .align 3 ++.L16: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ addi.d L, L, -1 ++ MADD c61, b2, a1, c61 ++ addi.d AO, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ addi.d BO, BO, 8 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 4 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 
* SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L16 ++.L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -2 ++#else ++ addi.d TEMP, KK, -8 ++#endif ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ LD b5, BO, 4 * SIZE ++ SUB c21, b2, c21 ++ LD b6, BO, 5 * SIZE ++ SUB c31, b3, c31 ++ LD b7, BO, 6 * SIZE ++ SUB c41, b4, c41 ++ LD b8, BO, 7 * SIZE ++ SUB c51, b5, c51 ++ LD b1, BO, 8 * SIZE ++ SUB c61, b6, c61 ++ LD b2, BO, 9 * SIZE ++ SUB c71, b7, c71 ++ LD b3, BO, 10 * SIZE ++ SUB c81, b8, c81 ++ LD b4, BO, 11 * SIZE ++ SUB c12, b1, c12 ++ LD b5, BO, 12 * SIZE ++ SUB c22, b2, c22 ++ LD b6, BO, 13 * SIZE ++ SUB c32, b3, c32 ++ LD b7, BO, 14 * SIZE ++ SUB c42, b4, c42 ++ LD b8, BO, 15 * SIZE ++ SUB c52, b5, c52 ++#ifdef LN ++ LD b1, AO, 3 * SIZE ++#else ++ LD b1, AO, 0 * SIZE ++#endif ++ SUB c62, b6, c62 ++ SUB c72, b7, c72 ++ SUB c82, b8, c82 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ LD b5, AO, 4 * SIZE ++ SUB c12, b2, c12 ++ LD b6, AO, 5 * SIZE ++ SUB c21, b3, c21 ++ LD b7, AO, 6 * SIZE ++ SUB c22, b4, c22 ++ LD b8, AO, 7 * SIZE ++ SUB c31, b5, c31 ++ LD b1, AO, 8 * SIZE ++ SUB c32, b6, c32 ++ LD b2, AO, 9 * SIZE ++ SUB c41, b7, c41 ++ LD b3, AO, 10 * SIZE ++ SUB c42, b8, c42 ++ LD b4, AO, 11 * SIZE ++ LD b5, AO, 12 * SIZE ++ SUB c51, b1, c51 ++ LD b6, AO, 13 * SIZE ++ SUB c52, b2, c52 ++ LD b7, AO, 14 * SIZE ++ SUB c61, b3, c61 ++ LD b8, AO, 15 * SIZE ++ SUB c62, b4, c62 ++ SUB c71, b5, c71 ++ SUB c72, b6, c72 ++ SUB c81, b7, c81 ++ SUB c82, b8, c82 ++#endif ++#ifdef LN ++ MUL c12, b1, c12 ++ LD b2, AO, 2 * SIZE ++ MUL c22, b1, c22 ++ MUL c32, b1, c32 ++ MUL c42, b1, c42 ++ MUL c52, b1, c52 ++ MUL c62, b1, c62 ++ MUL c72, b1, c72 ++ MUL c82, b1, c82 ++ NMSUB c11, c12, b2, c11 ++ LD b3, AO, 0 * SIZE ++ NMSUB c21, c22, b2, c21 ++ NMSUB c31, c32, b2, c31 ++ NMSUB c41, c42, b2, c41 ++ NMSUB c51, c52, b2, c51 ++ NMSUB c61, c62, b2, c61 ++ NMSUB c71, c72, b2, c71 ++ NMSUB c81, c82, b2, c81 ++ MUL c11, b3, c11 ++ addi.d CO1, CO1, -2 * SIZE ++ MUL c21, b3, c21 ++ addi.d CO2, CO2, -2 * SIZE ++ MUL c31, b3, c31 ++ addi.d CO3, CO3, -2 * SIZE ++ MUL c41, b3, c41 ++ addi.d CO4, CO4, -2 * SIZE ++ MUL c51, b3, c51 ++ addi.d CO5, CO5, -2 * SIZE ++ MUL c61, b3, c61 ++ addi.d CO6, CO6, -2 * SIZE ++ MUL c71, b3, c71 ++ addi.d CO7, CO7, -2 * SIZE ++ MUL c81, b3, c81 ++ addi.d CO8, CO8, -2 * SIZE ++#endif ++#ifdef LT ++ MUL c11, b1, c11 ++ LD b2, AO, 1 * SIZE ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++ MUL c51, b1, c51 ++ MUL c61, b1, c61 ++ MUL c71, b1, c71 ++ MUL c81, b1, c81 ++ NMSUB c12, c11, b2, c12 ++ LD b3, AO, 3 * SIZE ++ NMSUB c22, c21, b2, c22 ++ NMSUB c32, c31, b2, c32 ++ NMSUB c42, c41, b2, c42 ++ NMSUB c52, c51, b2, c52 ++ NMSUB c62, c61, b2, c62 ++ NMSUB c72, c71, b2, c72 ++ NMSUB c82, c81, b2, c82 ++ MUL c12, b3, c12 ++ MUL c22, b3, c22 ++ MUL c32, b3, c32 ++ MUL c42, b3, c42 ++ MUL c52, b3, c52 ++ MUL c62, b3, c62 ++ MUL c72, b3, c72 ++ MUL c82, b3, c82 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL c11, b1, c11 ++ MUL c12, b1, c12 ++ LD b5, BO, 4 * SIZE ++ NMSUB c21, c11, b2, c21 ++ NMSUB c22, c12, b2, c22 ++ LD b6, 
BO, 5 * SIZE ++ NMSUB c31, c11, b3, c31 ++ NMSUB c32, c12, b3, c32 ++ LD b7, BO, 6 * SIZE ++ NMSUB c41, c11, b4, c41 ++ NMSUB c42, c12, b4, c42 ++ LD b8, BO, 7 * SIZE ++ NMSUB c51, c11, b5, c51 ++ NMSUB c52, c12, b5, c52 ++ LD b2, BO, 9 * SIZE ++ NMSUB c61, c11, b6, c61 ++ NMSUB c62, c12, b6, c62 ++ LD b3, BO, 10 * SIZE ++ NMSUB c71, c11, b7, c71 ++ NMSUB c72, c12, b7, c72 ++ LD b4, BO, 11 * SIZE ++ NMSUB c81, c11, b8, c81 ++ NMSUB c82, c12, b8, c82 ++ LD b5, BO, 12 * SIZE ++ MUL c21, b2, c21 ++ MUL c22, b2, c22 ++ LD b6, BO, 13 * SIZE ++ NMSUB c31, c21, b3, c31 ++ NMSUB c32, c22, b3, c32 ++ LD b7, BO, 14 * SIZE ++ NMSUB c41, c21, b4, c41 ++ NMSUB c42, c22, b4, c42 ++ LD b8, BO, 15 * SIZE ++ NMSUB c51, c21, b5, c51 ++ NMSUB c52, c22, b5, c52 ++ LD b3, BO, 18 * SIZE ++ NMSUB c61, c21, b6, c61 ++ NMSUB c62, c22, b6, c62 ++ LD b4, BO, 19 * SIZE ++ NMSUB c71, c21, b7, c71 ++ NMSUB c72, c22, b7, c72 ++ LD b5, BO, 20 * SIZE ++ NMSUB c81, c21, b8, c81 ++ NMSUB c82, c22, b8, c82 ++ LD b6, BO, 21 * SIZE ++ MUL c31, b3, c31 ++ MUL c32, b3, c32 ++ LD b7, BO, 22 * SIZE ++ NMSUB c41, c31, b4, c41 ++ NMSUB c42, c32, b4, c42 ++ LD b8, BO, 23 * SIZE ++ NMSUB c51, c31, b5, c51 ++ NMSUB c52, c32, b5, c52 ++ LD b4, BO, 27 * SIZE ++ NMSUB c61, c31, b6, c61 ++ NMSUB c62, c32, b6, c62 ++ LD b5, BO, 28 * SIZE ++ NMSUB c71, c31, b7, c71 ++ NMSUB c72, c32, b7, c72 ++ LD b6, BO, 29 * SIZE ++ NMSUB c81, c31, b8, c81 ++ NMSUB c82, c32, b8, c82 ++ LD b7, BO, 30 * SIZE ++ MUL c41, b4, c41 ++ MUL c42, b4, c42 ++ LD b8, BO, 31 * SIZE ++ NMSUB c51, c41, b5, c51 ++ NMSUB c52, c42, b5, c52 ++ LD b5, BO, 36 * SIZE ++ NMSUB c61, c41, b6, c61 ++ NMSUB c62, c42, b6, c62 ++ LD b6, BO, 37 * SIZE ++ NMSUB c71, c41, b7, c71 ++ NMSUB c72, c42, b7, c72 ++ LD b7, BO, 38 * SIZE ++ NMSUB c81, c41, b8, c81 ++ NMSUB c82, c42, b8, c82 ++ LD b8, BO, 39 * SIZE ++ MUL c51, b5, c51 ++ MUL c52, b5, c52 ++ NMSUB c61, c51, b6, c61 ++ NMSUB c62, c52, b6, c62 ++ LD b6, BO, 45 * SIZE ++ NMSUB c71, c51, b7, c71 ++ NMSUB c72, c52, b7, c72 ++ LD b7, BO, 46 * SIZE ++ NMSUB c81, c51, b8, c81 ++ NMSUB c82, c52, b8, c82 ++ LD b8, BO, 47 * SIZE ++ MUL c61, b6, c61 ++ MUL c62, b6, c62 ++ NMSUB c71, c61, b7, c71 ++ NMSUB c72, c62, b7, c72 ++ LD b7, BO, 54 * SIZE ++ NMSUB c81, c61, b8, c81 ++ NMSUB c82, c62, b8, c82 ++ LD b8, BO, 55 * SIZE ++ MUL c71, b7, c71 ++ MUL c72, b7, c72 ++ NMSUB c81, c71, b8, c81 ++ NMSUB c82, c72, b8, c82 ++ LD b8, BO, 63 * SIZE ++ MUL c81, b8, c81 ++ MUL c82, b8, c82 ++#endif ++#ifdef RT ++ LD b1, BO, 63 * SIZE ++ LD b2, BO, 62 * SIZE ++ LD b3, BO, 61 * SIZE ++ LD b4, BO, 60 * SIZE ++ MUL c81, b1, c81 ++ MUL c82, b1, c82 ++ LD b5, BO, 59 * SIZE ++ NMSUB c71, c81, b2, c71 ++ NMSUB c72, c82, b2, c72 ++ LD b6, BO, 58 * SIZE ++ NMSUB c61, c81, b3, c61 ++ NMSUB c62, c82, b3, c62 ++ LD b7, BO, 57 * SIZE ++ NMSUB c51, c81, b4, c51 ++ NMSUB c52, c82, b4, c52 ++ LD b8, BO, 56 * SIZE ++ NMSUB c41, c81, b5, c41 ++ NMSUB c42, c82, b5, c42 ++ LD b2, BO, 54 * SIZE ++ NMSUB c31, c81, b6, c31 ++ NMSUB c32, c82, b6, c32 ++ LD b3, BO, 53 * SIZE ++ NMSUB c21, c81, b7, c21 ++ NMSUB c22, c82, b7, c22 ++ LD b4, BO, 52 * SIZE ++ NMSUB c11, c81, b8, c11 ++ NMSUB c12, c82, b8, c12 ++ LD b5, BO, 51 * SIZE ++ MUL c71, b2, c71 ++ MUL c72, b2, c72 ++ LD b6, BO, 50 * SIZE ++ NMSUB c61, c71, b3, c61 ++ NMSUB c62, c72, b3, c62 ++ LD b7, BO, 49 * SIZE ++ NMSUB c51, c71, b4, c51 ++ NMSUB c52, c72, b4, c52 ++ LD b8, BO, 48 * SIZE ++ NMSUB c41, c71, b5, c41 ++ NMSUB c42, c72, b5, c42 ++ LD b3, BO, 45 * SIZE ++ NMSUB c31, c71, b6, c31 ++ NMSUB c32, c72, b6, c32 ++ LD 
b4, BO, 44 * SIZE ++ NMSUB c21, c71, b7, c21 ++ NMSUB c22, c72, b7, c22 ++ LD b5, BO, 43 * SIZE ++ NMSUB c11, c71, b8, c11 ++ NMSUB c12, c72, b8, c12 ++ LD b6, BO, 42 * SIZE ++ MUL c61, b3, c61 ++ MUL c62, b3, c62 ++ LD b7, BO, 41 * SIZE ++ NMSUB c51, c61, b4, c51 ++ NMSUB c52, c62, b4, c52 ++ LD b8, BO, 40 * SIZE ++ NMSUB c41, c61, b5, c41 ++ NMSUB c42, c62, b5, c42 ++ LD b4, BO, 36 * SIZE ++ NMSUB c31, c61, b6, c31 ++ NMSUB c32, c62, b6, c32 ++ LD b5, BO, 35 * SIZE ++ NMSUB c21, c61, b7, c21 ++ NMSUB c22, c62, b7, c22 ++ LD b6, BO, 34 * SIZE ++ NMSUB c11, c61, b8, c11 ++ NMSUB c12, c62, b8, c12 ++ LD b7, BO, 33 * SIZE ++ MUL c51, b4, c51 ++ MUL c52, b4, c52 ++ LD b8, BO, 32 * SIZE ++ NMSUB c41, c51, b5, c41 ++ NMSUB c42, c52, b5, c42 ++ LD b5, BO, 27 * SIZE ++ NMSUB c31, c51, b6, c31 ++ NMSUB c32, c52, b6, c32 ++ LD b6, BO, 26 * SIZE ++ NMSUB c21, c51, b7, c21 ++ NMSUB c22, c52, b7, c22 ++ LD b7, BO, 25 * SIZE ++ NMSUB c11, c51, b8, c11 ++ NMSUB c12, c52, b8, c12 ++ LD b8, BO, 24 * SIZE ++ MUL c41, b5, c41 ++ MUL c42, b5, c42 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c32, c42, b6, c32 ++ LD b6, BO, 18 * SIZE ++ NMSUB c21, c41, b7, c21 ++ NMSUB c22, c42, b7, c22 ++ LD b7, BO, 17 * SIZE ++ NMSUB c11, c41, b8, c11 ++ NMSUB c12, c42, b8, c12 ++ LD b8, BO, 16 * SIZE ++ MUL c31, b6, c31 ++ MUL c32, b6, c32 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c22, c32, b7, c22 ++ LD b7, BO, 9 * SIZE ++ NMSUB c11, c31, b8, c11 ++ NMSUB c12, c32, b8, c12 ++ LD b8, BO, 8 * SIZE ++ MUL c21, b7, c21 ++ MUL c22, b7, c22 ++ NMSUB c11, c21, b8, c11 ++ NMSUB c12, c22, b8, c12 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++ MUL c12, b8, c12 ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++ ST c51, BO, 4 * SIZE ++ ST c61, BO, 5 * SIZE ++ ST c71, BO, 6 * SIZE ++ ST c81, BO, 7 * SIZE ++ ST c12, BO, 8 * SIZE ++ ST c22, BO, 9 * SIZE ++ ST c32, BO, 10 * SIZE ++ ST c42, BO, 11 * SIZE ++ ST c52, BO, 12 * SIZE ++ ST c62, BO, 13 * SIZE ++ ST c72, BO, 14 * SIZE ++ ST c82, BO, 15 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c21, AO, 2 * SIZE ++ ST c22, AO, 3 * SIZE ++ ST c31, AO, 4 * SIZE ++ ST c32, AO, 5 * SIZE ++ ST c41, AO, 6 * SIZE ++ ST c42, AO, 7 * SIZE ++ ST c51, AO, 8 * SIZE ++ ST c52, AO, 9 * SIZE ++ ST c61, AO, 10 * SIZE ++ ST c62, AO, 11 * SIZE ++ ST c71, AO, 12 * SIZE ++ ST c72, AO, 13 * SIZE ++ ST c81, AO, 14 * SIZE ++ ST c82, AO, 15 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c22, CO2, 1 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c32, CO3, 1 * SIZE ++ ST c41, CO4, 0 * SIZE ++ ST c42, CO4, 1 * SIZE ++ ST c51, CO5, 0 * SIZE ++ ST c52, CO5, 1 * SIZE ++ ST c61, CO6, 0 * SIZE ++ ST c62, CO6, 1 * SIZE ++ ST c71, CO7, 0 * SIZE ++ ST c72, CO7, 1 * SIZE ++ ST c81, CO8, 0 * SIZE ++ ST c82, CO8, 1 * SIZE ++MTC a1, $r0 ++#ifndef LN ++ addi.d CO1, CO1, 2 * SIZE ++ addi.d CO2, CO2, 2 * SIZE ++ addi.d CO3, CO3, 2 * SIZE ++ addi.d CO4, CO4, 2 * SIZE ++ addi.d CO5, CO5, 2 * SIZE ++ addi.d CO6, CO6, 2 * SIZE ++ addi.d CO7, CO7, 2 * SIZE ++ addi.d CO8, CO8, 2 * SIZE ++#endif ++ MOV c11, a1 ++ MOV c21, a1 ++#ifdef RT ++ slli.d TEMP, K, 1 + BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++ MOV c31, a1 ++ MOV c41, a1 ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 1 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 2 ++#endif ++#ifdef LN ++ addi.d KK, KK, -2 ++#endif ++ addi.d I, I, 
-1 ++ MOV c51, a1 ++MOV c61, a1 ++ blt $r0, I, .L11 ++ .align 3 ++ ++.L20: ++ andi I, M, 1 ++ MOV c61, c11 ++MOV c71, c11 ++ bge $r0, I, .L29 ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, KK, 2 ++ MOV c81, c11 ++move BO, B ++ bge $r0, L, .L25 ++#else ++#ifdef LN ++ slli.d TEMP, K, 0 + BASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, 0 + BASE_SHIFT ++ slli.d TEMP, KK, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 8 * SIZE ++ LD b7, BO, 12 * SIZE ++ srai.d L, TEMP, 2 ++ MOV c81, c11 ++ bge $r0, L, .L25 ++#endif ++ .align 3 ++.L22: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 20 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 9 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 10 * SIZE ++ MADD c81, b4, a1, c81 ++ LD b4, BO, 11 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a2, c51 ++ LD b7, BO, 28 * SIZE ++ MADD c61, b2, a2, c61 ++ LD b2, BO, 17 * SIZE ++ MADD c71, b3, a2, c71 ++ LD b3, BO, 18 * SIZE ++ MADD c81, b4, a2, c81 ++ LD b4, BO, 19 * SIZE ++ LD a2, AO, 5 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 32 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 21 * SIZE ++ MADD c31, b3, a3, c31 ++ LD b3, BO, 22 * SIZE ++ MADD c41, b4, a3, c41 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ LD b5, BO, 36 * SIZE ++ MADD c61, b2, a3, c61 ++ LD b2, BO, 25 * SIZE ++ MADD c71, b3, a3, c71 ++ LD b3, BO, 26 * SIZE ++ MADD c81, b4, a3, c81 ++ LD b4, BO, 27 * SIZE ++ LD a3, AO, 2 * SIZE ++ addi.d BO, BO, 32 * SIZE ++ MADD c11, b6, a4, c11 ++ LD b6, BO, 8 * SIZE ++ MADD c21, b2, a4, c21 ++ LD b2, BO, -3 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, -2 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, -1 * SIZE ++ MADD c51, b7, a4, c51 ++ LD b7, BO, 12 * SIZE ++ MADD c61, b2, a4, c61 ++ LD b2, BO, 1 * SIZE ++ MADD c71, b3, a4, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a4, c81 ++ LD b4, BO, 3 * SIZE ++ LD a4, AO, 3 * SIZE ++ blt $r0, L, .L22 ++ .align 3 ++ ++.L25: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L28 ++ .align 3 ++.L26: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ MOV a2, a2 ++ addi.d AO, AO, 1 * SIZE ++ addi.d BO, BO, 8 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 4 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 1 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L26 ++.L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d 
TEMP, KK, -8 ++#endif ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++ SUB c51, b5, c51 ++ SUB c61, b6, c61 ++ SUB c71, b7, c71 ++ SUB c81, b8, c81 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ LD b5, AO, 4 * SIZE ++ LD b6, AO, 5 * SIZE ++ LD b7, AO, 6 * SIZE ++ LD b8, AO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c21, b2, c21 ++ SUB c31, b3, c31 ++ SUB c41, b4, c41 ++ SUB c51, b5, c51 ++ SUB c61, b6, c61 ++ SUB c71, b7, c71 ++ SUB c81, b8, c81 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ MUL c11, b1, c11 ++ MUL c21, b1, c21 ++ MUL c31, b1, c31 ++ MUL c41, b1, c41 ++ MUL c51, b1, c51 ++ MUL c61, b1, c61 ++ MUL c71, b1, c71 ++ MUL c81, b1, c81 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ MUL c11, b1, c11 ++ NMSUB c21, c11, b2, c21 ++ NMSUB c31, c11, b3, c31 ++ NMSUB c41, c11, b4, c41 ++ NMSUB c51, c11, b5, c51 ++ NMSUB c61, c11, b6, c61 ++ NMSUB c71, c11, b7, c71 ++ NMSUB c81, c11, b8, c81 ++ LD b2, BO, 9 * SIZE ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ LD b5, BO, 12 * SIZE ++ LD b6, BO, 13 * SIZE ++ LD b7, BO, 14 * SIZE ++ LD b8, BO, 15 * SIZE ++ MUL c21, b2, c21 ++ NMSUB c31, c21, b3, c31 ++ NMSUB c41, c21, b4, c41 ++ NMSUB c51, c21, b5, c51 ++ NMSUB c61, c21, b6, c61 ++ NMSUB c71, c21, b7, c71 ++ NMSUB c81, c21, b8, c81 ++ LD b3, BO, 18 * SIZE ++ LD b4, BO, 19 * SIZE ++ LD b5, BO, 20 * SIZE ++ LD b6, BO, 21 * SIZE ++ LD b7, BO, 22 * SIZE ++ LD b8, BO, 23 * SIZE ++ MUL c31, b3, c31 ++ NMSUB c41, c31, b4, c41 ++ NMSUB c51, c31, b5, c51 ++ NMSUB c61, c31, b6, c61 ++ NMSUB c71, c31, b7, c71 ++ NMSUB c81, c31, b8, c81 ++ LD b4, BO, 27 * SIZE ++ LD b5, BO, 28 * SIZE ++ LD b6, BO, 29 * SIZE ++ LD b7, BO, 30 * SIZE ++ LD b8, BO, 31 * SIZE ++ MUL c41, b4, c41 ++ NMSUB c51, c41, b5, c51 ++ NMSUB c61, c41, b6, c61 ++ NMSUB c71, c41, b7, c71 ++ NMSUB c81, c41, b8, c81 ++ LD b5, BO, 36 * SIZE ++ LD b6, BO, 37 * SIZE ++ LD b7, BO, 38 * SIZE ++ LD b8, BO, 39 * SIZE ++ MUL c51, b5, c51 ++ NMSUB c61, c51, b6, c61 ++ NMSUB c71, c51, b7, c71 ++ NMSUB c81, c51, b8, c81 ++ LD b6, BO, 45 * SIZE ++ LD b7, BO, 46 * SIZE ++ LD b8, BO, 47 * SIZE ++ MUL c61, b6, c61 ++ NMSUB c71, c61, b7, c71 ++ NMSUB c81, c61, b8, c81 ++ LD b7, BO, 54 * SIZE ++ LD b8, BO, 55 * SIZE ++ MUL c71, b7, c71 ++ NMSUB c81, c71, b8, c81 ++ LD b8, BO, 63 * SIZE ++ MUL c81, b8, c81 ++#endif ++#ifdef RT ++ LD b1, BO, 63 * SIZE ++ LD b2, BO, 62 * SIZE ++ LD b3, BO, 61 * SIZE ++ LD b4, BO, 60 * SIZE ++ LD b5, BO, 59 * SIZE ++ LD b6, BO, 58 * SIZE ++ LD b7, BO, 57 * SIZE ++ LD b8, BO, 56 * SIZE ++ MUL c81, b1, c81 ++ NMSUB c71, c81, b2, c71 ++ NMSUB c61, c81, b3, c61 ++ NMSUB c51, c81, b4, c51 ++ NMSUB c41, c81, b5, c41 ++ NMSUB c31, c81, b6, c31 ++ NMSUB c21, c81, b7, c21 ++ NMSUB c11, c81, b8, c11 ++ LD b2, BO, 54 * SIZE ++ LD b3, BO, 53 * SIZE ++ LD b4, BO, 52 * SIZE ++ LD b5, BO, 51 * SIZE ++ LD b6, BO, 50 * SIZE ++ LD b7, BO, 49 * SIZE ++ LD b8, BO, 48 * SIZE ++ MUL c71, b2, c71 ++ NMSUB c61, c71, b3, c61 ++ NMSUB c51, c71, b4, c51 
++ NMSUB c41, c71, b5, c41 ++ NMSUB c31, c71, b6, c31 ++ NMSUB c21, c71, b7, c21 ++ NMSUB c11, c71, b8, c11 ++ LD b3, BO, 45 * SIZE ++ LD b4, BO, 44 * SIZE ++ LD b5, BO, 43 * SIZE ++ LD b6, BO, 42 * SIZE ++ LD b7, BO, 41 * SIZE ++ LD b8, BO, 40 * SIZE ++ MUL c61, b3, c61 ++ NMSUB c51, c61, b4, c51 ++ NMSUB c41, c61, b5, c41 ++ NMSUB c31, c61, b6, c31 ++ NMSUB c21, c61, b7, c21 ++ NMSUB c11, c61, b8, c11 ++ LD b4, BO, 36 * SIZE ++ LD b5, BO, 35 * SIZE ++ LD b6, BO, 34 * SIZE ++ LD b7, BO, 33 * SIZE ++ LD b8, BO, 32 * SIZE ++ MUL c51, b4, c51 ++ NMSUB c41, c51, b5, c41 ++ NMSUB c31, c51, b6, c31 ++ NMSUB c21, c51, b7, c21 ++ NMSUB c11, c51, b8, c11 ++ LD b5, BO, 27 * SIZE ++ LD b6, BO, 26 * SIZE ++ LD b7, BO, 25 * SIZE ++ LD b8, BO, 24 * SIZE ++ MUL c41, b5, c41 ++ NMSUB c31, c41, b6, c31 ++ NMSUB c21, c41, b7, c21 ++ NMSUB c11, c41, b8, c11 ++ LD b6, BO, 18 * SIZE ++ LD b7, BO, 17 * SIZE ++ LD b8, BO, 16 * SIZE ++ MUL c31, b6, c31 ++ NMSUB c21, c31, b7, c21 ++ NMSUB c11, c31, b8, c11 ++ LD b7, BO, 9 * SIZE ++ LD b8, BO, 8 * SIZE ++ MUL c21, b7, c21 ++ NMSUB c11, c21, b8, c11 ++ LD b8, BO, 0 * SIZE ++ MUL c11, b8, c11 ++#endif ++#ifdef LN ++ addi.d CO1, CO1, -1 * SIZE ++ addi.d CO2, CO2, -1 * SIZE ++ addi.d CO3, CO3, -1 * SIZE ++ addi.d CO4, CO4, -1 * SIZE ++ addi.d CO5, CO5, -1 * SIZE ++ addi.d CO6, CO6, -1 * SIZE ++ addi.d CO7, CO7, -1 * SIZE ++ addi.d CO8, CO8, -1 * SIZE ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c21, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c41, BO, 3 * SIZE ++ ST c51, BO, 4 * SIZE ++ ST c61, BO, 5 * SIZE ++ ST c71, BO, 6 * SIZE ++ ST c81, BO, 7 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c21, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c41, AO, 3 * SIZE ++ ST c51, AO, 4 * SIZE ++ ST c61, AO, 5 * SIZE ++ ST c71, AO, 6 * SIZE ++ ST c81, AO, 7 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c21, CO2, 0 * SIZE ++ ST c31, CO3, 0 * SIZE ++ ST c41, CO4, 0 * SIZE ++ ST c51, CO5, 0 * SIZE ++ ST c61, CO6, 0 * SIZE ++ ST c71, CO7, 0 * SIZE ++ ST c81, CO8, 0 * SIZE ++#ifndef LN ++ addi.d CO1, CO1, 1 * SIZE ++ addi.d CO2, CO2, 1 * SIZE ++ addi.d CO3, CO3, 1 * SIZE ++ addi.d CO4, CO4, 1 * SIZE ++ addi.d CO5, CO5, 1 * SIZE ++ addi.d CO6, CO6, 1 * SIZE ++ addi.d CO7, CO7, 1 * SIZE ++ addi.d CO8, CO8, 1 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, BASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, 0 + BASE_SHIFT ++ slli.d TEMP, TEMP, 3 + BASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L29: ++#ifdef LN ++ slli.d TEMP, K, 3 + BASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 8 ++#endif ++#ifdef RT ++ addi.d KK, KK, -8 ++#endif ++ blt $r0, J, .L10 ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 16 ++ LDARG $r26, $sp, 24 ++ LDARG $r27, $sp, 32 ++ LDARG $r28, $sp, 40 ++ fld.d $f24, $sp, 48 ++ fld.d $f25, $sp, 56 ++ fld.d $f26, $sp, 64 ++ fld.d $f27, $sp, 72 ++ fld.d $f28, $sp, 80 ++ LDARG $r29, $sp, 88 ++ LDARG $r30, $sp, 96 ++ LDARG $r20, $sp, 104 ++ LDARG $r16, $sp, 112 ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 112 ++ fld.d $f19, $sp, 120 ++ fld.d $f20, $sp, 128 ++ fld.d $f21, $sp, 136 ++#endif ++ addi.d $sp, $sp, 144 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/zamax.S b/kernel/loongarch64/zamax.S +new 
file mode 100644 +index 0000000..f998bdc +--- /dev/null ++++ b/kernel/loongarch64/zamax.S +@@ -0,0 +1,190 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++#define t5 $f4 ++#define t6 $f5 ++#define t7 $f6 ++#define t8 $f7 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ MTC s1, $r0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, INCX, .L999 ++ LD a1, X, 0 * SIZE ++ addi.d N, N, -1 ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ FABS t1, a1 ++ FABS t2, a2 ++ ADD s1, t1, t2 ++ bge $r0, N, .L999 ++ ADD s2, t1, t2 ++ srai.d I, N, 2 ++ ADD s3, t1, t2 ++ ADD s4, t1, t2 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ LD a8, X, 1 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ LD a2, X, 1 * SIZE ++ FABS t3, a3 ++ add.d X, X, INCX ++ FABS t4, a4 ++ FABS t5, a5 ++ LD a3, X, 0 * SIZE ++ FABS t6, a6 ++ LD a4, X, 1 * SIZE ++ FABS t7, a7 ++ add.d X, X, INCX ++ FABS t8, a8 ++ ADD t1, t1, t2 ++ LD a5, X, 0 * SIZE ++ ADD t3, t3, t4 ++ LD a6, X, 1 * SIZE ++ ADD t5, t5, t6 ++ add.d X, X, INCX ++ ADD t7, t7, t8 ++ CMPLT $fcc0, s1, t1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, s2, t3 ++ LD a8, X, 1 * SIZE ++ CMPLT $fcc2, s3, t5 ++ add.d X, X, 
INCX ++ CMPLT $fcc3, s4, t7 ++ CMOVT s1, s1, t1, $fcc0 ++ addi.d I, I, -1 ++ CMOVT s2, s2, t3, $fcc1 ++ CMOVT s3, s3, t5, $fcc2 ++ CMOVT s4, s4, t7, $fcc3 ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ FABS t5, a5 ++ FABS t6, a6 ++ FABS t7, a7 ++ FABS t8, a8 ++ ADD t1, t1, t2 ++ ADD t3, t3, t4 ++ ADD t5, t5, t6 ++ ADD t7, t7, t8 ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t3 ++ CMPLT $fcc2, s3, t5 ++ CMPLT $fcc3, s4, t7 ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t3, $fcc1 ++ CMOVT s3, s3, t5, $fcc2 ++ CMOVT s4, s4, t7, $fcc3 ++ .align 3 ++ ++.L15: ++ andi I, N, 3 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ addi.d I, I, -1 ++ FABS t1, a1 ++ FABS t2, a2 ++ ADD t1, t1, t2 ++ CMPLT $fcc0, s1, t1 ++ CMOVT s1, s1, t1, $fcc0 ++ add.d X, X, INCX ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L998: ++ CMPLT $fcc0, s1, s2 ++ CMPLT $fcc1, s3, s4 ++ CMOVT s1, s1, s2, $fcc0 ++ CMOVT s3, s3, s4, $fcc1 ++ CMPLT $fcc0, s1, s3 ++ CMOVT s1, s1, s3, $fcc0 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/zamin.S b/kernel/loongarch64/zamin.S +new file mode 100644 +index 0000000..bde9aeb +--- /dev/null ++++ b/kernel/loongarch64/zamin.S +@@ -0,0 +1,198 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++#define t5 $f4 ++#define t6 $f5 ++#define t7 $f6 ++#define t8 $f7 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ MTC s1, $r0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, INCX, .L999 ++ LD a1, X, 0 * SIZE ++ addi.d N, N, -1 ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ FABS t1, a1 ++ FABS t2, a2 ++ ADD s1, t1, t2 ++ bge $r0, N, .L999 ++ NOP ++ ADD s2, t1, t2 ++ srai.d I, N, 2 ++ ADD s3, t1, t2 ++ ADD s4, t1, t2 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ LD a8, X, 1 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ LD a2, X, 1 * SIZE ++ FABS t3, a3 ++ add.d X, X, INCX ++ FABS t4, a4 ++ NOP ++ FABS t5, a5 ++ LD a3, X, 0 * SIZE ++ FABS t6, a6 ++ LD a4, X, 1 * SIZE ++ FABS t7, a7 ++ add.d X, X, INCX ++ FABS t8, a8 ++ NOP ++ ADD t1, t1, t2 ++ LD a5, X, 0 * SIZE ++ ADD t3, t3, t4 ++ LD a6, X, 1 * SIZE ++ ADD t5, t5, t6 ++ add.d X, X, INCX ++ ADD t7, t7, t8 ++ NOP ++ CMPLT $fcc0, t1, s1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, t3, s2 ++ LD a8, X, 1 * SIZE ++ CMPLT $fcc2, t5, s3 ++ add.d X, X, INCX ++ CMPLT $fcc3, t7, s4 ++ NOP ++ CMOVT s1, s1, t1, $fcc0 ++ addi.d I, I, -1 ++ CMOVT s2, s2, t3, $fcc1 ++ NOP ++ CMOVT s3, s3, t5, $fcc2 ++ CMOVT s4, s4, t7, $fcc3 ++ blt $r0, I, .L12 ++ NOP ++ .align 3 ++ ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ FABS t5, a5 ++ FABS t6, a6 ++ FABS t7, a7 ++ FABS t8, a8 ++ ADD t1, t1, t2 ++ ADD t3, t3, t4 ++ ADD t5, t5, t6 ++ ADD t7, t7, t8 ++ CMPLT $fcc0, t1, s1 ++ CMPLT $fcc1, t3, s2 ++ CMPLT $fcc2, t5, s3 ++ CMPLT $fcc3, t7, s4 ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t3, $fcc1 ++ CMOVT s3, s3, t5, $fcc2 ++ CMOVT s4, s4, t7, $fcc3 ++ .align 3 ++ ++.L15: ++ andi I, N, 3 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ addi.d I, I, -1 ++ FABS t1, a1 ++ FABS t2, a2 ++ ADD t1, t1, t2 ++ CMPLT $fcc0, t1, s1 ++ CMOVT s1, s1, t1, $fcc0 ++ add.d X, X, INCX ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L998: ++ CMPLT $fcc0, s2, s1 ++ CMPLT $fcc1, s4, s3 ++ CMOVT s1, s1, s2, $fcc0 ++ CMOVT s3, s3, s4, $fcc1 ++ CMPLT $fcc0, s3, s1 ++ CMOVT s1, s1, s3, $fcc0 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ NOP ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/zasum.S b/kernel/loongarch64/zasum.S +new file mode 100644 +index 0000000..d1a1a73 +--- /dev/null ++++ b/kernel/loongarch64/zasum.S +@@ -0,0 +1,158 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. 
Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f23 ++#define a2 $f9 ++#define a3 $f10 ++#define a4 $f11 ++#define a5 $f12 ++#define a6 $f13 ++#define a7 $f14 ++#define a8 $f15 ++#define t1 $f16 ++#define t2 $f17 ++#define t3 $f0 ++#define t4 $f1 ++#define s1 $f22 ++#define s2 $f8 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ MTC s1, $r0 ++ MTC s2, $r0 ++ slli.d INCX, INCX, ZBASE_SHIFT ++ srai.d I, N, 2 ++ bge $r0, N, .L999 ++ bge $r0, I, .L25 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ FABS t1, a1 ++ FABS t2, a2 ++ LD a7, X, 0 * SIZE ++ LD a8, X, 1 * SIZE ++ FABS t3, a3 ++ FABS t4, a4 ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L24 ++ .align 3 ++ ++.L23: ++ ADD s1, s1, t1 ++ LD a1, X, 0 * SIZE ++ FABS t1, a5 ++ addi.d I, I, -1 ++ ADD s2, s2, t2 ++ LD a2, X, 1 * SIZE ++ FABS t2, a6 ++ add.d X, X, INCX ++ ADD s1, s1, t3 ++ LD a3, X, 0 * SIZE ++ FABS t3, a7 ++ NOP ++ ADD s2, s2, t4 ++ LD a4, X, 1 * SIZE ++ FABS t4, a8 ++ add.d X, X, INCX ++ ADD s1, s1, t1 ++ LD a5, X, 0 * SIZE ++ FABS t1, a1 ++ NOP ++ ADD s2, s2, t2 ++ LD a6, X, 1 * SIZE ++ FABS t2, a2 ++ add.d X, X, INCX ++ ADD s1, s1, t3 ++ LD a7, X, 0 * SIZE ++ FABS t3, a3 ++ LD a8, X, 1 * SIZE ++ ADD s2, s2, t4 ++ add.d X, X, INCX ++ FABS t4, a4 ++ blt $r0, I, .L23 ++ .align 3 ++ ++.L24: ++ ADD s1, s1, t1 ++ FABS t1, a5 ++ ADD s2, s2, t2 ++ FABS t2, a6 ++ ADD s1, s1, t3 ++ FABS t3, a7 ++ ADD s2, s2, t4 ++ FABS t4, a8 ++ ADD s1, s1, t1 ++ ADD s2, s2, t2 ++ ADD s1, s1, t3 ++ ADD s2, s2, t4 ++ .align 3 ++ ++.L25: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L26: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ FABS t1, a1 ++ addi.d I, I, -1 ++ FABS t2, a2 ++ add.d X, X, INCX ++ ADD s1, s1, t1 ++ ADD s2, s2, t2 ++ blt $r0, I, .L26 ++ .align 3 ++ ++.L999: ++ ADD s1, s1, s2 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git 
a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S +new file mode 100644 +index 0000000..0f480ca +--- /dev/null ++++ b/kernel/loongarch64/zcopy.S +@@ -0,0 +1,217 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define Y $r7 ++#define INCY $r8 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f23 ++#define a4 $f9 ++#define a5 $f10 ++#define a6 $f11 ++#define a7 $f12 ++#define a8 $f13 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++ LDINT INCY, 0(INCY) ++#endif ++ ++ li.d TEMP, 2 * SIZE ++ NOP ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, N, .L999 ++ slli.d INCY, INCY, ZBASE_SHIFT ++ bne INCX, TEMP, .L20 ++ srai.d I, N, 2 ++ bne INCY, TEMP, .L20 ++ addi.d I, I, -1 ++ blt I, $r0, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD a3, X, 2 * SIZE ++ LD a4, X, 3 * SIZE ++ LD a5, X, 4 * SIZE ++ LD a6, X, 5 * SIZE ++ LD a7, X, 6 * SIZE ++ LD a8, X, 7 * SIZE ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ ST a1, Y, 0 * SIZE ++ LD a1, X, 8 * SIZE ++ ST a2, Y, 1 * SIZE ++ LD a2, X, 9 * SIZE ++ ST a3, Y, 2 * SIZE ++ LD a3, X, 10 * SIZE ++ ST a4, Y, 3 * SIZE ++ LD a4, X, 11 * SIZE ++ ST a5, Y, 4 * SIZE ++ LD a5, X, 12 * SIZE ++ ST a6, Y, 5 * SIZE ++ LD a6, X, 13 * SIZE ++ ST a7, Y, 6 * SIZE ++ LD a7, X, 14 * SIZE ++ ST a8, Y, 7 * SIZE ++ LD a8, X, 15 * SIZE ++ addi.d I, I, -1 ++ addi.d X, X, 8 * SIZE ++ addi.d Y, Y, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ ST a1, Y, 0 * SIZE ++ ST a2, Y, 1 * SIZE ++ ST a3, Y, 2 * SIZE ++ ST a4, Y, 3 * SIZE ++ ST a5, Y, 4 * SIZE ++ ST a6, Y, 5 * SIZE ++ ST a7, Y, 6 * SIZE ++ ST a8, Y, 7 * SIZE ++ addi.d X, X, 8 * SIZE ++ addi.d Y, Y, 8 * SIZE ++ .align 3 ++ ++.L15: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ addi.d X, X, 2 * SIZE ++ addi.d Y, Y, 2 * SIZE ++ ST a1, Y, -2 * SIZE ++ addi.d I, I, -1 ++ ST a2, Y, -1 * SIZE ++ blt $r0, I, .L16 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ NOP ++ .align 3 ++ ++.L20: ++ srai.d I, N, 2 ++ addi.d I, I, -1 ++ blt I, $r0, .L25 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ LD a8, X, 1 * SIZE ++ add.d X, X, INCX ++ bge $r0, I, .L23 ++ .align 3 ++ ++.L22: ++ ST a1, Y, 0 * SIZE ++ LD a1, X, 0 * SIZE ++ ST a2, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ ST a3, Y, 0 * SIZE ++ LD a3, X, 0 * SIZE ++ ST a4, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ ST a5, Y, 0 * SIZE ++ LD a5, X, 0 * SIZE ++ ST a6, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ ST a7, Y, 0 * SIZE ++ LD a7, X, 0 * SIZE ++ ST a8, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ LD a8, X, 1 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ blt $r0, I, .L22 ++ .align 3 ++ ++.L23: ++ ST a1, Y, 0 * SIZE ++ ST a2, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ ST a3, Y, 0 * SIZE ++ ST a4, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ ST a5, Y, 0 * SIZE ++ ST a6, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ ST a7, Y, 0 * SIZE ++ ST a8, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ .align 3 ++ ++.L25: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L26: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ addi.d I, I, -1 ++ ST a1, Y, 0 * SIZE ++ ST a2, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ blt $r0, I, .L26 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git 
a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S +new file mode 100644 +index 0000000..81ac19f +--- /dev/null ++++ b/kernel/loongarch64/zdot.S +@@ -0,0 +1,330 @@ ++/*************************************************************************** ++Copyright (c) 2020, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define Y $r7 ++#define INCY $r8 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define b1 $f14 ++#define b2 $f15 ++#define b3 $f16 ++#define b4 $f17 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++ LDINT INCY, 0(INCY) ++#endif ++ ++ MTC s1, $r0 ++ MOV s2, s1 ++ MOV s3, s2 ++ MOV s4, s3 ++ slli.d INCX, INCX, ZBASE_SHIFT ++ li.d TEMP, 2 * SIZE ++ slli.d INCY, INCY, ZBASE_SHIFT ++ bge $r0, N, .L999 ++ srai.d I, N, 2 ++ bne INCX, TEMP, .L20 ++ bne INCY, TEMP, .L20 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD b1, Y, 0 * SIZE ++ addi.d I, I, -1 ++ LD b2, Y, 1 * SIZE ++ bge $r0, I, .L14 ++ .align 3 ++ ++.L13: ++ MADD s1, b1, a1, s1 ++ LD a3, X, 2 * SIZE ++ MADD s2, b1, a2, s2 ++ LD a4, X, 3 * SIZE ++ MADD s3, b2, a1, s3 ++ LD b3, Y, 2 * SIZE ++ MADD s4, b2, a2, s4 ++ LD b4, Y, 3 * SIZE ++ MADD s1, b3, a3, s1 ++ LD a1, X, 4 * SIZE ++ MADD s2, b3, a4, s2 ++ LD a2, X, 5 * SIZE ++ MADD s3, b4, a3, s3 ++ LD b1, Y, 4 * SIZE ++ MADD s4, b4, a4, s4 ++ LD b2, Y, 5 * SIZE ++ MADD s1, b1, a1, s1 ++ LD a3, X, 6 * SIZE ++ MADD s2, b1, a2, s2 ++ LD a4, X, 7 * SIZE ++ MADD s3, b2, a1, s3 ++ LD b3, Y, 6 * SIZE ++ MADD s4, b2, a2, s4 ++ LD b4, Y, 7 * SIZE ++ MADD s1, b3, a3, s1 ++ LD a1, X, 8 * SIZE ++ MADD s2, b3, a4, s2 ++ LD a2, X, 9 * SIZE ++ MADD s3, b4, a3, s3 ++ LD b1, Y, 8 * SIZE ++ MADD s4, b4, a4, s4 ++ LD b2, Y, 9 * SIZE ++ addi.d I, I, -1 ++ addi.d X, X, 8 * SIZE ++ addi.d Y, Y, 8 * SIZE ++ blt $r0, I, .L13 ++ .align 3 ++ ++.L14: ++ MADD s1, b1, a1, s1 ++ LD a3, X, 2 * SIZE ++ MADD s2, b1, a2, s2 ++ LD a4, X, 3 * SIZE ++ MADD s3, b2, a1, s3 ++ LD b3, Y, 2 * SIZE ++ MADD s4, b2, a2, s4 ++ LD b4, Y, 3 * SIZE ++ MADD s1, b3, a3, s1 ++ LD a1, X, 4 * SIZE ++ MADD s2, b3, a4, s2 ++ LD a2, X, 5 * SIZE ++ MADD s3, b4, a3, s3 ++ LD b1, Y, 4 * SIZE ++ MADD s4, b4, a4, s4 ++ LD b2, Y, 5 * SIZE ++ MADD s1, b1, a1, s1 ++ LD a3, X, 6 * SIZE ++ MADD s2, b1, a2, s2 ++ LD a4, X, 7 * SIZE ++ MADD s3, b2, a1, s3 ++ LD b3, Y, 6 * SIZE ++ MADD s4, b2, a2, s4 ++ LD b4, Y, 7 * SIZE ++ MADD s1, b3, a3, s1 ++ addi.d X, X, 8 * SIZE ++ MADD s2, b3, a4, s2 ++ addi.d Y, Y, 8 * SIZE ++ MADD s3, b4, a3, s3 ++ MADD s4, b4, a4, s4 ++ .align 3 ++ ++.L15: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD b1, Y, 0 * SIZE ++ addi.d I, I, -1 ++ LD b2, Y, 1 * SIZE ++ bge $r0, I, .L17 ++ .align 3 ++ ++.L16: ++ MADD s1, b1, a1, s1 ++ addi.d I, I, -1 ++ MADD s2, b1, a2, s2 ++ LD b1, Y, 2 * SIZE ++ MADD s3, b2, a1, s3 ++ LD a1, X, 2 * SIZE ++ MADD s4, b2, a2, s4 ++ LD a2, X, 3 * SIZE ++ LD b2, Y, 3 * SIZE ++ addi.d X, X, 2 * SIZE ++ addi.d Y, Y, 2 * SIZE ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L17: ++ MADD s1, b1, a1, s1 ++ MADD s2, b1, a2, s2 ++ MADD s3, b2, a1, s3 ++ MADD s4, b2, a2, s4 ++ b .L999 ++ .align 3 ++ ++.L20: ++#ifdef F_INTERFACE ++ bgez INCX, .L21 ++ addi.d TEMP, N, -1 ++ mult TEMP, INCX ++ mflo TEMP ++ dsub X, X, TEMP ++ .align 3 ++ ++.L21: ++ bgez INCY, .L22 ++ addi.d TEMP, N, -1 ++ mult TEMP, INCY ++ mflo TEMP ++ dsub Y, Y, TEMP ++ .align 3 ++ ++.L22: ++#endif ++ bge $r0, I, .L25 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD b1, Y, 0 * SIZE ++ LD b2, Y, 1 * SIZE ++ add.d X, X, INCX ++ addi.d I, I, -1 ++ add.d Y, Y, 
INCY ++ bge $r0, I, .L24 ++ .align 3 ++ ++.L23: ++ MADD s1, b1, a1, s1 ++ LD a3, X, 0 * SIZE ++ MADD s2, b1, a2, s2 ++ LD a4, X, 1 * SIZE ++ MADD s3, b2, a1, s3 ++ LD b3, Y, 0 * SIZE ++ MADD s4, b2, a2, s4 ++ LD b4, Y, 1 * SIZE ++ add.d X, X, INCX ++ add.d Y, Y, INCY ++ MADD s1, b3, a3, s1 ++ LD a1, X, 0 * SIZE ++ MADD s2, b3, a4, s2 ++ LD a2, X, 1 * SIZE ++ MADD s3, b4, a3, s3 ++ LD b1, Y, 0 * SIZE ++ MADD s4, b4, a4, s4 ++ LD b2, Y, 1 * SIZE ++ add.d X, X, INCX ++ add.d Y, Y, INCY ++ MADD s1, b1, a1, s1 ++ LD a3, X, 0 * SIZE ++ MADD s2, b1, a2, s2 ++ LD a4, X, 1 * SIZE ++ MADD s3, b2, a1, s3 ++ LD b3, Y, 0 * SIZE ++ MADD s4, b2, a2, s4 ++ LD b4, Y, 1 * SIZE ++ add.d X, X, INCX ++ add.d Y, Y, INCY ++ MADD s1, b3, a3, s1 ++ LD a1, X, 0 * SIZE ++ MADD s2, b3, a4, s2 ++ LD a2, X, 1 * SIZE ++ MADD s3, b4, a3, s3 ++ LD b1, Y, 0 * SIZE ++ MADD s4, b4, a4, s4 ++ LD b2, Y, 1 * SIZE ++ add.d X, X, INCX ++ addi.d I, I, -1 ++ add.d Y, Y, INCY ++ blt $r0, I, .L23 ++ .align 3 ++ ++.L24: ++ MADD s1, b1, a1, s1 ++ LD a3, X, 0 * SIZE ++ MADD s2, b1, a2, s2 ++ LD a4, X, 1 * SIZE ++ MADD s3, b2, a1, s3 ++ LD b3, Y, 0 * SIZE ++ MADD s4, b2, a2, s4 ++ LD b4, Y, 1 * SIZE ++ add.d X, X, INCX ++ add.d Y, Y, INCY ++ MADD s1, b3, a3, s1 ++ LD a1, X, 0 * SIZE ++ MADD s2, b3, a4, s2 ++ LD a2, X, 1 * SIZE ++ MADD s3, b4, a3, s3 ++ LD b1, Y, 0 * SIZE ++ MADD s4, b4, a4, s4 ++ LD b2, Y, 1 * SIZE ++ add.d X, X, INCX ++ add.d Y, Y, INCY ++ MADD s1, b1, a1, s1 ++ LD a3, X, 0 * SIZE ++ MADD s2, b1, a2, s2 ++ LD a4, X, 1 * SIZE ++ MADD s3, b2, a1, s3 ++ LD b3, Y, 0 * SIZE ++ MADD s4, b2, a2, s4 ++ LD b4, Y, 1 * SIZE ++ MADD s1, b3, a3, s1 ++ add.d X, X, INCX ++ MADD s2, b3, a4, s2 ++ add.d Y, Y, INCY ++ MADD s3, b4, a3, s3 ++ MADD s4, b4, a4, s4 ++ .align 3 ++ ++.L25: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ .align 3 ++.L26: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD b1, Y, 0 * SIZE ++ LD b2, Y, 1 * SIZE ++ MADD s1, b1, a1, s1 ++ MADD s2, b1, a2, s2 ++ MADD s3, b2, a1, s3 ++ MADD s4, b2, a2, s4 ++ add.d X, X, INCX ++ add.d Y, Y, INCY ++ addi.d I, I, -1 ++ blt $r0, I, .L26 ++ .align 3 ++ ++.L999: ++#ifndef CONJ ++ SUB $f0, s1, s4 ++#else ++ ADD $f0, s1, s4 ++#endif ++#ifndef CONJ ++ ADD $f1, s3, s2 ++#else ++ SUB $f1, s3, s2 ++#endif ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/zgemm3m_kernel.S b/kernel/loongarch64/zgemm3m_kernel.S +new file mode 100644 +index 0000000..f9acb6c +--- /dev/null ++++ b/kernel/loongarch64/zgemm3m_kernel.S +@@ -0,0 +1,1359 @@ ++/*************************************************************************** ++Copyright (c) 2020, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define K $r6 ++#define A $r7 ++#define B $r8 ++#define C $r9 ++#define LDC $r10 ++ ++#define AO $r12 ++#define BO $r13 ++#define I $r17 ++#define J $r18 ++#define L $r11 ++#define CO1 $r14 ++#define CO2 $r15 ++#define CO3 $r23 ++#define CO4 $r24 ++#define CO5 $r25 ++#define CO6 $r26 ++#define CO7 $r27 ++#define CO8 $r28 ++ ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f28 ++#define a4 $f29 ++#define b1 $f23 ++#define b2 $f9 ++#define b3 $f10 ++#define b4 $f11 ++#define b5 $f12 ++#define b6 $f13 ++#define b7 $f14 ++#define b8 $f15 ++#define a5 b8 ++#define c11 $f16 ++#define c12 $f17 ++#define c21 $f3 ++#define c22 $f4 ++#define c31 $f2 ++#define c32 $f5 ++#define c41 $f6 ++#define c42 $f7 ++#define c51 $f18 ++#define c52 $f19 ++#define c61 $f20 ++#define c62 $f21 ++#define c71 $f24 ++#define c72 $f25 ++#define c81 $f26 ++#define c82 $f27 ++#define ALPHA_R $f0 ++#define ALPHA_I $f1 ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -128 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ SDARG $r28, $sp, 40 ++ fst.d $f24, $sp, 48 ++ fst.d $f25, $sp, 56 ++ fst.d $f26, $sp, 64 ++ fst.d $f27, $sp, 72 ++ fst.d $f28, $sp, 80 ++ fst.d $f29, $sp, 88 ++ slli.d LDC, LDC, ZBASE_SHIFT ++ srai.d J, N, 3 ++ bge $r0, J, .L30 ++.L10: ++ move CO1, C ++ MTC c11, $r0 ++ add.d CO2, C, LDC ++ move AO, A ++ add.d CO3, CO2, LDC ++ addi.d J, J, -1 ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ add.d CO5, CO4, LDC ++ MOV c31, c11 ++ add.d CO6, CO5, LDC ++ MOV c41, c11 ++ add.d CO7, CO6, LDC ++ MOV c51, c11 ++ add.d CO8, CO7, LDC ++ srai.d I, M, 1 ++ add.d C, CO8, LDC ++MOV c61, c11 ++ bge $r0, I, .L20 ++.L11: ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, B, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, K, 2 ++ MOV c32, c11 ++ LD b3, B, 2 * SIZE ++ MOV c42, c11 ++ LD b4, B, 3 * SIZE ++ MOV c52, c11 ++ LD b5, B, 4 * SIZE ++ MOV c62, c11 ++ LD b6, B, 8 * SIZE ++ MOV c72, c11 ++ LD b7, B, 12 * SIZE ++ MOV c82, c11 ++move BO, B ++ bge $r0, L, .L15 ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ bge $r0, L, .L13 ++ .align 3 ++.L12: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ LD a4, AO, 2 * SIZE ++ MADD c61, b2, a1, c61 ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, 
a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ LD a4, AO, 6 * SIZE ++ MADD c61, b2, a3, c61 ++ MADD c71, b3, a3, c71 ++ MADD c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ addi.d L, L, -1 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ blt $r0, L, .L12 ++ .align 3 ++ ++.L13: ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ MADD c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a4, c51 ++ MADD c61, b2, a4, c61 ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ MADD c31, b3, a3, c31 
++ MADD c41, b4, a3, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ MADD c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD c71, b3, a3, c71 ++ MADD c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a4, c21 ++ MADD c31, b3, a4, c31 ++ MADD c41, b4, a4, c41 ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD c71, b3, a4, c71 ++ MADD c81, b4, a4, c81 ++ MADD c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ .align 3 ++ ++.L15: ++ andi L, K, 3 ++ bge $r0, L, .L18 ++ .align 3 ++.L16: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ addi.d L, L, -1 ++ MADD c61, b2, a1, c61 ++ addi.d AO, AO, 2 * SIZE ++ MADD c71, b3, a1, c71 ++ addi.d BO, BO, 8 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ MADD c52, b5, a2, c52 ++ LD b5, BO, 4 * SIZE ++ MADD c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L16 ++.L18: ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ LD $f23, CO1, 2 * SIZE ++ LD $f9, CO1, 3 * SIZE ++ LD $f10, CO2, 0 * SIZE ++ MADD $f22, c11, ALPHA_R, $f22 ++ LD $f11, CO2, 1 * SIZE ++ MADD $f8, c11, ALPHA_I, $f8 ++ LD $f12, CO2, 2 * SIZE ++ MADD $f23, c12, ALPHA_R, $f23 ++ LD $f13, CO2, 3 * SIZE ++ MADD $f9, c12, ALPHA_I, $f9 ++ MADD $f10, c21, ALPHA_R, $f10 ++ ST $f22, CO1, 0 * SIZE ++ MADD $f11, c21, ALPHA_I, $f11 ++ ST $f8, CO1, 1 * SIZE ++ MADD $f12, c22, ALPHA_R, $f12 ++ ST $f23, CO1, 2 * SIZE ++ MADD $f13, c22, ALPHA_I, $f13 ++ ST $f9, CO1, 3 * SIZE ++ LD $f22, CO3, 0 * SIZE ++ LD $f8, CO3, 1 * SIZE ++ LD $f23, CO3, 2 * SIZE ++ LD $f9, CO3, 3 * SIZE ++ ST $f10, CO2, 0 * SIZE ++ ST $f11, CO2, 1 * SIZE ++ ST $f12, CO2, 2 * SIZE ++ ST $f13, CO2, 3 * SIZE ++ LD $f10, CO4, 0 * SIZE ++ LD $f11, CO4, 1 * SIZE ++ LD $f12, CO4, 2 * SIZE ++ LD $f13, CO4, 3 * SIZE ++ MADD $f22, c31, ALPHA_R, $f22 ++ MADD $f8, c31, ALPHA_I, $f8 ++ MADD $f23, c32, ALPHA_R, $f23 ++ MADD $f9, c32, ALPHA_I, $f9 ++ MADD $f10, c41, ALPHA_R, $f10 ++ ST $f22, CO3, 0 * SIZE ++ MADD $f11, c41, ALPHA_I, $f11 ++ ST $f8, CO3, 1 * SIZE ++ MADD $f12, c42, ALPHA_R, $f12 ++ ST $f23, CO3, 2 * SIZE ++ MADD $f13, c42, ALPHA_I, $f13 ++ ST $f9, CO3, 3 * SIZE ++ LD $f22, CO5, 0 * SIZE ++ LD $f8, CO5, 1 * SIZE ++ LD $f23, CO5, 2 * SIZE ++ LD $f9, CO5, 3 * SIZE ++ ST $f10, CO4, 0 * SIZE ++ ST $f11, CO4, 1 * SIZE ++ ST $f12, CO4, 2 * SIZE ++ ST $f13, CO4, 3 * SIZE ++ LD $f10, CO6, 0 * SIZE ++ LD $f11, CO6, 1 * SIZE ++ LD $f12, 
CO6, 2 * SIZE ++ LD $f13, CO6, 3 * SIZE ++ MADD $f22, c51, ALPHA_R, $f22 ++ addi.d CO1,CO1, 4 * SIZE ++ MADD $f8, c51, ALPHA_I, $f8 ++ addi.d CO2,CO2, 4 * SIZE ++ MADD $f23, c52, ALPHA_R, $f23 ++ addi.d CO3,CO3, 4 * SIZE ++ MADD $f9, c52, ALPHA_I, $f9 ++ addi.d CO4,CO4, 4 * SIZE ++ MADD $f10, c61, ALPHA_R, $f10 ++ ST $f22, CO5, 0 * SIZE ++ MADD $f11, c61, ALPHA_I, $f11 ++ ST $f8, CO5, 1 * SIZE ++ MADD $f12, c62, ALPHA_R, $f12 ++ ST $f23, CO5, 2 * SIZE ++ MADD $f13, c62, ALPHA_I, $f13 ++ ST $f9, CO5, 3 * SIZE ++ LD $f22, CO7, 0 * SIZE ++ LD $f8, CO7, 1 * SIZE ++ LD $f23, CO7, 2 * SIZE ++ LD $f9, CO7, 3 * SIZE ++ ST $f10, CO6, 0 * SIZE ++ ST $f11, CO6, 1 * SIZE ++ ST $f12, CO6, 2 * SIZE ++ ST $f13, CO6, 3 * SIZE ++ LD $f10, CO8, 0 * SIZE ++ addi.d I, I, -1 ++ LD $f11, CO8, 1 * SIZE ++MTC c11, $r0 ++ LD $f12, CO8, 2 * SIZE ++ LD $f13, CO8, 3 * SIZE ++ MADD $f22, c71, ALPHA_R, $f22 ++ addi.d CO5,CO5, 4 * SIZE ++ MADD $f8, c71, ALPHA_I, $f8 ++ addi.d CO6,CO6, 4 * SIZE ++ MADD $f23, c72, ALPHA_R, $f23 ++ addi.d CO7,CO7, 4 * SIZE ++ MADD $f9, c72, ALPHA_I, $f9 ++ addi.d CO8,CO8, 4 * SIZE ++ MADD $f10, c81, ALPHA_R, $f10 ++ ST $f22, CO7, -4 * SIZE ++ MADD $f11, c81, ALPHA_I, $f11 ++ ST $f8, CO7, -3 * SIZE ++ MADD $f12, c82, ALPHA_R, $f12 ++ ST $f23, CO7, -2 * SIZE ++ MADD $f13, c82, ALPHA_I, $f13 ++ ST $f9, CO7, -1 * SIZE ++ ST $f10, CO8, -4 * SIZE ++ MOV c21, c11 ++ ST $f11, CO8, -3 * SIZE ++ MOV c31, c11 ++ ST $f12, CO8, -2 * SIZE ++ MOV c41, c11 ++ ST $f13, CO8, -1 * SIZE ++ MOV c51, c11 ++MOV c61, c11 ++ blt $r0, I, .L11 ++ .align 3 ++ ++.L20: ++ andi I, M, 1 ++ MOV c61, c11 ++MOV c71, c11 ++ bge $r0, I, .L29 ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, K, 2 ++ MOV c81, c11 ++move BO, B ++ bge $r0, L, .L25 ++ .align 3 ++.L22: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 20 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 9 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 10 * SIZE ++ MADD c81, b4, a1, c81 ++ LD b4, BO, 11 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ MADD c51, b7, a2, c51 ++ LD b7, BO, 28 * SIZE ++ MADD c61, b2, a2, c61 ++ LD b2, BO, 17 * SIZE ++ MADD c71, b3, a2, c71 ++ LD b3, BO, 18 * SIZE ++ MADD c81, b4, a2, c81 ++ LD b4, BO, 19 * SIZE ++ LD a2, AO, 5 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 32 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 21 * SIZE ++ MADD c31, b3, a3, c31 ++ LD b3, BO, 22 * SIZE ++ MADD c41, b4, a3, c41 ++ LD b4, BO, 23 * SIZE ++ MADD c51, b5, a3, c51 ++ LD b5, BO, 36 * SIZE ++ MADD c61, b2, a3, c61 ++ LD b2, BO, 25 * SIZE ++ MADD c71, b3, a3, c71 ++ LD b3, BO, 26 * SIZE ++ MADD c81, b4, a3, c81 ++ LD b4, BO, 27 * SIZE ++ LD a3, AO, 2 * SIZE ++ addi.d BO, BO, 32 * SIZE ++ MADD c11, b6, a4, c11 ++ LD b6, BO, 8 * SIZE ++ MADD c21, b2, a4, c21 ++ LD b2, BO, -3 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, -2 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, -1 * SIZE ++ MADD c51, b7, a4, c51 ++ LD b7, BO, 12 * SIZE ++ MADD c61, b2, a4, c61 ++ 
LD b2, BO, 1 * SIZE ++ MADD c71, b3, a4, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a4, c81 ++ LD b4, BO, 3 * SIZE ++ LD a4, AO, 3 * SIZE ++ blt $r0, L, .L22 ++ .align 3 ++ ++.L25: ++ andi L, K, 3 ++ bge $r0, L, .L28 ++ .align 3 ++.L26: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ MOV a2, a2 ++ addi.d AO, AO, 1 * SIZE ++ addi.d BO, BO, 8 * SIZE ++ MADD c51, b5, a1, c51 ++ LD b5, BO, 4 * SIZE ++ MADD c61, b2, a1, c61 ++ LD b2, BO, 1 * SIZE ++ MADD c71, b3, a1, c71 ++ LD b3, BO, 2 * SIZE ++ MADD c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L26 ++.L28: ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ LD $f23, CO2, 0 * SIZE ++ LD $f9, CO2, 1 * SIZE ++ LD $f10, CO3, 0 * SIZE ++ MADD $f22, c11, ALPHA_R, $f22 ++ LD $f11, CO3, 1 * SIZE ++ MADD $f8, c11, ALPHA_I, $f8 ++ LD $f12, CO4, 0 * SIZE ++ MADD $f23, c21, ALPHA_R, $f23 ++ LD $f13, CO4, 1 * SIZE ++ MADD $f9, c21, ALPHA_I, $f9 ++ MADD $f10, c31, ALPHA_R, $f10 ++ ST $f22, CO1, 0 * SIZE ++ MADD $f11, c31, ALPHA_I, $f11 ++ ST $f8, CO1, 1 * SIZE ++ MADD $f12, c41, ALPHA_R, $f12 ++ ST $f23, CO2, 0 * SIZE ++ MADD $f13, c41, ALPHA_I, $f13 ++ ST $f9, CO2, 1 * SIZE ++ LD $f22, CO5, 0 * SIZE ++ LD $f8, CO5, 1 * SIZE ++ LD $f23, CO6, 0 * SIZE ++ LD $f9, CO6, 1 * SIZE ++ ST $f10, CO3, 0 * SIZE ++ ST $f11, CO3, 1 * SIZE ++ ST $f12, CO4, 0 * SIZE ++ ST $f13, CO4, 1 * SIZE ++ LD $f10, CO7, 0 * SIZE ++ MADD $f22, c51, ALPHA_R, $f22 ++ LD $f11, CO7, 1 * SIZE ++ MADD $f8, c51, ALPHA_I, $f8 ++ LD $f12, CO8, 0 * SIZE ++ MADD $f23, c61, ALPHA_R, $f23 ++ LD $f13, CO8, 1 * SIZE ++ MADD $f9, c61, ALPHA_I, $f9 ++ MADD $f10, c71, ALPHA_R, $f10 ++ ST $f22, CO5, 0 * SIZE ++ MADD $f11, c71, ALPHA_I, $f11 ++ ST $f8, CO5, 1 * SIZE ++ MADD $f12, c81, ALPHA_R, $f12 ++ ST $f23, CO6, 0 * SIZE ++ MADD $f13, c81, ALPHA_I, $f13 ++ ST $f9, CO6, 1 * SIZE ++ ST $f10, CO7, 0 * SIZE ++ ST $f11, CO7, 1 * SIZE ++ ST $f12, CO8, 0 * SIZE ++ ST $f13, CO8, 1 * SIZE ++ .align 3 ++ ++.L29: ++move B, BO ++ blt $r0, J, .L10 ++ .align 3 ++ ++.L30: ++ andi J, N, 4 ++move AO, A ++ bge $r0, J, .L50 ++ move CO1, C ++MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ add.d C, CO4, LDC ++ MOV c31, c11 ++ srai.d I, M, 1 ++MOV c41, c11 ++ bge $r0, I, .L40 ++.L31: ++ LD a1, AO, 0 * SIZE ++ LD a3, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ MOV c32, c11 ++ LD b4, B, 3 * SIZE ++ MOV c42, c11 ++ LD b5, B, 4 * SIZE ++ srai.d L, K, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L35 ++ .align 3 ++.L32: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD c11, b5, a1, c11 ++ LD a2, AO, 3 * SIZE ++ MADD c21, b2, a1, c21 ++ MADD c31, b3, a1, c31 ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 8 * SIZE ++ MADD c12, b5, a2, c12 ++ LD b5, BO, 20 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 9 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 10 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 11 * SIZE ++ MADD c11, b6, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD c21, b2, a3, c21 ++ 
MADD c31, b3, a3, c31 ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 6 * SIZE ++ MADD c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD c11, b7, a3, c11 ++ LD a2, AO, 7 * SIZE ++ MADD c21, b2, a3, c21 ++ addi.d AO, AO, 8 * SIZE ++ MADD c31, b3, a3, c31 ++ addi.d BO, BO, 16 * SIZE ++ MADD c41, b4, a3, c41 ++ LD a3, AO, 4 * SIZE ++ MADD c12, b7, a2, c12 ++ LD b7, BO, 12 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 1 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 2 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L32 ++ .align 3 ++ ++.L35: ++ andi L, K, 3 ++ bge $r0, L, .L38 ++ .align 3 ++.L36: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD c31, b3, a1, c31 ++ addi.d AO, AO, 2 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 0 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 4 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L36 ++.L38: ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ LD $f23, CO1, 2 * SIZE ++ LD $f9, CO1, 3 * SIZE ++ LD $f10, CO2, 0 * SIZE ++ LD $f11, CO2, 1 * SIZE ++ LD $f12, CO2, 2 * SIZE ++ LD $f13, CO2, 3 * SIZE ++ MADD $f22, c11, ALPHA_R, $f22 ++ MADD $f8, c11, ALPHA_I, $f8 ++ MADD $f23, c12, ALPHA_R, $f23 ++ MADD $f9, c12, ALPHA_I, $f9 ++ MADD $f10, c21, ALPHA_R, $f10 ++ ST $f22, CO1, 0 * SIZE ++ MADD $f11, c21, ALPHA_I, $f11 ++ ST $f8, CO1, 1 * SIZE ++ MADD $f12, c22, ALPHA_R, $f12 ++ ST $f23, CO1, 2 * SIZE ++ MADD $f13, c22, ALPHA_I, $f13 ++ ST $f9, CO1, 3 * SIZE ++ LD $f22, CO3, 0 * SIZE ++ LD $f8, CO3, 1 * SIZE ++ LD $f23, CO3, 2 * SIZE ++ LD $f9, CO3, 3 * SIZE ++ ST $f10, CO2, 0 * SIZE ++ MADD $f22, c31, ALPHA_R, $f22 ++ ST $f11, CO2, 1 * SIZE ++ MADD $f8, c31, ALPHA_I, $f8 ++ ST $f12, CO2, 2 * SIZE ++ MADD $f23, c32, ALPHA_R, $f23 ++ ST $f13, CO2, 3 * SIZE ++ MADD $f9, c32, ALPHA_I, $f9 ++ LD $f10, CO4, 0 * SIZE ++ LD $f11, CO4, 1 * SIZE ++ LD $f12, CO4, 2 * SIZE ++ LD $f13, CO4, 3 * SIZE ++ MADD $f10, c41, ALPHA_R, $f10 ++ addi.d CO1,CO1, 4 * SIZE ++ MADD $f11, c41, ALPHA_I, $f11 ++ addi.d CO2,CO2, 4 * SIZE ++ MADD $f12, c42, ALPHA_R, $f12 ++ addi.d CO3,CO3, 4 * SIZE ++ MADD $f13, c42, ALPHA_I, $f13 ++ addi.d CO4,CO4, 4 * SIZE ++ ST $f22, CO3, -4 * SIZE ++ addi.d I, I, -1 ++ ST $f8, CO3, -3 * SIZE ++ ST $f23, CO3, -2 * SIZE ++ ST $f9, CO3, -1 * SIZE ++ ST $f10, CO4, -4 * SIZE ++MTC c11, $r0 ++ ST $f11, CO4, -3 * SIZE ++ MOV c21, c11 ++ ST $f12, CO4, -2 * SIZE ++ MOV c31, c11 ++ ST $f13, CO4, -1 * SIZE ++MOV c41, c11 ++ blt $r0, I, .L31 ++ .align 3 ++ ++.L40: ++ andi I, M, 1 ++MOV c61, c11 ++ bge $r0, I, .L49 ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c81, c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, K, 2 ++move BO, B ++ bge $r0, L, .L45 ++ .align 3 ++.L42: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 16 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ addi.d L, L, -1 ++ MADD c11, b5, a2, c11 ++ LD b5, BO, 20 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a2, c41 ++ LD 
b4, BO, 11 * SIZE ++ LD a2, AO, 2 * SIZE ++ addi.d AO, AO, 4 * SIZE ++ MADD c11, b6, a2, c11 ++ LD b6, BO, 24 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 13 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 14 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 15 * SIZE ++ LD a2, AO, -1 * SIZE ++ addi.d BO, BO, 16 * SIZE ++ MADD c11, b7, a2, c11 ++ LD b7, BO, 12 * SIZE ++ MADD c21, b2, a2, c21 ++ LD b2, BO, 1 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 2 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 3 * SIZE ++ LD a2, AO, 1 * SIZE ++ blt $r0, L, .L42 ++ .align 3 ++ ++.L45: ++ andi L, K, 3 ++ bge $r0, L, .L48 ++ .align 3 ++.L46: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a1, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a1, c41 ++ LD a1, AO, 1 * SIZE ++ LD b4, BO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++ MOV a2, a2 ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L46 ++.L48: ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ LD $f23, CO2, 0 * SIZE ++ LD $f9, CO2, 1 * SIZE ++ LD $f10, CO3, 0 * SIZE ++ MADD $f22, c11, ALPHA_R, $f22 ++ LD $f11, CO3, 1 * SIZE ++ MADD $f8, c11, ALPHA_I, $f8 ++ LD $f12, CO4, 0 * SIZE ++ MADD $f23, c21, ALPHA_R, $f23 ++ LD $f13, CO4, 1 * SIZE ++ MADD $f9, c21, ALPHA_I, $f9 ++ MADD $f10, c31, ALPHA_R, $f10 ++ ST $f22, CO1, 0 * SIZE ++ MADD $f11, c31, ALPHA_I, $f11 ++ ST $f8, CO1, 1 * SIZE ++ MADD $f12, c41, ALPHA_R, $f12 ++ ST $f23, CO2, 0 * SIZE ++ MADD $f13, c41, ALPHA_I, $f13 ++ ST $f9, CO2, 1 * SIZE ++ ST $f10, CO3, 0 * SIZE ++ ST $f11, CO3, 1 * SIZE ++ ST $f12, CO4, 0 * SIZE ++ ST $f13, CO4, 1 * SIZE ++ .align 3 ++ ++.L49: ++ move B, BO ++ .align 3 ++ ++.L50: ++ andi J, N, 2 ++move AO, A ++ bge $r0, J, .L70 ++ move CO1, C ++ add.d CO2, C, LDC ++ srai.d I, M, 1 ++add.d C, CO2, LDC ++ bge $r0, I, .L60 ++.L51: ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, K, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L55 ++ .align 3 ++.L52: ++ MADD c11, b1, a1, c11 ++ LD a3, AO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b4, BO, 3 * SIZE ++ MADD c12, b1, a2, c12 ++ LD a4, AO, 3 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b1, BO, 8 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a1, AO, 8 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 5 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 6 * SIZE ++ MADD c11, b5, a5, c11 ++ LD a3, AO, 6 * SIZE ++ MADD c21, b2, a5, c21 ++ LD b4, BO, 7 * SIZE ++ MADD c12, b5, a2, c12 ++ LD a4, AO, 7 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b5, BO, 12 * SIZE ++ MADD c11, b3, a3, c11 ++ LD a5, AO, 12 * SIZE ++ MADD c21, b4, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c12, b3, a4, c12 ++ LD a2, AO, 9 * SIZE ++ MADD c22, b4, a4, c22 ++ LD b3, BO, 10 * SIZE ++ addi.d AO, AO, 8 * SIZE ++ addi.d L, L, -1 ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L52 ++ .align 3 ++ ++.L55: ++ andi L, K, 3 ++ bge $r0, L, .L58 ++ .align 3 ++.L56: ++ MADD c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD c22, b2, a2, c22 ++ LD b2, BO, 3 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L56 ++.L58: ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ LD $f23, CO1, 2 * SIZE ++ LD $f9, CO1, 3 * SIZE ++ LD $f10, CO2, 0 * SIZE ++ LD $f11, 
CO2, 1 * SIZE ++ LD $f12, CO2, 2 * SIZE ++ LD $f13, CO2, 3 * SIZE ++ MADD $f22, c11, ALPHA_R, $f22 ++ addi.d I, I, -1 ++ MADD $f8, c11, ALPHA_I, $f8 ++ addi.d CO1,CO1, 4 * SIZE ++ MADD $f23, c12, ALPHA_R, $f23 ++ addi.d CO2,CO2, 4 * SIZE ++ MADD $f9, c12, ALPHA_I, $f9 ++ MADD $f10, c21, ALPHA_R, $f10 ++ MADD $f11, c21, ALPHA_I, $f11 ++ MADD $f12, c22, ALPHA_R, $f12 ++ MADD $f13, c22, ALPHA_I, $f13 ++ ST $f22, CO1, -4 * SIZE ++ ST $f8, CO1, -3 * SIZE ++ ST $f23, CO1, -2 * SIZE ++ ST $f9, CO1, -1 * SIZE ++ ST $f10, CO2, -4 * SIZE ++ ST $f11, CO2, -3 * SIZE ++ ST $f12, CO2, -2 * SIZE ++ ST $f13, CO2, -1 * SIZE ++ blt $r0, I, .L51 ++ .align 3 ++ ++.L60: ++ andi I, M, 1 ++ bge $r0, I, .L69 ++ srai.d L, K, 2 ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ MOV c31, c11 ++ LD a4, AO, 3 * SIZE ++ MOV c41, c11 ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L65 ++ .align 3 ++.L62: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 4 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 5 * SIZE ++ MADD c31, b3, a2, c31 ++ LD b3, BO, 6 * SIZE ++ MADD c41, b4, a2, c41 ++ LD b4, BO, 7 * SIZE ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ MADD c11, b1, a3, c11 ++ LD b1, BO, 8 * SIZE ++ MADD c21, b2, a3, c21 ++ LD b2, BO, 9 * SIZE ++ MADD c31, b3, a4, c31 ++ LD b3, BO, 10 * SIZE ++ MADD c41, b4, a4, c41 ++ LD b4, BO, 11 * SIZE ++ LD a3, AO, 6 * SIZE ++ LD a4, AO, 7 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L62 ++ .align 3 ++ ++.L65: ++ andi L, K, 3 ++ bge $r0, L, .L68 ++ .align 3 ++.L66: ++ MADD c11, b1, a1, c11 ++ LD b1, BO, 2 * SIZE ++ MADD c21, b2, a1, c21 ++ LD b2, BO, 3 * SIZE ++ LD a1, AO, 1 * SIZE ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 2 * SIZE ++ blt $r0, L, .L66 ++.L68: ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ LD $f23, CO2, 0 * SIZE ++ LD $f9, CO2, 1 * SIZE ++ ADD c11, c11, c31 ++ ADD c21, c21, c41 ++ MADD $f22, c11, ALPHA_R, $f22 ++ MADD $f8, c11, ALPHA_I, $f8 ++ MADD $f23, c21, ALPHA_R, $f23 ++ MADD $f9, c21, ALPHA_I, $f9 ++ ST $f22, CO1, 0 * SIZE ++ ST $f8, CO1, 1 * SIZE ++ ST $f23, CO2, 0 * SIZE ++ ST $f9, CO2, 1 * SIZE ++ .align 3 ++ ++.L69: ++ move B, BO ++ .align 3 ++ ++.L70: ++ andi J, N, 1 ++move AO, A ++ bge $r0, J, .L999 ++ move CO1, C ++ srai.d I, M, 1 ++add.d C, CO1, LDC ++ bge $r0, I, .L80 ++.L71: ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a5, AO, 4 * SIZE ++ LD b1, B, 0 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ LD b3, B, 2 * SIZE ++ LD b5, B, 4 * SIZE ++ srai.d L, K, 2 ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++move BO, B ++ bge $r0, L, .L75 ++ .align 3 ++.L72: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 2 * SIZE ++ LD a2, AO, 3 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 4 * SIZE ++ LD a2, AO, 5 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ LD a1, AO, 6 * SIZE ++ LD a2, AO, 7 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 8 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L72 ++ .align 3 ++ ++.L75: ++ andi L, K, 3 ++ bge $r0, L, .L78 ++ .align 3 ++.L76: ++ LD a1, AO, 0 * SIZE ++ LD a2, AO, 1 * SIZE ++ LD b1, BO, 0 * 
SIZE ++ MADD c11, b1, a1, c11 ++ MADD c12, b1, a2, c12 ++ addi.d L, L, -1 ++ addi.d AO, AO, 2 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L76 ++.L78: ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ LD $f23, CO1, 2 * SIZE ++ LD $f9, CO1, 3 * SIZE ++ ADD c11, c11, c21 ++ addi.d I, I, -1 ++ ADD c12, c12, c22 ++ addi.d CO1,CO1, 4 * SIZE ++ MADD $f22, c11, ALPHA_R, $f22 ++ MADD $f8, c11, ALPHA_I, $f8 ++ MADD $f23, c12, ALPHA_R, $f23 ++ MADD $f9, c12, ALPHA_I, $f9 ++ ST $f22, CO1, -4 * SIZE ++ ST $f8, CO1, -3 * SIZE ++ ST $f23, CO1, -2 * SIZE ++ ST $f9, CO1, -1 * SIZE ++ blt $r0, I, .L71 ++ .align 3 ++ ++.L80: ++ andi I, M, 1 ++ bge $r0, I, .L89 ++ LD a1, AO, 0 * SIZE ++MTC c11, $r0 ++ LD a2, AO, 1 * SIZE ++ MOV c21, c11 ++ LD a3, AO, 2 * SIZE ++ LD a4, AO, 3 * SIZE ++ LD b1, B, 0 * SIZE ++ LD b2, B, 1 * SIZE ++ LD b3, B, 2 * SIZE ++ LD b4, B, 3 * SIZE ++ LD b5, B, 4 * SIZE ++ LD b6, B, 8 * SIZE ++ LD b7, B, 12 * SIZE ++ srai.d L, K, 2 ++move BO, B ++ bge $r0, L, .L85 ++ .align 3 ++.L82: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 1 * SIZE ++ LD b1, BO, 1 * SIZE ++ MADD c21, b1, a1, c21 ++ LD a1, AO, 2 * SIZE ++ LD b1, BO, 2 * SIZE ++ MADD c11, b1, a1, c11 ++ LD a1, AO, 3 * SIZE ++ LD b1, BO, 3 * SIZE ++ MADD c21, b1, a1, c21 ++ addi.d L, L, -1 ++ addi.d AO, AO, 4 * SIZE ++addi.d BO, BO, 4 * SIZE ++ blt $r0, L, .L82 ++ .align 3 ++ ++.L85: ++ andi L, K, 3 ++ bge $r0, L, .L88 ++ .align 3 ++.L86: ++ LD a1, AO, 0 * SIZE ++ LD b1, BO, 0 * SIZE ++ MADD c11, b1, a1, c11 ++ addi.d L, L, -1 ++ addi.d AO, AO, 1 * SIZE ++addi.d BO, BO, 1 * SIZE ++ blt $r0, L, .L86 ++.L88: ++ LD $f22, CO1, 0 * SIZE ++ LD $f8, CO1, 1 * SIZE ++ ADD c11, c11, c21 ++ MADD $f22, c11, ALPHA_R, $f22 ++ MADD $f8, c11, ALPHA_I, $f8 ++ ST $f22, CO1, 0 * SIZE ++ ST $f8, CO1, 1 * SIZE ++ .align 3 ++ ++.L89: ++ move B, BO ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 16 ++ LDARG $r26, $sp, 24 ++ LDARG $r27, $sp, 32 ++ LDARG $r28, $sp, 40 ++ fld.d $f24, $sp, 48 ++ fld.d $f25, $sp, 56 ++ fld.d $f26, $sp, 64 ++ fld.d $f27, $sp, 72 ++ fld.d $f28, $sp, 80 ++ fld.d $f29, $sp, 88 ++ addi.d $sp, $sp, 128 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/zgemm_kernel.S b/kernel/loongarch64/zgemm_kernel.S +new file mode 100644 +index 0000000..2d50d41 +--- /dev/null ++++ b/kernel/loongarch64/zgemm_kernel.S +@@ -0,0 +1,1047 @@ ++/*************************************************************************** ++Copyright (c) 2020, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define K $r6 ++#define A $r7 ++#define B $r8 ++#define C $r9 ++#define LDC $r10 ++ ++#define AO $r12 ++#define BO $r13 ++#define I $r17 ++#define J $r18 ++#define L $r25 ++#define CO1 $r14 ++#define CO2 $r15 ++#define CO3 $r23 ++#define CO4 $r24 ++ ++#if defined(TRMMKERNEL) ++#define OFFSET $r11 ++#define KK $r26 ++#define TEMP $r27 ++#endif ++ ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f28 ++#define a4 $f29 ++#define b1 $f23 ++#define b2 $f9 ++#define b3 $f10 ++#define b4 $f11 ++#define b5 $f12 ++#define b6 $f13 ++#define b7 $f14 ++#define b8 $f15 ++#define a5 b8 ++#define c11 $f16 ++#define c12 $f17 ++#define c21 $f3 ++#define c22 $f4 ++#define c31 $f2 ++#define c32 $f5 ++#define c41 $f6 ++#define c42 $f7 ++#define c51 $f18 ++#define c52 $f19 ++#define c61 $f20 ++#define c62 $f21 ++#define c71 $f24 ++#define c72 $f25 ++#define c81 $f26 ++#define c82 $f27 ++#define ALPHA_R $f0 ++#define ALPHA_I $f1 ++ ++#if defined(NN) || defined(NT) || defined(TN) || defined(TT) ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 MADD ++#define MADD4 NMSUB ++#endif ++ ++#if defined(NR) || defined(NC) || defined(TR) || defined(TC) ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 NMSUB ++#define MADD4 MADD ++#endif ++ ++#if defined(RN) || defined(RT) || defined(CN) || defined(CT) ++#define MADD1 MADD ++#define MADD2 NMSUB ++#define MADD3 MADD ++#define MADD4 MADD ++#endif ++ ++#if defined(RR) || defined(RC) || defined(CR) || defined(CC) ++#define MADD1 MADD ++#define MADD2 NMSUB ++#define MADD3 NMSUB ++#define MADD4 NMSUB ++#endif ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -128 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 64 ++ fst.d $f24, $sp, 16 ++ fst.d $f25, $sp, 24 ++ fst.d $f26, $sp, 32 ++ fst.d $f27, $sp, 40 ++ fst.d $f28, $sp, 48 ++ fst.d $f29, $sp, 56 ++#if defined(TRMMKERNEL) ++ SDARG $r26, $sp, 72 ++ SDARG $r27, $sp, 80 ++#endif ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 88 ++ fst.d $f19, $sp, 96 ++ fst.d $f20, $sp, 104 ++ fst.d $f21, $sp, 112 ++#endif ++ slli.d LDC, LDC, ZBASE_SHIFT ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ sub.d KK, $r0, OFFSET ++#endif ++ srai.d J, N, 2 ++nop ++ bge $r0, J, .L20 ++.L10: ++ move CO1, C ++ MTC c11, $r0 ++ add.d CO2, C, LDC ++ move AO, A ++ add.d CO3, CO2, LDC ++ addi.d J, J, -1 ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ MOV c31, c11 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move KK, OFFSET ++#endif ++ MOV c41, c11 ++ MOV c51, c11 ++ move I, M ++ add.d C, CO4, LDC ++ MOV c61, c11 ++ bge $r0, I, .L19 ++.L11: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, ZBASE_SHIFT ++ slli.d TEMP, KK, 2 + ZBASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * 
SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ MOV c32, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c42, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c52, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c62, c11 ++ LD b6, BO, 8 * SIZE ++ MOV c72, c11 ++ LD b7, BO, 12 * SIZE ++ MOV c82, c11 ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 1 ++#else ++ addi.d TEMP, KK, 4 ++#endif ++ srai.d L, TEMP, 2 ++ bge $r0, L, .L15 ++#else ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, B, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, K, 2 ++ MOV c32, c11 ++ LD b3, B, 2 * SIZE ++ MOV c42, c11 ++ LD b4, B, 3 * SIZE ++ MOV c52, c11 ++ LD b5, B, 4 * SIZE ++ MOV c62, c11 ++ LD b6, B, 8 * SIZE ++ MOV c72, c11 ++ LD b7, B, 12 * SIZE ++ MOV c82, c11 ++move BO, B ++ bge $r0, L, .L15 ++#endif ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ bge $r0, L, .L13 ++ .align 3 ++.L12: ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c51, b5, a1, c51 ++ MADD3 c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD1 c71, b3, a1, c71 ++ MADD3 c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD1 c51, b7, a4, c51 ++ MADD3 c61, b2, a4, c61 ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD1 c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD3 c21, b2, a3, c21 ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD1 c51, b5, a3, c51 ++ MADD3 c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD1 c71, b3, a3, c71 ++ MADD3 c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ addi.d L, L, -1 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD1 c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD3 c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD 
b2, BO, 1 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ blt $r0, L, .L12 ++ .align 3 ++ ++.L13: ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c51, b5, a1, c51 ++ MADD3 c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD1 c71, b3, a1, c71 ++ MADD3 c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD1 c51, b7, a4, c51 ++ MADD3 c61, b2, a4, c61 ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD1 c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD3 c21, b2, a3, c21 ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD1 c51, b5, a3, c51 ++ MADD3 c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD1 c71, b3, a3, c71 ++ MADD3 c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD1 c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD3 c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ .align 3 ++ ++.L15: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L18 ++ .align 3 ++.L16: ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c51, b5, a1, c51 ++ addi.d L, L, -1 ++ MADD3 c61, b2, a1, c61 ++ addi.d AO, AO, 2 * SIZE ++ MADD1 c71, b3, a1, c71 ++ addi.d BO, BO, 8 * SIZE ++ MADD3 c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 4 * SIZE ++ MADD4 c62, b2, a2, 
c62 ++ LD b2, BO, 1 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L16 ++.L18: ++#ifndef TRMMKERNEL ++ LD b1, CO1, 0 * SIZE ++ ADD c11, c11, c22 ++ LD b2, CO1, 1 * SIZE ++ ADD c12, c12, c21 ++ LD b3, CO2, 0 * SIZE ++ ADD c31, c31, c42 ++ LD b4, CO2, 1 * SIZE ++ ADD c32, c32, c41 ++ LD b5, CO3, 0 * SIZE ++ ADD c51, c51, c62 ++ LD b6, CO3, 1 * SIZE ++ ADD c52, c52, c61 ++ LD b7, CO4, 0 * SIZE ++ ADD c71, c71, c82 ++ LD b8, CO4, 1 * SIZE ++ ADD c72, c72, c81 ++ MADD b1, c11, ALPHA_R, b1 ++ addi.d CO1,CO1, 2 * SIZE ++ MADD b2, c12, ALPHA_R, b2 ++ addi.d CO2,CO2, 2 * SIZE ++ MADD b3, c31, ALPHA_R, b3 ++ addi.d CO3,CO3, 2 * SIZE ++ MADD b4, c32, ALPHA_R, b4 ++ addi.d CO4,CO4, 2 * SIZE ++ MADD b5, c51, ALPHA_R, b5 ++ addi.d I, I, -1 ++ MADD b6, c52, ALPHA_R, b6 ++ MADD b7, c71, ALPHA_R, b7 ++ MADD b8, c72, ALPHA_R, b8 ++ NMSUB b1, c12, ALPHA_I, b1 ++ MADD b2, c11, ALPHA_I, b2 ++ MTC c11, $r0 ++ NMSUB b3, c32, ALPHA_I, b3 ++ MADD b4, c31, ALPHA_I, b4 ++ ST b1, CO1, -2 * SIZE ++ NMSUB b5, c52, ALPHA_I, b5 ++ ST b2, CO1, -1 * SIZE ++ MADD b6, c51, ALPHA_I, b6 ++ ST b3, CO2, -2 * SIZE ++ NMSUB b7, c72, ALPHA_I, b7 ++ ST b4, CO2, -1 * SIZE ++ MADD b8, c71, ALPHA_I, b8 ++ ST b5, CO3, -2 * SIZE ++ MOV c21, c11 ++ ST b6, CO3, -1 * SIZE ++ MOV c31, c11 ++ ST b7, CO4, -2 * SIZE ++ MOV c41, c11 ++ ST b8, CO4, -1 * SIZE ++ MOV c51, c11 ++#else ++ ADD c11, c11, c22 ++ addi.d CO1,CO1, 2 * SIZE ++ ADD c12, c12, c21 ++ addi.d CO2,CO2, 2 * SIZE ++ ADD c31, c31, c42 ++ addi.d CO3,CO3, 2 * SIZE ++ ADD c32, c32, c41 ++ addi.d CO4,CO4, 2 * SIZE ++ ADD c51, c51, c62 ++ addi.d I, I, -1 ++ ADD c52, c52, c61 ++ ADD c71, c71, c82 ++ ADD c72, c72, c81 ++ MUL b1, ALPHA_R, c11 ++ MUL b2, ALPHA_R, c12 ++ MUL b3, ALPHA_R, c31 ++ MUL b4, ALPHA_R, c32 ++ MUL b5, ALPHA_R, c51 ++ MUL b6, ALPHA_R, c52 ++ MUL b7, ALPHA_R, c71 ++ MUL b8, ALPHA_R, c72 ++ NMSUB b1, c12, ALPHA_I, b1 ++ MADD b2, c11, ALPHA_I, b2 ++ MTC c11, $r0 ++ NMSUB b3, c32, ALPHA_I, b3 ++ MADD b4, c31, ALPHA_I, b4 ++ ST b1, CO1, -2 * SIZE ++ NMSUB b5, c52, ALPHA_I, b5 ++ ST b2, CO1, -1 * SIZE ++ MADD b6, c51, ALPHA_I, b6 ++ ST b3, CO2, -2 * SIZE ++ NMSUB b7, c72, ALPHA_I, b7 ++ ST b4, CO2, -1 * SIZE ++ MADD b8, c71, ALPHA_I, b8 ++ ST b5, CO3, -2 * SIZE ++ MOV c21, c11 ++ ST b6, CO3, -1 * SIZE ++ MOV c31, c11 ++ ST b7, CO4, -2 * SIZE ++ MOV c41, c11 ++ ST b8, CO4, -1 * SIZE ++ MOV c51, c11 ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -1 ++#else ++ addi.d TEMP, TEMP, -4 ++#endif ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 1 ++#endif ++#endif ++MOV c61, c11 ++ blt $r0, I, .L11 ++ .align 3 ++ ++.L19: ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addi.d KK, KK, 4 ++#endif ++move B, BO ++ blt $r0, J, .L10 ++ .align 3 ++ ++.L20: ++ andi J, N, 2 ++ MTC c11, $r0 ++move CO1, C ++ bge $r0, J, .L30 ++ add.d CO2, C, LDC ++ add.d C, CO2, LDC ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move KK, OFFSET ++#endif ++ move I, M ++move AO, A ++ bge $r0, I, .L29 ++ .align 3 ++ ++.L21: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d L, KK, ZBASE_SHIFT ++ slli.d TEMP, KK, 1 + ZBASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c31, c11 ++ LD a3, 
AO, 4 * SIZE ++ MOV c41, c11 ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ MOV c12, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c22, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c32, c11 ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 1 ++#else ++ addi.d TEMP, KK, 2 ++#endif ++ srai.d L, TEMP, 2 ++MOV c42, c11 ++ bge $r0, L, .L25 ++#else ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, B, 0 * SIZE ++ MOV c31, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c41, c11 ++ LD b2, B, 1 * SIZE ++ srai.d L, K, 2 ++ LD b3, B, 2 * SIZE ++ MOV c12, c11 ++ LD b4, B, 3 * SIZE ++ MOV c22, c11 ++ LD b5, B, 4 * SIZE ++ MOV c32, c11 ++ MOV c42, c11 ++move BO, B ++ bge $r0, L, .L25 ++#endif ++ .align 3 ++.L22: ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c11, b5, a1, c11 ++ LD a2, AO, 3 * SIZE ++ MADD3 c21, b2, a1, c21 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ LD a1, AO, 8 * SIZE ++ MADD2 c12, b5, a2, c12 ++ LD b5, BO, 12 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 9 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 10 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 11 * SIZE ++ MADD1 c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD3 c21, b2, a3, c21 ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ LD a3, AO, 6 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD1 c11, b5, a3, c11 ++ LD a2, AO, 7 * SIZE ++ MADD3 c21, b2, a3, c21 ++ addi.d AO, AO, 8 * SIZE ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ LD a3, AO, 4 * SIZE ++ MADD2 c12, b5, a2, c12 ++ LD b5, BO, 20 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 17 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 18 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 19 * SIZE ++addi.d BO, BO, 16 * SIZE ++ blt $r0, L, .L22 ++ .align 3 ++ ++.L25: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L28 ++ .align 3 ++.L26: ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD1 c31, b3, a1, c31 ++ addi.d BO, BO, 4 * SIZE ++ MADD3 c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 0 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 1 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 2 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 3 * SIZE ++addi.d AO, AO, 2 * SIZE ++ blt $r0, L, .L26 ++.L28: ++#ifndef TRMMKERNEL ++ LD b1, CO1, 0 * SIZE ++ ADD c11, c11, c22 ++ LD b2, CO1, 1 * SIZE ++ ADD c12, c12, c21 ++ LD b3, CO2, 0 * SIZE ++ ADD c31, c31, c42 ++ LD b4, CO2, 1 * SIZE ++ ADD c32, c32, c41 ++ MADD b1, c11, ALPHA_R, b1 ++ addi.d CO1,CO1, 2 * SIZE ++ MADD b2, c12, ALPHA_R, b2 ++ addi.d CO2,CO2, 2 * SIZE ++ MADD b3, c31, ALPHA_R, b3 ++ addi.d I, I, -1 ++ MADD b4, c32, ALPHA_R, b4 ++ NMSUB b1, c12, ALPHA_I, b1 ++ MADD b2, c11, ALPHA_I, b2 ++ MTC c11, $r0 ++ NMSUB b3, c32, ALPHA_I, b3 ++ MADD b4, c31, ALPHA_I, b4 ++ ST b1, CO1, -2 * SIZE ++ ST b2, CO1, -1 * SIZE ++ ST b3, CO2, -2 * SIZE ++#else ++ ADD c11, c11, c22 ++ ADD c12, c12, c21 ++ ADD c31, c31, c42 ++ ADD c32, c32, c41 ++ MUL b1, ALPHA_R, c11 
++ addi.d CO1,CO1, 2 * SIZE ++ MUL b2, ALPHA_R, c12 ++ addi.d CO2,CO2, 2 * SIZE ++ MUL b3, ALPHA_R, c31 ++ addi.d I, I, -1 ++ MUL b4, ALPHA_R, c32 ++ NMSUB b1, c12, ALPHA_I, b1 ++ MADD b2, c11, ALPHA_I, b2 ++ MTC c11, $r0 ++ NMSUB b3, c32, ALPHA_I, b3 ++ MADD b4, c31, ALPHA_I, b4 ++ ST b1, CO1, -2 * SIZE ++ ST b2, CO1, -1 * SIZE ++ ST b3, CO2, -2 * SIZE ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -1 ++#else ++ addi.d TEMP, TEMP, -2 ++#endif ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 1 ++#endif ++#endif ++ ST b4, CO2, -1 * SIZE ++ blt $r0, I, .L21 ++ .align 3 ++ ++.L29: ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addi.d KK, KK, 2 ++#endif ++ move B, BO ++ .align 3 ++ ++.L30: ++ andi J, N, 1 ++ MTC c11, $r0 ++move CO1, C ++ bge $r0, J, .L999 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move KK, OFFSET ++#endif ++ move I, M ++ add.d C, CO1, LDC ++move AO, A ++ bge $r0, I, .L39 ++ .align 3 ++ ++.L31: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move BO, B ++#else ++ slli.d TEMP, KK, ZBASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, B, TEMP ++#endif ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c31, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c41, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c12, c11 ++ MOV c22, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c32, c11 ++ LD b3, BO, 4 * SIZE ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d TEMP, K, KK ++#elif defined(LEFT) ++ addi.d TEMP, KK, 1 ++#else ++ addi.d TEMP, KK, 1 ++#endif ++ srai.d L, TEMP, 2 ++MOV c42, c11 ++ bge $r0, L, .L35 ++#else ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, B, 0 * SIZE ++ MOV c31, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c41, c11 ++ LD b2, B, 1 * SIZE ++ MOV c12, c11 ++ srai.d L, K, 2 ++ MOV c22, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c32, c11 ++ LD b3, B, 4 * SIZE ++ MOV c42, c11 ++move BO, B ++ bge $r0, L, .L35 ++#endif ++ .align 3 ++.L32: ++ MADD1 c11, b1, a1, c11 ++ LD b4, BO, 3 * SIZE ++ MADD3 c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD a2, AO, 3 * SIZE ++ MADD1 c11, b1, a1, c11 ++ LD b2, BO, 5 * SIZE ++ MADD3 c21, b4, a1, c21 ++ LD a1, AO, 8 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD4 c22, b4, a2, c22 ++ LD a2, AO, 5 * SIZE ++ MADD1 c11, b3, a3, c11 ++ LD b4, BO, 7 * SIZE ++ MADD3 c21, b2, a3, c21 ++ LD a3, AO, 6 * SIZE ++ MADD2 c12, b3, a2, c12 ++ LD b3, BO, 6 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD a2, AO, 7 * SIZE ++ MADD1 c11, b3, a3, c11 ++ LD b2, BO, 9 * SIZE ++ MADD3 c21, b4, a3, c21 ++ LD a3, AO, 12 * SIZE ++ MADD2 c12, b3, a2, c12 ++ LD b3, BO, 12 * SIZE ++ MADD4 c22, b4, a2, c22 ++ LD a2, AO, 9 * SIZE ++ addi.d AO, AO, 8 * SIZE ++ addi.d L, L, -1 ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L32 ++ .align 3 ++ ++.L35: ++#ifndef TRMMKERNEL ++ andi L, K, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L38 ++ .align 3 ++.L36: ++ MADD1 c11, b1, a1, c11 ++ addi.d L, L, -1 ++ MADD3 c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD a2, AO, 3 * SIZE ++ LD b2, BO, 3 * SIZE ++ addi.d BO, BO, 2 * SIZE ++addi.d AO, AO, 2 * SIZE ++ blt $r0, L, .L36 ++.L38: ++#ifndef TRMMKERNEL ++ LD b1, CO1, 0 * SIZE ++ ADD c11, c11, c22 ++ LD b2, CO1, 1 * SIZE ++ ADD 
c12, c12, c21 ++ MADD b1, c11, ALPHA_R, b1 ++ addi.d CO1,CO1, 2 * SIZE ++ MADD b2, c12, ALPHA_R, b2 ++ addi.d I, I, -1 ++ NMSUB b1, c12, ALPHA_I, b1 ++ MADD b2, c11, ALPHA_I, b2 ++ MTC c11, $r0 ++ ST b1, CO1, -2 * SIZE ++ ST b2, CO1, -1 * SIZE ++ blt $r0, I, .L31 ++#else ++ ADD c11, c11, c22 ++ ADD c12, c12, c21 ++ MUL b1, ALPHA_R, c11 ++ addi.d CO1,CO1, 2 * SIZE ++ MUL b2, ALPHA_R, c12 ++ addi.d I, I, -1 ++ NMSUB b1, c12, ALPHA_I, b1 ++ MADD b2, c11, ALPHA_I, b2 ++ MTC c11, $r0 ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ sub.d TEMP, K, KK ++#ifdef LEFT ++ addi.d TEMP, TEMP, -1 ++#else ++ addi.d TEMP, TEMP, -1 ++#endif ++ slli.d TEMP, TEMP, ZBASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, BO, TEMP ++#endif ++#ifdef LEFT ++ addi.d KK, KK, 1 ++#endif ++ ST b1, CO1, -2 * SIZE ++ ST b2, CO1, -1 * SIZE ++ blt $r0, I, .L31 ++#endif ++ .align 3 ++ ++.L39: ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addi.d KK, KK, 1 ++#endif ++ move B, BO ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 64 ++ fld.d $f24, $sp, 16 ++ fld.d $f25, $sp, 24 ++ fld.d $f26, $sp, 32 ++ fld.d $f27, $sp, 40 ++ fld.d $f28, $sp, 48 ++ fld.d $f29, $sp, 56 ++#if defined(TRMMKERNEL) ++ LDARG $r26, $sp, 72 ++ LDARG $r27, $sp, 80 ++#endif ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 88 ++ fld.d $f19, $sp, 96 ++ fld.d $f20, $sp, 104 ++ fld.d $f21, $sp, 112 ++#endif ++ addi.d $sp, $sp, 128 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ fmov.d $f1, $f23 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S +new file mode 100644 +index 0000000..d995ce8 +--- /dev/null ++++ b/kernel/loongarch64/zgemv_n.S +@@ -0,0 +1,648 @@ ++/*************************************************************************** ++Copyright (c) 2020, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INCX $r10 ++#define Y $r11 ++#define INCY $r6 ++#define BUFFER $r17 ++ ++#define YORIG $r18 ++#define XX $r12 ++#define YY $r13 ++#define I $r14 ++#define J $r15 ++#define AO1 $r23 ++#define AO2 $r24 ++ ++#define ALPHA_R $f0 ++#define ALPHA_I $f1 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f23 ++#define a4 $f9 ++#define a5 $f10 ++#define a6 $f11 ++#define a7 $f12 ++#define a8 $f13 ++#define x1 $f14 ++#define x2 $f15 ++#define x3 $f16 ++#define x4 $f17 ++#define y1 $f3 ++#define y2 $f4 ++#define y3 $f2 ++#define y4 $f5 ++#define t1 $f6 ++#define t2 $f7 ++#define t3 $f18 ++#define t4 $f19 ++#define t5 $f20 ++#define t6 $f21 ++#define t7 $f24 ++#define t8 $f25 ++ ++#if !defined(CONJ) && !defined(XCONJ) ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 NMSUB ++#define MADD4 MADD ++#endif ++#if defined(CONJ) && !defined(XCONJ) ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 MADD ++#define MADD4 NMSUB ++#endif ++#if !defined(CONJ) && defined(XCONJ) ++#define MADD1 MADD ++#define MADD2 NMSUB ++#define MADD3 MADD ++#define MADD4 MADD ++#endif ++#if defined(CONJ) && defined(XCONJ) ++#define MADD1 MADD ++#define MADD2 NMSUB ++#define MADD3 NMSUB ++#define MADD4 NMSUB ++#endif ++ ++ PROLOGUE ++ ++ LDARG INCY, $sp, 0 ++ LDARG BUFFER, $sp, 8 ++#ifndef __64BIT__ ++ addi.d $sp, $sp, -64 ++#else ++ addi.d $sp, $sp, -32 ++#endif ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ fst.d $f24, $sp, 16 ++ fst.d $f25, $sp, 24 ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 32 ++ fst.d $f19, $sp, 40 ++ fst.d $f20, $sp, 48 ++ fst.d $f21, $sp, 56 ++#endif ++ slli.d LDA, LDA, ZBASE_SHIFT ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, M, .L999 ++ slli.d INCY, INCY, ZBASE_SHIFT ++ bge $r0, N, .L999 ++ li.d I, 2 * SIZE ++ move YORIG, Y ++ beq INCY, I, .L10 ++ srai.d I, M, 2 ++ move YORIG, BUFFER ++ move XX, Y ++ move YY, BUFFER ++ bge $r0, I, .L05 ++ .align 3 ++ ++.L02: ++ LD a1, XX, 0 * SIZE ++ LD a2, XX, 1 * SIZE ++ add.d XX, XX, INCY ++ LD a3, XX, 0 * SIZE ++ LD a4, XX, 1 * SIZE ++ add.d XX, XX, INCY ++ LD a5, XX, 0 * SIZE ++ LD a6, XX, 1 * SIZE ++ add.d XX, XX, INCY ++ LD a7, XX, 0 * SIZE ++ LD a8, XX, 1 * SIZE ++ add.d XX, XX, INCY ++ addi.d I, I, -1 ++ addi.d YY, YY, 8 * SIZE ++ ST a1, YY, -8 * SIZE ++ ST a2, YY, -7 * SIZE ++ ST a3, YY, -6 * SIZE ++ ST a4, YY, -5 * SIZE ++ ST a5, YY, -4 * SIZE ++ ST a6, YY, -3 * SIZE ++ ST a7, YY, -2 * SIZE ++ ST a8, YY, -1 * SIZE ++ blt $r0, I, .L02 ++ .align 3 ++ ++.L05: ++ andi I, M, 3 ++ bge $r0, I, .L10 ++ .align 3 ++ ++.L06: ++ LD a1, XX, 0 * SIZE ++ LD a2, XX, 1 * SIZE ++ add.d XX, XX, INCY ++ addi.d I, I, -1 ++ ST a1, YY, 0 * SIZE ++ ST a2, YY, 1 * SIZE ++ addi.d YY, YY, 2 * SIZE ++ blt $r0, I, .L06 ++ .align 3 ++ ++.L10: ++ srai.d J, N, 1 ++ bge $r0, J, .L20 ++ .align 3 ++ ++.L11: ++ LD x1, X, 0 * SIZE ++ LD x2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD x3, X, 0 * SIZE ++ LD x4, X, 1 * SIZE ++ add.d X, X, INCX ++ MUL a1, ALPHA_R, x1 ++ move AO1, A ++ MUL a2, ALPHA_I, x1 ++ add.d AO2, A, LDA ++ MUL a3, ALPHA_R, x3 ++ add.d A, AO2, LDA ++ MUL a4, ALPHA_I, x3 ++#ifndef XCONJ ++ NMSUB x1, x2, ALPHA_I, a1 ++ MADD x2, x2, ALPHA_R, a2 ++ NMSUB x3, x4, ALPHA_I, a3 ++ MADD x4, x4, ALPHA_R, a4 ++#else ++ MADD x1, x2, ALPHA_I, a1 ++ MSUB x2, x2, ALPHA_R, a2 ++ MADD x3, x4, ALPHA_I, a3 ++ MSUB x4, x4, ALPHA_R, a4 ++#endif ++ srai.d I, M, 2 ++ move YY, 
YORIG ++ bge $r0, I, .L15 ++ LD y1, YY, 0 * SIZE ++ LD a1, AO1, 0 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a3, AO1, 2 * SIZE ++ LD y3, YY, 2 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y4, YY, 3 * SIZE ++ LD a4, AO1, 3 * SIZE ++ LD a5, AO2, 0 * SIZE ++ LD a6, AO2, 1 * SIZE ++ LD a7, AO2, 2 * SIZE ++ LD a8, AO2, 3 * SIZE ++ MADD1 t1, a1, x1, y1 ++ LD y1, YY, 4 * SIZE ++ MADD2 t2, a1, x2, y2 ++ LD a1, AO1, 4 * SIZE ++ MADD1 t3, a3, x1, y3 ++ LD y2, YY, 5 * SIZE ++ MADD2 t4, a3, x2, y4 ++ LD a3, AO1, 6 * SIZE ++ MADD3 t1, a2, x2, t1 ++ LD y3, YY, 6 * SIZE ++ MADD4 t2, a2, x1, t2 ++ LD a2, AO1, 5 * SIZE ++ MADD3 t3, a4, x2, t3 ++ LD y4, YY, 7 * SIZE ++ MADD4 t4, a4, x1, t4 ++ LD a4, AO1, 7 * SIZE ++ MADD1 t1, a5, x3, t1 ++ MADD2 t2, a5, x4, t2 ++ LD a5, AO2, 4 * SIZE ++ MADD1 t3, a7, x3, t3 ++ MADD2 t4, a7, x4, t4 ++ LD a7, AO2, 6 * SIZE ++ MADD3 t1, a6, x4, t1 ++ MADD4 t2, a6, x3, t2 ++ LD a6, AO2, 5 * SIZE ++ MADD3 t3, a8, x4, t3 ++ addi.d I, I, -1 ++ MADD4 t4, a8, x3, t4 ++ LD a8, AO2, 7 * SIZE ++ bge $r0, I, .L13 ++ .align 3 ++.L12: ++ MADD1 t5, a1, x1, y1 ++ LD y1, YY, 8 * SIZE ++ MADD2 t6, a1, x2, y2 ++ LD a1, AO1, 8 * SIZE ++ MADD1 t7, a3, x1, y3 ++ LD y2, YY, 9 * SIZE ++ MADD2 t8, a3, x2, y4 ++ LD a3, AO1, 10 * SIZE ++ MADD3 t5, a2, x2, t5 ++ LD y3, YY, 10 * SIZE ++ MADD4 t6, a2, x1, t6 ++ LD a2, AO1, 9 * SIZE ++ MADD3 t7, a4, x2, t7 ++ LD y4, YY, 11 * SIZE ++ MADD4 t8, a4, x1, t8 ++ LD a4, AO1, 11 * SIZE ++ MADD1 t5, a5, x3, t5 ++ ST t1, YY, 0 * SIZE ++ MADD2 t6, a5, x4, t6 ++ LD a5, AO2, 8 * SIZE ++ MADD1 t7, a7, x3, t7 ++ ST t2, YY, 1 * SIZE ++ MADD2 t8, a7, x4, t8 ++ LD a7, AO2, 10 * SIZE ++ MADD3 t5, a6, x4, t5 ++ ST t3, YY, 2 * SIZE ++ MADD4 t6, a6, x3, t6 ++ LD a6, AO2, 9 * SIZE ++ MADD3 t7, a8, x4, t7 ++ ST t4, YY, 3 * SIZE ++ MADD4 t8, a8, x3, t8 ++ LD a8, AO2, 11 * SIZE ++ MADD1 t1, a1, x1, y1 ++ LD y1, YY, 12 * SIZE ++ MADD2 t2, a1, x2, y2 ++ LD a1, AO1, 12 * SIZE ++ MADD1 t3, a3, x1, y3 ++ LD y2, YY, 13 * SIZE ++ MADD2 t4, a3, x2, y4 ++ LD a3, AO1, 14 * SIZE ++ MADD3 t1, a2, x2, t1 ++ LD y3, YY, 14 * SIZE ++ MADD4 t2, a2, x1, t2 ++ LD a2, AO1, 13 * SIZE ++ MADD3 t3, a4, x2, t3 ++ LD y4, YY, 15 * SIZE ++ MADD4 t4, a4, x1, t4 ++ LD a4, AO1, 15 * SIZE ++ MADD1 t1, a5, x3, t1 ++ ST t5, YY, 4 * SIZE ++ MADD2 t2, a5, x4, t2 ++ LD a5, AO2, 12 * SIZE ++ MADD1 t3, a7, x3, t3 ++ ST t6, YY, 5 * SIZE ++ MADD2 t4, a7, x4, t4 ++ LD a7, AO2, 14 * SIZE ++ MADD3 t1, a6, x4, t1 ++ ST t7, YY, 6 * SIZE ++ MADD4 t2, a6, x3, t2 ++ LD a6, AO2, 13 * SIZE ++ MADD3 t3, a8, x4, t3 ++ ST t8, YY, 7 * SIZE ++ MADD4 t4, a8, x3, t4 ++ LD a8, AO2, 15 * SIZE ++ addi.d I, I, -1 ++ addi.d YY, YY, 8 * SIZE ++ addi.d AO1, AO1, 8 * SIZE ++ addi.d AO2, AO2, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ ST t1, YY, 0 * SIZE ++ MADD1 t1, a1, x1, y1 ++ ST t2, YY, 1 * SIZE ++ MADD2 t2, a1, x2, y2 ++ ST t3, YY, 2 * SIZE ++ MADD1 t3, a3, x1, y3 ++ ST t4, YY, 3 * SIZE ++ MADD2 t4, a3, x2, y4 ++ MADD3 t1, a2, x2, t1 ++ MADD4 t2, a2, x1, t2 ++ MADD3 t3, a4, x2, t3 ++ MADD4 t4, a4, x1, t4 ++ MADD1 t1, a5, x3, t1 ++ MADD2 t2, a5, x4, t2 ++ MADD1 t3, a7, x3, t3 ++ MADD2 t4, a7, x4, t4 ++ MADD3 t1, a6, x4, t1 ++ addi.d AO1, AO1, 8 * SIZE ++ MADD4 t2, a6, x3, t2 ++ addi.d AO2, AO2, 8 * SIZE ++ MADD3 t3, a8, x4, t3 ++ addi.d YY, YY, 8 * SIZE ++ MADD4 t4, a8, x3, t4 ++ ST t1, YY, -4 * SIZE ++ ST t2, YY, -3 * SIZE ++ ST t3, YY, -2 * SIZE ++ ST t4, YY, -1 * SIZE ++ .align 3 ++ ++.L15: ++ andi I, M, 2 ++ bge $r0, I, .L16 ++ LD a1, AO1, 0 * SIZE ++ LD y1, YY, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a3, AO1, 2 * 
SIZE ++ LD y3, YY, 2 * SIZE ++ LD a4, AO1, 3 * SIZE ++ LD y4, YY, 3 * SIZE ++ MADD1 t1, a1, x1, y1 ++ LD a5, AO2, 0 * SIZE ++ MADD2 t2, a1, x2, y2 ++ LD a6, AO2, 1 * SIZE ++ MADD1 t3, a3, x1, y3 ++ LD a7, AO2, 2 * SIZE ++ MADD2 t4, a3, x2, y4 ++ LD a8, AO2, 3 * SIZE ++ MADD3 t1, a2, x2, t1 ++ MADD4 t2, a2, x1, t2 ++ MADD3 t3, a4, x2, t3 ++ MADD4 t4, a4, x1, t4 ++ MADD1 t1, a5, x3, t1 ++ MADD2 t2, a5, x4, t2 ++ MADD1 t3, a7, x3, t3 ++ MADD2 t4, a7, x4, t4 ++ MADD3 t1, a6, x4, t1 ++ addi.d YY, YY, 4 * SIZE ++ MADD4 t2, a6, x3, t2 ++ addi.d AO1, AO1, 4 * SIZE ++ MADD3 t3, a8, x4, t3 ++ addi.d AO2, AO2, 4 * SIZE ++ MADD4 t4, a8, x3, t4 ++ ST t1, YY, -4 * SIZE ++ ST t2, YY, -3 * SIZE ++ ST t3, YY, -2 * SIZE ++ ST t4, YY, -1 * SIZE ++ .align 3 ++ ++.L16: ++ andi I, M, 1 ++ bge $r0, I, .L19 ++ LD y1, YY, 0 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a1, AO1, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ MADD1 t1, a1, x1, y1 ++ LD a5, AO2, 0 * SIZE ++ MADD2 t2, a1, x2, y2 ++ LD a6, AO2, 1 * SIZE ++ MADD3 t1, a2, x2, t1 ++ MADD4 t2, a2, x1, t2 ++ MADD1 t1, a5, x3, t1 ++ MADD2 t2, a5, x4, t2 ++ MADD3 t1, a6, x4, t1 ++ MADD4 t2, a6, x3, t2 ++ ST t1, YY, 0 * SIZE ++ ST t2, YY, 1 * SIZE ++ .align 3 ++ ++.L19: ++ addi.d J, J, -1 ++ blt $r0, J, .L11 ++ .align 3 ++ ++.L20: ++ andi J, N, 1 ++ bge $r0, J, .L900 ++ LD x1, X, 0 * SIZE ++ LD x2, X, 1 * SIZE ++ add.d X, X, INCX ++ MUL a1, ALPHA_R, x1 ++ move AO1, A ++ MUL a2, ALPHA_I, x1 ++#ifndef XCONJ ++ NMSUB x1, x2, ALPHA_I, a1 ++ MADD x2, x2, ALPHA_R, a2 ++#else ++ MADD x1, x2, ALPHA_I, a1 ++ MSUB x2, x2, ALPHA_R, a2 ++#endif ++ srai.d I, M, 2 ++ move YY, YORIG ++ bge $r0, I, .L25 ++ LD y1, YY, 0 * SIZE ++ LD a1, AO1, 0 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a3, AO1, 2 * SIZE ++ LD y3, YY, 2 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y4, YY, 3 * SIZE ++ LD a4, AO1, 3 * SIZE ++ MADD1 t1, a1, x1, y1 ++ LD y1, YY, 4 * SIZE ++ MADD2 t2, a1, x2, y2 ++ LD a1, AO1, 4 * SIZE ++ MADD1 t3, a3, x1, y3 ++ LD y2, YY, 5 * SIZE ++ MADD2 t4, a3, x2, y4 ++ LD a3, AO1, 6 * SIZE ++ MADD3 t1, a2, x2, t1 ++ LD y3, YY, 6 * SIZE ++ MADD4 t2, a2, x1, t2 ++ LD a2, AO1, 5 * SIZE ++ MADD3 t3, a4, x2, t3 ++ LD y4, YY, 7 * SIZE ++ MADD4 t4, a4, x1, t4 ++ addi.d I, I, -1 ++ LD a4, AO1, 7 * SIZE ++ bge $r0, I, .L23 ++ .align 3 ++.L22: ++ MADD1 t5, a1, x1, y1 ++ LD y1, YY, 8 * SIZE ++ MADD2 t6, a1, x2, y2 ++ LD a1, AO1, 8 * SIZE ++ MADD1 t7, a3, x1, y3 ++ LD y2, YY, 9 * SIZE ++ MADD2 t8, a3, x2, y4 ++ LD a3, AO1, 10 * SIZE ++ MADD3 t5, a2, x2, t5 ++ LD y3, YY, 10 * SIZE ++ MADD4 t6, a2, x1, t6 ++ LD a2, AO1, 9 * SIZE ++ MADD3 t7, a4, x2, t7 ++ LD y4, YY, 11 * SIZE ++ MADD4 t8, a4, x1, t8 ++ LD a4, AO1, 11 * SIZE ++ ST t1, YY, 0 * SIZE ++ ST t2, YY, 1 * SIZE ++ ST t3, YY, 2 * SIZE ++ ST t4, YY, 3 * SIZE ++ MADD1 t1, a1, x1, y1 ++ LD y1, YY, 12 * SIZE ++ MADD2 t2, a1, x2, y2 ++ LD a1, AO1, 12 * SIZE ++ MADD1 t3, a3, x1, y3 ++ LD y2, YY, 13 * SIZE ++ MADD2 t4, a3, x2, y4 ++ LD a3, AO1, 14 * SIZE ++ MADD3 t1, a2, x2, t1 ++ LD y3, YY, 14 * SIZE ++ MADD4 t2, a2, x1, t2 ++ LD a2, AO1, 13 * SIZE ++ MADD3 t3, a4, x2, t3 ++ LD y4, YY, 15 * SIZE ++ MADD4 t4, a4, x1, t4 ++ LD a4, AO1, 15 * SIZE ++ ST t5, YY, 4 * SIZE ++ ST t6, YY, 5 * SIZE ++ ST t7, YY, 6 * SIZE ++ ST t8, YY, 7 * SIZE ++ addi.d I, I, -1 ++ addi.d YY, YY, 8 * SIZE ++ addi.d AO1, AO1, 8 * SIZE ++ blt $r0, I, .L22 ++ .align 3 ++ ++.L23: ++ ST t1, YY, 0 * SIZE ++ MADD1 t1, a1, x1, y1 ++ ST t2, YY, 1 * SIZE ++ MADD2 t2, a1, x2, y2 ++ ST t3, YY, 2 * SIZE ++ MADD1 t3, a3, x1, y3 ++ ST t4, YY, 3 * SIZE ++ MADD2 t4, a3, x2, y4 ++ MADD3 t1, a2, x2, t1 ++ addi.d AO1, 
AO1, 8 * SIZE ++ MADD4 t2, a2, x1, t2 ++ addi.d YY, YY, 8 * SIZE ++ MADD3 t3, a4, x2, t3 ++ MADD4 t4, a4, x1, t4 ++ ST t1, YY, -4 * SIZE ++ ST t2, YY, -3 * SIZE ++ ST t3, YY, -2 * SIZE ++ ST t4, YY, -1 * SIZE ++ .align 3 ++ ++.L25: ++ andi I, M, 2 ++ bge $r0, I, .L26 ++ LD a1, AO1, 0 * SIZE ++ LD y1, YY, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a3, AO1, 2 * SIZE ++ LD y3, YY, 2 * SIZE ++ LD a4, AO1, 3 * SIZE ++ LD y4, YY, 3 * SIZE ++ MADD1 t1, a1, x1, y1 ++ MADD2 t2, a1, x2, y2 ++ MADD1 t3, a3, x1, y3 ++ MADD2 t4, a3, x2, y4 ++ MADD3 t1, a2, x2, t1 ++ addi.d YY, YY, 4 * SIZE ++ MADD4 t2, a2, x1, t2 ++ addi.d AO1, AO1, 4 * SIZE ++ MADD3 t3, a4, x2, t3 ++ MADD4 t4, a4, x1, t4 ++ ST t1, YY, -4 * SIZE ++ ST t2, YY, -3 * SIZE ++ ST t3, YY, -2 * SIZE ++ ST t4, YY, -1 * SIZE ++ .align 3 ++ ++.L26: ++ andi I, M, 1 ++ bge $r0, I, .L900 ++ LD y1, YY, 0 * SIZE ++ LD y2, YY, 1 * SIZE ++ LD a1, AO1, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ MADD1 t1, a1, x1, y1 ++ MADD2 t2, a1, x2, y2 ++ MADD3 t1, a2, x2, t1 ++ MADD4 t2, a2, x1, t2 ++ ST t1, YY, 0 * SIZE ++ ST t2, YY, 1 * SIZE ++ .align 3 ++ ++.L900: ++ li.d YORIG, 2 * SIZE ++ srai.d I, M, 2 ++ beq INCY, YORIG, .L999 ++ move XX, BUFFER ++ bge $r0, I, .L905 ++ .align 3 ++ ++.L902: ++ LD a1, XX, 0 * SIZE ++ LD a2, XX, 1 * SIZE ++ LD a3, XX, 2 * SIZE ++ LD a4, XX, 3 * SIZE ++ LD a5, XX, 4 * SIZE ++ LD a6, XX, 5 * SIZE ++ LD a7, XX, 6 * SIZE ++ LD a8, XX, 7 * SIZE ++ addi.d I, I, -1 ++ ST a1, Y, 0 * SIZE ++ ST a2, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ ST a3, Y, 0 * SIZE ++ ST a4, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ ST a5, Y, 0 * SIZE ++ ST a6, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ ST a7, Y, 0 * SIZE ++ ST a8, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ addi.d XX, XX, 8 * SIZE ++ blt $r0, I, .L902 ++ .align 3 ++ ++.L905: ++ andi I, M, 3 ++ bge $r0, I, .L999 ++ .align 3 ++ ++.L906: ++ LD a1, XX, 0 * SIZE ++ LD a2, XX, 1 * SIZE ++ addi.d XX, XX, 2 * SIZE ++ addi.d I, I, -1 ++ ST a1, Y, 0 * SIZE ++ ST a2, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ blt $r0, I, .L906 ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ fld.d $f24, $sp, 16 ++ fld.d $f25, $sp, 24 ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 32 ++ fld.d $f19, $sp, 40 ++ fld.d $f20, $sp, 48 ++ fld.d $f21, $sp, 56 ++#endif ++#ifdef __64BIT__ ++ addi.d $sp, $sp, 32 ++#else ++ addi.d $sp, $sp, 64 ++#endif ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S +new file mode 100644 +index 0000000..841823e +--- /dev/null ++++ b/kernel/loongarch64/zgemv_t.S +@@ -0,0 +1,556 @@ ++/*************************************************************************** ++Copyright (c) 2020, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INCX $r10 ++#define Y $r11 ++#define INCY $r6 ++#define BUFFER $r17 ++ ++#define XORIG $r18 ++#define XX $r12 ++#define YY $r13 ++#define I $r14 ++#define J $r15 ++#define AO1 $r23 ++#define AO2 $r24 ++ ++#define ALPHA_R $f0 ++#define ALPHA_I $f1 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f23 ++#define a4 $f9 ++#define a5 $f10 ++#define a6 $f11 ++#define a7 $f12 ++#define a8 $f13 ++#define y1 $f14 ++#define y2 $f15 ++#define y3 $f16 ++#define y4 $f17 ++#define x1 $f3 ++#define x2 $f4 ++#define x3 $f2 ++#define x4 $f5 ++#define x5 $f6 ++#define x6 $f7 ++#define x7 $f18 ++#define x8 $f19 ++ ++#if !defined(CONJ) && !defined(XCONJ) ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 NMSUB ++#define MADD4 MADD ++#endif ++#if defined(CONJ) && !defined(XCONJ) ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 MADD ++#define MADD4 NMSUB ++#endif ++#if !defined(CONJ) && defined(XCONJ) ++#define MADD1 MADD ++#define MADD2 NMSUB ++#define MADD3 MADD ++#define MADD4 MADD ++#endif ++#if defined(CONJ) && defined(XCONJ) ++#define MADD1 MADD ++#define MADD2 NMSUB ++#define MADD3 NMSUB ++#define MADD4 NMSUB ++#endif ++ ++ PROLOGUE ++ ++ LDARG INCY, $sp, 0 ++ LDARG BUFFER, $sp, 8 ++#ifdef __64BIT__ ++ addi.d $sp, $sp, -16 ++#else ++ addi.d $sp, $sp, -32 ++#endif ++ MTC y1, $r0 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ slli.d LDA, LDA, ZBASE_SHIFT ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 16 ++ fst.d $f19, $sp, 24 ++#endif ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, M, .L999 ++ slli.d INCY, INCY, ZBASE_SHIFT ++ bge $r0, N, .L999 ++ li.d I, 2 * SIZE ++ move XORIG, X ++ beq INCX, I, .L10 ++ srai.d I, M, 2 ++ move XORIG, BUFFER ++ move YY, BUFFER ++ bge $r0, I, .L05 ++ .align 3 ++ ++.L02: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ LD a8, X, 1 * SIZE ++ add.d X, X, INCX ++ addi.d I, I, -1 ++ addi.d YY, YY, 8 * SIZE ++ ST a1, YY, -8 * SIZE ++ ST a2, YY, -7 * SIZE ++ ST a3, YY, -6 * SIZE ++ ST a4, YY, -5 * SIZE ++ ST a5, YY, -4 * SIZE ++ ST a6, YY, -3 * SIZE ++ ST a7, YY, -2 * SIZE ++ ST a8, YY, -1 * SIZE ++ blt $r0, I, .L02 ++ .align 3 ++ ++.L05: ++ andi I, M, 3 ++ bge $r0, I, .L10 ++ .align 3 ++ ++.L06: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ ST a1, YY, 0 * SIZE ++ ST a2, YY, 1 * SIZE ++ addi.d I, I, -1 ++ addi.d YY, YY, 2 * SIZE ++ blt $r0, I, .L06 ++ .align 3 ++ ++.L10: ++ srai.d J, N, 1 ++ move YY, Y ++ bge 
$r0, J, .L20 ++ .align 3 ++ ++.L11: ++ move AO1, A ++ MOV y2, y1 ++ add.d AO2, A, LDA ++ MOV y3, y1 ++ add.d A, AO2, LDA ++ MOV y4, y1 ++ srai.d I, M, 2 ++ move XX, XORIG ++ bge $r0, I, .L15 ++ LD x1, XX, 0 * SIZE ++ LD x2, XX, 1 * SIZE ++ LD x4, XX, 3 * SIZE ++ LD a1, AO1, 0 * SIZE ++ LD a3, AO2, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD a4, AO2, 1 * SIZE ++ LD a5, AO1, 2 * SIZE ++ LD a7, AO2, 2 * SIZE ++ LD a6, AO1, 3 * SIZE ++ LD a8, AO2, 3 * SIZE ++ addi.d I, I, -1 ++ bge $r0, I, .L13 ++ .align 3 ++.L12: ++ MADD1 y1, a1, x1, y1 ++ LD x3, XX, 2 * SIZE ++ MADD2 y2, a1, x2, y2 ++ LD a1, AO1, 4 * SIZE ++ MADD1 y3, a3, x1, y3 ++ MADD2 y4, a3, x2, y4 ++ LD a3, AO2, 4 * SIZE ++ MADD3 y1, a2, x2, y1 ++ MADD4 y2, a2, x1, y2 ++ LD a2, AO1, 5 * SIZE ++ MADD3 y3, a4, x2, y3 ++ LD x2, XX, 5 * SIZE ++ MADD4 y4, a4, x1, y4 ++ LD a4, AO2, 5 * SIZE ++ MADD1 y1, a5, x3, y1 ++ LD x1, XX, 4 * SIZE ++ MADD2 y2, a5, x4, y2 ++ LD a5, AO1, 6 * SIZE ++ MADD1 y3, a7, x3, y3 ++ MADD2 y4, a7, x4, y4 ++ LD a7, AO2, 6 * SIZE ++ MADD3 y1, a6, x4, y1 ++ addi.d I, I, -1 ++ MADD4 y2, a6, x3, y2 ++ LD a6, AO1, 7 * SIZE ++ MADD3 y3, a8, x4, y3 ++ LD x4, XX, 7 * SIZE ++ MADD4 y4, a8, x3, y4 ++ LD a8, AO2, 7 * SIZE ++ MADD1 y1, a1, x1, y1 ++ LD x3, XX, 6 * SIZE ++ MADD2 y2, a1, x2, y2 ++ LD a1, AO1, 8 * SIZE ++ MADD1 y3, a3, x1, y3 ++ MADD2 y4, a3, x2, y4 ++ LD a3, AO2, 8 * SIZE ++ MADD3 y1, a2, x2, y1 ++ MADD4 y2, a2, x1, y2 ++ LD a2, AO1, 9 * SIZE ++ MADD3 y3, a4, x2, y3 ++ LD x2, XX, 9 * SIZE ++ MADD4 y4, a4, x1, y4 ++ LD a4, AO2, 9 * SIZE ++ MADD1 y1, a5, x3, y1 ++ LD x1, XX, 8 * SIZE ++ MADD2 y2, a5, x4, y2 ++ LD a5, AO1, 10 * SIZE ++ MADD1 y3, a7, x3, y3 ++ addi.d XX, XX, 8 * SIZE ++ MADD2 y4, a7, x4, y4 ++ LD a7, AO2, 10 * SIZE ++ MADD3 y1, a6, x4, y1 ++ addi.d AO2, AO2, 8 * SIZE ++ MADD4 y2, a6, x3, y2 ++ LD a6, AO1, 11 * SIZE ++ MADD3 y3, a8, x4, y3 ++ LD x4, XX, 3 * SIZE ++ MADD4 y4, a8, x3, y4 ++ LD a8, AO2, 3 * SIZE ++ addi.d AO1, AO1, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ MADD1 y1, a1, x1, y1 ++ LD x3, XX, 2 * SIZE ++ MADD2 y2, a1, x2, y2 ++ LD a1, AO1, 4 * SIZE ++ MADD1 y3, a3, x1, y3 ++ MADD2 y4, a3, x2, y4 ++ LD a3, AO2, 4 * SIZE ++ MADD3 y1, a2, x2, y1 ++ MADD4 y2, a2, x1, y2 ++ LD a2, AO1, 5 * SIZE ++ MADD3 y3, a4, x2, y3 ++ LD x2, XX, 5 * SIZE ++ MADD4 y4, a4, x1, y4 ++ LD a4, AO2, 5 * SIZE ++ MADD1 y1, a5, x3, y1 ++ LD x1, XX, 4 * SIZE ++ MADD2 y2, a5, x4, y2 ++ LD a5, AO1, 6 * SIZE ++ MADD1 y3, a7, x3, y3 ++ MADD2 y4, a7, x4, y4 ++ LD a7, AO2, 6 * SIZE ++ MADD3 y1, a6, x4, y1 ++ MADD4 y2, a6, x3, y2 ++ LD a6, AO1, 7 * SIZE ++ MADD3 y3, a8, x4, y3 ++ LD x4, XX, 7 * SIZE ++ MADD4 y4, a8, x3, y4 ++ LD a8, AO2, 7 * SIZE ++ MADD1 y1, a1, x1, y1 ++ LD x3, XX, 6 * SIZE ++ MADD2 y2, a1, x2, y2 ++ MADD1 y3, a3, x1, y3 ++ MADD2 y4, a3, x2, y4 ++ MADD3 y1, a2, x2, y1 ++ MADD4 y2, a2, x1, y2 ++ MADD3 y3, a4, x2, y3 ++ MADD4 y4, a4, x1, y4 ++ MADD1 y1, a5, x3, y1 ++ MADD2 y2, a5, x4, y2 ++ MADD1 y3, a7, x3, y3 ++ MADD2 y4, a7, x4, y4 ++ MADD3 y1, a6, x4, y1 ++ addi.d XX, XX, 8 * SIZE ++ MADD4 y2, a6, x3, y2 ++ addi.d AO1, AO1, 8 * SIZE ++ MADD3 y3, a8, x4, y3 ++ addi.d AO2, AO2, 8 * SIZE ++ MADD4 y4, a8, x3, y4 ++ .align 3 ++ ++.L15: ++ andi I, M, 2 ++ bge $r0, I, .L17 ++ LD x1, XX, 0 * SIZE ++ LD x2, XX, 1 * SIZE ++ LD x3, XX, 2 * SIZE ++ LD x4, XX, 3 * SIZE ++ LD a1, AO1, 0 * SIZE ++ LD a3, AO2, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD a4, AO2, 1 * SIZE ++ LD a5, AO1, 2 * SIZE ++ LD a7, AO2, 2 * SIZE ++ LD a6, AO1, 3 * SIZE ++ LD a8, AO2, 3 * SIZE ++ MADD1 y1, a1, x1, y1 ++ MADD2 y2, a1, x2, y2 ++ 
MADD1 y3, a3, x1, y3 ++ MADD2 y4, a3, x2, y4 ++ MADD3 y1, a2, x2, y1 ++ MADD4 y2, a2, x1, y2 ++ MADD3 y3, a4, x2, y3 ++ MADD4 y4, a4, x1, y4 ++ MADD1 y1, a5, x3, y1 ++ MADD2 y2, a5, x4, y2 ++ MADD1 y3, a7, x3, y3 ++ MADD2 y4, a7, x4, y4 ++ MADD3 y1, a6, x4, y1 ++ addi.d XX, XX, 4 * SIZE ++ MADD4 y2, a6, x3, y2 ++ addi.d AO1, AO1, 4 * SIZE ++ MADD3 y3, a8, x4, y3 ++ addi.d AO2, AO2, 4 * SIZE ++ MADD4 y4, a8, x3, y4 ++ .align 3 ++ ++.L17: ++ andi I, M, 1 ++.align 3 ++ ++ bge $r0, I, .L19 ++.L18: ++ LD x1, XX, 0 * SIZE ++ LD x2, XX, 1 * SIZE ++ LD a1, AO1, 0 * SIZE ++ LD a3, AO2, 0 * SIZE ++ MADD1 y1, a1, x1, y1 ++ LD a2, AO1, 1 * SIZE ++ MADD2 y2, a1, x2, y2 ++ LD a4, AO2, 1 * SIZE ++ MADD1 y3, a3, x1, y3 ++ MADD2 y4, a3, x2, y4 ++ MADD3 y1, a2, x2, y1 ++ MADD4 y2, a2, x1, y2 ++ MADD3 y3, a4, x2, y3 ++ MADD4 y4, a4, x1, y4 ++ .align 3 ++ ++.L19: ++ LD a1, Y, 0 * SIZE ++ LD a2, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ LD a3, Y, 0 * SIZE ++ LD a4, Y, 1 * SIZE ++ add.d Y, Y, INCY ++ MADD a1, y1, ALPHA_R, a1 ++ MADD a2, y1, ALPHA_I, a2 ++ MADD a3, y3, ALPHA_R, a3 ++ MADD a4, y3, ALPHA_I, a4 ++ NMSUB a1, y2, ALPHA_I, a1 ++ MADD a2, y2, ALPHA_R, a2 ++ NMSUB a3, y4, ALPHA_I, a3 ++ MTC y1, $r0 ++ MADD a4, y4, ALPHA_R, a4 ++ addi.d J, J, -1 ++ ST a1, YY, 0 * SIZE ++ ST a2, YY, 1 * SIZE ++ add.d YY, YY, INCY ++ ST a3, YY, 0 * SIZE ++ ST a4, YY, 1 * SIZE ++ add.d YY, YY, INCY ++ blt $r0, J, .L11 ++ .align 3 ++ ++.L20: ++ andi J, N, 1 ++ MOV y2, y1 ++ srai.d I, M, 2 ++ bge $r0, J, .L999 ++ MOV y3, y1 ++ move AO1, A ++ MOV y4, y1 ++ move XX, XORIG ++ bge $r0, I, .L25 ++ LD a1, AO1, 0 * SIZE ++ LD x1, XX, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD x2, XX, 1 * SIZE ++ LD a5, AO1, 2 * SIZE ++ LD x4, XX, 3 * SIZE ++ addi.d I, I, -1 ++ LD a6, AO1, 3 * SIZE ++ bge $r0, I, .L23 ++ .align 3 ++.L22: ++ MADD1 y1, a1, x1, y1 ++ LD x3, XX, 2 * SIZE ++ MADD2 y2, a1, x2, y2 ++ LD a1, AO1, 4 * SIZE ++ MADD3 y3, a2, x2, y3 ++ LD x2, XX, 5 * SIZE ++ MADD4 y4, a2, x1, y4 ++ LD a2, AO1, 5 * SIZE ++ MADD1 y1, a5, x3, y1 ++ LD x1, XX, 4 * SIZE ++ MADD2 y2, a5, x4, y2 ++ LD a5, AO1, 6 * SIZE ++ MADD3 y3, a6, x4, y3 ++ LD x4, XX, 7 * SIZE ++ MADD4 y4, a6, x3, y4 ++ LD a6, AO1, 7 * SIZE ++ MADD1 y1, a1, x1, y1 ++ LD x3, XX, 6 * SIZE ++ MADD2 y2, a1, x2, y2 ++ LD a1, AO1, 8 * SIZE ++ MADD3 y3, a2, x2, y3 ++ LD x2, XX, 9 * SIZE ++ MADD4 y4, a2, x1, y4 ++ LD a2, AO1, 9 * SIZE ++ MADD1 y1, a5, x3, y1 ++ LD x1, XX, 8 * SIZE ++ MADD2 y2, a5, x4, y2 ++ LD a5, AO1, 10 * SIZE ++ MADD3 y3, a6, x4, y3 ++ LD x4, XX, 11 * SIZE ++ MADD4 y4, a6, x3, y4 ++ LD a6, AO1, 11 * SIZE ++ addi.d I, I, -1 ++ addi.d XX, XX, 8 * SIZE ++ addi.d AO1, AO1, 8 * SIZE ++ blt $r0, I, .L22 ++ .align 3 ++ ++.L23: ++ MADD1 y1, a1, x1, y1 ++ LD x3, XX, 2 * SIZE ++ MADD2 y2, a1, x2, y2 ++ LD a1, AO1, 4 * SIZE ++ MADD3 y3, a2, x2, y3 ++ LD x2, XX, 5 * SIZE ++ MADD4 y4, a2, x1, y4 ++ LD a2, AO1, 5 * SIZE ++ MADD1 y1, a5, x3, y1 ++ LD x1, XX, 4 * SIZE ++ MADD2 y2, a5, x4, y2 ++ LD a5, AO1, 6 * SIZE ++ MADD3 y3, a6, x4, y3 ++ LD x4, XX, 7 * SIZE ++ MADD4 y4, a6, x3, y4 ++ LD a6, AO1, 7 * SIZE ++ MADD1 y1, a1, x1, y1 ++ LD x3, XX, 6 * SIZE ++ MADD2 y2, a1, x2, y2 ++ MADD3 y3, a2, x2, y3 ++ MADD4 y4, a2, x1, y4 ++ MADD1 y1, a5, x3, y1 ++ MADD2 y2, a5, x4, y2 ++ MADD3 y3, a6, x4, y3 ++ addi.d XX, XX, 8 * SIZE ++ MADD4 y4, a6, x3, y4 ++ addi.d AO1, AO1, 8 * SIZE ++ .align 3 ++ ++.L25: ++ andi I, M, 2 ++ bge $r0, I, .L27 ++ LD a1, AO1, 0 * SIZE ++ LD x1, XX, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD x2, XX, 1 * SIZE ++ LD a5, AO1, 2 * SIZE ++ MADD1 y1, a1, x1, y1 ++ LD x3, XX, 2 * SIZE 
++ MADD2 y2, a1, x2, y2 ++ LD a6, AO1, 3 * SIZE ++ MADD3 y3, a2, x2, y3 ++ LD x4, XX, 3 * SIZE ++ MADD4 y4, a2, x1, y4 ++ MADD1 y1, a5, x3, y1 ++ MADD2 y2, a5, x4, y2 ++ MADD3 y3, a6, x4, y3 ++ addi.d XX, XX, 4 * SIZE ++ MADD4 y4, a6, x3, y4 ++ addi.d AO1, AO1, 4 * SIZE ++ .align 3 ++ ++.L27: ++ andi I, M, 1 ++.align 3 ++ ++ bge $r0, I, .L29 ++.L28: ++ LD a1, AO1, 0 * SIZE ++ LD x1, XX, 0 * SIZE ++ LD a2, AO1, 1 * SIZE ++ LD x2, XX, 1 * SIZE ++ MADD1 y1, a1, x1, y1 ++ MADD2 y2, a1, x2, y2 ++ MADD3 y3, a2, x2, y3 ++ MADD4 y4, a2, x1, y4 ++ .align 3 ++ ++.L29: ++ LD a1, Y, 0 * SIZE ++ LD a2, Y, 1 * SIZE ++ ADD y1, y1, y3 ++ ADD y2, y2, y4 ++ MADD a1, y1, ALPHA_R, a1 ++ MADD a2, y1, ALPHA_I, a2 ++ NMSUB a1, y2, ALPHA_I, a1 ++ MADD a2, y2, ALPHA_R, a2 ++ ST a1, YY, 0 * SIZE ++ ST a2, YY, 1 * SIZE ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 16 ++ fld.d $f19, $sp, 24 ++#endif ++#ifdef __64BIT__ ++ addi.d $sp, $sp, 16 ++#else ++ addi.d $sp, $sp, 32 ++#endif ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S +new file mode 100644 +index 0000000..49f6402 +--- /dev/null ++++ b/kernel/loongarch64/znrm2.S +@@ -0,0 +1,304 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r5 ++#define INCX $r6 ++#define XX $r7 ++#define I $r17 ++#define TEMP $r18 ++#define a1 $f10 ++#define a2 $f11 ++#define a3 $f12 ++#define a4 $f13 ++#define a5 $f14 ++#define a6 $f15 ++#define a7 $f16 ++#define a8 $f17 ++#define t1 $f0 ++#define t2 $f1 ++#define t3 $f2 ++#define t4 $f3 ++#define s1 $f22 ++#define s2 $f8 ++#define s3 $f23 ++#define s4 $f9 ++#define ALPHA $f4 ++#define max $f5 ++ ++ PROLOGUE ++ ++#ifdef F_INTERFACE ++ LDINT N, 0(N) ++ LDINT INCX, 0(INCX) ++#endif ++ ++ MTC s1, $r0 ++ bge $r0, N, .L999 ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, INCX, .L999 ++ move XX, X ++ MOV s2, s1 ++ srai.d I, N, 2 ++ MOV s3, s1 ++ MOV s4, s1 ++ bge $r0, I, .L15 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a7, X, 0 * SIZE ++ LD a8, X, 1 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ bge $r0, I, .L13 ++ .align 3 ++ ++.L12: ++ FABS t1, a1 ++ LD a1, X, 0 * SIZE ++ FABS t2, a2 ++ NOP ++ FABS t3, a3 ++ LD a2, X, 1 * SIZE ++ FABS t4, a4 ++ add.d X, X, INCX ++ CMPLT $fcc0, s1, t1 ++ LD a3, X, 0 * SIZE ++ CMPLT $fcc1, s2, t2 ++ NOP ++ CMPLT $fcc2, s3, t3 ++ LD a4, X, 1 * SIZE ++ CMPLT $fcc3, s4, t4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ FABS t1, a5 ++ LD a5, X, 0 * SIZE ++ FABS t2, a6 ++ NOP ++ FABS t3, a7 ++ LD a6, X, 1 * SIZE ++ FABS t4, a8 ++ add.d X, X, INCX ++ CMPLT $fcc0, s1, t1 ++ LD a7, X, 0 * SIZE ++ CMPLT $fcc1, s2, t2 ++ NOP ++ CMPLT $fcc2, s3, t3 ++ LD a8, X, 1 * SIZE ++ CMPLT $fcc3, s4, t4 ++ add.d X, X, INCX ++ CMOVT s1, s1, t1, $fcc0 ++ addi.d I, I, -1 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L13: ++ FABS t1, a1 ++ FABS t2, a2 ++ FABS t3, a3 ++ FABS t4, a4 ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t2 ++ CMPLT $fcc2, s3, t3 ++ CMPLT $fcc3, s4, t4 ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ FABS t1, a5 ++ FABS t2, a6 ++ FABS t3, a7 ++ FABS t4, a8 ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t2 ++ CMPLT $fcc2, s3, t3 ++ CMPLT $fcc3, s4, t4 ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ CMOVT s3, s3, t3, $fcc2 ++ CMOVT s4, s4, t4, $fcc3 ++ .align 3 ++ ++.L15: ++ andi I, N, 3 ++ bge $r0, I, .L100 ++ .align 3 ++ ++.L16: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ addi.d I, I, -1 ++ FABS t1, a1 ++ FABS t2, a2 ++ CMPLT $fcc0, s1, t1 ++ CMPLT $fcc1, s2, t2 ++ CMOVT s1, s1, t1, $fcc0 ++ CMOVT s2, s2, t2, $fcc1 ++ add.d X, X, INCX ++ blt $r0, I, .L16 ++ .align 3 ++ ++.L100: ++ CMPLT $fcc0, s1, s2 ++ CMPLT $fcc1, s3, s4 ++ CMOVT s1, s1, s2, $fcc0 ++ CMOVT s3, s3, s4, $fcc1 ++ CMPLT $fcc0, s1, s3 ++ CMOVT s1, s1, s3, $fcc0 ++ lu12i.w TEMP, 0x3f800 ++ movgr2fr.d a1, $r0 ++ movgr2fr.w ALPHA, TEMP ++ CMPEQ $fcc0, s1, a1 ++ fcvt.d.s ALPHA, ALPHA ++ bcnez $fcc0, .L999 ++ fdiv.d ALPHA, ALPHA, s1 ++ MOV max, s1 ++ MOV s1, a1 ++ MOV s2, a1 ++ MOV s3, a1 ++ MOV s4, a1 ++ srai.d I, N, 2 ++ bge $r0, I, .L105 ++ LD a1, XX, 0 * SIZE ++ LD a2, XX, 1 * SIZE ++ add.d XX, XX, INCX ++ LD a3, XX, 0 * SIZE ++ LD a4, XX, 1 * SIZE ++ add.d XX, XX, INCX ++ LD a5, XX, 0 * SIZE ++ LD a6, XX, 1 * SIZE ++ add.d XX, XX, INCX ++ LD a7, XX, 0 * SIZE ++ LD a8, XX, 1 * 
SIZE ++ addi.d I, I, -1 ++ add.d XX, XX, INCX ++ bge $r0, I, .L104 ++ .align 3 ++ ++.L103: ++ MUL t1, ALPHA, a1 ++ LD a1, XX, 0 * SIZE ++ MUL t2, ALPHA, a2 ++ addi.d I, I, -1 ++ MUL t3, ALPHA, a3 ++ LD a2, XX, 1 * SIZE ++ MUL t4, ALPHA, a4 ++ add.d XX, XX, INCX ++ MADD s1, t1, t1, s1 ++ LD a3, XX, 0 * SIZE ++ MADD s2, t2, t2, s2 ++ NOP ++ MADD s3, t3, t3, s3 ++ LD a4, XX, 1 * SIZE ++ MADD s4, t4, t4, s4 ++ add.d XX, XX, INCX ++ MUL t1, ALPHA, a5 ++ LD a5, XX, 0 * SIZE ++ MUL t2, ALPHA, a6 ++ NOP ++ MUL t3, ALPHA, a7 ++ LD a6, XX, 1 * SIZE ++ MUL t4, ALPHA, a8 ++ add.d XX, XX, INCX ++ MADD s1, t1, t1, s1 ++ LD a7, XX, 0 * SIZE ++ MADD s2, t2, t2, s2 ++ LD a8, XX, 1 * SIZE ++ MADD s3, t3, t3, s3 ++ add.d XX, XX, INCX ++ MADD s4, t4, t4, s4 ++ blt $r0, I, .L103 ++ .align 3 ++ ++.L104: ++ MUL t1, ALPHA, a1 ++ MUL t2, ALPHA, a2 ++ MUL t3, ALPHA, a3 ++ MUL t4, ALPHA, a4 ++ MADD s1, t1, t1, s1 ++ MADD s2, t2, t2, s2 ++ MADD s3, t3, t3, s3 ++ MADD s4, t4, t4, s4 ++ MUL t1, ALPHA, a5 ++ MUL t2, ALPHA, a6 ++ MUL t3, ALPHA, a7 ++ MUL t4, ALPHA, a8 ++ MADD s1, t1, t1, s1 ++ MADD s2, t2, t2, s2 ++ MADD s3, t3, t3, s3 ++ MADD s4, t4, t4, s4 ++ .align 3 ++ ++.L105: ++ andi I, N, 3 ++ bge $r0, I, .L998 ++ .align 3 ++ ++.L106: ++ LD a1, XX, 0 * SIZE ++ LD a2, XX, 1 * SIZE ++ addi.d I, I, -1 ++ MUL t1, ALPHA, a1 ++ MUL t2, ALPHA, a2 ++ MADD s1, t1, t1, s1 ++ add.d XX, XX, INCX ++ MADD s2, t2, t2, s2 ++ blt $r0, I, .L106 ++ .align 3 ++ ++.L998: ++ ADD s1, s1, s2 ++ ADD s3, s3, s4 ++ ADD s1, s1, s3 ++ fsqrt.d s1, s1 ++ move $r4, $r17 ++ MUL $f0, max, s1 ++ jirl $r0, $r1, 0x0 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S +new file mode 100644 +index 0000000..a12e527 +--- /dev/null ++++ b/kernel/loongarch64/zscal.S +@@ -0,0 +1,356 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define N $r4 ++#define X $r7 ++#define INCX $r8 ++#define I $r17 ++#define TEMP $r18 ++#define XX $r5 ++#define ALPHA_R $f0 ++#define ALPHA_I $f1 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f23 ++#define a4 $f9 ++#define a5 $f10 ++#define a6 $f11 ++#define a7 $f12 ++#define a8 $f13 ++#define t1 $f14 ++#define t2 $f15 ++#define t3 $f16 ++#define t4 $f17 ++ ++ PROLOGUE ++ ++ li.d TEMP, 2 * SIZE ++ MTC a1, $r0 ++ slli.d INCX, INCX, ZBASE_SHIFT ++ bge $r0, N, .L999 ++ CMPEQ $fcc0, ALPHA_R, a1 ++ CMPEQ $fcc1, ALPHA_I, a1 ++ bceqz $fcc0, .L50 ++ bceqz $fcc1, .L50 ++ srai.d I, N, 2 ++ bne INCX, TEMP, .L20 ++ bge $r0, I, .L15 ++ .align 3 ++ ++.L12: ++ ST a1, X, 0 * SIZE ++ ST a1, X, 1 * SIZE ++ ST a1, X, 2 * SIZE ++ ST a1, X, 3 * SIZE ++ ST a1, X, 4 * SIZE ++ ST a1, X, 5 * SIZE ++ ST a1, X, 6 * SIZE ++ ST a1, X, 7 * SIZE ++ addi.w I, I, -1 ++ addi.d X, X, 8 * SIZE ++ blt $r0, I, .L12 ++ .align 3 ++ ++.L15: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ .align 3 ++.L16: ++ ST a1, X, 0 * SIZE ++ ST a1, X, 1 * SIZE ++ addi.d I, I, -1 ++ addi.d X, X, 2 * SIZE ++ blt $r0, I, .L16 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ .align 3 ++ ++.L20: ++ srai.d I, N, 2 ++ bge $r0, I, .L25 ++ .align 3 ++ ++.L22: ++ ST a1, X, 0 * SIZE ++ ST a1, X, 1 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ ST a1, X, 1 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ ST a1, X, 1 * SIZE ++ add.d X, X, INCX ++ ST a1, X, 0 * SIZE ++ ST a1, X, 1 * SIZE ++ addi.d I, I, -1 ++ add.d X, X, INCX ++ blt $r0, I, .L22 ++ .align 3 ++ ++.L25: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ .align 3 ++.L26: ++ ST a1, X, 0 * SIZE ++ addi.d I, I, -1 ++ ST a1, X, 1 * SIZE ++ add.d X, X, INCX ++ blt $r0, I, .L26 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ .align 3 ++ ++.L50: ++ srai.d I, N, 2 ++ bne INCX, TEMP, .L60 ++ addi.d I, I, -1 ++ blt I, $r0, .L55 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ LD a3, X, 2 * SIZE ++ LD a4, X, 3 * SIZE ++ LD a5, X, 4 * SIZE ++ LD a6, X, 5 * SIZE ++ MUL t1, ALPHA_R, a1 ++ LD a7, X, 6 * SIZE ++ MUL t2, ALPHA_I, a1 ++ LD a8, X, 7 * SIZE ++ MUL t3, ALPHA_R, a3 ++ MUL t4, ALPHA_I, a3 ++ bge $r0, I, .L53 ++ .align 3 ++ ++.L52: ++ NMSUB t1, a2, ALPHA_I, t1 ++ LD a1, X, 8 * SIZE ++ MADD t2, a2, ALPHA_R, t2 ++ LD a2, X, 9 * SIZE ++ NMSUB t3, a4, ALPHA_I, t3 ++ LD a3, X, 10 * SIZE ++ MADD t4, a4, ALPHA_R, t4 ++ LD a4, X, 11 * SIZE ++ ST t1, X, 0 * SIZE ++ MUL t1, ALPHA_R, a5 ++ ST t2, X, 1 * SIZE ++ MUL t2, ALPHA_I, a5 ++ ST t3, X, 2 * SIZE ++ MUL t3, ALPHA_R, a7 ++ ST t4, X, 3 * SIZE ++ MUL t4, ALPHA_I, a7 ++ NMSUB t1, a6, ALPHA_I, t1 ++ LD a5, X, 12 * SIZE ++ MADD t2, a6, ALPHA_R, t2 ++ LD a6, X, 13 * SIZE ++ NMSUB t3, a8, ALPHA_I, t3 ++ LD a7, X, 14 * SIZE ++ MADD t4, a8, ALPHA_R, t4 ++ LD a8, X, 15 * SIZE ++ ST t1, X, 4 * SIZE ++ MUL t1, ALPHA_R, a1 ++ ST t2, X, 5 * SIZE ++ MUL t2, ALPHA_I, a1 ++ ST t3, X, 6 * SIZE ++ MUL t3, ALPHA_R, a3 ++ ST t4, X, 7 * SIZE ++ MUL t4, ALPHA_I, a3 ++ addi.d I, I, -1 ++ addi.d X, X, 8 * SIZE ++ blt $r0, I, .L52 ++ .align 3 ++ ++.L53: ++ NMSUB t1, a2, ALPHA_I, t1 ++ MADD t2, a2, ALPHA_R, t2 ++ NMSUB t3, a4, ALPHA_I, t3 ++ MADD t4, a4, ALPHA_R, t4 ++ ST t1, X, 0 * SIZE ++ MUL t1, ALPHA_R, a5 ++ ST t2, X, 1 * SIZE ++ MUL t2, ALPHA_I, a5 ++ ST t3, X, 2 * SIZE ++ MUL t3, ALPHA_R, a7 ++ ST t4, X, 3 * SIZE ++ MUL t4, ALPHA_I, a7 ++ NMSUB t1, a6, ALPHA_I, t1 ++ MADD t2, a6, ALPHA_R, t2 ++ NMSUB t3, a8, ALPHA_I, t3 ++ 
MADD t4, a8, ALPHA_R, t4 ++ ST t1, X, 4 * SIZE ++ ST t2, X, 5 * SIZE ++ ST t3, X, 6 * SIZE ++ ST t4, X, 7 * SIZE ++ addi.d X, X, 8 * SIZE ++ .align 3 ++ ++.L55: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ .align 3 ++.L56: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ MUL t1, ALPHA_R, a1 ++ MUL t2, ALPHA_I, a1 ++ NMSUB t1, a2, ALPHA_I, t1 ++ MADD t2, a2, ALPHA_R, t2 ++ addi.d X, X, 2 * SIZE ++ addi.d I, I, -1 ++ ST t1, X, -2 * SIZE ++ ST t2, X, -1 * SIZE ++ blt $r0, I, .L56 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ .align 3 ++ ++.L60: ++ srai.d I, N, 2 ++ move XX, X ++ addi.d I, I, -1 ++ blt I, $r0, .L65 ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a3, X, 0 * SIZE ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ LD a5, X, 0 * SIZE ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ MUL t1, ALPHA_R, a1 ++ LD a7, X, 0 * SIZE ++ MUL t2, ALPHA_I, a1 ++ LD a8, X, 1 * SIZE ++ MUL t3, ALPHA_R, a3 ++ add.d X, X, INCX ++ MUL t4, ALPHA_I, a3 ++ bge $r0, I, .L63 ++ .align 3 ++ ++.L62: ++ NMSUB t1, a2, ALPHA_I, t1 ++ LD a1, X, 0 * SIZE ++ MADD t2, a2, ALPHA_R, t2 ++ LD a2, X, 1 * SIZE ++ add.d X, X, INCX ++ NMSUB t3, a4, ALPHA_I, t3 ++ LD a3, X, 0 * SIZE ++ MADD t4, a4, ALPHA_R, t4 ++ LD a4, X, 1 * SIZE ++ add.d X, X, INCX ++ ST t1, XX, 0 * SIZE ++ MUL t1, ALPHA_R, a5 ++ ST t2, XX, 1 * SIZE ++ MUL t2, ALPHA_I, a5 ++ add.d XX, XX, INCX ++ ST t3, XX, 0 * SIZE ++ MUL t3, ALPHA_R, a7 ++ ST t4, XX, 1 * SIZE ++ MUL t4, ALPHA_I, a7 ++ add.d XX, XX, INCX ++ NMSUB t1, a6, ALPHA_I, t1 ++ LD a5, X, 0 * SIZE ++ MADD t2, a6, ALPHA_R, t2 ++ LD a6, X, 1 * SIZE ++ add.d X, X, INCX ++ NMSUB t3, a8, ALPHA_I, t3 ++ LD a7, X, 0 * SIZE ++ MADD t4, a8, ALPHA_R, t4 ++ LD a8, X, 1 * SIZE ++ add.d X, X, INCX ++ ST t1, XX, 0 * SIZE ++ MUL t1, ALPHA_R, a1 ++ ST t2, XX, 1 * SIZE ++ MUL t2, ALPHA_I, a1 ++ add.d XX, XX, INCX ++ ST t3, XX, 0 * SIZE ++ MUL t3, ALPHA_R, a3 ++ ST t4, XX, 1 * SIZE ++ MUL t4, ALPHA_I, a3 ++ addi.d I, I, -1 ++ add.d XX, XX, INCX ++ blt $r0, I, .L62 ++ .align 3 ++ ++.L63: ++ NMSUB t1, a2, ALPHA_I, t1 ++ MADD t2, a2, ALPHA_R, t2 ++ NMSUB t3, a4, ALPHA_I, t3 ++ MADD t4, a4, ALPHA_R, t4 ++ ST t1, XX, 0 * SIZE ++ MUL t1, ALPHA_R, a5 ++ ST t2, XX, 1 * SIZE ++ MUL t2, ALPHA_I, a5 ++ add.d XX, XX, INCX ++ ST t3, XX, 0 * SIZE ++ MUL t3, ALPHA_R, a7 ++ ST t4, XX, 1 * SIZE ++ MUL t4, ALPHA_I, a7 ++ add.d XX, XX, INCX ++ NMSUB t1, a6, ALPHA_I, t1 ++ MADD t2, a6, ALPHA_R, t2 ++ NMSUB t3, a8, ALPHA_I, t3 ++ MADD t4, a8, ALPHA_R, t4 ++ ST t1, XX, 0 * SIZE ++ ST t2, XX, 1 * SIZE ++ add.d XX, XX, INCX ++ ST t3, XX, 0 * SIZE ++ ST t4, XX, 1 * SIZE ++ add.d XX, XX, INCX ++ .align 3 ++ ++.L65: ++ andi I, N, 3 ++ bge $r0, I, .L999 ++ .align 3 ++.L66: ++ LD a1, X, 0 * SIZE ++ LD a2, X, 1 * SIZE ++ MUL t1, ALPHA_R, a1 ++ MUL t2, ALPHA_I, a1 ++ NMSUB t1, a2, ALPHA_I, t1 ++ MADD t2, a2, ALPHA_R, t2 ++ addi.d I, I, -1 ++ ST t1, X, 0 * SIZE ++ ST t2, X, 1 * SIZE ++ add.d X, X, INCX ++ blt $r0, I, .L66 ++ .align 3 ++ ++.L999: ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/ztrsm_kernel_LT.S b/kernel/loongarch64/ztrsm_kernel_LT.S +new file mode 100644 +index 0000000..26b1230 +--- /dev/null ++++ b/kernel/loongarch64/ztrsm_kernel_LT.S +@@ -0,0 +1,1344 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. 
++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define K $r6 ++#define A $r7 ++#define B $r8 ++#define C $r9 ++#define LDC $r10 ++#define OFFSET $r11 ++ ++#define AO $r12 ++#define BO $r13 ++#define I $r17 ++#define J $r18 ++#define L $r25 ++#define CO1 $r14 ++#define CO2 $r15 ++#define CO3 $r23 ++#define CO4 $r24 ++#define KK $r26 ++#define TEMP $r27 ++#define AORIG $r28 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f26 ++#define a4 $f27 ++#define b1 $f23 ++#define b2 $f9 ++#define b3 $f10 ++#define b4 $f11 ++#define b5 $f12 ++#define b6 $f13 ++#define b7 $f14 ++#define b8 $f15 ++#define a5 b8 ++#define c11 $f16 ++#define c12 $f17 ++#define c21 $f0 ++#define c22 $f1 ++#define c31 $f2 ++#define c32 $f3 ++#define c41 $f4 ++#define c42 $f5 ++#define c51 $f6 ++#define c52 $f7 ++#define c61 $f18 ++#define c62 $f19 ++#define c71 $f20 ++#define c72 $f21 ++#define c81 $f24 ++#define c82 $f25 ++ ++#ifndef CONJ ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 MADD ++#define MADD4 NMSUB ++#define MADD5 MSUB ++#define MADD6 MADD ++#define MADD7 NMSUB ++#define MADD8 MADD ++#else ++#if defined(LN) || defined(LT) ++#define MADD1 MADD ++#define MADD2 NMSUB ++#define MADD3 MADD ++#define MADD4 MADD ++#else ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 NMSUB ++#define MADD4 MADD ++#endif ++#define MADD5 MADD ++#define MADD6 MSUB ++#define MADD7 MADD ++#define MADD8 NMSUB ++#endif ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -128 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ SDARG $r28, $sp, 40 ++ fst.d $f24, $sp, 48 ++ fst.d $f25, $sp, 56 ++ fst.d $f26, $sp, 64 ++ fst.d $f27, $sp, 72 ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 88 ++ fst.d $f19, $sp, 96 ++ fst.d $f20, $sp, 104 ++ fst.d $f21, $sp, 112 ++#endif ++ slli.d LDC, LDC, ZBASE_SHIFT ++#ifdef LN ++ mul.w TEMP, M, K ++ slli.d TEMP, TEMP, ZBASE_SHIFT ++ add.d A, A, TEMP ++ slli.d TEMP, M, ZBASE_SHIFT ++ add.d C, 
C, TEMP ++#endif ++#ifdef RN ++ sub.d KK, $r0, OFFSET ++#endif ++#ifdef RT ++ mul.w TEMP, N, K ++ slli.d TEMP, TEMP, ZBASE_SHIFT ++ add.d B, B, TEMP ++ mul.w TEMP, N, LDC ++ add.d C, C, TEMP ++ sub.d KK, N, OFFSET ++#endif ++ srai.d J, N, 2 ++nop ++ bge $r0, J, .L20 ++.L10: ++#ifdef RT ++ slli.d TEMP, K, 2 + ZBASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 2 ++ sub.d C, C, TEMP ++#endif ++ move CO1, C ++MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ addi.d J, J, -1 ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ MOV c31, c11 ++ MOV c41, c11 ++ MOV c51, c11 ++ move I, M ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO4, LDC ++#endif ++MOV c61, c11 ++ bge $r0, I, .L19 ++ .align 3 ++ ++.L11: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, B, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, KK, 2 ++ MOV c32, c11 ++ LD b3, B, 2 * SIZE ++ MOV c42, c11 ++ LD b4, B, 3 * SIZE ++ MOV c52, c11 ++ LD b5, B, 4 * SIZE ++ MOV c62, c11 ++ LD b6, B, 8 * SIZE ++ MOV c72, c11 ++ LD b7, B, 12 * SIZE ++ MOV c82, c11 ++move BO, B ++ bge $r0, L, .L15 ++#else ++#ifdef LN ++ slli.d TEMP, K, ZBASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, ZBASE_SHIFT ++ slli.d TEMP, KK, 2 + ZBASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, TEMP, 2 ++ MOV c32, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c42, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c52, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c62, c11 ++ LD b6, BO, 8 * SIZE ++ MOV c72, c11 ++ LD b7, BO, 12 * SIZE ++ MOV c82, c11 ++ bge $r0, L, .L15 ++#endif ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ bge $r0, L, .L13 ++ .align 3 ++.L12: ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c51, b5, a1, c51 ++ MADD3 c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD1 c71, b3, a1, c71 ++ MADD3 c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD1 c51, b7, a4, c51 ++ MADD3 c61, b2, a4, c61 ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD1 c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD3 c21, b2, a3, c21 ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD2 c32, b3, a2, c32 ++ 
LD b3, BO, 22 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD1 c51, b5, a3, c51 ++ MADD3 c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD1 c71, b3, a3, c71 ++ MADD3 c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ addi.d L, L, -1 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD1 c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD3 c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ blt $r0, L, .L12 ++ .align 3 ++ ++.L13: ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c51, b5, a1, c51 ++ MADD3 c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD1 c71, b3, a1, c71 ++ MADD3 c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD1 c51, b7, a4, c51 ++ MADD3 c61, b2, a4, c61 ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD1 c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD3 c21, b2, a3, c21 ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD1 c51, b5, a3, c51 ++ MADD3 c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD1 c71, b3, a3, c71 ++ MADD3 c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD1 c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD3 c61, b2, a4, 
c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ .align 3 ++ ++.L15: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L18 ++ .align 3 ++.L16: ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c51, b5, a1, c51 ++ addi.d L, L, -1 ++ MADD3 c61, b2, a1, c61 ++ addi.d AO, AO, 2 * SIZE ++ MADD1 c71, b3, a1, c71 ++ addi.d BO, BO, 8 * SIZE ++ MADD3 c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 4 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L16 ++.L18: ++ ADD c11, c11, c22 ++ ADD c12, c12, c21 ++ ADD c31, c31, c42 ++ ADD c32, c32, c41 ++ ADD c51, c51, c62 ++ ADD c52, c52, c61 ++ ADD c71, c71, c82 ++ ADD c72, c72, c81 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -4 ++#endif ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c31, b3, c31 ++ SUB c32, b4, c32 ++ SUB c51, b5, c51 ++ SUB c52, b6, c52 ++ SUB c71, b7, c71 ++ SUB c72, b8, c72 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ LD b5, AO, 4 * SIZE ++ LD b6, AO, 5 * SIZE ++ LD b7, AO, 6 * SIZE ++ LD b8, AO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c31, b3, c31 ++ SUB c32, b4, c32 ++ SUB c51, b5, c51 ++ SUB c52, b6, c52 ++ SUB c71, b7, c71 ++ SUB c72, b8, c72 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MUL a3, b2, c32 ++ MUL a4, b2, c31 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++ MADD5 c31, c31, b1, a3 ++ MADD6 c32, c32, b1, a4 ++ MUL a1, b2, c52 ++ MUL a2, b2, c51 ++ MUL a3, b2, c72 ++ MUL a4, b2, c71 ++ MADD5 c51, c51, b1, a1 ++ MADD6 c52, c52, b1, a2 ++ MADD5 c71, c71, b1, a3 ++ MADD6 c72, c72, b1, a4 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++ NMSUB c31, c11, b3, c31 ++ MADD7 c32, c11, b4, c32 ++ NMSUB c51, c11, b5, c51 ++ MADD7 c52, c11, b6, c52 ++ NMSUB c71, c11, b7, c71 ++ MADD7 c72, c11, b8, c72 ++ MADD8 c31, c12, b4, c31 ++ NMSUB c32, c12, b3, c32 ++ MADD8 c51, c12, b6, c51 ++ NMSUB c52, c12, b5, c52 ++ MADD8 c71, c12, b8, c71 ++ NMSUB c72, c12, b7, c72 ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ LD b5, BO, 12 * SIZE ++ LD b6, BO, 13 * SIZE ++ LD b7, BO, 14 * SIZE ++ LD b8, BO, 15 * SIZE ++ MUL a1, b4, c32 ++ MUL a2, b4, c31 ++ 
MADD5 c31, c31, b3, a1 ++ MADD6 c32, c32, b3, a2 ++ NMSUB c51, c31, b5, c51 ++ MADD7 c52, c31, b6, c52 ++ NMSUB c71, c31, b7, c71 ++ MADD7 c72, c31, b8, c72 ++ MADD8 c51, c32, b6, c51 ++ NMSUB c52, c32, b5, c52 ++ MADD8 c71, c32, b8, c71 ++ NMSUB c72, c32, b7, c72 ++ LD b5, BO, 20 * SIZE ++ LD b6, BO, 21 * SIZE ++ LD b7, BO, 22 * SIZE ++ LD b8, BO, 23 * SIZE ++ MUL a1, b6, c52 ++ MUL a2, b6, c51 ++ MADD5 c51, c51, b5, a1 ++ MADD6 c52, c52, b5, a2 ++ NMSUB c71, c51, b7, c71 ++ MADD7 c72, c51, b8, c72 ++ MADD8 c71, c52, b8, c71 ++ NMSUB c72, c52, b7, c72 ++ LD b7, BO, 30 * SIZE ++ LD b8, BO, 31 * SIZE ++ MUL a1, b8, c72 ++ MUL a2, b8, c71 ++ MADD5 c71, c71, b7, a1 ++ MADD6 c72, c72, b7, a2 ++#endif ++#ifdef RT ++ LD b1, BO, 30 * SIZE ++ LD b2, BO, 31 * SIZE ++ LD b3, BO, 28 * SIZE ++ LD b4, BO, 29 * SIZE ++ LD b5, BO, 26 * SIZE ++ LD b6, BO, 27 * SIZE ++ LD b7, BO, 24 * SIZE ++ LD b8, BO, 25 * SIZE ++ MUL a1, b2, c72 ++ MUL a2, b2, c71 ++ MADD5 c71, c71, b1, a1 ++ MADD6 c72, c72, b1, a2 ++ NMSUB c51, c71, b3, c51 ++ MADD7 c52, c71, b4, c52 ++ NMSUB c31, c71, b5, c31 ++ MADD7 c32, c71, b6, c32 ++ NMSUB c11, c71, b7, c11 ++ MADD7 c12, c71, b8, c12 ++ MADD8 c51, c72, b4, c51 ++ NMSUB c52, c72, b3, c52 ++ MADD8 c31, c72, b6, c31 ++ NMSUB c32, c72, b5, c32 ++ MADD8 c11, c72, b8, c11 ++ NMSUB c12, c72, b7, c12 ++ LD b3, BO, 20 * SIZE ++ LD b4, BO, 21 * SIZE ++ LD b5, BO, 18 * SIZE ++ LD b6, BO, 19 * SIZE ++ LD b7, BO, 16 * SIZE ++ LD b8, BO, 17 * SIZE ++ MUL a1, b4, c52 ++ MUL a2, b4, c51 ++ MADD5 c51, c51, b3, a1 ++ MADD6 c52, c52, b3, a2 ++ NMSUB c31, c51, b5, c31 ++ MADD7 c32, c51, b6, c32 ++ NMSUB c11, c51, b7, c11 ++ MADD7 c12, c51, b8, c12 ++ MADD8 c31, c52, b6, c31 ++ NMSUB c32, c52, b5, c32 ++ MADD8 c11, c52, b8, c11 ++ NMSUB c12, c52, b7, c12 ++ LD b5, BO, 10 * SIZE ++ LD b6, BO, 11 * SIZE ++ LD b7, BO, 8 * SIZE ++ LD b8, BO, 9 * SIZE ++ MUL a1, b6, c32 ++ MUL a2, b6, c31 ++ MADD5 c31, c31, b5, a1 ++ MADD6 c32, c32, b5, a2 ++ NMSUB c11, c31, b7, c11 ++ MADD7 c12, c31, b8, c12 ++ MADD8 c11, c32, b8, c11 ++ NMSUB c12, c32, b7, c12 ++ LD b7, BO, 0 * SIZE ++ LD b8, BO, 1 * SIZE ++ MUL a1, b8, c12 ++ MUL a2, b8, c11 ++ MADD5 c11, c11, b7, a1 ++ MADD6 c12, c12, b7, a2 ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c12, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c32, BO, 3 * SIZE ++ ST c51, BO, 4 * SIZE ++ ST c52, BO, 5 * SIZE ++ ST c71, BO, 6 * SIZE ++ ST c72, BO, 7 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c32, AO, 3 * SIZE ++ ST c51, AO, 4 * SIZE ++ ST c52, AO, 5 * SIZE ++ ST c71, AO, 6 * SIZE ++ ST c72, AO, 7 * SIZE ++#endif ++#ifdef LN ++ addi.d CO1,CO1, -2 * SIZE ++ addi.d CO2,CO2, -2 * SIZE ++ addi.d CO3,CO3, -2 * SIZE ++ addi.d CO4,CO4, -2 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c31, CO2, 0 * SIZE ++ ST c32, CO2, 1 * SIZE ++ ST c51, CO3, 0 * SIZE ++ ST c52, CO3, 1 * SIZE ++ ST c71, CO4, 0 * SIZE ++ ST c72, CO4, 1 * SIZE ++#ifndef LN ++ addi.d CO1,CO1, 2 * SIZE ++ addi.d CO2,CO2, 2 * SIZE ++ addi.d CO3,CO3, 2 * SIZE ++ addi.d CO4,CO4, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, ZBASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++MTC c11, $r0 ++ addi.d I, I, -1 ++ MOV c21, c11 ++ MOV c31, c11 ++ MOV c41, c11 ++ MOV c51, c11 
++MOV c61, c11 ++ blt $r0, I, .L11 ++ .align 3 ++ ++.L19: ++#ifdef LN ++ slli.d TEMP, K, 2 + ZBASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 4 ++#endif ++#ifdef RT ++ addi.d KK, KK, -4 ++#endif ++ blt $r0, J, .L10 ++ .align 3 ++ ++.L20: ++ andi J, N, 2 ++ bge $r0, J, .L30 ++#ifdef RT ++ slli.d TEMP, K, 1 + ZBASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 1 ++ sub.d C, C, TEMP ++#endif ++MTC c11, $r0 ++ move CO1, C ++ add.d CO2, C, LDC ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO2, LDC ++#endif ++ move I, M ++ bge $r0, I, .L29 ++ .align 3 ++ ++.L21: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, B, 0 * SIZE ++ MOV c31, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c41, c11 ++ LD b2, B, 1 * SIZE ++ srai.d L, KK, 2 ++ LD b3, B, 2 * SIZE ++ MOV c12, c11 ++ LD b4, B, 3 * SIZE ++ MOV c22, c11 ++ LD b5, B, 4 * SIZE ++ MOV c32, c11 ++ MOV c42, c11 ++move BO, B ++ bge $r0, L, .L25 ++#else ++#ifdef LN ++ slli.d TEMP, K, ZBASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, ZBASE_SHIFT ++ slli.d TEMP, KK, 1 + ZBASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c31, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c41, c11 ++ LD b2, BO, 1 * SIZE ++ srai.d L, TEMP, 2 ++ LD b3, BO, 2 * SIZE ++ MOV c12, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c22, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c32, c11 ++MOV c42, c11 ++ bge $r0, L, .L25 ++#endif ++ .align 3 ++.L22: ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c11, b5, a1, c11 ++ LD a2, AO, 3 * SIZE ++ MADD3 c21, b2, a1, c21 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ LD a1, AO, 8 * SIZE ++ MADD2 c12, b5, a2, c12 ++ LD b5, BO, 12 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 9 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 10 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 11 * SIZE ++ MADD1 c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD3 c21, b2, a3, c21 ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ LD a3, AO, 6 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD1 c11, b5, a3, c11 ++ LD a2, AO, 7 * SIZE ++ MADD3 c21, b2, a3, c21 ++ addi.d AO, AO, 8 * SIZE ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ LD a3, AO, 4 * SIZE ++ MADD2 c12, b5, a2, c12 ++ LD b5, BO, 20 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 17 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 18 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 19 * SIZE ++addi.d BO, BO, 16 * SIZE ++ blt $r0, L, .L22 ++ .align 3 ++ ++.L25: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L28 ++ .align 3 ++.L26: ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD1 c31, b3, a1, c31 ++ addi.d BO, BO, 4 * SIZE ++ MADD3 c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, 
c12 ++ LD b1, BO, 0 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 1 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 2 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 3 * SIZE ++addi.d AO, AO, 2 * SIZE ++ blt $r0, L, .L26 ++.L28: ++ ADD c11, c11, c22 ++ ADD c12, c12, c21 ++ ADD c31, c31, c42 ++ ADD c32, c32, c41 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -2 ++#endif ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c31, b3, c31 ++ SUB c32, b4, c32 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c31, b3, c31 ++ SUB c32, b4, c32 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MUL a3, b2, c32 ++ MUL a4, b2, c31 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++ MADD5 c31, c31, b1, a3 ++ MADD6 c32, c32, b1, a4 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++ NMSUB c31, c11, b3, c31 ++ MADD7 c32, c11, b4, c32 ++ MADD8 c31, c12, b4, c31 ++ NMSUB c32, c12, b3, c32 ++ LD b3, BO, 6 * SIZE ++ LD b4, BO, 7 * SIZE ++ MUL a1, b4, c32 ++ MUL a2, b4, c31 ++ MADD5 c31, c31, b3, a1 ++ MADD6 c32, c32, b3, a2 ++#endif ++#ifdef RT ++ LD b5, BO, 6 * SIZE ++ LD b6, BO, 7 * SIZE ++ LD b7, BO, 4 * SIZE ++ LD b8, BO, 5 * SIZE ++ MUL a1, b6, c32 ++ MUL a2, b6, c31 ++ MADD5 c31, c31, b5, a1 ++ MADD6 c32, c32, b5, a2 ++ NMSUB c11, c31, b7, c11 ++ MADD7 c12, c31, b8, c12 ++ MADD8 c11, c32, b8, c11 ++ NMSUB c12, c32, b7, c12 ++ LD b7, BO, 0 * SIZE ++ LD b8, BO, 1 * SIZE ++ MUL a1, b8, c12 ++ MUL a2, b8, c11 ++ MADD5 c11, c11, b7, a1 ++ MADD6 c12, c12, b7, a2 ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c12, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c32, BO, 3 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c32, AO, 3 * SIZE ++#endif ++#ifdef LN ++ addi.d CO1,CO1, -2 * SIZE ++ addi.d CO2,CO2, -2 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c31, CO2, 0 * SIZE ++ ST c32, CO2, 1 * SIZE ++#ifndef LN ++ addi.d CO1,CO1, 2 * SIZE ++ addi.d CO2,CO2, 2 * SIZE ++#endif ++MTC c11, $r0 ++#ifdef RT ++ slli.d TEMP, K, ZBASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ addi.d I, I, -1 ++ blt $r0, I, .L21 ++ .align 3 ++ ++.L29: ++#ifdef LN ++ slli.d TEMP, K, 1 + ZBASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 2 ++#endif ++#ifdef RT ++ addi.d KK, KK, -2 ++#endif ++ .align 3 ++ ++.L30: ++ andi J, N, 1 ++ bge $r0, J, .L999 ++#ifdef RT ++ slli.d TEMP, K, ZBASE_SHIFT ++ sub.d B, B, TEMP ++ sub.d C, C, LDC ++#endif ++MTC c11, $r0 ++ move CO1, C ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move 
AO, A ++#endif ++#ifndef RT ++ add.d C, CO1, LDC ++#endif ++ move I, M ++ bge $r0, I, .L39 ++ .align 3 ++ ++.L31: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, B, 0 * SIZE ++ MOV c31, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c41, c11 ++ LD b2, B, 1 * SIZE ++ MOV c12, c11 ++ srai.d L, KK, 2 ++ MOV c22, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c32, c11 ++ LD b3, B, 4 * SIZE ++ MOV c42, c11 ++move BO, B ++ bge $r0, L, .L35 ++#else ++#ifdef LN ++ slli.d TEMP, K, ZBASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d TEMP, KK, ZBASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c31, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c41, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c12, c11 ++ srai.d L, TEMP, 2 ++ MOV c22, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c32, c11 ++ LD b3, BO, 4 * SIZE ++MOV c42, c11 ++ bge $r0, L, .L35 ++#endif ++ .align 3 ++.L32: ++ MADD1 c11, b1, a1, c11 ++ LD b4, BO, 3 * SIZE ++ MADD3 c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD a2, AO, 3 * SIZE ++ MADD1 c11, b1, a1, c11 ++ LD b2, BO, 5 * SIZE ++ MADD3 c21, b4, a1, c21 ++ LD a1, AO, 8 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD4 c22, b4, a2, c22 ++ LD a2, AO, 5 * SIZE ++ MADD1 c11, b3, a3, c11 ++ LD b4, BO, 7 * SIZE ++ MADD3 c21, b2, a3, c21 ++ LD a3, AO, 6 * SIZE ++ MADD2 c12, b3, a2, c12 ++ LD b3, BO, 6 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD a2, AO, 7 * SIZE ++ MADD1 c11, b3, a3, c11 ++ LD b2, BO, 9 * SIZE ++ MADD3 c21, b4, a3, c21 ++ LD a3, AO, 12 * SIZE ++ MADD2 c12, b3, a2, c12 ++ LD b3, BO, 12 * SIZE ++ MADD4 c22, b4, a2, c22 ++ LD a2, AO, 9 * SIZE ++ addi.d AO, AO, 8 * SIZE ++ addi.d L, L, -1 ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L32 ++ .align 3 ++ ++.L35: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L38 ++ .align 3 ++.L36: ++ MADD1 c11, b1, a1, c11 ++ addi.d L, L, -1 ++ MADD3 c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD a2, AO, 3 * SIZE ++ LD b2, BO, 3 * SIZE ++ addi.d BO, BO, 2 * SIZE ++addi.d AO, AO, 2 * SIZE ++ blt $r0, L, .L36 ++.L38: ++ ADD c11, c11, c22 ++ ADD c12, c12, c21 ++#if defined(LN) || defined(RT) ++ addi.d TEMP, KK, -1 ++ slli.d TEMP, TEMP, ZBASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++#endif ++#if defined(RN) || defined(RT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c12, BO, 1 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++#endif ++#ifdef LN ++ addi.d CO1,CO1, -2 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++#ifndef LN ++ addi.d CO1,CO1, 2 * SIZE ++#endif ++MTC c11, $r0 ++#ifdef RT ++ slli.d TEMP, K, ZBASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d TEMP, TEMP, ZBASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, 
BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ addi.d I, I, -1 ++ blt $r0, I, .L31 ++ .align 3 ++ ++.L39: ++#ifdef LN ++ slli.d TEMP, K, ZBASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 1 ++#endif ++#ifdef RT ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 16 ++ LDARG $r26, $sp, 24 ++ LDARG $r27, $sp, 32 ++ LDARG $r28, $sp, 40 ++ fld.d $f24, $sp, 48 ++ fld.d $f25, $sp, 56 ++ fld.d $f26, $sp, 64 ++ fld.d $f27, $sp, 72 ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 88 ++ fld.d $f19, $sp, 96 ++ fld.d $f20, $sp, 104 ++ fld.d $f21, $sp, 112 ++#endif ++ addi.d $sp, $sp, 128 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ ++ EPILOGUE +diff --git a/kernel/loongarch64/ztrsm_kernel_RT.S b/kernel/loongarch64/ztrsm_kernel_RT.S +new file mode 100644 +index 0000000..e9f0436 +--- /dev/null ++++ b/kernel/loongarch64/ztrsm_kernel_RT.S +@@ -0,0 +1,1343 @@ ++/*************************************************************************** ++Copyright (c) 2021, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++#define M $r4 ++#define N $r5 ++#define K $r6 ++#define A $r7 ++#define B $r8 ++#define C $r9 ++#define LDC $r10 ++#define OFFSET $r11 ++ ++#define AO $r12 ++#define BO $r13 ++#define I $r17 ++#define J $r18 ++#define L $r25 ++#define CO1 $r14 ++#define CO2 $r15 ++#define CO3 $r23 ++#define CO4 $r24 ++#define KK $r26 ++#define TEMP $r27 ++#define AORIG $r28 ++#define a1 $f22 ++#define a2 $f8 ++#define a3 $f26 ++#define a4 $f27 ++#define b1 $f23 ++#define b2 $f9 ++#define b3 $f10 ++#define b4 $f11 ++#define b5 $f12 ++#define b6 $f13 ++#define b7 $f14 ++#define b8 $f15 ++#define a5 b8 ++#define c11 $f16 ++#define c12 $f17 ++#define c21 $f0 ++#define c22 $f1 ++#define c31 $f2 ++#define c32 $f3 ++#define c41 $f4 ++#define c42 $f5 ++#define c51 $f6 ++#define c52 $f7 ++#define c61 $f18 ++#define c62 $f19 ++#define c71 $f20 ++#define c72 $f21 ++#define c81 $f24 ++#define c82 $f25 ++ ++#ifndef CONJ ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 MADD ++#define MADD4 NMSUB ++#define MADD5 MSUB ++#define MADD6 MADD ++#define MADD7 NMSUB ++#define MADD8 MADD ++#else ++#if defined(LN) || defined(LT) ++#define MADD1 MADD ++#define MADD2 NMSUB ++#define MADD3 MADD ++#define MADD4 MADD ++#else ++#define MADD1 MADD ++#define MADD2 MADD ++#define MADD3 NMSUB ++#define MADD4 MADD ++#endif ++#define MADD5 MADD ++#define MADD6 MSUB ++#define MADD7 MADD ++#define MADD8 NMSUB ++#endif ++ ++ PROLOGUE ++ ++ addi.d $sp, $sp, -128 ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ SDARG $r28, $sp, 40 ++ fst.d $f24, $sp, 48 ++ fst.d $f25, $sp, 56 ++ fst.d $f26, $sp, 64 ++ fst.d $f27, $sp, 72 ++#ifndef __64BIT__ ++ fst.d $f18, $sp, 88 ++ fst.d $f19, $sp, 96 ++ fst.d $f20, $sp, 104 ++ fst.d $f21, $sp, 112 ++#endif ++ slli.d LDC, LDC, ZBASE_SHIFT ++#ifdef LN ++ mul.w TEMP, M, K ++ slli.d TEMP, TEMP, ZBASE_SHIFT ++ add.d A, A, TEMP ++ slli.d TEMP, M, ZBASE_SHIFT ++ add.d C, C, TEMP ++#endif ++#ifdef RN ++ sub.d KK, $r0, OFFSET ++#endif ++#ifdef RT ++ mul.w TEMP, N, K ++ slli.d TEMP, TEMP, ZBASE_SHIFT ++ add.d B, B, TEMP ++ mul.w TEMP, N, LDC ++ add.d C, C, TEMP ++ sub.d KK, N, OFFSET ++#endif ++ andi J, N, 1 ++ bge $r0, J, .L20 ++#ifdef RT ++ slli.d TEMP, K, ZBASE_SHIFT ++ sub.d B, B, TEMP ++ sub.d C, C, LDC ++#endif ++MTC c11, $r0 ++ move CO1, C ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO1, LDC ++#endif ++ move I, M ++ bge $r0, I, .L39 ++ .align 3 ++ ++.L31: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, B, 0 * SIZE ++ MOV c31, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c41, c11 ++ LD b2, B, 1 * SIZE ++ MOV c12, c11 ++ srai.d L, KK, 2 ++ MOV c22, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c32, c11 ++ LD b3, B, 4 * SIZE ++ MOV c42, c11 ++move BO, B ++ bge $r0, L, .L35 ++#else ++#ifdef LN ++ slli.d TEMP, K, ZBASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d TEMP, KK, ZBASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c31, c11 ++ LD a2, AO, 1 * SIZE ++ MOV c41, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c12, c11 ++ srai.d L, TEMP, 2 ++ MOV c22, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c32, c11 ++ LD b3, BO, 4 * SIZE ++MOV c42, c11 ++ bge $r0, L, .L35 ++#endif ++ 
.align 3 ++.L32: ++ MADD1 c11, b1, a1, c11 ++ LD b4, BO, 3 * SIZE ++ MADD3 c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD a2, AO, 3 * SIZE ++ MADD1 c11, b1, a1, c11 ++ LD b2, BO, 5 * SIZE ++ MADD3 c21, b4, a1, c21 ++ LD a1, AO, 8 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD4 c22, b4, a2, c22 ++ LD a2, AO, 5 * SIZE ++ MADD1 c11, b3, a3, c11 ++ LD b4, BO, 7 * SIZE ++ MADD3 c21, b2, a3, c21 ++ LD a3, AO, 6 * SIZE ++ MADD2 c12, b3, a2, c12 ++ LD b3, BO, 6 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD a2, AO, 7 * SIZE ++ MADD1 c11, b3, a3, c11 ++ LD b2, BO, 9 * SIZE ++ MADD3 c21, b4, a3, c21 ++ LD a3, AO, 12 * SIZE ++ MADD2 c12, b3, a2, c12 ++ LD b3, BO, 12 * SIZE ++ MADD4 c22, b4, a2, c22 ++ LD a2, AO, 9 * SIZE ++ addi.d AO, AO, 8 * SIZE ++ addi.d L, L, -1 ++addi.d BO, BO, 8 * SIZE ++ blt $r0, L, .L32 ++ .align 3 ++ ++.L35: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L38 ++ .align 3 ++.L36: ++ MADD1 c11, b1, a1, c11 ++ addi.d L, L, -1 ++ MADD3 c21, b2, a1, c21 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 2 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD a2, AO, 3 * SIZE ++ LD b2, BO, 3 * SIZE ++ addi.d BO, BO, 2 * SIZE ++addi.d AO, AO, 2 * SIZE ++ blt $r0, L, .L36 ++.L38: ++ ADD c11, c11, c22 ++ ADD c12, c12, c21 ++#if defined(LN) || defined(RT) ++ addi.d TEMP, KK, -1 ++ slli.d TEMP, TEMP, ZBASE_SHIFT ++ add.d AO, AORIG, TEMP ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++#endif ++#if defined(RN) || defined(RT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c12, BO, 1 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++#endif ++#ifdef LN ++ addi.d CO1,CO1, -2 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++#ifndef LN ++ addi.d CO1,CO1, 2 * SIZE ++#endif ++MTC c11, $r0 ++#ifdef RT ++ slli.d TEMP, K, ZBASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d TEMP, TEMP, ZBASE_SHIFT ++ add.d AO, AO, TEMP ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ addi.d I, I, -1 ++ blt $r0, I, .L31 ++ .align 3 ++ ++.L39: ++#ifdef LN ++ slli.d TEMP, K, ZBASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 1 ++#endif ++#ifdef RT ++ addi.d KK, KK, -1 ++#endif ++ .align 3 ++ ++.L20: ++ andi J, N, 2 ++ bge $r0, J, .L30 ++#ifdef RT ++ slli.d TEMP, K, 1 + ZBASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 1 ++ sub.d C, C, TEMP ++#endif ++MTC c11, $r0 ++ move CO1, C ++ add.d CO2, C, LDC ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO2, LDC ++#endif ++ move I, M ++ bge $r0, I, .L29 ++ .align 3 ++ ++.L21: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c21, 
c11 ++ LD b1, B, 0 * SIZE ++ MOV c31, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c41, c11 ++ LD b2, B, 1 * SIZE ++ srai.d L, KK, 2 ++ LD b3, B, 2 * SIZE ++ MOV c12, c11 ++ LD b4, B, 3 * SIZE ++ MOV c22, c11 ++ LD b5, B, 4 * SIZE ++ MOV c32, c11 ++ MOV c42, c11 ++move BO, B ++ bge $r0, L, .L25 ++#else ++#ifdef LN ++ slli.d TEMP, K, ZBASE_SHIFT ++ sub.d AORIG, AORIG, TEMP ++#endif ++ slli.d L, KK, ZBASE_SHIFT ++ slli.d TEMP, KK, 1 + ZBASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c21, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c31, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c41, c11 ++ LD b2, BO, 1 * SIZE ++ srai.d L, TEMP, 2 ++ LD b3, BO, 2 * SIZE ++ MOV c12, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c22, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c32, c11 ++MOV c42, c11 ++ bge $r0, L, .L25 ++#endif ++ .align 3 ++.L22: ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c11, b5, a1, c11 ++ LD a2, AO, 3 * SIZE ++ MADD3 c21, b2, a1, c21 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ LD a1, AO, 8 * SIZE ++ MADD2 c12, b5, a2, c12 ++ LD b5, BO, 12 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 9 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 10 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 11 * SIZE ++ MADD1 c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD3 c21, b2, a3, c21 ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ LD a3, AO, 6 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD1 c11, b5, a3, c11 ++ LD a2, AO, 7 * SIZE ++ MADD3 c21, b2, a3, c21 ++ addi.d AO, AO, 8 * SIZE ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ LD a3, AO, 4 * SIZE ++ MADD2 c12, b5, a2, c12 ++ LD b5, BO, 20 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 17 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 18 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 19 * SIZE ++addi.d BO, BO, 16 * SIZE ++ blt $r0, L, .L22 ++ .align 3 ++ ++.L25: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L28 ++ .align 3 ++.L26: ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD1 c31, b3, a1, c31 ++ addi.d BO, BO, 4 * SIZE ++ MADD3 c41, b4, a1, c41 ++ LD a1, AO, 2 * SIZE ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 0 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 1 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 2 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 3 * SIZE ++addi.d AO, AO, 2 * SIZE ++ blt $r0, L, .L26 ++.L28: ++ ADD c11, c11, c22 ++ ADD c12, c12, c21 ++ ADD c31, c31, c42 ++ ADD c32, c32, c41 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -2 ++#endif ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c31, b3, c31 ++ SUB c32, b4, c32 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ SUB 
c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c31, b3, c31 ++ SUB c32, b4, c32 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MUL a3, b2, c32 ++ MUL a4, b2, c31 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++ MADD5 c31, c31, b1, a3 ++ MADD6 c32, c32, b1, a4 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++ NMSUB c31, c11, b3, c31 ++ MADD7 c32, c11, b4, c32 ++ MADD8 c31, c12, b4, c31 ++ NMSUB c32, c12, b3, c32 ++ LD b3, BO, 6 * SIZE ++ LD b4, BO, 7 * SIZE ++ MUL a1, b4, c32 ++ MUL a2, b4, c31 ++ MADD5 c31, c31, b3, a1 ++ MADD6 c32, c32, b3, a2 ++#endif ++#ifdef RT ++ LD b5, BO, 6 * SIZE ++ LD b6, BO, 7 * SIZE ++ LD b7, BO, 4 * SIZE ++ LD b8, BO, 5 * SIZE ++ MUL a1, b6, c32 ++ MUL a2, b6, c31 ++ MADD5 c31, c31, b5, a1 ++ MADD6 c32, c32, b5, a2 ++ NMSUB c11, c31, b7, c11 ++ MADD7 c12, c31, b8, c12 ++ MADD8 c11, c32, b8, c11 ++ NMSUB c12, c32, b7, c12 ++ LD b7, BO, 0 * SIZE ++ LD b8, BO, 1 * SIZE ++ MUL a1, b8, c12 ++ MUL a2, b8, c11 ++ MADD5 c11, c11, b7, a1 ++ MADD6 c12, c12, b7, a2 ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c12, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c32, BO, 3 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c32, AO, 3 * SIZE ++#endif ++#ifdef LN ++ addi.d CO1,CO1, -2 * SIZE ++ addi.d CO2,CO2, -2 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c31, CO2, 0 * SIZE ++ ST c32, CO2, 1 * SIZE ++#ifndef LN ++ addi.d CO1,CO1, 2 * SIZE ++ addi.d CO2,CO2, 2 * SIZE ++#endif ++MTC c11, $r0 ++#ifdef RT ++ slli.d TEMP, K, ZBASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 1 + ZBASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++ addi.d I, I, -1 ++ blt $r0, I, .L21 ++ .align 3 ++ ++.L29: ++#ifdef LN ++ slli.d TEMP, K, 1 + ZBASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 2 ++#endif ++#ifdef RT ++ addi.d KK, KK, -2 ++#endif ++ .align 3 ++ ++.L30: ++ srai.d J, N, 2 ++nop ++ bge $r0, J, .L999 ++.L10: ++#ifdef RT ++ slli.d TEMP, K, 2 + ZBASE_SHIFT ++ sub.d B, B, TEMP ++ slli.d TEMP, LDC, 2 ++ sub.d C, C, TEMP ++#endif ++ move CO1, C ++MTC c11, $r0 ++ add.d CO2, C, LDC ++ add.d CO3, CO2, LDC ++ addi.d J, J, -1 ++ add.d CO4, CO3, LDC ++ MOV c21, c11 ++ MOV c31, c11 ++ MOV c41, c11 ++ MOV c51, c11 ++ move I, M ++#ifdef LN ++ add.d KK, M, OFFSET ++#endif ++#ifdef LT ++ move KK, OFFSET ++#endif ++#if defined(LN) || defined(RT) ++ move AORIG, A ++#else ++ move AO, A ++#endif ++#ifndef RT ++ add.d C, CO4, LDC ++#endif ++MOV c61, c11 ++ bge $r0, I, .L19 ++ .align 3 ++ ++.L11: ++#if defined(LT) || defined(RN) ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, B, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, B, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, KK, 2 ++ MOV c32, c11 ++ LD b3, B, 2 * SIZE ++ MOV c42, c11 ++ LD b4, B, 3 * SIZE ++ MOV c52, c11 ++ LD b5, B, 4 * SIZE ++ MOV c62, c11 ++ LD b6, B, 8 * SIZE ++ MOV c72, c11 ++ LD b7, B, 12 * SIZE ++ MOV c82, c11 ++move BO, B ++ bge $r0, L, .L15 ++#else ++#ifdef LN ++ slli.d TEMP, K, ZBASE_SHIFT ++ sub.d AORIG, AORIG, TEMP 
++#endif ++ slli.d L, KK, ZBASE_SHIFT ++ slli.d TEMP, KK, 2 + ZBASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++ sub.d TEMP, K, KK ++ LD a1, AO, 0 * SIZE ++ MOV c71, c11 ++ LD b1, BO, 0 * SIZE ++ MOV c81, c11 ++ LD a3, AO, 4 * SIZE ++ MOV c12, c11 ++ LD b2, BO, 1 * SIZE ++ MOV c22, c11 ++ srai.d L, TEMP, 2 ++ MOV c32, c11 ++ LD b3, BO, 2 * SIZE ++ MOV c42, c11 ++ LD b4, BO, 3 * SIZE ++ MOV c52, c11 ++ LD b5, BO, 4 * SIZE ++ MOV c62, c11 ++ LD b6, BO, 8 * SIZE ++ MOV c72, c11 ++ LD b7, BO, 12 * SIZE ++ MOV c82, c11 ++ bge $r0, L, .L15 ++#endif ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ addi.d L, L, -1 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ bge $r0, L, .L13 ++ .align 3 ++.L12: ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c51, b5, a1, c51 ++ MADD3 c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD1 c71, b3, a1, c71 ++ MADD3 c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD1 c51, b7, a4, c51 ++ MADD3 c61, b2, a4, c61 ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD1 c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD3 c21, b2, a3, c21 ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD1 c51, b5, a3, c51 ++ MADD3 c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD1 c71, b3, a3, c71 ++ MADD3 c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ addi.d L, L, -1 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD1 c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD3 c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ blt $r0, L, .L12 ++ .align 3 ++ ++.L13: ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 16 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, 
BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c51, b5, a1, c51 ++ MADD3 c61, b2, a1, c61 ++ LD a4, AO, 2 * SIZE ++ MADD1 c71, b3, a1, c71 ++ MADD3 c81, b4, a1, c81 ++ LD a1, AO, 8 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 20 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 9 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 10 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 11 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 3 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 24 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 13 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 14 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 15 * SIZE ++ MADD1 c51, b7, a4, c51 ++ MADD3 c61, b2, a4, c61 ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 28 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 17 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 18 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 19 * SIZE ++ MADD1 c11, b1, a3, c11 ++ LD a2, AO, 5 * SIZE ++ MADD3 c21, b2, a3, c21 ++ MADD1 c31, b3, a3, c31 ++ MADD3 c41, b4, a3, c41 ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 32 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 21 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 22 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 23 * SIZE ++ MADD1 c51, b5, a3, c51 ++ MADD3 c61, b2, a3, c61 ++ LD a4, AO, 6 * SIZE ++ MADD1 c71, b3, a3, c71 ++ MADD3 c81, b4, a3, c81 ++ LD a3, AO, 12 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 36 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 25 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 26 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 27 * SIZE ++ MADD1 c11, b6, a4, c11 ++ LD a2, AO, 7 * SIZE ++ MADD3 c21, b2, a4, c21 ++ MADD1 c31, b3, a4, c31 ++ MADD3 c41, b4, a4, c41 ++ MADD2 c12, b6, a2, c12 ++ LD b6, BO, 40 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 29 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 30 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 31 * SIZE ++ MADD1 c51, b7, a4, c51 ++ addi.d BO, BO, 32 * SIZE ++ MADD3 c61, b2, a4, c61 ++ addi.d AO, AO, 8 * SIZE ++ MADD1 c71, b3, a4, c71 ++ MADD3 c81, b4, a4, c81 ++ MADD2 c52, b7, a2, c52 ++ LD b7, BO, 12 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ .align 3 ++ ++.L15: ++#if defined(LT) || defined(RN) ++ andi L, KK, 3 ++#else ++ andi L, TEMP, 3 ++#endif ++ bge $r0, L, .L18 ++ .align 3 ++.L16: ++ MADD1 c11, b1, a1, c11 ++ LD a2, AO, 1 * SIZE ++ MADD3 c21, b2, a1, c21 ++ MADD1 c31, b3, a1, c31 ++ MADD3 c41, b4, a1, c41 ++ MADD2 c12, b1, a2, c12 ++ LD b1, BO, 8 * SIZE ++ MADD4 c22, b2, a2, c22 ++ LD b2, BO, 5 * SIZE ++ MADD2 c32, b3, a2, c32 ++ LD b3, BO, 6 * SIZE ++ MADD4 c42, b4, a2, c42 ++ LD b4, BO, 7 * SIZE ++ MADD1 c51, b5, a1, c51 ++ addi.d L, L, -1 ++ MADD3 c61, b2, a1, c61 ++ addi.d AO, AO, 2 * SIZE ++ MADD1 c71, b3, a1, c71 ++ addi.d BO, BO, 8 * SIZE ++ MADD3 c81, b4, a1, c81 ++ LD a1, AO, 0 * SIZE ++ MADD2 c52, b5, a2, c52 ++ LD b5, BO, 4 * SIZE ++ MADD4 c62, b2, a2, c62 ++ LD b2, BO, 1 * SIZE ++ MADD2 c72, b3, a2, c72 ++ LD b3, BO, 2 * SIZE ++ MADD4 c82, b4, a2, c82 ++ LD b4, BO, 3 * SIZE ++ blt $r0, L, .L16 ++.L18: ++ ADD c11, c11, c22 ++ ADD c12, c12, c21 ++ ADD c31, c31, c42 ++ ADD c32, c32, c41 ++ ADD c51, c51, c62 ++ ADD c52, c52, c61 ++ ADD c71, c71, c82 ++ ADD c72, c72, c81 ++#if defined(LN) || defined(RT) 
++#ifdef LN ++ addi.d TEMP, KK, -1 ++#else ++ addi.d TEMP, KK, -4 ++#endif ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT ++ add.d AO, AORIG, L ++ add.d BO, B, TEMP ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c31, b3, c31 ++ SUB c32, b4, c32 ++ SUB c51, b5, c51 ++ SUB c52, b6, c52 ++ SUB c71, b7, c71 ++ SUB c72, b8, c72 ++#else ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ LD b3, AO, 2 * SIZE ++ LD b4, AO, 3 * SIZE ++ LD b5, AO, 4 * SIZE ++ LD b6, AO, 5 * SIZE ++ LD b7, AO, 6 * SIZE ++ LD b8, AO, 7 * SIZE ++ SUB c11, b1, c11 ++ SUB c12, b2, c12 ++ SUB c31, b3, c31 ++ SUB c32, b4, c32 ++ SUB c51, b5, c51 ++ SUB c52, b6, c52 ++ SUB c71, b7, c71 ++ SUB c72, b8, c72 ++#endif ++#if defined(LN) || defined(LT) ++ LD b1, AO, 0 * SIZE ++ LD b2, AO, 1 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MUL a3, b2, c32 ++ MUL a4, b2, c31 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++ MADD5 c31, c31, b1, a3 ++ MADD6 c32, c32, b1, a4 ++ MUL a1, b2, c52 ++ MUL a2, b2, c51 ++ MUL a3, b2, c72 ++ MUL a4, b2, c71 ++ MADD5 c51, c51, b1, a1 ++ MADD6 c52, c52, b1, a2 ++ MADD5 c71, c71, b1, a3 ++ MADD6 c72, c72, b1, a4 ++#endif ++#ifdef RN ++ LD b1, BO, 0 * SIZE ++ LD b2, BO, 1 * SIZE ++ LD b3, BO, 2 * SIZE ++ LD b4, BO, 3 * SIZE ++ LD b5, BO, 4 * SIZE ++ LD b6, BO, 5 * SIZE ++ LD b7, BO, 6 * SIZE ++ LD b8, BO, 7 * SIZE ++ MUL a1, b2, c12 ++ MUL a2, b2, c11 ++ MADD5 c11, c11, b1, a1 ++ MADD6 c12, c12, b1, a2 ++ NMSUB c31, c11, b3, c31 ++ MADD7 c32, c11, b4, c32 ++ NMSUB c51, c11, b5, c51 ++ MADD7 c52, c11, b6, c52 ++ NMSUB c71, c11, b7, c71 ++ MADD7 c72, c11, b8, c72 ++ MADD8 c31, c12, b4, c31 ++ NMSUB c32, c12, b3, c32 ++ MADD8 c51, c12, b6, c51 ++ NMSUB c52, c12, b5, c52 ++ MADD8 c71, c12, b8, c71 ++ NMSUB c72, c12, b7, c72 ++ LD b3, BO, 10 * SIZE ++ LD b4, BO, 11 * SIZE ++ LD b5, BO, 12 * SIZE ++ LD b6, BO, 13 * SIZE ++ LD b7, BO, 14 * SIZE ++ LD b8, BO, 15 * SIZE ++ MUL a1, b4, c32 ++ MUL a2, b4, c31 ++ MADD5 c31, c31, b3, a1 ++ MADD6 c32, c32, b3, a2 ++ NMSUB c51, c31, b5, c51 ++ MADD7 c52, c31, b6, c52 ++ NMSUB c71, c31, b7, c71 ++ MADD7 c72, c31, b8, c72 ++ MADD8 c51, c32, b6, c51 ++ NMSUB c52, c32, b5, c52 ++ MADD8 c71, c32, b8, c71 ++ NMSUB c72, c32, b7, c72 ++ LD b5, BO, 20 * SIZE ++ LD b6, BO, 21 * SIZE ++ LD b7, BO, 22 * SIZE ++ LD b8, BO, 23 * SIZE ++ MUL a1, b6, c52 ++ MUL a2, b6, c51 ++ MADD5 c51, c51, b5, a1 ++ MADD6 c52, c52, b5, a2 ++ NMSUB c71, c51, b7, c71 ++ MADD7 c72, c51, b8, c72 ++ MADD8 c71, c52, b8, c71 ++ NMSUB c72, c52, b7, c72 ++ LD b7, BO, 30 * SIZE ++ LD b8, BO, 31 * SIZE ++ MUL a1, b8, c72 ++ MUL a2, b8, c71 ++ MADD5 c71, c71, b7, a1 ++ MADD6 c72, c72, b7, a2 ++#endif ++#ifdef RT ++ LD b1, BO, 30 * SIZE ++ LD b2, BO, 31 * SIZE ++ LD b3, BO, 28 * SIZE ++ LD b4, BO, 29 * SIZE ++ LD b5, BO, 26 * SIZE ++ LD b6, BO, 27 * SIZE ++ LD b7, BO, 24 * SIZE ++ LD b8, BO, 25 * SIZE ++ MUL a1, b2, c72 ++ MUL a2, b2, c71 ++ MADD5 c71, c71, b1, a1 ++ MADD6 c72, c72, b1, a2 ++ NMSUB c51, c71, b3, c51 ++ MADD7 c52, c71, b4, c52 ++ NMSUB c31, c71, b5, c31 ++ MADD7 c32, c71, b6, c32 ++ NMSUB c11, c71, b7, c11 ++ MADD7 c12, c71, b8, c12 ++ MADD8 c51, c72, b4, c51 ++ NMSUB c52, c72, b3, c52 ++ MADD8 c31, c72, b6, c31 ++ NMSUB c32, c72, b5, c32 ++ MADD8 c11, c72, b8, c11 ++ NMSUB c12, c72, b7, c12 ++ LD b3, BO, 20 * SIZE ++ LD b4, BO, 21 * SIZE ++ LD 
b5, BO, 18 * SIZE ++ LD b6, BO, 19 * SIZE ++ LD b7, BO, 16 * SIZE ++ LD b8, BO, 17 * SIZE ++ MUL a1, b4, c52 ++ MUL a2, b4, c51 ++ MADD5 c51, c51, b3, a1 ++ MADD6 c52, c52, b3, a2 ++ NMSUB c31, c51, b5, c31 ++ MADD7 c32, c51, b6, c32 ++ NMSUB c11, c51, b7, c11 ++ MADD7 c12, c51, b8, c12 ++ MADD8 c31, c52, b6, c31 ++ NMSUB c32, c52, b5, c32 ++ MADD8 c11, c52, b8, c11 ++ NMSUB c12, c52, b7, c12 ++ LD b5, BO, 10 * SIZE ++ LD b6, BO, 11 * SIZE ++ LD b7, BO, 8 * SIZE ++ LD b8, BO, 9 * SIZE ++ MUL a1, b6, c32 ++ MUL a2, b6, c31 ++ MADD5 c31, c31, b5, a1 ++ MADD6 c32, c32, b5, a2 ++ NMSUB c11, c31, b7, c11 ++ MADD7 c12, c31, b8, c12 ++ MADD8 c11, c32, b8, c11 ++ NMSUB c12, c32, b7, c12 ++ LD b7, BO, 0 * SIZE ++ LD b8, BO, 1 * SIZE ++ MUL a1, b8, c12 ++ MUL a2, b8, c11 ++ MADD5 c11, c11, b7, a1 ++ MADD6 c12, c12, b7, a2 ++#endif ++#if defined(LN) || defined(LT) ++ ST c11, BO, 0 * SIZE ++ ST c12, BO, 1 * SIZE ++ ST c31, BO, 2 * SIZE ++ ST c32, BO, 3 * SIZE ++ ST c51, BO, 4 * SIZE ++ ST c52, BO, 5 * SIZE ++ ST c71, BO, 6 * SIZE ++ ST c72, BO, 7 * SIZE ++#else ++ ST c11, AO, 0 * SIZE ++ ST c12, AO, 1 * SIZE ++ ST c31, AO, 2 * SIZE ++ ST c32, AO, 3 * SIZE ++ ST c51, AO, 4 * SIZE ++ ST c52, AO, 5 * SIZE ++ ST c71, AO, 6 * SIZE ++ ST c72, AO, 7 * SIZE ++#endif ++#ifdef LN ++ addi.d CO1,CO1, -2 * SIZE ++ addi.d CO2,CO2, -2 * SIZE ++ addi.d CO3,CO3, -2 * SIZE ++ addi.d CO4,CO4, -2 * SIZE ++#endif ++ ST c11, CO1, 0 * SIZE ++ ST c12, CO1, 1 * SIZE ++ ST c31, CO2, 0 * SIZE ++ ST c32, CO2, 1 * SIZE ++ ST c51, CO3, 0 * SIZE ++ ST c52, CO3, 1 * SIZE ++ ST c71, CO4, 0 * SIZE ++ ST c72, CO4, 1 * SIZE ++#ifndef LN ++ addi.d CO1,CO1, 2 * SIZE ++ addi.d CO2,CO2, 2 * SIZE ++ addi.d CO3,CO3, 2 * SIZE ++ addi.d CO4,CO4, 2 * SIZE ++#endif ++#ifdef RT ++ slli.d TEMP, K, ZBASE_SHIFT ++ add.d AORIG, AORIG, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ sub.d TEMP, K, KK ++ slli.d L, TEMP, ZBASE_SHIFT ++ slli.d TEMP, TEMP, 2 + ZBASE_SHIFT ++ add.d AO, AO, L ++ add.d BO, BO, TEMP ++#endif ++#ifdef LT ++ addi.d KK, KK, 1 ++#endif ++#ifdef LN ++ addi.d KK, KK, -1 ++#endif ++MTC c11, $r0 ++ addi.d I, I, -1 ++ MOV c21, c11 ++ MOV c31, c11 ++ MOV c41, c11 ++ MOV c51, c11 ++MOV c61, c11 ++ blt $r0, I, .L11 ++ .align 3 ++ ++.L19: ++#ifdef LN ++ slli.d TEMP, K, 2 + ZBASE_SHIFT ++ add.d B, B, TEMP ++#endif ++#if defined(LT) || defined(RN) ++ move B, BO ++#endif ++#ifdef RN ++ addi.d KK, KK, 4 ++#endif ++#ifdef RT ++ addi.d KK, KK, -4 ++#endif ++ blt $r0, J, .L10 ++ .align 3 ++ ++.L999: ++ LDARG $r23, $sp, 0 ++ LDARG $r24, $sp, 8 ++ LDARG $r25, $sp, 16 ++ LDARG $r26, $sp, 24 ++ LDARG $r27, $sp, 32 ++ LDARG $r28, $sp, 40 ++ fld.d $f24, $sp, 48 ++ fld.d $f25, $sp, 56 ++ fld.d $f26, $sp, 64 ++ fld.d $f27, $sp, 72 ++#ifndef __64BIT__ ++ fld.d $f18, $sp, 88 ++ fld.d $f19, $sp, 96 ++ fld.d $f20, $sp, 104 ++ fld.d $f21, $sp, 112 ++#endif ++ addi.d $sp, $sp, 128 ++ move $r4, $r17 ++ fmov.d $f0, $f22 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S +index a095e05..0ccc781 100644 +--- a/kernel/mips64/dnrm2.S ++++ b/kernel/mips64/dnrm2.S +@@ -68,6 +68,7 @@ + + #define ALPHA $f16 + #define max $f17 ++#define INF $f18 + + + PROLOGUE +@@ -86,6 +87,11 @@ + move XX, X + NOP + ++ //Init INF ++ lui TEMP, 0x7FF0 ++ dsll TEMP, TEMP, 32 ++ MTC1 TEMP, INF ++ + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + +@@ -255,6 +261,9 @@ + div.d ALPHA, ALPHA, s1 + MOV max, s1 + ++ CMPEQ $fcc0, ALPHA, INF ++ bc1t $fcc0, .L999 ++ + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 +diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c 
+index 1e846a6..5fbbeaa 100644 +--- a/kernel/setparam-ref.c ++++ b/kernel/setparam-ref.c +@@ -1004,6 +1004,34 @@ static void init_parameter(void) { + #endif + } + #else // (ARCH_MIPS64) ++#if (ARCH_LOONGARCH64) ++static void init_parameter(void) { ++ ++#ifdef BUILD_BFLOAT16 ++ TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; ++#endif ++ TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; ++ TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; ++ TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; ++ TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; ++ ++#ifdef BUILD_BFLOAT16 ++ TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; ++#endif ++ TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; ++ TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; ++ TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; ++ TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; ++ ++#ifdef BUILD_BFLOAT16 ++ TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; ++#endif ++ TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; ++ TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; ++ TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; ++ TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; ++} ++#else // (ARCH_LOONGARCH64) + #if (ARCH_POWER) + static void init_parameter(void) { + +@@ -1851,5 +1879,6 @@ static void init_parameter(void) { + } + #endif //POWER + #endif //ZARCH ++#endif //(ARCH_LOONGARCH64) + #endif //(ARCH_MIPS64) + #endif //(ARCH_ARM64) +diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile +new file mode 100644 +index 0000000..b87a2eb +--- /dev/null ++++ b/lapack/laswp/loongarch64/Makefile +@@ -0,0 +1,12 @@ ++TOPDIR = ../../.. ++include ../../../Makefile.system ++ ++ifndef LASWP ++LASWP = ../generic/laswp_k.c ++endif ++ ++ifndef ZLASWP ++ZLASWP = ../generic/zlaswp_k.c ++endif ++ ++include ../generic/Makefile +diff --git a/param.h b/param.h +index a35ce69..34dce01 100644 +--- a/param.h ++++ b/param.h +@@ -2689,6 +2689,122 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #define SYMV_P 16 + #endif + ++#if defined (LOONGSON3R5) ++#define SNUMOPT 2 ++#define DNUMOPT 2 ++ ++#define GEMM_DEFAULT_OFFSET_A 0 ++#define GEMM_DEFAULT_OFFSET_B 0 ++#define GEMM_DEFAULT_ALIGN 0x0ffffUL ++ ++#define SGEMM_DEFAULT_UNROLL_N 8 ++#define DGEMM_DEFAULT_UNROLL_N 4 ++#define QGEMM_DEFAULT_UNROLL_N 2 ++#define CGEMM_DEFAULT_UNROLL_N 4 ++#define ZGEMM_DEFAULT_UNROLL_N 4 ++#define XGEMM_DEFAULT_UNROLL_N 1 ++ ++#define SGEMM_DEFAULT_UNROLL_M 2 ++#define DGEMM_DEFAULT_UNROLL_M 16 ++#define QGEMM_DEFAULT_UNROLL_M 2 ++#define CGEMM_DEFAULT_UNROLL_M 1 ++#define ZGEMM_DEFAULT_UNROLL_M 1 ++#define XGEMM_DEFAULT_UNROLL_M 1 ++ ++#define SGEMM_DEFAULT_P 512 ++#define DGEMM_DEFAULT_P 32 ++#define QGEMM_DEFAULT_P 128 ++#define CGEMM_DEFAULT_P 128 ++#define ZGEMM_DEFAULT_P 128 ++#define XGEMM_DEFAULT_P 128 ++ ++#define SGEMM_DEFAULT_R 12288 ++#define DGEMM_DEFAULT_R 858 ++#define QGEMM_DEFAULT_R 4096 ++#define CGEMM_DEFAULT_R 4096 ++#define ZGEMM_DEFAULT_R 4096 ++#define XGEMM_DEFAULT_R 4096 ++ ++#define SGEMM_DEFAULT_Q 128 ++#define DGEMM_DEFAULT_Q 152 ++#define CGEMM_DEFAULT_Q 128 ++#define ZGEMM_DEFAULT_Q 128 ++#define ZGEMM_DEFAULT_Q 128 ++#define XGEMM_DEFAULT_Q 128 ++ ++#define SYMV_P 16 ++#endif ++ ++#ifdef LOONGSON2K1000 ++#define GEMM_DEFAULT_OFFSET_A 0 ++#define GEMM_DEFAULT_OFFSET_B 0 ++#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL ++ ++#define SGEMM_DEFAULT_UNROLL_M 2 ++#define SGEMM_DEFAULT_UNROLL_N 8 ++ ++#define DGEMM_DEFAULT_UNROLL_M 2 ++#define DGEMM_DEFAULT_UNROLL_N 8 ++ ++#define CGEMM_DEFAULT_UNROLL_M 1 ++#define CGEMM_DEFAULT_UNROLL_N 4 ++ ++#define ZGEMM_DEFAULT_UNROLL_M 1 ++#define ZGEMM_DEFAULT_UNROLL_N 4 ++ ++#define SGEMM_DEFAULT_P 128 ++#define DGEMM_DEFAULT_P 128 ++#define CGEMM_DEFAULT_P 96 ++#define ZGEMM_DEFAULT_P 64 ++ ++#define SGEMM_DEFAULT_Q 240 ++#define DGEMM_DEFAULT_Q 120 ++#define CGEMM_DEFAULT_Q 120 ++#define ZGEMM_DEFAULT_Q 120 ++ ++#define SGEMM_DEFAULT_R 12288 ++#define DGEMM_DEFAULT_R 8192 ++#define CGEMM_DEFAULT_R 4096 ++#define ZGEMM_DEFAULT_R 4096 ++ ++#define SYMV_P 16 ++#endif ++ ++#ifdef LOONGSONGENERIC ++#define GEMM_DEFAULT_OFFSET_A 0 ++#define GEMM_DEFAULT_OFFSET_B 0 ++#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL ++ ++#define SGEMM_DEFAULT_UNROLL_M 2 ++#define SGEMM_DEFAULT_UNROLL_N 8 ++ ++#define DGEMM_DEFAULT_UNROLL_M 2 ++#define DGEMM_DEFAULT_UNROLL_N 8 ++ ++#define CGEMM_DEFAULT_UNROLL_M 1 ++#define CGEMM_DEFAULT_UNROLL_N 4 ++ ++#define ZGEMM_DEFAULT_UNROLL_M 1 ++#define ZGEMM_DEFAULT_UNROLL_N 4 ++ ++#define SGEMM_DEFAULT_P 128 ++#define DGEMM_DEFAULT_P 128 ++#define CGEMM_DEFAULT_P 96 ++#define ZGEMM_DEFAULT_P 64 ++ ++#define SGEMM_DEFAULT_Q 240 ++#define DGEMM_DEFAULT_Q 120 ++#define CGEMM_DEFAULT_Q 120 ++#define ZGEMM_DEFAULT_Q 120 ++ ++#define SGEMM_DEFAULT_R 12288 ++#define DGEMM_DEFAULT_R 8192 ++#define CGEMM_DEFAULT_R 4096 ++#define ZGEMM_DEFAULT_R 4096 ++ ++#define SYMV_P 16 ++#endif ++ + #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) + #define SNUMOPT 2 + #define DNUMOPT 2 +-- +2.20.1 + diff --git a/openblas.spec b/openblas.spec index 6287bf4b46daa9e678072f9e8d531b73e04d37dc..f5961f92c4ab0b2ba8b05c18ed1c36e69d110944 100644 --- a/openblas.spec +++ b/openblas.spec @@ -36,6 +36,8 @@ Patch3: openblas-0.3.15-noopt.patch Patch4: openblas-0.3.15-asmflags.patch # Remove optimization pragmas on ppc64le Patch5: openblas-0.3.15-power-optimize.patch +# Add optimization for LoongArch +Patch6: openblas-0.3.15-opt-loongarch64.patch BuildRequires: scl-utils 
 BuildRequires: gcc-toolset-11-gcc
@@ -238,6 +240,7 @@ cd OpenBLAS-%{version}
 %patch3 -p1 -b .noopt
 %patch4 -p1 -b .asmflags
 %patch5 -p1 -b .power-optimize
+%patch6 -p1 -b .opt-loongarch64
 
 # Fix source permissions
 find -name \*.f -exec chmod 644 {} \;