diff --git a/0001-add-support-for-sw_64-architecture.patch b/0001-add-support-for-sw_64-architecture.patch new file mode 100755 index 0000000000000000000000000000000000000000..70515849a18382efa82b61fe9c960bb7d1839d91 --- /dev/null +++ b/0001-add-support-for-sw_64-architecture.patch @@ -0,0 +1,39058 @@ +From d4185d2bd760f6e41bc979ee261fbbbee0f2b042 Mon Sep 17 00:00:00 2001 +From: funlei +Date: Tue, 15 Apr 2025 16:59:03 +0800 +Subject: [PATCH] add support for sw_64 architecture + +--- + Makefile.sw_64 | 15 + + Makefile.system | 5 + + c_check | 4 +- + common.h | 4 + + common_sw_64.h | 99 + + cpuid_sw_64.c | 14 + + ctest.c | 4 + + getarch.c | 9 +- + kernel/sw_64/KERNEL | 128 + + kernel/sw_64/Makefile | 2 + + kernel/sw_64/amax.S | 283 ++ + kernel/sw_64/asum.S | 206 ++ + kernel/sw_64/axpy.S | 428 +++ + kernel/sw_64/cabs.S | 71 + + kernel/sw_64/cnrm2.S | 428 +++ + kernel/sw_64/copy.S | 379 +++ + kernel/sw_64/cscal.S | 217 ++ + kernel/sw_64/dnrm2.S | 431 +++ + kernel/sw_64/dot.S | 534 ++++ + kernel/sw_64/gemm_beta.S | 179 ++ + kernel/sw_64/gemm_kernel_4x4.S | 2844 +++++++++++++++++++ + kernel/sw_64/gemv_n.S | 1307 +++++++++ + kernel/sw_64/gemv_t.S | 1061 ++++++++ + kernel/sw_64/iamax.S | 440 +++ + kernel/sw_64/imax.S | 351 +++ + kernel/sw_64/izamax.S | 427 +++ + kernel/sw_64/lsame.S | 76 + + kernel/sw_64/max.S | 227 ++ + kernel/sw_64/rot.S | 624 +++++ + kernel/sw_64/scal.S | 693 +++++ + kernel/sw_64/snrm2.S | 431 +++ + kernel/sw_64/staticbuffer.S | 45 + + kernel/sw_64/sum.S | 206 ++ + kernel/sw_64/swap.S | 252 ++ + kernel/sw_64/trsm_kernel_4x4_LN.S | 4061 ++++++++++++++++++++++++++++ + kernel/sw_64/trsm_kernel_4x4_LT.S | 4059 +++++++++++++++++++++++++++ + kernel/sw_64/trsm_kernel_4x4_RT.S | 4059 +++++++++++++++++++++++++++ + kernel/sw_64/zamax.S | 301 +++ + kernel/sw_64/zasum.S | 208 ++ + kernel/sw_64/zaxpy.S | 611 +++++ + kernel/sw_64/zdot.S | 500 ++++ + kernel/sw_64/zgemm_beta.S | 192 ++ + kernel/sw_64/zgemm_kernel_2x2.S | 1705 ++++++++++++ + kernel/sw_64/zgemv_n.S | 1027 +++++++ + kernel/sw_64/zgemv_t.S | 922 +++++++ + kernel/sw_64/znrm2.S | 428 +++ + kernel/sw_64/zrot.S | 631 +++++ + kernel/sw_64/zscal.S | 255 ++ + kernel/sw_64/zsum.S | 210 ++ + kernel/sw_64/zswap.S | 247 ++ + kernel/sw_64/ztrsm_kernel_2x2_LN.S | 2230 +++++++++++++++ + kernel/sw_64/ztrsm_kernel_2x2_LT.S | 2223 +++++++++++++++ + kernel/sw_64/ztrsm_kernel_2x2_RT.S | 2223 +++++++++++++++ + lapack/laswp/sw_64/Makefile | 8 + + param.h | 31 + + 55 files changed, 38552 insertions(+), 3 deletions(-) + create mode 100644 Makefile.sw_64 + create mode 100644 common_sw_64.h + create mode 100644 cpuid_sw_64.c + create mode 100644 kernel/sw_64/KERNEL + create mode 100644 kernel/sw_64/Makefile + create mode 100644 kernel/sw_64/amax.S + create mode 100644 kernel/sw_64/asum.S + create mode 100644 kernel/sw_64/axpy.S + create mode 100644 kernel/sw_64/cabs.S + create mode 100644 kernel/sw_64/cnrm2.S + create mode 100644 kernel/sw_64/copy.S + create mode 100644 kernel/sw_64/cscal.S + create mode 100644 kernel/sw_64/dnrm2.S + create mode 100644 kernel/sw_64/dot.S + create mode 100644 kernel/sw_64/gemm_beta.S + create mode 100644 kernel/sw_64/gemm_kernel_4x4.S + create mode 100644 kernel/sw_64/gemv_n.S + create mode 100644 kernel/sw_64/gemv_t.S + create mode 100644 kernel/sw_64/iamax.S + create mode 100644 kernel/sw_64/imax.S + create mode 100644 kernel/sw_64/izamax.S + create mode 100644 kernel/sw_64/lsame.S + create mode 100644 kernel/sw_64/max.S + create mode 100644 kernel/sw_64/rot.S + create mode 100644 kernel/sw_64/scal.S + create mode 100644 
kernel/sw_64/snrm2.S + create mode 100644 kernel/sw_64/staticbuffer.S + create mode 100644 kernel/sw_64/sum.S + create mode 100644 kernel/sw_64/swap.S + create mode 100644 kernel/sw_64/trsm_kernel_4x4_LN.S + create mode 100644 kernel/sw_64/trsm_kernel_4x4_LT.S + create mode 100644 kernel/sw_64/trsm_kernel_4x4_RT.S + create mode 100644 kernel/sw_64/zamax.S + create mode 100644 kernel/sw_64/zasum.S + create mode 100644 kernel/sw_64/zaxpy.S + create mode 100644 kernel/sw_64/zdot.S + create mode 100644 kernel/sw_64/zgemm_beta.S + create mode 100644 kernel/sw_64/zgemm_kernel_2x2.S + create mode 100644 kernel/sw_64/zgemv_n.S + create mode 100644 kernel/sw_64/zgemv_t.S + create mode 100644 kernel/sw_64/znrm2.S + create mode 100644 kernel/sw_64/zrot.S + create mode 100644 kernel/sw_64/zscal.S + create mode 100644 kernel/sw_64/zsum.S + create mode 100644 kernel/sw_64/zswap.S + create mode 100644 kernel/sw_64/ztrsm_kernel_2x2_LN.S + create mode 100644 kernel/sw_64/ztrsm_kernel_2x2_LT.S + create mode 100644 kernel/sw_64/ztrsm_kernel_2x2_RT.S + create mode 100644 lapack/laswp/sw_64/Makefile + +diff --git a/Makefile.sw_64 b/Makefile.sw_64 +new file mode 100644 +index 0000000..529bd88 +--- /dev/null ++++ b/Makefile.sw_64 +@@ -0,0 +1,15 @@ ++ifeq ($(F_COMPILER), GFORTRAN) ++FCOMMON_OPT += -mieee ++endif ++ ++ifeq ($(F_COMPILER), G77) ++FCOMMON_OPT += -mieee ++endif ++ ++ifndef SMP ++LIBCXML = -lcxml -lots -lm ++LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm ++else ++LIBCXML = -lcxmlp -lots -lm ++LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm ++endif +diff --git a/Makefile.system b/Makefile.system +index b065f9a..c33bb3a 100644 +--- a/Makefile.system ++++ b/Makefile.system +@@ -842,6 +842,11 @@ NO_BINARY_MODE = 1 + BINARY_DEFINED = 1 + endif + ++ifeq ($(ARCH), sw_64) ++NO_BINARY_MODE = 1 ++BINARY_DEFINED = 1 ++endif ++ + ifeq ($(ARCH), arm) + NO_BINARY_MODE = 1 + BINARY_DEFINED = 1 +diff --git a/c_check b/c_check +index c2b52c8..006da54 100755 +--- a/c_check ++++ b/c_check +@@ -84,6 +84,7 @@ case "$data" in + *ARCH_MIPS64*) architecture=mips64 ;; + *ARCH_MIPS*) architecture=mips ;; + *ARCH_ALPHA*) architecture=alpha ;; ++ *ARCH_SW_64*) architecture=sw_64 ;; + *ARCH_SPARC*) architecture=sparc ;; + *ARCH_IA64*) architecture=ia64 ;; + *ARCH_ARM64*) architecture=arm64 ;; +@@ -125,7 +126,7 @@ case "$architecture" in + defined=1 + ;; + arm|arm64) defined=1 ;; +- zarch|e2k|alpha|ia64|riscv64|loonarch64) ++ zarch|e2k|alpha|sw_64|ia64|riscv64|loonarch64) + defined=1 + BINARY=64 + ;; +@@ -242,6 +243,7 @@ case "$data" in + *ARCH_MIPS64*) architecture=mips64 ;; + *ARCH_MIPS*) architecture=mips ;; + *ARCH_ALPHA*) architecture=alpha ;; ++ *ARCH_SW_64*) architecture=sw_64 ;; + *ARCH_SPARC*) architecture=sparc ;; + *ARCH_IA64*) architecture=ia64 ;; + *ARCH_ARM64*) architecture=arm64 ;; +diff --git a/common.h b/common.h +index b8bac1a..de3a722 100644 +--- a/common.h ++++ b/common.h +@@ -406,6 +406,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 + #include "common_alpha.h" + #endif + ++#ifdef ARCH_SW_64 ++#include "common_sw_64.h" ++#endif ++ + #if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) + #if __has_include() + #include +diff --git a/common_sw_64.h b/common_sw_64.h +new file mode 100644 +index 0000000..6468431 +--- /dev/null ++++ b/common_sw_64.h +@@ -0,0 +1,99 @@ ++#ifndef COMMON_SW_64 ++#define COMMON_SW_64 ++ ++#ifndef ASSEMBLER ++ ++#define MB asm("memb") ++#define WMB asm("memb") ++#define RMB asm("memb") ++ 
++static __inline unsigned long rpcc(void){ ++ unsigned long r0; ++ asm __volatile__("rtc %0, $31" : "=r"(r0) : : "memory"); ++ return r0; ++} ++#define RPCC_DEFINED ++ ++#define GET_IMAGE(res) asm __volatile__("fmov $f1, %0" : "=f"(res) : : "memory") ++ ++#ifdef SMP ++#ifdef USE64BITINT ++static __inline long blas_quickdivide(long x, long y){ ++ return x/y; ++} ++#else ++static __inline int blas_quickdivide(unsigned int x, unsigned int y){ ++ return x/y; ++} ++#endif ++#endif ++ ++#define BASE_ADDRESS ((0x1b0UL << 33) | (0x1c0UL << 23) | (0x000UL << 13)) ++ ++#ifndef PAGESIZE ++#define PAGESIZE ( 8UL << 10) ++#define HUGE_PAGESIZE ( 4 << 20) ++#endif ++#define BUFFER_SIZE (32UL << 20) ++ ++#else ++ ++#ifndef F_INTERFACE ++#define REALNAME ASMNAME ++#else ++#define REALNAME ASMFNAME ++#endif ++ ++#define PROLOGUE \ ++ .arch sw8a; \ ++ .set noat; \ ++ .set noreorder; \ ++.text; \ ++ .align 5; \ ++ .globl REALNAME; \ ++ .ent REALNAME; \ ++REALNAME: ++ ++#ifdef PROFILE ++#define PROFCODE \ ++ ldgp $gp, 0($27); \ ++ lda $28, _mcount; \ ++ jsr $28, ($28), _mcount; \ ++ .prologue 1 ++#else ++#define PROFCODE .prologue 0 ++#endif ++ ++#if defined(__linux__) && defined(__ELF__) ++#define GNUSTACK .section .note.GNU-stack,"",@progbits ++#else ++#define GNUSTACK ++#endif ++ ++#define EPILOGUE \ ++ .end REALNAME; \ ++ .ident VERSION; \ ++ GNUSTACK ++ ++#endif ++ ++#ifdef DOUBLE ++#define SXADDQ s8addl ++#define SXSUBL s8subw ++#define LD fldd ++#define ST fstd ++#define ADD faddd ++#define SUB fsubd ++#define MUL fmuld ++#define DIV fdivd ++#else ++#define SXADDQ s4addl ++#define SXSUBL s4subw ++#define LD flds ++#define ST fsts ++#define ADD fadds ++#define SUB fsubs ++#define MUL fmuls ++#define DIV fdivs ++#endif ++#endif +diff --git a/cpuid_sw_64.c b/cpuid_sw_64.c +new file mode 100644 +index 0000000..b05c655 +--- /dev/null ++++ b/cpuid_sw_64.c +@@ -0,0 +1,14 @@ ++char *get_corename(void) { ++ return "SW8A"; ++} ++ ++void get_libname(void){ ++ printf("sw8a"); ++} ++ ++void get_cpuconfig(void){ ++ printf("#define SW8A\n"); ++ printf("#define L1_DATA_LINESIZE 128\n"); ++ printf("#define L2_SIZE 524288\n"); ++ printf("#define DTB_DEFAULT_ENTRIES 32\n"); ++} +diff --git a/ctest.c b/ctest.c +index cbc1532..76ba093 100644 +--- a/ctest.c ++++ b/ctest.c +@@ -137,6 +137,10 @@ ARCH_MIPS + ARCH_ALPHA + #endif + ++#ifdef __sw_64 ++ARCH_SW_64 ++#endif ++ + #if defined(__sparc) || defined(__sparc__) + ARCH_SPARC + #endif +diff --git a/getarch.c b/getarch.c +index 842a843..1b0412b 100644 +--- a/getarch.c ++++ b/getarch.c +@@ -1810,6 +1810,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #define OPENBLAS_SUPPORTED + #endif + ++#ifdef __sw_64 ++#include "cpuid_sw_64.c" ++#define OPENBLAS_SUPPORTED ++#endif ++ + #ifdef POWER + #include "cpuid_power.c" + #define OPENBLAS_SUPPORTED +@@ -1915,7 +1920,7 @@ int main(int argc, char *argv[]){ + #ifdef FORCE + printf("CORE=%s\n", CORENAME); + #else +-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__) ++#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__sw_64) || defined(__csky__) + printf("CORE=%s\n", get_corename()); + #endif + #endif +@@ -2063,7 +2068,7 @@ printf("ELF_VERSION=2\n"); + #ifdef FORCE + printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); + #else +-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__) ++#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__) || defined(__sw_64) + printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); + #endif + #endif +diff --git a/kernel/sw_64/KERNEL b/kernel/sw_64/KERNEL +new file mode 100644 +index 0000000..71e93fb +--- /dev/null ++++ b/kernel/sw_64/KERNEL +@@ -0,0 +1,128 @@ ++ifndef SAMINKERNEL ++SAMINKERNEL = amax.S ++endif ++ ++ifndef DAMINKERNEL ++DAMINKERNEL = amax.S ++endif ++ ++ifndef CAMINKERNEL ++CAMINKERNEL = zamax.S ++endif ++ ++ifndef ZAMINKERNEL ++ZAMINKERNEL = zamax.S ++endif ++ ++ifndef SMINKERNEL ++SMINKERNEL = max.S ++endif ++ ++ifndef DMINKERNEL ++DMINKERNEL = max.S ++endif ++ ++ifndef ISAMINKERNEL ++ISAMINKERNEL = iamax.S ++endif ++ ++ifndef IDAMINKERNEL ++IDAMINKERNEL = iamax.S ++endif ++ ++ifndef ICAMINKERNEL ++ICAMINKERNEL = izamax.S ++endif ++ ++ifndef IZAMINKERNEL ++IZAMINKERNEL = izamax.S ++endif ++ ++ifndef ISMINKERNEL ++ISMINKERNEL = imax.S ++endif ++ ++ifndef ISMAXKERNEL ++ISMAXKERNEL = imax.S ++endif ++ ++ifndef IDMINKERNEL ++IDMINKERNEL = iamax.S ++endif ++ ++ifndef CCOPYKERNEL ++CCOPYKERNEL = copy.S ++endif ++ ++ifndef ZCOPYKERNEL ++ZCOPYKERNEL = copy.S ++endif ++ ++ifndef SNRM2KERNEL ++SNRM2KERNEL = snrm2.S ++endif ++ ++ifndef DNRM2KERNEL ++DNRM2KERNEL = dnrm2.S ++endif ++ ++ifndef CNRM2KERNEL ++CNRM2KERNEL = cnrm2.S ++endif ++ ++ifndef ZNRM2KERNEL ++ZNRM2KERNEL = znrm2.S ++endif ++ ++SGEMMKERNEL = gemm_kernel_4x4.S ++SGEMM_BETA = gemm_beta.S ++SGEMMONCOPY = ../generic/gemm_ncopy_4.c ++SGEMMOTCOPY = ../generic/gemm_tcopy_4.c ++SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) ++SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) ++ ++DGEMMKERNEL = gemm_kernel_4x4.S ++DGEMM_BETA = gemm_beta.S ++DGEMMONCOPY = ../generic/gemm_ncopy_4.c ++DGEMMOTCOPY = ../generic/gemm_tcopy_4.c ++DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) ++DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) ++ ++CGEMMKERNEL = zgemm_kernel_2x2.S ++CGEMM_BETA = zgemm_beta.S ++CGEMMONCOPY = ../generic/zgemm_ncopy_2.c ++CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ++CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) ++CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) ++ ++ZGEMMKERNEL = zgemm_kernel_2x2.S ++ZGEMM_BETA = zgemm_beta.S ++ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ++ZGEMMOTCOPY = 
../generic/zgemm_tcopy_2.c ++ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) ++ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) ++ ++SGEMM_BETA = gemm_beta.S ++DGEMM_BETA = gemm_beta.S ++CGEMM_BETA = zgemm_beta.S ++ZGEMM_BETA = zgemm_beta.S ++ ++STRSMKERNEL_LN = trsm_kernel_4x4_LN.S ++STRSMKERNEL_LT = trsm_kernel_4x4_LT.S ++STRSMKERNEL_RN = trsm_kernel_4x4_LT.S ++STRSMKERNEL_RT = trsm_kernel_4x4_RT.S ++ ++DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S ++DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S ++DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S ++DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S ++ ++CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S ++CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S ++CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S ++CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S ++ ++ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S ++ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S ++ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S ++ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S +diff --git a/kernel/sw_64/Makefile b/kernel/sw_64/Makefile +new file mode 100644 +index 0000000..efae70d +--- /dev/null ++++ b/kernel/sw_64/Makefile +@@ -0,0 +1,2 @@ ++clean :: ++ +diff --git a/kernel/sw_64/amax.S b/kernel/sw_64/amax.S +new file mode 100644 +index 0000000..b05929b +--- /dev/null ++++ b/kernel/sw_64/amax.S +@@ -0,0 +1,283 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 6 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ nop ++ .align 4 ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $0 ++ unop ++ ++ fstd $f6, 32($sp) ++ fclr $f0 ++ sra N, 3, $1 ++ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ unop ++ fabs $f20, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ fabs $f20, $f1 ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f21, 0 * SIZE(X) ++ fabs $f20, $f2 ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fabs $f20, $f3 ++ addl X, INCX, X ++ unop ++ ++ LD $f23, 0 * SIZE(X) ++ fabs $f20, $f4 ++ addl X, INCX, X ++ unop ++ ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ fabs $f20, $f5 ++ unop ++ ++ LD $f25, 0 * SIZE(X) ++ fabs $f20, $f6 ++ addl X, INCX, X ++ unop ++ ++ LD $f26, 0 * SIZE(X) ++ fabs $f20, $f28 ++ addl X, INCX, X ++ ldi $1, -1($1) ++ ++ LD $f27, 0 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ fselne $f16, $f12, $f4, $f4 ++ unop ++ fabs $f20, $f29 ++ s_fillcs 56 * SIZE(X) ++ ++ fselne $f17, $f13, $f5, $f5 ++ LD $f20, 0 * SIZE(X) ++ fabs $f21, $f30 ++ addl X, INCX, X ++ ++ fselne $f18, $f14, $f6, $f6 ++ LD $f21, 0 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ fselne $f19, $f15, $f28, $f28 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ addl X, INCX, X ++ ++ fabs $f24, $f12 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f0, $f29), $f16 ++ addl X, INCX, X ++ ++ fabs $f25, $f13 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f1, $f30), $f17 ++ addl X, INCX, X ++ ++ fabs $f26, $f14 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f2, $f10), $f18 ++ addl X, INCX, X ++ ++ fabs $f27, $f15 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f3, $f11), $f19 ++ addl X, INCX, X ++ ++ fselne $f16, $f29, $f0, $f0 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f4, $f12), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f30, $f1, $f1 ++ unop ++ CMPLT($f5, $f13), $f17 ++ ldi $1, -1($1) # i -- ++ ++ fselne $f18, $f10, $f2, $f2 ++ unop ++ CMPLT($f6, $f14), $f18 ++ unop ++ ++ fselne $f19, $f11, $f3, $f3 ++ unop ++ CMPLT($f28, $f15), $f19 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ fselne $f16, $f12, $f4, $f4 ++ fabs $f20, $f29 ++ fselne $f17, $f13, $f5, $f5 ++ fabs $f21, $f30 ++ ++ fselne $f18, $f14, $f6, $f6 ++ fabs $f22, $f10 ++ fselne $f19, $f15, $f28, $f28 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ CMPLT($f0, $f29), $f16 ++ fabs $f25, $f13 ++ CMPLT($f1, $f30), $f17 ++ ++ fabs $f26, $f14 ++ CMPLT($f2, $f10), $f18 ++ fabs $f27, $f15 ++ CMPLT($f3, $f11), $f19 ++ ++ fselne $f16, $f29, $f0, $f0 ++ CMPLT($f4, $f12), $f16 ++ fselne $f17, $f30, $f1, $f1 ++ CMPLT($f5, $f13), $f17 ++ ++ fselne $f18, $f10, $f2, $f2 ++ CMPLT($f6, $f14), $f18 ++ fselne $f19, $f11, $f3, $f3 ++ CMPLT($f28, $f15), $f19 ++ ++ fselne $f16, $f12, $f4, $f4 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f13, $f5, $f5 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f18, $f14, $f6, $f6 ++ CMPLT($f4, $f5), $f18 ++ fselne $f19, $f15, $f28, $f28 ++ CMPLT($f6, $f28), $f19 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ fselne $f18, $f5, $f4, $f4 ++ 
fselne $f19, $f28, $f6, $f6 ++ ++ CMPLT($f0, $f2), $f16 ++ CMPLT($f4, $f6), $f17 ++ ++ fselne $f16, $f2, $f0, $f0 ++ fselne $f17, $f6, $f4, $f4 ++ ++ CMPLT($f0, $f4), $f16 ++ fselne $f16, $f4, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $End ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/asum.S b/kernel/sw_64/asum.S +new file mode 100644 +index 0000000..d49f89f +--- /dev/null ++++ b/kernel/sw_64/asum.S +@@ -0,0 +1,206 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ ble N, $L999 ++ ++ sra N, 3, I ++ fclr s1 ++ fclr s2 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t1 ++ SXADDQ INCX, X, X ++ fclr t2 ++ ++ LD a1, 0 * SIZE(X) ++ fclr t3 ++ SXADDQ INCX, X, X ++ fclr s3 ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ s_fillcs PREFETCHSIZE * 2 * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ LD a7, 0 * SIZE(X) ++ fabs a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fabs a3, t3 ++ SXADDQ INCX, X, X ++ ++ ADD s0, t0, s0 ++ LD a1, 0 * SIZE(X) ++ fabs a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fabs a5, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ LD a3, 0 * SIZE(X) ++ fabs a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fabs a7, t3 ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fabs a0, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a7, 0 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fabs a2, t2 ++ ADD s3, t3, s3 ++ fabs a3, t3 ++ ++ ADD s0, t0, s0 ++ fabs a4, t0 ++ ADD s1, t1, s1 ++ fabs a5, t1 ++ ADD s2, t2, s2 ++ fabs a6, t2 ++ ADD s3, t3, s3 ++ fabs a7, t3 ++ ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ ADD s0, s1, s0 ++ ADD s2, s3, s2 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ADD s0, s2, s0 ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ fabs a0, t0 ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/axpy.S b/kernel/sw_64/axpy.S +new file mode 100644 +index 0000000..cc15b6b +--- /dev/null ++++ b/kernel/sw_64/axpy.S +@@ -0,0 +1,428 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define PREFETCHSIZE 40 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldl $24, 0($sp) ++ fmov $f19, $f30 ++ ldl $23, 8($sp) ++ ldi $sp, -16($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ nop ++ sra $16, 3, $1 ++ fstd $f2, 0($sp) ++ cmpeq $21, 1, $3 ++ ++ fstd $f3, 8($sp) ++ cmpeq $23, 1, $4 ++ and $16, 7, $2 ++ ble $16, $End ++ ++ and $3, $4, $3 ++ fbeq $f30, $End ++ ++ beq $3, $Sub ++ ble $1, $Remain ++ .align 4 ++ ++ LD $f10, 0*SIZE($20) ++ LD $f11, 1*SIZE($20) ++ LD $f12, 2*SIZE($20) ++ LD $f13, 3*SIZE($20) ++ ++ LD $f18, 0*SIZE($24) ++ LD $f19, 1*SIZE($24) ++ LD $f20, 2*SIZE($24) ++ LD $f21, 3*SIZE($24) ++ ++ LD $f14, 4*SIZE($20) ++ LD $f15, 5*SIZE($20) ++ LD $f16, 6*SIZE($20) ++ LD $f17, 7*SIZE($20) ++ ++ LD $f22, 4*SIZE($24) ++ LD $f23, 5*SIZE($24) ++ LD $f24, 6*SIZE($24) ++ LD $f25, 7*SIZE($24) ++ ++ subl $1, 1, $1 ++ addl $20, 8*SIZE, $20 ++ unop ++ ble $1, $LoopEnd ++ .align 4 ++ ++$Loop: ++ fillde_e PREFETCHSIZE * SIZE($24) ++ s_fillcs PREFETCHSIZE * SIZE($20) ++ ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ LD $f10, 0*SIZE($20) ++ MUL $f30, $f11, $f27 ++ LD $f11, 1*SIZE($20) ++ ++ MUL $f30, $f12, $f28 ++ LD $f12, 2*SIZE($20) ++ MUL $f30, $f13, $f29 ++ LD $f13, 3*SIZE($20) ++ ++ ADD $f18, $f26, $f0 ++ LD $f18, 8*SIZE($24) ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ LD $f14, 4*SIZE($20) ++ ++ ADD $f19, $f27, $f1 ++ LD $f19, 9*SIZE($24) ++ MUL $f30, $f15, $f27 ++ LD $f15, 5*SIZE($20) ++ ++ ADD $f20, $f28, $f2 ++ LD $f20, 10*SIZE($24) ++ MUL $f30, $f16, $f28 ++ LD $f16, 6*SIZE($20) ++ ++ ADD $f21, $f29, $f3 ++ LD $f21, 11*SIZE($24) ++ MUL $f30, $f17, $f29 ++ LD $f17, 7*SIZE($20) ++ ++ ST $f0, 0*SIZE($24) ++ ADD $f22, $f26, $f0 ++ ST $f1, 1*SIZE($24) ++ ADD $f23, $f27, $f1 ++ ++ ST $f2, 2*SIZE($24) ++ ADD $f24, $f28, $f2 ++ ST $f3, 3*SIZE($24) ++ ADD $f25, $f29, $f3 ++ ++ LD $f22, 12*SIZE($24) ++ LD $f23, 13*SIZE($24) ++ LD $f24, 14*SIZE($24) ++ LD $f25, 15*SIZE($24) ++ ++ ST $f0, 4*SIZE($24) ++ ST $f1, 5*SIZE($24) ++ ST $f2, 6*SIZE($24) ++ ST $f3, 7*SIZE($24) ++ ++ subl $1, 1, $1 ++ addl $24, 8*SIZE, $24 ++ addl $20, 8*SIZE, $20 ++ bgt $1, $Loop ++ .align 4 ++ ++$LoopEnd: ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ MUL $f30, $f11, $f27 ++ MUL $f30, $f12, $f28 ++ MUL 
$f30, $f13, $f29 ++ ++ ADD $f18, $f26, $f0 ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ ADD $f19, $f27, $f1 ++ MUL $f30, $f15, $f27 ++ ++ ADD $f20, $f28, $f2 ++ MUL $f30, $f16, $f28 ++ ADD $f21, $f29, $f3 ++ MUL $f30, $f17, $f29 ++ ++ ST $f0, 0*SIZE($24) ++ ADD $f22, $f26, $f0 ++ ST $f1, 1*SIZE($24) ++ ADD $f23, $f27, $f1 ++ ++ ST $f2, 2*SIZE($24) ++ ADD $f24, $f28, $f2 ++ ST $f3, 3*SIZE($24) ++ ADD $f25, $f29, $f3 ++ ++ ST $f0, 4*SIZE($24) ++ ST $f1, 5*SIZE($24) ++ ST $f2, 6*SIZE($24) ++ ST $f3, 7*SIZE($24) ++ addl $24, 8*SIZE, $24 ++ .align 4 ++ ++$Remain: ++ ble $2, $End ++ .align 4 ++ ++$RemainLoop: ++ LD $f10, 0*SIZE($20) ++ LD $f11, 0*SIZE($24) ++ addl $20, SIZE, $20 ++ addl $24, SIZE, $24 ++ ++ MUL $f30, $f10, $f12 ++ subl $2, 1, $2 ++ ADD $f11, $f12, $f13 ++ ST $f13, -1*SIZE($24) ++ bgt $2, $RemainLoop ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ ldi $sp, 16($sp) ++ ret ++ .align 4 ++ ++$Sub: ++ SXSUBL $16, SIZE, $22 ++ subl $1, 1, $4 ++ ble $1, $SubRemain ++ .align 4 ++ ++ LD $f10, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f11, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f12, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f13, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f18, 0($24) ++ SXADDQ $23, $24, $22 ++ ++ LD $f19, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f20, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f21, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f14, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f15, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f16, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f17, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f22, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f23, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f24, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f25, 0($22) ++ SXADDQ $23, $22, $22 ++ unop ++ ble $4, $SubLoopEnd ++ .align 4 ++ ++$SubLoop: ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ LD $f10, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f11, $f27 ++ LD $f11, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f12, $f28 ++ LD $f12, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f13, $f29 ++ LD $f13, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ ADD $f18, $f26, $f0 ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ LD $f14, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ADD $f19, $f27, $f1 ++ MUL $f30, $f15, $f27 ++ LD $f15, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ADD $f20, $f28, $f2 ++ MUL $f30, $f16, $f28 ++ LD $f16, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ADD $f21, $f29, $f3 ++ MUL $f30, $f17, $f29 ++ LD $f17, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f22, $f26, $f0 ++ unop ++ ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f23, $f27, $f1 ++ unop ++ ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f24, $f28, $f2 ++ unop ++ ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f25, $f29, $f3 ++ unop ++ ++ LD $f18, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f19, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f20, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f21, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f22, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f23, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f24, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f25, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ subl $4, 1, $4 ++ bgt $4, $SubLoop ++ .align 4 ++ ++$SubLoopEnd: ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ MUL $f30, $f11, $f27 ++ MUL $f30, $f12, $f28 ++ MUL $f30, $f13, $f29 ++ ++ ADD $f18, 
$f26, $f0 ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ ADD $f19, $f27, $f1 ++ MUL $f30, $f15, $f27 ++ ++ ADD $f20, $f28, $f2 ++ MUL $f30, $f16, $f28 ++ ADD $f21, $f29, $f3 ++ MUL $f30, $f17, $f29 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ ADD $f22, $f26, $f0 ++ ADD $f23, $f27, $f1 ++ ADD $f24, $f28, $f2 ++ ADD $f25, $f29, $f3 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ .align 4 ++ ++$SubRemain: ++ ble $2, $SubEnd ++ .align 4 ++ ++$SubRemainLoop: ++ LD $f10, 0($20) ++ LD $f11, 0($24) ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f10, $f12 ++ subl $2, 1, $2 ++ ADD $f11, $f12, $f13 ++ ST $f13, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ bgt $2, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ ldi $sp, 16($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/cabs.S b/kernel/sw_64/cabs.S +new file mode 100644 +index 0000000..3812ede +--- /dev/null ++++ b/kernel/sw_64/cabs.S +@@ -0,0 +1,71 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl NAME ++ .ent NAME ++NAME: ++ .frame $sp, 0, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++#endif ++ ++ LD $f10, 0($16) ++ LD $f11, SIZE($16) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fabs $f10, $f12 ++ fabs $f11, $f0 ++ ADD $f12, $f0, $f0 ++ ret ++ .end NAME ++ .ident VERSION +diff --git a/kernel/sw_64/cnrm2.S b/kernel/sw_64/cnrm2.S +new file mode 100644 +index 0000000..1892c5f +--- /dev/null ++++ b/kernel/sw_64/cnrm2.S +@@ -0,0 +1,428 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ sll INCX, ZBASE_SHIFT, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ beq INCX, $L999 ++ ++ fclr a2 ++ cmpeq INCX, 2 * SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, a0 ++ s_fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a2, t2, a2 ++ 
faddd a3, t3, a3 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ LD x1, 1 * SIZE(X) ++ ++ ldi X, 2 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 2, I ++ fclr t1 ++ ble I, $L25 ++ ++ LD x0, 0 * SIZE(X) ++ fclr t2 ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ fclr t3 ++ LD x3, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x5, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x6, 0 * SIZE(X) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, a2 ++ LD x1, 1 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ unop ++ ++ faddd a0, t0, a0 ++ LD x3, 1 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ ldi I, -1(I) ++ ++ faddd a2, t2, a2 ++ LD x5, 1 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, a0 ++ faddd a1, t1, a1 ++ ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/copy.S b/kernel/sw_64/copy.S +new file mode 100644 +index 0000000..978c205 +--- /dev/null ++++ b/kernel/sw_64/copy.S +@@ -0,0 +1,379 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ cmpeq INCX, 1, $0 ++ ble N, $End ++#ifndef COMPLEX ++ sra N, 4, $4 ++#else ++ sra N, 3, $4 ++#endif ++ cmpeq INCY, 1, $1 ++ ++ and $0, $1, $0 ++ beq $0, $Sub ++#ifndef COMPLEX ++ and N, 15, $5 ++#else ++ and N, 7, $5 ++#endif ++ ble $4, $Remain ++ ++ LD $f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ LD $f12, 2*SIZE(X) ++ LD $f13, 3*SIZE(X) ++ LD $f14, 4*SIZE(X) ++ LD $f15, 5*SIZE(X) ++ LD $f16, 6*SIZE(X) ++ LD $f17, 7*SIZE(X) ++ ++ LD $f18, 8*SIZE(X) ++ LD $f19, 9*SIZE(X) ++ LD $f20, 10*SIZE(X) ++ LD $f21, 11*SIZE(X) ++ LD $f22, 12*SIZE(X) ++ LD $f23, 13*SIZE(X) ++ LD $f24, 14*SIZE(X) ++ LD $f25, 15*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi X, 16*SIZE(X) ++ ble $4, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ST $f12, 2*SIZE(Y) ++ ST $f13, 3*SIZE(Y) ++ ++ LD $f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ LD $f12, 2*SIZE(X) ++ LD $f13, 3*SIZE(X) ++ ++ ST $f14, 4*SIZE(Y) ++ ST $f15, 5*SIZE(Y) ++ ST $f16, 6*SIZE(Y) ++ ST $f17, 7*SIZE(Y) ++ ++ LD $f14, 4*SIZE(X) ++ LD $f15, 5*SIZE(X) ++ LD $f16, 6*SIZE(X) ++ LD $f17, 7*SIZE(X) ++ ++ ST $f18, 8*SIZE(Y) ++ ST $f19, 9*SIZE(Y) ++ ST $f20, 10*SIZE(Y) ++ ST $f21, 11*SIZE(Y) ++ ++ LD $f18, 8*SIZE(X) ++ LD $f19, 9*SIZE(X) ++ LD $f20, 10*SIZE(X) ++ LD $f21, 11*SIZE(X) ++ ++ ST $f22, 12*SIZE(Y) ++ ST $f23, 13*SIZE(Y) ++ ST $f24, 14*SIZE(Y) ++ ST $f25, 15*SIZE(Y) ++ ++ LD $f22, 12*SIZE(X) ++ LD $f23, 13*SIZE(X) ++ LD $f24, 14*SIZE(X) ++ LD $f25, 15*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi Y, 16*SIZE(Y) ++ ldi X, 16*SIZE(X) ++ bgt $4, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ST $f12, 2*SIZE(Y) ++ ST $f13, 3*SIZE(Y) ++ ST $f14, 4*SIZE(Y) ++ ST $f15, 5*SIZE(Y) ++ ST $f16, 6*SIZE(Y) ++ ST $f17, 7*SIZE(Y) ++ ++ ST $f18, 8*SIZE(Y) ++ ST $f19, 9*SIZE(Y) ++ ST $f20, 10*SIZE(Y) ++ ST $f21, 11*SIZE(Y) ++ ST $f22, 12*SIZE(Y) ++ ST $f23, 13*SIZE(Y) ++ ST $f24, 14*SIZE(Y) ++ ST $f25, 15*SIZE(Y) ++ ++ ldi Y, 16*SIZE(Y) ++ .align 4 ++ ++$Remain: ++ ble $5, $End ++ .align 4 ++ ++$RemainLoop: ++#ifndef COMPLEX ++ LD $f10, 0*SIZE(X) ++ ldi X, 
1*SIZE(X) ++ ST $f10, 0*SIZE(Y) ++ ldi Y, 1*SIZE(Y) ++#else ++ LD $f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ ldi X, 2*SIZE(X) ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ldi Y, 2*SIZE(Y) ++#endif ++ subl $5, 1, $5 ++ bgt $5, $RemainLoop ++ .align 4 ++$End: ++ ret ++ .align 4 ++ ++$Sub: ++#ifdef COMPLEX ++ addl INCX, INCX, INCX ++ addl INCY, INCY, INCY ++ and N, 7, $5 ++#else ++ and N, 15, $5 ++#endif ++ ble $4, $SubRemain ++ .align 4 ++ ++$SubMainLoop: ++#ifndef COMPLEX ++ LD $f10, 0(X) ++ SXADDQ INCX, X, X ++ LD $f11, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f12, 0(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f14, 0(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f16, 0(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f18, 0(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f20, 0(X) ++ SXADDQ INCX, X, X ++ LD $f21, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f22, 0(X) ++ SXADDQ INCX, X, X ++ LD $f23, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f24, 0(X) ++ SXADDQ INCX, X, X ++ LD $f25, 0(X) ++ SXADDQ INCX, X, X ++ ++ ST $f10, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f11, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f12, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f13, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f14, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f15, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f16, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f17, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f18, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f19, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f20, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f21, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f22, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f23, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f24, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f25, 0(Y) ++ SXADDQ INCY, Y, Y ++#else ++ LD $f10, 0(X) ++ LD $f11, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f12, 0(X) ++ LD $f13, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f14, 0(X) ++ LD $f15, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f16, 0(X) ++ LD $f17, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f18, 0(X) ++ LD $f19, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f20, 0(X) ++ LD $f21, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f22, 0(X) ++ LD $f23, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f24, 0(X) ++ LD $f25, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST $f10, 0(Y) ++ ST $f11, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f12, 0(Y) ++ ST $f13, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f14, 0(Y) ++ ST $f15, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f16, 0(Y) ++ ST $f17, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f18, 0(Y) ++ ST $f19, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f20, 0(Y) ++ ST $f21, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f22, 0(Y) ++ ST $f23, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f24, 0(Y) ++ ST $f25, SIZE(Y) ++ SXADDQ INCY, Y, Y ++#endif ++ subl $4, 1, $4 ++ bgt $4, $SubMainLoop ++ .align 4 ++ ++$SubRemain: ++ ble $5, $SubEnd ++ .align 4 ++ ++ $SubRemainLoop: ++#ifndef COMPLEX ++ LD $f10, 0(X) ++ SXADDQ INCX, X, X ++ ST $f10, 0(Y) ++ SXADDQ INCY, Y, Y ++#else ++ LD $f10, 0(X) ++ LD $f11, SIZE(X) ++ SXADDQ INCX, X, X ++ ST $f10, 0(Y) ++ ST $f11, SIZE(Y) ++ SXADDQ INCY, Y, Y ++#endif ++ subl $5, 1, $5 ++ bgt $5, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/cscal.S b/kernel/sw_64/cscal.S +new file mode 100644 +index 0000000..5ea7cc0 +--- /dev/null ++++ b/kernel/sw_64/cscal.S +@@ -0,0 +1,217 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. 
*/ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++ .set noat ++ .set noreorder ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++ ++ .globl NAME ++ .ent NAME ++ ++NAME: ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++#endif ++ ++#ifndef C_INTERFACE ++ ldl $16, 0($16) # n ++ mov $18, $20 # Store Address ++ ldl $19, 0($19) # incx ++ nop ++ ++ LD $f1, 0($17) # alpha ++#else ++ mov $18, $20 # Store Address ++ fmov $f17, $f1 # alpha ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ sra $16, 1, $21 # 4-unrolling ++ ble $16, $End ++ ++ ldi $23, -1($19) ++ ble $19, $End ++ ++ bgt $23, $INC_NOT_1 ++ .align 4 ++ ++ ble $21, $Sub ++ ldi $21, -1($21) ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ ++ LD $f12, 2*SIZE($18) ++ LD $f13, 3*SIZE($18) ++ ldi $18, 4*SIZE($18) ++ ble $21, $MainRemain ++ .align 4 ++ ++$MainLoop: ++ MUL $f10, $f1, $f20 ++ LD $f10, 0*SIZE($18) ++ MUL $f11, $f1, $f21 ++ LD $f11, 1*SIZE($18) ++ ++ MUL $f12, $f1, $f22 ++ LD $f12, 2*SIZE($18) ++ MUL $f13, $f1, $f23 ++ LD $f13, 3*SIZE($18) ++ ++ ldi $18, 4*SIZE($18) ++ ldi $21, -1($21) ++ ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ ST $f22, 2*SIZE($20) ++ ST $f23, 3*SIZE($20) ++ ldi $20, 4*SIZE($20) ++ ++ bgt $21, $MainLoop ++ .align 4 ++ ++$MainRemain: ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ MUL $f12, $f1, $f22 ++ MUL $f13, $f1, $f23 ++ ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ ST $f22, 2*SIZE($20) ++ ST $f23, 3*SIZE($20) ++ ldi $20, 4*SIZE($20) ++ .align 4 ++ ++$Sub: ++ blbc $16, $End ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ .align 4 ++ ++$End: ++ ret ++ .align 4 ++ ++$INC_NOT_1: ++ addl $19, $19, 
$19 ++ ble $21, $INC_Sub ++ ldi $21, -1($21) ++ ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f12, 0*SIZE($18) ++ LD $f13, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ble $21, $INC_MainRemain ++ .align 4 ++ ++$INC_MainLoop: ++ MUL $f10, $f1, $f20 ++ LD $f10, 0*SIZE($18) ++ MUL $f11, $f1, $f21 ++ LD $f11, 1*SIZE($18) ++ ++ SXADDQ $19, $18, $18 ++ ++ MUL $f12, $f1, $f22 ++ LD $f12, 0*SIZE($18) ++ MUL $f13, $f1, $f23 ++ LD $f13, 1*SIZE($18) ++ ++ SXADDQ $19, $18, $18 ++ ++ ST $f20, 0*SIZE($20) ++ ldi $21, -1($21) ++ ST $f21, 1*SIZE($20) ++ SXADDQ $19, $20, $20 ++ ++ ST $f22, 0*SIZE($20) ++ ST $f23, 1*SIZE($20) ++ SXADDQ $19, $20, $20 ++ unop ++ bgt $21, $INC_MainLoop ++ .align 4 ++ ++$INC_MainRemain: ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ MUL $f12, $f1, $f22 ++ MUL $f13, $f1, $f23 ++ ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ SXADDQ $19, $20, $20 ++ ++ ST $f22, 0*SIZE($20) ++ ST $f23, 1*SIZE($20) ++ SXADDQ $19, $20, $20 ++ .align 4 ++ ++$INC_Sub: ++ blbc $16, $INC_End ++ ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ .align 4 ++ ++$INC_End: ++ ret ++ .end NAME ++ .ident VERSION +diff --git a/kernel/sw_64/dnrm2.S b/kernel/sw_64/dnrm2.S +new file mode 100644 +index 0000000..2752e83 +--- /dev/null ++++ b/kernel/sw_64/dnrm2.S +@@ -0,0 +1,431 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ SXADDQ INCX, 0, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 4, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, a0 ++ s_fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, a1 ++ faddd a2, t2, a2 ++ faddd a3, t3, 
a3 ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L25 ++ ++ fclr t2 ++ fclr t3 ++ ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x3, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x5, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x6, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, a2 ++ LD x1, 0 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ LD x3, 0 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, a2 ++ LD x5, 0 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, a1 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, a0 ++ ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/dot.S b/kernel/sw_64/dot.S +new file mode 100644 +index 0000000..028a551 +--- /dev/null ++++ b/kernel/sw_64/dot.S +@@ -0,0 +1,534 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define ADD faddd ++#define MUL fmuld ++ ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++ ++#define I $5 ++ ++#define s0 $f0 ++#define s1 $f30 ++#define s2 $f1 ++#define s3 $f2 ++ ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 ++ ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 ++ ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldi $sp, -16($sp) ++ fclr s0 ++ fstd $f2, 0($sp) ++ fclr s1 ++ ++ fclr s2 ++ nop ++ fclr s3 ++ ble N, $L999 ++ ++ fclr t0 ++ cmpeq INCX, 1, $21 ++ fclr t1 ++ cmpeq INCY, 1, $22 ++ fclr t2 ++ and $21, $22, $22 ++ fclr t3 ++ beq $22, $L20 ++ ++#ifndef DOUBLE ++ srl N, 4, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 16 * SIZE, X ++ subl I, 1, I ++ ++ addl Y, 16 * SIZE, Y ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ s_fillcs PREFETCHSIZE * 2 * SIZE(X) ++ subl I, 1, I ++ s_fillcs PREFETCHSIZE * 2 * SIZE(Y) ++ addl X, 16 * SIZE, X ++ ++ ADD s0, t0, s0 ++ LD b6, -10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -24 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -23 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -22 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -21 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -20 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -19 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -18 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -17 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 
-1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -16 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -15 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -14 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -13 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -12 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -11 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -10 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -9 * SIZE(X) ++ ++ addl Y, 16 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD b6,-10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -8 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L15: ++ ADD s0, t0, s0 ++ and N, 15, I ++ ADD s1, t1, s1 ++ ble I, $L18 ++ .align 4 ++ ++#else ++ ++ srl N, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 8 * SIZE, X ++ subl I, 1, I ++ ++ addl Y, 8 * SIZE, Y ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ s_fillcs PREFETCHSIZE * SIZE(X) ++ subl I, 1, I ++ s_fillcs PREFETCHSIZE * SIZE(Y) ++ addl X, 8 * SIZE, X ++ ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -8 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) ++ ++ addl Y, 8 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L15: ++ ADD s0, t0, s0 ++ and N, 7, I ++ ADD s1, t1, s1 
++ ble I, $L18 ++ .align 4 ++ ++#endif ++ ++$L16: ++ LD a0, 0 * SIZE(X) ++ addl X, SIZE, X ++ LD b0, 0 * SIZE(Y) ++ addl Y, SIZE, Y ++ ++ ADD s2, t2, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L16 ++ .align 4 ++ ++$L18: ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ br $L999 ++ .align 4 ++ ++$L20: ++ srl N, 2, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b2, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) ++ subl I, 1, I ++ ++ SXADDQ INCY, Y, Y ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b2, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ subl I, 1, I ++ bgt I, $L22 ++ nop ++ fnop ++ .align 4 ++ ++$L23: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ .align 4 ++ ++$L25: ++ ADD s0, t0, s0 ++ and N, 3, I ++ ADD s1, t1, s1 ++ ble I, $L28 ++ .align 4 ++ ++$L26: ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L26 ++ .align 4 ++ ++$L28: ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ .align 4 ++ ++$L999: ++ ADD s2, s3, s2 ++ fldd $f2, 0($sp) ++ ADD s0, s1, s0 ++ ldi $sp, 16($sp) ++ ++ ADD s0, s2, s0 ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/gemm_beta.S b/kernel/sw_64/gemm_beta.S +new file mode 100644 +index 0000000..00e2d12 +--- /dev/null ++++ b/kernel/sw_64/gemm_beta.S +@@ -0,0 +1,179 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++CNAME: ++ .frame $sp, 0, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++#endif ++ ++ ldl $18, 16($sp) ++ ble $16, $End ++ ldl $19, 24($sp) ++ ble $17, $End ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO) ++ .align 4 ++ ++$BETA_NE_ZERO: ++ sra $16, 3, $2 # i = (m >> 3) ++ mov $18, $1 # c_offset = c ++ ldi $17, -1($17) # j -- ++ ble $2,$L52 ++ .align 4 ++ ++$L51: ++ fillde 64($1) ++ ldi $2, -1($2) ++ ++ LD $f14, 0*SIZE($1) ++ LD $f15, 1*SIZE($1) ++ LD $f16, 2*SIZE($1) ++ LD $f17, 3*SIZE($1) ++ LD $f18, 4*SIZE($1) ++ LD $f11, 5*SIZE($1) ++ LD $f21, 6*SIZE($1) ++ LD $f22, 7*SIZE($1) ++ ++ MUL $f19, $f14, $f23 ++ MUL $f19, $f15, $f24 ++ MUL $f19, $f16, $f25 ++ MUL $f19, $f17, $f26 ++ MUL $f19, $f18, $f27 ++ MUL $f19, $f11, $f28 ++ MUL $f19, $f21, $f29 ++ MUL $f19, $f22, $f30 ++ ++ ST $f23, 0*SIZE($1) ++ ST $f24, 1*SIZE($1) ++ ST $f25, 2*SIZE($1) ++ ST $f26, 3*SIZE($1) ++ ST $f27, 4*SIZE($1) ++ ST $f28, 5*SIZE($1) ++ ST $f29, 6*SIZE($1) ++ ST $f30, 7*SIZE($1) ++ ++ ldi $1,8*SIZE($1) ++ bgt $2,$L51 ++ .align 4 ++ ++$L52: ++ and $16, 7, $2 ++ ble $2,$L54 ++ .align 4 ++ ++$L53: ++ LD $f12, 0($1) ++ ldi $2, -1($2) ++ MUL $f19, $f12, $f23 ++ ST $f23, 0($1) ++ ldi $1, SIZE($1) ++ bgt $2,$L53 ++ .align 4 ++ ++$L54: ++ SXADDQ $19, $18, $18 # c += ldc ++ bgt $17,$BETA_NE_ZERO ++ clr $0 ++ ret ++ .align 4 ++ ++$BETA_EQ_ZERO: ++ sra $16, 3, $2 # i = (m >> 3) ++ ldi $4, 8*SIZE($18) ++ mov $18, $1 # c_offset = c ++ ldi $17, -1($17) # j -- ++ ble $2,$L42 ++ .align 4 ++ ++$L41: ++ ST $f31, 0*SIZE($1) ++ ST $f31, 1*SIZE($1) ++ ST $f31, 2*SIZE($1) ++ ST $f31, 3*SIZE($1) ++ ST $f31, 4*SIZE($1) ++ ST $f31, 5*SIZE($1) ++ ST $f31, 6*SIZE($1) ++ ST $f31, 7*SIZE($1) ++ ldi $2, -1($2) ++ ++ ldi $4, 8*SIZE($4) ++ ldi $1, 8*SIZE($1) ++ bgt $2,$L41 ++ .align 4 ++ ++$L42: ++ and $16, 7, $2 ++ ble $2,$L44 ++ .align 4 ++ ++$L43: ++ ldi $2, -1($2) ++ ST $f31, 0($1) ++ ldi $1, SIZE($1) ++ bgt $2, $L43 ++ .align 4 ++ ++$L44: ++ SXADDQ $19, $18, $18 # c += ldc ++ bgt $17,$BETA_EQ_ZERO ++ clr $0 ++ .align 4 ++ ++$End: ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/gemm_kernel_4x4.S b/kernel/sw_64/gemm_kernel_4x4.S +new file mode 100644 +index 0000000..2039c84 +--- /dev/null ++++ b/kernel/sw_64/gemm_kernel_4x4.S +@@ -0,0 +1,2844 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The 
University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#if !defined(SW8A) ++#error "Architecture is not specified." 
++#endif ++ ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define BB $3 ++#define OFFSET $4 ++ ++#define ALPHA 64($sp) ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++#ifdef TRMMKERNEL ++ ldl OFFSET, 16 + STACKSIZE($sp) ++#endif ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ fstd $f19, ALPHA ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 ++ ++$L01: ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ s4addl K, 0, BB ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ addl C2, LDC, C3 ++ s4addl LDC, C, C ++ ++ SXADDQ BB, B, BB ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(SW8A) ++ s_fillcs 0 * SIZE(BB) ++ s_fillcs 8 * SIZE(BB) ++ unop ++ ldi BB, 16 * SIZE(BB) ++#endif ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillde 4 * SIZE(C1) ++ fclr c03 ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c04 ++ ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ 
ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++#endif ++ ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, c11 ++ fldd alpha, ALPHA ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L18 ++#else ++ blbs TMP1, $L18 ++#endif ++ .align 4 ++ ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD 
a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L18: ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++#ifndef TRMMKERNEL ++ LD b5, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL b1, a3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++#ifndef TRMMKERNEL ++ LD b1, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++#ifndef TRMMKERNEL ++ LD a1, 0 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++#ifndef TRMMKERNEL ++ LD a2, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++#ifndef TRMMKERNEL ++ LD b2, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c09, t1, c09 ++ ldi I, -1(I) ++ MUL b3, a3, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++#ifndef TRMMKERNEL ++ LD b3, 0 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++#ifndef TRMMKERNEL ++ LD a4, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++#ifndef TRMMKERNEL ++ LD a3, 2 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c11, t1, c11 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD b4, 3 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL alpha, c02, c02 ++#ifndef TRMMKERNEL ++ LD t1, 1 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c16, t3, c16 ++ unop ++ MUL alpha, c03, c03 ++#ifndef TRMMKERNEL ++ LD t2, 2 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c15, t4, c15 ++ unop ++ MUL alpha, c04, c04 ++#ifndef TRMMKERNEL ++ LD t3, 3 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ MUL alpha, c05, c05 ++ unop ++#ifndef TRMMKERNEL ++ ADD c01, a5, c01 ++ LD t4, 1 * SIZE(C4) ++#else ++ unop ++ unop ++#endif ++ ++ MUL alpha, c06, c06 ++#ifndef TRMMKERNEL ++ unop ++ ADD c02, b5, c02 ++ LD a5, 2 * SIZE(C4) ++#endif ++ ++ MUL alpha, c07, c07 ++#ifndef TRMMKERNEL ++ unop ++ ADD c03, a2, c03 ++ LD b5, 3 * SIZE(C4) ++#endif ++ ++ MUL alpha, c08, c08 ++#ifndef TRMMKERNEL ++ unop ++ ADD c04, b2, c04 ++ unop ++#endif ++ ++ MUL alpha, c09, c09 ++ ST c01, 0 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c05, b1, c05 ++ unop ++#endif ++ ++ MUL alpha, c10, c10 ++ ST c02, 1 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c06, a4, c06 ++ unop ++#endif ++ ++ MUL alpha, c11, c11 ++ ST c03, 2 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c07, a3, c07 ++ unop ++#endif ++ ++ MUL alpha, c12, c12 ++ ST c04, 3 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c08, b4, c08 ++#else ++ unop ++#endif ++ ldi C1, 4 * SIZE(C1) ++ ++ MUL alpha, c13, c13 ++ ST c05, 0 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c09, a1, c09 ++ unop ++#endif ++ ++ MUL alpha, c14, c14 ++ ST c06, 1 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c10, t1, c10 ++ unop ++#endif ++ ++ MUL alpha, c15, c15 ++ ST c07, 2 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c11, t2, c11 ++ unop ++#endif ++ ++ MUL alpha, c16, c16 ++ ST c08, 3 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c12, t3, c12 ++#else ++ unop ++#endif ++ ldi C2, 4 * SIZE(C2) ++ ++#ifndef TRMMKERNEL ++ ADD c13, b3, c13 ++#else ++ unop ++#endif ++ ST c09, 0 * SIZE(C3) ++ fclr t1 ++ ldi C4, 4 * SIZE(C4) ++ ++#ifndef TRMMKERNEL ++ ADD 
c14, t4, c14 ++#else ++ unop ++#endif ++ ST c10, 1 * SIZE(C3) ++ fclr t2 ++ unop ++ ++#ifndef TRMMKERNEL ++ ADD c15, a5, c15 ++#else ++ unop ++#endif ++ ST c11, 2 * SIZE(C3) ++ fclr t3 ++ unop ++ ++#ifndef TRMMKERNEL ++ ADD c16, b5, c16 ++#else ++ unop ++#endif ++ ST c12, 3 * SIZE(C3) ++ fclr t4 ++ ldi C3, 4 * SIZE(C3) ++ ++ ST c13, -4 * SIZE(C4) ++ ST c14, -3 * SIZE(C4) ++ ST c15, -2 * SIZE(C4) ++ ST c16, -1 * SIZE(C4) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble L, $L25 ++ ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, c09 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L28 ++#else ++ blbs TMP1, $L28 ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD c14, t4, c14 
++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L28: ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD a4, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++#ifndef TRMMKERNEL ++ LD b5, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++#ifndef TRMMKERNEL ++ LD b1, 0 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++#ifndef TRMMKERNEL ++ LD b2, 1 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++#ifndef TRMMKERNEL ++ LD b3, 0 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c09, t1, c09 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD b4, 1 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL alpha, c02, c02 ++ unop ++ ++ ADD c13, t3, c13 ++ MUL alpha, c05, c05 ++ ADD c14, t4, c14 ++ MUL alpha, c06, c06 ++ ++ MUL alpha, c09, c09 ++#ifndef TRMMKERNEL ++ ADD c01, a3, c01 ++#endif ++ MUL alpha, c10, c10 ++#ifndef TRMMKERNEL ++ ADD c02, a4, c02 ++#endif ++ ++ MUL alpha, c13, c13 ++#ifndef TRMMKERNEL ++ ADD c05, a5, c05 ++#endif ++ MUL alpha, c14, c14 ++#ifndef TRMMKERNEL ++ ADD c06, b5, c06 ++#endif ++ ++#ifndef TRMMKERNEL ++ ADD c09, b1, c09 ++ unop ++#endif ++ ST c01, 0 * SIZE(C1) ++ fclr t1 ++ ++#ifndef TRMMKERNEL ++ ADD c10, b2, c10 ++ unop ++#endif ++ ST c02, 1 * SIZE(C1) ++ fclr t2 ++ ++#ifndef TRMMKERNEL ++ ADD c13, b3, c13 ++ unop ++#endif ++ ST c05, 0 * SIZE(C2) ++ fclr t3 ++ ++#ifndef TRMMKERNEL ++ ADD c14, b4, c14 ++ unop ++#endif ++ ST c06, 1 * SIZE(C2) ++ fclr t4 ++ ++ ST c09, 0 * SIZE(C3) ++ ldi C1, 2 * SIZE(C1) ++ ST c10, 1 * SIZE(C3) ++ ldi C2, 2 * SIZE(C2) ++ ++ ST c13, 0 * SIZE(C4) ++ ldi C3, 2 * SIZE(C3) ++ ST c14, 1 * SIZE(C4) ++ ldi C4, 2 * SIZE(C4) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ and M, 1, I ++ ble I, $L39 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble L, $L35 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, 
TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L38 ++#else ++ blbs TMP1, $L38 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L38: ++ ADD c05, t2, c05 ++ unop ++ MUL a1, b2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c09, t3, c09 ++ unop ++ MUL a1, b3, t3 ++#ifndef TRMMKERNEL ++ LD b5, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c13, t4, c13 ++ unop ++ MUL a1, b4, t4 ++#ifndef TRMMKERNEL ++ LD a2, 0 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c05, t2, c05 ++ unop ++ MUL alpha, c05, c05 ++ unop ++ ++ ADD c09, t3, c09 ++ MUL alpha, c09, c09 ++ ADD c13, t4, c13 ++ MUL alpha, c13, c13 ++ ++#ifndef TRMMKERNEL ++ ADD c01, a5, c01 ++ ADD c05, b5, c05 ++ ADD c09, a2, c09 ++ ADD c13, a3, c13 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif ++ .align 4 ++ ++$L39: ++ mov BO, B ++ ldi J, -1(J) ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 4, KK ++#else ++ unop ++#endif ++ bgt J, $L01 ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ fclr t1 ++ addl C2, LDC, C ++ fclr t2 ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 ++ ++$L51: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr 
c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ble L, $L55 ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, c05 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L58 ++#else ++ blbs TMP1, $L58 ++#endif ++ .align 4 ++ ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L58: ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++#ifndef TRMMKERNEL ++ LD c11, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++#ifndef TRMMKERNEL ++ LD c12, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++#ifndef TRMMKERNEL ++ LD c13, 0 * SIZE(C2) ++ unop ++#endif ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++#ifndef TRMMKERNEL ++ LD c14, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 
++#ifndef TRMMKERNEL ++ LD c15, 2 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c05, t1, c05 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD c16, 3 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c06, t2, c06 ++ ldi I, -1(I) ++ MUL alpha, c02, c02 ++ unop ++ ++ ADD c07, t3, c07 ++ MUL alpha, c03, c03 ++ ADD c08, t4, c08 ++ MUL alpha, c04, c04 ++ ++ MUL alpha, c05, c05 ++#ifndef TRMMKERNEL ++ ADD c01, c09, c01 ++#endif ++ MUL alpha, c06, c06 ++#ifndef TRMMKERNEL ++ ADD c02, c10, c02 ++#endif ++ ++ MUL alpha, c07, c07 ++#ifndef TRMMKERNEL ++ ADD c03, c11, c03 ++#endif ++ MUL alpha, c08, c08 ++#ifndef TRMMKERNEL ++ ADD c04, c12, c04 ++#endif ++ ++#ifndef TRMMKERNEL ++ ADD c05, c13, c05 ++#endif ++ ST c01, 0 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c06, c14, c06 ++#endif ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef TRMMKERNEL ++ ADD c07, c15, c07 ++#endif ++ ST c03, 2 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c08, c16, c08 ++#endif ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ fclr t1 ++ ST c06, 1 * SIZE(C2) ++ fclr t2 ++ ST c07, 2 * SIZE(C2) ++ fclr t3 ++ ST c08, 3 * SIZE(C2) ++ fclr t4 ++ ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ bgt I, $L51 ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ble L, $L65 ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L68 
++#else ++ blbs TMP1, $L68 ++#endif ++ .align 4 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L68: ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c11, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD c12, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c02, t2, c02 ++ ldi C1, 2 * SIZE(C1) ++ MUL alpha, c02, c02 ++ ldi C2, 2 * SIZE(C2) ++ ++ ADD c05, t3, c05 ++ MUL alpha, c05, c05 ++ ADD c06, t4, c06 ++ MUL alpha, c06, c06 ++ ++#ifndef TRMMKERNEL ++ ADD c01, c09, c01 ++ ADD c02, c10, c02 ++ ADD c05, c11, c05 ++ ADD c06, c12, c06 ++#endif ++ ++ ST c01, -2 * SIZE(C1) ++ fclr t1 ++ ST c02, -1 * SIZE(C1) ++ fclr t2 ++ ST c05, -2 * SIZE(C2) ++ fclr t3 ++ ST c06, -1 * SIZE(C2) ++ fclr t4 ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ and M, 1, I ++ ble I, $L79 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ble L, $L75 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L78 ++#else ++ blbs TMP1, $L78 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, 
t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L78: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 ++#ifndef TRMMKERNEL ++ LD b5, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c01, c02, c01 ++ ADD c05, c06, c05 ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ++ MUL alpha, c01, c01 ++ MUL alpha, c05, c05 ++ ++#ifndef TRMMKERNEL ++ ADD c01, a5, c01 ++ ADD c05, b5, c05 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif ++ .align 4 ++ ++$L79: ++ mov BO, B ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 2, KK ++#else ++ unop ++#endif ++ unop ++ unop ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++ mov C, C1 ++ mov A, AO ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 ++ .align 4 ++ ++$L91: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L95 ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) 
++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++#ifndef TRMMKERNEL ++ ADD c01, t1, c01 ++ LD c05, 0 * SIZE(C1) ++ ADD c02, t2, c02 ++ LD c06, 1 * SIZE(C1) ++ ADD c03, t3, c03 ++ LD c07, 2 * SIZE(C1) ++ ADD c04, t4, c04 ++ LD c08, 3 * SIZE(C1) ++#else ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++#endif ++ ++ MUL alpha, c01, c01 ++ MUL alpha, c02, c02 ++ MUL alpha, c03, c03 ++ MUL alpha, c04, c04 ++ ++#ifndef TRMMKERNEL ++ ADD c01, c05, c01 ++ ADD c02, c06, c02 ++ ADD c03, c07, c03 ++ ADD c04, c08, c04 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ldi C1, 4 * SIZE(C1) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ unop ++ unop ++ ble I, $L110 ++ .align 4 ++ ++$L101: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L105 ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ 
++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C1) ++ LD a4, 1 * SIZE(C1) ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, c01 ++ fclr t1 ++ ADD c02, t2, c02 ++ fclr t2 ++ ADD c03, t3, c03 ++ fclr t3 ++ ADD c04, t4, c04 ++ fclr t4 ++ ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 ++ ++ MUL alpha, c01, c01 ++ MUL alpha, c02, c02 ++ ++#ifndef TRMMKERNEL ++ ADD c01, a3, c01 ++ ADD c02, a4, c02 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ldi C1, 2 * SIZE(C1) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ and M, 1, I ++ ble I, $L999 ++ .align 4 ++ ++$L111: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++#ifndef TRMMKERNEL ++ LD a2, 0 * SIZE(C1) ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, 
c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 ++ ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ ADD c01, a2, c01 ++#endif ++ ST c01, 0 * SIZE(C1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/gemv_n.S b/kernel/sw_64/gemv_n.S +new file mode 100644 +index 0000000..1d9f654 +--- /dev/null ++++ b/kernel/sw_64/gemv_n.S +@@ -0,0 +1,1307 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $20 ++#define LDA $21 ++ ++#define X $18 ++#define INCX $19 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define Y1 $4 ++ ++#define A1 $5 ++#define A2 $6 ++#define A3 $7 ++#define A4 $8 ++ ++#define alpha $f19 ++ ++#define alpha1 $f0 ++#define alpha2 $f1 ++#define alpha3 $f10 ++#define alpha4 $f11 ++ ++#define y0 $f12 ++#define y1 $f13 ++#define y2 $f14 ++#define y3 $f15 ++ ++#define y4 $f16 ++#define y5 $f17 ++#define y6 $f18 ++#define y7 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl X, 0 + STACKSIZE($sp) ++ ldl INCX, 8 + STACKSIZE($sp) ++ ldl Y, 16 + STACKSIZE($sp) ++ ldl INCY, 24 + STACKSIZE($sp) ++ ldl BUFFER, 32 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ SXADDQ INCX, 0, INCX ++ cmple N, 0, $1 ++ SXADDQ INCY, 0, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ SXADDQ LDA, 0, LDA ++ ++ cmpeq INCY, SIZE, $0 ++ bne $0, $L10 ++ ++ mov BUFFER, Y1 ++ ++ mov Y, BUFFER ++ mov Y1, Y ++ ++ sra M, 3, I ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ ST $f31, 2 * SIZE(Y1) ++ ST $f31, 3 * SIZE(Y1) ++ ST $f31, 4 * SIZE(Y1) ++ ST $f31, 5 * SIZE(Y1) ++ ST $f31, 6 * SIZE(Y1) ++ ST $f31, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ ldi I, -1(I) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 7, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ ST $f31, 0 * SIZE(Y1) ++ addl Y1, SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ sra N, 2, J ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ LD alpha1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha3, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha4, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ MUL alpha, alpha1, alpha1 ++ MUL alpha, alpha2, alpha2 ++ MUL alpha, alpha3, alpha3 ++ MUL alpha, alpha4, alpha4 ++ ++ mov A, A1 ++ addl A, LDA, A2 ++ addl A2, LDA, A3 ++ addl A3, LDA, A4 ++ s4addl LDA, A, A ++ ++ mov Y, Y1 ++ s_fillcs 4 * SIZE(X) ++ ++ sra M, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a8, 0 * SIZE(A3) ++ LD a9, 1 * SIZE(A3) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 3 * SIZE(A3) ++ ++ LD y4, 4 * SIZE(Y1) ++ LD y5, 5 * SIZE(Y1) ++ LD y6, 6 * SIZE(Y1) ++ LD y7, 7 * SIZE(Y1) ++ ++ MUL alpha1, a0, a0 ++ LD a12, 0 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ LD a13, 1 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ LD a14, 2 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ LD a15, 3 * SIZE(A4) ++ ++ ADD y0, a0, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, a4 ++ unop ++ ++ ADD y1, a1, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, a5 ++ unop ++ ++ ADD y2, a2, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, a6 ++ unop 
++ ++ ADD y3, a3, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, a7 ++ unop ++ ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha3, a8, a8 ++ unop ++ ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha3, a9, a9 ++ ldi I, -1(I) ++ ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha3, a10, a10 ++ unop ++ ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha3, a11, a11 ++ unop ++ ++ ADD y0, a8, y0 ++ LD a8, 4 * SIZE(A3) ++ MUL alpha4, a12, a12 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD y1, a9, y1 ++ LD a9, 5 * SIZE(A3) ++ MUL alpha4, a13, a13 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ++ ADD y2, a10, y2 ++ LD a10, 6 * SIZE(A3) ++ MUL alpha4, a14, a14 ++ unop ++ ++ ADD y3, a11, y3 ++ LD a11, 7 * SIZE(A3) ++ MUL alpha4, a15, a15 ++ ldi I, -1(I) ++ ++ ADD y0, a12, y0 ++ LD a12, 4 * SIZE(A4) ++ MUL alpha1, a0, a0 ++ fillde (PREFETCHSIZE + 0) * SIZE(Y1) ++ ++ ADD y1, a13, y1 ++ LD a13, 5 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ unop ++ ++ ADD y2, a14, y2 ++ LD a14, 6 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ unop ++ ++ ADD y3, a15, y3 ++ LD a15, 7 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ ++ ADD y4, a0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD y5, a1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD y6, a2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD y7, a3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD y4, a4, y4 ++ LD a4, 8 * SIZE(A2) ++ MUL alpha3, a8, a8 ++ LD y0, 8 * SIZE(Y1) ++ ++ ADD y5, a5, y5 ++ LD a5, 9 * SIZE(A2) ++ MUL alpha3, a9, a9 ++ LD y1, 9 * SIZE(Y1) ++ ++ ADD y6, a6, y6 ++ LD a6, 10 * SIZE(A2) ++ MUL alpha3, a10, a10 ++ LD y2, 10 * SIZE(Y1) ++ ++ ADD y7, a7, y7 ++ LD a7, 11 * SIZE(A2) ++ MUL alpha3, a11, a11 ++ LD y3, 11 * SIZE(Y1) ++ ++ ADD y4, a8, y4 ++ LD a8, 8 * SIZE(A3) ++ MUL alpha4, a12, a12 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A3) ++ ++ ADD y5, a9, y5 ++ LD a9, 9 * SIZE(A3) ++ MUL alpha4, a13, a13 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD y6, a10, y6 ++ LD a10, 10 * SIZE(A3) ++ MUL alpha4, a14, a14 ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD y7, a11, y7 ++ LD a11, 11 * SIZE(A3) ++ MUL alpha4, a15, a15 ++ ldi Y1, 8 * SIZE(Y1) ++ ++ ADD y4, a12, y4 ++ LD a12, 8 * SIZE(A4) ++ MUL alpha1, a0, a0 ++ unop ++ ++ ADD y5, a13, y5 ++ LD a13, 9 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ ldi A3, 8 * SIZE(A3) ++ ++ ADD y6, a14, y6 ++ LD a14, 10 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A4) ++ ++ ADD y7, a15, y7 ++ LD a15, 11 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ ldi A4, 8 * SIZE(A4) ++ ++ ADD y0, a0, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, a4 ++ ST y4, -4 * SIZE(Y1) ++ ++ ADD y1, a1, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, a5 ++ ST y5, -3 * SIZE(Y1) ++ ++ ADD y2, a2, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, a6 ++ ST y6, -2 * SIZE(Y1) ++ ++ ADD y3, a3, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, a7 ++ ST y7, -1 * SIZE(Y1) ++ ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha3, a8, a8 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha3, a9, a9 ++ LD y5, 5 * SIZE(Y1) ++ ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha3, a10, a10 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha3, a11, a11 ++ LD y7, 7 * SIZE(Y1) ++ ++ ADD y0, a8, y0 ++ LD a8, 4 * SIZE(A3) ++ MUL alpha4, a12, a12 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD y1, a9, y1 ++ LD a9, 5 * SIZE(A3) ++ MUL alpha4, a13, a13 ++ unop ++ ++ ADD y2, a10, y2 ++ LD a10, 6 * 
SIZE(A3) ++ MUL alpha4, a14, a14 ++ unop ++ ++ ADD y3, a11, y3 ++ LD a11, 7 * SIZE(A3) ++ MUL alpha4, a15, a15 ++ unop ++ ++ ADD y0, a12, y0 ++ LD a12, 4 * SIZE(A4) ++ MUL alpha1, a0, a0 ++ unop ++ ++ ADD y1, a13, y1 ++ LD a13, 5 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ unop ++ ++ ADD y2, a14, y2 ++ LD a14, 6 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ unop ++ ++ ADD y3, a15, y3 ++ LD a15, 7 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ unop ++ ++ ST y0, 0 * SIZE(Y1) ++ ADD y4, a0, y4 ++ unop ++ MUL alpha2, a4, a4 ++ ++ ST y1, 1 * SIZE(Y1) ++ ADD y5, a1, y5 ++ unop ++ MUL alpha2, a5, a5 ++ ++ ST y2, 2 * SIZE(Y1) ++ ADD y6, a2, y6 ++ unop ++ MUL alpha2, a6, a6 ++ ++ ST y3, 3 * SIZE(Y1) ++ ADD y7, a3, y7 ++ ldi Y1, 8 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ ++ ADD y4, a4, y4 ++ MUL alpha3, a8, a8 ++ ADD y5, a5, y5 ++ MUL alpha3, a9, a9 ++ ADD y6, a6, y6 ++ MUL alpha3, a10, a10 ++ ADD y7, a7, y7 ++ MUL alpha3, a11, a11 ++ ++ ADD y4, a8, y4 ++ MUL alpha4, a12, a12 ++ ADD y5, a9, y5 ++ MUL alpha4, a13, a13 ++ ADD y6, a10, y6 ++ MUL alpha4, a14, a14 ++ ADD y7, a11, y7 ++ MUL alpha4, a15, a15 ++ ++ ADD y4, a12, y4 ++ ADD y5, a13, y5 ++ ADD y6, a14, y6 ++ ADD y7, a15, y7 ++ ++ ST y4, -4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, -3 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) ++ ST y6, -2 * SIZE(Y1) ++ ldi A3, 8 * SIZE(A3) ++ ST y7, -1 * SIZE(Y1) ++ ldi A4, 8 * SIZE(A4) ++ .align 4 ++ ++$L15: ++ and M, 4, I ++ ble I, $L16 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 0 * SIZE(A3) ++ LD a9, 1 * SIZE(A3) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 3 * SIZE(A3) ++ ++ MUL alpha1, a0, a0 ++ LD a12, 0 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ LD a13, 1 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ LD a14, 2 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ LD a15, 3 * SIZE(A4) ++ ++ ADD y0, a0, y0 ++ MUL alpha2, a4, a4 ++ ADD y1, a1, y1 ++ MUL alpha2, a5, a5 ++ ADD y2, a2, y2 ++ MUL alpha2, a6, a6 ++ ADD y3, a3, y3 ++ MUL alpha2, a7, a7 ++ ++ ADD y0, a4, y0 ++ MUL alpha3, a8, a8 ++ ADD y1, a5, y1 ++ MUL alpha3, a9, a9 ++ ADD y2, a6, y2 ++ MUL alpha3, a10, a10 ++ ADD y3, a7, y3 ++ MUL alpha3, a11, a11 ++ ++ ADD y0, a8, y0 ++ MUL alpha4, a12, a12 ++ ADD y1, a9, y1 ++ MUL alpha4, a13, a13 ++ ADD y2, a10, y2 ++ MUL alpha4, a14, a14 ++ ADD y3, a11, y3 ++ MUL alpha4, a15, a15 ++ ++ ADD y0, a12, y0 ++ ldi Y1, 4 * SIZE(Y1) ++ ADD y1, a13, y1 ++ unop ++ ++ ADD y2, a14, y2 ++ unop ++ ADD y3, a15, y3 ++ unop ++ ++ ST y0, -4 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, -3 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, -2 * SIZE(Y1) ++ ldi A3, 4 * SIZE(A3) ++ ST y3, -1 * SIZE(Y1) ++ ldi A4, 4 * SIZE(A4) ++ .align 4 ++ ++$L16: ++ and M, 2, I ++ ble I, $L17 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(A3) ++ MUL alpha1, a0, a0 ++ LD a5, 1 * SIZE(A3) ++ MUL alpha1, a1, a1 ++ LD a6, 0 * SIZE(A4) ++ MUL alpha2, a2, a2 ++ LD a7, 1 * SIZE(A4) ++ MUL alpha2, a3, a3 ++ ++ ADD y0, a0, y0 ++ MUL alpha3, a4, a4 ++ ADD y1, a1, y1 ++ MUL alpha3, a5, a5 ++ ADD y0, a2, y0 ++ MUL alpha4, a6, a6 ++ ADD y1, a3, y1 ++ MUL alpha4, a7, a7 ++ ++ ADD y0, a4, y0 ++ ldi A1, 2 * SIZE(A1) ++ ADD y1, a5, y1 ++ ldi A2, 2 * SIZE(A2) ++ ADD y0, a6, y0 ++ ldi A3, 2 * SIZE(A3) ++ ADD y1, a7, y1 ++ ldi A4, 2 * SIZE(A4) ++ ++ ST y0, 0 * SIZE(Y1) ++ unop ++ ST 
y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 ++ ++$L17: ++ blbc M, $L18 ++ ++ LD y0, 0 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ ++ MUL alpha1, a0, a0 ++ MUL alpha2, a1, a1 ++ MUL alpha3, a2, a2 ++ MUL alpha4, a3, a3 ++ ++ ADD y0, a0, y0 ++ ADD y0, a1, y0 ++ ADD y0, a2, y0 ++ ADD y0, a3, y0 ++ ++ ST y0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L18: ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ and N, 2, J ++ ble J, $L30 ++ ++ LD alpha1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha2, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ mov A, A1 ++ MUL alpha, alpha1, alpha1 ++ addl A, LDA, A2 ++ MUL alpha, alpha2, alpha2 ++ ++ addl A2, LDA, A ++ mov Y, Y1 ++ ++ sra M, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ MUL alpha1, a0, a0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a2, a2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a3, a3 ++ LD y7, 7 * SIZE(Y1) ++ ++ ADD y0, a0, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, a4 ++ ++ ADD y1, a1, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, a5 ++ ++ ADD y2, a2, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, a6 ++ ++ ADD y3, a3, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, a7 ++ ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha1, a0, a0 ++ ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ldi I, -1(I) ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD y4, a0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD y5, a1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD y6, a2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD y7, a3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD y4, a4, y4 ++ LD a4, 0 * SIZE(A2) ++ MUL alpha1, a0, a0 ++ LD y0, 8 * SIZE(Y1) ++ ++ ADD y5, a5, y5 ++ LD a5, 1 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ LD y1, 9 * SIZE(Y1) ++ ++ ADD y6, a6, y6 ++ LD a6, 2 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ LD y2, 10 * SIZE(Y1) ++ ++ ADD y7, a7, y7 ++ LD a7, 3 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ LD y3, 11 * SIZE(Y1) ++ ++ ADD y0, a0, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ LD a0, 12 * SIZE(A1) ++ ++ ADD y1, a1, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ LD a1, 13 * SIZE(A1) ++ ++ ADD y2, a2, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ LD a2, 14 * SIZE(A1) ++ ++ ADD y3, a3, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ LD a3, 15 * SIZE(A1) ++ ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha1, a0, a0 ++ LD y4, 12 * SIZE(Y1) ++ ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ LD y5, 13 * SIZE(Y1) ++ ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ LD y6, 14 * SIZE(Y1) ++ ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ LD y7, 15 * SIZE(Y1) ++ ++ fillde (PREFETCHSIZE + 0) * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD y4, a0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, 
a4, a4 ++ unop ++ ++ ADD y5, a1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ unop ++ ++ ADD y6, a2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ unop ++ ++ ADD y7, a3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ unop ++ ++ ADD y4, a4, y4 ++ ADD y5, a5, y5 ++ ADD y6, a6, y6 ++ ADD y7, a7, y7 ++ ++ ST y4, 4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, 5 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) ++ ++ ST y6, 6 * SIZE(Y1) ++ unop ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L25: ++ and M, 4, I ++ ble I, $L26 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ MUL alpha1, a0, a0 ++ LD a4, 0 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ LD a5, 1 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ LD a6, 2 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ LD a7, 3 * SIZE(A2) ++ ++ ADD y0, a0, y0 ++ MUL alpha2, a4, a4 ++ ADD y1, a1, y1 ++ MUL alpha2, a5, a5 ++ ADD y2, a2, y2 ++ MUL alpha2, a6, a6 ++ ADD y3, a3, y3 ++ MUL alpha2, a7, a7 ++ ++ ADD y0, a4, y0 ++ ldi Y1, 4 * SIZE(Y1) ++ ADD y1, a5, y1 ++ unop ++ ADD y2, a6, y2 ++ unop ++ ADD y3, a7, y3 ++ unop ++ ++ ST y0, -4 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, -3 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, -2 * SIZE(Y1) ++ ldi A3, 4 * SIZE(A3) ++ ST y3, -1 * SIZE(Y1) ++ ldi A4, 4 * SIZE(A4) ++ .align 4 ++ ++$L26: ++ and M, 2, I ++ ble I, $L27 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ ++ MUL alpha1, a0, a0 ++ MUL alpha1, a1, a1 ++ MUL alpha2, a2, a2 ++ MUL alpha2, a3, a3 ++ ++ ADD y0, a0, y0 ++ ldi A1, 2 * SIZE(A1) ++ ADD y1, a1, y1 ++ ldi A2, 2 * SIZE(A2) ++ ADD y0, a2, y0 ++ unop ++ ADD y1, a3, y1 ++ unop ++ ++ ST y0, 0 * SIZE(Y1) ++ unop ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 ++ ++$L27: ++ blbc M, $L30 ++ ++ LD y0, 0 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ ++ MUL alpha1, a0, a0 ++ MUL alpha2, a1, a1 ++ ++ ADD y0, a0, y0 ++ ADD y0, a1, y0 ++ ++ ST y0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L30: ++ blbc N, $L990 ++ ++ LD alpha1, 0 * SIZE(X) ++ mov A, A1 ++ MUL alpha, alpha1, alpha1 ++ mov Y, Y1 ++ ++ sra M, 3, I ++ ble I, $L35 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ LD a4, 4 * SIZE(A1) ++ LD a5, 5 * SIZE(A1) ++ LD a6, 6 * SIZE(A1) ++ LD a7, 7 * SIZE(A1) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ LD y4, 4 * SIZE(Y1) ++ LD y5, 5 * SIZE(Y1) ++ LD y6, 6 * SIZE(Y1) ++ LD y7, 7 * SIZE(Y1) ++ ++ MUL alpha1, a0, a0 ++ MUL alpha1, a1, a1 ++ MUL alpha1, a2, a2 ++ MUL alpha1, a3, a3 ++ ++ ldi I, -1(I) ++ ble I, $L33 ++ .align 4 ++ ++$L32: ++ ADD y0, a0, y0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, a4 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD y1, a1, y1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, a5 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD y2, a2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, a6 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD y3, a3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, a7 ++ LD a3, 11 * SIZE(A1) ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ ST y2, 2 * SIZE(Y1) ++ ST y3, 3 * SIZE(Y1) ++ ++ ADD y4, a4, y4 ++ LD y0, 8 * SIZE(Y1) ++ MUL alpha1, a0, a0 ++ LD a4, 12 * SIZE(A1) ++ ++ ADD y5, a5, y5 ++ LD y1, 9 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ LD a5, 13 * SIZE(A1) ++ ++ ADD y6, a6, y6 ++ LD y2, 10 * SIZE(Y1) ++ MUL alpha1, a2, a2 ++ LD a6, 14 * SIZE(A1) ++ ++ ADD y7, a7, y7 ++ 
LD y3, 11 * SIZE(Y1) ++ MUL alpha1, a3, a3 ++ LD a7, 15 * SIZE(A1) ++ ++ ST y4, 4 * SIZE(Y1) ++ ldi I, -1(I) ++ ST y5, 5 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ++ ST y6, 6 * SIZE(Y1) ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ fillde (PREFETCHSIZE + 0) * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L32 ++ .align 4 ++ ++$L33: ++ ADD y0, a0, y0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, a4 ++ unop ++ ++ ADD y1, a1, y1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, a5 ++ unop ++ ++ ADD y2, a2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, a6 ++ unop ++ ++ ADD y3, a3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, a7 ++ unop ++ ++ ADD y4, a4, y4 ++ ST y0, 0 * SIZE(Y1) ++ ADD y5, a5, y5 ++ ST y1, 1 * SIZE(Y1) ++ ADD y6, a6, y6 ++ ST y2, 2 * SIZE(Y1) ++ ADD y7, a7, y7 ++ ST y3, 3 * SIZE(Y1) ++ ++ ST y4, 4 * SIZE(Y1) ++ unop ++ ST y5, 5 * SIZE(Y1) ++ unop ++ ++ ST y6, 6 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L35: ++ and M, 4, I ++ ble I, $L36 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ MUL alpha1, a0, a0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, a2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, a3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD y0, a0, y0 ++ ADD y1, a1, y1 ++ ADD y2, a2, y2 ++ ADD y3, a3, y3 ++ ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, 2 * SIZE(Y1) ++ unop ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 ++ ++$L36: ++ and M, 2, I ++ ble I, $L37 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a0, a0 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ ++ ADD y0, a0, y0 ++ ADD y1, a1, y1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 2 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 ++ ++$L37: ++ blbc M, $L990 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD a0, 0 * SIZE(A1) ++ ++ MUL alpha1, a0, a0 ++ ++ ADD y0, a0, y0 ++ ST y0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L990: ++ cmpeq INCY, SIZE, $0 ++ bne $0, $L999 ++ ++ mov BUFFER, Y1 ++ ++ sra M, 3, I ++ ble I, $L995 ++ .align 4 ++ ++$L992: ++ LD a0, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a1, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a2, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a3, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ LD y2, 2 * SIZE(Y) ++ LD y3, 3 * SIZE(Y) ++ ++ LD a4, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a5, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a6, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a7, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y4, 4 * SIZE(Y) ++ LD y5, 5 * SIZE(Y) ++ LD y6, 6 * SIZE(Y) ++ LD y7, 7 * SIZE(Y) ++ ++ ADD a0, y0, a0 ++ ADD a1, y1, a1 ++ ADD a2, y2, a2 ++ ADD a3, y3, a3 ++ ADD a4, y4, a4 ++ ADD a5, y5, a5 ++ ADD a6, y6, a6 ++ ADD a7, y7, a7 ++ ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a1, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a2, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a3, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ST a4, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a5, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a6, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a7, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ ldi Y, 8 * SIZE(Y) ++ bgt I, $L992 ++ .align 4 ++ ++$L995: ++ and M, 7, I ++ ble I, $L999 ++ .align 4 ++ ++$L996: ++ LD a0, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ ldi Y, 1 
* SIZE(Y) ++ ++ ADD a0, y0, a0 ++ ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L996 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/gemv_t.S b/kernel/sw_64/gemv_t.S +new file mode 100644 +index 0000000..68bce3f +--- /dev/null ++++ b/kernel/sw_64/gemv_t.S +@@ -0,0 +1,1061 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $20 ++#define LDA $21 ++ ++#define X $18 ++#define INCX $19 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define X1 $3 ++#define Y1 $4 ++ ++#define A1 $5 ++#define A2 $6 ++#define A3 $7 ++#define A4 $8 ++ ++#define alpha $f19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl X, 0 + STACKSIZE($sp) ++ ldl INCX, 8 + STACKSIZE($sp) ++ ldl Y, 16 + STACKSIZE($sp) ++ ldl INCY, 24 + STACKSIZE($sp) ++ ldl BUFFER, 32 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ SXADDQ INCX, 0, INCX ++ cmple N, 0, $1 ++ SXADDQ INCY, 0, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ cmpeq INCX, SIZE, $0 ++ mov X, X1 ++ SXADDQ LDA, 0, LDA ++ bne $0, $L10 ++ ++ sra M, 3, I ++ mov BUFFER, Y1 ++ mov BUFFER, X ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ ldi I, -1(I) ++ ++ LD a0, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a1, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a2, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a3, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ST a2, 2 * SIZE(Y1) ++ ST a3, 3 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a5, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a6, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a7, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a4, 4 * SIZE(Y1) ++ ST a5, 5 * SIZE(Y1) ++ ST a6, 6 * SIZE(Y1) ++ ST a7, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 7, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ LD a0, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ mov Y, Y1 ++ fclr t0 ++ unop ++ fclr t1 ++ ++ sra N, 2, J ++ fclr t2 ++ fclr t3 ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ mov A, A1 ++ fclr s0 ++ addl A, LDA, A2 ++ fclr s1 ++ ++ addl A2, LDA, A3 ++ fclr s2 ++ addl A3, LDA, A4 ++ fclr s3 ++ ++ s4addl LDA, A, A ++ unop ++ mov X, X1 ++ fillde 3 * SIZE(Y) ++ ++ sra M, 3, I ++ ble I, $L15 ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ LD a4, 1 * SIZE(A1) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 1 * SIZE(A3) ++ LD a7, 1 * SIZE(A4) ++ LD a8, 2 * SIZE(A1) ++ LD a9, 2 * SIZE(A2) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 2 * SIZE(A4) ++ LD a12, 3 * SIZE(A1) ++ LD a13, 3 * SIZE(A2) ++ LD a14, 3 * SIZE(A3) ++ LD a15, 3 * SIZE(A4) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 4 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * 
SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 4 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x0, a2, t2 ++ LD a2, 4 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD a3, 4 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 5 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a5, t1 ++ LD a5, 5 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 5 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x1, a7, t3 ++ LD a7, 5 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, -2 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x2, a9, t1 ++ LD a9, 6 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi A2, 8 * SIZE(A2) ++ MUL x2, a10, t2 ++ LD a10, 6 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ ldi A3, 8 * SIZE(A3) ++ MUL x2, a11, t3 ++ LD a11, 6 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, -1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A4, 8 * SIZE(A4) ++ MUL x3, a13, t1 ++ LD a13, -1 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x3, a14, t2 ++ LD a14, -1 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x3, a15, t3 ++ LD a15, -1 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 0 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE - 8) * SIZE(A3) ++ MUL x0, a1, t1 ++ LD a1, 0 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x0, a2, t2 ++ LD a2, 0 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD a3, 0 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x0, 8 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x1, a5, t1 ++ LD a5, 1 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 1 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x1, a7, t3 ++ LD a7, 1 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x1, 9 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, 2 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE - 8) * SIZE(A4) ++ MUL x2, a9, t1 ++ LD a9, 2 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi X1, 8 * SIZE(X1) ++ MUL x2, a10, t2 ++ LD a10, 2 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ ldi I, -1(I) ++ MUL x2, a11, t3 ++ LD a11, 2 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x2, 2 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, 3 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE - 8) * SIZE(X1) ++ MUL x3, a13, t1 ++ LD a13, 3 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x3, a14, t2 ++ LD a14, 3 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ MUL x3, a15, t3 ++ LD a15, 3 * SIZE(A4) ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 4 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ LD a1, 4 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x0, a2, t2 ++ LD a2, 4 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD a3, 4 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 5 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x1, a5, t1 ++ LD a5, 5 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 5 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x1, a7, t3 ++ LD a7, 5 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, 6 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x2, a9, t1 ++ LD a9, 6 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x2, a10, t2 ++ LD a10, 6 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x2, a11, t3 ++ LD a11, 6 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, 7 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A1, 8 * 
SIZE(A1) ++ MUL x3, a13, t1 ++ LD a13, 7 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi A2, 8 * SIZE(A2) ++ MUL x3, a14, t2 ++ LD a14, 7 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ ldi A3, 8 * SIZE(A3) ++ MUL x3, a15, t3 ++ LD a15, 7 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a0, t0 ++ unop ++ ++ ADD s1, t1, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x0, a1, t1 ++ ldi A4, 8 * SIZE(A4) ++ ++ ADD s2, t2, s2 ++ MUL x0, a2, t2 ++ ADD s3, t3, s3 ++ MUL x0, a3, t3 ++ ++ ADD s0, t0, s0 ++ MUL x1, a4, t0 ++ ADD s1, t1, s1 ++ MUL x1, a5, t1 ++ ++ ADD s2, t2, s2 ++ MUL x1, a6, t2 ++ ADD s3, t3, s3 ++ MUL x1, a7, t3 ++ ++ ADD s0, t0, s0 ++ MUL x2, a8, t0 ++ ADD s1, t1, s1 ++ MUL x2, a9, t1 ++ ++ ADD s2, t2, s2 ++ MUL x2, a10, t2 ++ ADD s3, t3, s3 ++ MUL x2, a11, t3 ++ ++ ADD s0, t0, s0 ++ MUL x3, a12, t0 ++ ADD s1, t1, s1 ++ MUL x3, a13, t1 ++ ++ ADD s2, t2, s2 ++ MUL x3, a14, t2 ++ ADD s3, t3, s3 ++ MUL x3, a15, t3 ++ .align 4 ++ ++$L15: ++ and M, 7, I ++ ble I, $L18 ++ ++ LD x0, 0 * SIZE(X1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ ++ ldi I, -1(I) ++ ble I, $L17 ++ .align 4 ++ ++$L16: ++ ADD s0, t0, s0 ++ ldi A4, 1 * SIZE(A4) ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A1, 1 * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 1 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi A2, 1 * SIZE(A2) ++ MUL x0, a2, t2 ++ LD a2, 1 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ ldi A3, 1 * SIZE(A3) ++ MUL x0, a3, t3 ++ LD a3, 0 * SIZE(A4) ++ ++ LD x0, 1 * SIZE(X1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L16 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ ADD s1, t1, s1 ++ MUL x0, a1, t1 ++ ++ ADD s2, t2, s2 ++ MUL x0, a2, t2 ++ ADD s3, t3, s3 ++ MUL x0, a3, t3 ++ .align 4 ++ ++$L18: ++ LD a0, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a1, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a2, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a3, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ MUL alpha, s0, s0 ++ MUL alpha, s1, s1 ++ MUL alpha, s2, s2 ++ MUL alpha, s3, s3 ++ ++ ADD a0, s0, a0 ++ fclr t0 ++ ADD a1, s1, a1 ++ fclr t1 ++ ADD a2, s2, a2 ++ fclr t2 ++ ADD a3, s3, a3 ++ fclr t3 ++ ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a1, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a2, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a3, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ and N, 2, J ++ ble J, $L30 ++ mov A, A1 ++ addl A, LDA, A2 ++ ++ addl A2, LDA, A ++ fclr s0 ++ mov X, X1 ++ fclr s1 ++ ++ sra M, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 1 * SIZE(A1) ++ LD a3, 1 * SIZE(A2) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 2 * SIZE(A2) ++ LD a6, 3 * SIZE(A1) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 4 * SIZE(A1) ++ LD a9, 4 * SIZE(A2) ++ LD a10, 5 * SIZE(A1) ++ LD a11, 5 * SIZE(A2) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 6 * SIZE(A2) ++ LD a14, 7 * SIZE(A1) ++ LD a15, 7 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 8 * SIZE(A2) ++ ++ ADD s0, t2, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a2, t2 ++ LD a2, 9 * SIZE(A1) ++ ++ ADD s1, t3, s1 ++ unop ++ MUL x1, a3, t3 ++ LD a3, 9 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a4, t0 ++ LD a4, 10 * SIZE(A1) 
++ ++ ADD s1, t1, s1 ++ ldi I, -1(I) ++ MUL x2, a5, t1 ++ LD a5, 10 * SIZE(A2) ++ ++ ADD s0, t2, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a6, t2 ++ LD a6, 11 * SIZE(A1) ++ ++ ADD s1, t3, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a7, t3 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x3, -1 * SIZE(X1) ++ MUL x0, a8, t0 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x0, a9, t1 ++ LD a9, 12 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x0, 0 * SIZE(X1) ++ MUL x1, a10, t0 ++ LD a10, 13 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a11, t1 ++ LD a11, 13 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x1, 1 * SIZE(X1) ++ MUL x2, a12, t0 ++ LD a12, 6 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ MUL x2, a13, t1 ++ LD a13, 14 * SIZE(A2) ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x2, 2 * SIZE(X1) ++ MUL x3, a14, t0 ++ LD a14, 7 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ MUL x3, a15, t1 ++ LD a15, 7 * SIZE(A2) ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ unop ++ ++ ADD s0, t2, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a2, t2 ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD s1, t3, s1 ++ unop ++ MUL x1, a3, t3 ++ unop ++ ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a4, t0 ++ unop ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x2, a5, t1 ++ unop ++ ++ ADD s0, t2, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a6, t2 ++ unop ++ ++ ADD s1, t3, s1 ++ unop ++ MUL x3, a7, t3 ++ unop ++ ++ ADD s0, t0, s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a8, t0 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x0, a9, t1 ++ unop ++ ++ ADD s0, t0, s0 ++ MUL x1, a10, t0 ++ ADD s1, t1, s1 ++ MUL x1, a11, t1 ++ ++ ADD s0, t0, s0 ++ MUL x2, a12, t0 ++ ADD s1, t1, s1 ++ MUL x2, a13, t1 ++ ++ ADD s0, t0, s0 ++ MUL x3, a14, t0 ++ ADD s1, t1, s1 ++ MUL x3, a15, t1 ++ .align 4 ++ ++$L25: ++ and M, 7, I ++ ble I, $L28 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L27 ++ .align 4 ++ ++$L26: ++ ADD s0, t0, s0 ++ ldi A2, 1 * SIZE(A2) ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A1, 1 * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 0 * SIZE(A2) ++ ++ LD x0, 1 * SIZE(X1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++$L27: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ ADD s1, t1, s1 ++ MUL x0, a1, t1 ++ .align 4 ++ ++$L28: ++ LD a0, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a1, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 ++ ++ MUL alpha, s0, s0 ++ MUL alpha, s1, s1 ++ ++ ADD a0, s0, a0 ++ ADD a1, s1, a1 ++ ++ ST a0, 0 * SIZE(Y1) ++ fclr t0 ++ addl Y1, INCY, Y1 ++ fclr t1 ++ ++ ST a1, 0 * SIZE(Y1) ++ fclr t2 ++ addl Y1, INCY, Y1 ++ fclr t3 ++ .align 4 ++ ++$L30: ++ blbc N, $L999 ++ ++ mov A, A1 ++ fclr s0 ++ mov X, X1 ++ fclr s1 ++ ++ sra M, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $L35 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a8, 0 * SIZE(X1) ++ LD a9, 1 * SIZE(X1) ++ ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ LD a10, 2 * SIZE(X1) ++ LD a11, 3 * SIZE(X1) ++ ++ LD a4, 4 * SIZE(A1) ++ LD a5, 5 * SIZE(A1) ++ LD a12, 4 * SIZE(X1) ++ LD a13, 5 * SIZE(X1) ++ ++ LD a6, 6 * SIZE(A1) ++ LD a7, 7 * SIZE(A1) ++ LD a14, 6 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L33 ++ .align 4 ++ ++$L32: ++ ADD s0, t0, s0 ++ LD a15, 7 * SIZE(X1) ++ MUL a0, a8, t0 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ LD a8, 8 * SIZE(X1) ++ MUL 
a1, a9, t1 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD s2, t2, s2 ++ LD a9, 9 * SIZE(X1) ++ MUL a2, a10, t2 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD s3, t3, s3 ++ LD a10, 10 * SIZE(X1) ++ MUL a3, a11, t3 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD s0, t0, s0 ++ LD a11, 11 * SIZE(X1) ++ MUL a4, a12, t0 ++ LD a4, 12 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ LD a12, 12 * SIZE(X1) ++ MUL a5, a13, t1 ++ LD a5, 13 * SIZE(A1) ++ ++ ADD s2, t2, s2 ++ LD a13, 13 * SIZE(X1) ++ MUL a6, a14, t2 ++ LD a6, 14 * SIZE(A1) ++ ++ ADD s3, t3, s3 ++ LD a14, 14 * SIZE(X1) ++ MUL a7, a15, t3 ++ LD a7, 15 * SIZE(A1) ++ ++ ldi A1, 8 * SIZE(A1) ++ ldi I, -1(I) ++ ldi X1, 8 * SIZE(X1) ++ bgt I, $L32 ++ .align 4 ++ ++$L33: ++ ADD s0, t0, s0 ++ LD a15, 7 * SIZE(X1) ++ MUL a0, a8, t0 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL a1, a9, t1 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD s2, t2, s2 ++ MUL a2, a10, t2 ++ ADD s3, t3, s3 ++ MUL a3, a11, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, a12, t0 ++ ADD s1, t1, s1 ++ MUL a5, a13, t1 ++ ++ ADD s2, t2, s2 ++ MUL a6, a14, t2 ++ ADD s3, t3, s3 ++ MUL a7, a15, t3 ++ .align 4 ++ ++$L35: ++ and M, 7, I ++ ble I, $L38 ++ ++ LD a0, 0 * SIZE(A1) ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L37 ++ .align 4 ++ ++$L36: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ LD x0, 1 * SIZE(X1) ++ ++ ldi A1, 1 * SIZE(A1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L36 ++ .align 4 ++ ++$L37: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ .align 4 ++ ++$L38: ++ LD a0, 0 * SIZE(Y) ++ ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 ++ ADD s0, s1, s0 ++ ++ MUL alpha, s0, s0 ++ ADD a0, s0, a0 ++ ++ ST a0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/iamax.S b/kernel/sw_64/iamax.S +new file mode 100644 +index 0000000..662dc82 +--- /dev/null ++++ b/kernel/sw_64/iamax.S +@@ -0,0 +1,440 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 6 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef F_INTERFACE ++ ldl N, 0(N) # n ++ ldl INCX, 0(INCX) # incx ++#endif ++ ldi $sp, -STACKSIZE($sp) ++ mov X, XX ++ .align 4 ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $2 ++ clr $0 ++ ++ fstd $f6, 32($sp) ++ fclr $f0 ++ sra N, 3, $1 ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ unop ++ fabs $f20, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ fabs $f20, $f1 ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f21, 0 * SIZE(X) ++ fabs $f20, $f2 ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fabs $f20, $f3 ++ addl X, INCX, X ++ unop ++ ++ LD $f23, 0 * SIZE(X) ++ fabs $f20, $f4 ++ addl X, INCX, X ++ unop ++ ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ fabs $f20, $f5 ++ unop ++ ++ LD $f25, 0 * SIZE(X) ++ fabs $f20, $f6 ++ addl X, INCX, X ++ unop ++ ++ LD $f26, 0 * SIZE(X) ++ fabs $f20, $f28 ++ addl X, INCX, X ++ ldi $1, -1($1) ++ ++ LD $f27, 0 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ fselne $f16, $f12, $f4, $f4 ++ unop ++ fabs $f20, $f29 ++ s_fillcs 56 * SIZE(X) ++ ++ fselne $f17, $f13, $f5, $f5 ++ LD $f20, 0 * SIZE(X) ++ fabs $f21, $f30 ++ addl X, INCX, X ++ ++ fselne $f18, $f14, $f6, $f6 ++ LD $f21, 0 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ fselne $f19, $f15, $f28, $f28 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ addl X, INCX, X ++ ++ fabs $f24, $f12 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f0, $f29), $f16 ++ addl X, INCX, X ++ ++ fabs $f25, $f13 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f1, $f30), $f17 ++ addl X, INCX, X ++ ++ fabs $f26, $f14 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f2, $f10), $f18 ++ addl X, INCX, X ++ ++ fabs $f27, $f15 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f3, $f11), $f19 ++ addl X, INCX, X ++ ++ fselne $f16, $f29, $f0, $f0 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f4, $f12), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f30, $f1, $f1 ++ unop ++ CMPLT($f5, $f13), $f17 ++ ldi $1, -1($1) # i -- ++ ++ fselne $f18, $f10, $f2, $f2 ++ unop ++ CMPLT($f6, $f14), $f18 ++ unop ++ ++ fselne $f19, $f11, $f3, $f3 ++ unop ++ CMPLT($f28, $f15), $f19 ++ bgt $1,$L12 ++ .align 4 ++ 
++$L13: ++ fselne $f16, $f12, $f4, $f4 ++ fabs $f20, $f29 ++ fselne $f17, $f13, $f5, $f5 ++ fabs $f21, $f30 ++ ++ fselne $f18, $f14, $f6, $f6 ++ fabs $f22, $f10 ++ fselne $f19, $f15, $f28, $f28 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ CMPLT($f0, $f29), $f16 ++ fabs $f25, $f13 ++ CMPLT($f1, $f30), $f17 ++ ++ fabs $f26, $f14 ++ CMPLT($f2, $f10), $f18 ++ fabs $f27, $f15 ++ CMPLT($f3, $f11), $f19 ++ ++ fselne $f16, $f29, $f0, $f0 ++ CMPLT($f4, $f12), $f16 ++ fselne $f17, $f30, $f1, $f1 ++ CMPLT($f5, $f13), $f17 ++ ++ fselne $f18, $f10, $f2, $f2 ++ CMPLT($f6, $f14), $f18 ++ fselne $f19, $f11, $f3, $f3 ++ CMPLT($f28, $f15), $f19 ++ ++ fselne $f16, $f12, $f4, $f4 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f13, $f5, $f5 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f18, $f14, $f6, $f6 ++ CMPLT($f4, $f5), $f18 ++ fselne $f19, $f15, $f28, $f28 ++ CMPLT($f6, $f28), $f19 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ fselne $f18, $f5, $f4, $f4 ++ fselne $f19, $f28, $f6, $f6 ++ ++ CMPLT($f0, $f2), $f16 ++ CMPLT($f4, $f6), $f17 ++ ++ fselne $f16, $f2, $f0, $f0 ++ fselne $f17, $f6, $f4, $f4 ++ ++ CMPLT($f0, $f4), $f16 ++ fselne $f16, $f4, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $L20 ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$L20: ++ sra N, 3, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f11, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f13, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f15, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f17, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 ++ ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f18, $f2 ++ ++ LD $f11, 0 * SIZE(XX) ++ fabs $f15, $f23 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f19, $f3 ++ ++ LD $f12, 0 * SIZE(XX) ++ fabs $f16, $f24 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f20, $f4 ++ ++ LD $f13, 0 * SIZE(XX) ++ fabs $f17, $f25 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f21, $f5 ++ ++ LD $f14, 0 * SIZE(XX) ++ ldi $1, -1($1) # i -- ++ fcmpeq $f0, $f22, $f26 ++ addl XX, INCX, XX ++ ++ ldi $0, 1($0) ++ fbne $f2, $End ++ ++ LD $f15, 0 * SIZE(XX) ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ fbne $f3, $End ++ ++ addl XX, INCX, XX ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ fbne $f4, $End ++ ++ LD $f16, 0 * SIZE(XX) ++ fcmpeq $f0, $f25, $f29 ++ ldi $0, 1($0) ++ fbne $f5, $End ++ ++ addl XX, INCX, XX ++ ldi $0, 1($0) ++ fabs $f10, $f18 ++ fbne $f26, $End ++ ++ LD $f17, 0 * SIZE(XX) ++ ldi $0, 1($0) ++ fabs $f11, $f19 ++ fbne $f27, $End ++ ++ addl XX, INCX, XX ++ ldi $0, 1($0) ++ fabs $f12, $f20 ++ fbne $f28, $End ++ ++ ldi $0, 1($0) ++ fabs $f13, $f21 ++ fbne $f29, $End ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ fabs $f14, $f22 ++ fcmpeq $f0, $f18, $f2 ++ fabs $f15, $f23 ++ fcmpeq $f0, $f19, $f3 ++ ++ fabs $f16, $f24 ++ fcmpeq $f0, $f20, $f4 ++ fabs $f17, $f25 ++ fcmpeq $f0, $f21, $f5 ++ ++ fcmpeq $f0, $f22, $f26 ++ ldi $0, 1($0) ++ unop ++ fbne $f2, $End ++ ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ unop ++ fbne $f3, $End ++ ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ unop ++ fbne $f4, $End ++ ++ fcmpeq $f0, $f25, $f29 ++ ldi $0, 
1($0) ++ unop ++ fbne $f5, $End ++ ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End ++ .align 4 ++ ++$L40: ++ LD $f20, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f20, $f25 ++ fcmpeq $f0, $f25, $f29 ++ ++ ldi $0, 1($0) ++ fbne $f29, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/imax.S b/kernel/sw_64/imax.S +new file mode 100644 +index 0000000..025a109 +--- /dev/null ++++ b/kernel/sw_64/imax.S +@@ -0,0 +1,351 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ ++ clr $0 ++ mov X, XX ++ .align 4 ++ ++ cmplt $31, N, $2 ++ cmplt $31, INCX, $3 ++ SXADDQ INCX, $31, INCX ++ and $2, $3, $2 ++ ++ sra N, 3, $1 ++ fclr $f0 ++ unop ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f0, 0 * SIZE(X) ++ unop ++ unop ++ ble $1, $L15 ++ .align 4 ++ ++ fmov $f0, $f1 ++ addl X, INCX, X ++ fmov $f0, $f10 ++ ldi $1, -1($1) ++ ++ LD $f21, 0 * SIZE(X) ++ fmov $f0, $f11 ++ addl X, INCX, X ++ fmov $f0, $f12 ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f13 ++ addl X, INCX, X ++ fmov $f0, $f14 ++ ++ LD $f23, 0 * SIZE(X) ++ fmov $f0, $f15 ++ addl X, INCX, X ++ fmov $f0, $f20 ++ ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f25, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f26, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f27, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ CMPLT($f0, $f20), $f16 ++ CMPLT($f1, $f21), $f17 ++ CMPLT($f10, $f22), $f18 ++ CMPLT($f11, $f23), $f19 ++ ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ fselne $f16, $f20, $f0, $f0 ++ LD $f20, 0 * SIZE(X) ++ CMPLT($f12, $f24), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f21, $f1, $f1 ++ LD $f21, 0 * SIZE(X) ++ CMPLT($f13, $f25), $f17 ++ addl X, INCX, X ++ ++ fselne $f18, $f22, $f10, $f10 ++ LD $f22, 0 * SIZE(X) ++ CMPLT($f14, $f26), $f18 ++ addl X, INCX, X ++ ++ fselne $f19, $f23, $f11, $f11 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f15, $f27), $f19 ++ addl X, INCX, X ++ ++ fselne $f16, $f24, $f12, $f12 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f0, $f20), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f25, $f13, $f13 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f1, $f21), $f17 ++ addl X, INCX, X ++ ++ fselne $f18, $f26, $f14, $f14 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f10, $f22), $f18 ++ addl X, INCX, X ++ ++ fselne $f19, $f27, $f15, $f15 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f11, $f23), $f19 ++ ldi $1, -1($1) # i -- ++ ++ addl X, INCX, X ++ unop ++ unop ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ fselne $f16, $f20, $f0, $f0 ++ CMPLT($f12, $f24), $f16 ++ ++ fselne $f17, $f21, $f1, $f1 ++ CMPLT($f13, $f25), $f17 ++ ++ fselne $f18, $f22, $f10, $f10 ++ CMPLT($f14, $f26), $f18 ++ ++ fselne $f19, $f23, $f11, $f11 ++ CMPLT($f15, $f27), $f19 ++ ++ fselne $f16, $f24, $f12, $f12 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f25, $f13, $f13 ++ CMPLT($f10, $f11), $f17 ++ ++ fselne $f18, $f26, $f14, $f14 ++ CMPLT($f12, $f13), $f18 ++ fselne $f19, $f27, $f15, $f15 ++ CMPLT($f14, $f15), $f19 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f11, $f10, $f10 ++ fselne $f18, $f13, $f12, $f12 ++ fselne $f19, $f15, $f14, $f14 ++ ++ CMPLT($f0, $f10), $f16 ++ CMPLT($f12, $f14), $f17 ++ ++ fselne $f16, $f10, $f0, $f0 ++ fselne $f17, $f14, $f12, $f12 ++ ++ CMPLT($f0, $f12), $f16 ++ fselne $f16, $f12, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $L20 ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ CMPLT($f0, $f20), $f16 ++ fselne $f16, $f20, $f0, $f0 ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$L20: ++ sra N, 3, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f11, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f13, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) 
++ addl XX, INCX, XX ++ LD $f15, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f17, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fcmpeq $f0, $f10, $f20 ++ fcmpeq $f0, $f11, $f21 ++ fcmpeq $f0, $f12, $f22 ++ fcmpeq $f0, $f13, $f23 ++ ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fcmpeq $f0, $f14, $f24 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f20, $End ++ ++ LD $f11, 0 * SIZE(XX) ++ fcmpeq $f0, $f15, $f25 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f21, $End ++ ++ LD $f12, 0 * SIZE(XX) ++ fcmpeq $f0, $f16, $f26 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f22, $End ++ ++ LD $f13, 0 * SIZE(XX) ++ fcmpeq $f0, $f17, $f27 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f23, $End ++ ++ LD $f14, 0 * SIZE(XX) ++ fcmpeq $f0, $f10, $f20 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f24, $End ++ ++ LD $f15, 0 * SIZE(XX) ++ fcmpeq $f0, $f11, $f21 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f25, $End ++ ++ LD $f16, 0 * SIZE(XX) ++ ldi $1, -1($1) # i -- ++ fcmpeq $f0, $f12, $f22 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f26, $End ++ ++ LD $f17, 0 * SIZE(XX) ++ fcmpeq $f0, $f13, $f23 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f27, $End ++ ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ ldi $0, 1($0) ++ fcmpeq $f0, $f14, $f24 ++ unop ++ fbne $f20, $End ++ ++ ldi $0, 1($0) ++ fcmpeq $f0, $f15, $f25 ++ unop ++ fbne $f21, $End ++ ++ ldi $0, 1($0) ++ fcmpeq $f0, $f16, $f26 ++ unop ++ fbne $f22, $End ++ ++ ldi $0, 1($0) ++ fcmpeq $f0, $f17, $f27 ++ unop ++ fbne $f23, $End ++ ++ ldi $0, 1($0) ++ fbne $f24, $End ++ ldi $0, 1($0) ++ fbne $f25, $End ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ .align 4 ++ ++$L40: ++ LD $f20, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fcmpeq $f0, $f20, $f29 ++ ++ ldi $0, 1($0) ++ fbne $f29, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/izamax.S b/kernel/sw_64/izamax.S +new file mode 100644 +index 0000000..bbb2ff4 +--- /dev/null ++++ b/kernel/sw_64/izamax.S +@@ -0,0 +1,427 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $2 ++ clr $0 ++ ++ fstd $f6, 32($sp) ++ mov X, XX ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ fclr $f0 ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ sra N, 2, $1 ++ addl INCX, INCX, INCX ++ ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ faddd $f20, $f21, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ ldi $1, -1($1) ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f1 ++ LD $f23, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ fmov $f0, $f2 ++ LD $f25, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ fmov $f0, $f3 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f8 ++ fabs $f21, $f9 ++ fabs $f22, $f10 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ fabs $f25, $f13 ++ fabs $f26, $f14 ++ fabs $f27, $f15 ++ ++ ble $1, $L14 ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ ldi $1, -1($1) ++ addl X, INCX, X ++ ++ LD $f22, 0 * SIZE(X) ++ LD $f23, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ LD $f25, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ faddd $f8, $f9, $f16 ++ unop ++ fabs $f20, $f8 ++ s_fillcs 64 * SIZE(X) ++ ++ faddd $f10, $f11, $f17 ++ unop ++ fabs $f21, $f9 ++ LD $f20, 0 * SIZE(X) ++ ++ faddd $f12, $f13, $f18 ++ LD $f21, 1 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ faddd $f14, $f15, $f19 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ unop ++ ++ CMPLT($f0, $f16), $f4 ++ LD $f23, 1 * SIZE(X) ++ fabs $f24, $f12 ++ addl X, INCX, X ++ ++ CMPLT($f1, $f17), $f5 ++ LD $f24, 0 * SIZE(X) ++ fabs $f25, $f13 ++ unop ++ ++ CMPLT($f2, $f18), $f6 ++ LD $f25, 1 * SIZE(X) ++ fabs $f26, $f14 ++ addl X, INCX, X ++ ++ CMPLT($f3, $f19), $f7 ++ LD $f26, 0 * SIZE(X) ++ fabs $f27, $f15 ++ unop ++ ++ fselne $f4, $f16, $f0, $f0 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ldi $1, -1($1) # i -- ++ ++ fselne $f5, $f17, $f1, $f1 ++ fselne 
$f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ faddd $f8, $f9, $f16 ++ fabs $f20, $f8 ++ ++ faddd $f10, $f11, $f17 ++ fabs $f21, $f9 ++ ++ faddd $f12, $f13, $f18 ++ fabs $f22, $f10 ++ ++ faddd $f14, $f15, $f19 ++ fabs $f23, $f11 ++ ++ CMPLT($f0, $f16), $f4 ++ fabs $f24, $f12 ++ ++ CMPLT($f1, $f17), $f5 ++ fabs $f25, $f13 ++ ++ CMPLT($f2, $f18), $f6 ++ fabs $f26, $f14 ++ CMPLT($f3, $f19), $f7 ++ fabs $f27, $f15 ++ ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ .align 4 ++ ++$L14: ++ faddd $f8, $f9, $f16 ++ faddd $f10, $f11, $f17 ++ faddd $f12, $f13, $f18 ++ faddd $f14, $f15, $f19 ++ ++ CMPLT($f0, $f16), $f4 ++ CMPLT($f1, $f17), $f5 ++ CMPLT($f2, $f18), $f6 ++ CMPLT($f3, $f19), $f7 ++ ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ ++ CMPLT($f0, $f1), $f16 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ ++ CMPLT($f0, $f2), $f16 ++ fselne $f16, $f2, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 3, $1 ++ unop ++ unop ++ ble $1, $L20 ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ faddd $f29, $f30, $f29 ++ ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$L20: ++ sra N, 2, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 ++ ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ fabs $f15, $f23 ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ fabs $f16, $f24 ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ fabs $f17, $f25 ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ faddd $f18, $f19, $f4 ++ faddd $f20, $f21, $f5 ++ faddd $f22, $f23, $f6 ++ faddd $f24, $f25, $f7 ++ ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 ++ ++ fabs $f10, $f18 ++ ldi $0, 1($0) ++ ldi $1, -1($1) # i -- ++ fbne $f26, $End ++ ++ fabs $f11, $f19 ++ ldi $0, 1($0) ++ unop ++ fbne $f27, $End ++ ++ fabs $f12, $f20 ++ ldi $0, 1($0) ++ unop ++ fbne $f28, $End ++ ++ fabs $f13, $f21 ++ ldi $0, 1($0) ++ fbne $f29, $End ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ fabs $f14, $f22 ++ fabs $f15, $f23 ++ fabs $f16, $f24 ++ fabs $f17, $f25 ++ ++ faddd $f18, $f19, $f4 ++ faddd $f20, $f21, $f5 ++ faddd $f22, $f23, $f6 ++ faddd $f24, $f25, $f7 ++ ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 ++ ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End ++ .align 4 ++ ++$L40: ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ ++ faddd $f18, $f19, $f18 ++ fcmpeq $f0, $f18, $f2 ++ ++ ldi $0, 1($0) ++ fbne $f2, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) 
++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/lsame.S b/kernel/sw_64/lsame.S +new file mode 100644 +index 0000000..94dc549 +--- /dev/null ++++ b/kernel/sw_64/lsame.S +@@ -0,0 +1,76 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++ ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl lsame_ ++ .ent lsame_ ++lsame_: ++ .frame $sp,0,$26,0 ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++ .prologue 1 ++#else ++ .prologue 0 ++#endif ++ ++ ldbu $5, 0($16) ++ ldbu $6, 0($17) ++ // extbl $5, $16, $5 ++ // extbl $6, $17, $6 ++ ++ subl $5, 96, $1 ++ subl $6, 96, $2 ++ subl $5, 32, $3 ++ subl $6, 32, $4 ++ ++ selgt $1, $3, $5, $5 ++ selgt $2, $4, $6, $6 ++ cmpeq $5, $6, $0 ++ .align 4 ++ ++$End: ++ ret ++ .end lsame_ ++ .ident VERSION +diff --git a/kernel/sw_64/max.S b/kernel/sw_64/max.S +new file mode 100644 +index 0000000..d4e4bf2 +--- /dev/null ++++ b/kernel/sw_64/max.S +@@ -0,0 +1,227 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. 
*/ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef F_INTERFACE ++ ldl N, 0(N) # n ++ ldl INCX, 0(INCX) # incx ++#endif ++ ldi $sp, -STACKSIZE($sp) ++ nop ++ .align 4 ++ ++ cmplt $31, N, $2 ++ cmplt $31, INCX, $3 ++ SXADDQ INCX, $31, INCX ++ and $2, $3, $0 ++ ++ sra N, 3, $1 ++ fclr $f0 ++ unop ++ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f0, 0 * SIZE(X) ++ unop ++ unop ++ ble $1, $L15 ++ .align 4 ++ ++ fmov $f0, $f1 ++ addl X, INCX, X ++ fmov $f0, $f10 ++ ldi $1, -1($1) ++ ++ LD $f21, 0 * SIZE(X) ++ fmov $f0, $f11 ++ addl X, INCX, X ++ fmov $f0, $f12 ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f13 ++ addl X, INCX, X ++ fmov $f0, $f14 ++ ++ LD $f23, 0 * SIZE(X) ++ fmov $f0, $f15 ++ addl X, INCX, X ++ fmov $f0, $f20 ++ ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f25, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f26, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f27, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ CMPLT($f0, $f20), $f16 ++ CMPLT($f1, $f21), $f17 ++ CMPLT($f10, $f22), $f18 ++ CMPLT($f11, $f23), $f19 ++ ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ fselne $f16, $f20, $f0, $f0 ++ LD $f20, 0 * SIZE(X) ++ CMPLT($f12, $f24), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f21, $f1, $f1 ++ LD $f21, 0 * SIZE(X) ++ CMPLT($f13, $f25), $f17 ++ addl X, INCX, X ++ ++ fselne $f18, $f22, $f10, $f10 ++ LD $f22, 0 * SIZE(X) ++ CMPLT($f14, $f26), $f18 ++ addl X, INCX, X ++ ++ fselne $f19, $f23, $f11, $f11 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f15, $f27), $f19 ++ addl X, INCX, X ++ ++ fselne $f16, $f24, $f12, $f12 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f0, $f20), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f25, $f13, $f13 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f1, $f21), $f17 ++ addl X, INCX, X ++ ++ fselne $f18, $f26, $f14, $f14 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f10, $f22), $f18 ++ addl X, INCX, X ++ ++ 
fselne $f19, $f27, $f15, $f15 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f11, $f23), $f19 ++ ldi $1, -1($1) # i -- ++ ++ addl X, INCX, X ++ unop ++ unop ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ fselne $f16, $f20, $f0, $f0 ++ CMPLT($f12, $f24), $f16 ++ ++ fselne $f17, $f21, $f1, $f1 ++ CMPLT($f13, $f25), $f17 ++ ++ fselne $f18, $f22, $f10, $f10 ++ CMPLT($f14, $f26), $f18 ++ ++ fselne $f19, $f23, $f11, $f11 ++ CMPLT($f15, $f27), $f19 ++ ++ fselne $f16, $f24, $f12, $f12 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f25, $f13, $f13 ++ CMPLT($f10, $f11), $f17 ++ ++ fselne $f18, $f26, $f14, $f14 ++ CMPLT($f12, $f13), $f18 ++ fselne $f19, $f27, $f15, $f15 ++ CMPLT($f14, $f15), $f19 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f11, $f10, $f10 ++ fselne $f18, $f13, $f12, $f12 ++ fselne $f19, $f15, $f14, $f14 ++ ++ CMPLT($f0, $f10), $f16 ++ CMPLT($f12, $f14), $f17 ++ ++ fselne $f16, $f10, $f0, $f0 ++ fselne $f17, $f14, $f12, $f12 ++ ++ CMPLT($f0, $f12), $f16 ++ fselne $f16, $f12, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $End ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ CMPLT($f0, $f20), $f16 ++ fselne $f16, $f20, $f0, $f0 ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$End: ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/rot.S b/kernel/sw_64/rot.S +new file mode 100644 +index 0000000..6680a7e +--- /dev/null ++++ b/kernel/sw_64/rot.S +@@ -0,0 +1,624 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 ++ ++#define C $f10 ++#define S $f11 ++ ++#define PREFETCH_SIZE 80 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fmov $f21, C ++ LD S, 0($sp) ++ ++ cmpeq INCX, 1, $23 ++ cmpeq INCY, 1, $24 ++ ble N, $L998 ++ ++ and $23, $24, $23 ++ beq $23, $L50 ++ ++ sra N, 3, I ++ ble I, $L15 ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 ++ ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, $f22 ++ MUL C, $f15, $f27 ++ ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ MUL C, $f16, $f21 ++ fillde (PREFETCH_SIZE) * SIZE(X) ++ unop ++ LD $f14, 5*SIZE(X) ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ fillde (PREFETCH_SIZE) * SIZE(Y) ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop ++ ++ ST $f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop ++ ++ ST $f26, -1*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ MUL C, $f16, $f21 ++ LD 
$f14, 5*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ LD $f18, 7*SIZE(X) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, $f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) ++ .align 4 ++ ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ST $f26, 0*SIZE(Y) ++ ldi Y, 1 * SIZE(Y) ++ ++ bgt I, $L16 ++ .align 4 ++ ++$L998: ++ clr $0 ++ ret ++ .align 4 ++ ++$L50: ++ mov X, XX ++ mov Y, YY ++ ++ sra N, 3, I ++ ble I, $L55 ++ .align 4 ++ ++$L51: ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ 
++ ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 ++ ++$L55: ++ and N, 7, I ++ ble I, $L999 ++ .align 4 ++ ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ ST $f26, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 ++ .align 4 ++ ++$L999: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/scal.S b/kernel/sw_64/scal.S +new file mode 100644 +index 0000000..39ab088 +--- /dev/null ++++ b/kernel/sw_64/scal.S +@@ -0,0 +1,693 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. 
*/ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $20 ++#define INCX $21 ++ ++#define XX $18 ++#define I $19 ++ ++#define ALPHA $f19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 ++ ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 ++ ++ PROLOGUE ++ PROFCODE ++ ++ mov X, XX ++ ble N, $L999 ++ ++ ldl $0, 24($sp) ++ bne $0, $L11 ++ ++ fbne ALPHA, $L11 ++ cmpeq INCX, 1, $0 ++ beq $0, $L020 ++ ++#ifndef DOUBLE ++ sra N, 4, I ++ ble I, $L015 ++ ++ fmov $f31, t0 ++ fmov $f31, t1 ++ fmov $f31, t2 ++ fmov $f31, t3 ++ ++ ST t0, 0 * SIZE(X) ++ ST t1, 1 * SIZE(X) ++ ST t2, 2 * SIZE(X) ++ ST t3, 3 * SIZE(X) ++ ++ ST t0, 4 * SIZE(X) ++ ST t1, 5 * SIZE(X) ++ ST t2, 6 * SIZE(X) ++ ST t3, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L013 ++ .align 4 ++ ++$L012: ++ ST t0, 8 * SIZE(X) ++ ST t1, 9 * SIZE(X) ++ ST t2, 10 * SIZE(X) ++ ST t3, 11 * SIZE(X) ++ ++ ST t0, 12 * SIZE(X) ++ ST t1, 13 * SIZE(X) ++ ST t2, 14 * SIZE(X) ++ ST t3, 15 * SIZE(X) ++ ++ ST t0, 16 * SIZE(X) ++ ST t1, 17 * SIZE(X) ++ ST t2, 18 * SIZE(X) ++ ST t3, 19 * SIZE(X) ++ ++ ST t0, 20 * SIZE(X) ++ ST t1, 21 * SIZE(X) ++ ST t2, 22 * SIZE(X) ++ ST t3, 23 * SIZE(X) ++ ++ fillde PREFETCHSIZE * SIZE(X) ++ ldi I, -1(I) ++ addl X, 16 * SIZE, X ++ bne I, $L012 ++ .align 4 ++ ++$L013: ++ ST t0, 8 * SIZE(X) ++ ST t1, 9 * SIZE(X) ++ ST t2, 10 * SIZE(X) ++ ST t3, 11 * SIZE(X) ++ ++ ST t0, 12 * SIZE(X) ++ ST t1, 13 * SIZE(X) ++ ST t2, 14 * SIZE(X) ++ ST t3, 15 * SIZE(X) ++ ++ addl X, 16 * SIZE, X ++ .align 4 ++ ++$L015: ++ and N, 15, I ++ ++#else ++ sra N, 3, I ++ ble I, $L015 ++ ++ fmov $f31, t0 ++ fmov $f31, t1 ++ fmov $f31, t2 ++ fmov $f31, t3 ++ ++ ldi I, -1(I) ++ ble I, $L013 ++ .align 4 ++ ++$L012: ++ ST t0, 0 * SIZE(X) ++ ST t1, 1 * SIZE(X) ++ ST t2, 2 * SIZE(X) ++ ST t3, 3 * SIZE(X) ++ ++ ldi I, -1(I) ++ addl X, 8 * SIZE, X ++ ++ ST t0, -4 * SIZE(X) ++ ST t1, -3 * SIZE(X) ++ ST t2, -2 * SIZE(X) ++ ST t3, -1 * SIZE(X) ++ ++ fillde PREFETCHSIZE * SIZE(X) ++ bne I, $L012 ++ .align 4 ++ ++$L013: ++ ST t0, 0 * SIZE(X) ++ ST t1, 1 * SIZE(X) ++ ST t2, 2 * SIZE(X) ++ ST t3, 3 * SIZE(X) ++ ++ ST t0, 4 * SIZE(X) ++ ST t1, 5 * SIZE(X) ++ ST t2, 6 * SIZE(X) ++ ST t3, 7 * SIZE(X) ++ ++ addl X, 8 * SIZE, X ++ .align 4 ++ ++$L015: ++ and N, 7, I ++ ++#endif ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L017: ++ ST $f31, 0 * SIZE(X) ++ addl X, SIZE, X ++ ++ ldi I, -1(I) ++ bne I, $L017 ++ ret ++ .align 4 ++ ++$L020: ++ sra N, 3, I ++ ble I, $L025 ++ ++ fmov $f31, t0 ++ fmov $f31, t1 ++ fmov $f31, t2 ++ fmov $f31, t3 ++ ++ ldi I, -1(I) ++ ble I, $L023 ++ .align 4 ++ ++$L022: ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, 
$L022 ++ .align 4 ++ ++$L023: ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ .align 4 ++ ++$L025: ++ and N, 7, I ++ ble I, $L999 ++ .align 4 ++ ++$L027: ++ ST $f31, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L027 ++ ret ++ .align 4 ++ ++$L11: ++ cmpeq INCX, 1, $0 ++ beq $0, $L20 ++ ++#ifndef DOUBLE ++ sra N, 4, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ LD a4, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ LD a5, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ LD a6, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ LD a7, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 8 * SIZE(X) ++ LD a1, 9 * SIZE(X) ++ LD a2, 10 * SIZE(X) ++ LD a3, 11 * SIZE(X) ++ ++ ST t0, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 12 * SIZE(X) ++ LD a5, 13 * SIZE(X) ++ LD a6, 14 * SIZE(X) ++ LD a7, 15 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t0, 8 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 9 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 10 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 11 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 16 * SIZE(X) ++ LD a1, 17 * SIZE(X) ++ LD a2, 18 * SIZE(X) ++ LD a3, 19 * SIZE(X) ++ ++ ST t0, 12 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 13 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 14 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 15 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 20 * SIZE(X) ++ LD a5, 21 * SIZE(X) ++ LD a6, 22 * SIZE(X) ++ LD a7, 23 * SIZE(X) ++ ++ ST t0, 16 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 17 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 18 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 19 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 24 * SIZE(X) ++ LD a1, 25 * SIZE(X) ++ LD a2, 26 * SIZE(X) ++ LD a3, 27 * SIZE(X) ++ ++ ST t0, 20 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 21 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 22 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 23 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 28 * SIZE(X) ++ LD a5, 29 * SIZE(X) ++ LD a6, 30 * SIZE(X) ++ LD a7, 31 * SIZE(X) ++ ++ fillde PREFETCHSIZE * SIZE(X) ++ ldi I, -1(I) ++ addl X, 16 * SIZE, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ST t0, 8 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 9 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 10 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 11 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ ST t0, 12 * SIZE(X) ++ ST t1, 13 * SIZE(X) ++ ST t2, 14 * SIZE(X) ++ ST t3, 15 * SIZE(X) ++ addl X, 16 * SIZE, X ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ++#else ++ ++ sra N, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ LD a4, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ LD a5, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ LD a6, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ LD a7, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, 
ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 8 * SIZE(X) ++ ldi I, -1(I) ++ LD a1, 9 * SIZE(X) ++ addl X, 8 * SIZE, X ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ ST t0, -4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, -3 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, -2 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, -1 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ fillde PREFETCHSIZE * SIZE(X) ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ ST t0, 4 * SIZE(X) ++ ST t1, 5 * SIZE(X) ++ ST t2, 6 * SIZE(X) ++ ST t3, 7 * SIZE(X) ++ addl X, 8 * SIZE, X ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ++#endif ++ ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ LD a0, 0 * SIZE(X) ++ ++ MUL a0, ALPHA, t0 ++ ++ ST t0, 0 * SIZE(X) ++ ++ addl X, SIZE, X ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ ret ++ .align 4 ++ ++$L20: ++ sra N, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ldi I, -1(I) ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, X, X ++ unop ++ ++ LD a6, 0 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ SXADDQ INCX, X, X ++ unop ++ ++ LD a7, 0 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ SXADDQ INCX, X, X ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++ fillde PREFETCHSIZE * SIZE(X) ++ SXADDQ INCX, XX, XX ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ unop ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t0, 0 * SIZE(XX) ++ MUL a0, ALPHA, t0 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a2, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a6, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a3, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a7, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ unop ++ bne I, $L22 ++ .align 4 ++ ++$L23: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++ SXADDQ INCX, XX, XX ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L27: ++ LD a0, 0 * SIZE(X) ++ ++ MUL a0, ALPHA, t0 ++ ++ ST t0, 0 * SIZE(XX) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L27 ++ .align 4 ++ ++$L999: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/snrm2.S 
b/kernel/sw_64/snrm2.S +new file mode 100644 +index 0000000..2752e83 +--- /dev/null ++++ b/kernel/sw_64/snrm2.S +@@ -0,0 +1,431 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ SXADDQ INCX, 0, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 4, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, a0 ++ s_fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, a1 ++ faddd a2, t2, a2 ++ faddd a3, t3, 
a3 ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L25 ++ ++ fclr t2 ++ fclr t3 ++ ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x3, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x5, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x6, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, a2 ++ LD x1, 0 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ LD x3, 0 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, a2 ++ LD x5, 0 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, a1 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, a0 ++ ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/staticbuffer.S b/kernel/sw_64/staticbuffer.S +new file mode 100644 +index 0000000..7bbd23d +--- /dev/null ++++ b/kernel/sw_64/staticbuffer.S +@@ -0,0 +1,45 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++#ifdef ALLOC_STATIC ++ .align 8 ++ .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 ++#endif +diff --git a/kernel/sw_64/sum.S b/kernel/sw_64/sum.S +new file mode 100644 +index 0000000..fe51d24 +--- /dev/null ++++ b/kernel/sw_64/sum.S +@@ -0,0 +1,206 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ ble N, $L999 ++ ++ sra N, 3, I ++ fclr s1 ++ fclr s2 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t1 ++ SXADDQ INCX, X, X ++ fclr t2 ++ ++ LD a1, 0 * SIZE(X) ++ fclr t3 ++ SXADDQ INCX, X, X ++ fclr s3 ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ s_fillcs PREFETCHSIZE * 2 * SIZE(X) ++ fmov a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ LD a7, 0 * SIZE(X) ++ fmov a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fmov a3, t3 ++ SXADDQ INCX, X, X ++ ++ ADD s0, t0, s0 ++ LD a1, 0 * SIZE(X) ++ fmov a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fmov a5, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ LD a3, 0 * SIZE(X) ++ fmov a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fmov a7, t3 ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fmov a0, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a7, 0 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fmov a2, t2 ++ ADD s3, t3, s3 ++ fmov a3, t3 ++ ++ ADD s0, t0, s0 ++ fmov a4, t0 ++ ADD s1, t1, s1 ++ fmov a5, t1 ++ ADD s2, t2, s2 ++ fmov a6, t2 ++ ADD s3, t3, s3 ++ fmov a7, t3 ++ ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ ADD s0, s1, s0 ++ ADD s2, s3, s2 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ADD s0, s2, s0 ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ fmov a0, t0 ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/swap.S b/kernel/sw_64/swap.S +new file mode 100644 +index 0000000..431d526 +--- /dev/null ++++ b/kernel/sw_64/swap.S +@@ -0,0 +1,252 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++ mov $20, $17 ++ mov $21, $18 ++ ldl $19, 0($sp) ++ ldl $20, 8($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ beq $18, $SubEnd ++ beq $20, $SubEnd ++ ++ subl $18, 1, $1 ++ subl $20, 1, $2 ++ ble $16, $SubEnd # if n <= 0 goto $End ++ or $1, $2, $1 ++ ++ sra $16, 3, $21 ++ ++ and $16, 7, $22 ++ bne $1, $Sub ++ ble $21, $MainRemain ++ .align 4 ++ ++$MainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f12, 2*SIZE($19) ++ LD $f13, 3*SIZE($19) ++ LD $f14, 4*SIZE($19) ++ LD $f15, 5*SIZE($19) ++ LD $f16, 6*SIZE($19) ++ LD $f17, 7*SIZE($19) ++ ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ LD $f22, 2*SIZE($17) ++ LD $f23, 3*SIZE($17) ++ LD $f24, 4*SIZE($17) ++ LD $f25, 5*SIZE($17) ++ LD $f26, 6*SIZE($17) ++ LD $f27, 7*SIZE($17) ++ ++ fillde 32*SIZE($17) ++ unop ++ fillde 32*SIZE($19) ++ subl $21, 1, $21 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f12, 2*SIZE($17) ++ ST $f13, 3*SIZE($17) ++ ST $f14, 4*SIZE($17) ++ ST $f15, 5*SIZE($17) ++ ST $f16, 6*SIZE($17) ++ ST $f17, 7*SIZE($17) ++ ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) ++ ST $f22, 2*SIZE($19) ++ ST $f23, 3*SIZE($19) ++ ST $f24, 4*SIZE($19) ++ ST $f25, 5*SIZE($19) ++ ST $f26, 6*SIZE($19) ++ ST $f27, 7*SIZE($19) ++ ++ ldi $17, 8*SIZE($17) ++ ldi $19, 8*SIZE($19) ++ bgt $21, $MainLoop ++ .align 4 ++ ++$MainRemain: ++ ble $22, $MainEnd ++ .align 4 ++ ++$MainRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ ldi $17, 1*SIZE($17) ++ ldi $19, 1*SIZE($19) ++ subl $22, 1, $22 ++ ST $f10, -1*SIZE($17) ++ ST $f20, -1*SIZE($19) ++ bgt $22, $MainRemainLoop ++ .align 4 ++ ++$MainEnd: ++ clr $0 ++ ret ++ .align 4 ++ ++$Sub: ++ mov $17, $23 ++ mov $19, $24 ++ ++ ble $21, $SubRemain ++ .align 4 ++ ++$SubLoop: ++ LD $f10, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f11, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f12, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f13, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f14, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f15, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f16, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f17, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f20, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f21, 0*SIZE($17) ++ 
SXADDQ $18, $17, $17 ++ ++ LD $f22, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f23, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f24, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f25, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f26, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f27, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ ST $f10, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f11, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f12, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f13, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f14, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f15, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f16, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f17, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f20, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f21, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f22, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f23, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f24, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f25, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f26, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f27, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ subl $21, 1, $21 ++ bgt $21, $SubLoop ++ .align 4 ++ ++$SubRemain: ++ ble $22, $SubEnd ++ .align 4 ++ ++$SubRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ ++ subl $22, 1, $22 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f20, 0*SIZE($19) ++ ++ SXADDQ $18, $17, $17 ++ SXADDQ $20, $19, $19 ++ bgt $22, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S b/kernel/sw_64/trsm_kernel_4x4_LN.S +new file mode 100644 +index 0000000..e9d9093 +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_LN.S +@@ -0,0 +1,4061 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. 
*/ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#if !defined(SW8A) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif ++ ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ fclr t3 ++ fclr t4 ++ ++ and M, 1, I ++ ble I, $L20 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 ++ ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) 
++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 ++ ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L37: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ ++ ADD c13, t4, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ADD c09, t3, c09 ++ ADD c13, t4, c13 ++ ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c01, t1 ++ SUB c09, t1, c09 ++ MUL a4, c01, t1 ++ SUB c13, t1, c13 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b2, c05, t1 ++ SUB c09, t1, c09 ++ MUL b3, c05, t1 ++ SUB c13, t1, c13 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a2, c09, t1 ++ SUB c13, t1, c13 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a2, c13, t1 ++ SUB c09, t1, c09 ++ MUL a3, c13, t1 ++ SUB c05, t1, c05 ++ MUL a4, c13, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b2, c09, t1 ++ SUB c05, t1, c05 ++ MUL b3, c09, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 
++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 ++ ++ ble L, $L25 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 ++ ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ 
MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ ADD c10, t2, c10 ++ ADD c13, t3, c13 ++ ADD c14, t4, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c13, c13 ++ SUB b4, c14, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++ MUL a3, c10, c10 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++#endif ++ 
++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ sra M, 2, I ++ ble I, $L39 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 ++ ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else ++ ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 
++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif ++ ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, c11 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, c11 ++ LD b4, 3 * 
SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c11, t1, c11 ++ ADD c12, t2, c12 ++ ADD c16, t3, c16 ++ ADD c15, t4, c15 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, c03 ++ SUB a2, c07, c07 ++ SUB a3, c11, c11 ++ SUB a4, c15, c15 ++ ++ SUB b1, c04, c04 ++ SUB b2, c08, c08 ++ SUB b3, c12, c12 ++ SUB b4, c16, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++ ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) ++ ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) ++ ++ SUB a1, c09, c09 ++ SUB a2, c10, c10 ++ SUB a3, c11, c11 ++ SUB a4, c12, c12 ++ ++ SUB b1, c13, c13 ++ SUB b2, c14, c14 ++ SUB b3, c15, c15 ++ SUB b4, c16, c16 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ MUL a1, c12, c12 ++ MUL a1, c16, c16 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ MUL a2, c12, t3 ++ MUL a2, c16, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ MUL a3, c12, t3 ++ MUL a3, c16, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ MUL a4, c12, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ MUL b1, c11, c11 ++ MUL b1, c15, c15 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ MUL b2, c11, t3 ++ MUL b2, c15, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, 
t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ MUL b3, c11, t3 ++ MUL b3, c15, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ MUL a3, c09, t3 ++ MUL a3, c13, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ MUL a4, c09, t3 ++ MUL a4, c13, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ MUL b1, c10, c10 ++ MUL b1, c14, c14 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ MUL b2, c10, t3 ++ MUL b2, c14, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ MUL b3, c10, t3 ++ MUL b3, c14, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ MUL a1, c11, c11 ++ MUL a1, c15, c15 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ MUL a2, c11, t3 ++ MUL a2, c15, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++ MUL a3, c12, c12 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ MUL a4, c03, t3 ++ MUL a4, c04, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ MUL b1, c07, c07 ++ MUL b1, c08, c08 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ MUL b2, c07, t3 ++ MUL b2, c08, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ MUL b3, c07, t3 ++ MUL b3, c08, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ MUL a2, c11, t3 ++ MUL a2, c12, 
t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++ MUL a3, c15, c15 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ MUL a1, c15, c15 ++ MUL a1, c16, c16 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ MUL a2, c15, t3 ++ MUL a2, c16, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ MUL a3, c15, t3 ++ MUL a3, c16, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ MUL a4, c15, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ MUL b1, c11, c11 ++ MUL b1, c12, c12 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ MUL b2, c11, t3 ++ MUL b2, c12, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ MUL b3, c11, t3 ++ MUL b3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) ++ ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++ ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) ++ ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) ++ ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ 
++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 4, KK ++#endif ++ ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ fclr t3 ++ fclr t4 ++ ++ and M, 1, I ++ ble I, $L60 ++ ++#if defined(LT) || defined(RN) ++ ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L78 ++ ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L78 ++ ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L77: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 ++ ++ ADD c01, c02, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ++ .align 4 ++ ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c05, c05 ++#endif ++ 
++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L68 ++ ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L68 ++ ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 ++#else ++ blbs TMP1, $L67 ++#endif ++ .align 4 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L67: ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c05, t3, c05 ++ ADD c06, t4, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * 
SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ sra M, 2, I ++ ble I, $L79 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble KK, $L58 ++ ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble TMP1, $L58 ++ ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ 
++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, c05 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L57: ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, c05 ++ ADD c06, t2, c06 ++ ADD c07, t3, c07 ++ ADD c08, t4, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++ ++ SUB b1, c03, c03 ++ SUB b2, c07, c07 ++ SUB b3, c04, c04 ++ SUB b4, c08, c08 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, 
t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++ MUL a3, c07, c07 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT 
++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and M, 1, I ++ ble I, $L100 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 ++ ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++#else ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN 
++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ ble I, $L110 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl 
AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ sra M, 2, I ++ ble I, $L119 ++ .align 4 ++ ++$L91: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB 
a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a2, c04, t1 ++ SUB c03, t1, c03 ++ MUL a3, c04, t1 ++ SUB c02, t1, c02 ++ MUL a4, c04, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b2, c03, t1 ++ SUB c02, t1, c02 ++ MUL b3, c03, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c01, t1 ++ SUB c03, t1, c03 ++ MUL a4, c01, t1 ++ SUB c04, t1, c04 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b2, c02, t1 ++ SUB c03, t1, c03 ++ MUL b3, c02, t1 ++ SUB c04, t1, c04 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a2, c03, t1 ++ SUB c04, t1, c04 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S b/kernel/sw_64/trsm_kernel_4x4_LT.S +new file mode 100644 +index 0000000..4ee360e +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_LT.S +@@ -0,0 +1,4059 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. 
Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#if !defined(SW8A) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, 
OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif ++ ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 ++ ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else ++ ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif ++ ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ 
ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, c11 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c11, t1, c11 ++ ADD c12, t2, c12 ++ ADD c16, t3, c16 ++ ADD c15, t4, c15 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, c03 
++ SUB a2, c07, c07 ++ SUB a3, c11, c11 ++ SUB a4, c15, c15 ++ ++ SUB b1, c04, c04 ++ SUB b2, c08, c08 ++ SUB b3, c12, c12 ++ SUB b4, c16, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++ ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) ++ ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) ++ ++ SUB a1, c09, c09 ++ SUB a2, c10, c10 ++ SUB a3, c11, c11 ++ SUB a4, c12, c12 ++ ++ SUB b1, c13, c13 ++ SUB b2, c14, c14 ++ SUB b3, c15, c15 ++ SUB b4, c16, c16 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ MUL a1, c12, c12 ++ MUL a1, c16, c16 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ MUL a2, c12, t3 ++ MUL a2, c16, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ MUL a3, c12, t3 ++ MUL a3, c16, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ MUL a4, c12, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ MUL b1, c11, c11 ++ MUL b1, c15, c15 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ MUL b2, c11, t3 ++ MUL b2, c15, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ MUL b3, c11, t3 ++ MUL b3, c15, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ MUL a3, c09, t3 ++ MUL a3, c13, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ MUL a4, c09, t3 ++ MUL a4, c13, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ MUL b1, c10, c10 ++ MUL b1, c14, c14 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ MUL b2, c10, t3 ++ MUL b2, c14, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ MUL b3, c10, t3 ++ MUL b3, c14, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, 
c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ MUL a1, c11, c11 ++ MUL a1, c15, c15 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ MUL a2, c11, t3 ++ MUL a2, c15, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++ MUL a3, c12, c12 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ MUL a4, c03, t3 ++ MUL a4, c04, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ MUL b1, c07, c07 ++ MUL b1, c08, c08 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ MUL b2, c07, t3 ++ MUL b2, c08, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ MUL b3, c07, t3 ++ MUL b3, c08, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ MUL a2, c11, t3 ++ MUL a2, c12, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++ MUL a3, c15, c15 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ MUL a1, c15, c15 ++ MUL a1, c16, c16 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ MUL a2, c15, t3 ++ MUL a2, c16, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ MUL a3, c15, t3 ++ MUL a3, c16, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ MUL a4, c15, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ MUL b1, c11, c11 ++ MUL b1, c12, c12 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ MUL b2, c11, t3 ++ MUL b2, c12, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ MUL b3, c11, t3 ++ MUL b3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, 
c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) ++ ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++ ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) ++ ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) ++ ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 ++ ++ ble L, $L25 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 ++ ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ 
MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ ADD c10, t2, c10 ++ ADD c13, t3, c13 ++ ADD c14, t4, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c13, c13 ++ SUB b4, c14, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, 
c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++ MUL a3, c10, c10 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ and M, 1, I ++ ble I, $L39 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 
* SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 ++ ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 ++ ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L37: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ ++ ADD c13, t4, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ADD c09, t3, c09 ++ ADD c13, t4, c13 ++ ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c01, t1 ++ SUB c09, t1, c09 ++ MUL a4, c01, t1 ++ SUB c13, t1, c13 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b2, c05, t1 ++ SUB c09, t1, c09 ++ MUL b3, c05, t1 ++ SUB c13, t1, c13 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a2, c09, t1 ++ SUB c13, t1, c13 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * 
SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a2, c13, t1 ++ SUB c09, t1, c09 ++ MUL a3, c13, t1 ++ SUB c05, t1, c05 ++ MUL a4, c13, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b2, c09, t1 ++ SUB c05, t1, c05 ++ MUL b3, c09, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 4, KK ++#endif ++ ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble KK, $L58 ++ ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble TMP1, $L58 ++ ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 
* SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, c05 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L57: ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, c05 ++ ADD c06, t2, c06 ++ ADD c07, t3, c07 ++ ADD c08, t4, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++ ++ SUB b1, c03, c03 ++ SUB b2, c07, c07 ++ SUB b3, c04, c04 ++ SUB b4, c08, c08 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ ++ SUB 
c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++ MUL a3, c07, c07 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr 
c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L68 ++ ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L68 ++ ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 ++#else ++ blbs TMP1, $L67 ++#endif ++ .align 4 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L67: ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c05, t3, c05 ++ ADD c06, t4, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * 
SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ and M, 1, I ++ ble I, $L79 ++ ++#if defined(LT) || defined(RN) ++ ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L78 ++ ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L78 ++ ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L77: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 ++ ++ ADD c01, c02, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ++ .align 4 ++ ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * 
SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 ++ .align 4 ++ ++$L91: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 
* SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a2, c04, t1 ++ SUB c03, t1, c03 ++ MUL a3, c04, t1 ++ SUB c02, t1, c02 ++ MUL a4, c04, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b2, c03, t1 ++ SUB c02, t1, c02 ++ MUL b3, c03, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c01, t1 ++ SUB c03, t1, c03 ++ MUL a4, c01, t1 ++ SUB c04, t1, c04 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b2, c02, t1 ++ SUB c03, t1, c03 ++ MUL b3, c02, t1 ++ SUB c04, t1, c04 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a2, c03, t1 ++ SUB c04, t1, c04 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif 
++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ ble I, $L110 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 
++ MUL a3, c02, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ and M, 1, I ++ ble I, $L119 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 ++ ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++#else ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll 
TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S b/kernel/sw_64/trsm_kernel_4x4_RT.S +new file mode 100644 +index 0000000..88d1a23 +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_RT.S +@@ -0,0 +1,4059 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#if !defined(SW8A) ++#error "Architecture is not specified." 
++#endif ++ ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ and N, 1, J ++ ble J, $L40 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 ++ .align 4 ++ ++$L91: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 
* SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a2, c04, t1 ++ SUB c03, t1, c03 ++ MUL a3, c04, t1 ++ SUB c02, t1, c02 ++ MUL a4, c04, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b2, c03, t1 ++ SUB c02, t1, c02 ++ MUL b3, c03, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c01, t1 ++ SUB c03, t1, c03 ++ MUL a4, c01, t1 ++ SUB c04, t1, c04 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b2, c02, t1 ++ SUB c03, t1, c03 ++ MUL b3, c02, t1 ++ SUB c04, t1, c04 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a2, c03, t1 ++ SUB c04, t1, c04 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) 
++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ ble I, $L110 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ 
MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ and M, 1, I ++ ble I, $L119 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 ++ ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++#else ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, 
-1 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble KK, $L58 ++ ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble TMP1, $L58 ++ ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, c05 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ 
ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L57: ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, c05 ++ ADD c06, t2, c06 ++ ADD c07, t3, c07 ++ ADD c08, t4, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++ ++ SUB b1, c03, c03 ++ SUB b2, c07, c07 ++ SUB b3, c04, c04 ++ SUB b4, c08, c08 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ ++ SUB c04, t1, 
c04 ++ SUB c08, t2, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++ MUL a3, c07, c07 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L68 ++ ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L68 ++ ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, 
b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 ++#else ++ blbs TMP1, $L67 ++#endif ++ .align 4 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L67: ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c05, t3, c05 ++ ADD c06, t4, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, 
TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ and M, 1, I ++ ble I, $L79 ++ ++#if defined(LT) || defined(RN) ++ ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L78 ++ ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L78 ++ ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L77: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 ++ ++ ADD c01, c02, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ++ .align 4 ++ ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr 
t4 ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ sra N, 2, J ++ ble J, $L999 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif ++ ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 ++ ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else ++ ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif ++ ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD 
a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, c11 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c11, t1, c11 ++ ADD c12, t2, c12 ++ ADD c16, t3, c16 ++ ADD c15, t4, c15 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || 
defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, c03 ++ SUB a2, c07, c07 ++ SUB a3, c11, c11 ++ SUB a4, c15, c15 ++ ++ SUB b1, c04, c04 ++ SUB b2, c08, c08 ++ SUB b3, c12, c12 ++ SUB b4, c16, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++ ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) ++ ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) ++ ++ SUB a1, c09, c09 ++ SUB a2, c10, c10 ++ SUB a3, c11, c11 ++ SUB a4, c12, c12 ++ ++ SUB b1, c13, c13 ++ SUB b2, c14, c14 ++ SUB b3, c15, c15 ++ SUB b4, c16, c16 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ MUL a1, c12, c12 ++ MUL a1, c16, c16 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ MUL a2, c12, t3 ++ MUL a2, c16, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ MUL a3, c12, t3 ++ MUL a3, c16, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ MUL a4, c12, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ MUL b1, c11, c11 ++ MUL b1, c15, c15 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ MUL b2, c11, t3 ++ MUL b2, c15, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ MUL b3, c11, t3 ++ MUL b3, c15, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ MUL a3, c09, t3 ++ MUL a3, c13, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL 
a4, c01, t1 ++ MUL a4, c05, t2 ++ MUL a4, c09, t3 ++ MUL a4, c13, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ MUL b1, c10, c10 ++ MUL b1, c14, c14 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ MUL b2, c10, t3 ++ MUL b2, c14, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ MUL b3, c10, t3 ++ MUL b3, c14, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ MUL a1, c11, c11 ++ MUL a1, c15, c15 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ MUL a2, c11, t3 ++ MUL a2, c15, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++ MUL a3, c12, c12 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ MUL a4, c03, t3 ++ MUL a4, c04, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ MUL b1, c07, c07 ++ MUL b1, c08, c08 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ MUL b2, c07, t3 ++ MUL b2, c08, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ MUL b3, c07, t3 ++ MUL b3, c08, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ MUL a2, c11, t3 ++ MUL a2, c12, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++ MUL a3, c15, c15 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ MUL a1, c15, c15 ++ MUL a1, c16, c16 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ MUL a2, c15, t3 ++ MUL a2, c16, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ MUL a3, c15, t3 ++ MUL a3, c16, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ MUL a4, c15, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ MUL b1, c11, c11 ++ MUL b1, c12, c12 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ MUL b2, c11, t3 ++ MUL b2, c12, t4 ++ ++ SUB c05, t1, c05 ++ SUB 
c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ MUL b3, c11, t3 ++ MUL b3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) ++ ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++ ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) ++ ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) ++ ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 ++ ++ ble L, $L25 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr 
c02 ++ fclr c06 ++ ble TMP1, $L28 ++ ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ ADD c10, t2, c10 ++ ADD c13, t3, c13 ++ ADD c14, t4, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c13, c13 ++ SUB b4, c14, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, 
c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++ MUL a3, c10, c10 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ 
sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ and M, 1, I ++ ble I, $L39 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 ++ ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 ++ ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L37: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ ++ ADD c13, t4, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ADD c09, t3, c09 ++ ADD c13, t4, c13 ++ ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * 
SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c01, t1 ++ SUB c09, t1, c09 ++ MUL a4, c01, t1 ++ SUB c13, t1, c13 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b2, c05, t1 ++ SUB c09, t1, c09 ++ MUL b3, c05, t1 ++ SUB c13, t1, c13 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a2, c09, t1 ++ SUB c13, t1, c13 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a2, c13, t1 ++ SUB c09, t1, c09 ++ MUL a3, c13, t1 ++ SUB c05, t1, c05 ++ MUL a4, c13, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b2, c09, t1 ++ SUB c05, t1, c05 ++ MUL b3, c09, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 4, KK ++#endif ++ ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zamax.S b/kernel/sw_64/zamax.S +new file mode 100644 +index 0000000..46674da +--- /dev/null ++++ b/kernel/sw_64/zamax.S +@@ -0,0 +1,301 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $0 ++ unop ++ ++ fstd $f6, 32($sp) ++ unop ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ fclr $f0 ++ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ sra N, 2, $1 ++ addl INCX, INCX, INCX ++ ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ faddd $f20, $f21, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ ldi $1, -1($1) ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f1 ++ LD $f23, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ fmov $f0, $f2 ++ LD $f25, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ fmov $f0, $f3 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f8 ++ fabs $f21, $f9 ++ fabs $f22, $f10 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ fabs $f25, $f13 ++ fabs $f26, $f14 ++ fabs $f27, $f15 ++ ++ ble $1, $L14 ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ ldi $1, -1($1) ++ addl X, INCX, X ++ ++ LD $f22, 0 * SIZE(X) ++ LD $f23, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ LD $f25, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ faddd $f8, $f9, $f16 ++ unop ++ fabs $f20, $f8 ++ s_fillcs 64 * SIZE(X) ++ ++ faddd $f10, $f11, $f17 ++ unop ++ fabs $f21, $f9 ++ LD $f20, 0 * SIZE(X) ++ ++ faddd $f12, $f13, $f18 ++ LD $f21, 1 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ faddd $f14, $f15, $f19 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ unop ++ ++ CMPLT($f0, $f16), $f4 ++ LD $f23, 1 * SIZE(X) ++ fabs $f24, $f12 ++ addl X, INCX, X ++ ++ CMPLT($f1, $f17), $f5 ++ LD $f24, 0 * SIZE(X) ++ fabs $f25, $f13 ++ unop ++ ++ CMPLT($f2, $f18), 
$f6 ++ LD $f25, 1 * SIZE(X) ++ fabs $f26, $f14 ++ addl X, INCX, X ++ ++ CMPLT($f3, $f19), $f7 ++ LD $f26, 0 * SIZE(X) ++ fabs $f27, $f15 ++ unop ++ ++ fselne $f4, $f16, $f0, $f0 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ldi $1, -1($1) # i -- ++ ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ faddd $f8, $f9, $f16 ++ fabs $f20, $f8 ++ ++ faddd $f10, $f11, $f17 ++ fabs $f21, $f9 ++ ++ faddd $f12, $f13, $f18 ++ fabs $f22, $f10 ++ ++ faddd $f14, $f15, $f19 ++ fabs $f23, $f11 ++ ++ CMPLT($f0, $f16), $f4 ++ fabs $f24, $f12 ++ ++ CMPLT($f1, $f17), $f5 ++ fabs $f25, $f13 ++ ++ CMPLT($f2, $f18), $f6 ++ fabs $f26, $f14 ++ CMPLT($f3, $f19), $f7 ++ fabs $f27, $f15 ++ ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ .align 4 ++ ++$L14: ++ faddd $f8, $f9, $f16 ++ faddd $f10, $f11, $f17 ++ faddd $f12, $f13, $f18 ++ faddd $f14, $f15, $f19 ++ ++ CMPLT($f0, $f16), $f4 ++ CMPLT($f1, $f17), $f5 ++ CMPLT($f2, $f18), $f6 ++ CMPLT($f3, $f19), $f7 ++ ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ ++ CMPLT($f0, $f1), $f16 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ ++ CMPLT($f0, $f2), $f16 ++ fselne $f16, $f2, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 3, $1 ++ unop ++ unop ++ ble $1, $End ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ faddd $f29, $f30, $f29 ++ ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/zasum.S b/kernel/sw_64/zasum.S +new file mode 100644 +index 0000000..9a32605 +--- /dev/null ++++ b/kernel/sw_64/zasum.S +@@ -0,0 +1,208 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ addl INCX, INCX, INCX ++ ++ fclr s1 ++ unop ++ fclr t1 ++ ble N, $L999 ++ ++ fclr s2 ++ sra N, 2, I ++ fclr s3 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t2 ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a2, 0 * SIZE(X) ++ fclr t3 ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ s_fillcs PREFETCHSIZE * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fabs a1, t1 ++ unop ++ ++ ADD s2, t2, s2 ++ LD a7, 1 * SIZE(X) ++ fabs a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fabs a3, t3 ++ unop ++ ++ ADD s0, t0, s0 ++ LD a1, 1 * SIZE(X) ++ fabs a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fabs a5, t1 ++ unop ++ ++ ADD s2, t2, s2 ++ LD a3, 1 * SIZE(X) ++ fabs a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fabs a7, t3 ++ unop ++ ++ LD a5, 1 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fabs a0, t0 ++ ++ ADD s1, t1, s1 ++ LD a7, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fabs a2, t2 ++ ADD s3, t3, s3 ++ fabs a3, t3 ++ ++ ADD s0, t0, s0 ++ fabs a4, t0 ++ ADD s1, t1, s1 ++ fabs a5, t1 ++ ADD s2, t2, s2 ++ fabs a6, t2 ++ ADD s3, t3, s3 ++ fabs a7, t3 ++ ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ .align 4 ++ ++$L15: ++ ADD s0, s2, s0 ++ and N, 3, I ++ ADD s1, s3, s1 ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a1, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ++ ADD s0, s1, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zaxpy.S b/kernel/sw_64/zaxpy.S +new file mode 100644 +index 0000000..bbcb825 +--- /dev/null ++++ b/kernel/sw_64/zaxpy.S +@@ -0,0 +1,611 @@ ++/*********************************************************************/ ++/* Copyright 
2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define PREFETCHSIZE 40 ++ ++#ifndef CONJ ++#define ADD1 SUB ++#define ADD2 ADD ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#endif ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldl $19, 0($sp) ++ fmov $f19, $f29 ++ ldl $20, 8($sp) ++ fmov $f20, $f30 ++ ++ mov $21, $18 ++ ldl $21, 16($sp) ++ ldi $sp, -64($sp) ++ nop ++ ++ fstd $f2, 0($sp) ++ cmpeq $19, 1, $1 ++ fstd $f3, 8($sp) ++ cmpeq $21, 1, $2 ++ ++ fstd $f4, 16($sp) ++ and $16, 3, $5 ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ and $1, $2, $1 ++ ble $16, $End ++ sra $16, 2, $4 ++ beq $1, $Sub ++ ++ ble $4, $Remain ++ subl $4, 1, $4 ++ ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ LD $f2, 2*SIZE($18) ++ LD $f3, 3*SIZE($18) ++ LD $f4, 4*SIZE($18) ++ LD $f5, 5*SIZE($18) ++ LD $f6, 6*SIZE($18) ++ LD $f7, 7*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ LD $f10, 2*SIZE($20) ++ LD $f11, 3*SIZE($20) ++ LD $f12, 4*SIZE($20) ++ LD $f13, 5*SIZE($20) ++ LD $f14, 6*SIZE($20) ++ LD $f15, 7*SIZE($20) ++ ++ addl $18, 8*SIZE, $18 ++ ble $4, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ fillde_e PREFETCHSIZE * SIZE($20) ++ s_fillcs PREFETCHSIZE * SIZE($18) ++ ++ MUL $f29, $f0, $f20 ++ s_fillcs 9*SIZE($18) ++ MUL $f30, $f1, $f21 ++ unop ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ MUL $f29, $f2, $f24 ++ unop ++ MUL $f30, $f3, $f25 ++ nop ++ ++ MUL $f30, $f2, $f26 ++ LD $f2, 2*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 3*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 4*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 ++ addl $20, 8*SIZE, $20 ++ MUL $f29, $f5, $f23 ++ LD $f5, 5*SIZE($18) ++ ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($20) ++ MUL $f29, $f6, $f24 ++ unop ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($20) ++ MUL $f30, $f7, $f25 ++ unop ++ ++ ADD $f18, $f10, $f18 ++ LD $f10, 2*SIZE($20) ++ MUL $f30, $f6, $f26 ++ LD $f6, 6*SIZE($18) ++ ++ ADD $f19, $f11, $f19 ++ LD $f11, 3*SIZE($20) ++ MUL $f29, $f7, $f27 ++ LD $f7, 7*SIZE($18) ++ ++ ST $f16,-8*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17,-7*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18,-6*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19,-5*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ ADD $f16, $f12, $f16 ++ LD $f12, 4*SIZE($20) ++ ADD $f17, $f13, $f17 ++ LD $f13, 5*SIZE($20) ++ ADD $f18, $f14, $f18 ++ LD $f14, 6*SIZE($20) ++ ADD $f19, $f15, $f19 ++ LD $f15, 7*SIZE($20) ++ ++ ST $f16,-4*SIZE($20) ++ addl $18, 8*SIZE, $18 ++ ST $f17,-3*SIZE($20) ++ subl $4, 1, $4 ++ ++ ST $f18,-2*SIZE($20) ++ nop ++ ST $f19,-1*SIZE($20) ++ bgt $4, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 ++ ++ ADD $f16, $f8, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, $f17 ++ MUL $f30, $f7, $f25 ++ ++ ADD $f18, $f10, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, $f19 ++ MUL $f29, $f7, $f27 ++ ++ ST 
$f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18, 2*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19, 3*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ ADD $f16, $f12, $f16 ++ ADD $f17, $f13, $f17 ++ ADD $f18, $f14, $f18 ++ ADD $f19, $f15, $f19 ++ ++ ST $f16, 4*SIZE($20) ++ ST $f17, 5*SIZE($20) ++ ST $f18, 6*SIZE($20) ++ ST $f19, 7*SIZE($20) ++ ++ unop ++ addl $20, 8*SIZE, $20 ++ unop ++ ble $5, $End ++ .align 4 ++ ++$Remain: ++ subl $5, 1, $6 ++ ble $5, $End ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ble $6, $RemainLoopEnd ++ .align 4 ++ ++$RemainLoop: ++ MUL $f29, $f0, $f20 ++ subl $6, 1, $6 ++ MUL $f30, $f1, $f21 ++ addl $20, 2*SIZE, $20 ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($20) ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($20) ++ ++ ST $f16,-2*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ST $f17,-1*SIZE($20) ++ bgt $6, $RemainLoop ++ .align 4 ++ ++$RemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ ADD $f17, $f28, $f17 ++ ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ ldi $sp, 64($sp) ++ ret ++ .align 4 ++ ++$Sub: ++ SXSUBL $16, SIZE, $22 ++ addl $22, $22, $22 # Complex ++ .align 4 ++ ++ addl $19, $19, $19 # Complex ++ addl $21, $21, $21 # Complex ++ ++ ble $4, $SubRemain ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f2, 0*SIZE($18) ++ LD $f3, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f4, 0*SIZE($18) ++ LD $f5, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f6, 0*SIZE($18) ++ LD $f7, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $21, $20, $24 ++ ++ LD $f10, 0*SIZE($24) ++ LD $f11, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ LD $f12, 0*SIZE($24) ++ LD $f13, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ LD $f14, 0*SIZE($24) ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ subl $4, 1, $4 ++ ble $4, $SubMainLoopEnd ++ .align 4 ++ ++$SubMainLoop: ++ MUL $f29, $f0, $f20 ++ unop ++ MUL $f30, $f1, $f21 ++ unop ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ MUL $f29, $f2, $f24 ++ SXADDQ $19, $18, $18 ++ MUL $f30, $f3, $f25 ++ unop ++ ++ MUL $f30, $f2, $f26 ++ LD $f2, 0*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 1*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 ++ MUL $f29, $f4, $f20 ++ unop ++ ++ ADD2 $f22, $f23, $f17 ++ unop ++ MUL $f30, $f5, $f21 ++ unop ++ ++ ADD1 $f24, $f25, $f18 ++ unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 0*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 ++ unop ++ MUL $f29, $f5, $f23 ++ LD $f5, 1*SIZE($18) ++ ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($24) ++ MUL $f29, $f6, $f24 ++ SXADDQ $19, $18, $18 ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($24) ++ MUL $f30, $f7, $f25 ++ SXADDQ $21, $24, $24 ++ ++ ADD $f18, $f10, $f18 ++ LD $f10, 0*SIZE($24) ++ MUL $f30, $f6, $f26 ++ LD $f6, 0*SIZE($18) ++ ++ ADD $f19, $f11, $f19 ++ LD $f11, 1*SIZE($24) ++ MUL $f29, $f7, $f27 ++ LD $f7, 1*SIZE($18) ++ ++ ST $f16, 0*SIZE($20) ++ SXADDQ $19, $18, $18 ++ ADD1 $f20, 
$f21, $f16 ++ unop ++ ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ADD2 $f22, $f23, $f17 ++ unop ++ ++ ST $f18, 0*SIZE($20) ++ SXADDQ $21, $24, $24 ++ ADD1 $f24, $f25, $f18 ++ unop ++ ++ ST $f19, 1*SIZE($20) ++ unop ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 ++ ++ ADD $f16, $f12, $f16 ++ unop ++ LD $f12, 0*SIZE($24) ++ unop ++ ++ ADD $f17, $f13, $f17 ++ unop ++ LD $f13, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ ADD $f18, $f14, $f18 ++ subl $4, 1, $4 ++ LD $f14, 0*SIZE($24) ++ unop ++ ++ ADD $f19, $f15, $f19 ++ unop ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ unop ++ ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $4, $SubMainLoop ++ .align 4 ++ ++$SubMainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 ++ ++ ADD $f16, $f8, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, $f17 ++ MUL $f30, $f7, $f25 ++ ++ ADD $f18, $f10, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, $f19 ++ MUL $f29, $f7, $f27 ++ ++ ST $f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ SXADDQ $21, $20, $20 ++ nop ++ ST $f18, 0*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ++ ST $f19, 1*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 ++ ADD $f16, $f12, $f16 ++ ++ ADD $f17, $f13, $f17 ++ ADD $f18, $f14, $f18 ++ ADD $f19, $f15, $f19 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ble $5, $SubEnd ++ .align 4 ++ ++$SubRemain: ++ subl $5, 1, $6 ++ ble $5, $SubEnd ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $19, $18, $18 ++ SXADDQ $21, $20, $24 ++ ble $6, $SubRemainLoopEnd ++ .align 4 ++ ++$SubRemainLoop: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 ++ ++ ADD2 $f22, $f23, $f17 ++ nop ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($24) ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ subl $6, 1, $6 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $6, $SubRemainLoop ++ .align 4 ++ ++$SubRemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ ADD $f17, $f28, $f17 ++ ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop ++ .align 4 ++ ++$SubEnd: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ ldi $sp, 64($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zdot.S b/kernel/sw_64/zdot.S +new file mode 100644 +index 0000000..f037aef +--- /dev/null ++++ b/kernel/sw_64/zdot.S +@@ -0,0 +1,500 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. 
*/ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define XX $21 ++#define YY $23 ++ ++#define I $5 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f2 ++#define s3 $f30 ++ ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 ++ ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 ++ ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldi $sp, -16($sp) ++ fclr s0 ++ fstd $f2, 0($sp) ++ fclr s1 ++ ++ fclr s2 ++ addl INCX, INCX, INCX ++ fclr s3 ++ ble N, $L999 ++ ++ addl INCY, INCY, INCY ++ fclr t0 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ ++ srl N, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ LD b2, 0 * SIZE(Y) ++ LD b3, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ LD b4, 0 * SIZE(Y) ++ LD b5, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a6, 0 * SIZE(X) ++ LD b6, 0 * SIZE(Y) ++ ++ subl I, 1, I ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ s_fillcs PREFETCHSIZE * SIZE(X) ++ MUL a0, b1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ s_fillcs PREFETCHSIZE * SIZE(Y) ++ MUL a1, b0, t2 ++ SXADDQ INCY, Y, Y ++ ++ ADD 
s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a1, b0, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ subl I, 1, I ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD b6, 0 * SIZE(Y) ++ MUL a7, b7, t3 ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD s0, t0, s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a1, b0, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X 
++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s3 ++ MUL a1, b1, t3 ++ ++ ADD s0, t0, s0 ++ MUL a2, b2, t0 ++ ADD s1, t1, s1 ++ MUL a2, b3, t1 ++ ++ ADD s2, t2, s2 ++ MUL a3, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a4, b5, t1 ++ ++ ADD s2, t2, s2 ++ MUL a5, b4, t2 ++ ADD s3, t3, s3 ++ MUL a5, b5, t3 ++ ++ ADD s0, t0, s0 ++ MUL a6, b6, t0 ++ ADD s1, t1, s1 ++ MUL a6, b7, t1 ++ ++ ADD s2, t2, s2 ++ MUL a7, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L998 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ subl I, 1, I ++ SXADDQ INCY, Y, Y ++ ble I, $L28 ++ .align 4 ++ ++$L26: ++ ADD s0, t0, s0 ++ mov X, XX ++ MUL a0, b0, t0 ++ mov Y, YY ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ LD a0, 0 * SIZE(XX) ++ MUL a1, b0, t2 ++ LD b0, 0 * SIZE(YY) ++ ++ ADD s3, t3, s3 ++ subl I, 1, I ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(XX) ++ ++ LD b1, 1 * SIZE(YY) ++ bgt I, $L26 ++ .align 4 ++ ++$L28: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a0, b1, t1 ++ ++ ADD s2, t2, s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s3 ++ MUL a1, b1, t3 ++ .align 4 ++ ++$L998: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++#ifndef CONJ ++ SUB s0, s3, s0 ++ ADD s1, s2, s1 ++#else ++ ADD s0, s3, s0 ++ SUB s1, s2, s1 ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ ldi $sp, 16($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/zgemm_beta.S b/kernel/sw_64/zgemm_beta.S +new file mode 100644 +index 0000000..ffaa17b +--- /dev/null ++++ b/kernel/sw_64/zgemm_beta.S +@@ -0,0 +1,192 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. 
*/ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++CNAME: ++ .frame $sp, 0, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++ .prologue 1 ++#else ++ .prologue 0 ++#endif ++ ++ ldl $18, 24($sp) ++ ble $16, $End ++ ldl $19, 32($sp) ++ ble $17, $End ++ ++ addl $19, $19, $19 ++ fbne $f19,$Main ++ fbne $f20,$Main ++ .align 4 ++ ++$L13: ++ mov $18, $1 ++ ldi $17, -1($17) ++ SXADDQ $19, $18, $18 ++ mov $16, $2 ++ .align 4 ++ ++$L12: ++ ST $f31, 0*SIZE($1) ++ ST $f31, 1*SIZE($1) ++ ldi $2, -1($2) ++ ldi $1, 2*SIZE($1) ++ bgt $2, $L12 ++ bgt $17,$L13 ++ clr $0 ++ ret ++ .align 4 ++ ++/* Main Routine */ ++$Main: ++ sra $16, 1, $2 # $2 = (m >> 1) ++ mov $18, $1 # c_offset = c ++ ldi $17, -1($17) # n -- ++ SXADDQ $19, $18, $18 # c += ldc ++ beq $2, $L18 ++ ++ LD $f14, 0*SIZE($1) ++ LD $f15, 1*SIZE($1) ++ LD $f24, 2*SIZE($1) ++ LD $f25, 3*SIZE($1) ++ ldi $2, -1($2) # $2 -- ++ ble $2, $L19 ++ .align 4 ++ ++ ++$L23: ++ MUL $f19, $f14, $f10 ++ fillde 9*SIZE($1) ++ MUL $f20, $f15, $f11 ++ ldi $2, -1($2) ++ ++ MUL $f19, $f15, $f12 ++ LD $f15, 5*SIZE($1) ++ MUL $f20, $f14, $f13 ++ LD $f14, 4*SIZE($1) ++ ++ MUL $f19, $f24, $f16 ++ unop ++ MUL $f20, $f25, $f17 ++ unop ++ ++ MUL $f19, $f25, $f18 ++ LD $f25, 7*SIZE($1) ++ SUB $f10, $f11, $f22 ++ unop ++ ++ MUL $f20, $f24, $f21 ++ LD $f24, 6*SIZE($1) ++ ADD $f12, $f13, $f23 ++ ldi $1, 4*SIZE($1) ++ ++ SUB $f16, $f17, $f26 ++ ADD $f18, $f21, $f27 ++ ST $f22,-4*SIZE($1) ++ ST $f23,-3*SIZE($1) ++ ++ ST $f26,-2*SIZE($1) ++ ST $f27,-1*SIZE($1) ++ unop ++ bgt $2,$L23 ++ .align 4 ++ ++$L19: ++ MUL $f19, $f14, $f10 ++ MUL $f20, $f15, $f11 ++ MUL $f19, $f15, $f12 ++ MUL $f20, $f14, $f13 ++ ++ MUL $f19, $f24, $f16 ++ MUL $f20, $f25, $f17 ++ MUL $f19, $f25, $f18 ++ MUL $f20, $f24, $f21 ++ ++ SUB $f10, $f11, $f22 ++ ADD $f12, $f13, $f23 ++ SUB $f16, $f17, $f26 ++ ADD $f18, $f21, $f27 ++ ldi $1, 4*SIZE($1) ++ ++ ST $f22, -4*SIZE($1) ++ ST $f23, -3*SIZE($1) ++ ST $f26, -2*SIZE($1) ++ ST $f27, -1*SIZE($1) ++ ++ blbs $16, $L18 ++ bgt $17, $Main ++ clr $0 ++ ret ++ .align 4 ++ ++$L18: ++ LD $f14, 0*SIZE($1) ++ LD $f15, 1*SIZE($1) ++ MUL $f19, $f15, $f13 ++ MUL $f20, $f14, $f10 ++ ++ MUL $f19, $f14, $f12 ++ MUL $f20, $f15, $f11 ++ ADD $f13, $f10, $f26 ++ SUB $f12, $f11, $f27 ++ ++ ST $f26, 1*SIZE($1) ++ ST $f27, 0*SIZE($1) ++ ldi $1, 2*SIZE($1) ++ bgt $17, $Main ++ .align 4 ++ ++$End: ++ clr $0 ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/zgemm_kernel_2x2.S b/kernel/sw_64/zgemm_kernel_2x2.S +new file mode 100644 +index 0000000..1bd180f +--- /dev/null ++++ b/kernel/sw_64/zgemm_kernel_2x2.S +@@ -0,0 +1,1705 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. 
Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#if !defined(SW8A) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ ++ .set noat ++ .set noreorder ++ .arch sw8a ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define BB $3 ++#define OFFSET $4 ++ ++#define ALPHA_R 64($sp) ++#define ALPHA_I 72($sp) ++ ++#if defined(NN) || defined(NT) || defined(TN) || defined(TT) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp 
$gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++#ifdef TRMMKERNEL ++ ldl OFFSET, 24 + STACKSIZE($sp) ++#endif ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ fstd $f19, ALPHA_R ++ fstd $f20, ALPHA_I ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK ++#endif ++ ++ sra N, 1, J ++ ble J, $L30 ++ .align 4 ++ ++$L01: ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ s4addl K, 0, BB ++ ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ SXADDQ BB, B, BB ++ addl C2, LDC, C ++ unop ++ ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#ifndef EV4 ++ s_fillcs 0 * SIZE(BB) ++ s_fillcs 8 * SIZE(BB) ++ unop ++ ldi BB, 16 * SIZE(BB) ++#endif ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillde 4 * SIZE(C1) ++ fclr c04 ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c08 ++ ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble L, $L15 ++#else ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillde 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, 
t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, c11 ++ fldd alpha_r, ALPHA_R ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L18 ++#else ++ blbs TMP1, $L18 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L18: ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++#ifndef TRMMKERNEL ++ LD b1, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++#ifndef TRMMKERNEL ++ LD a1, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++#ifndef TRMMKERNEL ++ LD a2, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++#ifndef TRMMKERNEL ++ LD b2, 0 * SIZE(C2) ++#else ++ unop 
++#endif ++ ++ ADD1 c09, t1, c09 ++ ldi I, -1(I) ++ MUL b3, a3, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++#ifndef TRMMKERNEL ++ LD b3, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++#ifndef TRMMKERNEL ++ LD a4, 2 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++#ifndef TRMMKERNEL ++ LD a3, 3 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++ ADD c09, c14, c09 ++ MUL alpha_r, c01, t1 ++ ADD c10, c13, c10 ++ MUL alpha_r, c02, t2 ++ ++ ADD c11, c16, c11 ++ MUL alpha_r, c03, t3 ++ ADD c12, c15, c12 ++ MUL alpha_r, c04, t4 ++ ++#ifndef TRMMKERNEL ++ ADD a5, t1, a5 ++ MUL alpha_i, c02, t1 ++ ADD b1, t2, b1 ++ MUL alpha_i, c01, t2 ++ ++ ADD a1, t3, a1 ++ MUL alpha_i, c04, t3 ++ ADD a2, t4, a2 ++ MUL alpha_i, c03, t4 ++#else ++ ADD $f31, t1, a5 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, b1 ++ MUL alpha_i, c01, t2 ++ ++ ADD $f31, t3, a1 ++ MUL alpha_i, c04, t3 ++ ADD $f31, t4, a2 ++ MUL alpha_i, c03, t4 ++#endif ++ ++ SUB a5, t1, a5 ++ MUL alpha_r, c09, t1 ++ ADD b1, t2, b1 ++ MUL alpha_r, c10, t2 ++ ++ SUB a1, t3, a1 ++ MUL alpha_r, c11, t3 ++ ADD a2, t4, a2 ++ MUL alpha_r, c12, t4 ++ ++#ifndef TRMMKERNEL ++ ADD b2, t1, b2 ++ MUL alpha_i, c10, t1 ++ ADD b3, t2, b3 ++ MUL alpha_i, c09, t2 ++ ++ ADD a4, t3, a4 ++ MUL alpha_i, c12, t3 ++ ADD a3, t4, a3 ++ MUL alpha_i, c11, t4 ++#else ++ ADD $f31, t1, b2 ++ MUL alpha_i, c10, t1 ++ ADD $f31, t2, b3 ++ MUL alpha_i, c09, t2 ++ ++ ADD $f31, t3, a4 ++ MUL alpha_i, c12, t3 ++ ADD $f31, t4, a3 ++ MUL alpha_i, c11, t4 ++#endif ++ ++ SUB b2, t1, b2 ++ ST a5, 0 * SIZE(C1) ++ fclr t1 ++ unop ++ ++ ADD b3, t2, b3 ++ ST b1, 1 * SIZE(C1) ++ fclr t2 ++ unop ++ ++ SUB a4, t3, a4 ++ ST a1, 2 * SIZE(C1) ++ fclr t3 ++ unop ++ ++ ADD a3, t4, a3 ++ ST a2, 3 * SIZE(C1) ++ fclr t4 ++ unop ++ ++ ST b2, 0 * SIZE(C2) ++ fclr c01 ++ ST b3, 1 * SIZE(C2) ++ fclr c05 ++ ++ ST a4, 2 * SIZE(C2) ++ ldi C1, 4 * SIZE(C1) ++ ST a3, 3 * SIZE(C2) ++ ldi C2, 4 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 1, I ++ ble I, $L29 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ble L, $L25 ++#else ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 
* SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, c09 ++ fldd alpha_r, ALPHA_R ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L28 ++#else ++ blbs TMP1, $L28 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L28: ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c03, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c04, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++#ifndef TRMMKERNEL ++ LD c11, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++#ifndef TRMMKERNEL ++ LD c12, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_r, c09, t3 ++ MUL alpha_r, c10, t4 ++ ++#ifndef TRMMKERNEL ++ ADD c03, t1, c03 ++ MUL alpha_i, c02, t1 ++ ADD c04, t2, c04 ++ MUL alpha_i, c01, t2 ++ ++ ADD c11, t3, c11 ++ MUL alpha_i, c10, t3 ++ ADD c12, t4, c12 ++ MUL alpha_i, c09, t4 ++#else ++ ADD $f31, t1, c03 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, c04 ++ MUL alpha_i, c01, t2 ++ ++ ADD $f31, t3, c11 ++ MUL alpha_i, c10, t3 ++ ADD $f31, t4, c12 ++ MUL alpha_i, c09, t4 ++#endif ++ ++ SUB c03, t1, c03 ++ ADD c04, t2, c04 ++ SUB c11, 
t3, c11 ++ ADD c12, t4, c12 ++ ++ ST c03, 0 * SIZE(C1) ++ ST c04, 1 * SIZE(C1) ++ ST c11, 0 * SIZE(C2) ++ ST c12, 1 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif ++ .align 4 ++ ++$L29: ++ mov BO, B ++ ldi J, -1(J) ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 2, KK ++#else ++ unop ++#endif ++ bgt J, $L01 ++ .align 4 ++ ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++ mov C, C1 ++ mov A, AO ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 1, I ++ ble I, $L50 ++ .align 4 ++ ++$L41: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c04 ++ fclr c08 ++ ble L, $L45 ++#else ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, c05 ++ fldd alpha_r, ALPHA_R ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L48 ++#else ++ blbs TMP1, $L48 ++#endif ++ 
.align 4 ++ ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L48: ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c07, t3, c07 ++ ldi I, -1(I) ++ MUL a3, b1, t3 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++#ifndef TRMMKERNEL ++ LD c11, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++#ifndef TRMMKERNEL ++ LD c12, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_r, c03, t3 ++ MUL alpha_r, c04, t4 ++ ++#ifndef TRMMKERNEL ++ ADD c09, t1, c09 ++ MUL alpha_i, c02, t1 ++ ADD c10, t2, c10 ++ MUL alpha_i, c01, t2 ++ ++ ADD c11, t3, c11 ++ MUL alpha_i, c04, t3 ++ ADD c12, t4, c12 ++ MUL alpha_i, c03, t4 ++#else ++ ADD $f31, t1, c09 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, c10 ++ MUL alpha_i, c01, t2 ++ ++ ADD $f31, t3, c11 ++ MUL alpha_i, c04, t3 ++ ADD $f31, t4, c12 ++ MUL alpha_i, c03, t4 ++#endif ++ ++ SUB c09, t1, c09 ++ ADD c10, t2, c10 ++ SUB c11, t3, c11 ++ ADD c12, t4, c12 ++ ++ ST c09, 0 * SIZE(C1) ++ ST c10, 1 * SIZE(C1) ++ ST c11, 2 * SIZE(C1) ++ ST c12, 3 * SIZE(C1) ++ ++ ldi C1, 4 * SIZE(C1) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ ++ bgt I, $L41 ++ .align 4 ++ ++$L50: ++ and M, 1, I ++ ble I, $L999 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ble L, $L55 ++#else ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * 
SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, c01 ++ fldd alpha_r, ALPHA_R ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L58 ++#else ++ blbs TMP1, $L58 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L58: ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c03, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c04, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_i, c02, t3 ++ MUL alpha_i, c01, t4 ++ ++#ifndef TRMMKERNEL ++ ADD c03, t1, c03 ++ ADD c04, t2, c04 ++#else ++ ADD $f31, t1, c03 ++ ADD $f31, t2, c04 ++#endif ++ ++ SUB c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ST c03, 0 * SIZE(C1) ++ ST c04, 1 * SIZE(C1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/zgemv_n.S b/kernel/sw_64/zgemv_n.S +new file mode 100644 +index 0000000..f28ad30 +--- /dev/null ++++ b/kernel/sw_64/zgemv_n.S +@@ -0,0 +1,1027 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $21 ++#define LDA $18 ++ ++#define X $19 ++#define INCX $20 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define Y1 $4 ++#define A1 $5 ++#define A2 $6 ++ ++#define alpha_r $f19 ++#define alpha_i $f20 ++ ++#define alpha1 $f0 ++#define alpha2 $f1 ++#define alpha3 $f10 ++#define alpha4 $f11 ++ ++#define y0 $f12 ++#define y1 $f13 ++#define y2 $f14 ++#define y3 $f15 ++ ++#define y4 $f16 ++#define y5 $f17 ++#define y6 $f18 ++#define y7 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define t0 $f2 ++#define t1 $f3 ++#define t2 $f4 ++#define t3 $f5 ++ ++#if !defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#elif !defined(CONJ) && defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl LDA, 0 + STACKSIZE($sp) ++ ldl X, 8 + STACKSIZE($sp) ++ ldl INCX, 16 + STACKSIZE($sp) ++ ldl Y, 24 + STACKSIZE($sp) ++ ldl INCY, 32 + STACKSIZE($sp) ++ ldl BUFFER, 40 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ sll INCX, ZBASE_SHIFT, INCX ++ cmple N, 0, $1 ++ sll INCY, ZBASE_SHIFT, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ cmpeq INCY, 2 * SIZE, $0 ++ sll LDA, ZBASE_SHIFT,LDA ++ bne $0, $L10 ++ ++ mov BUFFER, Y1 ++ ++ mov Y, BUFFER ++ mov Y1, Y ++ ++ sra M, 2, I ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ ST $f31, 2 * SIZE(Y1) ++ ST $f31, 3 * SIZE(Y1) ++ ST $f31, 4 * SIZE(Y1) ++ ST $f31, 5 * SIZE(Y1) ++ ST $f31, 6 * SIZE(Y1) ++ ST $f31, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ ldi I, -1(I) ++ bgt I, $L02 ++ .align 4 ++ 
++$L05: ++ and M, 3, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ addl Y1, 2 * SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ sra N, 1, J ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ LD alpha1, 0 * SIZE(X) ++ LD alpha2, 1 * SIZE(X) ++ addl X, INCX, X ++ LD alpha3, 0 * SIZE(X) ++ LD alpha4, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ MUL alpha_r, alpha1, y0 ++ MUL alpha_r, alpha2, y1 ++ MUL alpha_r, alpha3, y2 ++ MUL alpha_r, alpha4, y3 ++ ++ MUL alpha_i, alpha2, t0 ++ mov A, A1 ++ MUL alpha_i, alpha1, t1 ++ addl A, LDA, A2 ++ MUL alpha_i, alpha4, t2 ++ addl A2, LDA, A ++ MUL alpha_i, alpha3, t3 ++ mov Y, Y1 ++ ++#ifndef XCONJ ++ SUB y0, t0, alpha1 ++ ADD y1, t1, alpha2 ++ SUB y2, t2, alpha3 ++ ADD y3, t3, alpha4 ++#else ++ ADD y0, t0, alpha1 ++ SUB y1, t1, alpha2 ++ ADD y2, t2, alpha3 ++ SUB y3, t3, alpha4 ++#endif ++ ++ s_fillcs 4 * SIZE(X) ++ ++ sra M, 2, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ unop ++ MUL alpha3, a4, t0 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD2 y1, t1, y1 ++ unop ++ MUL alpha3, a5, t1 ++ LD y5, 5 * SIZE(Y1) ++ ++ ADD1 y2, t2, y2 ++ unop ++ MUL alpha3, a6, t2 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD2 y3, t3, y3 ++ unop ++ MUL alpha3, a7, t3 ++ LD y7, 7 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 5 * SIZE(A1) ++ ++ ADD2 y1, t1, y1 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 4 * SIZE(A1) ++ ++ ADD1 y2, t2, y2 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 7 * SIZE(A1) ++ ++ ADD2 y3, t3, y3 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 6 * SIZE(A1) ++ ++ ADD3 y0, t0, y0 ++ unop ++ MUL alpha4, a5, t0 ++ LD a5, 5 * SIZE(A2) ++ ++ ADD4 y1, t1, y1 ++ unop ++ MUL alpha4, a4, t1 ++ LD a4, 4 * SIZE(A2) ++ ++ ADD3 y2, t2, y2 ++ unop ++ MUL alpha4, a7, t2 ++ LD a7, 7 * SIZE(A2) ++ ++ ADD4 y3, t3, y3 ++ unop ++ MUL alpha4, a6, t3 ++ LD a6, 6 * SIZE(A2) ++ ++ ADD3 y0, t0, y0 ++ MUL alpha1, a0, t0 ++ ADD4 y1, t1, y1 ++ MUL alpha1, a1, t1 ++ ++ ADD3 y2, t2, y2 ++ unop ++ MUL alpha1, a2, t2 ++ unop ++ ++ ADD4 y3, t3, y3 ++ ldi I, -1(I) ++ MUL alpha1, a3, t3 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ ldi I, -1(I) ++ ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop ++ ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ unop ++ ++ ADD1 y4, t0, y4 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 y5, t1, y5 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD1 y6, t2, y6 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD2 y7, t3, y7 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD3 y4, t0, y4 ++ fillde (PREFETCHSIZE + 0) * SIZE(Y1) ++ MUL alpha4, a5, t0 ++ LD a5, 9 * SIZE(A2) ++ ++ ADD4 y5, t1, y5 ++ unop ++ MUL alpha4, a4, t1 ++ LD a4, 8 * SIZE(A2) ++ ++ ADD3 y6, t2, y6 ++ unop ++ MUL alpha4, a7, t2 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD4 y7, t3, y7 ++ unop ++ MUL alpha4, a6, t3 ++ LD a6, 10 * SIZE(A2) ++ ++ ADD3 y4, t0, y4 ++ unop ++ MUL alpha1, a0, t0 ++ LD y0, 8 * SIZE(Y1) ++ ++ ADD4 y5, t1, y5 ++ unop ++ MUL 
alpha1, a1, t1 ++ LD y1, 9 * SIZE(Y1) ++ ++ ADD3 y6, t2, y6 ++ unop ++ MUL alpha1, a2, t2 ++ LD y2, 10 * SIZE(Y1) ++ ++ ADD4 y7, t3, y7 ++ unop ++ MUL alpha1, a3, t3 ++ LD y3, 11 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ ++ ADD2 y1, t1, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ unop ++ ++ ADD1 y2, t2, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop ++ ++ ADD2 y3, t3, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ ldi Y1, 8 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 13 * SIZE(A1) ++ ++ ADD2 y1, t1, y1 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 12 * SIZE(A1) ++ ++ ADD1 y2, t2, y2 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 15 * SIZE(A1) ++ ++ ADD2 y3, t3, y3 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 14 * SIZE(A1) ++ ++ ADD3 y0, t0, y0 ++ unop ++ MUL alpha4, a5, t0 ++ LD a5, 13 * SIZE(A2) ++ ++ ADD4 y1, t1, y1 ++ unop ++ MUL alpha4, a4, t1 ++ LD a4, 12 * SIZE(A2) ++ ++ ADD3 y2, t2, y2 ++ unop ++ MUL alpha4, a7, t2 ++ LD a7, 15 * SIZE(A2) ++ ++ ADD4 y3, t3, y3 ++ unop ++ MUL alpha4, a6, t3 ++ LD a6, 14 * SIZE(A2) ++ ++ ADD3 y0, t0, y0 ++ unop ++ MUL alpha1, a0, t0 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD4 y1, t1, y1 ++ ldi A2, 8 * SIZE(A2) ++ MUL alpha1, a1, t1 ++ LD y5, 5 * SIZE(Y1) ++ ++ ADD3 y2, t2, y2 ++ ldi A1, 8 * SIZE(A1) ++ MUL alpha1, a2, t2 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD4 y3, t3, y3 ++ MUL alpha1, a3, t3 ++ LD y7, 7 * SIZE(Y1) ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ unop ++ ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ unop ++ ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop ++ ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ unop ++ ++ ADD1 y4, t0, y4 ++ MUL alpha2, a1, t0 ++ ADD2 y5, t1, y5 ++ MUL alpha2, a0, t1 ++ ++ ADD1 y6, t2, y6 ++ MUL alpha2, a3, t2 ++ ADD2 y7, t3, y7 ++ MUL alpha2, a2, t3 ++ ++ ADD3 y4, t0, y4 ++ MUL alpha4, a5, t0 ++ ADD4 y5, t1, y5 ++ MUL alpha4, a4, t1 ++ ++ ADD3 y6, t2, y6 ++ MUL alpha4, a7, t2 ++ ADD4 y7, t3, y7 ++ MUL alpha4, a6, t3 ++ ++ ADD3 y4, t0, y4 ++ ADD4 y5, t1, y5 ++ ADD3 y6, t2, y6 ++ ADD4 y7, t3, y7 ++ ++ ST y4, 4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, 5 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) ++ ++ ST y6, 6 * SIZE(Y1) ++ unop ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L15: ++ and M, 2, I ++ ble I, $L17 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ MUL alpha3, a4, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha3, a5, t1 ++ ADD1 y2, t2, y2 ++ MUL alpha3, a6, t2 ++ ADD2 y3, t3, y3 ++ MUL alpha3, a7, t3 ++ ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 ++ ++ ADD1 y2, t2, y2 ++ MUL alpha2, a3, t2 ++ ADD2 y3, t3, y3 ++ MUL alpha2, a2, t3 ++ ++ ADD3 y0, t0, y0 ++ MUL alpha4, a5, t0 ++ ADD4 y1, t1, y1 ++ MUL alpha4, a4, t1 ++ ++ ADD3 y2, t2, y2 ++ MUL alpha4, a7, t2 ++ ADD4 y3, t3, y3 ++ MUL alpha4, a6, t3 ++ ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ADD3 y2, t2, y2 ++ ADD4 y3, t3, y3 ++ ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ++ ST y2, 2 * SIZE(Y1) ++ unop ++ ST 
y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 ++ ++$L17: ++ blbc M, $L18 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ ++ MUL alpha1, a0, t0 ++ MUL alpha1, a1, t1 ++ ++ ADD1 y0, t0, y0 ++ MUL alpha3, a2, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha3, a3, t1 ++ ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 ++ ++ ADD3 y0, t0, y0 ++ MUL alpha4, a3, t0 ++ ADD4 y1, t1, y1 ++ MUL alpha4, a2, t1 ++ ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ .align 4 ++ ++$L18: ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ blbc N, $L990 ++ ++ LD alpha1, 0 * SIZE(X) ++ LD alpha2, 1 * SIZE(X) ++ ++ MUL alpha_r, alpha1, y0 ++ MUL alpha_r, alpha2, y1 ++ ++ MUL alpha_i, alpha2, t0 ++ mov A, A1 ++ MUL alpha_i, alpha1, t1 ++ mov Y, Y1 ++ ++#ifndef XCONJ ++ SUB y0, t0, alpha1 ++ ADD y1, t1, alpha2 ++#else ++ ADD y0, t0, alpha1 ++ SUB y1, t1, alpha2 ++#endif ++ ++ sra M, 2, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ MUL alpha1, a0, t0 ++ LD a4, 4 * SIZE(A1) ++ MUL alpha1, a1, t1 ++ LD a5, 5 * SIZE(A1) ++ MUL alpha1, a2, t2 ++ LD a6, 6 * SIZE(A1) ++ MUL alpha1, a3, t3 ++ LD a7, 7 * SIZE(A1) ++ ++ ADD1 y0, t0, y0 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 y1, t1, y1 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD1 y2, t2, y2 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD2 y3, t3, y3 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD3 y0, t0, y0 ++ unop ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, t0 ++ ++ ADD4 y1, t1, y1 ++ unop ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, t1 ++ ++ ADD3 y2, t2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, t2 ++ ldi I, -1(I) ++ ++ ADD4 y3, t3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, t3 ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a5, t0 ++ LD a5, 13 * SIZE(A1) ++ ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a4, t1 ++ LD a4, 12 * SIZE(A1) ++ ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a7, t2 ++ LD a7, 15 * SIZE(A1) ++ ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a6, t3 ++ LD a6, 14 * SIZE(A1) ++ ++ ADD3 y4, t0, y4 ++ LD y0, 8 * SIZE(Y1) ++ MUL alpha1, a0, t0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ++ ADD4 y5, t1, y5 ++ LD y1, 9 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ ldi I, -1(I) ++ ++ ADD3 y6, t2, y6 ++ LD y2, 10 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ unop ++ ++ ADD4 y7, t3, y7 ++ LD y3, 11 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ unop ++ ++ ADD1 y0, t0, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha2, a1, t0 ++ LD a1, 17 * SIZE(A1) ++ ++ ADD2 y1, t1, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha2, a0, t1 ++ LD a0, 16 * SIZE(A1) ++ ++ ADD1 y2, t2, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha2, a3, t2 ++ LD a3, 19 * SIZE(A1) ++ ++ ADD2 y3, t3, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha2, a2, t3 ++ LD a2, 18 * SIZE(A1) ++ ++ ADD3 y0, t0, y0 ++ LD y4, 12 * SIZE(Y1) ++ MUL alpha1, a4, t0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++ ++ ADD4 y1, t1, y1 ++ LD y5, 13 * SIZE(Y1) ++ MUL alpha1, a5, t1 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD3 y2, t2, y2 ++ LD y6, 14 * SIZE(Y1) ++ MUL alpha1, a6, t2 ++ ldi Y1, 8 * SIZE(Y1) ++ ++ ADD4 y3, t3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, t3 ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD1 y4, 
t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a5, t0 ++ unop ++ ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a4, t1 ++ unop ++ ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a7, t2 ++ unop ++ ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a6, t3 ++ unop ++ ++ ADD3 y4, t0, y4 ++ ADD4 y5, t1, y5 ++ ADD3 y6, t2, y6 ++ ADD4 y7, t3, y7 ++ ++ ST y4, 4 * SIZE(Y1) ++ unop ++ ST y5, 5 * SIZE(Y1) ++ unop ++ ++ ST y6, 6 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L25: ++ and M, 2, I ++ ble I, $L27 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 ++ ADD1 y2, t2, y2 ++ MUL alpha2, a3, t2 ++ ADD2 y3, t3, y3 ++ MUL alpha2, a2, t3 ++ ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ADD3 y2, t2, y2 ++ ADD4 y3, t3, y3 ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ ++ ST y2, 2 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 ++ ++$L27: ++ blbc M, $L990 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 ++ ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ .align 4 ++ ++$L990: ++ cmpeq INCY, 2 * SIZE, $0 ++ bne $0, $L999 ++ ++ mov BUFFER, Y1 ++ ++ sra M, 2, I ++ ble I, $L995 ++ .align 4 ++ ++$L992: ++ LD a0, 0 * SIZE(BUFFER) ++ LD a1, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a2, 0 * SIZE(BUFFER) ++ LD a3, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ LD y2, 2 * SIZE(Y) ++ LD y3, 3 * SIZE(Y) ++ ++ LD a4, 0 * SIZE(BUFFER) ++ LD a5, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a6, 0 * SIZE(BUFFER) ++ LD a7, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y4, 4 * SIZE(Y) ++ LD y5, 5 * SIZE(Y) ++ LD y6, 6 * SIZE(Y) ++ LD y7, 7 * SIZE(Y) ++ ++ ADD a0, y0, a0 ++ ADD a1, y1, a1 ++ ADD a2, y2, a2 ++ ADD a3, y3, a3 ++ ++ ST a0, 0 * SIZE(Y1) ++ ADD a4, y4, a4 ++ ST a1, 1 * SIZE(Y1) ++ ADD a5, y5, a5 ++ addl Y1, INCY, Y1 ++ ++ ST a2, 0 * SIZE(Y1) ++ ADD a6, y6, a6 ++ ST a3, 1 * SIZE(Y1) ++ ADD a7, y7, a7 ++ addl Y1, INCY, Y1 ++ ++ ST a4, 0 * SIZE(Y1) ++ ST a5, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a6, 0 * SIZE(Y1) ++ ST a7, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ ldi Y, 8 * SIZE(Y) ++ bgt I, $L992 ++ .align 4 ++ ++$L995: ++ and M, 3, I ++ ble I, $L999 ++ .align 4 ++ ++$L996: ++ LD a0, 0 * SIZE(BUFFER) ++ LD a1, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ ldi Y, 2 * SIZE(Y) ++ ++ ADD a0, y0, a0 ++ ADD a1, y1, a1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L996 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zgemv_t.S b/kernel/sw_64/zgemv_t.S +new file mode 100644 +index 0000000..4ee035c +--- /dev/null ++++ b/kernel/sw_64/zgemv_t.S +@@ -0,0 +1,922 @@ 
++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $21 ++#define LDA $18 ++ ++#define X $19 ++#define INCX $20 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define X1 $3 ++#define Y1 $4 ++#define A1 $5 ++#define A2 $6 ++ ++#define alpha_r $f19 ++#define alpha_i $f20 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++#if !defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif !defined(CONJ) && defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#elif defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl LDA, 0 + STACKSIZE($sp) ++ ldl X, 8 + STACKSIZE($sp) ++ ldl INCX, 16 + STACKSIZE($sp) ++ ldl Y, 24 + STACKSIZE($sp) ++ ldl INCY, 32 + STACKSIZE($sp) ++ ldl BUFFER, 40 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ sll INCX, ZBASE_SHIFT, INCX ++ cmple N, 0, $1 ++ sll INCY, ZBASE_SHIFT, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ cmpeq INCX, 2 * SIZE, $0 ++ mov X, X1 ++ sll LDA, ZBASE_SHIFT,LDA ++ bne $0, $L10 ++ ++ sra M, 2, I ++ mov BUFFER, Y1 ++ mov BUFFER, X ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ ldi I, -1(I) ++ ++ LD a0, 0 * SIZE(X1) ++ LD a1, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a2, 0 * SIZE(X1) ++ LD a3, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ST a2, 2 * SIZE(Y1) ++ ST a3, 3 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(X1) ++ LD a5, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a6, 0 * SIZE(X1) ++ LD a7, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a4, 4 * SIZE(Y1) ++ ST a5, 5 * SIZE(Y1) ++ ST a6, 6 * SIZE(Y1) ++ ST a7, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 3, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ LD a0, 0 * SIZE(X1) ++ LD a1, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ mov Y, Y1 ++ fclr t0 ++ unop ++ fclr t1 ++ ++ sra N, 1, J ++ fclr t2 ++ fclr t3 ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ mov A, A1 ++ fclr s0 ++ addl A, LDA, A2 ++ fclr s1 ++ ++ addl A2, LDA, A ++ unop ++ mov X, X1 ++ fillde 3 * SIZE(Y) ++ ++ sra M, 2, I ++ fclr s2 ++ fclr s3 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 3 * SIZE(A1) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 4 * SIZE(A1) ++ LD a9, 5 * 
SIZE(A1) ++ LD a10, 4 * SIZE(A2) ++ LD a11, 5 * SIZE(A2) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 7 * SIZE(A1) ++ LD a14, 6 * SIZE(A2) ++ LD a15, 7 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ unop ++ ++ ADD3 s2, t2, s2 ++ unop ++ MUL x0, a2, t2 ++ unop ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ unop ++ MUL x1, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ unop ++ MUL x1, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x1, a3, t2 ++ LD a3, 9 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ unop ++ MUL x1, a2, t3 ++ LD a2, 8 * SIZE(A2) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a4, t0 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ MUL x2, a5, t1 ++ ADD3 s2, t2, s2 ++ MUL x2, a6, t2 ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x2, a7, t3 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ unop ++ MUL x3, a5, t0 ++ LD a5, 11 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ unop ++ MUL x3, a4, t1 ++ LD a4, 10 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x3, a7, t2 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ unop ++ MUL x3, a6, t3 ++ LD a6, 10 * SIZE(A2) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a8, t0 ++ LD x3, 7 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x0, a9, t1 ++ unop ++ ++ ADD3 s2, t2, s2 ++ ldi I, -1(I) ++ MUL x0, a10, t2 ++ unop ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a11, t3 ++ LD x0, 8 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ unop ++ MUL x1, a9, t0 ++ LD a9, 13 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ unop ++ MUL x1, a8, t1 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a11, t2 ++ LD a11, 13 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ unop ++ MUL x1, a10, t3 ++ LD a10, 12 * SIZE(A2) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a12, t0 ++ LD x1, 9 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ MUL x2, a13, t1 ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD3 s2, t2, s2 ++ unop ++ MUL x2, a14, t2 ++ unop ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x2, a15, t3 ++ LD x2, 10 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ unop ++ MUL x3, a13, t0 ++ LD a13, 7 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a12, t1 ++ LD a12, 6 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x3, a15, t2 ++ LD a15, 7 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ MUL x3, a14, t3 ++ LD a14, 6 * SIZE(A2) ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ MUL x0, a1, t1 ++ ADD3 s2, t2, s2 ++ MUL x0, a2, t2 ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x1, a3, t2 ++ unop ++ ++ ADD2 s3, t3, s3 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a2, t3 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD3 s0, t0, s0 ++ MUL x2, a4, t0 ++ ADD4 s1, t1, s1 ++ MUL x2, a5, t1 ++ ++ ADD3 s2, t2, s2 ++ unop ++ MUL x2, a6, t2 ++ unop ++ ++ ADD4 s3, t3, s3 ++ ldi A2, 8 * SIZE(A2) ++ MUL x2, a7, t3 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ MUL x3, a5, t0 ++ ADD2 s1, t1, s1 ++ MUL x3, a4, t1 ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x3, a7, t2 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD2 s3, t3, s3 ++ unop ++ MUL x3, a6, t3 ++ LD x3, -1 * SIZE(X1) ++ ++ ADD3 s0, t0, s0 ++ MUL x0, a8, t0 ++ ADD4 s1, t1, s1 ++ MUL 
x0, a9, t1 ++ ++ ADD3 s2, t2, s2 ++ MUL x0, a10, t2 ++ ADD4 s3, t3, s3 ++ MUL x0, a11, t3 ++ ++ ADD1 s0, t0, s0 ++ MUL x1, a9, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a8, t1 ++ ++ ADD1 s2, t2, s2 ++ MUL x1, a11, t2 ++ ADD2 s3, t3, s3 ++ MUL x1, a10, t3 ++ ++ ADD3 s0, t0, s0 ++ MUL x2, a12, t0 ++ ADD4 s1, t1, s1 ++ MUL x2, a13, t1 ++ ++ ADD3 s2, t2, s2 ++ MUL x2, a14, t2 ++ ADD4 s3, t3, s3 ++ MUL x2, a15, t3 ++ ++ ADD1 s0, t0, s0 ++ MUL x3, a13, t0 ++ ADD2 s1, t1, s1 ++ MUL x3, a12, t1 ++ ++ ADD1 s2, t2, s2 ++ MUL x3, a15, t2 ++ ADD2 s3, t3, s3 ++ MUL x3, a14, t3 ++ .align 4 ++ ++$L15: ++ and M, 3, I ++ ble I, $L18 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L17 ++ .align 4 ++ ++$L16: ++ ADD3 s0, t0, s0 ++ ldi I, -1(I) ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ MUL x0, a1, t1 ++ ADD3 s2, t2, s2 ++ MUL x0, a2, t2 ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 2 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ ldi A2, 2 * SIZE(A2) ++ MUL x1, a1, t0 ++ LD a1, 3 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ ldi X1, 2 * SIZE(X1) ++ MUL x1, a0, t1 ++ LD a0, 2 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ ldi A1, 2 * SIZE(A1) ++ MUL x1, a3, t2 ++ LD a3, 1 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ MUL x1, a2, t3 ++ LD a2, 0 * SIZE(A2) ++ bgt I, $L16 ++ .align 4 ++ ++$L17: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ unop ++ ++ ADD3 s2, t2, s2 ++ MUL x0, a2, t2 ++ ADD4 s3, t3, s3 ++ MUL x0, a3, t3 ++ ++ ADD1 s0, t0, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ ++ ADD1 s2, t2, s2 ++ MUL x1, a3, t2 ++ ADD2 s3, t3, s3 ++ MUL x1, a2, t3 ++ .align 4 ++ ++$L18: ++ LD a0, 0 * SIZE(Y) ++ unop ++ LD a1, 1 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ LD a2, 0 * SIZE(Y) ++ unop ++ LD a3, 1 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ ADD3 s0, t0, s0 ++ ADD4 s1, t1, s1 ++ ADD3 s2, t2, s2 ++ ADD4 s3, t3, s3 ++ ++ MUL alpha_r, s0, t0 ++ MUL alpha_r, s1, t1 ++ MUL alpha_r, s2, t2 ++ MUL alpha_r, s3, t3 ++ ++ ADD a0, t0, a0 ++ MUL alpha_i, s1, t0 ++ ADD a1, t1, a1 ++ MUL alpha_i, s0, t1 ++ ADD a2, t2, a2 ++ MUL alpha_i, s3, t2 ++ ADD a3, t3, a3 ++ MUL alpha_i, s2, t3 ++ ++ SUB a0, t0, a0 ++ ADD a1, t1, a1 ++ SUB a2, t2, a2 ++ ADD a3, t3, a3 ++ ++ ST a0, 0 * SIZE(Y1) ++ fclr t0 ++ ST a1, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ST a2, 0 * SIZE(Y1) ++ fclr t1 ++ ST a3, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ fclr t2 ++ ldi J, -1(J) ++ fclr t3 ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ blbc N, $L999 ++ ++ mov A, A1 ++ fclr s0 ++ fclr s1 ++ mov X, X1 ++ ++ sra M, 2, I ++ fclr s2 ++ fclr s3 ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 3 * SIZE(A1) ++ LD a8, 4 * SIZE(A1) ++ LD a9, 5 * SIZE(A1) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 7 * SIZE(A1) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD3 s0, t0, s0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ ldi I, -1(I) ++ MUL x1, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 s3, t1, s3 ++ unop ++ MUL x1, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a4, t0 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x2, a5, t1 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ unop ++ MUL x3, a5, t0 ++ LD a5, 11 * SIZE(A1) ++ ++ ADD2 s3, 
t1, s3 ++ unop ++ MUL x3, a4, t1 ++ LD a4, 10 * SIZE(A1) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a8, t0 ++ LD x3, 7 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a9, t1 ++ LD x0, 8 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ unop ++ MUL x1, a9, t0 ++ LD a9, 13 * SIZE(A1) ++ ++ ADD2 s3, t1, s3 ++ unop ++ MUL x1, a8, t1 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a12, t0 ++ LD x1, 9 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x2, a13, t1 ++ LD x2, 10 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a13, t0 ++ LD a13, 7 * SIZE(A1) ++ ++ ADD2 s3, t1, s3 ++ MUL x3, a12, t1 ++ LD a12, 6 * SIZE(A1) ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ unop ++ MUL x1, a1, t0 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD2 s3, t1, s3 ++ unop ++ MUL x1, a0, t1 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a4, t0 ++ unop ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x2, a5, t1 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ unop ++ MUL x3, a5, t0 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD2 s3, t1, s3 ++ unop ++ MUL x3, a4, t1 ++ LD x3, -1 * SIZE(X1) ++ ++ ADD3 s0, t0, s0 ++ MUL x0, a8, t0 ++ ADD4 s1, t1, s1 ++ MUL x0, a9, t1 ++ ++ ADD1 s2, t0, s2 ++ MUL x1, a9, t0 ++ ADD2 s3, t1, s3 ++ MUL x1, a8, t1 ++ ++ ADD3 s0, t0, s0 ++ MUL x2, a12, t0 ++ ADD4 s1, t1, s1 ++ MUL x2, a13, t1 ++ ++ ADD1 s2, t0, s2 ++ MUL x3, a13, t0 ++ ADD2 s3, t1, s3 ++ MUL x3, a12, t1 ++ .align 4 ++ ++$L25: ++ and M, 3, I ++ ble I, $L28 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L27 ++ .align 4 ++ ++$L26: ++ ADD3 s0, t0, s0 ++ ldi A1, 2 * SIZE(A1) ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ ldi I, -1(I) ++ MUL x0, a1, t1 ++ LD x0, 2 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ ldi X1, 2 * SIZE(X1) ++ MUL x1, a1, t0 ++ LD a1, 1 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ LD a0, 0 * SIZE(A1) ++ bgt I, $L26 ++ .align 4 ++ ++$L27: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ unop ++ ++ ADD1 s0, t0, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ .align 4 ++ ++$L28: ++ LD a0, 0 * SIZE(Y) ++ LD a1, 1 * SIZE(Y) ++ ++ ADD3 s0, t0, s0 ++ ADD4 s1, t1, s1 ++ ADD3 s2, t2, s2 ++ ADD4 s3, t3, s3 ++ ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 ++ ++ MUL alpha_r, s0, t0 ++ MUL alpha_r, s1, t1 ++ ++ ADD a0, t0, a0 ++ MUL alpha_i, s1, t0 ++ ADD a1, t1, a1 ++ MUL alpha_i, s0, t1 ++ ++ SUB a0, t0, a0 ++ ADD a1, t1, a1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/znrm2.S b/kernel/sw_64/znrm2.S +new file mode 100644 +index 0000000..1892c5f +--- /dev/null ++++ b/kernel/sw_64/znrm2.S +@@ -0,0 +1,428 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. 
Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++ ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ sll INCX, ZBASE_SHIFT, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ beq INCX, $L999 ++ ++ fclr a2 ++ cmpeq INCX, 2 * SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, a0 ++ s_fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ 
faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ LD x1, 1 * SIZE(X) ++ ++ ldi X, 2 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 2, I ++ fclr t1 ++ ble I, $L25 ++ ++ LD x0, 0 * SIZE(X) ++ fclr t2 ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ fclr t3 ++ LD x3, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x5, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x6, 0 * SIZE(X) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, a2 ++ LD x1, 1 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ unop ++ ++ faddd a0, t0, a0 ++ LD x3, 1 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ ldi I, -1(I) ++ ++ faddd a2, t2, a2 ++ LD x5, 1 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ bgt I, $L26 ++ .align 4 
++ ++ ++$L998: ++ faddd a0, t0, a0 ++ faddd a1, t1, a1 ++ ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zrot.S b/kernel/sw_64/zrot.S +new file mode 100644 +index 0000000..3d05a2d +--- /dev/null ++++ b/kernel/sw_64/zrot.S +@@ -0,0 +1,631 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 ++ ++#define C $f10 ++#define S $f11 ++ ++#define PREFETCH_SIZE 80 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fmov $f21, C ++ LD S, 0($sp) ++ ++ addl INCX, INCX, INCX ++ addl INCY, INCY, INCY ++ ++ cmpeq INCX, 2, $23 ++ cmpeq INCY, 2, $24 ++ ble N, $L998 ++ ++ and $23, $24, $23 ++ beq $23, $L50 ++ ++ sra N, 2, I ++ ble I, $L15 ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 ++ ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, $f22 ++ MUL C, $f15, $f27 ++ ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ MUL C, $f16, $f21 ++ fillde (PREFETCH_SIZE) * SIZE(X) ++ unop ++ LD $f14, 5*SIZE(X) ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ fillde (PREFETCH_SIZE) * SIZE(Y) ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop ++ ++ ST $f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop ++ ++ ST $f26, -1*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ bgt I, $L12 
++ .align 4 ++ ++$L13: ++ MUL C, $f16, $f21 ++ LD $f14, 5*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ LD $f18, 7*SIZE(X) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, $f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) ++ .align 4 ++ ++ ++$L15: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ldi X, 2 * SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ ldi Y, 2 * SIZE(Y) ++ ++ bgt I, $L16 ++ .align 4 ++ ++$L998: ++ clr $0 ++ ret ++ .align 4 ++ ++$L50: ++ mov X, XX ++ mov Y, YY ++ ++ sra N, 2, I ++ ble I, $L55 ++ .align 4 ++ ++$L51: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) 
++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 ++ ++$L55: ++ and N, 3, I ++ ble I, $L999 ++ .align 4 ++ ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 ++ .align 4 ++ ++$L999: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zscal.S b/kernel/sw_64/zscal.S +new file mode 100644 +index 0000000..0c97e82 +--- /dev/null ++++ b/kernel/sw_64/zscal.S +@@ -0,0 +1,255 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $21 ++#define INCX $17 ++ ++#define XX $18 ++#define I $19 ++ ++#define ALPHA_R $f19 ++#define ALPHA_I $f20 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 ++ ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 ++ ++#define t4 $f26 ++#define t5 $f27 ++#define t6 $f28 ++#define t7 $f29 ++ ++ PROLOGUE ++ PROFCODE ++ ++ ldl INCX, 0($sp) ++ mov X, XX ++ ble N, $L999 ++ ++ addl INCX, INCX, INCX ++ ++ sra N, 2, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a6, 0 * SIZE(X) ++ LD a7, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_R, t0 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_I, t1 ++ ++ MUL a2, ALPHA_I, t2 ++ LD a0, 0 * SIZE(X) ++ MUL a3, ALPHA_R, t3 ++ LD a1, 1 * SIZE(X) ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X ++ ++ MUL a4, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) ++ ++ MUL a4, ALPHA_I, t2 ++ LD a2, 0 * SIZE(X) ++ MUL a5, ALPHA_R, t3 ++ LD a3, 1 * SIZE(X) ++ ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 ++ SXADDQ INCX, X, X ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) ++ ++ MUL a6, ALPHA_I, t2 ++ LD a4, 0 * SIZE(X) ++ MUL a7, ALPHA_R, t3 ++ LD a5, 1 * SIZE(X) ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a1, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) ++ ++ MUL a0, ALPHA_I, t2 ++ LD a6, 0 * SIZE(X) ++ MUL a1, ALPHA_R, t3 ++ LD a7, 1 * SIZE(X) ++ ++ SUB t0, t1, t4 ++ ldi I, -1(I) ++ ADD t2, t3, t5 ++ SXADDQ INCX, XX, XX ++ ++ fillde PREFETCHSIZE * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ MUL a2, ALPHA_R, t0 ++ MUL a3, ALPHA_I, t1 ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_I, t2 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ unop ++ ++ ST t6, 0 * SIZE(XX) ++ MUL a4, 
ALPHA_R, t0 ++ ST t7, 1 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ MUL a4, ALPHA_I, t2 ++ MUL a5, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 ++ unop ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) ++ ++ MUL a6, ALPHA_I, t2 ++ MUL a7, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ ++ ST t6, 0 * SIZE(XX) ++ ST t7, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ .align 4 ++ ++$L15: ++ and N, 3, I ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++ ++ ST t4, 0 * SIZE(XX) ++ ST t5, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zsum.S b/kernel/sw_64/zsum.S +new file mode 100644 +index 0000000..e42bba8 +--- /dev/null ++++ b/kernel/sw_64/zsum.S +@@ -0,0 +1,210 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ addl INCX, INCX, INCX ++ ++ fclr s1 ++ unop ++ fclr t1 ++ ble N, $L999 ++ ++ beq INCX, $L999 ++ ++ fclr s2 ++ sra N, 2, I ++ fclr s3 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t2 ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a2, 0 * SIZE(X) ++ fclr t3 ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ s_fillcs PREFETCHSIZE * SIZE(X) ++ fmov a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fmov a1, t1 ++ unop ++ ++ ADD s2, t2, s2 ++ LD a7, 1 * SIZE(X) ++ fmov a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fmov a3, t3 ++ unop ++ ++ ADD s0, t0, s0 ++ LD a1, 1 * SIZE(X) ++ fmov a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fmov a5, t1 ++ unop ++ ++ ADD s2, t2, s2 ++ LD a3, 1 * SIZE(X) ++ fmov a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fmov a7, t3 ++ unop ++ ++ LD a5, 1 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fmov a0, t0 ++ ++ ADD s1, t1, s1 ++ LD a7, 1 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fmov a2, t2 ++ ADD s3, t3, s3 ++ fmov a3, t3 ++ ++ ADD s0, t0, s0 ++ fmov a4, t0 ++ ADD s1, t1, s1 ++ fmov a5, t1 ++ ADD s2, t2, s2 ++ fmov a6, t2 ++ ADD s3, t3, s3 ++ fmov a7, t3 ++ ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ .align 4 ++ ++$L15: ++ ADD s0, s2, s0 ++ and N, 3, I ++ ADD s1, s3, s1 ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ fmov a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a1, 1 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X ++ ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ++ ADD s0, s1, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zswap.S b/kernel/sw_64/zswap.S +new file mode 100644 +index 0000000..6b4619c +--- /dev/null ++++ b/kernel/sw_64/zswap.S +@@ -0,0 +1,247 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++ mov $21, $17 ++ ldl $18, 0($sp) ++ ldl $19, 8($sp) ++ ldl $20, 16($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ beq $18, $SubEnd ++ beq $20, $SubEnd ++ ++ ble $16, $SubEnd # if n <= 0 goto $End ++ ++ cmpeq $18, 1, $1 ++ addl $18, $18, $18 ++ cmpeq $20, 1, $2 ++ addl $20, $20, $20 ++ ++ sra $16, 2, $21 ++ and $1, $2, $1 ++ and $16, 3, $22 ++ beq $1, $Sub ++ ++ ble $21, $MainRemain ++ .align 4 ++ ++$MainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f12, 2*SIZE($19) ++ LD $f13, 3*SIZE($19) ++ LD $f14, 4*SIZE($19) ++ LD $f15, 5*SIZE($19) ++ LD $f16, 6*SIZE($19) ++ LD $f17, 7*SIZE($19) ++ ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ LD $f22, 2*SIZE($17) ++ LD $f23, 3*SIZE($17) ++ LD $f24, 4*SIZE($17) ++ LD $f25, 5*SIZE($17) ++ LD $f26, 6*SIZE($17) ++ LD $f27, 7*SIZE($17) ++ ++ fillde 16*SIZE($17) ++ unop ++ fillde 16*SIZE($19) ++ subl $21, 1, $21 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f12, 2*SIZE($17) ++ ST $f13, 3*SIZE($17) ++ ST $f14, 4*SIZE($17) ++ ST $f15, 5*SIZE($17) ++ ST $f16, 6*SIZE($17) ++ ST $f17, 7*SIZE($17) ++ ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) ++ ST $f22, 2*SIZE($19) ++ ST $f23, 3*SIZE($19) ++ ST $f24, 4*SIZE($19) ++ ST $f25, 5*SIZE($19) ++ ST $f26, 6*SIZE($19) ++ ST $f27, 7*SIZE($19) ++ ++ ldi $17, 8*SIZE($17) ++ ldi $19, 8*SIZE($19) ++ bgt $21, $MainLoop ++ .align 4 ++ ++$MainRemain: ++ ble $22, $MainEnd ++ .align 4 ++ ++$MainRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ ++ ldi $17, 2*SIZE($17) ++ ldi $19, 2*SIZE($19) ++ subl $22, 1, $22 ++ ST $f10, -2*SIZE($17) ++ ST $f11, -1*SIZE($17) ++ ST $f20, -2*SIZE($19) ++ ST $f21, -1*SIZE($19) ++ bgt $22, $MainRemainLoop ++ .align 4 ++ ++$MainEnd: ++ clr $0 ++ ret ++ .align 4 ++ ++$Sub: ++ mov $17, $23 ++ mov $19, $24 ++ ble $21, $SubRemain ++ .align 4 ++ ++$SubLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f12, 0*SIZE($19) ++ LD $f13, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f14, 0*SIZE($19) ++ LD $f15, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f16, 0*SIZE($19) ++ LD $f17, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f20, 
0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f22, 0*SIZE($17) ++ LD $f23, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f24, 0*SIZE($17) ++ LD $f25, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f26, 0*SIZE($17) ++ LD $f27, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ ST $f10, 0*SIZE($23) ++ ST $f11, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f12, 0*SIZE($23) ++ ST $f13, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f14, 0*SIZE($23) ++ ST $f15, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f16, 0*SIZE($23) ++ ST $f17, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f20, 0*SIZE($24) ++ ST $f21, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f22, 0*SIZE($24) ++ ST $f23, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f24, 0*SIZE($24) ++ ST $f25, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f26, 0*SIZE($24) ++ ST $f27, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ subl $21, 1, $21 ++ bgt $21, $SubLoop ++ .align 4 ++ ++$SubRemain: ++ ble $22, $SubEnd ++ .align 4 ++ ++$SubRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ ++ subl $22, 1, $22 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) ++ ++ SXADDQ $18, $17, $17 ++ SXADDQ $20, $19, $19 ++ bgt $22, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S b/kernel/sw_64/ztrsm_kernel_2x2_LN.S +new file mode 100644 +index 0000000..23eb831 +--- /dev/null ++++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S +@@ -0,0 +1,2230 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#if !defined(SW8A) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ ++ .set noat ++ .set noreorder ++ .arch sw8a ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++#if defined(LN) || defined(LT) ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#else ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#endif ++ ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++ ldl OFFSET, 24 + STACKSIZE($sp) ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ addl M, M, TMP2 ++ mull TMP2, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ TMP2, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ addl TMP1, TMP1, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 1, J ++ ble J, $L30 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ble I, $L20 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) 
++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD4 c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, 
ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L20: ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L29 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillde 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(KK) ++ fclr c08 ++ ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 ++#else ++#ifdef 
LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillde 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, c11 ++ unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ 
MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD3 c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD2 c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD4 c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD3 c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD4 c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ ADD c11, c16, c11 ++ ADD c12, c15, c12 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++ ++ SUB b1, c03, c03 ++ SUB b2, c04, c04 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c09, t3, c09 ++ SUB c10, t4, c10 ++ ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c09, t3, c09 ++ ADD5 c10, t4, c10 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 
++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c03, t3, c03 ++ ADD5 c04, t4, c04 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi 
C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 ++ .align 4 ++ ++$L29: ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C1 ++ subl C, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and M, 1, I ++ ble I, $L50 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L58 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L58 ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, 
c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L57: ++ ADD3 c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ++$L58: ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L50: ++ sra M, 1, I ++ ble I, $L59 ++ .align 4 ++ ++$L41: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(KK) ++ fclr c04 ++ fclr c08 ++ ++ ble KK, $L48 ++ ble L, $L45 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ++ ble TMP1, $L48 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD 
a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, c05 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L47 ++#else ++ blbs TMP1, $L47 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L47: ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++$L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB 
c03, t1, c03 ++ SUB c04, t2, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L41 ++ .align 4 ++ ++$L59: ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S b/kernel/sw_64/ztrsm_kernel_2x2_LT.S +new file mode 100644 +index 0000000..1e8f2c9 +--- /dev/null ++++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S +@@ -0,0 +1,2223 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#if !defined(SW8A) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ ++ .set noat ++ .set noreorder ++ .arch sw8a ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++#if defined(LN) || defined(LT) ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#else ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#endif ++ ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++ ldl OFFSET, 24 + STACKSIZE($sp) ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ addl M, M, TMP2 ++ mull TMP2, K, TMP1 ++ 
SXADDQ TMP1, A, A ++ SXADDQ TMP2, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ addl TMP1, TMP1, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 1, J ++ ble J, $L30 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillde 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(KK) ++ fclr c08 ++ ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillde 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, c15 ++ unop 
++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, c11 ++ unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD3 c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD2 c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD4 c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD3 c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD4 c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ ADD c11, c16, c11 ++ ADD c12, c15, c12 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, 
c09, c09 ++ SUB a4, c10, c10 ++ ++ SUB b1, c03, c03 ++ SUB b2, c04, c04 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c09, t3, c09 ++ SUB c10, t4, c10 ++ ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c09, t3, c09 ++ ADD5 c10, t4, c10 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * 
SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c03, t3, c03 ++ ADD5 c04, t4, c04 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 1, I ++ ble I, $L29 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL 
a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD4 c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, 
c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L29: ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C1 ++ subl C, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ ble I, $L50 ++ .align 4 ++ ++$L41: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(KK) ++ fclr c04 ++ fclr c08 ++ ++ ble KK, $L48 ++ ble L, $L45 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ++ ble TMP1, $L48 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, 
b1, t3 ++ unop ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, c05 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L47 ++#else ++ blbs TMP1, $L47 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L47: ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++$L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 
* SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L41 ++ .align 4 ++ ++$L50: ++ and M, 1, I ++ ble I, $L59 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L58 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L58 ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || 
defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L57: ++ ADD3 c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ++$L58: ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L59: ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S b/kernel/sw_64/ztrsm_kernel_2x2_RT.S +new file mode 100644 +index 0000000..460b2b8 +--- /dev/null ++++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S +@@ -0,0 +1,2223 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++ ++#if !defined(SW8A) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ ++ .set noat ++ .set noreorder ++ .arch sw8a ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++#if defined(LN) || defined(LT) ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#else ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#endif ++ ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, 
-STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++ ldl OFFSET, 24 + STACKSIZE($sp) ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ addl M, M, TMP2 ++ mull TMP2, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ TMP2, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ addl TMP1, TMP1, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ and N, 1, J ++ ble J, $L30 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C1 ++ subl C, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ ble I, $L50 ++ .align 4 ++ ++$L41: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(KK) ++ fclr c04 ++ fclr c08 ++ ++ ble KK, $L48 ++ ble L, $L45 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ++ ble TMP1, $L48 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, c05 ++ MUL b1, a1, t1 ++#if 
defined(LT) || defined(RN) ++ blbs KK, $L47 ++#else ++ blbs TMP1, $L47 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L47: ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++$L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) 
++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L41 ++ .align 4 ++ ++$L50: ++ and M, 1, I ++ ble I, $L59 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L58 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L58 ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L57: ++ ADD3 c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ++$L58: ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, 
c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L59: ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L30: ++ sra N, 1, J ++ ble J, $L999 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillde 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(KK) ++ fclr c08 ++ ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillde 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, 
t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, c11 ++ unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, 
c03 ++ MUL b3, a1, t1 ++ ++ ADD3 c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD2 c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD4 c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD3 c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD4 c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ ADD c11, c16, c11 ++ ADD c12, c15, c12 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++ ++ SUB b1, c03, c03 ++ SUB b2, c04, c04 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c09, t3, c09 ++ SUB c10, t4, c10 ++ ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c09, t3, c09 ++ ADD5 c10, t4, c10 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL 
a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c03, t3, c03 ++ ADD5 c04, t4, c04 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 1, I ++ ble I, $L29 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) 
++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD4 c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ 
++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L29: ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/lapack/laswp/sw_64/Makefile b/lapack/laswp/sw_64/Makefile +new file mode 100644 +index 0000000..af1f019 +--- /dev/null ++++ b/lapack/laswp/sw_64/Makefile +@@ -0,0 +1,8 @@ ++TOPDIR = ../../.. 
++include ../../../Makefile.system ++ ++LASWP = ../generic/laswp_k_1.c ++ZLASWP = ../generic/zlaswp_k_1.c ++ ++include ../generic/Makefile ++ +diff --git a/param.h b/param.h +index 2618e1f..484f181 100644 +--- a/param.h ++++ b/param.h +@@ -2207,6 +2207,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + #endif + ++#if defined(SW8A) ++ ++#define SNUMOPT 2 ++#define DNUMOPT 2 ++ ++#define GEMM_DEFAULT_OFFSET_A 512 ++#define GEMM_DEFAULT_OFFSET_B 512 ++#define GEMM_DEFAULT_ALIGN 0x0ffffUL ++ ++#define SGEMM_DEFAULT_UNROLL_M 4 ++#define SGEMM_DEFAULT_UNROLL_N 4 ++#define DGEMM_DEFAULT_UNROLL_M 4 ++#define DGEMM_DEFAULT_UNROLL_N 4 ++#define CGEMM_DEFAULT_UNROLL_M 2 ++#define CGEMM_DEFAULT_UNROLL_N 2 ++#define ZGEMM_DEFAULT_UNROLL_M 2 ++#define ZGEMM_DEFAULT_UNROLL_N 2 ++ ++#define SYMV_P 8 ++ ++#define SGEMM_DEFAULT_P 256 ++#define SGEMM_DEFAULT_Q 512 ++#define DGEMM_DEFAULT_P 256 ++#define DGEMM_DEFAULT_Q 256 ++#define CGEMM_DEFAULT_P 256 ++#define CGEMM_DEFAULT_Q 256 ++#define ZGEMM_DEFAULT_P 128 ++#define ZGEMM_DEFAULT_Q 256 ++ ++#endif ++ + #ifdef CELL + + #define SNUMOPT 2 +-- +2.33.0 + diff --git a/openblas.spec b/openblas.spec old mode 100644 new mode 100755 index c89ffd3336350d4276e419de8e1feceb3343cdf3..bfa908a1c92da80326a52320af2490e993993dfd --- a/openblas.spec +++ b/openblas.spec @@ -1,4 +1,4 @@ -%define anolis_release 3 +%define anolis_release 4 %bcond_with system_lapack %global lapackver 3.11.0 @@ -14,6 +14,7 @@ Patch0001: 0001-openblas-0.2.15-system_lapack.patch Patch0002: 0002-openblas-0.2.5-libname.patch Patch0003: 0003-openblas-0.3.11-tests.patch Patch0004: 0001-Fixed-the-undefined-reference-to-blas_set_parameter.patch +Patch0005: 0001-add-support-for-sw_64-architecture.patch BuildRequires: make gcc gcc-c++ gcc-gfortran BuildRequires: perl-devel @@ -213,6 +214,9 @@ cd OpenBLAS-%{version} %patch0001 -p1 -b .system_lapack %patch0002 -p1 -b .libname %patch0003 -p1 -b .tests %patch0004 -p2 #%patch0004 -p1 -b .Add-opt-for-LoongArch64 +%ifarch sw_64 +%patch0005 -p1 -b .sw_64 +%endif find -name \*.f -exec chmod 644 {} \; @@ -359,6 +363,9 @@ cp -a %{_includedir}/lapacke %{buildroot}%{_includedir}/%{name} %endif suffix="" +%ifarch sw_64 +suffix="_sw8a" +%endif %ifarch riscv64 suffix="_riscv64_generic" %endif @@ -538,6 +545,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Tue Jun 10 2025 swcompiler - 0.3.28-4 +- add support for sw_64 architecture + * Fri May 30 2025 Yihao Yan - 0.3.28-3 - add support for riscv build