From 7fa37510123051b96b63a79e9701f5204aa73dac Mon Sep 17 00:00:00 2001 From: swcompiler Date: Thu, 28 Aug 2025 16:57:39 +0800 Subject: [PATCH] fix some bugs for sw_64 (cherry picked from commit a7c97d853549d1cd389435f98acf15b2725c464d) --- ...25-sw.patch => openblas-0.3.25-sw_64.patch | 110750 ++++----------- openblas.spec | 9 +- 2 files changed, 24819 insertions(+), 85940 deletions(-) rename OpenBLAS-0.3.25-sw.patch => openblas-0.3.25-sw_64.patch (37%) diff --git a/OpenBLAS-0.3.25-sw.patch b/openblas-0.3.25-sw_64.patch similarity index 37% rename from OpenBLAS-0.3.25-sw.patch rename to openblas-0.3.25-sw_64.patch index 6555090..491ec45 100644 --- a/OpenBLAS-0.3.25-sw.patch +++ b/openblas-0.3.25-sw_64.patch @@ -1,56 +1,121 @@ -diff --git a/Makefile b/Makefile -index fc021a9..c33edd9 100644 ---- a/Makefile -+++ b/Makefile -@@ -158,18 +158,18 @@ tests : shared - ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - touch $(LIBNAME) - ifndef NO_FBLAS -- $(MAKE) -C test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all -+ $(MAKE) -C test all - endif - endif - ifneq ($(ONLY_CBLAS), 1) -- $(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all -+ #$(MAKE) -C utest all - endif - ifneq ($(NO_CBLAS), 1) - ifneq ($(ONLY_CBLAS), 1) -- $(MAKE) -C ctest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all -+ $(MAKE) -C ctest all - endif - ifeq ($(CPP_THREAD_SAFETY_TEST), 1) -- $(MAKE) -C cpp_thread_test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all -+ $(MAKE) -C cpp_thread_test all - endif - endif - +From bcb07028166d58a0e1fb22c33f96efd52fce88cd Mon Sep 17 00:00:00 2001 +From: funnyaaa +Date: Wed, 27 Aug 2025 16:31:59 +0800 +Subject: [PATCH] add support for sw_64 architecture + +--- + Makefile.sw_64 | 15 + + Makefile.system | 5 + + c_check | 4 +- + common.h | 4 + + common_sw_64.h | 99 + + cpuid_sw_64.c | 14 + + ctest.c | 4 + + getarch.c | 9 +- + kernel/sw_64/KERNEL | 128 + + kernel/sw_64/Makefile | 2 + + kernel/sw_64/amax.S | 283 ++ + kernel/sw_64/asum.S | 206 ++ + kernel/sw_64/axpy.S | 428 +++ + kernel/sw_64/cabs.S | 71 + + kernel/sw_64/cnrm2.S | 428 +++ + kernel/sw_64/copy.S | 379 +++ + kernel/sw_64/cscal.S | 217 ++ + kernel/sw_64/dnrm2.S | 431 +++ + kernel/sw_64/dot.S | 534 ++++ + kernel/sw_64/gemm_beta.S | 179 ++ + kernel/sw_64/gemm_kernel_4x4.S | 2844 +++++++++++++++++++ + kernel/sw_64/gemv_n.S | 1307 +++++++++ + kernel/sw_64/gemv_t.S | 1061 ++++++++ + kernel/sw_64/iamax.S | 440 +++ + kernel/sw_64/imax.S | 351 +++ + kernel/sw_64/izamax.S | 427 +++ + kernel/sw_64/lsame.S | 76 + + kernel/sw_64/max.S | 227 ++ + kernel/sw_64/rot.S | 624 +++++ + kernel/sw_64/scal.S | 693 +++++ + kernel/sw_64/snrm2.S | 431 +++ + kernel/sw_64/staticbuffer.S | 45 + + kernel/sw_64/sum.S | 206 ++ + kernel/sw_64/swap.S | 252 ++ + kernel/sw_64/trsm_kernel_4x4_LN.S | 4061 ++++++++++++++++++++++++++++ + kernel/sw_64/trsm_kernel_4x4_LT.S | 4059 +++++++++++++++++++++++++++ + kernel/sw_64/trsm_kernel_4x4_RT.S | 4059 +++++++++++++++++++++++++++ + kernel/sw_64/zamax.S | 301 +++ + kernel/sw_64/zasum.S | 208 ++ + kernel/sw_64/zaxpy.S | 611 +++++ + kernel/sw_64/zdot.S | 500 ++++ + kernel/sw_64/zgemm_beta.S | 192 ++ + kernel/sw_64/zgemm_kernel_2x2.S | 1705 ++++++++++++ + kernel/sw_64/zgemv_n.S | 1027 +++++++ + kernel/sw_64/zgemv_t.S | 922 +++++++ + kernel/sw_64/znrm2.S | 428 +++ + kernel/sw_64/zrot.S | 631 +++++ + kernel/sw_64/zscal.S | 341 +++ + kernel/sw_64/zsum.S | 210 ++ + kernel/sw_64/zswap.S | 247 ++ + kernel/sw_64/ztrsm_kernel_2x2_LN.S | 2230 +++++++++++++++ + kernel/sw_64/ztrsm_kernel_2x2_LT.S | 2223 +++++++++++++++ + kernel/sw_64/ztrsm_kernel_2x2_RT.S | 2223 +++++++++++++++ + lapack/laswp/sw_64/Makefile | 8 + + param.h | 31 + + 55 files changed, 38638 insertions(+), 3 deletions(-) + create mode 100644 Makefile.sw_64 + create mode 100644 common_sw_64.h + create mode 100644 cpuid_sw_64.c + create mode 100644 kernel/sw_64/KERNEL + create mode 100644 kernel/sw_64/Makefile + create mode 100644 kernel/sw_64/amax.S + create mode 100644 kernel/sw_64/asum.S + create mode 100644 kernel/sw_64/axpy.S + create mode 100644 kernel/sw_64/cabs.S + create mode 100644 kernel/sw_64/cnrm2.S + create mode 100644 kernel/sw_64/copy.S + create mode 100644 kernel/sw_64/cscal.S + create mode 100644 kernel/sw_64/dnrm2.S + create mode 100644 kernel/sw_64/dot.S + create mode 100644 kernel/sw_64/gemm_beta.S + create mode 100644 kernel/sw_64/gemm_kernel_4x4.S + create mode 100644 kernel/sw_64/gemv_n.S + create mode 100644 kernel/sw_64/gemv_t.S + create mode 100644 kernel/sw_64/iamax.S + create mode 100644 kernel/sw_64/imax.S + create mode 100644 kernel/sw_64/izamax.S + create mode 100644 kernel/sw_64/lsame.S + create mode 100644 kernel/sw_64/max.S + create mode 100644 kernel/sw_64/rot.S + create mode 100644 kernel/sw_64/scal.S + create mode 100644 kernel/sw_64/snrm2.S + create mode 100644 kernel/sw_64/staticbuffer.S + create mode 100644 kernel/sw_64/sum.S + create mode 100644 kernel/sw_64/swap.S + create mode 100644 kernel/sw_64/trsm_kernel_4x4_LN.S + create mode 100644 kernel/sw_64/trsm_kernel_4x4_LT.S + create mode 100644 kernel/sw_64/trsm_kernel_4x4_RT.S + create mode 100644 kernel/sw_64/zamax.S + create mode 100644 kernel/sw_64/zasum.S + create mode 100644 kernel/sw_64/zaxpy.S + create mode 100644 kernel/sw_64/zdot.S + create mode 100644 kernel/sw_64/zgemm_beta.S + create mode 100644 kernel/sw_64/zgemm_kernel_2x2.S + create mode 100644 kernel/sw_64/zgemv_n.S + create mode 100644 kernel/sw_64/zgemv_t.S + create mode 100644 kernel/sw_64/znrm2.S + create mode 100644 kernel/sw_64/zrot.S + create mode 100644 kernel/sw_64/zscal.S + create mode 100644 kernel/sw_64/zsum.S + create mode 100644 kernel/sw_64/zswap.S + create mode 100644 kernel/sw_64/ztrsm_kernel_2x2_LN.S + create mode 100644 kernel/sw_64/ztrsm_kernel_2x2_LT.S + create mode 100644 kernel/sw_64/ztrsm_kernel_2x2_RT.S + create mode 100644 lapack/laswp/sw_64/Makefile + diff --git a/Makefile.sw_64 b/Makefile.sw_64 new file mode 100644 -index 0000000..b4542ce +index 000000000..529bd8828 --- /dev/null +++ b/Makefile.sw_64 -@@ -0,0 +1,35 @@ -+CPP = $(CC) -E -+RANLIB = ranlib -+ -+ifeq ($(LIBSUBARCH), SW6) -+LIBNAME = $(LIBPREFIX)_sw6.a -+LIBNAME_P = $(LIBPREFIX)_sw6_p.a -+endif -+ -+ifneq ($(COMPILER), NATIVE) -+# GCC User -+ifeq ($(LIBSUBARCH), SW6) -+OPTION += -DSW6 -mcpu=sw6 -+endif -+else -+# Compaq Compiler User -+ifeq ($(LIBSUBARCH), SW6) -+OPTION += -DSW6 -tune sw6 -arch sw6 -+endif -+endif -+ +@@ -0,0 +1,15 @@ +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -mieee +endif @@ -67,19 +132,10 @@ index 0000000..b4542ce +LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm +endif diff --git a/Makefile.system b/Makefile.system -index 3be47c6..ae90af3 100644 +index 1b84195e4..bbff6d697 100644 --- a/Makefile.system +++ b/Makefile.system -@@ -42,6 +42,8 @@ else ifeq ($(ARCH), mips64el) - override ARCH=mips64 - else ifeq ($(ARCH), zarch) - override ARCH=zarch -+else ifeq ($(ARCH), sw_64) -+override ARCH=sw_64 - endif - - NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib -@@ -809,6 +811,11 @@ NO_BINARY_MODE = 1 +@@ -809,6 +809,11 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif @@ -91,2328 +147,8 @@ index 3be47c6..ae90af3 100644 ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 -diff --git a/Makefile.system.libname b/Makefile.system.libname -deleted file mode 100644 -index 1b84195..0000000 ---- a/Makefile.system.libname -+++ /dev/null -@@ -1,1860 +0,0 @@ --# --# Include user definition --# -- --# TO suppress recursive includes --INCLUDED = 1 -- --ifndef TOPDIR --TOPDIR = . --endif -- --ifndef RELAPACK_REPLACE --RELAPACK_REPLACE=0 --endif -- --# we need to use the host system's architecture for getarch compile options even especially when cross-compiling --HOSTARCH := $(shell uname -m) --ifeq ($(HOSTARCH), amd64) --HOSTARCH=x86_64 --endif -- --# Catch conflicting usage of ARCH in some BSD environments --ifeq ($(ARCH), amd64) --override ARCH=x86_64 --else ifeq ($(ARCH), powerpc64) --override ARCH=power --else ifeq ($(ARCH), powerpc64le) --override ARCH=power --else ifeq ($(ARCH), powerpc) --override ARCH=power --else ifeq ($(ARCH), i386) --override ARCH=x86 --else ifeq ($(ARCH), armv6) --override ARCH=arm --else ifeq ($(ARCH), armv7) --override ARCH=arm --else ifeq ($(ARCH), aarch64) --override ARCH=arm64 --else ifeq ($(ARCH), mipsel) --override ARCH=mips --else ifeq ($(ARCH), mips64el) --override ARCH=mips64 --else ifeq ($(ARCH), zarch) --override ARCH=zarch --endif -- --NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib -- --# Default C compiler --# - Only set if not specified on the command line or inherited from the environment. --# - CC is an implicit variable so neither '?=' or 'ifndef' can be used. --# http://stackoverflow.com/questions/4029274/mingw-and-make-variables --# - Default value is 'cc' which is not always a valid command (e.g. MinGW). --ifeq ($(origin CC),default) -- --# Check if $(CC) refers to a valid command and set the value to gcc if not --ifneq ($(findstring cmd.exe,$(SHELL)),) --ifeq ($(shell where $(CC) 2>NUL),) --CC = gcc --endif --else # POSIX-ish --ifeq ($(shell command -v $(CC) 2>/dev/null),) --ifeq ($(shell uname -s),Darwin) --CC = clang --# EXTRALIB += -Wl,-no_compact_unwind --else --CC = gcc --endif # Darwin --endif # CC exists --endif # Shell is sane -- --endif # CC is set to default -- --# Default Fortran compiler (FC) is selected by f_check. -- --ifndef MAKEFILE_RULE --include $(TOPDIR)/Makefile.rule --else --include $(TOPDIR)/$(MAKEFILE_RULE) --endif -- --# --# Beginning of system configuration --# --ifneq ($(BUILD_SINGLE),1) --ifneq ($(BUILD_DOUBLE),1) --ifneq ($(BUILD_COMPLEX),1) --ifneq ($(BUILD_COMPLEX16),1) --override BUILD_SINGLE=1 --override BUILD_DOUBLE=1 --override BUILD_COMPLEX=1 --override BUILD_COMPLEX16=1 --endif --endif --endif --endif -- --ifndef HOSTCC --HOSTCC = $(CC) --endif -- --ifdef TARGET --GETARCH_FLAGS := -DFORCE_$(TARGET) --GETARCH_FLAGS += -DUSER_TARGET --ifeq ($(TARGET), GENERIC) --ifeq ($(DYNAMIC_ARCH), 1) --override NO_EXPRECISION=1 --export NO_EXPRECISION --endif --endif --endif -- --# Force fallbacks for 32bit -- --ifeq ($(BINARY), 32) --ifeq ($(TARGET), HASWELL) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET), SKYLAKEX) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET), COOPERLAKE) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET), SAPPHIRERAPIDS) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET), SANDYBRIDGE) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET), BULLDOZER) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --ifeq ($(TARGET), PILEDRIVER) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --ifeq ($(TARGET), STEAMROLLER) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --ifeq ($(TARGET), EXCAVATOR) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --ifeq ($(TARGET), ZEN) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --ifeq ($(TARGET), ARMV8) --GETARCH_FLAGS := -DFORCE_ARMV7 --endif --ifeq ($(TARGET), POWER8) --GETARCH_FLAGS := -DFORCE_POWER6 --endif --ifeq ($(TARGET), POWER9) --GETARCH_FLAGS := -DFORCE_POWER6 --endif --ifeq ($(TARGET), POWER10) --GETARCH_FLAGS := -DFORCE_POWER6 --endif --endif -- --#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. --# --ifdef TARGET_CORE --GETARCH_FLAGS := -DFORCE_$(TARGET_CORE) --endif -- --# Force fallbacks for 32bit -- --ifeq ($(BINARY), 32) --ifeq ($(TARGET_CORE), HASWELL) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET_CORE), SKYLAKEX) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET_CORE), COOPERLAKE) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET_CORE), SANDYBRIDGE) --GETARCH_FLAGS := -DFORCE_NEHALEM --endif --ifeq ($(TARGET_CORE), BULLDOZER) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --ifeq ($(TARGET_CORE), PILEDRIVER) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --ifeq ($(TARGET_CORE), STEAMROLLER) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --ifeq ($(TARGET_CORE), EXCAVATOR) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --ifeq ($(TARGET_CORE), ZEN) --GETARCH_FLAGS := -DFORCE_BARCELONA --endif --endif -- -- --# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. --ifeq ($(HOSTARCH), x86_64) --ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),) --GETARCH_FLAGS += -march=native --endif --endif -- --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --GETARCH_FLAGS += -DUSE64BITINT --endif --endif -- --ifndef GEMM_MULTITHREAD_THRESHOLD --GEMM_MULTITHREAD_THRESHOLD=4 --endif --GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) -- --ifeq ($(NO_AVX), 1) --GETARCH_FLAGS += -DNO_AVX --endif -- --ifeq ($(BINARY), 32) --GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 --NO_AVX512 = 1 --endif -- --ifeq ($(NO_AVX2), 1) --GETARCH_FLAGS += -DNO_AVX2 --endif -- --ifeq ($(NO_AVX512), 1) --GETARCH_FLAGS += -DNO_AVX512 --endif -- --ifeq ($(DEBUG), 1) --GETARCH_FLAGS += -g --endif -- --ifeq ($(QUIET_MAKE), 1) --MAKE += -s --endif -- --ifndef NO_PARALLEL_MAKE --NO_PARALLEL_MAKE=0 --endif --GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE) -- --ifdef MAKE_NB_JOBS --GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS) --endif -- --ifeq ($(HOSTCC), loongcc) --GETARCH_FLAGS += -static --endif -- --#if don't use Fortran, it will only compile CBLAS. --ifeq ($(ONLY_CBLAS), 1) --NO_LAPACK = 1 --else --ONLY_CBLAS = 0 --endif -- --#For small matrix optimization --ifeq ($(ARCH), x86_64) --SMALL_MATRIX_OPT = 1 --else ifeq ($(ARCH), power) --SMALL_MATRIX_OPT = 1 --BUILD_BFLOAT16 = 1 --endif --ifeq ($(SMALL_MATRIX_OPT), 1) --CCOMMON_OPT += -DSMALL_MATRIX_OPT --endif -- --# This operation is expensive, so execution should be once. --ifndef GOTOBLAS_MAKEFILE --export GOTOBLAS_MAKEFILE = 1 -- --# Generating Makefile.conf and config.h --DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) -- --endif -- --ifndef TARGET_CORE ---include $(TOPDIR)/Makefile.conf --else --HAVE_NEON= --HAVE_VFP= --HAVE_VFPV3= --HAVE_VFPV4= --HAVE_MMX= --HAVE_SSE= --HAVE_SSE2= --HAVE_SSE3= --HAVE_SSSE3= --HAVE_SSE4_1= --HAVE_SSE4_2= --HAVE_SSE4A= --HAVE_SSE5= --HAVE_AVX= --HAVE_AVX2= --HAVE_FMA3= --include $(TOPDIR)/Makefile_kernel.conf --endif -- -- --ifndef NUM_PARALLEL --NUM_PARALLEL = 1 --endif -- --ifndef NUM_THREADS --NUM_THREADS = $(NUM_CORES) --endif -- --ifeq ($(NUM_THREADS), 1) --override USE_THREAD = 0 --override USE_OPENMP = 0 --endif -- --ifdef USE_THREAD --ifeq ($(USE_THREAD), 0) --SMP = --else --SMP = 1 --endif --else --ifeq ($(NUM_THREADS), 1) --SMP = --else --SMP = 1 --endif --endif -- --ifeq ($(SMP), 1) --USE_LOCKING = --endif -- --ifndef NEED_PIC --NEED_PIC = 1 --endif -- --ARFLAGS = --CPP = $(COMPILER) -E --AR ?= $(CROSS_SUFFIX)ar --AS ?= $(CROSS_SUFFIX)as --LD ?= $(CROSS_SUFFIX)ld --RANLIB ?= $(CROSS_SUFFIX)ranlib --NM = $(CROSS_SUFFIX)nm --DLLWRAP = $(CROSS_SUFFIX)dllwrap --OBJCOPY = $(CROSS_SUFFIX)objcopy --OBJCONV = $(CROSS_SUFFIX)objconv -- -- --# When fortran support was either not detected or actively deselected, only build BLAS. --ifeq ($(NOFORTRAN), 1) --C_LAPACK = 1 --override FEXTRALIB = --endif -- --ifeq ($(C_COMPILER), GCC) --GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) --GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) --GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) --GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) --GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) --GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) --GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) --GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) --# Note that the behavior of -dumpversion is compile-time-configurable for --# gcc-7.x and newer. Use -dumpfullversion there --ifeq ($(GCCVERSIONGTEQ7),1) -- GCCDUMPVERSION_PARAM := -dumpfullversion --else -- GCCDUMPVERSION_PARAM := -dumpversion --endif --GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) --GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) --GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) --GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) --endif -- --ifeq ($(C_COMPILER), CLANG) --CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) --CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) --endif -- --# --# OS dependent settings --# -- --ifeq ($(OSNAME), Darwin) --ifndef MACOSX_DEPLOYMENT_TARGET --ifeq ($(ARCH), arm64) --export MACOSX_DEPLOYMENT_TARGET=11.0 --ifeq ($(C_COMPILER), GCC) --export NO_SVE = 1 --endif --else --export MACOSX_DEPLOYMENT_TARGET=10.8 --endif --endif --MD5SUM = md5 -r --XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) --ifeq (x$(XCVER)x,xx) --XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) --endif --ifeq (x$(XCVER), x 15) --CCOMMON_OPT += -Wl,-ld_classic --endif --endif -- --ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) --MD5SUM = md5 -r --endif -- --ifeq ($(OSNAME), NetBSD) --MD5SUM = md5 -n --endif -- --ifeq ($(OSNAME), Linux) --EXTRALIB += -lm --NO_EXPRECISION = 1 --endif -- --ifeq ($(OSNAME), Android) --EXTRALIB += -lm --endif -- --ifeq ($(OSNAME), AIX) --EXTRALIB += -lm --endif -- --ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) --ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) --EXTRALIB += -lm --endif --endif -- --ifeq ($(OSNAME), WINNT) --NEED_PIC = 0 --NO_EXPRECISION = 1 -- --EXTRALIB += -defaultlib:advapi32 -- --SUFFIX = obj --PSUFFIX = pobj --LIBSUFFIX = a -- --ifeq ($(C_COMPILER), CLANG) --CCOMMON_OPT += -DMS_ABI --endif -- --#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) --ifeq ($(GCCVERSIONGT4), 1) --# GCC Major version > 4 --# It is compatible with MSVC ABI. --CCOMMON_OPT += -DMS_ABI --endif -- --ifeq ($(GCCVERSIONGTEQ4), 1) --ifeq ($(GCCMINORVERSIONGTEQ7), 1) --# GCC Version >=4.7 --# It is compatible with MSVC ABI. --CCOMMON_OPT += -DMS_ABI --endif --endif -- --# Ensure the correct stack alignment on Win32 --# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 --ifeq ($(ARCH), x86) --CCOMMON_OPT += -mincoming-stack-boundary=2 --FCOMMON_OPT += -mincoming-stack-boundary=2 --endif -- --endif -- --ifeq ($(OSNAME), Interix) --NEED_PIC = 0 --NO_EXPRECISION = 1 -- --INTERIX_TOOL_DIR = /opt/gcc.3.3/i586-pc-interix3/bin --endif -- --ifeq ($(OSNAME), CYGWIN_NT) --NEED_PIC = 0 --NO_EXPRECISION = 1 --OS_CYGWIN_NT = 1 --endif -- --ifneq ($(OSNAME), WINNT) --ifneq ($(OSNAME), CYGWIN_NT) --ifneq ($(OSNAME), Interix) --ifneq ($(OSNAME), Android) --ifdef SMP --EXTRALIB += -lpthread --endif --endif --endif --endif --endif -- --# ifeq logical or --ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix)) --OS_WINDOWS=1 --endif -- --ifdef QUAD_PRECISION --CCOMMON_OPT += -DQUAD_PRECISION --NO_EXPRECISION = 1 --endif -- --ifneq ($(ARCH), x86) --ifneq ($(ARCH), x86_64) --NO_EXPRECISION = 1 --endif --endif -- --ifdef UTEST_CHECK --CCOMMON_OPT += -DUTEST_CHECK --SANITY_CHECK = 1 --endif -- --ifdef SANITY_CHECK --CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) --endif -- --MAX_STACK_ALLOC ?= 2048 --ifneq ($(MAX_STACK_ALLOC), 0) --CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) --endif -- --ifdef USE_LOCKING --ifneq ($(USE_LOCKING), 0) --CCOMMON_OPT += -DUSE_LOCKING --endif --endif -- --# --# Architecture dependent settings --# -- --ifeq ($(ARCH), x86) --ifndef BINARY --NO_BINARY_MODE = 1 --endif -- --ifeq ($(CORE), generic) --NO_EXPRECISION = 1 --endif -- --ifndef NO_EXPRECISION --ifeq ($(F_COMPILER), GFORTRAN) --# ifeq logical or. GCC or LSB --ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) --EXPRECISION = 1 --CCOMMON_OPT += -DEXPRECISION -m128bit-long-double --FCOMMON_OPT += -m128bit-long-double --endif --ifeq ($(C_COMPILER), CLANG) --EXPRECISION = 1 --CCOMMON_OPT += -DEXPRECISION --FCOMMON_OPT += -m128bit-long-double --endif --endif --endif --endif -- --ifeq ($(ARCH), x86_64) -- --ifeq ($(CORE), generic) --NO_EXPRECISION = 1 --endif -- --ifndef NO_EXPRECISION --ifeq ($(F_COMPILER), GFORTRAN) --# ifeq logical or. GCC or LSB --ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) --EXPRECISION = 1 --CCOMMON_OPT += -DEXPRECISION -m128bit-long-double --FCOMMON_OPT += -m128bit-long-double --endif --ifeq ($(C_COMPILER), CLANG) --EXPRECISION = 1 --CCOMMON_OPT += -DEXPRECISION --FCOMMON_OPT += -m128bit-long-double --endif --endif --endif --endif -- --ifeq ($(C_COMPILER), INTEL) --CCOMMON_OPT += -wd981 --endif -- -- --ifeq ($(USE_OPENMP), 1) -- --#check --ifeq ($(USE_THREAD), 0) --$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.) --endif -- --# ifeq logical or. GCC or LSB --ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) --CCOMMON_OPT += -fopenmp --endif -- --ifeq ($(C_COMPILER), CLANG) --CCOMMON_OPT += -fopenmp --ifeq ($(F_COMPILER), GFORTRAN) --FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB)) --endif --endif -- --ifeq ($(C_COMPILER), INTEL) --CCOMMON_OPT += -fopenmp --endif -- --ifeq ($(C_COMPILER), PGI) --CCOMMON_OPT += -mp --endif -- --ifeq ($(C_COMPILER), OPEN64) --CCOMMON_OPT += -mp --CEXTRALIB += -lstdc++ --endif -- --ifeq ($(C_COMPILER), PATHSCALE) --CCOMMON_OPT += -mp --endif --endif -- -- --ifeq ($(DYNAMIC_ARCH), 1) --ifeq ($(ARCH), x86) --DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ -- CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO --endif -- --ifeq ($(ARCH), x86_64) --DYNAMIC_CORE = PRESCOTT CORE2 --ifeq ($(DYNAMIC_OLDER), 1) --DYNAMIC_CORE += PENRYN DUNNINGTON --endif --DYNAMIC_CORE += NEHALEM --ifeq ($(DYNAMIC_OLDER), 1) --DYNAMIC_CORE += OPTERON OPTERON_SSE3 --endif --DYNAMIC_CORE += BARCELONA --ifeq ($(DYNAMIC_OLDER), 1) --DYNAMIC_CORE += BOBCAT ATOM NANO --endif --ifneq ($(NO_AVX), 1) --DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR --endif --ifneq ($(NO_AVX2), 1) --DYNAMIC_CORE += HASWELL ZEN --endif --ifneq ($(NO_AVX512), 1) --ifneq ($(NO_AVX2), 1) --DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS --endif --endif --endif -- --ifdef DYNAMIC_LIST --override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST) --XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT --XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) --CCOMMON_OPT += $(XCCOMMON_OPT) --#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' --endif -- --ifeq ($(ARCH), arm64) --DYNAMIC_CORE = ARMV8 --DYNAMIC_CORE += CORTEXA53 --DYNAMIC_CORE += CORTEXA57 --DYNAMIC_CORE += CORTEXA72 --DYNAMIC_CORE += CORTEXA73 --DYNAMIC_CORE += NEOVERSEN1 --ifneq ($(NO_SVE), 1) --DYNAMIC_CORE += NEOVERSEV1 --DYNAMIC_CORE += NEOVERSEN2 --DYNAMIC_CORE += ARMV8SVE --endif --DYNAMIC_CORE += CORTEXA55 --DYNAMIC_CORE += FALKOR --DYNAMIC_CORE += THUNDERX --DYNAMIC_CORE += THUNDERX2T99 --DYNAMIC_CORE += TSV110 --DYNAMIC_CORE += EMAG8180 --DYNAMIC_CORE += THUNDERX3T110 --ifdef DYNAMIC_LIST --override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) --XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 --XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) --endif --endif -- --ifeq ($(ARCH), mips64) --DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 MIPS64_GENERIC --ifdef DYNAMIC_LIST --override DYNAMIC_CORE = MIPS64_GENERIC $(DYNAMIC_LIST) --XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_MIPS64_GENERIC --XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) --endif --endif -- --ifeq ($(ARCH), loongarch64) --DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC --endif -- --ifeq ($(ARCH), zarch) --DYNAMIC_CORE = ZARCH_GENERIC -- --# if the compiler accepts -march=arch11 or -march=z13 and can compile a file --# with z13-specific inline assembly, then we can include support for Z13. --# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases --# only support one or the other. --# note: LLVM version 6.x supported -march=z13 yet could not handle vector --# registers in inline assembly, so the check for supporting the -march flag is --# not enough. --ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null --ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) --ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) -- --ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) --DYNAMIC_CORE += Z13 --CCOMMON_OPT += -DDYN_Z13 --else --$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) --endif -- --# as above for z13, check for -march=arch12 and z14 support in the compiler. --ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) --ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) --ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) --DYNAMIC_CORE += Z14 --CCOMMON_OPT += -DDYN_Z14 --else --$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) --endif -- --endif # ARCH zarch -- --ifeq ($(ARCH), power) --ifneq ($(C_COMPILER), PGI) --DYNAMIC_CORE = POWER6 --DYNAMIC_CORE += POWER8 --ifneq ($(C_COMPILER), GCC) --DYNAMIC_CORE += POWER9 --DYNAMIC_CORE += POWER10 --CCOMMON_OPT += -DHAVE_P10_SUPPORT --endif --ifeq ($(C_COMPILER), GCC) --ifeq ($(GCCVERSIONGT5), 1) --DYNAMIC_CORE += POWER9 --else --$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) --endif --ifeq ($(OSNAME), AIX) --LDVERSIONGTEQ35 := 1 --else --LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) --endif --ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) --DYNAMIC_CORE += POWER10 --CCOMMON_OPT += -DHAVE_P10_SUPPORT --else ifeq ($(GCCVERSIONGTEQ10), 1) --ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11) --DYNAMIC_CORE += POWER10 --CCOMMON_OPT += -DHAVE_P10_SUPPORT --endif --else --$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) --endif --endif --else --DYNAMIC_CORE = POWER8 --DYNAMIC_CORE += POWER9 --endif --endif -- --# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty --ifndef DYNAMIC_CORE --override DYNAMIC_ARCH= --endif --endif -- --ifeq ($(ARCH), ia64) --NO_BINARY_MODE = 1 --BINARY_DEFINED = 1 -- --ifeq ($(F_COMPILER), GFORTRAN) --ifeq ($(C_COMPILER), GCC) --# EXPRECISION = 1 --# CCOMMON_OPT += -DEXPRECISION --endif --endif --endif -- --ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) --NO_BINARY_MODE = 1 --endif -- --ifeq ($(ARCH), alpha) --NO_BINARY_MODE = 1 --BINARY_DEFINED = 1 --endif -- --ifeq ($(ARCH), arm) --NO_BINARY_MODE = 1 --BINARY_DEFINED = 1 -- --CCOMMON_OPT += -marm --FCOMMON_OPT += -marm -- --# If softfp abi is mentioned on the command line, force it. --ifeq ($(ARM_SOFTFP_ABI), 1) --CCOMMON_OPT += -mfloat-abi=softfp --FCOMMON_OPT += -mfloat-abi=softfp --endif -- --ifeq ($(OSNAME), Android) --ifeq ($(ARM_SOFTFP_ABI), 1) --EXTRALIB += -lm --else --EXTRALIB += -Wl,-lm_hard --endif --endif --endif -- --ifeq ($(ARCH), arm64) --NO_BINARY_MODE = 1 --BINARY_DEFINED = 1 --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --ifeq ($(F_COMPILER), GFORTRAN) --FCOMMON_OPT += -fdefault-integer-8 --endif --ifeq ($(F_COMPILER), FLANG) --FCOMMON_OPT += -i8 --endif --endif --endif --endif -- --ifeq ($(ARCH), riscv64) --NO_BINARY_MODE = 1 --BINARY_DEFINED = 1 --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --ifeq ($(F_COMPILER), GFORTRAN) --FCOMMON_OPT += -fdefault-integer-8 --endif --ifeq ($(F_COMPILER), FLANG) --FCOMMON_OPT += -i8 --endif --endif --endif --endif -- --ifeq ($(ARCH), loongarch64) --NO_BINARY_MODE = 1 --BINARY_DEFINED = 1 --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --ifeq ($(F_COMPILER), GFORTRAN) --FCOMMON_OPT += -fdefault-integer-8 --endif --ifeq ($(F_COMPILER), FLANG) --FCOMMON_OPT += -i8 --endif --endif --endif --endif -- --# --# C Compiler dependent settings --# -- -- --# ifeq logical or. GCC or CLANG or LSB --# http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or --ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB)) --CCOMMON_OPT += -Wall --COMMON_PROF += -fno-inline --NO_UNINITIALIZED_WARN = -Wno-uninitialized -- --ifeq ($(QUIET_MAKE), 1) --CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused --endif -- --ifdef NO_BINARY_MODE -- --ifeq ($(ARCH), $(filter $(ARCH),mips64)) --ifdef BINARY64 --CCOMMON_OPT += -mabi=64 --else --CCOMMON_OPT += -mabi=n32 --endif --BINARY_DEFINED = 1 --else ifeq ($(ARCH), $(filter $(ARCH),mips)) --CCOMMON_OPT += -mabi=32 --BINARY_DEFINED = 1 --endif -- --ifneq (, $(filter $(CORE), MIPS64_GENERIC)) --CCOMMON_OPT += -DNO_MSA --FCOMMON_OPT += -DNO_MSA --endif -- --ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) --CCOMMON_OPT += -march=loongson3a --FCOMMON_OPT += -march=loongson3a --endif -- --ifeq ($(CORE), MIPS24K) --CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) --FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) --endif -- --ifeq ($(CORE), MIPS1004K) --CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) --FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) --endif -- --ifeq ($(CORE), P5600) --CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) --FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) --endif -- --ifeq ($(CORE), I6400) --CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) --FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) --endif -- --ifeq ($(CORE), P6600) --CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) --FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) --endif -- --ifeq ($(CORE), I6500) --CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) --FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) --endif -- --ifeq ($(OSNAME), AIX) --BINARY_DEFINED = 1 --endif -- --ifeq ($(ARCH), loongarch64) --LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) --ifneq ($(LA64_ABI), lp64d) --LA64_ABI=lp64 --endif --CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) --FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) --endif -- --endif -- --ifndef BINARY_DEFINED --ifneq ($(OSNAME), AIX) --ifdef BINARY64 --ifneq ($(ARCH), riscv64) --CCOMMON_OPT += -m64 --endif --else --CCOMMON_OPT += -m32 --endif --endif --endif -- --endif -- --ifeq ($(C_COMPILER), PGI) --PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) --PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20) --PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) --PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) --ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) --NEWPGI := 1 --PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) --PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) --PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) --ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) --NEWPGI2 := 1 --endif --endif --ifdef BINARY64 --ifeq ($(ARCH), x86_64) --ifeq (,$(findstring tp,$(CFLAGS))) --ifneq ($(NEWPGI2),1) --CCOMMON_OPT += -tp p7-64 --else --CCOMMON_OPT += -tp px --endif --endif --ifneq ($(NEWPGI),1) --CCOMMON_OPT += -D__MMX__ -Mnollvm --endif --else --ifeq ($(ARCH), power) --ifeq (,$(findstring tp,$(CFLAGS))) --ifeq ($(CORE), POWER8) --CCOMMON_OPT += -tp pwr8 --endif --ifeq ($(CORE), POWER9) --CCOMMON_OPT += -tp pwr9 --endif --endif --endif --endif --else --ifneq ($(NEWPGI2),1) --ifeq (,$(findstring tp,$(CFLAGS))) --CCOMMON_OPT += -tp p7 --else --CCOMMON_OPT += -tp px --endif --endif --endif --endif -- --ifeq ($(C_COMPILER), PATHSCALE) --ifdef BINARY64 --CCOMMON_OPT += -m64 --else --CCOMMON_OPT += -m32 --endif --endif -- --# --# Fortran Compiler dependent settings --# -- --ifeq ($(F_COMPILER), NAG) --FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --FCOMMON_OPT += -i8 --endif --endif --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -openmp --endif --endif -- --ifeq ($(F_COMPILER), FLANG) --CCOMMON_OPT += -DF_INTERFACE_FLANG --FCOMMON_OPT += -Mrecursive -Kieee --ifeq ($(OSNAME), Linux) --ifeq ($(ARCH), x86_64) --FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") --ifeq ($(FLANG_VENDOR), AMD) --FCOMMON_OPT += -fno-unroll-loops --endif --endif --endif --ifdef BINARY64 --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --FCOMMON_OPT += -i8 --endif --endif --FCOMMON_OPT += -Wall --else --FCOMMON_OPT += -Wall --endif --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -fopenmp --endif --endif -- --ifeq ($(F_COMPILER), G77) --CCOMMON_OPT += -DF_INTERFACE_G77 --FCOMMON_OPT += -Wall --ifndef NO_BINARY_MODE --ifneq ($(OSNAME), AIX) --ifdef BINARY64 --FCOMMON_OPT += -m64 --else --FCOMMON_OPT += -m32 --endif --endif --endif --endif -- --ifeq ($(F_COMPILER), G95) --CCOMMON_OPT += -DF_INTERFACE_G95 --FCOMMON_OPT += -Wall --ifneq ($(OSNAME), AIX) --ifndef NO_BINARY_MODE --ifdef BINARY64 --FCOMMON_OPT += -m64 --else --FCOMMON_OPT += -m32 --endif --endif --ifneq ($(NO_LAPACKE), 1) --FCOMMON_OPT += -fno-second-underscore --endif --endif --endif -- --ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) --CCOMMON_OPT += -DF_INTERFACE_GFORT --ifeq ($(F_COMPILER), GFORTRAN) --FCOMMON_OPT += -Wall --# make single-threaded LAPACK calls thread-safe #1847 --FCOMMON_OPT += -frecursive --# work around ABI problem with passing single-character arguments --FCOMMON_OPT += -fno-optimize-sibling-calls --#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc --ifneq ($(NOFORTRAN), 1) --ifneq ($(NOFORTRAN), 2) --ifneq ($(NO_LAPACK), 1) --EXTRALIB += -lgfortran --endif --endif --endif --endif --ifdef NO_BINARY_MODE --ifeq ($(ARCH), $(filter $(ARCH),mips64)) --ifdef BINARY64 --FCOMMON_OPT += -mabi=64 --else --FCOMMON_OPT += -mabi=n32 --endif --else ifeq ($(ARCH), $(filter $(ARCH),mips)) --FCOMMON_OPT += -mabi=32 --endif --else --ifdef BINARY64 --ifneq ($(OSNAME), AIX) --ifneq ($(ARCH), riscv64) --FCOMMON_OPT += -m64 --endif --endif --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --FCOMMON_OPT += -fdefault-integer-8 --endif --endif --else --ifneq ($(OSNAME), AIX) --FCOMMON_OPT += -m32 --endif --endif --endif --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -fopenmp --endif --endif -- --ifeq ($(F_COMPILER), INTEL) --CCOMMON_OPT += -DF_INTERFACE_INTEL --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --FCOMMON_OPT += -i8 --endif --endif --FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -fopenmp --endif --endif -- --ifeq ($(F_COMPILER), FUJITSU) --CCOMMON_OPT += -DF_INTERFACE_FUJITSU --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -openmp --endif --endif -- --ifeq ($(F_COMPILER), IBM) --CCOMMON_OPT += -DF_INTERFACE_IBM --FEXTRALIB += -lxlf90 --ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG)) --FCOMMON_OPT += -qextname --endif --# FCOMMON_OPT += -qarch=440 --ifdef BINARY64 --FCOMMON_OPT += -q64 --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --FCOMMON_OPT += -qintsize=8 --endif --endif --else --FCOMMON_OPT += -q32 --endif --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -openmp --endif --endif -- --ifeq ($(F_COMPILER), PGI) --CCOMMON_OPT += -DF_INTERFACE_PGI --COMMON_PROF += -DPGICOMPILER --ifdef BINARY64 --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --FCOMMON_OPT += -i8 --endif --endif --ifeq ($(ARCH), x86_64) --ifneq ($(NEWPGI2),1) --FCOMMON_OPT += -tp p7-64 --else --FCOMMON_OPT += -tp px --endif --else --ifeq ($(ARCH), power) --ifeq ($(CORE), POWER6) --$(warning NVIDIA HPC compilers do not support POWER6.) --endif --ifeq ($(CORE), POWER8) --FCOMMON_OPT += -tp pwr8 --endif --ifeq ($(CORE), POWER9) --FCOMMON_OPT += -tp pwr9 --endif --ifeq ($(CORE), POWER10) --$(warning NVIDIA HPC compilers do not support POWER10.) --endif --endif --endif --else --FCOMMON_OPT += -tp p7 --endif --FCOMMON_OPT += -Mrecursive -Kieee --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -mp --endif --endif -- --ifeq ($(F_COMPILER), PATHSCALE) --CCOMMON_OPT += -DF_INTERFACE_PATHSCALE --ifdef BINARY64 --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --FCOMMON_OPT += -i8 --endif --endif --endif -- --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -mp --endif --endif -- --ifeq ($(F_COMPILER), OPEN64) --CCOMMON_OPT += -DF_INTERFACE_OPEN64 --ifdef BINARY64 --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --FCOMMON_OPT += -i8 --endif --endif --endif --ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) --ifndef BINARY64 --FCOMMON_OPT += -n32 --else --FCOMMON_OPT += -n64 --endif --ifeq ($(CORE), LOONGSON3R3) --FCOMMON_OPT += -loongson3 -static --endif --ifeq ($(CORE), LOONGSON3R4) --FCOMMON_OPT += -loongson3 -static --endif --else --ifndef BINARY64 --FCOMMON_OPT += -m32 --else --FCOMMON_OPT += -m64 --endif --endif --ifeq ($(USE_OPENMP), 1) --FEXTRALIB += -lstdc++ --FCOMMON_OPT += -mp --endif --endif -- --ifeq ($(C_COMPILER), OPEN64) --ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) --ifndef BINARY64 --CCOMMON_OPT += -n32 --else --CCOMMON_OPT += -n64 --endif --ifeq ($(CORE), LOONGSON3R3) --CCOMMON_OPT += -loongson3 -static --endif --ifeq ($(CORE), LOONGSON3R4) --CCOMMON_OPT += -loongson3 -static --endif --else --ifndef BINARY64 --CCOMMON_OPT += -m32 --else --CCOMMON_OPT += -m64 --endif --endif --endif -- --ifeq ($(C_COMPILER), SUN) --CCOMMON_OPT += -w --ifeq ($(ARCH), x86) --CCOMMON_OPT += -m32 --else --ifdef BINARY64 --CCOMMON_OPT += -m64 --else --CCOMMON_OPT += -m32 --endif --endif --endif -- --ifeq ($(F_COMPILER), SUN) --CCOMMON_OPT += -DF_INTERFACE_SUN --FCOMMON_OPT += -ftrap=%none -xrecursive --ifeq ($(ARCH), x86) --FCOMMON_OPT += -m32 --else --ifdef BINARY64 --FCOMMON_OPT += -m64 --else --FCOMMON_OPT += -m32 --endif --endif --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -xopenmp=parallel --endif --endif -- --ifeq ($(F_COMPILER), COMPAQ) --CCOMMON_OPT += -DF_INTERFACE_COMPAQ --ifeq ($(USE_OPENMP), 1) --FCOMMON_OPT += -openmp --endif --endif -- --ifeq ($(F_COMPILER), CRAY) --CCOMMON_OPT += -DF_INTERFACE_INTEL --FCOMMON_OPT += -hnopattern --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --FCOMMON_OPT += -s integer64 --endif --endif --ifneq ($(USE_OPENMP), 1) --FCOMMON_OPT += -O noomp --endif --endif -- --ifdef BINARY64 --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --CCOMMON_OPT += --#-DUSE64BITINT --endif --endif --endif -- --ifeq ($(NEED_PIC), 1) --ifeq ($(C_COMPILER), IBM) --CCOMMON_OPT += -qpic=large --else --CCOMMON_OPT += -fPIC --endif --ifeq ($(F_COMPILER), SUN) --FCOMMON_OPT += -pic --else ifeq ($(F_COMPILER), NAG) --FCOMMON_OPT += -PIC --else ifeq ($(F_COMPILER), IBM) --FCOMMON_OPT += -qpic=large --else --FCOMMON_OPT += -fPIC --endif --endif -- --ifeq ($(DYNAMIC_ARCH), 1) --CCOMMON_OPT += -DDYNAMIC_ARCH --endif -- --ifeq ($(DYNAMIC_OLDER), 1) --CCOMMON_OPT += -DDYNAMIC_OLDER --endif -- --ifeq ($(C_LAPACK), 1) --CCOMMON_OPT += -DC_LAPACK --endif -- --ifeq ($(NO_LAPACK), 1) --CCOMMON_OPT += -DNO_LAPACK --#Disable LAPACK C interface --NO_LAPACKE = 1 --endif -- --ifeq ($(NO_LAPACKE), 1) --CCOMMON_OPT += -DNO_LAPACKE --endif -- --ifeq ($(NO_AVX), 1) --CCOMMON_OPT += -DNO_AVX --endif -- --ifeq ($(ARCH), x86) --CCOMMON_OPT += -DNO_AVX --endif -- --ifeq ($(NO_AVX2), 1) --CCOMMON_OPT += -DNO_AVX2 --endif -- --ifeq ($(NO_AVX512), 1) --CCOMMON_OPT += -DNO_AVX512 --endif -- --ifeq ($(NO_SVE), 1) --CCOMMON_OPT += -DNO_SVE --endif -- --ifdef SMP --CCOMMON_OPT += -DSMP_SERVER -- --ifeq ($(ARCH), mips64) --USE_SIMPLE_THREADED_LEVEL3 = 1 --endif -- --ifeq ($(USE_OPENMP), 1) --# USE_SIMPLE_THREADED_LEVEL3 = 1 --# NO_AFFINITY = 1 --CCOMMON_OPT += -DUSE_OPENMP --endif -- --ifeq ($(BIGNUMA), 1) --CCOMMON_OPT += -DBIGNUMA --endif -- --endif -- --ifeq ($(NO_WARMUP), 1) --CCOMMON_OPT += -DNO_WARMUP --endif -- --ifeq ($(CONSISTENT_FPCSR), 1) --CCOMMON_OPT += -DCONSISTENT_FPCSR --endif -- --# Only for development --# CCOMMON_OPT += -DPARAMTEST --# CCOMMON_OPT += -DPREFETCHTEST --# CCOMMON_OPT += -DNO_SWITCHING --# USE_PAPI = 1 -- --ifdef USE_PAPI --CCOMMON_OPT += -DUSE_PAPI --EXTRALIB += -lpapi -lperfctr --endif -- --ifdef BUFFERSIZE --CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) --endif -- --ifdef DYNAMIC_THREADS --CCOMMON_OPT += -DDYNAMIC_THREADS --endif -- --CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) -- --CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) -- --ifdef USE_SIMPLE_THREADED_LEVEL3 --CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 --endif -- --ifeq ($(USE_TLS), 1) --CCOMMON_OPT += -DUSE_TLS --endif -- --ifeq ($(BUILD_BFLOAT16), 1) --CCOMMON_OPT += -DBUILD_BFLOAT16 --endif --ifeq ($(BUILD_SINGLE), 1) --CCOMMON_OPT += -DBUILD_SINGLE=1 --endif --ifeq ($(BUILD_DOUBLE), 1) --CCOMMON_OPT += -DBUILD_DOUBLE=1 --endif --ifeq ($(BUILD_COMPLEX), 1) --CCOMMON_OPT += -DBUILD_COMPLEX=1 --endif --ifeq ($(BUILD_COMPLEX16), 1) --CCOMMON_OPT += -DBUILD_COMPLEX16=1 --endif -- --CCOMMON_OPT += -DVERSION=\"$(VERSION)\" -- --ifndef SYMBOLPREFIX --SYMBOLPREFIX = --endif -- --ifndef SYMBOLSUFFIX --SYMBOLSUFFIX = --endif -- --ifndef LIBSONAMEBASE --LIBSONAMEBASE = openblas --endif -- --ifndef LIBNAMESUFFIX --LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) --else --LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) --endif -- --ifeq ($(OSNAME), CYGWIN_NT) --LIBPREFIX = cyg$(LIBNAMEBASE) --else --LIBPREFIX = lib$(LIBNAMEBASE) --endif -- --KERNELDIR = $(TOPDIR)/kernel/$(ARCH) -- --include $(TOPDIR)/Makefile.$(ARCH) -- --ifneq ($(C_COMPILER), PGI) --ifneq ($(C_COMPILER), SUN) --CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME --endif --endif --CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" -- --ifeq ($(CORE), PPC440) --CCOMMON_OPT += -DALLOC_QALLOC --endif -- --ifeq ($(CORE), PPC440FP2) --STATIC_ALLOCATION = 1 --endif -- --ifneq ($(OSNAME), Linux) --NO_AFFINITY = 1 --endif -- --ifneq ($(ARCH), x86_64) --ifneq ($(ARCH), x86) --NO_AFFINITY = 1 --endif --endif -- --ifdef NO_AFFINITY --ifeq ($(NO_AFFINITY), 0) --override undefine NO_AFFINITY --else --CCOMMON_OPT += -DNO_AFFINITY --endif --endif -- --ifdef FUNCTION_PROFILE --CCOMMON_OPT += -DFUNCTION_PROFILE --endif -- --ifdef HUGETLB_ALLOCATION --CCOMMON_OPT += -DALLOC_HUGETLB --endif -- --ifdef HUGETLBFILE_ALLOCATION --CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION) --endif -- --ifdef STATIC_ALLOCATION --CCOMMON_OPT += -DALLOC_STATIC --endif -- --ifdef DEVICEDRIVER_ALLOCATION --CCOMMON_OPT += -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\" --endif -- --ifdef MIXED_MEMORY_ALLOCATION --CCOMMON_OPT += -DMIXED_MEMORY_ALLOCATION --endif -- --ifeq ($(OSNAME), SunOS) --TAR = gtar --PATCH = gpatch --GREP = ggrep --AWK = nawk --else --TAR = tar --PATCH = patch --GREP = grep --AWK = awk --endif -- --ifndef MD5SUM --MD5SUM = md5sum --endif -- -- --REVISION = -r$(VERSION) --MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) -- --ifeq ($(DEBUG), 1) --COMMON_OPT += -g --endif -- --ifeq ($(DEBUG), 1) --FCOMMON_OPT += -g --endif -- --ifndef COMMON_OPT --COMMON_OPT = -O2 --endif -- --ifndef FCOMMON_OPT --FCOMMON_OPT = -O2 -frecursive --endif -- --override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) --override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) --override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) --override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) --#MAKEOVERRIDES = -- --ifeq ($(NEED_PIC), 1) --ifeq (,$(findstring PIC,$(FFLAGS))) --ifneq ($(F_COMPILER),IBM) --override FFLAGS += -fPIC --endif --endif --endif -- --#For LAPACK Fortran codes. --#Disable -fopenmp for LAPACK Fortran codes on Windows. --ifdef OS_WINDOWS --LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS)) --LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS)) --else --LAPACK_FFLAGS := $(FFLAGS) --LAPACK_FPFLAGS := $(FPFLAGS) --endif -- --ifeq ($(F_COMPILER),NAG) --LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) --override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) --endif --ifeq ($(F_COMPILER),CRAY) --LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) --override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) --endif -- --LAPACK_CFLAGS = $(CFLAGS) --LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) --LAPACK_CFLAGS += -DLAPACK_ILP64 --endif --endif -- --ifdef OS_WINDOWS --LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS --LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE --endif --ifeq ($(C_COMPILER), LSB) --LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE --endif -- --ifndef SUFFIX --SUFFIX = o --endif -- --ifndef PSUFFIX --PSUFFIX = po --endif -- --ifndef LIBSUFFIX --LIBSUFFIX = a --endif -- --ifneq ($(DYNAMIC_ARCH), 1) --ifndef SMP --LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) --LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) --else --LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX) --LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX) --endif --else --ifndef SMP --LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) --LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) --else --LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX) --LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) --endif --endif -- -- --LIBDLLNAME = $(LIBPREFIX).dll --IMPLIBNAME = lib$(LIBNAMEBASE).dll.a --ifneq ($(OSNAME), AIX) --LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) --else --LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) --endif --LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) --LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) --LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) --LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) -- --LIBS = $(TOPDIR)/$(LIBNAME) --LIBS_P = $(TOPDIR)/$(LIBNAME_P) -- -- --LIB_COMPONENTS = BLAS --ifneq ($(NO_CBLAS), 1) --LIB_COMPONENTS += CBLAS --endif -- --ifneq ($(NO_LAPACK), 1) --LIB_COMPONENTS += LAPACK --ifneq ($(NO_LAPACKE), 1) --LIB_COMPONENTS += LAPACKE --endif --ifeq ($(BUILD_RELAPACK), 1) --LIB_COMPONENTS += ReLAPACK --endif --endif -- --ifeq ($(ONLY_CBLAS), 1) --LIB_COMPONENTS = CBLAS --endif -- --export OSNAME --export ARCH --export CORE --export LIBCORE --export __BYTE_ORDER__ --export ELF_VERSION --export PGCPATH --export CONFIG --export CC --export FC --export BU --export FU --export NEED2UNDERSCORES --export USE_THREAD --export NUM_THREADS --export NUM_CORES --export SMP --export MAKEFILE_RULE --export NEED_PIC --export BINARY --export BINARY32 --export BINARY64 --export F_COMPILER --export C_COMPILER --export USE_OPENMP --export CROSS --export CROSS_SUFFIX --export NOFORTRAN --export C_LAPACK --export NO_FBLAS --export EXTRALIB --export CEXTRALIB --export FEXTRALIB --export HAVE_SSE --export HAVE_SSE2 --export HAVE_SSE3 --export HAVE_SSSE3 --export HAVE_SSE4_1 --export HAVE_SSE4_2 --export HAVE_SSE4A --export HAVE_SSE5 --export HAVE_AVX --export HAVE_AVX2 --export HAVE_FMA3 --export HAVE_VFP --export HAVE_VFPV3 --export HAVE_VFPV4 --export HAVE_NEON --ifndef NO_MSA -- export HAVE_MSA -- export MSA_FLAGS --endif --export KERNELDIR --export FUNCTION_PROFILE --export TARGET_CORE --export NO_AVX512 --export NO_AVX2 --export BUILD_BFLOAT16 --export NO_LSX --export NO_LASX -- --export SBGEMM_UNROLL_M --export SBGEMM_UNROLL_N --export SGEMM_UNROLL_M --export SGEMM_UNROLL_N --export DGEMM_UNROLL_M --export DGEMM_UNROLL_N --export QGEMM_UNROLL_M --export QGEMM_UNROLL_N --export CGEMM_UNROLL_M --export CGEMM_UNROLL_N --export ZGEMM_UNROLL_M --export ZGEMM_UNROLL_N --export XGEMM_UNROLL_M --export XGEMM_UNROLL_N --export CGEMM3M_UNROLL_M --export CGEMM3M_UNROLL_N --export ZGEMM3M_UNROLL_M --export ZGEMM3M_UNROLL_N --export XGEMM3M_UNROLL_M --export XGEMM3M_UNROLL_N -- -- --ifdef USE_CUDA --export CUDADIR --export CUCC --export CUFLAGS --export CULIB --endif -- --.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f -- --.f.$(SUFFIX): -- $(FC) $(FFLAGS) -c $< -o $(@F) -- --.f.$(PSUFFIX): -- $(FC) $(FPFLAGS) -pg -c $< -o $(@F) -- -- --ifdef BINARY64 --PATHSCALEPATH = /opt/pathscale/lib/3.1 --PGIPATH = /opt/pgi/linux86-64/7.1-5/lib --else --PATHSCALEPATH = /opt/pathscale/lib/3.1/32 --PGIPATH = /opt/pgi/linux86/7.1-5/lib --endif -- --ACMLPATH = /opt/acml/4.3.0 --ifneq ($(OSNAME), Darwin) --MKLPATH = /opt/intel/mkl/10.2.2.025/lib --else --MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib --endif --ATLASPATH = /opt/atlas/3.9.17/opteron --FLAMEPATH = $(HOME)/flame/lib --ifneq ($(OSNAME), SunOS) --SUNPATH = /opt/sunstudio12.1 --else --SUNPATH = /opt/SUNWspro --endif -diff --git a/Makefile.tail b/Makefile.tail -index 54ba649..f73a86d 100644 ---- a/Makefile.tail -+++ b/Makefile.tail -@@ -583,7 +583,7 @@ gen_insn_flash.c : - echo 'int i;' >> gen_insn_flash.c - echo '#ifdef __alpha' >> gen_insn_flash.c - echo 'printf(".set noat;.set noreorder;\n");' >> gen_insn_flash.c -- echo 'printf(".arch ev6;.text;.align 5\n");' >> gen_insn_flash.c -+ echo 'printf(".arch sw6;.text;.align 5\n");' >> gen_insn_flash.c - echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c - echo 'printf(".ent insn_flash\n");' >> gen_insn_flash.c - echo 'printf("insn_flash:\n");' >> gen_insn_flash.c -diff --git a/Makefile.tests b/Makefile.tests -deleted file mode 100644 -index b344abc..0000000 ---- a/Makefile.tests -+++ /dev/null -@@ -1,435 +0,0 @@ --TOPDIR = . --include ./Makefile.system -- --BLASDIRS = interface driver/level2 driver/level3 driver/others -- --ifneq ($(DYNAMIC_ARCH), 1) --BLASDIRS += kernel --endif -- --ifdef SANITY_CHECK --BLASDIRS += reference --endif -- --SUBDIRS = $(BLASDIRS) --ifneq ($(NO_LAPACK), 1) --SUBDIRS += lapack --endif -- --RELA = --ifeq ($(BUILD_RELAPACK), 1) --RELA = re_lapack --endif -- --ifeq ($(NO_FORTRAN), 1) --define NOFORTRAN --1 --endef --ifneq ($(NO_LAPACK), 1) --define C_LAPACK --1 --endef --endif --export NOFORTRAN --export NO_LAPACK --export C_LAPACK --endif -- --ifeq ($(F_COMPILER),CRAY) --LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS)) --else --LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) --endif -- --SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test -- --.PHONY : all libs netlib $(RELA) test ctest shared install --.NOTPARALLEL : shared -- --all :: tests -- @echo -- @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" -- @echo -- @echo " OS ... $(OSNAME) " -- @echo " Architecture ... $(ARCH) " --ifndef BINARY64 -- @echo " BINARY ... 32bit " --else -- @echo " BINARY ... 64bit " --endif -- --ifdef INTERFACE64 --ifneq ($(INTERFACE64), 0) -- @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " --endif --endif -- @$(CC) --version > /dev/null 2>&1;\ -- if [ $$? -eq 0 ]; then \ -- cverinfo=`$(CC) --version | sed -n '1p'`; \ -- if [ -z "$${cverinfo}" ]; then \ -- cverinfo=`$(CC) --version | sed -n '2p'`; \ -- fi; \ -- echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ -- else \ -- echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ -- fi --ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -- @$(FC) --version > /dev/null 2>&1;\ -- if [ $$? -eq 0 ]; then \ -- fverinfo=`$(FC) --version | sed -n '1p'`; \ -- if [ -z "$${fverinfo}" ]; then \ -- fverinfo=`$(FC) --version | sed -n '2p'`; \ -- fi; \ -- echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ -- else \ -- echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ -- fi --endif --ifneq ($(OSNAME), AIX) -- @echo -n " Library Name ... $(LIBNAME)" --else -- @echo " Library Name ... $(LIBNAME)" --endif -- --ifndef SMP -- @echo " (Single-threading) " --else -- @echo " (Multi-threading; Max num-threads is $(NUM_THREADS))" --endif -- --ifeq ($(DYNAMIC_ARCH), 1) -- @echo " Supporting multiple $(ARCH) cpu models with minimum requirement for the common code being $(CORE)" --endif -- --ifeq ($(USE_OPENMP), 1) -- @echo -- @echo " Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, " -- @echo " you should use OMP_NUM_THREADS environment variable to control the number of threads." -- @echo --endif -- --ifeq ($(OSNAME), Darwin) -- @echo "WARNING: If you plan to use the dynamic library $(LIBDYNNAME), you must run:" -- @echo -- @echo "\"make PREFIX=/your_installation_path/ install\"." -- @echo -- @echo "(or set PREFIX in Makefile.rule and run make install." -- @echo -- @echo "Note that any flags passed to make during build should also be passed to make install" -- @echo "to circumvent any install errors." -- @echo -- @echo "If you want to move the .dylib to a new location later, make sure you change" -- @echo "the internal name of the dylib with:" -- @echo -- @echo "install_name_tool -id /new/absolute/path/to/$(LIBDYNNAME) $(LIBDYNNAME)" --endif -- @echo -- @echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"." -- @echo -- @echo "Note that any flags passed to make during build should also be passed to make install" -- @echo "to circumvent any install errors." -- @echo -- --shared : libs netlib $(RELA) --ifneq ($(NO_SHARED), 1) --ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) -- @$(MAKE) -C exports so -- @ln -fs $(LIBSONAME) $(LIBPREFIX).so -- @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) --endif --ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) -- @$(MAKE) -C exports so -- @ln -fs $(LIBSONAME) $(LIBPREFIX).so --endif --ifeq ($(OSNAME), Darwin) -- @$(MAKE) -C exports dyn -- @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib -- @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib --endif --ifeq ($(OSNAME), WINNT) -- @$(MAKE) -C exports dll --endif --ifeq ($(OSNAME), CYGWIN_NT) -- @$(MAKE) -C exports dll --endif --endif -- --tests : shared --ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -- touch $(LIBNAME) --ifndef NO_FBLAS -- $(MAKE) -C test all --endif --endif --ifneq ($(ONLY_CBLAS), 1) -- $(MAKE) -C utest all --endif --ifneq ($(NO_CBLAS), 1) --ifneq ($(ONLY_CBLAS), 1) -- $(MAKE) -C ctest all --endif --ifeq ($(CPP_THREAD_SAFETY_TEST), 1) -- $(MAKE) -C cpp_thread_test all --endif --endif -- --libs : --ifeq ($(CORE), UNKNOWN) -- $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) --endif --ifeq ($(NOFORTRAN), 1) -- $(info OpenBLAS: Detecting fortran compiler failed. Can only compile BLAS and f2c-converted LAPACK.) --endif --ifeq ($(NO_STATIC), 1) --ifeq ($(NO_SHARED), 1) -- $(error OpenBLAS: neither static nor shared are enabled.) --endif --endif -- @for d in $(SUBDIRS) ; \ -- do if test -d $$d; then \ -- $(MAKE) -C $$d $(@F) || exit 1 ; \ -- fi; \ -- done --#Save the config files for installation -- @cp Makefile.conf Makefile.conf_last -- @cp config.h config_last.h --ifdef QUAD_PRECISION -- @echo "#define QUAD_PRECISION">> config_last.h --endif --ifeq ($(EXPRECISION), 1) -- @echo "#define EXPRECISION">> config_last.h --endif --## --ifeq ($(DYNAMIC_ARCH), 1) -- @$(MAKE) -C kernel commonlibs || exit 1 -- @for d in $(DYNAMIC_CORE) ; \ -- do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ -- done -- @echo DYNAMIC_ARCH=1 >> Makefile.conf_last --ifeq ($(DYNAMIC_OLDER), 1) -- @echo DYNAMIC_OLDER=1 >> Makefile.conf_last --endif --endif -- @echo TARGET=$(CORE) >> Makefile.conf_last --ifdef USE_THREAD -- @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last --endif --ifdef SMP --ifdef NUM_THREADS -- @echo NUM_THREADS=$(NUM_THREADS) >> Makefile.conf_last --else -- @echo NUM_THREADS=$(NUM_CORES) >> Makefile.conf_last --endif --endif --ifeq ($(USE_OPENMP),1) -- @echo USE_OPENMP=1 >> Makefile.conf_last --endif --ifeq ($(INTERFACE64),1) -- @echo INTERFACE64=1 >> Makefile.conf_last --endif -- @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last -- @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last -- @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) -- @touch lib.grd -- --prof : prof_blas prof_lapack -- --prof_blas : -- ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) -- for d in $(SUBDIRS) ; \ -- do if test -d $$d; then \ -- $(MAKE) -C $$d prof || exit 1 ; \ -- fi; \ -- done --ifeq ($(DYNAMIC_ARCH), 1) -- $(MAKE) -C kernel commonprof || exit 1 --endif -- --blas : -- ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) -- for d in $(BLASDIRS) ; \ -- do if test -d $$d; then \ -- $(MAKE) -C $$d libs || exit 1 ; \ -- fi; \ -- done -- --hpl : -- ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) -- for d in $(BLASDIRS) ../laswp exports ; \ -- do if test -d $$d; then \ -- $(MAKE) -C $$d $(@F) || exit 1 ; \ -- fi; \ -- done --ifeq ($(DYNAMIC_ARCH), 1) -- $(MAKE) -C kernel commonlibs || exit 1 -- for d in $(DYNAMIC_CORE) ; \ -- do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ -- done --endif -- --hpl_p : -- ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) -- for d in $(SUBDIRS) ../laswp exports ; \ -- do if test -d $$d; then \ -- $(MAKE) -C $$d $(@F) || exit 1 ; \ -- fi; \ -- done -- --netlib : lapack_prebuild --ifneq ($(NO_LAPACK), 1) -- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib -- @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib --endif --ifneq ($(NO_LAPACKE), 1) -- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib --endif -- --ifeq ($(NO_LAPACK), 1) --re_lapack : -- --else --re_lapack : -- @$(MAKE) -C relapack --endif -- --prof_lapack : lapack_prebuild -- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof -- --lapack_prebuild : --ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK))) -- -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc --ifeq ($(F_COMPILER), GFORTRAN) -- -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc --else -- -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc --endif -- -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc --ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) -- -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc --else -- -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc --endif -- -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc --ifeq ($(F_COMPILER), GFORTRAN) -- -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc --ifdef SMP --ifeq ($(OSNAME), WINNT) -- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc --else ifeq ($(OSNAME), Haiku) -- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc --else -- -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc --endif --else -- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc --endif --else -- -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc --endif --ifeq ($(BUILD_LAPACK_DEPRECATED), 1) -- -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc --endif --ifeq ($(BUILD_SINGLE), 1) -- -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc --endif --ifeq ($(BUILD_DOUBLE), 1) -- -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc --endif --ifeq ($(BUILD_COMPLEX), 1) -- -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc --endif --ifeq ($(BUILD_COMPLEX16), 1) -- -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc --endif -- -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -- -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc --endif -- --large.tgz : --ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -- if [ ! -a $< ]; then -- -wget http://www.netlib.org/lapack/timing/large.tgz; -- fi --endif -- --timing.tgz : --ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -- if [ ! -a $< ]; then -- -wget http://www.netlib.org/lapack/timing/timing.tgz; -- fi --endif -- --lapack-timing : large.tgz timing.tgz --ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -- (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) -- (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) -- $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING --endif -- -- --lapack-test : -- (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) -- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz -- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc --ifneq ($(CROSS), 1) -- ( cd $(NETLIB_LAPACK_DIR)/INSTALL; $(MAKE) all; ./testlsame; ./testslamch; ./testdlamch; \ -- ./testsecond; ./testdsecnd; ./testieee; ./testversion ) -- (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) --endif -- --lapack-runtest: lapack-test -- ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ -- ./testsecond; ./testdsecnd; ./testieee; ./testversion ) -- (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING ) -- -- --blas-test: -- (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) -- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing -- (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) -- -- --dummy : -- --install : -- $(MAKE) -f Makefile.install install -- --clean :: -- @for d in $(SUBDIRS_ALL) ; \ -- do if test -d $$d; then \ -- $(MAKE) -C $$d $(@F) || exit 1 ; \ -- fi; \ -- done --#ifdef DYNAMIC_ARCH -- @$(MAKE) -C kernel clean --#endif -- @$(MAKE) -C reference clean -- @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 --ifeq ($(OSNAME), Darwin) -- @rm -rf getarch.dSYM getarch_2nd.dSYM --endif -- @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib -- @rm -f cblas.tmp cblas.tmp2 -- @touch $(NETLIB_LAPACK_DIR)/make.inc -- @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean -- @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h -- @$(MAKE) -C relapack clean -- @rm -f *.grd Makefile.conf_last config_last.h -- @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) -- @echo Done. diff --git a/c_check b/c_check -index b018c10..13a7086 100755 +index b018c10a8..66289a683 100755 --- a/c_check +++ b/c_check @@ -84,6 +84,7 @@ case "$data" in @@ -2428,7 +164,7 @@ index b018c10..13a7086 100755 ;; arm|arm64) defined=1 ;; - zarch|e2k|alpha|ia64|riscv64|loonarch64) -+ zarch|e2k|alpha|ia64|riscv64|loonarch64|sw_64) ++ zarch|e2k|alpha|sw_64|ia64|riscv64|loonarch64) defined=1 BINARY=64 ;; @@ -2441,7 +177,7 @@ index b018c10..13a7086 100755 *ARCH_IA64*) architecture=ia64 ;; *ARCH_ARM64*) architecture=arm64 ;; diff --git a/common.h b/common.h -index 4074df0..309c3f9 100644 +index 4074df069..309c3f91d 100644 --- a/common.h +++ b/common.h @@ -420,6 +420,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 @@ -2457,48 +193,10 @@ index 4074df0..309c3f9 100644 #include diff --git a/common_sw_64.h b/common_sw_64.h new file mode 100644 -index 0000000..e14268e +index 000000000..6468431c7 --- /dev/null +++ b/common_sw_64.h -@@ -0,0 +1,200 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ +@@ -0,0 +1,99 @@ +#ifndef COMMON_SW_64 +#define COMMON_SW_64 + @@ -2508,72 +206,14 @@ index 0000000..e14268e +#define WMB asm("memb") +#define RMB asm("memb") + -+static void __inline blas_lock(unsigned long *address){ -+#ifndef __DECC -+ unsigned long tmp1, tmp2,tmp3; -+ asm volatile( -+ "1: ldl %1, %0\n" -+ " bne %1, 2f\n" -+ " ldi %3, %0 \n" -+ " lldl %1, 0(%3)\n" -+ " ldi %2, 1 \n" -+ " wr_f %2 \n" -+ " or %1, 1, %2\n" -+ " memb\n " -+ " lstl %2, 0(%3)\n" -+ " rd_f %2\n" -+ " bne %1, 2f\n" -+ " beq %2, 2f\n" -+ " memb\n " -+ " br $31, 3f\n" -+ "2: br $31, 1b\n" -+ "3:\n" : "=m"(*address), "=&r"(tmp1), "=&r"(tmp2),"=&r"(tmp3) : : "memory"); -+#else -+ asm ( -+ "10:" -+ " ldl %t0, 0(%a0); " -+ " bne %t0, 20f; " -+ " ldi %t2, %a0" -+ " lldl %t0, 0(%t2); " -+ " ldi %t1, 1" -+ " wr_f %t1" -+ " or %t0, 1, %t1;" -+ " memb; " -+ " lstl %t1, 0(%t2); " -+ " rd_f %t1" -+ " bne %t0, 20f; " -+ " beq %t1, 20f; " -+ " memb; " -+ " br %r31,30f; " -+ "20: " -+ " br %r31,10b; " -+ "30:", address); -+#endif -+} -+#define BLAS_LOCK_DEFINED -+ -+static __inline unsigned int rpcc(void){ -+ -+ unsigned int r0; -+ -+#ifndef __DECC -+ asm __volatile__("rtc %0" : "=r"(r0) : : "memory"); -+#else -+ r0 = asm("rtc %v0"); -+#endif -+ ++static __inline unsigned long rpcc(void){ ++ unsigned long r0; ++ asm __volatile__("rtc %0, $31" : "=r"(r0) : : "memory"); + return r0; +} +#define RPCC_DEFINED + -+ -+#define HALT ldl $0, 0($0) -+ -+#ifndef __DECC +#define GET_IMAGE(res) asm __volatile__("fmov $f1, %0" : "=f"(res) : : "memory") -+#else -+#define GET_IMAGE(res) res = dasm("fmov $f1, %f0") -+#endif + +#ifdef SMP +#ifdef USE64BITINT @@ -2581,11 +221,8 @@ index 0000000..e14268e + return x/y; +} +#else -+extern unsigned int blas_quick_divide_table[]; -+ +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ -+ if (y <= 1) return x; -+ return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32); ++ return x/y; +} +#endif +#endif @@ -2607,7 +244,7 @@ index 0000000..e14268e +#endif + +#define PROLOGUE \ -+ .arch sw6; \ ++ .arch sw8a; \ + .set noat; \ + .set noreorder; \ +.text; \ @@ -2618,10 +255,10 @@ index 0000000..e14268e + +#ifdef PROFILE +#define PROFCODE \ -+ ldgp $gp, 0($27); \ -+ ldi $28, _mcount; \ -+ jsr $28, ($28), _mcount; \ -+ .prologue 1 ++ ldgp $gp, 0($27); \ ++ lda $28, _mcount; \ ++ jsr $28, ($28), _mcount; \ ++ .prologue 1 +#else +#define PROFCODE .prologue 0 +#endif @@ -2641,168 +278,53 @@ index 0000000..e14268e + +#ifdef DOUBLE +#define SXADDQ s8addl -+#define SXSUBL s8subl ++#define SXSUBL s8subw +#define LD fldd +#define ST fstd -+#define STQ stq +#define ADD faddd +#define SUB fsubd +#define MUL fmuld +#define DIV fdivd +#else +#define SXADDQ s4addl -+#define SXSUBL s4subl ++#define SXSUBL s4subw +#define LD flds +#define ST fsts -+#define STQ stl +#define ADD fadds +#define SUB fsubs +#define MUL fmuls +#define DIV fdivs +#endif +#endif -diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile -index be8313e..1ab9bb8 100644 ---- a/cpp_thread_test/Makefile -+++ b/cpp_thread_test/Makefile -@@ -1,14 +1,13 @@ --TOPDIR = .. --include $(TOPDIR)/Makefile.system -+include ../Makefile.rule - - all :: dgemv_tester dgemm_tester - - dgemv_tester : -- $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester -+ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester - ./dgemv_tester - - dgemm_tester : dgemv_tester -- $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester -+ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester - ./dgemm_tester - - clean :: diff --git a/cpuid_sw_64.c b/cpuid_sw_64.c new file mode 100644 -index 0000000..61ed28a +index 000000000..b05c655cb --- /dev/null +++ b/cpuid_sw_64.c -@@ -0,0 +1,105 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#if defined(__sw_64__) && defined(__DECC) -+#include -+#endif -+ -+int implver(void){ -+ int arch; -+ -+#ifndef __DECC -+ asm __volatile__("implver %0" : "=r"(arch) : : "memory"); -+#else -+ arch = asm("implver %v0"); -+#endif -+ return arch; -+} -+ -+void get_architecture(void){ -+ printf("SW_64"); -+} -+ -+void get_subarchitecture(void){ -+ printf("sw%d", implver() + 4); +@@ -0,0 +1,14 @@ ++char *get_corename(void) { ++ return "SW8A"; +} + -+void get_subdirname(void){ -+ printf("sw_64"); -+} -+ -+char *get_corename(void){ -+ return "sw_64"; ++void get_libname(void){ ++ printf("sw8a"); +} + +void get_cpuconfig(void){ -+ printf("#define SW%d\n", implver() + 4); -+ -+ switch (implver()){ -+ case 0: -+ printf("#define L1_DATA_SIZE 16384\n"); -+ printf("#define L1_DATA_LINESIZE 32\n"); -+ printf("#define L2_SIZE 2097152\n"); -+ printf("#define L2_LINESIZE 32\n"); ++ printf("#define SW8A\n"); ++ printf("#define L1_DATA_LINESIZE 128\n"); ++ printf("#define L2_SIZE 524288\n"); + printf("#define DTB_DEFAULT_ENTRIES 32\n"); -+ printf("#define DTB_SIZE 8192\n"); -+ break; -+ -+ case 1: -+ printf("#define L1_DATA_SIZE 16384\n"); -+ printf("#define L1_DATA_LINESIZE 32\n"); -+ printf("#define L2_SIZE 2097152\n"); -+ printf("#define L2_LINESIZE 64\n"); -+ printf("#define DTB_DEFAULT_ENTRIES 64\n"); -+ printf("#define DTB_SIZE 8192\n"); -+ break; -+ -+ case 2: -+ printf("#define L1_DATA_SIZE 32768\n"); -+ printf("#define L1_DATA_LINESIZE 64\n"); -+ printf("#define L2_SIZE 4194304\n"); -+ printf("#define L2_LINESIZE 64\n"); -+ printf("#define DTB_DEFAULT_ENTRIES 64\n"); -+ printf("#define DTB_SIZE 8192\n"); -+ break; -+ } -+} -+ -+void get_libname(void){ -+ printf("sw%d\n", implver() + 4); +} diff --git a/ctest.c b/ctest.c -index 2ccae8d..6b21d3a 100644 +index 2ccae8dcc..e6964cb5e 100644 --- a/ctest.c +++ b/ctest.c @@ -137,6 +137,10 @@ ARCH_MIPS ARCH_ALPHA #endif -+#ifdef __sw_64__ ++#ifdef __sw_64 +ARCH_SW_64 +#endif + @@ -2810,85 +332,45 @@ index 2ccae8d..6b21d3a 100644 ARCH_SPARC #endif diff --git a/getarch.c b/getarch.c -index 87384c0..306c389 100644 +index 87384c084..a5e59d5cc 100644 --- a/getarch.c +++ b/getarch.c -@@ -1766,6 +1766,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +@@ -1727,6 +1727,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif -+#ifdef __sw_64__ ++#ifdef __sw_64 +#include "cpuid_sw_64.c" +#define OPENBLAS_SUPPORTED +#endif - - #ifndef OPENBLAS_SUPPORTED - #error "This arch/CPU is not supported by OpenBLAS." -@@ -1831,7 +1835,7 @@ int main(int argc, char *argv[]){ ++ + #ifdef POWER + #include "cpuid_power.c" + #define OPENBLAS_SUPPORTED +@@ -1831,7 +1836,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) -+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__sw_64__) ++#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__sw_64) printf("CORE=%s\n", get_corename()); #endif #endif -@@ -1979,7 +1983,7 @@ printf("ELF_VERSION=2\n"); +@@ -1979,7 +1984,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) -+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__sw_64__) ++#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__sw_64) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif -diff --git a/interface/gbmv.c b/interface/gbmv.c -index 1d58ba8..18aa50e 100644 ---- a/interface/gbmv.c -+++ b/interface/gbmv.c -@@ -236,7 +236,12 @@ void CNAME(enum CBLAS_ORDER order, - - #ifdef SMP - } else { -- -+//ZYX20220118 -+#ifndef TRANSA -+ memset(buffer, 0, nthreads*m*sizeof(FLOAT)); -+#else -+ memset(buffer, 0, nthreads*n*sizeof(FLOAT)); -+#endif - (gbmv_thread[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer, nthreads); - - } -diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 -index 0933736..111924b 100644 ---- a/kernel/Makefile.L1 -+++ b/kernel/Makefile.L1 -@@ -398,12 +398,16 @@ ifndef DSWAPKERNEL - DSWAPKERNEL = swap.S - endif - -+#ZYX20220301 - ifndef CSWAPKERNEL --CSWAPKERNEL = zswap.S -+CSWAPKERNEL = zswap.c -+#CSWAPKERNEL = zswap.S - endif - -+#ZYX20220301 - ifndef ZSWAPKERNEL --ZSWAPKERNEL = zswap.S -+ZSWAPKERNEL = zswap.c -+#ZSWAPKERNEL = zswap.S - endif - - ifndef QSWAPKERNEL diff --git a/kernel/sw_64/KERNEL b/kernel/sw_64/KERNEL new file mode 100644 -index 0000000..d10504b +index 000000000..71e93fb46 --- /dev/null +++ b/kernel/sw_64/KERNEL -@@ -0,0 +1,176 @@ +@@ -0,0 +1,128 @@ +ifndef SAMINKERNEL +SAMINKERNEL = amax.S +endif @@ -2929,24 +411,16 @@ index 0000000..d10504b +IZAMINKERNEL = izamax.S +endif + -+#ZYX20220301 -+ifndef LSAME_KERNEL -+LSAME_KERNEL = ../generic/lsame.c -+endif -+ -+#ZYX20220120 +ifndef ISMINKERNEL -+ISMINKERNEL = amax.S -+#ISMINKERNEL = imin.c ++ISMINKERNEL = imax.S +endif + -+#ZYX20220120 -+#ifndef ISMAXKERNEL -+#ISMAXKERNEL = imax.c -+#endif ++ifndef ISMAXKERNEL ++ISMAXKERNEL = imax.S ++endif + +ifndef IDMINKERNEL -+IDMINKERNEL = amax.S ++IDMINKERNEL = iamax.S +endif + +ifndef CCOPYKERNEL @@ -2973,101 +447,61 @@ index 0000000..d10504b +ZNRM2KERNEL = znrm2.S +endif + -+ifndef SGEMMKERNEL +SGEMMKERNEL = gemm_kernel_4x4.S +SGEMM_BETA = gemm_beta.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -+SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) -+SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) -+endif ++SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) ++SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) + -+ifndef DGEMMKERNEL +DGEMMKERNEL = gemm_kernel_4x4.S +DGEMM_BETA = gemm_beta.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) -+endif + -+ifndef CGEMMKERNEL +CGEMMKERNEL = zgemm_kernel_2x2.S +CGEMM_BETA = zgemm_beta.S +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) -+endif + -+ifndef ZGEMMKERNEL +ZGEMMKERNEL = zgemm_kernel_2x2.S +ZGEMM_BETA = zgemm_beta.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) -+endif + +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S + -+ifndef STRSMKERNEL_LN +STRSMKERNEL_LN = trsm_kernel_4x4_LN.S -+endif -+ifndef STRSMKERNEL_LT +STRSMKERNEL_LT = trsm_kernel_4x4_LT.S -+endif -+ifndef STRSMKERNEL_RN +STRSMKERNEL_RN = trsm_kernel_4x4_LT.S -+endif -+ifndef STRSMKERNEL_RT +STRSMKERNEL_RT = trsm_kernel_4x4_RT.S -+endif + -+ifndef DTRSMKERNEL_LN +DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S -+endif -+ifndef DTRSMKERNEL_LT +DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S -+endif -+ifndef DTRSMKERNEL_RN +DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S -+endif -+ifndef DTRSMKERNEL_RT +DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S -+endif + -+ifndef CTRSMKERNEL_LN +CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S -+endif -+ifndef CTRSMKERNEL_LT +CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S -+endif -+ifndef CTRSMKERNEL_RN +CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S -+endif -+ifndef CTRSMKERNEL_RT +CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S -+endif + -+ifndef ZTRSMKERNEL_LN +ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S -+endif -+ifndef ZTRSMKERNEL_LT +ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S -+endif -+ifndef ZTRSMKERNEL_RN +ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S -+endif -+ifndef ZTRSMKERNEL_RT +ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S -+endif diff --git a/kernel/sw_64/Makefile b/kernel/sw_64/Makefile new file mode 100644 -index 0000000..efae70d +index 000000000..efae70d7b --- /dev/null +++ b/kernel/sw_64/Makefile @@ -0,0 +1,2 @@ @@ -3075,7 +509,7 @@ index 0000000..efae70d + diff --git a/kernel/sw_64/amax.S b/kernel/sw_64/amax.S new file mode 100644 -index 0000000..300a2f7 +index 000000000..b05929b54 --- /dev/null +++ b/kernel/sw_64/amax.S @@ -0,0 +1,283 @@ @@ -3119,7 +553,7 @@ index 0000000..300a2f7 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" ++ + +#define N $16 +#define X $17 @@ -3218,7 +652,7 @@ index 0000000..300a2f7 + fselne $f16, $f12, $f4, $f4 + unop + fabs $f20, $f29 -+ fillcs 56 * SIZE(X) ++ s_fillcs 56 * SIZE(X) + + fselne $f17, $f13, $f5, $f5 + LD $f20, 0 * SIZE(X) @@ -3255,12 +689,12 @@ index 0000000..300a2f7 + CMPLT($f3, $f11), $f19 + addl X, INCX, X + -+ fselne $f16, $f29, $f0, $f0 ++ fselne $f16, $f29, $f0, $f0 + LD $f27, 0 * SIZE(X) + CMPLT($f4, $f12), $f16 + addl X, INCX, X + -+ fselne $f17, $f30, $f1, $f1 ++ fselne $f17, $f30, $f1, $f1 + unop + CMPLT($f5, $f13), $f17 + ldi $1, -1($1) # i -- @@ -3297,9 +731,9 @@ index 0000000..300a2f7 + fabs $f27, $f15 + CMPLT($f3, $f11), $f19 + -+ fselne $f16, $f29, $f0, $f0 ++ fselne $f16, $f29, $f0, $f0 + CMPLT($f4, $f12), $f16 -+ fselne $f17, $f30, $f1, $f1 ++ fselne $f17, $f30, $f1, $f1 + CMPLT($f5, $f13), $f17 + + fselne $f18, $f10, $f2, $f2 @@ -3326,7 +760,7 @@ index 0000000..300a2f7 + CMPLT($f4, $f6), $f17 + + fselne $f16, $f2, $f0, $f0 -+ fselne $f17, $f6, $f4, $f0 ++ fselne $f17, $f6, $f4, $f4 + + CMPLT($f0, $f4), $f16 + fselne $f16, $f4, $f0, $f0 @@ -3364,245 +798,9 @@ index 0000000..300a2f7 + EPILOGUE diff --git a/kernel/sw_64/asum.S b/kernel/sw_64/asum.S new file mode 100644 -index 0000000..54e7fcb +index 000000000..d49f89fae --- /dev/null +++ b/kernel/sw_64/asum.S -@@ -0,0 +1,230 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 88 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define I $19 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 -+ -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f19 -+ -+#define t0 $f20 -+#define t1 $f21 -+#define t2 $f22 -+#define t3 $f23 -+ -+ PROLOGUE -+ PROFCODE -+ -+ fclr s0 -+ unop -+ fclr t0 -+ ble N, $L999 -+ -+ sra N, 3, I -+ fclr s1 -+ fclr s2 -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ fclr t1 -+ SXADDQ INCX, X, X -+ fclr t2 -+ -+ LD a1, 0 * SIZE(X) -+ fclr t3 -+ SXADDQ INCX, X, X -+ fclr s3 -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD a4, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a5, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ ldw $31, PREFETCHSIZE * 2 * SIZE(X) -+ fabs a0, t0 -+ ldi I, -1(I) -+ -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ LD a6, 0 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2,$f24 -+ fmov $f24,s2 -+ LD a7, 0 * SIZE(X) -+ fabs a2, t2 -+ SXADDQ INCX, X, X -+ -+ ADD s3, t3,$f24 -+ fmov $f24,s3 -+ LD a0, 0 * SIZE(X) -+ fabs a3, t3 -+ SXADDQ INCX, X, X -+ -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ LD a1, 0 * SIZE(X) -+ fabs a4, t0 -+ SXADDQ INCX, X, X -+ -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ LD a2, 0 * SIZE(X) -+ fabs a5, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2,$f24 -+ fmov $f24,s2 -+ LD a3, 0 * SIZE(X) -+ fabs a6, t2 -+ SXADDQ INCX, X, X -+ -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ LD a4, 0 * SIZE(X) -+ fabs a7, t3 -+ SXADDQ INCX, X, X -+ -+ LD a5, 0 * SIZE(X) -+ unop -+ SXADDQ INCX, X, X -+ bne I, $L12 -+ .align 4 -+ -+$L13: -+ ADD s0, t0,$f24 -+ fmov $f24,s0 -+ LD a6, 0 * SIZE(X) -+ fabs a0, t0 -+ SXADDQ INCX, X, X -+ -+ ADD s1, t1,$f24 -+ fmov $f24,s1 -+ LD a7, 0 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ fabs a2, t2 -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ fabs a3, t3 -+ -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ fabs a4, t0 -+ ADD s1, t1,$f24 -+ fmov $f24,s1 -+ fabs a5, t1 -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ fabs a6, t2 -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ fabs a7, t3 -+ -+ ADD s1, t1,$f24 -+ fmov $f24,s1 -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ -+ ADD s0, s1, $f24 -+ fmov $f24,s0 -+ ADD s2, s3, $f24 -+ fmov $f24,s2 -+ .align 4 -+ -+$L15: -+ and N, 7, I -+ ADD s0, s2,$f24 -+ fmov $f24,s0 -+ unop -+ ble I, $L999 -+ .align 4 -+ -+$L17: -+ ADD s0, t0, a0 -+ fmov a0,s0 -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ fabs a0, t0 -+ -+ ldi I, -1(I) -+ bne I, $L17 -+ .align 4 -+ -+$L999: -+ ADD s0, t0,$f24 -+ fmov $f24,s0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/asum.S.bak b/kernel/sw_64/asum.S.bak -new file mode 100644 -index 0000000..faf7827 ---- /dev/null -+++ b/kernel/sw_64/asum.S.bak @@ -0,0 +1,206 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ @@ -3644,7 +842,7 @@ index 0000000..faf7827 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" ++ + +#define PREFETCHSIZE 88 + @@ -3711,355 +909,7 @@ index 0000000..faf7827 + +$L12: + ADD s0, t0, s0 -+ fillcs PREFETCHSIZE * 2 * SIZE(X) -+ fabs a0, t0 -+ ldi I, -1(I) -+ -+ ADD s1, t1, s1 -+ LD a6, 0 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2, s2 -+ LD a7, 0 * SIZE(X) -+ fabs a2, t2 -+ SXADDQ INCX, X, X -+ -+ ADD s3, t3, s3 -+ LD a0, 0 * SIZE(X) -+ fabs a3, t3 -+ SXADDQ INCX, X, X -+ -+ ADD s0, t0, s0 -+ LD a1, 0 * SIZE(X) -+ fabs a4, t0 -+ SXADDQ INCX, X, X -+ -+ ADD s1, t1, s1 -+ LD a2, 0 * SIZE(X) -+ fabs a5, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2, s2 -+ LD a3, 0 * SIZE(X) -+ fabs a6, t2 -+ SXADDQ INCX, X, X -+ -+ ADD s3, t3, s3 -+ LD a4, 0 * SIZE(X) -+ fabs a7, t3 -+ SXADDQ INCX, X, X -+ -+ LD a5, 0 * SIZE(X) -+ unop -+ SXADDQ INCX, X, X -+ bne I, $L12 -+ .align 4 -+ -+$L13: -+ ADD s0, t0, s0 -+ LD a6, 0 * SIZE(X) -+ fabs a0, t0 -+ SXADDQ INCX, X, X -+ -+ ADD s1, t1, s1 -+ LD a7, 0 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2, s2 -+ fabs a2, t2 -+ ADD s3, t3, s3 -+ fabs a3, t3 -+ -+ ADD s0, t0, s0 -+ fabs a4, t0 -+ ADD s1, t1, s1 -+ fabs a5, t1 -+ ADD s2, t2, s2 -+ fabs a6, t2 -+ ADD s3, t3, s3 -+ fabs a7, t3 -+ -+ ADD s1, t1, s1 -+ ADD s2, t2, s2 -+ ADD s3, t3, s3 -+ -+ ADD s0, s1, s0 -+ ADD s2, s3, s2 -+ .align 4 -+ -+$L15: -+ and N, 7, I -+ ADD s0, s2, s0 -+ unop -+ ble I, $L999 -+ .align 4 -+ -+$L17: -+ ADD s0, t0, s0 -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ fabs a0, t0 -+ -+ ldi I, -1(I) -+ bne I, $L17 -+ .align 4 -+ -+$L999: -+ ADD s0, t0, s0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/asum_simd.S b/kernel/sw_64/asum_simd.S -new file mode 100644 -index 0000000..f9152ec ---- /dev/null -+++ b/kernel/sw_64/asum_simd.S -@@ -0,0 +1,342 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 88 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define I $19 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 -+ -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f19 -+ -+#define t0 $f20 -+#define t1 $f21 -+#define t2 $f22 -+#define t3 $f23 -+ -+ PROLOGUE -+ PROFCODE -+ -+ fclr s0 -+ unop -+ fclr t0 -+ ble N, $L999 -+ -+ cmpeq INCX, 1, $3 -+ beq $3, $Sub -+ .align 4 -+ -+/* -+ Unloop 16 -+*/ -+ -+/** -+ test the address of X -+**/ -+ and X, (VEC_LEN*SIZE-1), $4 -+ nop -+ nop -+ beq $4, $Align -+ -+/** -+ process the unalign address of X -+**/ -+ -+/*if N is too small(less then unroll size), don't need process unalign X. Just jump to remain section.*/ -+ sra N, 4, I -+ fclr s1 -+ fclr s2 -+ ble I, $Remain -+ -+ sra $4, BASE_SHIFT, $4 -+ ldi $3, VEC_LEN -+ subl $3, $4, $4 -+ nop -+ -+$UnAlign_X_Loop: -+ LD a0, 0 * SIZE(X) -+ addl X, SIZE, X -+ fabs a0, t0 -+ subl $4, 1, $4 -+ -+ ADD s0, t0, s0 -+ subl N, 1, N -+ nop -+ bgt $4, $UnAlign_X_Loop -+ -+$Align: -+ sra N, 4, I -+ fclr s1 -+ fclr s2 -+ ble I, $Remain -+ -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t0 -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t1 -+ -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t2 -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t3 -+ -+ subl I, 1, I -+ addl X, 16*SIZE, X -+ unop -+ ble I, $MainLoopEnd -+ -+$MainLoop: -+ -+ vcpys $f31, a0, a4 -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ vcpys $f31, a1, a5 -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ -+ vcpys $f31, a2, a6 -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ vcpys $f31, a3, a7 -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ -+ VADD t0, a4, t0 -+ subl I, 1, I -+ VADD t1, a5, t1 -+ fillcs PREFETCHSIZE * SIZE(X) -+ -+ VADD t2, a6, t2 -+ addl X, 16*SIZE, X -+ VADD t3, a7, t3 -+ bgt I, $MainLoop -+ -+$MainLoopEnd: -+ /*fabs*/ -+ -+ vcpys $f31, a0, a4 -+ vcpys $f31, a1, a5 -+ vcpys $f31, a2, a6 -+ vcpys $f31, a3, a7 -+ -+ VADD t0, a4, t0 -+ VADD t1, a5, t1 -+ VADD t2, a6, t2 -+ VADD t3, a7, t3 -+ -+ VADD t0, t1, t0 -+ VADD t2, t3, t2 -+ VADD t0, t2, t0 -+ nop -+ -+ vextf t0, 1, s1 -+ vextf t0, 2, s2 -+ vextf t0, 3, s3 -+ nop -+ -+ /*sum*/ -+ ADD t0, s1, t0 -+ ADD s2, s3, s2 -+ ADD s0, t0, s0 -+ nop -+$Remain: -+ and N, 15, I -+ ADD s0, s2, s0 -+ unop -+ ble I, $End -+ .align 4 -+ -+$RemainLoop: -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ fabs a0, t0 -+ ldi I, -1(I) -+ -+ ADD s0, t0, s0 -+ bne I, $RemainLoop -+ .align 4 -+ -+$End: -+ ret -+ -+ -+$Sub: -+ sra N, 3, I -+ fclr s1 -+ fclr s2 -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ fclr t1 -+ SXADDQ INCX, X, X -+ fclr t2 -+ -+ LD a1, 0 * SIZE(X) -+ fclr t3 -+ SXADDQ INCX, X, X -+ fclr s3 -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD a4, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a5, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ADD s0, t0, s0 -+ fillcs PREFETCHSIZE * 2 * SIZE(X) ++ s_fillcs PREFETCHSIZE * 2 * SIZE(X) + fabs a0, t0 + ldi I, -1(I) + @@ -4160,7 +1010,7 @@ index 0000000..f9152ec + EPILOGUE diff --git a/kernel/sw_64/axpy.S b/kernel/sw_64/axpy.S new file mode 100644 -index 0000000..70e97d6 +index 000000000..cc15b6b94 --- /dev/null +++ b/kernel/sw_64/axpy.S @@ -0,0 +1,428 @@ @@ -4204,7 +1054,7 @@ index 0000000..70e97d6 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" ++ + +#define PREFETCHSIZE 40 + @@ -4266,8 +1116,8 @@ index 0000000..70e97d6 + .align 4 + +$Loop: -+ fillcs PREFETCHSIZE * SIZE($24) -+ fillcs PREFETCHSIZE * SIZE($20) ++ fillde_e PREFETCHSIZE * SIZE($24) ++ s_fillcs PREFETCHSIZE * SIZE($20) + + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + LD $f10, 0*SIZE($20) @@ -4592,12 +1442,12 @@ index 0000000..70e97d6 + ldi $sp, 16($sp) + ret + EPILOGUE -diff --git a/kernel/sw_64/axpy_simd.S b/kernel/sw_64/axpy_simd.S +diff --git a/kernel/sw_64/cabs.S b/kernel/sw_64/cabs.S new file mode 100644 -index 0000000..3a2219c +index 000000000..3812edebf --- /dev/null -+++ b/kernel/sw_64/axpy_simd.S -@@ -0,0 +1,655 @@ ++++ b/kernel/sw_64/cabs.S +@@ -0,0 +1,71 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -4638,705 +1488,477 @@ index 0000000..3a2219c + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+ -+#define PREFETCHSIZE 80 -+// #define PREFETCH_DISTANCE_BYTES 384 + -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 16, $26, 0 ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl NAME ++ .ent NAME ++NAME: ++ .frame $sp, 0, $26, 0 + -+ ldl $24, 0($sp) -+ fmov $f19, $f30 -+ ldl $23, 8($sp) -+ ldi $sp, -16($sp) ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++#endif ++ ++ LD $f10, 0($16) ++ LD $f11, SIZE($16) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + -+ fstd $f2, 0($sp) -+ cmpeq $21, 1, $3 -+ fstd $f3, 8($sp) -+ cmpeq $23, 1, $4 -+ -+ ble $16, $End -+ fbeq $f30, $End -+ and $3, $4, $3 -+ beq $3, $Sub -+ -+/** -+ test the address of Y -+**/ -+ and $24, (VEC_LEN*SIZE-1), $4 -+ nop -+ nop -+ beq $4, $Align_Y_Access -+ .align 4 -+/** -+ process the unalign address of Y -+**/ -+ -+ sra $16, 4, $1 -+ and $16, 15, $2 -+ sra $4, BASE_SHIFT, $4 -+ ble $1, $Remain /*if N is too small(less then unroll size), don't need process unalign Y. Just jump to remain section.*/ ++ fabs $f10, $f12 ++ fabs $f11, $f0 ++ ADD $f12, $f0, $f0 ++ ret ++ .end NAME ++ .ident VERSION +diff --git a/kernel/sw_64/cnrm2.S b/kernel/sw_64/cnrm2.S +new file mode 100644 +index 000000000..1892c5f2b +--- /dev/null ++++ b/kernel/sw_64/cnrm2.S +@@ -0,0 +1,428 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+ ldi $3, VEC_LEN -+ subl $3, $4, $4 ++#define ASSEMBLER + -+$UnAlign_Y_Loop: -+ LD $f10, 0*SIZE($20) -+ LD $f11, 0*SIZE($24) -+ addl $20, SIZE, $20 -+ addl $24, SIZE, $24 ++#include "common.h" + -+ MAD $f30, $f10, $f11, $f13 -+ subl $4, 1, $4 -+ subl $16, 1, $16 -+ ST $f13, -1*SIZE($24) -+ bgt $4, $UnAlign_Y_Loop -+ .align 4 -+ -+ -+$Align_Y_Access: + -+ nop -+ sra $16, 4, $1 -+ and $16, 15, $2 -+ ble $1, $Remain ++#define PREFETCH_SIZE 80 + -+/** -+ test the address of X -+**/ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 + -+ and $20, (VEC_LEN*SIZE-1), $3 -+ nop -+ nop -+ bne $3, $UnAlign_X_Access -+ -+ .align 4 -+$Align_Access: -+/*** -+ extern alpha from $f30 to vector 4 in $f13 -+ unloop 16 -+***/ -+ vcpyf $f30, $f13 -+ -+ VLD $f10, 0*VEC_LEN*SIZE($20) -+/* -+ LD $f10, 0*SIZE($20) -+ LD $f11, 1*SIZE($20) -+ LD $f12, 2*SIZE($20) -+ LD $f13, 3*SIZE($20) -+*/ -+ VLD $f18, 0*VEC_LEN*SIZE($24) -+/* -+ LD $f18, 0*SIZE($24) -+ LD $f19, 1*SIZE($24) -+ LD $f20, 2*SIZE($24) -+ LD $f21, 3*SIZE($24) -+*/ -+ VLD $f14, 1*VEC_LEN*SIZE($20) -+ VLD $f15, 2*VEC_LEN*SIZE($20) -+ VLD $f16, 3*VEC_LEN*SIZE($20) -+/* -+ LD $f14, 4*SIZE($20) -+ LD $f15, 5*SIZE($20) -+ LD $f16, 6*SIZE($20) -+ LD $f17, 7*SIZE($20) -+*/ -+ VLD $f22, 1*VEC_LEN*SIZE($24) -+ VLD $f23, 2*VEC_LEN*SIZE($24) -+ VLD $f24, 3*VEC_LEN*SIZE($24) -+/* -+ LD $f22, 4*SIZE($24) -+ LD $f23, 5*SIZE($24) -+ LD $f24, 6*SIZE($24) -+ LD $f25, 7*SIZE($24) -+*/ -+ -+ subl $1, 1, $1 -+ addl $20, 16*SIZE, $20 -+ unop -+ ble $1, $LoopEnd -+ .align 4 ++#define I $0 + -+$Loop: ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 + -+ fillcs PREFETCHSIZE * SIZE($24) -+ fillcs PREFETCHSIZE * SIZE($20) -+/* -+ fillcs PREFETCH_DISTANCE_BYTES($24) -+ fillcs PREFETCH_DISTANCE_BYTES($20) -+*/ -+ -+ VMAD $f13, $f10, $f18, $f0 -+ VLD $f10, 0*VEC_LEN*SIZE($20) -+ VLD $f18, 4*VEC_LEN*SIZE($24) -+/* -+ MAD $f30, $f10, $f18, $f0 # y += alpha * x -+ LD $f10, 0*SIZE($20) -+ MAD $f30, $f11, $f19, $f1 -+ LD $f11, 1*SIZE($20) ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 + -+ MAD $f30, $f12, $f20, $f2 -+ LD $f12, 2*SIZE($20) -+ MAD $f30, $f13, $f21, $f3 -+ LD $f13, 3*SIZE($20) -+*/ -+ -+ VMAD $f13, $f14, $f22, $f26 -+ VLD $f14, 1*VEC_LEN*SIZE($20) -+ VLD $f22, 5*VEC_LEN*SIZE($24) -+ -+ VMAD $f13, $f15, $f23, $f27 -+ VLD $f15, 2*VEC_LEN*SIZE($20) -+ VLD $f23, 6*VEC_LEN*SIZE($24) -+ -+ VMAD $f13, $f16, $f24, $f28 -+ VLD $f16, 3*VEC_LEN*SIZE($20) -+ VLD $f24, 7*VEC_LEN*SIZE($24) -+/* -+ MAD $f30, $f14, $f22, $f26 # y += alpha * x -+ LD $f14, 4*SIZE($20) -+ MAD $f30, $f15, $f23, $f27 -+ LD $f15, 5*SIZE($20) ++ PROLOGUE + -+ MAD $f30, $f16, $f24, $f28 -+ LD $f16, 6*SIZE($20) -+ MAD $f30, $f17, $f25, $f29 -+ LD $f17, 7*SIZE($20) -+*/ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 + -+/* -+ LD $f18, 8*SIZE($24) -+ LD $f19, 9*SIZE($24) -+ LD $f20, 10*SIZE($24) -+ LD $f21, 11*SIZE($24) -+ -+ LD $f22, 12*SIZE($24) -+ LD $f23, 13*SIZE($24) -+ LD $f24, 14*SIZE($24) -+ LD $f25, 15*SIZE($24) -+*/ -+ -+ -+ -+ VST $f0, 0*VEC_LEN*SIZE($24) -+ VST $f26, 1*VEC_LEN*SIZE($24) -+ VST $f27, 2*VEC_LEN*SIZE($24) -+ VST $f28, 3*VEC_LEN*SIZE($24) -+/* -+ ST $f0, 0*SIZE($24) -+ ST $f1, 1*SIZE($24) -+ ST $f2, 2*SIZE($24) -+ ST $f3, 3*SIZE($24) ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) + -+ ST $f26, 4*SIZE($24) -+ ST $f27, 5*SIZE($24) -+ ST $f28, 6*SIZE($24) -+ ST $f29, 7*SIZE($24) -+*/ -+ subl $1, 1, $1 -+ addl $24, 16*SIZE, $24 -+ addl $20, 16*SIZE, $20 -+ bgt $1, $Loop -+ .align 4 ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif + -+$LoopEnd: -+ VMAD $f13, $f10, $f18, $f0 -+ VST $f0, 0*VEC_LEN*SIZE($24) -+ VMAD $f13, $f14, $f22, $f26 -+ VST $f26, 1*VEC_LEN*SIZE($24) -+ VMAD $f13, $f15, $f23, $f27 -+ VST $f27, 2*VEC_LEN*SIZE($24) -+ VMAD $f13, $f16, $f24, $f28 -+ VST $f28, 3*VEC_LEN*SIZE($24) -+ -+/* -+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 -+ MUL $f30, $f11, $f27 -+ MUL $f30, $f12, $f28 -+ MUL $f30, $f13, $f29 ++ fclr a0 ++ sll INCX, ZBASE_SHIFT, INCX ++ fclr a1 ++ ble N, $L999 + -+ ADD $f18, $f26, $f0 -+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 -+ ADD $f19, $f27, $f1 -+ MUL $f30, $f15, $f27 ++ beq INCX, $L999 + -+ ADD $f20, $f28, $f2 -+ MUL $f30, $f16, $f28 -+ ADD $f21, $f29, $f3 -+ MUL $f30, $f17, $f29 ++ fclr a2 ++ cmpeq INCX, 2 * SIZE, $0 ++ fclr a3 ++ beq $0, $L20 + -+ ST $f0, 0*SIZE($24) -+ ADD $f22, $f26, $f0 -+ ST $f1, 1*SIZE($24) -+ ADD $f23, $f27, $f1 ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L15 + -+ ST $f2, 2*SIZE($24) -+ ADD $f24, $f28, $f2 -+ ST $f3, 3*SIZE($24) -+ ADD $f25, $f29, $f3 ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) + -+ ST $f0, 4*SIZE($24) -+ ST $f1, 5*SIZE($24) -+ ST $f2, 6*SIZE($24) -+ ST $f3, 7*SIZE($24) -+*/ -+ addl $24, 16*SIZE, $24 ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) + ++ ldi I, -1(I) ++ ble I, $L12 + .align 4 + -+$Remain: -+ ble $2, $End -+ -+ .align 4 ++$L11: ++ faddd a0, t0, a0 ++ s_fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) + -+$RemainLoop: -+ LD $f10, 0*SIZE($20) -+ LD $f11, 0*SIZE($24) -+ addl $20, SIZE, $20 -+ addl $24, SIZE, $24 ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) + -+ MAD $f30, $f10, $f11, $f13 -+ subl $2, 1, $2 -+ ST $f13, -1*SIZE($24) -+ bgt $2, $RemainLoop -+ .align 4 ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) + -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ ldi $sp, 16($sp) -+ ret -+ .align 4 -+ -+$UnAlign_X_Access: -+/*** -+ extern alpha from $f30 to vector 4 in $f13 -+ unloop 16 -+ unalign access X -+ align access Y -+***/ -+ vcpyf $f30, $f13 -+ VLD_UL $f10, 0*VEC_LEN*SIZE($20) -+ VLD_UH $f2, 1*VEC_LEN*SIZE($20) -+ -+ VLD_UL $f14, 1*VEC_LEN*SIZE($20) -+ VLD_UH $f3, 2*VEC_LEN*SIZE($20) -+ -+ VLD_UL $f15, 2*VEC_LEN*SIZE($20) -+ VLD_UH $f11, 3*VEC_LEN*SIZE($20) -+ -+ VLD_UL $f16, 3*VEC_LEN*SIZE($20) -+ VLD_UH $f12, 4*VEC_LEN*SIZE($20) -+ -+ VLD $f18, 0*VEC_LEN*SIZE($24) -+ VLD $f22, 1*VEC_LEN*SIZE($24) -+ VLD $f23, 2*VEC_LEN*SIZE($24) -+ VLD $f24, 3*VEC_LEN*SIZE($24) -+ -+ vbisw $f10, $f2, $f10 -+ vbisw $f14, $f3, $f14 -+ vbisw $f15, $f11, $f15 -+ vbisw $f16, $f12, $f16 -+ -+ -+ subl $1, 1, $1 -+ addl $20, 16*SIZE, $20 ++ faddd a3, t3, a3 + unop -+ ble $1, $UnAlign_X_LoopEnd -+ .align 4 -+ -+$UnAlign_X_Loop: -+ -+ fillcs PREFETCHSIZE * SIZE($24) -+ fillcs PREFETCHSIZE * SIZE($20) -+ -+ VMAD $f13, $f10, $f18, $f0 -+ VLD_UL $f10, 0*VEC_LEN*SIZE($20) -+ VLD_UH $f2, 1*VEC_LEN*SIZE($20) ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) + ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) + -+ VMAD $f13, $f14, $f22, $f26 -+ VLD_UL $f14, 1*VEC_LEN*SIZE($20) -+ VLD_UH $f3, 2*VEC_LEN*SIZE($20) ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) + -+ VMAD $f13, $f15, $f23, $f27 -+ VLD_UL $f15, 2*VEC_LEN*SIZE($20) -+ VLD_UH $f11, 3*VEC_LEN*SIZE($20) ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) + -+ VMAD $f13, $f16, $f24, $f28 -+ VLD_UL $f16, 3*VEC_LEN*SIZE($20) -+ VLD_UH $f12, 4*VEC_LEN*SIZE($20) ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) + ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) + ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) + ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) + -+ VLD $f18, 4*VEC_LEN*SIZE($24) -+ vbisw $f10, $f2, $f10 -+ VLD $f22, 5*VEC_LEN*SIZE($24) -+ vbisw $f14, $f3, $f14 -+ VLD $f23, 6*VEC_LEN*SIZE($24) -+ vbisw $f15, $f11, $f15 -+ VLD $f24, 7*VEC_LEN*SIZE($24) -+ vbisw $f16, $f12, $f16 -+ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) + -+ VST $f0, 0*VEC_LEN*SIZE($24) -+ VST $f26, 1*VEC_LEN*SIZE($24) -+ VST $f27, 2*VEC_LEN*SIZE($24) -+ VST $f28, 3*VEC_LEN*SIZE($24) -+ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) + -+ subl $1, 1, $1 -+ addl $24, 16*SIZE, $24 -+ addl $20, 16*SIZE, $20 -+ bgt $1, $UnAlign_X_Loop -+ .align 4 -+ -+$UnAlign_X_LoopEnd: -+ VMAD $f13, $f10, $f18, $f0 -+ VST $f0, 0*VEC_LEN*SIZE($24) -+ VMAD $f13, $f14, $f22, $f26 -+ VST $f26, 1*VEC_LEN*SIZE($24) -+ VMAD $f13, $f15, $f23, $f27 -+ VST $f27, 2*VEC_LEN*SIZE($24) -+ VMAD $f13, $f16, $f24, $f28 -+ VST $f28, 3*VEC_LEN*SIZE($24) ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) + -+ addl $24, 16*SIZE, $24 ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) + ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 + .align 4 -+ -+$UnAlign_X_Remain: -+ ble $2, $UnAlign_X_End + -+ .align 4 ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) + -+$UnAlign_X_RemainLoop: -+ LD $f10, 0*SIZE($20) -+ LD $f11, 0*SIZE($24) -+ addl $20, SIZE, $20 -+ addl $24, SIZE, $24 ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) + -+ MAD $f30, $f10, $f11, $f13 -+ subl $2, 1, $2 -+ ST $f13, -1*SIZE($24) -+ bgt $2, $UnAlign_X_RemainLoop -+ .align 4 ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) + -+$UnAlign_X_End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ ldi $sp, 16($sp) -+ ret -+ .align 4 -+ -+ -+$Sub: -+ sra $16, 3, $1 -+ and $16, 7, $2 -+ SXSUBL $16, SIZE, $22 -+ subl $1, 1, $4 -+ -+ ble $1, $SubRemain -+ .align 4 ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) + -+ LD $f10, 0($20) -+ SXADDQ $21, $20, $20 ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) + -+ LD $f11, 0($20) -+ SXADDQ $21, $20, $20 -+ LD $f12, 0($20) -+ SXADDQ $21, $20, $20 ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) + -+ LD $f13, 0($20) -+ SXADDQ $21, $20, $20 -+ LD $f18, 0($24) -+ SXADDQ $23, $24, $22 ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) + -+ LD $f19, 0($22) -+ SXADDQ $23, $22, $22 -+ LD $f20, 0($22) -+ SXADDQ $23, $22, $22 ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) + -+ LD $f21, 0($22) -+ SXADDQ $23, $22, $22 -+ LD $f14, 0($20) -+ SXADDQ $21, $20, $20 ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 + -+ LD $f15, 0($20) -+ SXADDQ $21, $20, $20 -+ LD $f16, 0($20) -+ SXADDQ $21, $20, $20 ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 + -+ LD $f17, 0($20) -+ SXADDQ $21, $20, $20 -+ LD $f22, 0($22) -+ SXADDQ $23, $22, $22 ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 + -+ LD $f23, 0($22) -+ SXADDQ $23, $22, $22 -+ LD $f24, 0($22) -+ SXADDQ $23, $22, $22 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 + -+ LD $f25, 0($22) -+ SXADDQ $23, $22, $22 -+ unop -+ ble $4, $SubLoopEnd ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 + .align 4 + -+$SubLoop: -+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 -+ LD $f10, 0($20) -+ unop -+ SXADDQ $21, $20, $20 -+ -+ MUL $f30, $f11, $f27 -+ LD $f11, 0($20) -+ unop -+ SXADDQ $21, $20, $20 ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 + -+ MUL $f30, $f12, $f28 -+ LD $f12, 0($20) -+ unop -+ SXADDQ $21, $20, $20 ++$L16: ++ LD x0, 0 * SIZE(X) ++ LD x1, 1 * SIZE(X) + -+ MUL $f30, $f13, $f29 -+ LD $f13, 0($20) -+ unop -+ SXADDQ $21, $20, $20 ++ ldi X, 2 * SIZE(X) + -+ ADD $f18, $f26, $f0 -+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 -+ LD $f14, 0($20) -+ SXADDQ $21, $20, $20 ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 + -+ ADD $f19, $f27, $f1 -+ MUL $f30, $f15, $f27 -+ LD $f15, 0($20) -+ SXADDQ $21, $20, $20 ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 + -+ ADD $f20, $f28, $f2 -+ MUL $f30, $f16, $f28 -+ LD $f16, 0($20) -+ SXADDQ $21, $20, $20 ++$L20: ++ fclr t0 ++ sra N, 2, I ++ fclr t1 ++ ble I, $L25 + -+ ADD $f21, $f29, $f3 -+ MUL $f30, $f17, $f29 -+ LD $f17, 0($20) -+ SXADDQ $21, $20, $20 ++ LD x0, 0 * SIZE(X) ++ fclr t2 ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ fclr t3 ++ LD x3, 1 * SIZE(X) ++ addl X, INCX, X + -+ ST $f0, 0($24) -+ SXADDQ $23, $24, $24 -+ ADD $f22, $f26, $f0 -+ unop ++ LD x4, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x5, 1 * SIZE(X) ++ addl X, INCX, X + -+ ST $f1, 0($24) -+ SXADDQ $23, $24, $24 -+ ADD $f23, $f27, $f1 -+ unop ++ LD x6, 0 * SIZE(X) ++ ble I, $L22 ++ .align 4 + -+ ST $f2, 0($24) -+ SXADDQ $23, $24, $24 -+ ADD $f24, $f28, $f2 -+ unop ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X + -+ ST $f3, 0($24) -+ SXADDQ $23, $24, $24 -+ ADD $f25, $f29, $f3 ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 + unop + -+ LD $f18, 0($22) -+ SXADDQ $23, $22, $22 -+ LD $f19, 0($22) -+ SXADDQ $23, $22, $22 ++ faddd a2, t2, a2 ++ LD x1, 1 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X + -+ LD $f20, 0($22) -+ SXADDQ $23, $22, $22 -+ LD $f21, 0($22) -+ SXADDQ $23, $22, $22 ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ unop + -+ LD $f22, 0($22) -+ SXADDQ $23, $22, $22 -+ LD $f23, 0($22) -+ SXADDQ $23, $22, $22 ++ faddd a0, t0, a0 ++ LD x3, 1 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X + -+ LD $f24, 0($22) -+ SXADDQ $23, $22, $22 -+ LD $f25, 0($22) -+ SXADDQ $23, $22, $22 ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ ldi I, -1(I) + -+ ST $f0, 0($24) -+ SXADDQ $23, $24, $24 -+ ST $f1, 0($24) -+ SXADDQ $23, $24, $24 -+ ST $f2, 0($24) -+ SXADDQ $23, $24, $24 -+ ST $f3, 0($24) -+ SXADDQ $23, $24, $24 ++ faddd a2, t2, a2 ++ LD x5, 1 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X + -+ subl $4, 1, $4 -+ bgt $4, $SubLoop ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ bgt I, $L21 + .align 4 + -+$SubLoopEnd: -+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 -+ MUL $f30, $f11, $f27 -+ MUL $f30, $f12, $f28 -+ MUL $f30, $f13, $f29 -+ -+ ADD $f18, $f26, $f0 -+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 -+ ADD $f19, $f27, $f1 -+ MUL $f30, $f15, $f27 -+ -+ ADD $f20, $f28, $f2 -+ MUL $f30, $f16, $f28 -+ ADD $f21, $f29, $f3 -+ MUL $f30, $f17, $f29 -+ -+ ST $f0, 0($24) -+ SXADDQ $23, $24, $24 -+ ST $f1, 0($24) -+ SXADDQ $23, $24, $24 ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X + -+ ST $f2, 0($24) -+ SXADDQ $23, $24, $24 -+ ST $f3, 0($24) -+ SXADDQ $23, $24, $24 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 + -+ ADD $f22, $f26, $f0 -+ ADD $f23, $f27, $f1 -+ ADD $f24, $f28, $f2 -+ ADD $f25, $f29, $f3 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 + -+ ST $f0, 0($24) -+ SXADDQ $23, $24, $24 -+ ST $f1, 0($24) -+ SXADDQ $23, $24, $24 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 + -+ ST $f2, 0($24) -+ SXADDQ $23, $24, $24 -+ ST $f3, 0($24) -+ SXADDQ $23, $24, $24 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 + .align 4 + -+$SubRemain: -+ ble $2, $SubEnd ++$L25: ++ and N, 3, I ++ ble I, $L998 + .align 4 + -+$SubRemainLoop: -+ LD $f10, 0($20) -+ LD $f11, 0($24) -+ SXADDQ $21, $20, $20 ++$L26: ++ LD x0, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X + -+ MUL $f30, $f10, $f12 -+ subl $2, 1, $2 -+ ADD $f11, $f12, $f13 -+ ST $f13, 0($24) -+ SXADDQ $23, $24, $24 ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 + -+ bgt $2, $SubRemainLoop ++ bgt I, $L26 + .align 4 + -+$SubEnd: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ ldi $sp, 16($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/cabs.S b/kernel/sw_64/cabs.S -new file mode 100644 -index 0000000..3f9ed2c ---- /dev/null -+++ b/kernel/sw_64/cabs.S -@@ -0,0 +1,72 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++$L998: ++ faddd a0, t0, a0 ++ faddd a1, t1, a1 + -+ .set noat -+ .set noreorder -+.text -+ .align 5 -+ .globl NAME -+ .ent NAME -+NAME: -+ .frame $sp, 0, $26, 0 ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 + -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ ldi $28, _mcount -+ jsr $28, ($28), _mcount -+#endif ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 + -+ LD $f10, 0($16) -+ LD $f11, SIZE($16) -+#ifndef PROFILE -+ .prologue 0 ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 +#else -+ .prologue 1 ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 +#endif ++ .align 4 + -+ fabs $f10, $f12 -+ fabs $f11, $f0 -+ ADD $f12, $f0, $f29 -+ fmov $f29, $f0 ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif + ret -+ .end NAME -+ .ident VERSION -diff --git a/kernel/sw_64/cabs.S.bak b/kernel/sw_64/cabs.S.bak ++ EPILOGUE +diff --git a/kernel/sw_64/copy.S b/kernel/sw_64/copy.S new file mode 100644 -index 0000000..5fa27af +index 000000000..978c2052b --- /dev/null -+++ b/kernel/sw_64/cabs.S.bak -@@ -0,0 +1,71 @@ ++++ b/kernel/sw_64/copy.S +@@ -0,0 +1,379 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -5377,43 +1999,351 @@ index 0000000..5fa27af + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+ .set noat -+ .set noreorder -+.text -+ .align 5 -+ .globl NAME -+ .ent NAME -+NAME: -+ .frame $sp, 0, $26, 0 + -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ lda $28, _mcount -+ jsr $28, ($28), _mcount -+#endif ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 + -+ LD $f10, 0($16) -+ LD $f11, SIZE($16) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + -+ fabs $f10, $f12 -+ fabs $f11, $f0 -+ ADD $f12, $f0, $f0 -+ ret -+ .end NAME -+ .ident VERSION -diff --git a/kernel/sw_64/cnrm2.S b/kernel/sw_64/cnrm2.S ++ cmpeq INCX, 1, $0 ++ ble N, $End ++#ifndef COMPLEX ++ sra N, 4, $4 ++#else ++ sra N, 3, $4 ++#endif ++ cmpeq INCY, 1, $1 ++ ++ and $0, $1, $0 ++ beq $0, $Sub ++#ifndef COMPLEX ++ and N, 15, $5 ++#else ++ and N, 7, $5 ++#endif ++ ble $4, $Remain ++ ++ LD $f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ LD $f12, 2*SIZE(X) ++ LD $f13, 3*SIZE(X) ++ LD $f14, 4*SIZE(X) ++ LD $f15, 5*SIZE(X) ++ LD $f16, 6*SIZE(X) ++ LD $f17, 7*SIZE(X) ++ ++ LD $f18, 8*SIZE(X) ++ LD $f19, 9*SIZE(X) ++ LD $f20, 10*SIZE(X) ++ LD $f21, 11*SIZE(X) ++ LD $f22, 12*SIZE(X) ++ LD $f23, 13*SIZE(X) ++ LD $f24, 14*SIZE(X) ++ LD $f25, 15*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi X, 16*SIZE(X) ++ ble $4, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ST $f12, 2*SIZE(Y) ++ ST $f13, 3*SIZE(Y) ++ ++ LD $f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ LD $f12, 2*SIZE(X) ++ LD $f13, 3*SIZE(X) ++ ++ ST $f14, 4*SIZE(Y) ++ ST $f15, 5*SIZE(Y) ++ ST $f16, 6*SIZE(Y) ++ ST $f17, 7*SIZE(Y) ++ ++ LD $f14, 4*SIZE(X) ++ LD $f15, 5*SIZE(X) ++ LD $f16, 6*SIZE(X) ++ LD $f17, 7*SIZE(X) ++ ++ ST $f18, 8*SIZE(Y) ++ ST $f19, 9*SIZE(Y) ++ ST $f20, 10*SIZE(Y) ++ ST $f21, 11*SIZE(Y) ++ ++ LD $f18, 8*SIZE(X) ++ LD $f19, 9*SIZE(X) ++ LD $f20, 10*SIZE(X) ++ LD $f21, 11*SIZE(X) ++ ++ ST $f22, 12*SIZE(Y) ++ ST $f23, 13*SIZE(Y) ++ ST $f24, 14*SIZE(Y) ++ ST $f25, 15*SIZE(Y) ++ ++ LD $f22, 12*SIZE(X) ++ LD $f23, 13*SIZE(X) ++ LD $f24, 14*SIZE(X) ++ LD $f25, 15*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi Y, 16*SIZE(Y) ++ ldi X, 16*SIZE(X) ++ bgt $4, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ST $f12, 2*SIZE(Y) ++ ST $f13, 3*SIZE(Y) ++ ST $f14, 4*SIZE(Y) ++ ST $f15, 5*SIZE(Y) ++ ST $f16, 6*SIZE(Y) ++ ST $f17, 7*SIZE(Y) ++ ++ ST $f18, 8*SIZE(Y) ++ ST $f19, 9*SIZE(Y) ++ ST $f20, 10*SIZE(Y) ++ ST $f21, 11*SIZE(Y) ++ ST $f22, 12*SIZE(Y) ++ ST $f23, 13*SIZE(Y) ++ ST $f24, 14*SIZE(Y) ++ ST $f25, 15*SIZE(Y) ++ ++ ldi Y, 16*SIZE(Y) ++ .align 4 ++ ++$Remain: ++ ble $5, $End ++ .align 4 ++ ++$RemainLoop: ++#ifndef COMPLEX ++ LD $f10, 0*SIZE(X) ++ ldi X, 1*SIZE(X) ++ ST $f10, 0*SIZE(Y) ++ ldi Y, 1*SIZE(Y) ++#else ++ LD $f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ ldi X, 2*SIZE(X) ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ldi Y, 2*SIZE(Y) ++#endif ++ subl $5, 1, $5 ++ bgt $5, $RemainLoop ++ .align 4 ++$End: ++ ret ++ .align 4 ++ ++$Sub: ++#ifdef COMPLEX ++ addl INCX, INCX, INCX ++ addl INCY, INCY, INCY ++ and N, 7, $5 ++#else ++ and N, 15, $5 ++#endif ++ ble $4, $SubRemain ++ .align 4 ++ ++$SubMainLoop: ++#ifndef COMPLEX ++ LD $f10, 0(X) ++ SXADDQ INCX, X, X ++ LD $f11, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f12, 0(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f14, 0(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f16, 0(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f18, 0(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f20, 0(X) ++ SXADDQ INCX, X, X ++ LD $f21, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f22, 0(X) ++ SXADDQ INCX, X, X ++ LD $f23, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f24, 0(X) ++ SXADDQ INCX, X, X ++ LD $f25, 0(X) ++ SXADDQ INCX, X, X ++ ++ ST $f10, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f11, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f12, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f13, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f14, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f15, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f16, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f17, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f18, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f19, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f20, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f21, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f22, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f23, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f24, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f25, 0(Y) ++ SXADDQ INCY, Y, Y ++#else ++ LD $f10, 0(X) ++ LD $f11, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f12, 0(X) ++ LD $f13, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f14, 0(X) ++ LD $f15, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f16, 0(X) ++ LD $f17, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f18, 0(X) ++ LD $f19, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f20, 0(X) ++ LD $f21, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f22, 0(X) ++ LD $f23, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f24, 0(X) ++ LD $f25, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST $f10, 0(Y) ++ ST $f11, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f12, 0(Y) ++ ST $f13, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f14, 0(Y) ++ ST $f15, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f16, 0(Y) ++ ST $f17, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f18, 0(Y) ++ ST $f19, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f20, 0(Y) ++ ST $f21, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f22, 0(Y) ++ ST $f23, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f24, 0(Y) ++ ST $f25, SIZE(Y) ++ SXADDQ INCY, Y, Y ++#endif ++ subl $4, 1, $4 ++ bgt $4, $SubMainLoop ++ .align 4 ++ ++$SubRemain: ++ ble $5, $SubEnd ++ .align 4 ++ ++ $SubRemainLoop: ++#ifndef COMPLEX ++ LD $f10, 0(X) ++ SXADDQ INCX, X, X ++ ST $f10, 0(Y) ++ SXADDQ INCY, Y, Y ++#else ++ LD $f10, 0(X) ++ LD $f11, SIZE(X) ++ SXADDQ INCX, X, X ++ ST $f10, 0(Y) ++ ST $f11, SIZE(Y) ++ SXADDQ INCY, Y, Y ++#endif ++ subl $5, 1, $5 ++ bgt $5, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/cscal.S b/kernel/sw_64/cscal.S new file mode 100644 -index 0000000..25eab03 +index 000000000..5ea7cc061 --- /dev/null -+++ b/kernel/sw_64/cnrm2.S -@@ -0,0 +1,440 @@ ++++ b/kernel/sw_64/cscal.S +@@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -5452,414 +2382,191 @@ index 0000000..25eab03 +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + ++ .set noat ++ .set noreorder ++ +#define ASSEMBLER + +#include "common.h" -+#include "version.h" -+ -+#define PREFETCH_SIZE 80 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#define I $0 -+ -+#define a0 $f0 -+#define a1 $f1 -+#define a2 $f10 -+#define a3 $f11 -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 + -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f19 -+#define x4 $f20 -+#define x5 $f21 -+#define x6 $f22 -+#define x7 $f23 + -+ PROLOGUE ++ .globl NAME ++ .ent NAME + -+#if defined(EV4) || defined(EV5) -+ .frame $30,16,$26,0 -+ .mask 0x4000000,-16 -+ ldih $29, 0($27) !gpdisp!1 -+ ldi $29, 0($29) !gpdisp!1 ++NAME: ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++#endif + -+ ldi $sp, -16($sp) -+ ldl $27, sqrt($29) !literal!2 -+ stl $26, 0($sp) ++#ifndef C_INTERFACE ++ ldl $16, 0($16) # n ++ mov $18, $20 # Store Address ++ ldl $19, 0($19) # incx ++ nop + -+ PROFCODE -+ .prologue 1 ++ LD $f1, 0($17) # alpha +#else -+ PROFCODE ++ mov $18, $20 # Store Address ++ fmov $f17, $f1 # alpha +#endif + -+ fclr a0 -+ sll INCX, ZBASE_SHIFT, INCX -+ fclr a1 -+ ble N, $L999 ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif + -+ fclr a2 -+ cmpeq INCX, 2 * SIZE, $0 -+ fclr a3 -+ beq $0, $L20 ++ sra $16, 1, $21 # 4-unrolling ++ ble $16, $End + -+ fclr t0 -+ sra N, 3, I -+ fclr t1 -+ ble I, $L15 ++ ldi $23, -1($19) ++ ble $19, $End + -+ fclr t2 -+ LD x0, 0 * SIZE(X) -+ fclr t3 -+ LD x1, 1 * SIZE(X) ++ bgt $23, $INC_NOT_1 ++ .align 4 + -+ LD x2, 2 * SIZE(X) -+ LD x3, 3 * SIZE(X) -+ LD x4, 4 * SIZE(X) -+ LD x5, 5 * SIZE(X) -+ LD x6, 6 * SIZE(X) -+ LD x7, 7 * SIZE(X) ++ ble $21, $Sub ++ ldi $21, -1($21) ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) + -+ ldi I, -1(I) -+ ble I, $L12 ++ LD $f12, 2*SIZE($18) ++ LD $f13, 3*SIZE($18) ++ ldi $18, 4*SIZE($18) ++ ble $21, $MainRemain + .align 4 + -+$L11: -+ faddd a0, t0, $f25 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++$MainLoop: ++ MUL $f10, $f1, $f20 ++ LD $f10, 0*SIZE($18) ++ MUL $f11, $f1, $f21 ++ LD $f11, 1*SIZE($18) + -+ faddd a1, t1, $f26 -+ mov X, XX -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) ++ MUL $f12, $f1, $f22 ++ LD $f12, 2*SIZE($18) ++ MUL $f13, $f1, $f23 ++ LD $f13, 3*SIZE($18) + -+ faddd a2, t2, $f27 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) ++ ldi $18, 4*SIZE($18) ++ ldi $21, -1($21) + -+ faddd a3, t3, $f28 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ ST $f22, 2*SIZE($20) ++ ST $f23, 3*SIZE($20) ++ ldi $20, 4*SIZE($20) + -+ faddd $f25, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(X) ++ bgt $21, $MainLoop ++ .align 4 + -+ faddd $f26, t1, a1 -+ unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(X) ++$MainRemain: ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ MUL $f12, $f1, $f22 ++ MUL $f13, $f1, $f23 + -+ faddd $f27, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(X) ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ ST $f22, 2*SIZE($20) ++ ST $f23, 3*SIZE($20) ++ ldi $20, 4*SIZE($20) ++ .align 4 + -+ faddd $f28, t3, a3 -+ unop -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(X) ++$Sub: ++ blbc $16, $End ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ .align 4 + -+ faddd a0, t0, $f25 -+ unop -+ fmuld x0, x0, t0 -+ LD x0, 16 * SIZE(X) ++$End: ++ ret ++ .align 4 + -+ faddd a1, t1, $f26 -+ ldi X, 16 * SIZE(X) -+ fmuld x1, x1, t1 -+ LD x1, 17 * SIZE(XX) ++$INC_NOT_1: ++ addl $19, $19, $19 ++ ble $21, $INC_Sub ++ ldi $21, -1($21) + -+ faddd a2, t2, $f27 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 18 * SIZE(XX) ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ SXADDQ $19, $18, $18 + -+ faddd a3, t3, $f28 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 19 * SIZE(XX) ++ LD $f12, 0*SIZE($18) ++ LD $f13, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ble $21, $INC_MainRemain ++ .align 4 + -+ faddd $f25, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 20 * SIZE(XX) -+ -+ faddd $f26, t1, a1 -+ ldi I, -1(I) -+ fmuld x5, x5, t1 -+ LD x5, 21 * SIZE(XX) -+ -+ faddd $f27, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 22 * SIZE(XX) -+ -+ faddd $f28, t3, a3 -+ fmuld x7, x7, t3 -+ LD x7, 23 * SIZE(XX) -+ bgt I, $L11 -+ .align 4 -+ -+$L12: -+ faddd a0, t0, $f25 -+ mov X, XX -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) -+ -+ faddd a1, t1, $f26 -+ unop -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) -+ -+ faddd a2, t2, $f27 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) -+ -+ faddd a3, t3, $f28 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) -+ -+ faddd $f25, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(XX) -+ -+ faddd $f26, t1, a1 -+ unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(XX) -+ -+ faddd $f27, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(XX) -+ -+ faddd $f28, t3, a3 -+ ldi X, 16 * SIZE(X) -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(XX) -+ -+ faddd a0, t0, $f25 -+ fmuld x0, x0, t0 -+ faddd a1, t1, $f26 -+ fmuld x1, x1, t1 -+ -+ faddd a2, t2, $f27 -+ fmuld x2, x2, t2 -+ faddd a3, t3, $f28 -+ fmuld x3, x3, t3 -+ -+ faddd $f25, t0, a0 -+ fmuld x4, x4, t0 -+ faddd $f26, t1, a1 -+ fmuld x5, x5, t1 -+ -+ faddd $f27, t2, a2 -+ fmuld x6, x6, t2 -+ faddd $f28, t3, a2 -+ fmuld x7, x7, t3 -+ -+ faddd a2, t2, $f27 -+ fmov $f27, a2 -+ faddd a3, t3, $f28 -+ fmov $f28, a3 -+ .align 4 -+ -+$L15: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 -+ -+$L16: -+ LD x0, 0 * SIZE(X) -+ LD x1, 1 * SIZE(X) -+ -+ ldi X, 2 * SIZE(X) -+ -+ faddd a0, t0, $f25 -+ fmov $f25, a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, $f26 -+ fmov $f26, a1 -+ fmuld x1, x1, t1 -+ -+ ldi I, -1(I) -+ bgt I, $L16 -+ bsr $31, $L998 -+ .align 4 -+ -+$L20: -+ fclr t0 -+ sra N, 2, I -+ fclr t1 -+ ble I, $L25 -+ -+ LD x0, 0 * SIZE(X) -+ fclr t2 -+ LD x1, 1 * SIZE(X) -+ addl X, INCX, X -+ LD x2, 0 * SIZE(X) -+ fclr t3 -+ LD x3, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ LD x4, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x5, 1 * SIZE(X) -+ addl X, INCX, X ++$INC_MainLoop: ++ MUL $f10, $f1, $f20 ++ LD $f10, 0*SIZE($18) ++ MUL $f11, $f1, $f21 ++ LD $f11, 1*SIZE($18) + -+ LD x6, 0 * SIZE(X) -+ ble I, $L22 -+ .align 4 ++ SXADDQ $19, $18, $18 + -+$L21: -+ faddd a0, t0, $f25 -+ LD x7, 1 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X ++ MUL $f12, $f1, $f22 ++ LD $f12, 0*SIZE($18) ++ MUL $f13, $f1, $f23 ++ LD $f13, 1*SIZE($18) + -+ faddd a1, t1, $f26 -+ LD x0, 0 * SIZE(X) -+ fmuld x1, x1, t1 -+ unop ++ SXADDQ $19, $18, $18 + -+ faddd a2, t2, $f27 -+ LD x1, 1 * SIZE(X) -+ fmuld x2, x2, t2 -+ addl X, INCX, X ++ ST $f20, 0*SIZE($20) ++ ldi $21, -1($21) ++ ST $f21, 1*SIZE($20) ++ SXADDQ $19, $20, $20 + -+ faddd a3, t3, $f28 -+ LD x2, 0 * SIZE(X) -+ fmuld x3, x3, t3 ++ ST $f22, 0*SIZE($20) ++ ST $f23, 1*SIZE($20) ++ SXADDQ $19, $20, $20 + unop -+ -+ faddd $f25, t0, a0 -+ LD x3, 1 * SIZE(X) -+ fmuld x4, x4, t0 -+ addl X, INCX, X -+ -+ faddd $f26, t1, a1 -+ LD x4, 0 * SIZE(X) -+ fmuld x5, x5, t1 -+ ldi I, -1(I) -+ -+ faddd $f27, t2, a2 -+ LD x5, 1 * SIZE(X) -+ fmuld x6, x6, t2 -+ addl X, INCX, X -+ -+ faddd $f28, t3, a3 -+ LD x6, 0 * SIZE(X) -+ fmuld x7, x7, t3 -+ bgt I, $L21 -+ .align 4 -+ -+$L22: -+ faddd a0, t0, $f25 -+ LD x7, 1 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X -+ -+ faddd a1, t1, $f26 -+ fmuld x1, x1, t1 -+ faddd a2, t2, $f27 -+ fmuld x2, x2, t2 -+ -+ faddd a3, t3, $f28 -+ fmuld x3, x3, t3 -+ faddd $f25, t0, a0 -+ fmuld x4, x4, t0 -+ -+ faddd $f26, t1, a1 -+ fmuld x5, x5, t1 -+ faddd $f27, t2, a2 -+ fmuld x6, x6, t2 -+ -+ faddd $f28, t3, a3 -+ fmuld x7, x7, t3 -+ faddd a2, t2, $f27 -+ fmov $f27, a2 -+ faddd a3, t3, $f28 -+ fmov $f28, a3 -+ .align 4 -+ -+$L25: -+ and N, 3, I -+ ble I, $L998 ++ bgt $21, $INC_MainLoop + .align 4 + -+$L26: -+ LD x0, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x1, 1 * SIZE(X) -+ addl X, INCX, X ++$INC_MainRemain: ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ MUL $f12, $f1, $f22 ++ MUL $f13, $f1, $f23 + -+ faddd a0, t0, $f25 -+ fmov $f25, a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, $f26 -+ fmov $f26, a1 -+ fmuld x1, x1, t1 ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ SXADDQ $19, $20, $20 + -+ bgt I, $L26 ++ ST $f22, 0*SIZE($20) ++ ST $f23, 1*SIZE($20) ++ SXADDQ $19, $20, $20 + .align 4 + ++$INC_Sub: ++ blbc $16, $INC_End + -+$L998: -+ faddd a0, t0, $f25 -+ fmov $f25, a0 -+ faddd a1, t1, $f26 -+ fmov $f26, a1 -+ -+ faddd a0, a1, $f25 -+ fmov $f25, a0 -+ faddd a2, a3, $f26 -+ fmov $f26, a2 -+ -+#if defined(EV4) || defined(EV5) -+ faddd a0, a2, $f16 -+ jsr $26, ($27), sqrt !lituse_jsr!2 ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 + -+ ldih $29, 0($26) !gpdisp!3 -+ ldi $29, 0($29) !gpdisp!3 -+#else -+ faddd a0, a2, $f25 -+ fmov $f25, a0 -+ fsqrtd a0, $f25 -+ fmov $f25, a0 -+#endif ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) + .align 4 + -+$L999: -+#if defined(EV4) || defined(EV5) -+ ldl $26, 0($sp) -+ ldi $sp, 16($sp) -+#endif ++$INC_End: + ret -+ EPILOGUE -diff --git a/kernel/sw_64/cnrm2.S.bak b/kernel/sw_64/cnrm2.S.bak ++ .end NAME ++ .ident VERSION +diff --git a/kernel/sw_64/dnrm2.S b/kernel/sw_64/dnrm2.S new file mode 100644 -index 0000000..b2e80e0 +index 000000000..2752e831d --- /dev/null -+++ b/kernel/sw_64/cnrm2.S.bak -@@ -0,0 +1,426 @@ ++++ b/kernel/sw_64/dnrm2.S +@@ -0,0 +1,431 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -5901,7 +2608,7 @@ index 0000000..b2e80e0 +#define ASSEMBLER + +#include "common.h" -+#include "version.h" ++ + +#define PREFETCH_SIZE 80 + @@ -5937,7 +2644,7 @@ index 0000000..b2e80e0 + .mask 0x4000000,-16 + ldih $29, 0($27) !gpdisp!1 + ldi $29, 0($29) !gpdisp!1 -+ ++ + ldi $sp, -16($sp) + ldl $27, sqrt($29) !literal!2 + stq $26, 0($sp) @@ -5947,19 +2654,19 @@ index 0000000..b2e80e0 +#else + PROFCODE +#endif -+ ++ + fclr a0 -+ sll INCX, ZBASE_SHIFT, INCX ++ SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 -+ cmpeq INCX, 2 * SIZE, $0 ++ cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 -+ sra N, 3, I ++ sra N, 4, I + fclr t1 + ble I, $L15 + @@ -5981,7 +2688,7 @@ index 0000000..b2e80e0 + +$L11: + faddd a0, t0, a0 -+ fillcs (PREFETCH_SIZE) * SIZE(X) ++ s_fillcs (PREFETCH_SIZE) * SIZE(X) + fmuld x0, x0, t0 + LD x0, 8 * SIZE(X) + @@ -6122,25 +2829,22 @@ index 0000000..b2e80e0 + faddd a3, t3, a3 + fmuld x7, x7, t3 + ++ faddd a1, t1, a1 + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L15: -+ and N, 7, I ++ and N, 15, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) -+ LD x1, 1 * SIZE(X) -+ -+ ldi X, 2 * SIZE(X) ++ ldi X, 1 * SIZE(X) + + faddd a0, t0, a0 + fmuld x0, x0, t0 -+ faddd a1, t1, a1 -+ fmuld x1, x1, t1 + + ldi I, -1(I) + bgt I, $L16 @@ -6149,120 +2853,128 @@ index 0000000..b2e80e0 + +$L20: + fclr t0 -+ sra N, 2, I ++ sra N, 3, I + fclr t1 + ble I, $L25 + -+ LD x0, 0 * SIZE(X) + fclr t2 -+ LD x1, 1 * SIZE(X) ++ fclr t3 ++ ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x1, 0 * SIZE(X) + addl X, INCX, X + LD x2, 0 * SIZE(X) -+ fclr t3 -+ LD x3, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x3, 0 * SIZE(X) + addl X, INCX, X + + LD x4, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x5, 1 * SIZE(X) + addl X, INCX, X -+ ++ LD x5, 0 * SIZE(X) ++ addl X, INCX, X + LD x6, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ ldi I, -1(I) + ble I, $L22 + .align 4 + +$L21: + faddd a0, t0, a0 -+ LD x7, 1 * SIZE(X) ++ LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x0, 0 * SIZE(X) + fmuld x1, x1, t1 -+ unop ++ addl X, INCX, X + + faddd a2, t2, a2 -+ LD x1, 1 * SIZE(X) ++ LD x1, 0 * SIZE(X) + fmuld x2, x2, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x2, 0 * SIZE(X) + fmuld x3, x3, t3 -+ unop ++ addl X, INCX, X + + faddd a0, t0, a0 -+ LD x3, 1 * SIZE(X) ++ LD x3, 0 * SIZE(X) + fmuld x4, x4, t0 + addl X, INCX, X + + faddd a1, t1, a1 + LD x4, 0 * SIZE(X) + fmuld x5, x5, t1 -+ ldi I, -1(I) ++ addl X, INCX, X + + faddd a2, t2, a2 -+ LD x5, 1 * SIZE(X) ++ LD x5, 0 * SIZE(X) + fmuld x6, x6, t2 + addl X, INCX, X + + faddd a3, t3, a3 + LD x6, 0 * SIZE(X) + fmuld x7, x7, t3 ++ addl X, INCX, X ++ ++ ldi I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + faddd a0, t0, a0 -+ LD x7, 1 * SIZE(X) ++ LD x7, 0 * SIZE(X) + fmuld x0, x0, t0 + addl X, INCX, X + + faddd a1, t1, a1 ++ unop + fmuld x1, x1, t1 ++ unop ++ + faddd a2, t2, a2 + fmuld x2, x2, t2 -+ + faddd a3, t3, a3 + fmuld x3, x3, t3 ++ + faddd a0, t0, a0 + fmuld x4, x4, t0 -+ + faddd a1, t1, a1 + fmuld x5, x5, t1 ++ + faddd a2, t2, a2 + fmuld x6, x6, t2 -+ + faddd a3, t3, a3 + fmuld x7, x7, t3 ++ ++ faddd a1, t1, a1 + faddd a2, t2, a2 + faddd a3, t3, a3 + .align 4 + +$L25: -+ and N, 3, I ++ and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x1, 1 * SIZE(X) + addl X, INCX, X + + faddd a0, t0, a0 + fmuld x0, x0, t0 -+ faddd a1, t1, a1 -+ fmuld x1, x1, t1 + ++ ldi I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + faddd a0, t0, a0 -+ faddd a1, t1, a1 + + faddd a0, a1, a0 + faddd a2, a3, a2 @@ -6286,12 +2998,12 @@ index 0000000..b2e80e0 +#endif + ret + EPILOGUE -diff --git a/kernel/sw_64/copy.S b/kernel/sw_64/copy.S +diff --git a/kernel/sw_64/dot.S b/kernel/sw_64/dot.S new file mode 100644 -index 0000000..c960ac1 +index 000000000..028a55152 --- /dev/null -+++ b/kernel/sw_64/copy.S -@@ -0,0 +1,379 @@ ++++ b/kernel/sw_64/dot.S +@@ -0,0 +1,534 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -6332,7 +3044,13 @@ index 0000000..c960ac1 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" ++ ++ ++#define ADD faddd ++#define MUL fmuld ++ ++ ++#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 @@ -6340,343 +3058,492 @@ index 0000000..c960ac1 +#define Y $19 +#define INCY $20 + -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 ++#define I $5 + -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif ++#define s0 $f0 ++#define s1 $f30 ++#define s2 $f1 ++#define s3 $f2 + -+ cmpeq INCX, 1, $0 -+ ble N, $End -+#ifndef COMPLEX -+ sra N, 4, $4 -+#else -+ sra N, 3, $4 -+#endif -+ cmpeq INCY, 1, $1 ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 + -+ and $0, $1, $0 -+ beq $0, $Sub -+#ifndef COMPLEX -+ and N, 15, $5 -+#else -+ and N, 7, $5 -+#endif -+ ble $4, $Remain ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 + -+ LD $f10, 0*SIZE(X) -+ LD $f11, 1*SIZE(X) -+ LD $f12, 2*SIZE(X) -+ LD $f13, 3*SIZE(X) -+ LD $f14, 4*SIZE(X) -+ LD $f15, 5*SIZE(X) -+ LD $f16, 6*SIZE(X) -+ LD $f17, 7*SIZE(X) ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 + -+ LD $f18, 8*SIZE(X) -+ LD $f19, 9*SIZE(X) -+ LD $f20, 10*SIZE(X) -+ LD $f21, 11*SIZE(X) -+ LD $f22, 12*SIZE(X) -+ LD $f23, 13*SIZE(X) -+ LD $f24, 14*SIZE(X) -+ LD $f25, 15*SIZE(X) + -+ subl $4, 1, $4 -+ ldi X, 16*SIZE(X) -+ ble $4, $MainLoopEnd -+ .align 4 ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 + -+$MainLoop: -+ ST $f10, 0*SIZE(Y) -+ ST $f11, 1*SIZE(Y) -+ ST $f12, 2*SIZE(Y) -+ ST $f13, 3*SIZE(Y) ++ ldi $sp, -16($sp) ++ fclr s0 ++ fstd $f2, 0($sp) ++ fclr s1 + -+ LD $f10, 0*SIZE(X) -+ LD $f11, 1*SIZE(X) -+ LD $f12, 2*SIZE(X) -+ LD $f13, 3*SIZE(X) ++ fclr s2 ++ nop ++ fclr s3 ++ ble N, $L999 + -+ ST $f14, 4*SIZE(Y) -+ ST $f15, 5*SIZE(Y) -+ ST $f16, 6*SIZE(Y) -+ ST $f17, 7*SIZE(Y) ++ fclr t0 ++ cmpeq INCX, 1, $21 ++ fclr t1 ++ cmpeq INCY, 1, $22 ++ fclr t2 ++ and $21, $22, $22 ++ fclr t3 ++ beq $22, $L20 + -+ LD $f14, 4*SIZE(X) -+ LD $f15, 5*SIZE(X) -+ LD $f16, 6*SIZE(X) -+ LD $f17, 7*SIZE(X) ++#ifndef DOUBLE ++ srl N, 4, I ++ ble I, $L15 + -+ ST $f18, 8*SIZE(Y) -+ ST $f19, 9*SIZE(Y) -+ ST $f20, 10*SIZE(Y) -+ ST $f21, 11*SIZE(Y) ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) + -+ LD $f18, 8*SIZE(X) -+ LD $f19, 9*SIZE(X) -+ LD $f20, 10*SIZE(X) -+ LD $f21, 11*SIZE(X) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) + -+ ST $f22, 12*SIZE(Y) -+ ST $f23, 13*SIZE(Y) -+ ST $f24, 14*SIZE(Y) -+ ST $f25, 15*SIZE(Y) ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) + -+ LD $f22, 12*SIZE(X) -+ LD $f23, 13*SIZE(X) -+ LD $f24, 14*SIZE(X) -+ LD $f25, 15*SIZE(X) ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 16 * SIZE, X ++ subl I, 1, I + -+ subl $4, 1, $4 -+ ldi Y, 16*SIZE(Y) -+ ldi X, 16*SIZE(X) -+ bgt $4, $MainLoop ++ addl Y, 16 * SIZE, Y ++ ble I, $L13 + .align 4 + -+$MainLoopEnd: -+ ST $f10, 0*SIZE(Y) -+ ST $f11, 1*SIZE(Y) -+ ST $f12, 2*SIZE(Y) -+ ST $f13, 3*SIZE(Y) -+ ST $f14, 4*SIZE(Y) -+ ST $f15, 5*SIZE(Y) -+ ST $f16, 6*SIZE(Y) -+ ST $f17, 7*SIZE(Y) -+ -+ ST $f18, 8*SIZE(Y) -+ ST $f19, 9*SIZE(Y) -+ ST $f20, 10*SIZE(Y) -+ ST $f21, 11*SIZE(Y) -+ ST $f22, 12*SIZE(Y) -+ ST $f23, 13*SIZE(Y) -+ ST $f24, 14*SIZE(Y) -+ ST $f25, 15*SIZE(Y) ++$L12: ++ s_fillcs PREFETCHSIZE * 2 * SIZE(X) ++ subl I, 1, I ++ s_fillcs PREFETCHSIZE * 2 * SIZE(Y) ++ addl X, 16 * SIZE, X + -+ ldi Y, 16*SIZE(Y) -+ .align 4 ++ ADD s0, t0, s0 ++ LD b6, -10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) + -+$Remain: -+ ble $5, $End -+ .align 4 ++ ADD s1, t1, s1 ++ LD a0, -24 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -23 * SIZE(X) + -+$RemainLoop: -+#ifndef COMPLEX -+ LD $f10, 0*SIZE(X) -+ ldi X, 1*SIZE(X) -+ ST $f10, 0*SIZE(Y) -+ ldi Y, 1*SIZE(Y) -+#else -+ LD $f10, 0*SIZE(X) -+ LD $f11, 1*SIZE(X) -+ ldi X, 2*SIZE(X) -+ ST $f10, 0*SIZE(Y) -+ ST $f11, 1*SIZE(Y) -+ ldi Y, 2*SIZE(Y) -+#endif -+ subl $5, 1, $5 -+ bgt $5, $RemainLoop -+ .align 4 -+$End: -+ ret -+ .align 4 ++ ADD s2, t2, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -22 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -21 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -20 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -19 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -18 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -17 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -16 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -15 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -14 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -13 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -12 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -11 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -10 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -9 * SIZE(X) ++ ++ addl Y, 16 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD b6,-10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -8 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L15: ++ ADD s0, t0, s0 ++ and N, 15, I ++ ADD s1, t1, s1 ++ ble I, $L18 ++ .align 4 + -+$Sub: -+#ifdef COMPLEX -+ addl INCX, INCX, INCX -+ addl INCY, INCY, INCY -+ and N, 7, $5 +#else -+ and N, 15, $5 -+#endif -+ ble $4, $SubRemain ++ ++ srl N, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 8 * SIZE, X ++ subl I, 1, I ++ ++ addl Y, 8 * SIZE, Y ++ ble I, $L13 + .align 4 + -+$SubMainLoop: -+#ifndef COMPLEX -+ LD $f10, 0(X) -+ SXADDQ INCX, X, X -+ LD $f11, 0(X) -+ SXADDQ INCX, X, X ++$L12: ++ s_fillcs PREFETCHSIZE * SIZE(X) ++ subl I, 1, I ++ s_fillcs PREFETCHSIZE * SIZE(Y) ++ addl X, 8 * SIZE, X + -+ LD $f12, 0(X) -+ SXADDQ INCX, X, X -+ LD $f13, 0(X) -+ SXADDQ INCX, X, X ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) + -+ LD $f14, 0(X) -+ SXADDQ INCX, X, X -+ LD $f15, 0(X) -+ SXADDQ INCX, X, X ++ ADD s1, t1, s1 ++ LD a0, -8 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) + -+ LD $f16, 0(X) -+ SXADDQ INCX, X, X -+ LD $f17, 0(X) -+ SXADDQ INCX, X, X ++ ADD s2, t2, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) + -+ LD $f18, 0(X) -+ SXADDQ INCX, X, X -+ LD $f19, 0(X) -+ SXADDQ INCX, X, X ++ ADD s3, t3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) + -+ LD $f20, 0(X) -+ SXADDQ INCX, X, X -+ LD $f21, 0(X) -+ SXADDQ INCX, X, X ++ ADD s0, t0, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) + -+ LD $f22, 0(X) -+ SXADDQ INCX, X, X -+ LD $f23, 0(X) -+ SXADDQ INCX, X, X ++ ADD s1, t1, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) + -+ LD $f24, 0(X) -+ SXADDQ INCX, X, X -+ LD $f25, 0(X) -+ SXADDQ INCX, X, X ++ ADD s2, t2, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) + -+ ST $f10, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f11, 0(Y) -+ SXADDQ INCY, Y, Y ++ ADD s3, t3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) + -+ ST $f12, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f13, 0(Y) -+ SXADDQ INCY, Y, Y ++ addl Y, 8 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 + -+ ST $f14, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f15, 0(Y) -+ SXADDQ INCY, Y, Y ++$L13: ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 + -+ ST $f16, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f17, 0(Y) -+ SXADDQ INCY, Y, Y ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 + -+ ST $f18, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f19, 0(Y) -+ SXADDQ INCY, Y, Y ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 ++ .align 4 + -+ ST $f20, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f21, 0(Y) -+ SXADDQ INCY, Y, Y ++$L15: ++ ADD s0, t0, s0 ++ and N, 7, I ++ ADD s1, t1, s1 ++ ble I, $L18 ++ .align 4 + -+ ST $f22, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f23, 0(Y) -+ SXADDQ INCY, Y, Y ++#endif + -+ ST $f24, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f25, 0(Y) -+ SXADDQ INCY, Y, Y -+#else -+ LD $f10, 0(X) -+ LD $f11, SIZE(X) -+ SXADDQ INCX, X, X ++$L16: ++ LD a0, 0 * SIZE(X) ++ addl X, SIZE, X ++ LD b0, 0 * SIZE(Y) ++ addl Y, SIZE, Y + -+ LD $f12, 0(X) -+ LD $f13, SIZE(X) -+ SXADDQ INCX, X, X ++ ADD s2, t2, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L16 ++ .align 4 + -+ LD $f14, 0(X) -+ LD $f15, SIZE(X) -+ SXADDQ INCX, X, X ++$L18: ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ br $L999 ++ .align 4 + -+ LD $f16, 0(X) -+ LD $f17, SIZE(X) -+ SXADDQ INCX, X, X ++$L20: ++ srl N, 2, I ++ ble I, $L25 + -+ LD $f18, 0(X) -+ LD $f19, SIZE(X) ++ LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X -+ -+ LD $f20, 0(X) -+ LD $f21, SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y + -+ LD $f22, 0(X) -+ LD $f23, SIZE(X) ++ LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X -+ -+ LD $f24, 0(X) -+ LD $f25, SIZE(X) ++ LD b2, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) ++ subl I, 1, I + -+ ST $f10, 0(Y) -+ ST $f11, SIZE(Y) + SXADDQ INCY, Y, Y ++ ble I, $L23 ++ .align 4 + -+ ST $f12, 0(Y) -+ ST $f13, SIZE(Y) -+ SXADDQ INCY, Y, Y ++$L22: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 + -+ ST $f14, 0(Y) -+ ST $f15, SIZE(Y) ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y -+ -+ ST $f16, 0(Y) -+ ST $f17, SIZE(Y) ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + -+ ST $f18, 0(Y) -+ ST $f19, SIZE(Y) ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y -+ -+ ST $f20, 0(Y) -+ ST $f21, SIZE(Y) ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + -+ ST $f22, 0(Y) -+ ST $f23, SIZE(Y) -+ SXADDQ INCY, Y, Y ++ subl I, 1, I ++ bgt I, $L22 ++ nop ++ fnop ++ .align 4 + -+ ST $f24, 0(Y) -+ ST $f25, SIZE(Y) -+ SXADDQ INCY, Y, Y -+#endif -+ subl $4, 1, $4 -+ bgt $4, $SubMainLoop ++$L23: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 + .align 4 + -+$SubRemain: -+ ble $5, $SubEnd ++$L25: ++ ADD s0, t0, s0 ++ and N, 3, I ++ ADD s1, t1, s1 ++ ble I, $L28 + .align 4 + -+ $SubRemainLoop: -+#ifndef COMPLEX -+ LD $f10, 0(X) -+ SXADDQ INCX, X, X -+ ST $f10, 0(Y) -+ SXADDQ INCY, Y, Y -+#else -+ LD $f10, 0(X) -+ LD $f11, SIZE(X) ++$L26: ++ LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X -+ ST $f10, 0(Y) -+ ST $f11, SIZE(Y) ++ LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y -+#endif -+ subl $5, 1, $5 -+ bgt $5, $SubRemainLoop ++ ++ ADD s2, t2, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L26 + .align 4 + -+$SubEnd: ++$L28: ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ .align 4 ++ ++$L999: ++ ADD s2, s3, s2 ++ fldd $f2, 0($sp) ++ ADD s0, s1, s0 ++ ldi $sp, 16($sp) ++ ++ ADD s0, s2, s0 + ret ++ + EPILOGUE -diff --git a/kernel/sw_64/copy_simd.S b/kernel/sw_64/copy_simd.S +diff --git a/kernel/sw_64/gemm_beta.S b/kernel/sw_64/gemm_beta.S new file mode 100644 -index 0000000..84e96a9 +index 000000000..00e2d12d1 --- /dev/null -+++ b/kernel/sw_64/copy_simd.S -@@ -0,0 +1,563 @@ ++++ b/kernel/sw_64/gemm_beta.S +@@ -0,0 +1,179 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -6717,535 +3584,151 @@ index 0000000..84e96a9 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 80 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 + -+ PROLOGUE -+ PROFCODE ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++CNAME: + .frame $sp, 0, $26, 0 + ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++#endif ++ ++ ldl $18, 16($sp) ++ ble $16, $End ++ ldl $19, 24($sp) ++ ble $17, $End +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + -+ cmpeq INCX, 1, $0 -+ ble N, $End -+#ifndef COMPLEX -+ sra N, 4, $4 -+#else -+ sra N, 3, $4 -+#endif -+ cmpeq INCY, 1, $1 -+ -+ and $0, $1, $0 -+ beq $0, $Sub -+#ifndef COMPLEX -+ and N, 15, $5 -+#else -+ and N, 7, $5 -+#endif -+ ble $4, $Remain -+ -+/** -+ test the address of X & Y -+**/ ++ fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO) ++ .align 4 + -+ and Y, (VEC_LEN*SIZE-1), $6 -+ and X, (VEC_LEN*SIZE-1), $7 -+ bgt $6, $UnAlign_Y_ACCESS -+ bgt $7, $UnAlign_X_ACCESS -+ -+ .align 4 -+ -+$Align: -+ VLD $f10, 0*VEC_LEN*SIZE(X) -+ VLD $f11, 1*VEC_LEN*SIZE(X) -+ VLD $f12, 2*VEC_LEN*SIZE(X) -+ VLD $f13, 3*VEC_LEN*SIZE(X) -+ -+ subl $4, 1, $4 -+ ldi X, 16*SIZE(X) -+ ble $4, $MainLoopEnd -+ .align 4 -+ -+$MainLoop: -+ fillcs PREFETCHSIZE * SIZE(X) -+ fillcs PREFETCHSIZE * SIZE(Y) -+ -+ VST $f10, 0*VEC_LEN*SIZE(Y) -+ VST $f11, 1*VEC_LEN*SIZE(Y) -+ VST $f12, 2*VEC_LEN*SIZE(Y) -+ VST $f13, 3*VEC_LEN*SIZE(Y) -+ -+ VLD $f10, 0*VEC_LEN*SIZE(X) -+ VLD $f11, 1*VEC_LEN*SIZE(X) -+ VLD $f12, 2*VEC_LEN*SIZE(X) -+ VLD $f13, 3*VEC_LEN*SIZE(X) -+ -+ subl $4, 1, $4 -+ ldi Y, 16*SIZE(Y) -+ ldi X, 16*SIZE(X) -+ bgt $4, $MainLoop -+ .align 4 -+ -+$MainLoopEnd: -+ -+ VST $f10, 0*VEC_LEN*SIZE(Y) -+ VST $f11, 1*VEC_LEN*SIZE(Y) -+ VST $f12, 2*VEC_LEN*SIZE(Y) -+ VST $f13, 3*VEC_LEN*SIZE(Y) -+ -+ ldi Y, 16*SIZE(Y) -+ .align 4 -+ -+$Remain: -+ ble $5, $End -+ .align 4 -+ -+$RemainLoop: -+#ifndef COMPLEX -+ LD $f10, 0*SIZE(X) -+ ldi X, 1*SIZE(X) -+ ST $f10, 0*SIZE(Y) -+ ldi Y, 1*SIZE(Y) -+#else -+ LD $f10, 0*SIZE(X) -+ LD $f11, 1*SIZE(X) -+ ldi X, 2*SIZE(X) -+ ST $f10, 0*SIZE(Y) -+ ST $f11, 1*SIZE(Y) -+ ldi Y, 2*SIZE(Y) -+#endif -+ subl $5, 1, $5 -+ bgt $5, $RemainLoop -+ .align 4 -+$End: -+ ret ++$BETA_NE_ZERO: ++ sra $16, 3, $2 # i = (m >> 3) ++ mov $18, $1 # c_offset = c ++ ldi $17, -1($17) # j -- ++ ble $2,$L52 + .align 4 + -+$UnAlign_X_ACCESS: -+ and Y, (VEC_LEN*SIZE-1), $7 -+ nop -+ nop -+ bgt $7, $UnAlign_XY_ACCESS -+ .align 4 -+ -+ VLD_UL $f10, 0*VEC_LEN*SIZE(X) -+ VLD_UH $f14, 1*VEC_LEN*SIZE(X) -+ -+ VLD_UL $f11, 1*VEC_LEN*SIZE(X) -+ VLD_UH $f15, 2*VEC_LEN*SIZE(X) ++$L51: ++ fillde 64($1) ++ ldi $2, -1($2) + -+ VLD_UL $f12, 2*VEC_LEN*SIZE(X) -+ VLD_UH $f16, 3*VEC_LEN*SIZE(X) ++ LD $f14, 0*SIZE($1) ++ LD $f15, 1*SIZE($1) ++ LD $f16, 2*SIZE($1) ++ LD $f17, 3*SIZE($1) ++ LD $f18, 4*SIZE($1) ++ LD $f11, 5*SIZE($1) ++ LD $f21, 6*SIZE($1) ++ LD $f22, 7*SIZE($1) + ++ MUL $f19, $f14, $f23 ++ MUL $f19, $f15, $f24 ++ MUL $f19, $f16, $f25 ++ MUL $f19, $f17, $f26 ++ MUL $f19, $f18, $f27 ++ MUL $f19, $f11, $f28 ++ MUL $f19, $f21, $f29 ++ MUL $f19, $f22, $f30 + -+ VLD_UL $f13, 3*VEC_LEN*SIZE(X) -+ VLD_UH $f17, 4*VEC_LEN*SIZE(X) ++ ST $f23, 0*SIZE($1) ++ ST $f24, 1*SIZE($1) ++ ST $f25, 2*SIZE($1) ++ ST $f26, 3*SIZE($1) ++ ST $f27, 4*SIZE($1) ++ ST $f28, 5*SIZE($1) ++ ST $f29, 6*SIZE($1) ++ ST $f30, 7*SIZE($1) + -+ subl $4, 1, $4 -+ vbisw $f10, $f14, $f10 -+ ldi X, 16*SIZE(X) -+ vbisw $f11, $f15, $f11 -+ -+ vbisw $f12, $f16, $f12 -+ vbisw $f13, $f17, $f13 -+ nop -+ ble $4, $UnAlign_X_MainLoopEnd ++ ldi $1,8*SIZE($1) ++ bgt $2,$L51 + .align 4 + -+$UnAlign_X_MainLoop: -+ fillcs PREFETCHSIZE * SIZE(X) -+ fillcs PREFETCHSIZE * SIZE(Y) -+ -+ VST $f10, 0*VEC_LEN*SIZE(Y) -+ VST $f11, 1*VEC_LEN*SIZE(Y) -+ VST $f12, 2*VEC_LEN*SIZE(Y) -+ VST $f13, 3*VEC_LEN*SIZE(Y) -+ -+ VLD_UL $f10, 0*VEC_LEN*SIZE(X) -+ VLD_UH $f14, 1*VEC_LEN*SIZE(X) -+ VLD_UL $f11, 1*VEC_LEN*SIZE(X) -+ VLD_UH $f15, 2*VEC_LEN*SIZE(X) ++$L52: ++ and $16, 7, $2 ++ ble $2,$L54 ++ .align 4 + -+ VLD_UL $f12, 2*VEC_LEN*SIZE(X) -+ VLD_UH $f16, 3*VEC_LEN*SIZE(X) -+ VLD_UL $f13, 3*VEC_LEN*SIZE(X) -+ VLD_UH $f17, 4*VEC_LEN*SIZE(X) -+ -+ subl $4, 1, $4 -+ vbisw $f10, $f14, $f10 -+ ldi Y, 16*SIZE(Y) -+ vbisw $f11, $f15, $f11 -+ -+ vbisw $f12, $f16, $f12 -+ ldi X, 16*SIZE(X) -+ vbisw $f13, $f17, $f13 -+ bgt $4, $UnAlign_X_MainLoop ++$L53: ++ LD $f12, 0($1) ++ ldi $2, -1($2) ++ MUL $f19, $f12, $f23 ++ ST $f23, 0($1) ++ ldi $1, SIZE($1) ++ bgt $2,$L53 + .align 4 + -+$UnAlign_X_MainLoopEnd: -+ -+ VST $f10, 0*VEC_LEN*SIZE(Y) -+ VST $f11, 1*VEC_LEN*SIZE(Y) -+ VST $f12, 2*VEC_LEN*SIZE(Y) -+ VST $f13, 3*VEC_LEN*SIZE(Y) -+ -+ ldi Y, 16*SIZE(Y) -+ ble $5, $End -+ jmp $RemainLoop -+ ++$L54: ++ SXADDQ $19, $18, $18 # c += ldc ++ bgt $17,$BETA_NE_ZERO ++ clr $0 ++ ret + .align 4 + -+$UnAlign_Y_ACCESS: -+ and X, (VEC_LEN*SIZE-1), $7 -+ nop -+ nop -+ bgt $7, $UnAlign_XY_ACCESS -+ .align 4 -+ -+ VLD $f10, 0*VEC_LEN*SIZE(X) -+ VLD $f11, 1*VEC_LEN*SIZE(X) -+ VLD $f12, 2*VEC_LEN*SIZE(X) -+ VLD $f13, 3*VEC_LEN*SIZE(X) -+ -+ subl $4, 1, $4 -+ ldi X, 16*SIZE(X) -+ ble $4, $UnAlign_Y_MainLoopEnd -+ .align 4 -+ -+$UnAlign_Y_MainLoop: -+ fillcs PREFETCHSIZE * SIZE(X) -+ fillcs PREFETCHSIZE * SIZE(Y) -+ -+ VST_UL $f10, 0*VEC_LEN*SIZE(Y) -+ VST_UH $f10, 1*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f11, 1*VEC_LEN*SIZE(Y) -+ VST_UH $f11, 2*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f12, 2*VEC_LEN*SIZE(Y) -+ VST_UH $f12, 3*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f13, 3*VEC_LEN*SIZE(Y) -+ VST_UH $f13, 4*VEC_LEN*SIZE(Y) -+ -+ VLD $f10, 0*VEC_LEN*SIZE(X) -+ VLD $f11, 1*VEC_LEN*SIZE(X) -+ VLD $f12, 2*VEC_LEN*SIZE(X) -+ VLD $f13, 3*VEC_LEN*SIZE(X) -+ -+ subl $4, 1, $4 -+ ldi Y, 16*SIZE(Y) -+ ldi X, 16*SIZE(X) -+ bgt $4, $UnAlign_Y_MainLoop -+ .align 4 -+ -+$UnAlign_Y_MainLoopEnd: -+ -+ VST_UL $f10, 0*VEC_LEN*SIZE(Y) -+ VST_UH $f10, 1*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f11, 1*VEC_LEN*SIZE(Y) -+ VST_UH $f11, 2*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f12, 2*VEC_LEN*SIZE(Y) -+ VST_UH $f12, 3*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f13, 3*VEC_LEN*SIZE(Y) -+ VST_UH $f13, 4*VEC_LEN*SIZE(Y) -+ -+ ldi Y, 16*SIZE(Y) -+ ble $5, $End -+ jmp $RemainLoop -+ ++$BETA_EQ_ZERO: ++ sra $16, 3, $2 # i = (m >> 3) ++ ldi $4, 8*SIZE($18) ++ mov $18, $1 # c_offset = c ++ ldi $17, -1($17) # j -- ++ ble $2,$L42 + .align 4 + -+$UnAlign_XY_ACCESS: -+ -+ VLD_UL $f10, 0*VEC_LEN*SIZE(X) -+ VLD_UH $f14, 1*VEC_LEN*SIZE(X) -+ -+ VLD_UL $f11, 1*VEC_LEN*SIZE(X) -+ VLD_UH $f15, 2*VEC_LEN*SIZE(X) -+ -+ VLD_UL $f12, 2*VEC_LEN*SIZE(X) -+ VLD_UH $f16, 3*VEC_LEN*SIZE(X) -+ -+ -+ VLD_UL $f13, 3*VEC_LEN*SIZE(X) -+ VLD_UH $f17, 4*VEC_LEN*SIZE(X) -+ -+ subl $4, 1, $4 -+ vbisw $f10, $f14, $f10 -+ ldi X, 16*SIZE(X) -+ vbisw $f11, $f15, $f11 -+ -+ vbisw $f12, $f16, $f12 -+ vbisw $f13, $f17, $f13 -+ nop -+ ble $4, $UnAlign_XY_MainLoopEnd -+ .align 4 -+ -+$UnAlign_XY_MainLoop: -+ fillcs PREFETCHSIZE * SIZE(X) -+ fillcs PREFETCHSIZE * SIZE(Y) -+ -+ VST_UL $f10, 0*VEC_LEN*SIZE(Y) -+ VST_UH $f10, 1*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f11, 1*VEC_LEN*SIZE(Y) -+ VST_UH $f11, 2*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f12, 2*VEC_LEN*SIZE(Y) -+ VST_UH $f12, 3*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f13, 3*VEC_LEN*SIZE(Y) -+ VST_UH $f13, 4*VEC_LEN*SIZE(Y) -+ -+ -+ VLD_UL $f10, 0*VEC_LEN*SIZE(X) -+ VLD_UH $f14, 1*VEC_LEN*SIZE(X) -+ VLD_UL $f11, 1*VEC_LEN*SIZE(X) -+ VLD_UH $f15, 2*VEC_LEN*SIZE(X) -+ -+ VLD_UL $f12, 2*VEC_LEN*SIZE(X) -+ VLD_UH $f16, 3*VEC_LEN*SIZE(X) -+ VLD_UL $f13, 3*VEC_LEN*SIZE(X) -+ VLD_UH $f17, 4*VEC_LEN*SIZE(X) -+ -+ subl $4, 1, $4 -+ vbisw $f10, $f14, $f10 -+ ldi Y, 16*SIZE(Y) -+ vbisw $f11, $f15, $f11 -+ -+ vbisw $f12, $f16, $f12 -+ ldi X, 16*SIZE(X) -+ vbisw $f13, $f17, $f13 -+ bgt $4, $UnAlign_XY_MainLoop -+ .align 4 -+ -+$UnAlign_XY_MainLoopEnd: -+ -+ VST_UL $f10, 0*VEC_LEN*SIZE(Y) -+ VST_UH $f10, 1*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f11, 1*VEC_LEN*SIZE(Y) -+ VST_UH $f11, 2*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f12, 2*VEC_LEN*SIZE(Y) -+ VST_UH $f12, 3*VEC_LEN*SIZE(Y) -+ -+ VST_UL $f13, 3*VEC_LEN*SIZE(Y) -+ VST_UH $f13, 4*VEC_LEN*SIZE(Y) -+ -+ ldi Y, 16*SIZE(Y) -+ ble $5, $End -+ jmp $RemainLoop -+ -+ .align 4 ++$L41: ++ ST $f31, 0*SIZE($1) ++ ST $f31, 1*SIZE($1) ++ ST $f31, 2*SIZE($1) ++ ST $f31, 3*SIZE($1) ++ ST $f31, 4*SIZE($1) ++ ST $f31, 5*SIZE($1) ++ ST $f31, 6*SIZE($1) ++ ST $f31, 7*SIZE($1) ++ ldi $2, -1($2) + -+$Sub: -+#ifdef COMPLEX -+ addl INCX, INCX, INCX -+ addl INCY, INCY, INCY -+ and N, 7, $5 -+#else -+ and N, 15, $5 -+#endif -+ ble $4, $SubRemain ++ ldi $4, 8*SIZE($4) ++ ldi $1, 8*SIZE($1) ++ bgt $2,$L41 + .align 4 + -+$SubMainLoop: -+#ifndef COMPLEX -+ LD $f10, 0(X) -+ SXADDQ INCX, X, X -+ LD $f11, 0(X) -+ SXADDQ INCX, X, X -+ -+ LD $f12, 0(X) -+ SXADDQ INCX, X, X -+ LD $f13, 0(X) -+ SXADDQ INCX, X, X -+ -+ LD $f14, 0(X) -+ SXADDQ INCX, X, X -+ LD $f15, 0(X) -+ SXADDQ INCX, X, X -+ -+ LD $f16, 0(X) -+ SXADDQ INCX, X, X -+ LD $f17, 0(X) -+ SXADDQ INCX, X, X -+ -+ LD $f18, 0(X) -+ SXADDQ INCX, X, X -+ LD $f19, 0(X) -+ SXADDQ INCX, X, X -+ -+ LD $f20, 0(X) -+ SXADDQ INCX, X, X -+ LD $f21, 0(X) -+ SXADDQ INCX, X, X -+ -+ LD $f22, 0(X) -+ SXADDQ INCX, X, X -+ LD $f23, 0(X) -+ SXADDQ INCX, X, X -+ -+ LD $f24, 0(X) -+ SXADDQ INCX, X, X -+ LD $f25, 0(X) -+ SXADDQ INCX, X, X -+ -+ ST $f10, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f11, 0(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f12, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f13, 0(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f14, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f15, 0(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f16, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f17, 0(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f18, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f19, 0(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f20, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f21, 0(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f22, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f23, 0(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f24, 0(Y) -+ SXADDQ INCY, Y, Y -+ ST $f25, 0(Y) -+ SXADDQ INCY, Y, Y -+#else -+ LD $f10, 0(X) -+ LD $f11, SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD $f12, 0(X) -+ LD $f13, SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD $f14, 0(X) -+ LD $f15, SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD $f16, 0(X) -+ LD $f17, SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD $f18, 0(X) -+ LD $f19, SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD $f20, 0(X) -+ LD $f21, SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD $f22, 0(X) -+ LD $f23, SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD $f24, 0(X) -+ LD $f25, SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST $f10, 0(Y) -+ ST $f11, SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f12, 0(Y) -+ ST $f13, SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f14, 0(Y) -+ ST $f15, SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f16, 0(Y) -+ ST $f17, SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f18, 0(Y) -+ ST $f19, SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f20, 0(Y) -+ ST $f21, SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f22, 0(Y) -+ ST $f23, SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ST $f24, 0(Y) -+ ST $f25, SIZE(Y) -+ SXADDQ INCY, Y, Y -+#endif -+ subl $4, 1, $4 -+ bgt $4, $SubMainLoop -+ .align 4 ++$L42: ++ and $16, 7, $2 ++ ble $2,$L44 ++ .align 4 + -+$SubRemain: -+ ble $5, $SubEnd ++$L43: ++ ldi $2, -1($2) ++ ST $f31, 0($1) ++ ldi $1, SIZE($1) ++ bgt $2, $L43 + .align 4 + -+ $SubRemainLoop: -+#ifndef COMPLEX -+ LD $f10, 0(X) -+ SXADDQ INCX, X, X -+ ST $f10, 0(Y) -+ SXADDQ INCY, Y, Y -+#else -+ LD $f10, 0(X) -+ LD $f11, SIZE(X) -+ SXADDQ INCX, X, X -+ ST $f10, 0(Y) -+ ST $f11, SIZE(Y) -+ SXADDQ INCY, Y, Y -+#endif -+ subl $5, 1, $5 -+ bgt $5, $SubRemainLoop ++$L44: ++ SXADDQ $19, $18, $18 # c += ldc ++ bgt $17,$BETA_EQ_ZERO ++ clr $0 + .align 4 + -+$SubEnd: ++$End: + ret -+ EPILOGUE -diff --git a/kernel/sw_64/cscal.S b/kernel/sw_64/cscal.S ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/gemm_kernel_4x4.S b/kernel/sw_64/gemm_kernel_4x4.S new file mode 100644 -index 0000000..bba3137 +index 000000000..2039c8437 --- /dev/null -+++ b/kernel/sw_64/cscal.S -@@ -0,0 +1,217 @@ ++++ b/kernel/sw_64/gemm_kernel_4x4.S +@@ -0,0 +1,2844 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -7284,52555 +3767,1450 @@ index 0000000..bba3137 +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + -+ .set noat -+ .set noreorder -+ +#define ASSEMBLER -+ +#include "common.h" -+#include "version.h" + -+ .globl NAME -+ .ent NAME + -+NAME: -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ lda $28, _mcount -+ jsr $28, ($28), _mcount ++#if !defined(SW8A) ++#error "Architecture is not specified." +#endif + -+#ifndef C_INTERFACE -+ ldl $16, 0($16) # n -+ mov $18, $20 # Store Address -+ ldl $19, 0($19) # incx -+ nop -+ -+ LD $f1, 0($17) # alpha -+#else -+ mov $18, $20 # Store Address -+ fmov $f17, $f1 # alpha ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop +#endif + -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif + -+ sra $16, 1, $21 # 4-unrolling -+ ble $16, $End ++#define STACKSIZE 80 + -+ lda $23, -1($19) -+ ble $19, $End ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 + -+ bgt $23, $INC_NOT_1 -+ .align 4 ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 + -+ ble $21, $Sub -+ lda $21, -1($21) -+ LD $f10, 0*SIZE($18) -+ LD $f11, 1*SIZE($18) ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 + -+ LD $f12, 2*SIZE($18) -+ LD $f13, 3*SIZE($18) -+ lda $18, 4*SIZE($18) -+ ble $21, $MainRemain -+ .align 4 ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 + -+$MainLoop: -+ MUL $f10, $f1, $f20 -+ LD $f10, 0*SIZE($18) -+ MUL $f11, $f1, $f21 -+ LD $f11, 1*SIZE($18) ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 + -+ MUL $f12, $f1, $f22 -+ LD $f12, 2*SIZE($18) -+ MUL $f13, $f1, $f23 -+ LD $f13, 3*SIZE($18) ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 + -+ lda $18, 4*SIZE($18) -+ lda $21, -1($21) ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 + -+ ST $f20, 0*SIZE($20) -+ ST $f21, 1*SIZE($20) -+ ST $f22, 2*SIZE($20) -+ ST $f23, 3*SIZE($20) -+ lda $20, 4*SIZE($20) ++#define alpha $f30 + -+ bgt $21, $MainLoop -+ .align 4 ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 + -+$MainRemain: -+ MUL $f10, $f1, $f20 -+ MUL $f11, $f1, $f21 -+ MUL $f12, $f1, $f22 -+ MUL $f13, $f1, $f23 ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 + -+ ST $f20, 0*SIZE($20) -+ ST $f21, 1*SIZE($20) -+ ST $f22, 2*SIZE($20) -+ ST $f23, 3*SIZE($20) -+ lda $20, 4*SIZE($20) -+ .align 4 ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 + -+$Sub: -+ blbc $16, $End -+ LD $f10, 0*SIZE($18) -+ LD $f11, 1*SIZE($18) -+ MUL $f10, $f1, $f20 -+ MUL $f11, $f1, $f21 -+ ST $f20, 0*SIZE($20) -+ ST $f21, 1*SIZE($20) -+ .align 4 ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 + -+$End: -+ ret -+ .align 4 ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define BB $3 ++#define OFFSET $4 + -+$INC_NOT_1: -+ addl $19, $19, $19 -+ ble $21, $INC_Sub -+ lda $21, -1($21) ++#define ALPHA 64($sp) + -+ LD $f10, 0*SIZE($18) -+ LD $f11, 1*SIZE($18) -+ SXADDQ $19, $18, $18 ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 + -+ LD $f12, 0*SIZE($18) -+ LD $f13, 1*SIZE($18) -+ SXADDQ $19, $18, $18 -+ ble $21, $INC_MainRemain -+ .align 4 ++ ldi $sp, -STACKSIZE($sp) + -+$INC_MainLoop: -+ MUL $f10, $f1, $f20 -+ LD $f10, 0*SIZE($18) -+ MUL $f11, $f1, $f21 -+ LD $f11, 1*SIZE($18) ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++#ifdef TRMMKERNEL ++ ldl OFFSET, 16 + STACKSIZE($sp) ++#endif + -+ SXADDQ $19, $18, $18 ++ SXADDQ LDC, 0, LDC + -+ MUL $f12, $f1, $f22 -+ LD $f12, 0*SIZE($18) -+ MUL $f13, $f1, $f23 -+ LD $f13, 1*SIZE($18) ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ fstd $f19, ALPHA + -+ SXADDQ $19, $18, $18 ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 + -+ ST $f20, 0*SIZE($20) -+ lda $21, -1($21) -+ ST $f21, 1*SIZE($20) -+ SXADDQ $19, $20, $20 ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 + -+ ST $f22, 0*SIZE($20) -+ ST $f23, 1*SIZE($20) -+ SXADDQ $19, $20, $20 -+ unop -+ bgt $21, $INC_MainLoop ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 + .align 4 + -+$INC_MainRemain: -+ MUL $f10, $f1, $f20 -+ MUL $f11, $f1, $f21 -+ MUL $f12, $f1, $f22 -+ MUL $f13, $f1, $f23 ++$L01: ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ s4addl K, 0, BB + -+ ST $f20, 0*SIZE($20) -+ ST $f21, 1*SIZE($20) -+ SXADDQ $19, $20, $20 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif + -+ ST $f22, 0*SIZE($20) -+ ST $f23, 1*SIZE($20) -+ SXADDQ $19, $20, $20 ++ addl C2, LDC, C3 ++ s4addl LDC, C, C ++ ++ SXADDQ BB, B, BB ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 + .align 4 + -+$INC_Sub: -+ blbc $16, $INC_End ++$L11: ++#if defined(SW8A) ++ s_fillcs 0 * SIZE(BB) ++ s_fillcs 8 * SIZE(BB) ++ unop ++ ldi BB, 16 * SIZE(BB) ++#endif + -+ LD $f10, 0*SIZE($18) -+ LD $f11, 1*SIZE($18) -+ MUL $f10, $f1, $f20 -+ MUL $f11, $f1, $f21 ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+ ST $f20, 0*SIZE($20) -+ ST $f21, 1*SIZE($20) -+ .align 4 ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif + -+$INC_End: -+ ret -+ .end NAME -+ .ident VERSION -diff --git a/kernel/sw_64/dnrm2.S b/kernel/sw_64/dnrm2.S -new file mode 100644 -index 0000000..89cf787 ---- /dev/null -+++ b/kernel/sw_64/dnrm2.S -@@ -0,0 +1,490 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 + -+#define ASSEMBLER ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 + -+#include "common.h" -+#include "version.h" ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 + -+#define PREFETCH_SIZE 80 ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 ++ fillde 4 * SIZE(C1) ++ fclr c03 ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c04 + -+#define I $0 ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 + -+#define a0 $f0 -+#define a1 $f1 -+#define a2 $f10 -+#define a3 $f11 -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 + -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f19 -+#define x4 $f20 -+#define x5 $f21 -+#define x6 $f22 -+#define x7 $f23 ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 + -+ PROLOGUE ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 + -+#if defined(EV4) || defined(EV5) -+ .frame $30,16,$26,0 -+ .mask 0x4000000,-16 -+ ldih $29, 0($27) !gpdisp!1 -+ ldi $29, 0($29) !gpdisp!1 ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 + -+ ldi $sp, -16($sp) -+ ldl $27, sqrt($29) !literal!2 -+ stl $26, 0($sp) ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 + -+ PROFCODE -+ .prologue 1 -+#else -+ PROFCODE -+#endif ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 + -+ fclr a0 -+ SXADDQ INCX, 0, INCX -+ fclr a1 -+ ble N, $L999 ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 + -+ fclr a2 -+ cmpeq INCX, SIZE, $0 -+ fclr a3 -+ beq $0, $L20 ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 + -+ fclr t0 -+ sra N, 4, I -+ fclr t1 -+ ble I, $L15 ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++#endif + -+ fclr t2 -+ LD x0, 0 * SIZE(X) -+ fclr t3 -+ LD x1, 1 * SIZE(X) ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble L, $L15 ++ .align 5 + -+ LD x2, 2 * SIZE(X) -+ LD x3, 3 * SIZE(X) -+ LD x4, 4 * SIZE(X) -+ LD x5, 5 * SIZE(X) -+ LD x6, 6 * SIZE(X) -+ LD x7, 7 * SIZE(X) ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif + -+ ldi I, -1(I) -+ ble I, $L12 -+ .align 4 ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop + -+$L11: -+ faddd a0, t0,$f24 -+ fmov $f24,a0 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) + -+ faddd a1, t1,$f24 -+ fmov $f24,a1 -+ mov X, XX -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ #unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP + -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ #unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP + -+ faddd a0, t0, $f24 -+ fmov $f24,a0 -+ #unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(X) ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop + -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ #unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(X) ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ #unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(X) ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop + -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ #unop -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(X) ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop + -+ faddd a0, t0, $f24 -+ fmov $f24,a0 -+ #unop -+ fmuld x0, x0, t0 -+ LD x0, 16 * SIZE(X) ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) + -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ ldi X, 16 * SIZE(X) -+ fmuld x1, x1, t1 -+ LD x1, 17 * SIZE(XX) ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ #unop -+ fmuld x2, x2, t2 -+ LD x2, 18 * SIZE(XX) ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) + -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ #unop -+ fmuld x3, x3, t3 -+ LD x3, 19 * SIZE(XX) ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ faddd a0, t0, $f24 -+ fmov $f24,a0 ++ ADD c14, t3, c14 + unop -+ fmuld x4, x4, t0 -+ LD x4, 20 * SIZE(XX) ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) + -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ ldi I, -1(I) -+ fmuld x5, x5, t1 -+ LD x5, 21 * SIZE(XX) ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ #unop -+ fmuld x6, x6, t2 -+ LD x6, 22 * SIZE(XX) ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) + -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ fmuld x7, x7, t3 -+ LD x7, 23 * SIZE(XX) -+ bgt I, $L11 -+ .align 4 ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) + -+$L12: -+ faddd a0, t0,$f24 -+ fmov $f24,a0 -+ mov X, XX -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop + -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ #unop -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ #unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop + -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ #unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) ++ ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop + -+ faddd a0, t0, $f24 -+ fmov $f24,a0 -+ #unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(XX) ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop + -+ faddd a1, t1,$f24 -+ fmov $f24,a1 ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 + unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(XX) + -+ faddd a2, t2,$f24 -+ fmov $f24,a2 ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 + unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(XX) + -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ ldi X, 16 * SIZE(X) -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(XX) ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop + -+ faddd a0, t0, $f24 -+ fmov $f24,a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ fmuld x1, x1, t1 ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ fmuld x2, x2, t2 -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ fmuld x3, x3, t3 ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) + -+ faddd a0, t0, $f24 -+ fmov $f24,a0 -+ fmuld x4, x4, t0 -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ fmuld x5, x5, t1 ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ fmuld x6, x6, t2 -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ fmuld x7, x7, t3 ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) + -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ faddd a3, t3, $f24 -+ fmov $f24,a3 ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 + .align 4 + +$L15: -+ and N, 15, I -+ ble I, $L998 ++ ADD c11, t1, c11 ++ fldd alpha, ALPHA ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L18 ++#else ++ blbs TMP1, $L18 ++#endif + .align 4 + -+$L16: -+ LD x0, 0 * SIZE(X) -+ ldi X, 1 * SIZE(X) ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 + -+ faddd a0, t0, $f24 -+ fmov $f24,a0 -+ fmuld x0, x0, t0 ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 + -+ ldi I, -1(I) -+ bgt I, $L16 -+ bsr $31, $L998 -+ .align 4 ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) + -+$L20: -+ fclr t0 -+ sra N, 3, I -+ fclr t1 -+ ble I, $L25 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 + -+ fclr t2 -+ fclr t3 ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) + -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x1, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x2, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x3, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ LD x4, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x5, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x6, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ ldi I, -1(I) -+ ble I, $L22 -+ .align 4 -+ -+$L21: -+ faddd a0, t0,$f24 -+ fmov $f24,a0 -+ LD x7, 0 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X -+ -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ LD x0, 0 * SIZE(X) -+ fmuld x1, x1, t1 -+ addl X, INCX, X ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ LD x1, 0 * SIZE(X) -+ fmuld x2, x2, t2 -+ addl X, INCX, X ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) + -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ LD x2, 0 * SIZE(X) -+ fmuld x3, x3, t3 -+ addl X, INCX, X ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ faddd a0, t0, $f24 -+ fmov $f24,a0 -+ LD x3, 0 * SIZE(X) -+ fmuld x4, x4, t0 -+ addl X, INCX, X ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) + -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ LD x4, 0 * SIZE(X) -+ fmuld x5, x5, t1 -+ addl X, INCX, X ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ LD x5, 0 * SIZE(X) -+ fmuld x6, x6, t2 -+ addl X, INCX, X ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) + -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ LD x6, 0 * SIZE(X) -+ fmuld x7, x7, t3 -+ addl X, INCX, X ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) + -+ ldi I, -1(I) -+ bgt I, $L21 ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) + .align 4 + -+$L22: -+ faddd a0, t0, $f24 -+ fmov $f24,a0 -+ LD x7, 0 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X -+ -+ faddd a1, t1, $f24 -+ fmov $f24,a1 ++$L18: ++ ADD c12, t2, c12 + unop -+ fmuld x1, x1, t1 ++ MUL b1, a2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else + unop ++#endif + -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ fmuld x2, x2, t2 -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ fmuld x3, x3, t3 -+ -+ faddd a0, t0, $f24 -+ fmov $f24,a0 -+ fmuld x4, x4, t0 -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ fmuld x5, x5, t1 -+ -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ fmuld x6, x6, t2 -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ fmuld x7, x7, t3 -+ -+ faddd a1, t1, $f24 -+ fmov $f24,a1 -+ faddd a2, t2, $f24 -+ fmov $f24,a2 -+ faddd a3, t3, $f24 -+ fmov $f24,a3 -+ .align 4 -+ -+$L25: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 -+ -+$L26: -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop + -+ faddd a0, t0,$f24 -+ fmov $f24,a0 -+ fmuld x0, x0, t0 ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++#ifndef TRMMKERNEL ++ LD b5, 1 * SIZE(C1) ++#else ++ unop ++#endif + -+ ldi I, -1(I) -+ bgt I, $L26 -+ .align 4 ++ ADD c01, t1, c01 ++ unop ++ MUL b1, a3, t1 ++ unop + ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++#ifndef TRMMKERNEL ++ LD b1, 0 * SIZE(C2) ++#else ++ unop ++#endif + -+$L998: -+ faddd a0, t0, $f24 -+ fmov $f24,a0 ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop + -+ faddd a0, a1, $f24 -+ fmov $f24,a1 -+ faddd a2, a3, $f24 -+ fmov $f24,a2 ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop + -+#if defined(EV4) || defined(EV5) -+ faddd a0, a2, $f16 -+ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop + -+ ldih $29, 0($26) !gpdisp!3 -+ ldi $29, 0($29) !gpdisp!3 ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++#ifndef TRMMKERNEL ++ LD a1, 0 * SIZE(C3) +#else -+ faddd a0, a2, $f24 -+ fsqrtd $f24, a0 ++ unop +#endif -+ .align 4 + -+$L999: -+#if defined(EV4) || defined(EV5) -+ ldl $26, 0($sp) -+ ldi $sp, 16($sp) ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++#ifndef TRMMKERNEL ++ LD a2, 2 * SIZE(C1) ++#else ++ unop +#endif -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/dnrm2.S.bak b/kernel/sw_64/dnrm2.S.bak -new file mode 100644 -index 0000000..753c90b ---- /dev/null -+++ b/kernel/sw_64/dnrm2.S.bak -@@ -0,0 +1,431 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCH_SIZE 80 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#define I $0 -+ -+#define a0 $f0 -+#define a1 $f1 -+#define a2 $f10 -+#define a3 $f11 -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 -+ -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f19 -+#define x4 $f20 -+#define x5 $f21 -+#define x6 $f22 -+#define x7 $f23 -+ -+ PROLOGUE -+ -+#if defined(EV4) || defined(EV5) -+ .frame $30,16,$26,0 -+ .mask 0x4000000,-16 -+ ldih $29, 0($27) !gpdisp!1 -+ ldi $29, 0($29) !gpdisp!1 -+ -+ ldi $sp, -16($sp) -+ ldl $27, sqrt($29) !literal!2 -+ stq $26, 0($sp) + -+ PROFCODE -+ .prologue 1 ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++#ifndef TRMMKERNEL ++ LD b2, 3 * SIZE(C1) +#else -+ PROFCODE ++ unop +#endif -+ -+ fclr a0 -+ SXADDQ INCX, 0, INCX -+ fclr a1 -+ ble N, $L999 -+ -+ fclr a2 -+ cmpeq INCX, SIZE, $0 -+ fclr a3 -+ beq $0, $L20 -+ -+ fclr t0 -+ sra N, 4, I -+ fclr t1 -+ ble I, $L15 -+ -+ fclr t2 -+ LD x0, 0 * SIZE(X) -+ fclr t3 -+ LD x1, 1 * SIZE(X) -+ -+ LD x2, 2 * SIZE(X) -+ LD x3, 3 * SIZE(X) -+ LD x4, 4 * SIZE(X) -+ LD x5, 5 * SIZE(X) -+ LD x6, 6 * SIZE(X) -+ LD x7, 7 * SIZE(X) -+ -+ ldi I, -1(I) -+ ble I, $L12 -+ .align 4 -+ -+$L11: -+ faddd a0, t0, a0 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) -+ -+ faddd a1, t1, a1 -+ mov X, XX -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) + -+ faddd a2, t2, a2 ++ ADD c09, t1, c09 ++ ldi I, -1(I) ++ MUL b3, a3, t1 + unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) + -+ faddd a3, t3, a3 ++ ADD c10, t2, c10 + unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) -+ -+ faddd a0, t0, a0 ++ MUL b3, a4, t2 ++#ifndef TRMMKERNEL ++ LD b3, 0 * SIZE(C4) ++#else + unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(X) ++#endif + -+ faddd a1, t1, a1 ++ ADD c14, t3, c14 + unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(X) ++ MUL b4, a4, t3 ++#ifndef TRMMKERNEL ++ LD a4, 1 * SIZE(C2) ++#else ++ unop ++#endif + -+ faddd a2, t2, a2 ++ ADD c07, t4, c07 + unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(X) ++ MUL b4, a3, t4 ++#ifndef TRMMKERNEL ++ LD a3, 2 * SIZE(C2) ++#else ++ unop ++#endif + -+ faddd a3, t3, a3 ++ ADD c11, t1, c11 + unop -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(X) ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD b4, 3 * SIZE(C2) ++#else ++ unop ++#endif + -+ faddd a0, t0, a0 ++ ADD c12, t2, c12 + unop -+ fmuld x0, x0, t0 -+ LD x0, 16 * SIZE(X) ++ MUL alpha, c02, c02 ++#ifndef TRMMKERNEL ++ LD t1, 1 * SIZE(C3) ++#else ++ unop ++#endif + -+ faddd a1, t1, a1 -+ ldi X, 16 * SIZE(X) -+ fmuld x1, x1, t1 -+ LD x1, 17 * SIZE(XX) ++ ADD c16, t3, c16 ++ unop ++ MUL alpha, c03, c03 ++#ifndef TRMMKERNEL ++ LD t2, 2 * SIZE(C3) ++#else ++ unop ++#endif + -+ faddd a2, t2, a2 ++ ADD c15, t4, c15 + unop -+ fmuld x2, x2, t2 -+ LD x2, 18 * SIZE(XX) ++ MUL alpha, c04, c04 ++#ifndef TRMMKERNEL ++ LD t3, 3 * SIZE(C3) ++#else ++ unop ++#endif + -+ faddd a3, t3, a3 ++ MUL alpha, c05, c05 + unop -+ fmuld x3, x3, t3 -+ LD x3, 19 * SIZE(XX) ++#ifndef TRMMKERNEL ++ ADD c01, a5, c01 ++ LD t4, 1 * SIZE(C4) ++#else ++ unop ++ unop ++#endif + -+ faddd a0, t0, a0 ++ MUL alpha, c06, c06 ++#ifndef TRMMKERNEL + unop -+ fmuld x4, x4, t0 -+ LD x4, 20 * SIZE(XX) ++ ADD c02, b5, c02 ++ LD a5, 2 * SIZE(C4) ++#endif + -+ faddd a1, t1, a1 -+ ldi I, -1(I) -+ fmuld x5, x5, t1 -+ LD x5, 21 * SIZE(XX) ++ MUL alpha, c07, c07 ++#ifndef TRMMKERNEL ++ unop ++ ADD c03, a2, c03 ++ LD b5, 3 * SIZE(C4) ++#endif + -+ faddd a2, t2, a2 ++ MUL alpha, c08, c08 ++#ifndef TRMMKERNEL + unop -+ fmuld x6, x6, t2 -+ LD x6, 22 * SIZE(XX) ++ ADD c04, b2, c04 ++ unop ++#endif + -+ faddd a3, t3, a3 -+ fmuld x7, x7, t3 -+ LD x7, 23 * SIZE(XX) -+ bgt I, $L11 -+ .align 4 ++ MUL alpha, c09, c09 ++ ST c01, 0 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c05, b1, c05 ++ unop ++#endif + -+$L12: -+ faddd a0, t0, a0 -+ mov X, XX -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++ MUL alpha, c10, c10 ++ ST c02, 1 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c06, a4, c06 ++ unop ++#endif + -+ faddd a1, t1, a1 ++ MUL alpha, c11, c11 ++ ST c03, 2 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c07, a3, c07 + unop -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) ++#endif + -+ faddd a2, t2, a2 ++ MUL alpha, c12, c12 ++ ST c04, 3 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c08, b4, c08 ++#else + unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) ++#endif ++ ldi C1, 4 * SIZE(C1) + -+ faddd a3, t3, a3 ++ MUL alpha, c13, c13 ++ ST c05, 0 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c09, a1, c09 + unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) ++#endif + -+ faddd a0, t0, a0 ++ MUL alpha, c14, c14 ++ ST c06, 1 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c10, t1, c10 + unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(XX) ++#endif + -+ faddd a1, t1, a1 ++ MUL alpha, c15, c15 ++ ST c07, 2 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c11, t2, c11 + unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(XX) ++#endif + -+ faddd a2, t2, a2 ++ MUL alpha, c16, c16 ++ ST c08, 3 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c12, t3, c12 ++#else + unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(XX) ++#endif ++ ldi C2, 4 * SIZE(C2) + -+ faddd a3, t3, a3 -+ ldi X, 16 * SIZE(X) -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(XX) ++#ifndef TRMMKERNEL ++ ADD c13, b3, c13 ++#else ++ unop ++#endif ++ ST c09, 0 * SIZE(C3) ++ fclr t1 ++ ldi C4, 4 * SIZE(C4) + -+ faddd a0, t0, a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, a1 -+ fmuld x1, x1, t1 -+ -+ faddd a2, t2, a2 -+ fmuld x2, x2, t2 -+ faddd a3, t3, a3 -+ fmuld x3, x3, t3 -+ -+ faddd a0, t0, a0 -+ fmuld x4, x4, t0 -+ faddd a1, t1, a1 -+ fmuld x5, x5, t1 ++#ifndef TRMMKERNEL ++ ADD c14, t4, c14 ++#else ++ unop ++#endif ++ ST c10, 1 * SIZE(C3) ++ fclr t2 ++ unop + -+ faddd a2, t2, a2 -+ fmuld x6, x6, t2 -+ faddd a3, t3, a3 -+ fmuld x7, x7, t3 ++#ifndef TRMMKERNEL ++ ADD c15, a5, c15 ++#else ++ unop ++#endif ++ ST c11, 2 * SIZE(C3) ++ fclr t3 ++ unop + -+ faddd a1, t1, a1 -+ faddd a2, t2, a2 -+ faddd a3, t3, a3 -+ .align 4 ++#ifndef TRMMKERNEL ++ ADD c16, b5, c16 ++#else ++ unop ++#endif ++ ST c12, 3 * SIZE(C3) ++ fclr t4 ++ ldi C3, 4 * SIZE(C3) + -+$L15: -+ and N, 15, I -+ ble I, $L998 -+ .align 4 ++ ST c13, -4 * SIZE(C4) ++ ST c14, -3 * SIZE(C4) ++ ST c15, -2 * SIZE(C4) ++ ST c16, -1 * SIZE(C4) + -+$L16: -+ LD x0, 0 * SIZE(X) -+ ldi X, 1 * SIZE(X) ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif + -+ faddd a0, t0, a0 -+ fmuld x0, x0, t0 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif + -+ ldi I, -1(I) -+ bgt I, $L16 -+ bsr $31, $L998 ++ bgt I, $L11 + .align 4 + +$L20: -+ fclr t0 -+ sra N, 3, I -+ fclr t1 -+ ble I, $L25 ++ and M, 2, I ++ ble I, $L30 + -+ fclr t2 -+ fclr t3 ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x1, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x2, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x3, 0 * SIZE(X) -+ addl X, INCX, X ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif + -+ LD x4, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x5, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x6, 0 * SIZE(X) -+ addl X, INCX, X ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 + -+ ldi I, -1(I) -+ ble I, $L22 -+ .align 4 ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 + -+$L21: -+ faddd a0, t0, a0 -+ LD x7, 0 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) + -+ faddd a1, t1, a1 -+ LD x0, 0 * SIZE(X) -+ fmuld x1, x1, t1 -+ addl X, INCX, X ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 + -+ faddd a2, t2, a2 -+ LD x1, 0 * SIZE(X) -+ fmuld x2, x2, t2 -+ addl X, INCX, X ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble L, $L25 + -+ faddd a3, t3, a3 -+ LD x2, 0 * SIZE(X) -+ fmuld x3, x3, t3 -+ addl X, INCX, X ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 + -+ faddd a0, t0, a0 -+ LD x3, 0 * SIZE(X) -+ fmuld x4, x4, t0 -+ addl X, INCX, X ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 + -+ faddd a1, t1, a1 -+ LD x4, 0 * SIZE(X) -+ fmuld x5, x5, t1 -+ addl X, INCX, X ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 + -+ faddd a2, t2, a2 -+ LD x5, 0 * SIZE(X) -+ fmuld x6, x6, t2 -+ addl X, INCX, X ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) + -+ faddd a3, t3, a3 -+ LD x6, 0 * SIZE(X) -+ fmuld x7, x7, t3 -+ addl X, INCX, X ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 + -+ ldi I, -1(I) -+ bgt I, $L21 -+ .align 4 ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble L, $L25 ++#endif ++ .align 4 + +$L22: -+ faddd a0, t0, a0 -+ LD x7, 0 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop + -+ faddd a1, t1, a1 ++ ADD c10, t2, c10 + unop -+ fmuld x1, x1, t1 ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 + unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) + -+ faddd a2, t2, a2 -+ fmuld x2, x2, t2 -+ faddd a3, t3, a3 -+ fmuld x3, x3, t3 ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) + -+ faddd a0, t0, a0 -+ fmuld x4, x4, t0 -+ faddd a1, t1, a1 -+ fmuld x5, x5, t1 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop + -+ faddd a2, t2, a2 -+ fmuld x6, x6, t2 -+ faddd a3, t3, a3 -+ fmuld x7, x7, t3 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) + -+ faddd a1, t1, a1 -+ faddd a2, t2, a2 -+ faddd a3, t3, a3 -+ .align 4 ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) + -+$L25: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) + -+$L26: -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) + -+ faddd a0, t0, a0 -+ fmuld x0, x0, t0 ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) + -+ ldi I, -1(I) -+ bgt I, $L26 -+ .align 4 ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) + ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) + -+$L998: -+ faddd a0, t0, a0 ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) + -+ faddd a0, a1, a0 -+ faddd a2, a3, a2 ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) + -+#if defined(EV4) || defined(EV5) -+ faddd a0, a2, $f16 -+ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) + -+ ldih $29, 0($26) !gpdisp!3 -+ ldi $29, 0($29) !gpdisp!3 -+#else -+ faddd a0, a2, a0 -+ fsqrtd a0, a0 -+#endif ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 + .align 4 + -+$L999: -+#if defined(EV4) || defined(EV5) -+ ldl $26, 0($sp) -+ ldi $sp, 16($sp) ++$L25: ++ ADD c09, t1, c09 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L28 ++#else ++ blbs TMP1, $L28 +#endif -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/dot.S b/kernel/sw_64/dot.S -new file mode 100644 -index 0000000..513eada ---- /dev/null -+++ b/kernel/sw_64/dot.S -@@ -0,0 +1,607 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) + -+#define PREFETCHSIZE 88 ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) + -+#define I $5 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) + -+#define s0 $f0 -+#define s1 $f30 -+#define s2 $f1 -+#define s3 $f2 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) + -+#define a0 $f10 -+#define a1 $f11 -+#define a2 $f12 -+#define a3 $f13 -+#define a4 $f14 -+#define a5 $f15 -+#define a6 $f16 -+#define a7 $f17 ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) + -+#define b0 $f18 -+#define b1 $f19 -+#define b2 $f20 -+#define b3 $f21 -+#define b4 $f22 -+#define b5 $f23 -+#define b6 $f24 -+#define b7 $f25 ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) + -+#define t0 $f26 -+#define t1 $f27 -+#define t2 $f28 -+#define t3 $f29 ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 + ++$L28: ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C1) ++#else ++ unop ++#endif + -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 16, $26, 0 ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD a4, 1 * SIZE(C1) ++#else ++ unop ++#endif + -+ ldi $sp, -16($sp) -+ fclr s0 -+ fstd $f2, 0($sp) -+#ifndef ZYX20220111 -+ fstd $f3, 8($sp) ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C2) ++#else ++ unop +#endif -+ fclr s1 + -+ fclr s2 -+ nop -+ fclr s3 -+ ble N, $L999 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++#ifndef TRMMKERNEL ++ LD b5, 1 * SIZE(C2) ++#else ++ unop ++#endif + -+ fclr t0 -+ cmpeq INCX, 1, $21 -+ fclr t1 -+ cmpeq INCY, 1, $22 -+ fclr t2 -+ and $21, $22, $22 -+ fclr t3 -+ beq $22, $L20 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++#ifndef TRMMKERNEL ++ LD b1, 0 * SIZE(C3) ++#else ++ unop ++#endif + -+#ifndef DOUBLE -+ srl N, 4, I -+ ble I, $L15 ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++#ifndef TRMMKERNEL ++ LD b2, 1 * SIZE(C3) ++#else ++ unop ++#endif + -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++#ifndef TRMMKERNEL ++ LD b3, 0 * SIZE(C4) ++#else ++ unop ++#endif + -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ LD b2, 2 * SIZE(Y) -+ LD b3, 3 * SIZE(Y) -+ -+ LD a4, 4 * SIZE(X) -+ LD a5, 5 * SIZE(X) -+ LD b4, 4 * SIZE(Y) -+ LD b5, 5 * SIZE(Y) -+ -+ LD a6, 6 * SIZE(X) -+ LD a7, 7 * SIZE(X) -+ addl X, 16 * SIZE, X -+ subl I, 1, I ++ ADD c09, t1, c09 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD b4, 1 * SIZE(C4) ++#else ++ unop ++#endif + -+ addl Y, 16 * SIZE, Y -+ ble I, $L13 -+ .align 4 ++ ADD c10, t2, c10 ++ unop ++ MUL alpha, c02, c02 ++ unop + -+$L12: -+ fillcs PREFETCHSIZE * 2 * SIZE(X) -+ subl I, 1, I -+ fillcs PREFETCHSIZE * 2 * SIZE(Y) -+ addl X, 16 * SIZE, X ++ ADD c13, t3, c13 ++ MUL alpha, c05, c05 ++ ADD c14, t4, c14 ++ MUL alpha, c06, c06 + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -10 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -9 * SIZE(Y) ++ MUL alpha, c09, c09 ++#ifndef TRMMKERNEL ++ ADD c01, a3, c01 ++#endif ++ MUL alpha, c10, c10 ++#ifndef TRMMKERNEL ++ ADD c02, a4, c02 ++#endif + -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a0, -24 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -23 * SIZE(X) ++ MUL alpha, c13, c13 ++#ifndef TRMMKERNEL ++ ADD c05, a5, c05 ++#endif ++ MUL alpha, c14, c14 ++#ifndef TRMMKERNEL ++ ADD c06, b5, c06 ++#endif + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b0, -8 * SIZE(Y) -+ MUL a2, b2, $f3 -+ fmov $f3, t2 -+ LD b1, -7 * SIZE(Y) ++#ifndef TRMMKERNEL ++ ADD c09, b1, c09 ++ unop ++#endif ++ ST c01, 0 * SIZE(C1) ++ fclr t1 + -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a2, -22 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -21 * SIZE(X) ++#ifndef TRMMKERNEL ++ ADD c10, b2, c10 ++ unop ++#endif ++ ST c02, 1 * SIZE(C1) ++ fclr t2 + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b2, -6 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, -5 * SIZE(Y) ++#ifndef TRMMKERNEL ++ ADD c13, b3, c13 ++ unop ++#endif ++ ST c05, 0 * SIZE(C2) ++ fclr t3 + -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a4, -20 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -19 * SIZE(X) ++#ifndef TRMMKERNEL ++ ADD c14, b4, c14 ++ unop ++#endif ++ ST c06, 1 * SIZE(C2) ++ fclr t4 + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b4, -4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, -3 * SIZE(Y) ++ ST c09, 0 * SIZE(C3) ++ ldi C1, 2 * SIZE(C1) ++ ST c10, 1 * SIZE(C3) ++ ldi C2, 2 * SIZE(C2) + -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a6, -18 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -17 * SIZE(X) ++ ST c13, 0 * SIZE(C4) ++ ldi C3, 2 * SIZE(C3) ++ ST c14, 1 * SIZE(C4) ++ ldi C4, 2 * SIZE(C4) + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a0, -16 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -15 * SIZE(X) ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, 1 * SIZE(Y) ++$L30: ++ and M, 1, I ++ ble I, $L39 + -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a2, -14 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -13 * SIZE(X) ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b2, 2 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 3 * SIZE(Y) ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif + -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a4, -12 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -11 * SIZE(X) ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b4, 4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, 5 * SIZE(Y) ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) + -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a6, -10 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -9 * SIZE(X) ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 + -+ addl Y, 16 * SIZE, Y -+ bgt I, $L12 -+ nop -+ fnop -+ .align 4 ++ ldi BO, 4 * SIZE(B) ++ ble L, $L35 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 + -+$L13: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6,-10 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -9 * SIZE(Y) ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a0, -8 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -7 * SIZE(X) ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b0, -8 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, -7 * SIZE(Y) ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 + -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a2, -6 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -5 * SIZE(X) ++ ldi BO, 4 * SIZE(BO) ++ ble L, $L35 ++#endif ++ .align 4 + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b2, -6 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, -5 * SIZE(Y) ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) + -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a4, -4 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -3 * SIZE(X) ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b4, -4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, -3 * SIZE(Y) ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) + -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a6, -2 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -1 * SIZE(X) ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a1, b1, t1 ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 ++ ADD c05, t2, c05 + MUL a2, b2, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a3, b3, t3 ++ LD b2, -3 * SIZE(BO) + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ MUL a4, b4, t0 -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a5, b5, t1 -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a6, b6, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a7, b7, t3 -+ .align 4 ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) + -+$L15: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ and N, 15, I -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ ble I, $L18 ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 + .align 4 + ++$L35: ++ ADD c01, t1, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L38 +#else -+ -+ srl N, 3, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ LD b2, 2 * SIZE(Y) -+ LD b3, 3 * SIZE(Y) -+ -+ LD a4, 4 * SIZE(X) -+ LD a5, 5 * SIZE(X) -+ LD b4, 4 * SIZE(Y) -+ LD b5, 5 * SIZE(Y) -+ -+ LD a6, 6 * SIZE(X) -+ LD a7, 7 * SIZE(X) -+ addl X, 8 * SIZE, X -+ subl I, 1, I -+ -+ addl Y, 8 * SIZE, Y -+ ble I, $L13 ++ blbs TMP1, $L38 ++#endif + .align 4 + -+$L12: -+ fillcs PREFETCHSIZE * SIZE(X) -+ subl I, 1, I -+ fillcs PREFETCHSIZE * SIZE(Y) -+ addl X, 8 * SIZE, X ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) + -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a0, -8 * SIZE(X) ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 -+ LD a1, -7 * SIZE(X) ++ ldi BO, 4 * SIZE(BO) ++ .align 4 + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, 1 * SIZE(Y) ++$L38: ++ ADD c05, t2, c05 ++ unop ++ MUL a1, b2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif + -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a2, -6 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -5 * SIZE(X) ++ ADD c09, t3, c09 ++ unop ++ MUL a1, b3, t3 ++#ifndef TRMMKERNEL ++ LD b5, 0 * SIZE(C2) ++#else ++ unop ++#endif + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b2, 2 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 3 * SIZE(Y) ++ ADD c13, t4, c13 ++ unop ++ MUL a1, b4, t4 ++#ifndef TRMMKERNEL ++ LD a2, 0 * SIZE(C3) ++#else ++ unop ++#endif + -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a4, -4 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -3 * SIZE(X) ++ ADD c01, t1, c01 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C4) ++#else ++ unop ++#endif + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b4, 4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, 5 * SIZE(Y) ++ ADD c05, t2, c05 ++ unop ++ MUL alpha, c05, c05 ++ unop + -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a6, -2 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -1 * SIZE(X) ++ ADD c09, t3, c09 ++ MUL alpha, c09, c09 ++ ADD c13, t4, c13 ++ MUL alpha, c13, c13 + -+ addl Y, 8 * SIZE, Y -+ bgt I, $L12 -+ nop -+ fnop -+ .align 4 ++#ifndef TRMMKERNEL ++ ADD c01, a5, c01 ++ ADD c05, b5, c05 ++ ADD c09, a2, c09 ++ ADD c13, a3, c13 ++#endif + -+$L13: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a1, b1, t1 ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a3, b3, t3 ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ MUL a4, b4, t0 -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a5, b5, t1 -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a6, b6, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a7, b7, t3 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif + .align 4 + -+$L15: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ and N, 7, I -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ ble I, $L18 ++$L39: ++ mov BO, B ++ ldi J, -1(J) ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 4, KK ++#else ++ unop ++#endif ++ bgt J, $L01 + .align 4 + -+#endif ++$L40: ++ and N, 2, J ++ ble J, $L80 + -+$L16: -+ LD a0, 0 * SIZE(X) -+ addl X, SIZE, X -+ LD b0, 0 * SIZE(Y) -+ addl Y, SIZE, Y ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ fclr t1 ++ addl C2, LDC, C ++ fclr t2 + -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a0, b0, t2 -+ subl I, 1, I -+ bgt I, $L16 -+ .align 4 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif + -+$L18: -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ br $L999 ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 + .align 4 + -+$L20: -+ srl N, 2, I -+ ble I, $L25 ++$L51: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b0, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b1, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif + -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b2, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b3, 0 * SIZE(Y) -+ subl I, 1, I -+ -+ SXADDQ INCY, Y, Y -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ MUL a0, b0, t0 -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a1, b1, t1 -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a3, b3, t3 -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b0, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b1, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b2, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b3, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ subl I, 1, I -+ bgt I, $L22 -+ nop -+ fnop -+ .align 4 -+ -+$L23: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ MUL a0, b0, t0 -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a1, b1, t1 -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a3, b3, t3 -+ .align 4 -+ -+$L25: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ and N, 3, I -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ ble I, $L28 -+ .align 4 -+ -+$L26: -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b0, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a0, b0, t2 -+ subl I, 1, I -+ bgt I, $L26 -+ .align 4 -+ -+$L28: -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ .align 4 -+ -+$L999: -+ ADD s2, s3, $f3 -+ fmov $f3, s2 -+ fldd $f2, 0($sp) -+ ADD s0, s1, $f3 -+ fmov $f3, s0 -+ ADD s0, s2, $f3 -+ fmov $f3, s0 -+#ifndef ZYX20220111 -+ fldd $f3, 8($sp) -+ ldi $sp, 16($sp) -+#endif -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/dot.S.bak b/kernel/sw_64/dot.S.bak -new file mode 100644 -index 0000000..cd96e21 ---- /dev/null -+++ b/kernel/sw_64/dot.S.bak -@@ -0,0 +1,602 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 88 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 -+ -+#define I $5 -+ -+#define s0 $f0 -+#define s1 $f30 -+#define s2 $f1 -+#define s3 $f2 -+ -+#define a0 $f10 -+#define a1 $f11 -+#define a2 $f12 -+#define a3 $f13 -+#define a4 $f14 -+#define a5 $f15 -+#define a6 $f16 -+#define a7 $f17 -+ -+#define b0 $f18 -+#define b1 $f19 -+#define b2 $f20 -+#define b3 $f21 -+#define b4 $f22 -+#define b5 $f23 -+#define b6 $f24 -+#define b7 $f25 -+ -+#define t0 $f26 -+#define t1 $f27 -+#define t2 $f28 -+#define t3 $f29 -+ -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 16, $26, 0 -+ -+ ldi $sp, -16($sp) -+ fclr s0 -+ fstd $f2, 0($sp) -+ fclr s1 -+ -+ fclr s2 -+ nop -+ fclr s3 -+ ble N, $L999 -+ -+ fclr t0 -+ cmpeq INCX, 1, $21 -+ fclr t1 -+ cmpeq INCY, 1, $22 -+ fclr t2 -+ and $21, $22, $22 -+ fclr t3 -+ beq $22, $L20 -+ -+#ifndef DOUBLE -+ srl N, 4, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ LD b2, 2 * SIZE(Y) -+ LD b3, 3 * SIZE(Y) -+ -+ LD a4, 4 * SIZE(X) -+ LD a5, 5 * SIZE(X) -+ LD b4, 4 * SIZE(Y) -+ LD b5, 5 * SIZE(Y) -+ -+ LD a6, 6 * SIZE(X) -+ LD a7, 7 * SIZE(X) -+ addl X, 16 * SIZE, X -+ subl I, 1, I -+ -+ addl Y, 16 * SIZE, Y -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ fillcs PREFETCHSIZE * 2 * SIZE(X) -+ subl I, 1, I -+ fillcs PREFETCHSIZE * 2 * SIZE(Y) -+ addl X, 16 * SIZE, X -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -10 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -9 * SIZE(Y) -+ -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a0, -24 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -23 * SIZE(X) -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b0, -8 * SIZE(Y) -+ MUL a2, b2, $f3 -+ fmov $f3, t2 -+ LD b1, -7 * SIZE(Y) -+ -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a2, -22 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -21 * SIZE(X) -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b2, -6 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, -5 * SIZE(Y) -+ -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a4, -20 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -19 * SIZE(X) -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b4, -4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, -3 * SIZE(Y) -+ -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a6, -18 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -17 * SIZE(X) -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a0, -16 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -15 * SIZE(X) -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, 1 * SIZE(Y) -+ -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a2, -14 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -13 * SIZE(X) -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b2, 2 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 3 * SIZE(Y) -+ -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a4, -12 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -11 * SIZE(X) -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b4, 4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, 5 * SIZE(Y) -+ -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a6, -10 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -9 * SIZE(X) -+ -+ addl Y, 16 * SIZE, Y -+ bgt I, $L12 -+ nop -+ fnop -+ .align 4 -+ -+$L13: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6,-10 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -9 * SIZE(Y) -+ -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a0, -8 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -7 * SIZE(X) -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b0, -8 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, -7 * SIZE(Y) -+ -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a2, -6 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -5 * SIZE(X) -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b2, -6 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, -5 * SIZE(Y) -+ -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a4, -4 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -3 * SIZE(X) -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b4, -4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, -3 * SIZE(Y) -+ -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a6, -2 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -1 * SIZE(X) -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a1, b1, t1 -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a3, b3, t3 -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ MUL a4, b4, t0 -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a5, b5, t1 -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a6, b6, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a7, b7, t3 -+ .align 4 -+ -+$L15: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ and N, 15, I -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ ble I, $L18 -+ .align 4 -+ -+#else -+ -+ srl N, 3, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ LD b2, 2 * SIZE(Y) -+ LD b3, 3 * SIZE(Y) -+ -+ LD a4, 4 * SIZE(X) -+ LD a5, 5 * SIZE(X) -+ LD b4, 4 * SIZE(Y) -+ LD b5, 5 * SIZE(Y) -+ -+ LD a6, 6 * SIZE(X) -+ LD a7, 7 * SIZE(X) -+ addl X, 8 * SIZE, X -+ subl I, 1, I -+ -+ addl Y, 8 * SIZE, Y -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ fillcs PREFETCHSIZE * SIZE(X) -+ subl I, 1, I -+ fillcs PREFETCHSIZE * SIZE(Y) -+ addl X, 8 * SIZE, X -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a0, -8 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -7 * SIZE(X) -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, 1 * SIZE(Y) -+ -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a2, -6 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -5 * SIZE(X) -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b2, 2 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 3 * SIZE(Y) -+ -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ LD a4, -4 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -3 * SIZE(X) -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ LD b4, 4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, 5 * SIZE(Y) -+ -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ LD a6, -2 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -1 * SIZE(X) -+ -+ addl Y, 8 * SIZE, Y -+ bgt I, $L12 -+ nop -+ fnop -+ .align 4 -+ -+$L13: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a1, b1, t1 -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a3, b3, t3 -+ -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ MUL a4, b4, t0 -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a5, b5, t1 -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a6, b6, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a7, b7, t3 -+ .align 4 -+ -+$L15: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ and N, 7, I -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ ble I, $L18 -+ .align 4 -+ -+#endif -+ -+$L16: -+ LD a0, 0 * SIZE(X) -+ addl X, SIZE, X -+ LD b0, 0 * SIZE(Y) -+ addl Y, SIZE, Y -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a0, b0, t2 -+ subl I, 1, I -+ bgt I, $L16 -+ .align 4 -+ -+$L18: -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ br $L999 -+ .align 4 -+ -+$L20: -+ srl N, 2, I -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b0, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b1, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b2, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b3, 0 * SIZE(Y) -+ subl I, 1, I -+ -+ SXADDQ INCY, Y, Y -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ MUL a0, b0, t0 -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a1, b1, t1 -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a3, b3, t3 -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b0, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b1, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b2, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b3, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ subl I, 1, I -+ bgt I, $L22 -+ nop -+ fnop -+ .align 4 -+ -+$L23: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ MUL a0, b0, t0 -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ MUL a1, b1, t1 -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ MUL a3, b3, t3 -+ .align 4 -+ -+$L25: -+ ADD s0, t0, $f3 -+ fmov $f3, s0 -+ and N, 3, I -+ ADD s1, t1, $f3 -+ fmov $f3, s1 -+ ble I, $L28 -+ .align 4 -+ -+$L26: -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b0, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ MUL a0, b0, t2 -+ subl I, 1, I -+ bgt I, $L26 -+ .align 4 -+ -+$L28: -+ ADD s2, t2, $f3 -+ fmov $f3, s2 -+ ADD s3, t3, $f3 -+ fmov $f3, s3 -+ .align 4 -+ -+$L999: -+ ADD s2, s3, $f3 -+ fmov $f3, s2 -+ fldd $f2, 0($sp) -+ ADD s0, s1, $f3 -+ fmov $f3, s0 -+ ldi $sp, 16($sp) -+ -+ ADD s0, s2, $f3 -+ fmov $f3, s0 -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/dot_simd.S b/kernel/sw_64/dot_simd.S -new file mode 100644 -index 0000000..3e2288d ---- /dev/null -+++ b/kernel/sw_64/dot_simd.S -@@ -0,0 +1,634 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 80 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 -+ -+#define I $5 -+ -+#define s0 $f0 -+#define s1 $f30 -+#define s2 $f1 -+#define s3 $f2 -+ -+#define a0 $f10 -+#define a1 $f11 -+#define a2 $f12 -+#define a3 $f13 -+#define a4 $f14 -+#define a5 $f15 -+#define a6 $f16 -+#define a7 $f17 -+ -+#define b0 $f18 -+#define b1 $f19 -+#define b2 $f20 -+#define b3 $f21 -+#define b4 $f22 -+#define b5 $f23 -+#define b6 $f24 -+#define b7 $f25 -+ -+#define t0 $f26 -+#define t1 $f27 -+#define t2 $f28 -+#define t3 $f29 -+ -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 16, $26, 0 -+ -+ ldi $sp, -16($sp) -+ fclr s0 -+ fstd $f2, 0($sp) -+ fclr s1 -+ -+ fclr s2 -+ nop -+ fclr s3 -+ ble N, $L999 -+ -+ fclr t0 -+ cmpeq INCX, 1, $21 -+ fclr t1 -+ cmpeq INCY, 1, $22 -+ fclr t2 -+ and $21, $22, $22 -+ fclr t3 -+ beq $22, $L20 -+ -+ -+/* -+ test the address of Y & X -+*/ -+ and Y, (VEC_LEN*SIZE-1), $4 -+ and X, (VEC_LEN*SIZE-1), $3 -+ or $3, $4, $4 -+ bne $4, $UnAlign_ACCESS -+ -+/*Align Accessing*/ -+ sra N, 4, I -+ ble I, $Remain -+ -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, s0 #clear s0 vector -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, s1 -+ -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, s2 -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, s3 -+ -+ VLD b0, 0*VEC_LEN*SIZE(Y) -+ VLD b1, 1*VEC_LEN*SIZE(Y) -+ VLD b2, 2*VEC_LEN*SIZE(Y) -+ VLD b3, 3*VEC_LEN*SIZE(Y) -+ -+ addl X, 16 * SIZE, X -+ addl Y, 16 * SIZE, Y -+ subl I, 1, I -+ ble I, $MainLoopEnd -+$MainLoop: -+ VMAD a0, b0, s0, s0 -+ fillcs PREFETCHSIZE * SIZE(X) -+ VMAD a1, b1, s1, s1 -+ fillcs PREFETCHSIZE * SIZE(Y) -+ -+ subl I, 1, I -+ VMAD a2, b2, s2, s2 -+ addl X, 16 * SIZE, X -+ VMAD a3, b3, s3, s3 -+ -+ VLD a0, -4*VEC_LEN*SIZE(X) -+ VLD a1, -3*VEC_LEN*SIZE(X) -+ VLD a2, -2*VEC_LEN*SIZE(X) -+ VLD a3, -1*VEC_LEN*SIZE(X) -+ -+ VLD b0, 0*VEC_LEN*SIZE(Y) -+ VLD b1, 1*VEC_LEN*SIZE(Y) -+ VLD b2, 2*VEC_LEN*SIZE(Y) -+ VLD b3, 3*VEC_LEN*SIZE(Y) -+ -+ -+ addl Y, 16 * SIZE, Y -+ bgt I, $MainLoop -+ .align 4 -+ -+$MainLoopEnd: -+ VMAD a0, b0, s0, s0 -+ VMAD a1, b1, s1, s1 -+ VMAD a2, b2, s2, s2 -+ VMAD a3, b3, s3, s3 -+ -+ VADD s0, s1, t0 -+ VADD s2, s3, t1 -+ nop -+ VADD t0, t1, s0 -+ -+ vextf s0, 1, s1 -+ vextf s0, 2, s2 -+ vextf s0, 3, s3 -+ nop -+ -+ ADD s0, s1, t2 -+ ADD s2, s3, t3 -+ nop -+ ADD t2, t3, s0 -+ -+ .align 4 -+$Remain: -+ and N, 15, I -+ ble I, $End -+ .align 4 -+$Remain_Loop: -+ LD a0, 0 * SIZE(X) -+ addl X, SIZE, X -+ LD b0, 0 * SIZE(Y) -+ addl Y, SIZE, Y -+ -+ MAD a0, b0, s0, s0 -+ subl I, 1, I -+ bgt I, $Remain_Loop -+ .align 4 -+$End: -+ -+ fldd $f2, 0($sp) -+ ldi $sp, 16($sp) -+ ret -+ .align 4 -+ -+/*UnAlign Accessing*/ -+$UnAlign_ACCESS: -+ -+#ifndef DOUBLE -+ srl N, 4, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ LD b2, 2 * SIZE(Y) -+ LD b3, 3 * SIZE(Y) -+ -+ LD a4, 4 * SIZE(X) -+ LD a5, 5 * SIZE(X) -+ LD b4, 4 * SIZE(Y) -+ LD b5, 5 * SIZE(Y) -+ -+ LD a6, 6 * SIZE(X) -+ LD a7, 7 * SIZE(X) -+ addl X, 16 * SIZE, X -+ subl I, 1, I -+ -+ addl Y, 16 * SIZE, Y -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ fillcs PREFETCHSIZE * 2 * SIZE(X) -+ subl I, 1, I -+ fillcs PREFETCHSIZE * 2 * SIZE(Y) -+ addl X, 16 * SIZE, X -+ -+ ADD s0, t0, s0 -+ LD b6, -10 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -9 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ LD a0, -24 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -23 * SIZE(X) -+ -+ ADD s2, t2, s2 -+ LD b0, -8 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, -7 * SIZE(Y) -+ -+ ADD s3, t3, s3 -+ LD a2, -22 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -21 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b2, -6 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, -5 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ LD a4, -20 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -19 * SIZE(X) -+ -+ ADD s2, t2, s2 -+ LD b4, -4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, -3 * SIZE(Y) -+ -+ ADD s3, t3, s3 -+ LD a6, -18 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -17 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ LD a0, -16 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -15 * SIZE(X) -+ -+ ADD s2, t2, s2 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, 1 * SIZE(Y) -+ -+ ADD s3, t3, s3 -+ LD a2, -14 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -13 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b2, 2 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 3 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ LD a4, -12 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -11 * SIZE(X) -+ -+ ADD s2, t2, s2 -+ LD b4, 4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, 5 * SIZE(Y) -+ -+ ADD s3, t3, s3 -+ LD a6, -10 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -9 * SIZE(X) -+ -+ addl Y, 16 * SIZE, Y -+ bgt I, $L12 -+ nop -+ fnop -+ .align 4 -+ -+$L13: -+ ADD s0, t0, s0 -+ LD b6,-10 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -9 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ LD a0, -8 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -7 * SIZE(X) -+ -+ ADD s2, t2, s2 -+ LD b0, -8 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, -7 * SIZE(Y) -+ -+ ADD s3, t3, s3 -+ LD a2, -6 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -5 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b2, -6 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, -5 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ LD a4, -4 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -3 * SIZE(X) -+ -+ ADD s2, t2, s2 -+ LD b4, -4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, -3 * SIZE(Y) -+ -+ ADD s3, t3, s3 -+ LD a6, -2 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ ADD s1, t1, s1 -+ MUL a1, b1, t1 -+ -+ ADD s2, t2, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, s3 -+ MUL a3, b3, t3 -+ -+ ADD s0, t0, s0 -+ MUL a4, b4, t0 -+ ADD s1, t1, s1 -+ MUL a5, b5, t1 -+ ADD s2, t2, s2 -+ MUL a6, b6, t2 -+ ADD s3, t3, s3 -+ MUL a7, b7, t3 -+ .align 4 -+ -+$L15: -+ ADD s0, t0, s0 -+ and N, 15, I -+ ADD s1, t1, s1 -+ ble I, $L18 -+ .align 4 -+ -+#else -+ -+ srl N, 3, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ LD b2, 2 * SIZE(Y) -+ LD b3, 3 * SIZE(Y) -+ -+ LD a4, 4 * SIZE(X) -+ LD a5, 5 * SIZE(X) -+ LD b4, 4 * SIZE(Y) -+ LD b5, 5 * SIZE(Y) -+ -+ LD a6, 6 * SIZE(X) -+ LD a7, 7 * SIZE(X) -+ addl X, 8 * SIZE, X -+ subl I, 1, I -+ -+ addl Y, 8 * SIZE, Y -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ fillcs PREFETCHSIZE * SIZE(X) -+ subl I, 1, I -+ fillcs PREFETCHSIZE * SIZE(Y) -+ addl X, 8 * SIZE, X -+ -+ ADD s0, t0, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ LD a0, -8 * SIZE(X) -+ MUL a1, b1, t1 -+ LD a1, -7 * SIZE(X) -+ -+ ADD s2, t2, s2 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t2 -+ LD b1, 1 * SIZE(Y) -+ -+ ADD s3, t3, s3 -+ LD a2, -6 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, -5 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b2, 2 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 3 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ LD a4, -4 * SIZE(X) -+ MUL a5, b5, t1 -+ LD a5, -3 * SIZE(X) -+ -+ ADD s2, t2, s2 -+ LD b4, 4 * SIZE(Y) -+ MUL a6, b6, t2 -+ LD b5, 5 * SIZE(Y) -+ -+ ADD s3, t3, s3 -+ LD a6, -2 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, -1 * SIZE(X) -+ -+ addl Y, 8 * SIZE, Y -+ bgt I, $L12 -+ nop -+ fnop -+ .align 4 -+ -+$L13: -+ ADD s0, t0, s0 -+ LD b6, -2 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, -1 * SIZE(Y) -+ ADD s1, t1, s1 -+ MUL a1, b1, t1 -+ -+ ADD s2, t2, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, s3 -+ MUL a3, b3, t3 -+ -+ ADD s0, t0, s0 -+ MUL a4, b4, t0 -+ ADD s1, t1, s1 -+ MUL a5, b5, t1 -+ ADD s2, t2, s2 -+ MUL a6, b6, t2 -+ ADD s3, t3, s3 -+ MUL a7, b7, t3 -+ .align 4 -+ -+$L15: -+ ADD s0, t0, s0 -+ and N, 7, I -+ ADD s1, t1, s1 -+ ble I, $L18 -+ .align 4 -+ -+#endif -+ -+$L16: -+ LD a0, 0 * SIZE(X) -+ addl X, SIZE, X -+ LD b0, 0 * SIZE(Y) -+ addl Y, SIZE, Y -+ -+ ADD s2, t2, s2 -+ MUL a0, b0, t2 -+ subl I, 1, I -+ bgt I, $L16 -+ .align 4 -+ -+$L18: -+ ADD s2, t2, s2 -+ ADD s3, t3, s3 -+ br $L999 -+ .align 4 -+ -+$L20: -+ srl N, 2, I -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b0, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b1, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b2, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b3, 0 * SIZE(Y) -+ subl I, 1, I -+ -+ SXADDQ INCY, Y, Y -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ ADD s0, t0, s0 -+ MUL a0, b0, t0 -+ ADD s1, t1, s1 -+ MUL a1, b1, t1 -+ ADD s2, t2, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, s3 -+ MUL a3, b3, t3 -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b0, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b1, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b2, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b3, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ subl I, 1, I -+ bgt I, $L22 -+ nop -+ fnop -+ .align 4 -+ -+$L23: -+ ADD s0, t0, s0 -+ MUL a0, b0, t0 -+ ADD s1, t1, s1 -+ MUL a1, b1, t1 -+ ADD s2, t2, s2 -+ MUL a2, b2, t2 -+ ADD s3, t3, s3 -+ MUL a3, b3, t3 -+ .align 4 -+ -+$L25: -+ ADD s0, t0, s0 -+ and N, 3, I -+ ADD s1, t1, s1 -+ ble I, $L28 -+ .align 4 -+ -+$L26: -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD b0, 0 * SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ MUL a0, b0, t2 -+ subl I, 1, I -+ bgt I, $L26 -+ .align 4 -+ -+$L28: -+ ADD s2, t2, s2 -+ ADD s3, t3, s3 -+ .align 4 -+ -+$L999: -+ ADD s2, s3, s2 -+ fldd $f2, 0($sp) -+ ADD s0, s1, s0 -+ ldi $sp, 16($sp) -+ -+ ADD s0, s2, s0 -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/gemm_beta.S b/kernel/sw_64/gemm_beta.S -new file mode 100644 -index 0000000..d9ea890 ---- /dev/null -+++ b/kernel/sw_64/gemm_beta.S -@@ -0,0 +1,179 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+ .set noat -+ .set noreorder -+.text -+ .align 5 -+ .globl CNAME -+ .ent CNAME -+CNAME: -+ .frame $sp, 0, $26, 0 -+ -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ ldi $28, _mcount -+ jsr $28, ($28), _mcount -+#endif -+ -+ ldl $18, 16($sp) -+ ble $16, $End -+ ldl $19, 24($sp) -+ ble $17, $End -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif -+ -+ fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO) -+ .align 4 -+ -+$BETA_NE_ZERO: -+ sra $16, 3, $2 # i = (m >> 3) -+ mov $18, $1 # c_offset = c -+ ldi $17, -1($17) # j -- -+ ble $2,$L52 -+ .align 4 -+ -+$L51: -+ fillcs 64($1) -+ ldi $2, -1($2) -+ -+ LD $f14, 0*SIZE($1) -+ LD $f15, 1*SIZE($1) -+ LD $f16, 2*SIZE($1) -+ LD $f17, 3*SIZE($1) -+ LD $f18, 4*SIZE($1) -+ LD $f11, 5*SIZE($1) -+ LD $f21, 6*SIZE($1) -+ LD $f22, 7*SIZE($1) -+ -+ MUL $f19, $f14, $f23 -+ MUL $f19, $f15, $f24 -+ MUL $f19, $f16, $f25 -+ MUL $f19, $f17, $f26 -+ MUL $f19, $f18, $f27 -+ MUL $f19, $f11, $f28 -+ MUL $f19, $f21, $f29 -+ MUL $f19, $f22, $f30 -+ -+ ST $f23, 0*SIZE($1) -+ ST $f24, 1*SIZE($1) -+ ST $f25, 2*SIZE($1) -+ ST $f26, 3*SIZE($1) -+ ST $f27, 4*SIZE($1) -+ ST $f28, 5*SIZE($1) -+ ST $f29, 6*SIZE($1) -+ ST $f30, 7*SIZE($1) -+ -+ ldi $1,8*SIZE($1) -+ bgt $2,$L51 -+ .align 4 -+ -+$L52: -+ and $16, 7, $2 -+ ble $2,$L54 -+ .align 4 -+ -+$L53: -+ LD $f12, 0($1) -+ ldi $2, -1($2) -+ MUL $f19, $f12, $f23 -+ ST $f23, 0($1) -+ ldi $1, SIZE($1) -+ bgt $2,$L53 -+ .align 4 -+ -+$L54: -+ SXADDQ $19, $18, $18 # c += ldc -+ bgt $17,$BETA_NE_ZERO -+ clr $0 -+ ret -+ .align 4 -+ -+$BETA_EQ_ZERO: -+ sra $16, 3, $2 # i = (m >> 3) -+ ldi $4, 8*SIZE($18) -+ mov $18, $1 # c_offset = c -+ ldi $17, -1($17) # j -- -+ ble $2,$L42 -+ .align 4 -+ -+$L41: -+ ST $f31, 0*SIZE($1) -+ ST $f31, 1*SIZE($1) -+ ST $f31, 2*SIZE($1) -+ ST $f31, 3*SIZE($1) -+ ST $f31, 4*SIZE($1) -+ ST $f31, 5*SIZE($1) -+ ST $f31, 6*SIZE($1) -+ ST $f31, 7*SIZE($1) -+ ldi $2, -1($2) -+ -+ ldi $4, 8*SIZE($4) -+ ldi $1, 8*SIZE($1) -+ bgt $2,$L41 -+ .align 4 -+ -+$L42: -+ and $16, 7, $2 -+ ble $2,$L44 -+ .align 4 -+ -+$L43: -+ ldi $2, -1($2) -+ ST $f31, 0($1) -+ ldi $1, SIZE($1) -+ bgt $2, $L43 -+ .align 4 -+ -+$L44: -+ SXADDQ $19, $18, $18 # c += ldc -+ bgt $17,$BETA_EQ_ZERO -+ clr $0 -+ .align 4 -+ -+$End: -+ ret -+ .ident VERSION -+ .end CNAME -diff --git a/kernel/sw_64/gemm_kernel_4x4.S b/kernel/sw_64/gemm_kernel_4x4.S -new file mode 100644 -index 0000000..dd17554 ---- /dev/null -+++ b/kernel/sw_64/gemm_kernel_4x4.S -@@ -0,0 +1,3244 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#if !defined(EV4) && !defined(EV5) && !defined(SW6) -+#error "Architecture is not specified." -+#endif -+ -+#ifdef SW6 -+#define PREFETCHSIZE 56 -+#define UNOP unop -+#endif -+ -+#ifdef EV5 -+#define PREFETCHSIZE 56 -+#define UNOP -+#endif -+ -+#ifdef EV4 -+#define UNOP -+#endif -+ -+#define STACKSIZE 96 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $20 -+#define B $21 -+#define C $22 -+#define LDC $23 -+ -+#define C1 $19 -+#define C2 $24 -+#define C3 $25 -+#define C4 $27 -+ -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define alpha $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 -+ -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 -+ -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 -+ -+ -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 -+ -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define BB $3 -+#define OFFSET $4 -+ -+#define tmp $9 -+ -+#define ALPHA 64($sp) -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl C, 0 + STACKSIZE($sp) -+ ldl LDC, 8 + STACKSIZE($sp) -+#ifdef TRMMKERNEL -+ ldl OFFSET, 16 + STACKSIZE($sp) -+#endif -+ -+ SXADDQ LDC, 0, LDC -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ stl $9, 80($sp) -+ fstd $f19, ALPHA -+ -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 -+ -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 -+ -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ subl $31, OFFSET, KK -+#endif -+ -+ sra N, 2, J -+ ble J, $L40 -+ .align 4 -+ -+$L01: -+ mov C, C1 -+ addl C, LDC, C2 -+ mov A, AO -+ s4addl K, 0, BB -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK -+#endif -+ -+ addl C2, LDC, C3 -+ s4addl LDC, C, C -+ -+ SXADDQ BB, B, BB -+ fclr t1 -+ addl C3, LDC, C4 -+ fclr t2 -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L20 -+ .align 4 -+ -+$L11: -+#if defined(EV5) || defined(SW6A) -+ fillcs 0 * SIZE(BB) -+ fillcs 8 * SIZE(BB) -+ unop -+ ldi BB, 16 * SIZE(BB) -+#endif -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 4, TMP1 -+#else -+ addl KK, 4, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(B) -+ fclr c06 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(B) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+#else -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl B, TMP1, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c06 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(TMP1) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(BO) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+#endif -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble L, $L15 -+ .align 5 -+ -+$L12: -+/* 1 */ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD c16, t3,b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD c15, t4,b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ FIMOVD b5, tmp -+/* 2 */ -+ ADD c01, t1,b5 -+ fmov b5, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD c02, t2,b5 -+ fmov b5, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP -+ -+ ADD c06, t3,b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+/* 3 */ -+ ADD c03, t1,b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD c04, t2,b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3,b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4,b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+/* 4 */ -+ ADD c09, t1,b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) -+ -+/* 5 */ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ ldi L, -2(L) -+ IFMOVD tmp, b5 -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a5, t4 -+ unop -+ -+/* 6 */ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a6, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a4, t2 -+ unop -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a5, t4 -+ unop -+ -+/* 7 */ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) -+ -+/* 8 */ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 -+ -+$L15: -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ fldd alpha, ALPHA -+ MUL b1, a1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L18 -+#else -+ blbs TMP1, $L18 -+#endif -+ .align 4 -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, t2 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, t3 -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL b1, a4, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, t3 -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, t4 -+ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, t1 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L18: -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ unop -+ MUL b1, a2, t2 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a1, t4 -+#ifndef TRMMKERNEL -+ LD b5, 1 * SIZE(C1) -+ FIMOVD b5, tmp -+#else -+ unop -+#endif -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL b1, a3, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL b1, a4, t2 -+#ifndef TRMMKERNEL -+ LD b1, 0 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 -+#ifndef TRMMKERNEL -+ LD a1, 0 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+#ifndef TRMMKERNEL -+ LD a2, 2 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, t4 -+#ifndef TRMMKERNEL -+ LD b2, 3 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ ldi I, -1(I) -+ MUL b3, a3, t1 -+ unop -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+#ifndef TRMMKERNEL -+ LD b3, 0 * SIZE(C4) -+#else -+ unop -+#endif -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+#ifndef TRMMKERNEL -+ LD a4, 1 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+#ifndef TRMMKERNEL -+ LD a3, 2 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ unop -+ MUL alpha, c01, b5 -+ fmov b5, c01 -+#ifndef TRMMKERNEL -+ LD b4, 3 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ unop -+ MUL alpha, c02, b5 -+ fmov b5, c02 -+#ifndef TRMMKERNEL -+ LD t1, 1 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL alpha, c03, b5 -+ fmov b5, c03 -+#ifndef TRMMKERNEL -+ LD t2, 2 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL alpha, c04, b5 -+ fmov b5, c04 -+#ifndef TRMMKERNEL -+ LD t3, 3 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ MUL alpha, c05, b5 -+ fmov b5, c05 -+ unop -+#ifndef TRMMKERNEL -+ ADD c01, a5, b5 -+ fmov b5, c01 -+ LD t4, 1 * SIZE(C4) -+#else -+ unop -+ unop -+#endif -+ -+ MUL alpha, c06, b5 -+ fmov b5, c06 -+#ifndef TRMMKERNEL -+ unop -+ IFMOVD tmp, b5 -+ fstd b1, 88($sp) -+# FIMOVD b1, tmp -+ ADD c02, b5, b1 -+ fmov b1, c02 -+ fldd b1, 88($sp) -+# IFMOVD tmp, b1 -+ LD a5, 2 * SIZE(C4) -+#endif -+ -+ MUL alpha, c07, b5 -+ fmov b5, c07 -+#ifndef TRMMKERNEL -+ unop -+ ADD c03, a2, b5 -+ fmov b5, c03 -+ LD b5, 3 * SIZE(C4) -+ FIMOVD b5, tmp -+#endif -+ -+ MUL alpha, c08, b5 -+ fmov b5, c08 -+#ifndef TRMMKERNEL -+ unop -+ ADD c04, b2, b5 -+ fmov b5, c04 -+ unop -+#endif -+ -+ MUL alpha, c09, b5 -+ fmov b5, c09 -+ ST c01, 0 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c05, b1, b5 -+ fmov b5, c05 -+ unop -+#endif -+ -+ MUL alpha, c10, b5 -+ fmov b5, c10 -+ ST c02, 1 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c06, a4, b5 -+ fmov b5, c06 -+ unop -+#endif -+ -+ MUL alpha, c11, b5 -+ fmov b5, c11 -+ ST c03, 2 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c07, a3, b5 -+ fmov b5, c07 -+ unop -+#endif -+ -+ MUL alpha, c12, b5 -+ fmov b5, c12 -+ ST c04, 3 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c08, b4, b5 -+ fmov b5, c08 -+#else -+ unop -+#endif -+ ldi C1, 4 * SIZE(C1) -+ -+ MUL alpha, c13, b5 -+ fmov b5, c13 -+ ST c05, 0 * SIZE(C2) -+#ifndef TRMMKERNEL -+ ADD c09, a1, b5 -+ fmov b5, c09 -+ unop -+#endif -+ -+ MUL alpha, c14, b5 -+ fmov b5, c14 -+ ST c06, 1 * SIZE(C2) -+#ifndef TRMMKERNEL -+ ADD c10, t1, b5 -+ fmov b5, c10 -+ unop -+#endif -+ -+ MUL alpha, c15, b5 -+ fmov b5, c15 -+ ST c07, 2 * SIZE(C2) -+#ifndef TRMMKERNEL -+ ADD c11, t2, b5 -+ fmov b5, c11 -+ unop -+#endif -+ -+ MUL alpha, c16, b5 -+ fmov b5, c16 -+ ST c08, 3 * SIZE(C2) -+#ifndef TRMMKERNEL -+ ADD c12, t3, b5 -+ fmov b5, c12 -+#else -+ unop -+#endif -+ ldi C2, 4 * SIZE(C2) -+ -+#ifndef TRMMKERNEL -+ ADD c13, b3, b5 -+ fmov b5, c13 -+#else -+ unop -+#endif -+ ST c09, 0 * SIZE(C3) -+ fclr t1 -+ ldi C4, 4 * SIZE(C4) -+ -+#ifndef TRMMKERNEL -+ ADD c14, t4, b5 -+ fmov b5, c14 -+#else -+ unop -+#endif -+ ST c10, 1 * SIZE(C3) -+ fclr t2 -+ unop -+ -+#ifndef TRMMKERNEL -+ ADD c15, a5, b5 -+ fmov b5, c15 -+#else -+ unop -+#endif -+ ST c11, 2 * SIZE(C3) -+ fclr t3 -+ unop -+ -+#ifndef TRMMKERNEL -+ IFMOVD tmp, b5 -+# FIMOVD b1, tmp -+ fstd b1, 88($sp) -+ ADD c16, b5, b1 -+ fmov b1, c16 -+ fldd b1, 88($sp) -+# IFMOVD tmp, b1 -+#else -+ unop -+#endif -+ ST c12, 3 * SIZE(C3) -+ fclr t4 -+ ldi C3, 4 * SIZE(C3) -+ -+ ST c13, -4 * SIZE(C4) -+ ST c14, -3 * SIZE(C4) -+ ST c15, -2 * SIZE(C4) -+ ST c16, -1 * SIZE(C4) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 4, TMP1 -+#else -+ subl TMP1, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 4, KK -+#endif -+ -+ bgt I, $L11 -+ .align 4 -+ -+$L20: -+ and M, 2, I -+ ble I, $L30 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 4, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(B) -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c01 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(B) -+ fclr c02 -+ fclr c06 -+ ble L, $L25 -+ -+#else -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c01 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(BO) -+ fclr c02 -+ fclr c06 -+ ble L, $L25 -+#endif -+ .align 4 -+ -+$L22: -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ FIMOVD b5, tmp -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ IFMOVD tmp, b5 -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ IFMOVD tmp, b5 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 -+ -+$L25: -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L28 -+#else -+ blbs TMP1, $L28 -+#endif -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b2, t3 -+ unop -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L28: -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a2, b1, t2 -+#ifndef TRMMKERNEL -+ LD a3, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b2, t3 -+#ifndef TRMMKERNEL -+ LD a4, 1 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ unop -+ MUL a2, b2, t4 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, t1 -+#ifndef TRMMKERNEL -+ LD b5, 1 * SIZE(C2) -+ FIMOVD b5, tmp -+#else -+ unop -+#endif -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, t2 -+#ifndef TRMMKERNEL -+ LD b1, 0 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, t3 -+#ifndef TRMMKERNEL -+ LD b2, 1 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b4, t4 -+#ifndef TRMMKERNEL -+ LD b3, 0 * SIZE(C4) -+#else -+ unop -+#endif -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL alpha, c01, b5 -+ fmov b5, c01 -+#ifndef TRMMKERNEL -+ LD b4, 1 * SIZE(C4) -+#else -+ unop -+#endif -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL alpha, c02, b5 -+ fmov b5, c02 -+ unop -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ MUL alpha, c05, b5 -+ fmov b5, c05 -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ MUL alpha, c06, b5 -+ fmov b5, c06 -+ -+ MUL alpha, c09, b5 -+ fmov b5, c09 -+#ifndef TRMMKERNEL -+ ADD c01, a3, b5 -+ fmov b5, c01 -+#endif -+ MUL alpha, c10, b5 -+ fmov b5, c10 -+#ifndef TRMMKERNEL -+ ADD c02, a4, b5 -+ fmov b5, c02 -+#endif -+ -+ MUL alpha, c13, b5 -+ fmov b5, c13 -+#ifndef TRMMKERNEL -+ ADD c05, a5, b5 -+ fmov b5, c05 -+#endif -+ MUL alpha, c14, b5 -+ fmov b5, c14 -+#ifndef TRMMKERNEL -+ IFMOVD tmp, b5 -+ fstd b1, 88($sp) -+# FIMOVD b1, tmp -+ ADD c06, b5, b1 -+ fmov b1, c06 -+ fldd b1, 88($sp) -+# IFMOVD tmp, b1 -+#endif -+ -+#ifndef TRMMKERNEL -+ ADD c09, b1, b5 -+ fmov b5, c09 -+ unop -+#endif -+ ST c01, 0 * SIZE(C1) -+ fclr t1 -+ -+#ifndef TRMMKERNEL -+ ADD c10, b2, b5 -+ fmov b5, c10 -+ unop -+#endif -+ ST c02, 1 * SIZE(C1) -+ fclr t2 -+ -+#ifndef TRMMKERNEL -+ ADD c13, b3, b5 -+ fmov b5, c13 -+ unop -+#endif -+ ST c05, 0 * SIZE(C2) -+ fclr t3 -+ -+#ifndef TRMMKERNEL -+ ADD c14, b4, b5 -+ fmov b5, c14 -+ unop -+#endif -+ ST c06, 1 * SIZE(C2) -+ fclr t4 -+ -+ ST c09, 0 * SIZE(C3) -+ ldi C1, 2 * SIZE(C1) -+ ST c10, 1 * SIZE(C3) -+ ldi C2, 2 * SIZE(C2) -+ -+ ST c13, 0 * SIZE(C4) -+ ldi C3, 2 * SIZE(C3) -+ ST c14, 1 * SIZE(C4) -+ ldi C4, 2 * SIZE(C4) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 2, KK -+#endif -+ .align 4 -+ -+$L30: -+ and M, 1, I -+ ble I, $L39 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 -+#else -+ addl KK, 4, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ LD b2, 1 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c09 -+ LD b4, 3 * SIZE(B) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(B) -+ ble L, $L35 -+#else -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c09 -+ LD b4, 3 * SIZE(BO) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(BO) -+ ble L, $L35 -+#endif -+ .align 4 -+ -+$L32: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ ldi AO, 2 * SIZE(AO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ LD b5, 3 * SIZE(BO) -+ FIMOVD b5, tmp -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL a1, b4, t4 -+ LD a1, -1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a2, b1, t1 -+ LD b1, 4 * SIZE(BO) -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a2, b2, t2 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ LD b4, -1 * SIZE(BO) -+ MUL a2, b3, t3 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ IFMOVD tmp, b5 -+ MUL a2, b5, t4 -+ LD a2, 0 * SIZE(AO) -+ bgt L, $L32 -+ .align 4 -+ -+$L35: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L38 -+#else -+ blbs TMP1, $L38 -+#endif -+ .align 4 -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ LD b1, 0 * SIZE(BO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL a1, b4, t4 -+ LD a1, 0 * SIZE(AO) -+ ldi AO, 1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L38: -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b2, t2 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ unop -+ MUL a1, b3, t3 -+#ifndef TRMMKERNEL -+ LD b5, 0 * SIZE(C2) -+ FIMOVD b5, tmp -+#else -+ unop -+#endif -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b4, t4 -+#ifndef TRMMKERNEL -+ LD a2, 0 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL alpha, c01, b5 -+ fmov b5, c01 -+#ifndef TRMMKERNEL -+ LD a3, 0 * SIZE(C4) -+#else -+ unop -+#endif -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ unop -+ MUL alpha, c05, b5 -+ fmov b5, c05 -+ unop -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ MUL alpha, c09, b5 -+ fmov b5, c09 -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL alpha, c13, b5 -+ fmov b5, c13 -+ -+#ifndef TRMMKERNEL -+ IFMOVD tmp, b5 -+ fstd b1, 88($sp) -+# FIMOVD b1, tmp -+ ADD c01, a5, b1 -+ fmov b1, c01 -+ ADD c05, b5, b1 -+ fmov b1, c05 -+ ADD c09, a2, b1 -+ fmov b1, c09 -+ ADD c13, a3, b1 -+ fmov b1, c13 -+ fldd b1, 88($sp) -+# IFMOVD tmp, b1 -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c09, 0 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 1, TMP1 -+#else -+ subl TMP1, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 1, KK -+#endif -+ .align 4 -+ -+$L39: -+ mov BO, B -+ ldi J, -1(J) -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addl KK, 4, KK -+#else -+ unop -+#endif -+ bgt J, $L01 -+ .align 4 -+ -+$L40: -+ and N, 2, J -+ ble J, $L80 -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ mov A, AO -+ fclr t1 -+ addl C2, LDC, C -+ fclr t2 -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK -+#endif -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L60 -+ .align 4 -+ -+$L51: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 4, TMP1 -+#else -+ addl KK, 2, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 -+ -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ ldi BO, 2 * SIZE(B) -+ ldi AO, 4 * SIZE(AO) -+ ble L, $L55 -+#else -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ ldi BO, 2 * SIZE(BO) -+ ldi AO, 4 * SIZE(AO) -+ ble L, $L55 -+#endif -+ .align 4 -+ -+$L52: -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ ldi L, -2(L) -+ MUL a2, b1, t2 -+ unop -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b1, t3 -+ unop -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ unop -+ MUL a4, b2, t4 -+ LD a5, 3 * SIZE(AO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b3, t1 -+ LD b2, -1 * SIZE(BO) -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b3, t2 -+ unop -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b4, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L52 -+ .align 4 -+ -+$L55: -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L58 -+#else -+ blbs TMP1, $L58 -+#endif -+ .align 4 -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, t2 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, t3 -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L58: -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b1, t2 -+#ifndef TRMMKERNEL -+ LD c09, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b1, t3 -+#ifndef TRMMKERNEL -+ LD c10, 1 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, t4 -+#ifndef TRMMKERNEL -+ LD c11, 2 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, t1 -+#ifndef TRMMKERNEL -+ LD c12, 3 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, t2 -+#ifndef TRMMKERNEL -+ LD c13, 0 * SIZE(C2) -+ unop -+#endif -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, t3 -+#ifndef TRMMKERNEL -+ LD c14, 1 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ unop -+ MUL a4, b2, t4 -+#ifndef TRMMKERNEL -+ LD c15, 2 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL alpha, c01, b5 -+ fmov b5, c01 -+#ifndef TRMMKERNEL -+ LD c16, 3 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ ldi I, -1(I) -+ MUL alpha, c02, b5 -+ fmov b5, c02 -+ unop -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ MUL alpha, c03, b5 -+ fmov b5, c03 -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ MUL alpha, c04, b5 -+ fmov b5, c04 -+ -+ MUL alpha, c05, b5 -+ fmov b5, c05 -+#ifndef TRMMKERNEL -+ ADD c01, c09, b5 -+ fmov b5, c01 -+#endif -+ MUL alpha, c06, b5 -+ fmov b5, c06 -+#ifndef TRMMKERNEL -+ ADD c02, c10, b5 -+ fmov b5, c02 -+#endif -+ -+ MUL alpha, c07, b5 -+ fmov b5, c07 -+#ifndef TRMMKERNEL -+ ADD c03, c11, b5 -+ fmov b5, c03 -+#endif -+ MUL alpha, c08, b5 -+ fmov b5, c08 -+#ifndef TRMMKERNEL -+ ADD c04, c12, b5 -+ fmov b5, c04 -+#endif -+ -+#ifndef TRMMKERNEL -+ ADD c05, c13, b5 -+ fmov b5, c05 -+#endif -+ ST c01, 0 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c06, c14, b5 -+ fmov b5, c06 -+#endif -+ ST c02, 1 * SIZE(C1) -+ -+#ifndef TRMMKERNEL -+ ADD c07, c15, b5 -+ fmov b5, c07 -+#endif -+ ST c03, 2 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c08, c16, b5 -+ fmov b5, c08 -+#endif -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ fclr t1 -+ ST c06, 1 * SIZE(C2) -+ fclr t2 -+ ST c07, 2 * SIZE(C2) -+ fclr t3 -+ ST c08, 3 * SIZE(C2) -+ fclr t4 -+ -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 4, TMP1 -+#else -+ subl TMP1, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 4, KK -+#endif -+ bgt I, $L51 -+ .align 4 -+ -+$L60: -+ and M, 2, I -+ ble I, $L70 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 2, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(B) -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ ble L, $L65 -+#else -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ ble L, $L65 -+#endif -+ .align 4 -+ -+$L62: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ldi L, -2(L) -+ MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ unop -+ -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L62 -+ .align 4 -+ -+$L65: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L68 -+#else -+ blbs TMP1, $L68 -+#endif -+ .align 4 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi AO, 2 * SIZE(AO) -+ .align 4 -+ -+$L68: -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b1, t2 -+#ifndef TRMMKERNEL -+ LD c09, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b2, t3 -+#ifndef TRMMKERNEL -+ LD c10, 1 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, t4 -+#ifndef TRMMKERNEL -+ LD c11, 0 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL alpha, c01, b5 -+ fmov b5, c01 -+#ifndef TRMMKERNEL -+ LD c12, 1 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi C1, 2 * SIZE(C1) -+ MUL alpha, c02, b5 -+ fmov b5, c02 -+ ldi C2, 2 * SIZE(C2) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ MUL alpha, c05, b5 -+ fmov b5, c05 -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL alpha, c06, b5 -+ fmov b5, c06 -+ -+#ifndef TRMMKERNEL -+ ADD c01, c09, b5 -+ fmov b5, c01 -+ ADD c02, c10, b5 -+ fmov b5, c02 -+ ADD c05, c11, b5 -+ fmov b5, c05 -+ ADD c06, c12, b5 -+ fmov b5, c06 -+#endif -+ -+ ST c01, -2 * SIZE(C1) -+ fclr t1 -+ ST c02, -1 * SIZE(C1) -+ fclr t2 -+ ST c05, -2 * SIZE(C2) -+ fclr t3 -+ ST c06, -1 * SIZE(C2) -+ fclr t4 -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 2, KK -+#endif -+ .align 4 -+ -+$L70: -+ and M, 1, I -+ ble I, $L79 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 -+#else -+ addl KK, 2, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c02 -+ LD b2, 1 * SIZE(B) -+ fclr c06 -+ -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ -+ LD b3, 2 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ ble L, $L75 -+#else -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c02 -+ LD b2, 1 * SIZE(BO) -+ fclr c06 -+ -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ ble L, $L75 -+#endif -+ .align 4 -+ -+$L72: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, t2 -+ LD a1, 1 * SIZE(AO) -+ LD b2, 3 * SIZE(BO) -+ -+ ADD c02, t3, b5 -+ fmov b5, c02 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b3, t3 -+ LD b3, 4 * SIZE(BO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, t4 -+ LD a2, 0 * SIZE(AO) -+ LD b4, 5 * SIZE(BO) -+ -+ ldi BO, 4 * SIZE(BO) -+ unop -+ unop -+ bgt L, $L72 -+ .align 4 -+ -+$L75: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L78 -+#else -+ blbs TMP1, $L78 -+#endif -+ .align 4 -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, t2 -+ LD a1, 0 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L78: -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, t2 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c02, t3, b5 -+ fmov b5, c02 -+ ADD c06, t4, b5 -+ fmov b5, c06 -+#ifndef TRMMKERNEL -+ LD b5, 0 * SIZE(C2) -+ FIMOVD b5, tmp -+#else -+ unop -+#endif -+ -+ ADD c01, c02, b5 -+ fmov b5, c01 -+ ADD c05, c06, b5 -+ fmov b5, c05 -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ -+ MUL alpha, c01, b5 -+ fmov b5, c01 -+ MUL alpha, c05, b5 -+ fmov b5, c05 -+ -+#ifndef TRMMKERNEL -+ IFMOVD tmp ,b5 -+ fstd b1, 88($sp) -+# FIMOVD b1, tmp -+ ADD c01, a5, b1 -+ fmov b1, c01 -+ ADD c05, b5, b1 -+ fmov b1, c05 -+ fldd b1, 88($sp) -+# IFMOVD tmp ,b1 -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 1, TMP1 -+#else -+ subl TMP1, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 1, KK -+#endif -+ .align 4 -+ -+$L79: -+ mov BO, B -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addl KK, 2, KK -+#else -+ unop -+#endif -+ unop -+ unop -+ .align 4 -+ -+$L80: -+ and N, 1, J -+ ble J, $L999 -+ -+ mov C, C1 -+ mov A, AO -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK -+#endif -+ -+ sra M, 2, I -+ ble I, $L100 -+ .align 4 -+ -+$L91: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 4, TMP1 -+#else -+ addl KK, 1, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ mov B, BO -+ unop -+ ble L, $L95 -+#else -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ unop -+ ble L, $L95 -+#endif -+ .align 5 -+ -+$L92: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi L, -1(L) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 8 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 9 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 10 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, t4 -+ LD a4, 11 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, t1 -+ LD a1, 12 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, t2 -+ LD a2, 13 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b3, t3 -+ LD a3, 14 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b3, t4 -+ LD a5, 15 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b4, t1 -+ LD a1, 16 * SIZE(AO) -+ ldi AO, 16 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b4, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L92 -+ .align 4 -+ -+$L95: -+#ifndef TRMMKERNEL -+ and K, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ fldd alpha, ALPHA -+ unop -+ ble L, $L98 -+ .align 4 -+ -+$L96: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 1 * SIZE(BO) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ldi AO, 4 * SIZE(AO) -+ bgt L, $L96 -+ .align 4 -+ -+$L98: -+#ifndef TRMMKERNEL -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD c05, 0 * SIZE(C1) -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ LD c06, 1 * SIZE(C1) -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD c07, 2 * SIZE(C1) -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ LD c08, 3 * SIZE(C1) -+#else -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+#endif -+ -+ MUL alpha, c01, b5 -+ fmov b5, c01 -+ MUL alpha, c02, b5 -+ fmov b5, c02 -+ MUL alpha, c03, b5 -+ fmov b5, c03 -+ MUL alpha, c04, b5 -+ fmov b5, c04 -+ -+#ifndef TRMMKERNEL -+ ADD c01, c05, b5 -+ fmov b5, c01 -+ ADD c02, c06, b5 -+ fmov b5, c02 -+ ADD c03, c07, b5 -+ fmov b5, c03 -+ ADD c04, c08, b5 -+ fmov b5, c04 -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ldi C1, 4 * SIZE(C1) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 4, TMP1 -+#else -+ subl TMP1, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ bgt I, $L91 -+ .align 4 -+ -+$L100: -+ and M, 2, I -+ unop -+ unop -+ ble I, $L110 -+ .align 4 -+ -+$L101: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 1, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ mov B, BO -+ unop -+ ble L, $L105 -+#else -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ unop -+ ble L, $L105 -+#endif -+ .align 5 -+ -+$L102: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ldi BO, 4 * SIZE(BO) -+ MUL a3, b2, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, t4 -+ LD a5, 7 * SIZE(AO) -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, t1 -+ LD a1, 8 * SIZE(AO) -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L102 -+ .align 4 -+ -+$L105: -+#ifndef TRMMKERNEL -+ and K, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ fldd alpha, ALPHA -+#ifndef TRMMKERNEL -+ LD a3, 0 * SIZE(C1) -+ LD a4, 1 * SIZE(C1) -+#endif -+ ble L, $L108 -+ .align 4 -+ -+$L106: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, t2 -+ LD a2, 3 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi AO, 2 * SIZE(AO) -+ unop -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L106 -+ .align 4 -+ -+$L108: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ fclr t1 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ fclr t2 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ fclr t3 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ fclr t4 -+ -+ ADD c01, c03, b5 -+ fmov b5, c01 -+ ADD c02, c04, b5 -+ fmov b5, c02 -+ -+ MUL alpha, c01, b5 -+ fmov b5, c01 -+ MUL alpha, c02, b5 -+ fmov b5, c02 -+ -+#ifndef TRMMKERNEL -+ ADD c01, a3, b5 -+ fmov b5, c01 -+ ADD c02, a4, b5 -+ fmov b5, c02 -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ldi C1, 2 * SIZE(C1) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 2, KK -+#endif -+ .align 4 -+ -+$L110: -+ and M, 1, I -+ ble I, $L999 -+ .align 4 -+ -+$L111: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 -+#else -+ addl KK, 1, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ mov B, BO -+ unop -+ ble L, $L115 -+#else -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ unop -+ ble L, $L115 -+#endif -+ .align 4 -+ -+$L112: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b3, t3 -+ LD a3, 6 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b4, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b4, 7 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 4 * SIZE(AO) -+ ldi BO, 4 * SIZE(BO) -+ bgt L, $L112 -+ .align 4 -+ -+$L115: -+#ifndef TRMMKERNEL -+ and K, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ fldd alpha, ALPHA -+#ifndef TRMMKERNEL -+ LD a2, 0 * SIZE(C1) -+#endif -+ ble L, $L118 -+ .align 4 -+ -+$L116: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, t1 -+ LD a1, 1 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 1 * SIZE(AO) -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L116 -+ .align 4 -+ -+$L118: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+ ADD c01, c02, b5 -+ fmov b5, c01 -+ ADD c03, c04, b5 -+ fmov b5, c03 -+ ADD c01, c03, b5 -+ fmov b5, c01 -+ -+ MUL alpha, c01, b5 -+ fmov b5, c01 -+#ifndef TRMMKERNEL -+ ADD c01, a2, b5 -+ fmov b5, c01 -+#endif -+ ST c01, 0 * SIZE(C1) -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldl $9, 80($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/gemm_kernel_4x4.S.bak b/kernel/sw_64/gemm_kernel_4x4.S.bak -new file mode 100644 -index 0000000..10dc98d ---- /dev/null -+++ b/kernel/sw_64/gemm_kernel_4x4.S.bak -@@ -0,0 +1,2844 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#if !defined(SW2B) -+#error "Architecture is not specified." -+#endif -+ -+#ifdef SW2B -+#define PREFETCHSIZE 56 -+#define UNOP nop -+#endif -+ -+ -+#define STACKSIZE 80 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $20 -+#define B $21 -+#define C $22 -+#define LDC $23 -+ -+#define C1 $19 -+#define C2 $24 -+#define C3 $25 -+#define C4 $27 -+ -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define alpha $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 -+ -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 -+ -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 -+ -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 -+ -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define BB $3 -+#define OFFSET $4 -+ -+#define ALPHA 64($sp) -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl C, 0 + STACKSIZE($sp) -+ ldl LDC, 8 + STACKSIZE($sp) -+#ifdef TRMMKERNEL -+ ldl OFFSET, 16 + STACKSIZE($sp) -+#endif -+ -+ SXADDQ LDC, 0, LDC -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ fstd $f19, ALPHA -+ -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 -+ -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 -+ -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ subl $31, OFFSET, KK -+#endif -+ -+ sra N, 2, J -+ ble J, $L40 -+ .align 4 -+ -+$L01: -+ mov C, C1 -+ addl C, LDC, C2 -+ mov A, AO -+ s4addl K, 0, BB -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK -+#endif -+ -+ addl C2, LDC, C3 -+ s4addl LDC, C, C -+ -+ SXADDQ BB, B, BB -+ fclr t1 -+ addl C3, LDC, C4 -+ fclr t2 -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L20 -+ .align 4 -+ -+$L11: -+#if defined(EV5) || defined(EV6) || defined(SW2B) -+ fillcs 0 * SIZE(BB) -+ fillcs 8 * SIZE(BB) -+ unop -+ ldi BB, 16 * SIZE(BB) -+#endif -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 4, TMP1 -+#else -+ addl KK, 4, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(B) -+ fclr c06 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(B) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+#else -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl B, TMP1, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c06 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(TMP1) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(BO) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+#endif -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble L, $L15 -+ .align 5 -+ -+$L12: -+/* 1 */ -+ ADD c11, t1, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif -+ -+ ADD c12, t2, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD c15, t4, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ -+/* 2 */ -+ ADD c01, t1, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD c02, t2, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP -+ -+ ADD c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+/* 3 */ -+ ADD c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD c04, t2, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+/* 4 */ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) -+ -+ ADD c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) -+ -+/* 5 */ -+ ADD c11, t1, c11 -+ unop -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c12, t2, c12 -+ ldi L, -2(L) -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD c15, t4, c15 -+ unop -+ MUL b2, a5, t4 -+ unop -+ -+/* 6 */ -+ ADD c01, t1, c01 -+ unop -+ MUL b5, a6, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ unop -+ MUL b5, a4, t2 -+ unop -+ -+ ADD c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, c05 -+ unop -+ MUL b4, a5, t4 -+ unop -+ -+/* 7 */ -+ ADD c03, t1, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop -+ -+ ADD c04, t2, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) -+ -+/* 8 */ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 -+ -+$L15: -+ ADD c11, t1, c11 -+ fldd alpha, ALPHA -+ MUL b1, a1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L18 -+#else -+ blbs TMP1, $L18 -+#endif -+ .align 4 -+ -+ ADD c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD c16, t3, c16 -+ MUL b2, a2, t3 -+ -+ ADD c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD c01, t1, c01 -+ MUL b1, a3, t1 -+ -+ ADD c02, t2, c02 -+ unop -+ MUL b1, a4, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c06, t3, c06 -+ MUL b2, a4, t3 -+ ADD c05, t4, c05 -+ MUL b4, a1, t4 -+ -+ ADD c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c04, t2, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a3, t1 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c11, t1, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L18: -+ ADD c12, t2, c12 -+ unop -+ MUL b1, a2, t2 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD c15, t4, c15 -+ unop -+ MUL b2, a1, t4 -+#ifndef TRMMKERNEL -+ LD b5, 1 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c01, t1, c01 -+ unop -+ MUL b1, a3, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ unop -+ MUL b1, a4, t2 -+#ifndef TRMMKERNEL -+ LD b1, 0 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+ ADD c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD c04, t2, c04 -+ unop -+ MUL b3, a2, t2 -+#ifndef TRMMKERNEL -+ LD a1, 0 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+#ifndef TRMMKERNEL -+ LD a2, 2 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+#ifndef TRMMKERNEL -+ LD b2, 3 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c09, t1, c09 -+ ldi I, -1(I) -+ MUL b3, a3, t1 -+ unop -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+#ifndef TRMMKERNEL -+ LD b3, 0 * SIZE(C4) -+#else -+ unop -+#endif -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+#ifndef TRMMKERNEL -+ LD a4, 1 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+#ifndef TRMMKERNEL -+ LD a3, 2 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c11, t1, c11 -+ unop -+ MUL alpha, c01, c01 -+#ifndef TRMMKERNEL -+ LD b4, 3 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c12, t2, c12 -+ unop -+ MUL alpha, c02, c02 -+#ifndef TRMMKERNEL -+ LD t1, 1 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c16, t3, c16 -+ unop -+ MUL alpha, c03, c03 -+#ifndef TRMMKERNEL -+ LD t2, 2 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c15, t4, c15 -+ unop -+ MUL alpha, c04, c04 -+#ifndef TRMMKERNEL -+ LD t3, 3 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ MUL alpha, c05, c05 -+ unop -+#ifndef TRMMKERNEL -+ ADD c01, a5, c01 -+ LD t4, 1 * SIZE(C4) -+#else -+ unop -+ unop -+#endif -+ -+ MUL alpha, c06, c06 -+#ifndef TRMMKERNEL -+ unop -+ ADD c02, b5, c02 -+ LD a5, 2 * SIZE(C4) -+#endif -+ -+ MUL alpha, c07, c07 -+#ifndef TRMMKERNEL -+ unop -+ ADD c03, a2, c03 -+ LD b5, 3 * SIZE(C4) -+#endif -+ -+ MUL alpha, c08, c08 -+#ifndef TRMMKERNEL -+ unop -+ ADD c04, b2, c04 -+ unop -+#endif -+ -+ MUL alpha, c09, c09 -+ ST c01, 0 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c05, b1, c05 -+ unop -+#endif -+ -+ MUL alpha, c10, c10 -+ ST c02, 1 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c06, a4, c06 -+ unop -+#endif -+ -+ MUL alpha, c11, c11 -+ ST c03, 2 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c07, a3, c07 -+ unop -+#endif -+ -+ MUL alpha, c12, c12 -+ ST c04, 3 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c08, b4, c08 -+#else -+ unop -+#endif -+ ldi C1, 4 * SIZE(C1) -+ -+ MUL alpha, c13, c13 -+ ST c05, 0 * SIZE(C2) -+#ifndef TRMMKERNEL -+ ADD c09, a1, c09 -+ unop -+#endif -+ -+ MUL alpha, c14, c14 -+ ST c06, 1 * SIZE(C2) -+#ifndef TRMMKERNEL -+ ADD c10, t1, c10 -+ unop -+#endif -+ -+ MUL alpha, c15, c15 -+ ST c07, 2 * SIZE(C2) -+#ifndef TRMMKERNEL -+ ADD c11, t2, c11 -+ unop -+#endif -+ -+ MUL alpha, c16, c16 -+ ST c08, 3 * SIZE(C2) -+#ifndef TRMMKERNEL -+ ADD c12, t3, c12 -+#else -+ unop -+#endif -+ ldi C2, 4 * SIZE(C2) -+ -+#ifndef TRMMKERNEL -+ ADD c13, b3, c13 -+#else -+ unop -+#endif -+ ST c09, 0 * SIZE(C3) -+ fclr t1 -+ ldi C4, 4 * SIZE(C4) -+ -+#ifndef TRMMKERNEL -+ ADD c14, t4, c14 -+#else -+ unop -+#endif -+ ST c10, 1 * SIZE(C3) -+ fclr t2 -+ unop -+ -+#ifndef TRMMKERNEL -+ ADD c15, a5, c15 -+#else -+ unop -+#endif -+ ST c11, 2 * SIZE(C3) -+ fclr t3 -+ unop -+ -+#ifndef TRMMKERNEL -+ ADD c16, b5, c16 -+#else -+ unop -+#endif -+ ST c12, 3 * SIZE(C3) -+ fclr t4 -+ ldi C3, 4 * SIZE(C3) -+ -+ ST c13, -4 * SIZE(C4) -+ ST c14, -3 * SIZE(C4) -+ ST c15, -2 * SIZE(C4) -+ ST c16, -1 * SIZE(C4) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 4, TMP1 -+#else -+ subl TMP1, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 4, KK -+#endif -+ -+ bgt I, $L11 -+ .align 4 -+ -+$L20: -+ and M, 2, I -+ ble I, $L30 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 4, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(B) -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c01 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(B) -+ fclr c02 -+ fclr c06 -+ ble L, $L25 -+ -+#else -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c01 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(BO) -+ fclr c02 -+ fclr c06 -+ ble L, $L25 -+#endif -+ .align 4 -+ -+$L22: -+ ADD c09, t1, c09 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ -+ ADD c09, t1, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c14, t4, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 -+ -+$L25: -+ ADD c09, t1, c09 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L28 -+#else -+ blbs TMP1, $L28 -+#endif -+ -+ ADD c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ unop -+ -+ ADD c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c09, t1, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L28: -+ ADD c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+#ifndef TRMMKERNEL -+ LD a3, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+#ifndef TRMMKERNEL -+ LD a4, 1 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+#ifndef TRMMKERNEL -+ LD b5, 1 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+#ifndef TRMMKERNEL -+ LD b1, 0 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+#ifndef TRMMKERNEL -+ LD b2, 1 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b4, t4 -+#ifndef TRMMKERNEL -+ LD b3, 0 * SIZE(C4) -+#else -+ unop -+#endif -+ -+ ADD c09, t1, c09 -+ unop -+ MUL alpha, c01, c01 -+#ifndef TRMMKERNEL -+ LD b4, 1 * SIZE(C4) -+#else -+ unop -+#endif -+ -+ ADD c10, t2, c10 -+ unop -+ MUL alpha, c02, c02 -+ unop -+ -+ ADD c13, t3, c13 -+ MUL alpha, c05, c05 -+ ADD c14, t4, c14 -+ MUL alpha, c06, c06 -+ -+ MUL alpha, c09, c09 -+#ifndef TRMMKERNEL -+ ADD c01, a3, c01 -+#endif -+ MUL alpha, c10, c10 -+#ifndef TRMMKERNEL -+ ADD c02, a4, c02 -+#endif -+ -+ MUL alpha, c13, c13 -+#ifndef TRMMKERNEL -+ ADD c05, a5, c05 -+#endif -+ MUL alpha, c14, c14 -+#ifndef TRMMKERNEL -+ ADD c06, b5, c06 -+#endif -+ -+#ifndef TRMMKERNEL -+ ADD c09, b1, c09 -+ unop -+#endif -+ ST c01, 0 * SIZE(C1) -+ fclr t1 -+ -+#ifndef TRMMKERNEL -+ ADD c10, b2, c10 -+ unop -+#endif -+ ST c02, 1 * SIZE(C1) -+ fclr t2 -+ -+#ifndef TRMMKERNEL -+ ADD c13, b3, c13 -+ unop -+#endif -+ ST c05, 0 * SIZE(C2) -+ fclr t3 -+ -+#ifndef TRMMKERNEL -+ ADD c14, b4, c14 -+ unop -+#endif -+ ST c06, 1 * SIZE(C2) -+ fclr t4 -+ -+ ST c09, 0 * SIZE(C3) -+ ldi C1, 2 * SIZE(C1) -+ ST c10, 1 * SIZE(C3) -+ ldi C2, 2 * SIZE(C2) -+ -+ ST c13, 0 * SIZE(C4) -+ ldi C3, 2 * SIZE(C3) -+ ST c14, 1 * SIZE(C4) -+ ldi C4, 2 * SIZE(C4) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 2, KK -+#endif -+ .align 4 -+ -+$L30: -+ and M, 1, I -+ ble I, $L39 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 -+#else -+ addl KK, 4, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ LD b2, 1 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c09 -+ LD b4, 3 * SIZE(B) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(B) -+ ble L, $L35 -+#else -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c09 -+ LD b4, 3 * SIZE(BO) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(BO) -+ ble L, $L35 -+#endif -+ .align 4 -+ -+$L32: -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t2, c05 -+ ldi AO, 2 * SIZE(AO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, c09 -+ LD b5, 3 * SIZE(BO) -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, c13 -+ MUL a1, b4, t4 -+ LD a1, -1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ MUL a2, b1, t1 -+ LD b1, 4 * SIZE(BO) -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c05, t2, c05 -+ MUL a2, b2, t2 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c09, t3, c09 -+ LD b4, -1 * SIZE(BO) -+ MUL a2, b3, t3 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c13, t4, c13 -+ MUL a2, b5, t4 -+ LD a2, 0 * SIZE(AO) -+ bgt L, $L32 -+ .align 4 -+ -+$L35: -+ ADD c01, t1, c01 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L38 -+#else -+ blbs TMP1, $L38 -+#endif -+ .align 4 -+ -+ ADD c05, t2, c05 -+ LD b1, 0 * SIZE(BO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, c09 -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, c13 -+ MUL a1, b4, t4 -+ LD a1, 0 * SIZE(AO) -+ ldi AO, 1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L38: -+ ADD c05, t2, c05 -+ unop -+ MUL a1, b2, t2 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c09, t3, c09 -+ unop -+ MUL a1, b3, t3 -+#ifndef TRMMKERNEL -+ LD b5, 0 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c13, t4, c13 -+ unop -+ MUL a1, b4, t4 -+#ifndef TRMMKERNEL -+ LD a2, 0 * SIZE(C3) -+#else -+ unop -+#endif -+ -+ ADD c01, t1, c01 -+ unop -+ MUL alpha, c01, c01 -+#ifndef TRMMKERNEL -+ LD a3, 0 * SIZE(C4) -+#else -+ unop -+#endif -+ -+ ADD c05, t2, c05 -+ unop -+ MUL alpha, c05, c05 -+ unop -+ -+ ADD c09, t3, c09 -+ MUL alpha, c09, c09 -+ ADD c13, t4, c13 -+ MUL alpha, c13, c13 -+ -+#ifndef TRMMKERNEL -+ ADD c01, a5, c01 -+ ADD c05, b5, c05 -+ ADD c09, a2, c09 -+ ADD c13, a3, c13 -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c09, 0 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 1, TMP1 -+#else -+ subl TMP1, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 1, KK -+#endif -+ .align 4 -+ -+$L39: -+ mov BO, B -+ ldi J, -1(J) -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addl KK, 4, KK -+#else -+ unop -+#endif -+ bgt J, $L01 -+ .align 4 -+ -+$L40: -+ and N, 2, J -+ ble J, $L80 -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ mov A, AO -+ fclr t1 -+ addl C2, LDC, C -+ fclr t2 -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK -+#endif -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L60 -+ .align 4 -+ -+$L51: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 4, TMP1 -+#else -+ addl KK, 2, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 -+ -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ ldi BO, 2 * SIZE(B) -+ ldi AO, 4 * SIZE(AO) -+ ble L, $L55 -+#else -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ ldi BO, 2 * SIZE(BO) -+ ldi AO, 4 * SIZE(AO) -+ ble L, $L55 -+#endif -+ .align 4 -+ -+$L52: -+ ADD c05, t1, c05 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c06, t2, c06 -+ ldi L, -2(L) -+ MUL a2, b1, t2 -+ unop -+ -+ ADD c07, t3, c07 -+ unop -+ MUL a3, b1, t3 -+ unop -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ unop -+ MUL a4, b2, t4 -+ LD a5, 3 * SIZE(AO) -+ -+ ADD c05, t1, c05 -+ unop -+ MUL a1, b3, t1 -+ LD b2, -1 * SIZE(BO) -+ -+ ADD c06, t2, c06 -+ unop -+ MUL a2, b3, t2 -+ unop -+ -+ ADD c07, t3, c07 -+ unop -+ MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b4, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L52 -+ .align 4 -+ -+$L55: -+ ADD c05, t1, c05 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L58 -+#else -+ blbs TMP1, $L58 -+#endif -+ .align 4 -+ -+ ADD c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD c07, t3, c07 -+ MUL a3, b1, t3 -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c05, t1, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L58: -+ ADD c06, t2, c06 -+ unop -+ MUL a2, b1, t2 -+#ifndef TRMMKERNEL -+ LD c09, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c07, t3, c07 -+ unop -+ MUL a3, b1, t3 -+#ifndef TRMMKERNEL -+ LD c10, 1 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a4, b1, t4 -+#ifndef TRMMKERNEL -+ LD c11, 2 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+#ifndef TRMMKERNEL -+ LD c12, 3 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+#ifndef TRMMKERNEL -+ LD c13, 0 * SIZE(C2) -+ unop -+#endif -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+#ifndef TRMMKERNEL -+ LD c14, 1 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c04, t4, c04 -+ unop -+ MUL a4, b2, t4 -+#ifndef TRMMKERNEL -+ LD c15, 2 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c05, t1, c05 -+ unop -+ MUL alpha, c01, c01 -+#ifndef TRMMKERNEL -+ LD c16, 3 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c06, t2, c06 -+ ldi I, -1(I) -+ MUL alpha, c02, c02 -+ unop -+ -+ ADD c07, t3, c07 -+ MUL alpha, c03, c03 -+ ADD c08, t4, c08 -+ MUL alpha, c04, c04 -+ -+ MUL alpha, c05, c05 -+#ifndef TRMMKERNEL -+ ADD c01, c09, c01 -+#endif -+ MUL alpha, c06, c06 -+#ifndef TRMMKERNEL -+ ADD c02, c10, c02 -+#endif -+ -+ MUL alpha, c07, c07 -+#ifndef TRMMKERNEL -+ ADD c03, c11, c03 -+#endif -+ MUL alpha, c08, c08 -+#ifndef TRMMKERNEL -+ ADD c04, c12, c04 -+#endif -+ -+#ifndef TRMMKERNEL -+ ADD c05, c13, c05 -+#endif -+ ST c01, 0 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c06, c14, c06 -+#endif -+ ST c02, 1 * SIZE(C1) -+ -+#ifndef TRMMKERNEL -+ ADD c07, c15, c07 -+#endif -+ ST c03, 2 * SIZE(C1) -+#ifndef TRMMKERNEL -+ ADD c08, c16, c08 -+#endif -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ fclr t1 -+ ST c06, 1 * SIZE(C2) -+ fclr t2 -+ ST c07, 2 * SIZE(C2) -+ fclr t3 -+ ST c08, 3 * SIZE(C2) -+ fclr t4 -+ -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 4, TMP1 -+#else -+ subl TMP1, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 4, KK -+#endif -+ bgt I, $L51 -+ .align 4 -+ -+$L60: -+ and M, 2, I -+ ble I, $L70 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 2, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(B) -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ ble L, $L65 -+#else -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ ble L, $L65 -+#endif -+ .align 4 -+ -+$L62: -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ ldi L, -2(L) -+ MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ unop -+ -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L62 -+ .align 4 -+ -+$L65: -+ ADD c01, t1, c01 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L68 -+#else -+ blbs TMP1, $L68 -+#endif -+ .align 4 -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi AO, 2 * SIZE(AO) -+ .align 4 -+ -+$L68: -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b1, t2 -+#ifndef TRMMKERNEL -+ LD c09, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a1, b2, t3 -+#ifndef TRMMKERNEL -+ LD c10, 1 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+#ifndef TRMMKERNEL -+ LD c11, 0 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c01, t1, c01 -+ unop -+ MUL alpha, c01, c01 -+#ifndef TRMMKERNEL -+ LD c12, 1 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c02, t2, c02 -+ ldi C1, 2 * SIZE(C1) -+ MUL alpha, c02, c02 -+ ldi C2, 2 * SIZE(C2) -+ -+ ADD c05, t3, c05 -+ MUL alpha, c05, c05 -+ ADD c06, t4, c06 -+ MUL alpha, c06, c06 -+ -+#ifndef TRMMKERNEL -+ ADD c01, c09, c01 -+ ADD c02, c10, c02 -+ ADD c05, c11, c05 -+ ADD c06, c12, c06 -+#endif -+ -+ ST c01, -2 * SIZE(C1) -+ fclr t1 -+ ST c02, -1 * SIZE(C1) -+ fclr t2 -+ ST c05, -2 * SIZE(C2) -+ fclr t3 -+ ST c06, -1 * SIZE(C2) -+ fclr t4 -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 2, KK -+#endif -+ .align 4 -+ -+$L70: -+ and M, 1, I -+ ble I, $L79 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 -+#else -+ addl KK, 2, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c02 -+ LD b2, 1 * SIZE(B) -+ fclr c06 -+ -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ -+ LD b3, 2 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ ble L, $L75 -+#else -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c02 -+ LD b2, 1 * SIZE(BO) -+ fclr c06 -+ -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ ble L, $L75 -+#endif -+ .align 4 -+ -+$L72: -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ LD a1, 1 * SIZE(AO) -+ LD b2, 3 * SIZE(BO) -+ -+ ADD c02, t3, c02 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b3, t3 -+ LD b3, 4 * SIZE(BO) -+ -+ ADD c06, t4, c06 -+ MUL a2, b4, t4 -+ LD a2, 0 * SIZE(AO) -+ LD b4, 5 * SIZE(BO) -+ -+ ldi BO, 4 * SIZE(BO) -+ unop -+ unop -+ bgt L, $L72 -+ .align 4 -+ -+$L75: -+ ADD c01, t1, c01 -+ fldd alpha, ALPHA -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L78 -+#else -+ blbs TMP1, $L78 -+#endif -+ .align 4 -+ -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ LD a1, 0 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L78: -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C1) -+#else -+ unop -+#endif -+ -+ ADD c02, t3, c02 -+ ADD c06, t4, c06 -+#ifndef TRMMKERNEL -+ LD b5, 0 * SIZE(C2) -+#else -+ unop -+#endif -+ -+ ADD c01, c02, c01 -+ ADD c05, c06, c05 -+ -+ ADD c01, t1, c01 -+ ADD c05, t2, c05 -+ -+ MUL alpha, c01, c01 -+ MUL alpha, c05, c05 -+ -+#ifndef TRMMKERNEL -+ ADD c01, a5, c01 -+ ADD c05, b5, c05 -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 1, TMP1 -+#else -+ subl TMP1, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 1, KK -+#endif -+ .align 4 -+ -+$L79: -+ mov BO, B -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addl KK, 2, KK -+#else -+ unop -+#endif -+ unop -+ unop -+ .align 4 -+ -+$L80: -+ and N, 1, J -+ ble J, $L999 -+ -+ mov C, C1 -+ mov A, AO -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK -+#endif -+ -+ sra M, 2, I -+ ble I, $L100 -+ .align 4 -+ -+$L91: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 4, TMP1 -+#else -+ addl KK, 1, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ mov B, BO -+ unop -+ ble L, $L95 -+#else -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ unop -+ ble L, $L95 -+#endif -+ .align 5 -+ -+$L92: -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi L, -1(L) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 8 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 9 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 10 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 11 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ LD a1, 12 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD a2, 13 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b3, t3 -+ LD a3, 14 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b3, t4 -+ LD a5, 15 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ MUL a1, b4, t1 -+ LD a1, 16 * SIZE(AO) -+ ldi AO, 16 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b4, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L92 -+ .align 4 -+ -+$L95: -+#ifndef TRMMKERNEL -+ and K, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ fldd alpha, ALPHA -+ unop -+ ble L, $L98 -+ .align 4 -+ -+$L96: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 1 * SIZE(BO) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ldi AO, 4 * SIZE(AO) -+ bgt L, $L96 -+ .align 4 -+ -+$L98: -+#ifndef TRMMKERNEL -+ ADD c01, t1, c01 -+ LD c05, 0 * SIZE(C1) -+ ADD c02, t2, c02 -+ LD c06, 1 * SIZE(C1) -+ ADD c03, t3, c03 -+ LD c07, 2 * SIZE(C1) -+ ADD c04, t4, c04 -+ LD c08, 3 * SIZE(C1) -+#else -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+#endif -+ -+ MUL alpha, c01, c01 -+ MUL alpha, c02, c02 -+ MUL alpha, c03, c03 -+ MUL alpha, c04, c04 -+ -+#ifndef TRMMKERNEL -+ ADD c01, c05, c01 -+ ADD c02, c06, c02 -+ ADD c03, c07, c03 -+ ADD c04, c08, c04 -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ldi C1, 4 * SIZE(C1) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 4, TMP1 -+#else -+ subl TMP1, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ bgt I, $L91 -+ .align 4 -+ -+$L100: -+ and M, 2, I -+ unop -+ unop -+ ble I, $L110 -+ .align 4 -+ -+$L101: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 1, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ mov B, BO -+ unop -+ ble L, $L105 -+#else -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ unop -+ ble L, $L105 -+#endif -+ .align 5 -+ -+$L102: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c03, t3, c03 -+ ldi BO, 4 * SIZE(BO) -+ MUL a3, b2, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a5, 7 * SIZE(AO) -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ MUL a1, b3, t1 -+ LD a1, 8 * SIZE(AO) -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L102 -+ .align 4 -+ -+$L105: -+#ifndef TRMMKERNEL -+ and K, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ fldd alpha, ALPHA -+#ifndef TRMMKERNEL -+ LD a3, 0 * SIZE(C1) -+ LD a4, 1 * SIZE(C1) -+#endif -+ ble L, $L108 -+ .align 4 -+ -+$L106: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ LD a2, 3 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi AO, 2 * SIZE(AO) -+ unop -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L106 -+ .align 4 -+ -+$L108: -+ ADD c01, t1, c01 -+ fclr t1 -+ ADD c02, t2, c02 -+ fclr t2 -+ ADD c03, t3, c03 -+ fclr t3 -+ ADD c04, t4, c04 -+ fclr t4 -+ -+ ADD c01, c03, c01 -+ ADD c02, c04, c02 -+ -+ MUL alpha, c01, c01 -+ MUL alpha, c02, c02 -+ -+#ifndef TRMMKERNEL -+ ADD c01, a3, c01 -+ ADD c02, a4, c02 -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ldi C1, 2 * SIZE(C1) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 2, KK -+#endif -+ .align 4 -+ -+$L110: -+ and M, 1, I -+ ble I, $L999 -+ .align 4 -+ -+$L111: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 -+#else -+ addl KK, 1, TMP1 -+#endif -+#endif -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ mov B, BO -+ unop -+ ble L, $L115 -+#else -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+#ifndef TRMMKERNEL -+ sra K, 2, L -+#else -+ sra TMP1, 2, L -+#endif -+ unop -+ ble L, $L115 -+#endif -+ .align 4 -+ -+$L112: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b2, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c03, t3, c03 -+ MUL a3, b3, t3 -+ LD a3, 6 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b4, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b4, 7 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 4 * SIZE(AO) -+ ldi BO, 4 * SIZE(BO) -+ bgt L, $L112 -+ .align 4 -+ -+$L115: -+#ifndef TRMMKERNEL -+ and K, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ fldd alpha, ALPHA -+#ifndef TRMMKERNEL -+ LD a2, 0 * SIZE(C1) -+#endif -+ ble L, $L118 -+ .align 4 -+ -+$L116: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+ LD a1, 1 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 1 * SIZE(AO) -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L116 -+ .align 4 -+ -+$L118: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+ ADD c01, c02, c01 -+ ADD c03, c04, c03 -+ ADD c01, c03, c01 -+ -+ MUL alpha, c01, c01 -+#ifndef TRMMKERNEL -+ ADD c01, a2, c01 -+#endif -+ ST c01, 0 * SIZE(C1) -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/gemm_kernel_simd_16x4.S b/kernel/sw_64/gemm_kernel_simd_16x4.S -new file mode 100644 -index 0000000..1acf679 ---- /dev/null -+++ b/kernel/sw_64/gemm_kernel_simd_16x4.S -@@ -0,0 +1,4054 @@ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#if !defined(SW2B) -+#error "Architecture is not specified." -+#endif -+ -+ -+#define STACKSIZE 336 -+ -+#define CO $1 -+#define C1 $2 -+#define C2 $3 -+#define C3 $4 -+ -+#define LDM $5 -+ -+#define PREB $7 -+#define SPANA $8 -+#define SPANB $9 -+#define NC1 $10 -+#define KC1 $11 -+#define MC1 $12 -+#define PREA $13 -+ -+#define A $20 -+#define B $21 -+#define C $19 -+#define MC $16 -+#define NC $17 -+#define KC $18 -+ -+#define A1 $22 -+#define B1 $23 -+ -+#define ALPHA $f8 -+ -+#define a0 $f0 -+#define a4 $f1 -+#define a8 $f2 -+#define a12 $f3 -+ -+#define b0 $f4 -+#define b1 $f5 -+#define b2 $f6 -+#define b3 $f7 -+ -+#define na0 $f0 -+#define na4 $f8 -+#define na8 $f9 -+#define na12 $f10 -+ -+#define nb0 $f11 -+#define nb1 $f12 -+#define nb2 $f13 -+#define nb3 $f14 -+ -+#define t00 $f15 -+#define t01 $f16 -+#define t02 $f17 -+#define t03 $f18 -+#define t04 $f19 -+#define t05 $f20 -+#define t06 $f21 -+#define t07 $f22 -+#define t08 $f23 -+#define t09 $f24 -+#define t10 $f25 -+#define t11 $f26 -+#define t12 $f27 -+#define t13 $f28 -+#define t14 $f29 -+#define t15 $f30 -+ -+#define c00 $f1 -+#define c01 $f2 -+#define c02 $f3 -+#define c03 $f4 -+ -+#define c04 $f5 -+#define c05 $f6 -+#define c06 $f7 -+#define c07 $f9 -+ -+#define c08 $f10 -+#define c09 $f11 -+#define c10 $f12 -+#define c11 $f13 -+ -+#define c12 $f1 -+#define c13 $f2 -+#define c14 $f3 -+#define c15 $f4 -+ -+#if defined(TRMMKERNEL) -+#define TEMP $14 -+#define KK $24 -+#define OFFSET $25 -+#endif -+ -+ PROLOGUE -+ PROFCODE -+ -+.frame $30,STACKSIZE,$26,0 -+ldi $sp,-STACKSIZE($sp) # # [2] -+ -+ stl $9,328($sp) # Integer Saved Register -+ stl $10,320($sp) -+ stl $11,312($sp) -+ stl $12,304($sp) -+ stl $13,296($sp) -+ stl $14,288($sp) -+ -+ -+ ST $f2,280($sp) # Float Saved Register -+ ST $f3,272($sp) -+ ST $f4,264($sp) -+ ST $f5,256($sp) -+ ST $f6,248($sp) -+ ST $f7,240($sp) -+ ST $f8,232($sp) -+ ST $f9,224($sp) -+ -+ -+ -+ .align 5 -+ -+$Begin_NC_Unroll4: -+ ldl C, 0 + STACKSIZE($sp) # load C -+ ldl LDM, 8 + STACKSIZE($sp) # load ldm -+ -+#ifdef TRMMKERNEL -+ ldl OFFSET, 16 + STACKSIZE($sp) # load offset -+ nop -+#endif -+ -+ ST $f19, 192($sp) # store alpha -+ SXADDQ LDM, 0, LDM # ldm*X+0 -+ -+ mov NC, NC1 # backup nc -+ mov KC, KC1 # backup kc -+ mov MC, MC1 # backup mc -+ -+ mov B, B1 # backup the initial address of b -+ sra NC1,2,NC # NC=NC1/4 Unroll N 4 -+ -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ subl $31, OFFSET, KK # when trmm at right -+ nop -+#endif -+ -+ mov A, A1 # backup the initial address of a -+ sll KC1,1+BASE_SHIFT,SPANB # kc*2nr -+ -+ sll KC1,4+BASE_SHIFT,SPANA # kc*16mr -+ beq NC,$Begin_NC_Unroll2 -+ -+ -+ .align 5 -+ -+.L0: -+ sra MC1,4,MC # MC=MC1/16 -+ mov C, CO # compute c pointer -+ -+ addl B1,SPANB,PREB # prefetch B -+ addl A1,SPANA,PREA # prefetch A -+ -+ addl C, LDM, C1 -+ addl C1,LDM, C2 -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET,KK # Reset the left offset -+ nop -+#endif -+ -+ subl PREA,16*SIZE,PREA # prea=kc1*mc-mc -+ addl C2,LDM, C3 -+ -+ s4addl LDM,C,C # C=ldm*4+C -+ beq MC,.L15 # MC=0:MC1<16 -+ -+ -+ .align 5 # nr=4,mr=4----------------------------- -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B # LL && RU reset B -+ nop -+#else -+ sll KK, 4 + BASE_SHIFT, KC # KK*16 -+ sll KK, 2 + BASE_SHIFT, TEMP # KK*4 -+ -+ addl A, KC, A # mov A point to the data part -+ addl B1,TEMP,B # mov B point to the data part -+#endif -+ -+ vcpys $f31,$f31,t00 # CLEAR Results Register -+ fillcs 0(CO) # prefetch C -+ fillcs 0(C1) -+ -+ vcpys $f31,$f31,t01 # 64 results -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+ vcpys $f31,$f31,t02 -+ LDDE b0,0*SIZE(B) -+ LDDE b1,1*SIZE(B) -+ -+ vcpys $f31,$f31,t03 -+ LDDE b2,2*SIZE(B) -+ LDDE b3,3*SIZE(B) -+ -+ vcpys $f31,$f31,t04 -+ fillcs 4(CO) # prefetch C -+ fillcs 4(C1) -+ -+ vcpys $f31,$f31,t05 -+ fillcs 4(C2) -+ fillcs 4(C3) -+ -+ vcpys $f31,$f31,t06 -+ VLD a0, 0*SIZE(A) -+ VLD a4, 4*SIZE(A) -+ -+ vcpys $f31,$f31,t07 -+ VLD a8, 8*SIZE(A) -+ VLD a12,12*SIZE(A) -+ -+ vcpys $f31,$f31,t08 -+ fillcs 8*SIZE(CO) -+ fillcs 8*SIZE(C1) -+ -+ vcpys $f31,$f31,t09 -+ fillcs 8*SIZE(C2) -+ fillcs 8*SIZE(C3) -+ -+ vcpys $f31,$f31,t10 -+ fillcs 12*SIZE(CO) -+ fillcs 12*SIZE(C1) -+ -+ vcpys $f31,$f31,t11 -+ fillcs 12*SIZE(C2) -+ fillcs 12*SIZE(C3) -+ -+ vcpys $f31,$f31,t12 -+ vcpys $f31,$f31,t13 -+ vcpys $f31,$f31,t14 -+ vcpys $f31,$f31,t15 -+ -+ -+#if (defined(LEFT) && !defined(TRANSA)) \ -+ ||(!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP # temp is the length of data part -+#elif defined(LEFT) -+ addl KK, 16, TEMP # mr=16 -+#else -+ addl KK, 4, TEMP # right nr=4 -+#endif -+ sra TEMP, 1, KC # KC=TEMP/2 -+ -+ nop -+ beq KC, $Rest_16x4x1 -+ -+#else -+ -+ vcpys $f31,$f31,t00 # CLEAR Results Register -+ mov B1,B # Reset B -+ sra KC1,1,KC # Unroll Kr=2, KC=KC1/2 -+ -+ vcpys $f31,$f31,t01 # 64 results -+ fillcs 0(CO) # prefetch C -+ fillcs 0(C1) -+ -+ vcpys $f31,$f31,t02 -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+ vcpys $f31,$f31,t03 -+ LDDE b0,0*SIZE(B) -+ LDDE b1,1*SIZE(B) -+ -+ vcpys $f31,$f31,t04 -+ LDDE b2,2*SIZE(B) -+ LDDE b3,3*SIZE(B) -+ -+ vcpys $f31,$f31,t05 -+ fillcs 4(CO) # prefetch C -+ fillcs 4(C1) -+ -+ vcpys $f31,$f31,t06 -+ fillcs 4(C2) -+ fillcs 4(C3) -+ -+ vcpys $f31,$f31,t07 -+ VLD a0, 0*SIZE(A) -+ VLD a4, 4*SIZE(A) -+ -+ vcpys $f31,$f31,t08 -+ VLD a8, 8*SIZE(A) -+ VLD a12,12*SIZE(A) -+ -+ vcpys $f31,$f31,t09 -+ fillcs 8(CO) # prefetch C -+ fillcs 8(C1) -+ -+ vcpys $f31,$f31,t10 -+ fillcs 8(C2) -+ fillcs 8(C3) -+ -+ vcpys $f31,$f31,t11 -+ fillcs 12*SIZE(CO) -+ fillcs 12*SIZE(C1) -+ -+ vcpys $f31,$f31,t12 -+ fillcs 12*SIZE(C2) -+ fillcs 12*SIZE(C3) -+ -+ vcpys $f31,$f31,t13 -+ vcpys $f31,$f31,t14 -+ -+ vcpys $f31,$f31,t15 -+ beq KC,$Rest_16x4x1 # KC1<2 goto $Rest_16x4x1 -+ -+#endif -+ -+ .align 5 -+ -+$Panel_16x4x2: # nr=4,mr=4,kr=2------------------------ -+ -+ VMAD a0,b0,t00,t00 -+ addl A,16*SIZE,A # 16a*1k -+ LDDE nb0,4*SIZE(B) # get next 4b -+ -+ VMAD a0,b1,t04,t04 -+ LDDE nb1,5*SIZE(B) -+ -+ VMAD a4,b0,t01,t01 -+ VLD na12,12*SIZE(A) -+ -+ VMAD a4,b1,t05,t05 -+ VLD na8,8*SIZE(A) -+ -+ VMAD a0,b2,t08,t08 -+ LDDE nb2,6*SIZE(B) -+ -+ VMAD a0,b3,t12,t12 -+ LDDE nb3,7*SIZE(B) -+ -+ VMAD a8,b0,t02,t02 -+ VMAD a8,b1,t06,t06 -+ -+ VMAD a4,b2,t09,t09 -+ addl B,8*SIZE,B # 4b*2k -+ VLD na0,0*SIZE(A) # carefule na0=a0 use the same register -+ -+ VMAD a4,b3,t13,t13 -+ VLD na4,4*SIZE(A) # get next 16a -+ -+ VMAD a12,b0,t03,t03 -+ VMAD a12,b1,t07,t07 -+ -+ VMAD a8,b2,t10,t10 -+ fillcs 0(PREB) -+ -+ VMAD a8,b3,t14,t14 -+ fillcs 0(PREA) -+ -+ VMAD a12,b2,t11,t11 -+ fillcs 8*SIZE(PREA) -+ -+ VMAD a12,b3,t15,t15 -+ subl KC,1,KC # loop k -- -+ -+ -+ VMAD na12,nb0,t03,t03 -+ addl A,16*SIZE,A # ### next k ### -+ LDDE b0,0(B) # get 3rd 4b -+ -+ VMAD na12,nb1,t07,t07 -+ LDDE b1,1*SIZE(B) -+ -+ VMAD na8,nb0,t02,t02 -+ VLD a12,12*SIZE(A) -+ -+ VMAD na8,nb1,t06,t06 -+ VLD a8,8*SIZE(A) -+ -+ VMAD na0,nb0,t00,t00 -+ subl PREA,16*SIZE,PREA # prea-=16 -+ LDDE b2,2*SIZE(B) -+ -+ VMAD na0,nb1,t04,t04 -+ LDDE b3,3*SIZE(B) -+ -+ VMAD na12,nb2,t11,t11 -+ VMAD na12,nb3,t15,t15 -+ VMAD na8,nb2,t10,t10 -+ VMAD na8,nb3,t14,t14 -+ -+ VMAD na0,nb2,t08,t08 -+ fillcs 0(PREA) -+ -+ VMAD na0,nb3,t12,t12 -+ fillcs 4*SIZE(PREB) -+ -+ VMAD na4,nb0,t01,t01 -+ VLD a0,0(A) # get 3rd 16a -+ -+ VMAD na4,nb1,t05,t05 -+ VLD a4,4*SIZE(A) -+ -+ VMAD na4,nb2,t09,t09 -+ fillcs 8*SIZE(PREA) -+ addl PREB,8*SIZE,PREB # preb+=8 -+ -+ VMAD na4,nb3,t13,t13 -+ subl PREA,16*SIZE,PREA # prea-=16 -+ bne KC,$Panel_16x4x2 -+ -+ -+$Rest_16x4x1: -+ LDDE ALPHA, 192($sp) # get alpha -+#ifndef TRMMKERNEL -+ blbc KC1, $Write_16x4 -+#else -+ blbc TEMP,$Write_16x4 -+#endif -+ -+ VMAD a0,b0,t00,t00 -+ addl A,16*SIZE,A # 16a*1k -+ -+ VMAD a0,b1,t04,t04 -+ addl B,4*SIZE,B # 4b*1k -+ -+ VMAD a0,b2,t08,t08 -+ VMAD a0,b3,t12,t12 -+ -+ -+ VMAD a4,b0,t01,t01 -+ VMAD a4,b1,t05,t05 -+ VMAD a4,b2,t09,t09 -+ VMAD a4,b3,t13,t13 -+ -+ VMAD a8,b0,t02,t02 -+ VMAD a8,b1,t06,t06 -+ VMAD a8,b2,t10,t10 -+ VMAD a8,b3,t14,t14 -+ -+ VMAD a12,b0,t03,t03 -+ VMAD a12,b1,t07,t07 -+ VMAD a12,b2,t11,t11 -+ VMAD a12,b3,t15,t15 -+ -+ -+ .align 5 -+ -+$Write_16x4: -+ -+#ifndef TRMMKERNEL -+ and CO, (VEC_LEN*SIZE-1), $6 ### gemm part #### -+ bne $6, $UnAlign_CO_Access_16x4 -+ -+$Align_CO_Access_16x4: -+ VLD c00,0(CO) -+ VLD c01,4*SIZE(CO) -+ VLD c02,8*SIZE(CO) -+ VLD c03,12*SIZE(CO) -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ VMAD t02,ALPHA,c02,t02 -+ VMAD t03,ALPHA,c03,t03 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ VST t02,8*SIZE(CO) -+ VST t03,12*SIZE(CO) -+ jmp $Access_C1_16x4 -+ -+$UnAlign_CO_Access_16x4: -+ VLD_UL c00, 0*VEC_LEN*SIZE(CO) -+ VLD_UH c04, 1*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c01, 1*VEC_LEN*SIZE(CO) -+ VLD_UH c05, 2*VEC_LEN*SIZE(CO) -+ -+ vbisw c00,c04,c00 -+ VLD_UL c02, 2*VEC_LEN*SIZE(CO) -+ VLD_UH c06, 3*VEC_LEN*SIZE(CO) -+ -+ vbisw c01,c05,c01 -+ VLD_UL c03, 3*VEC_LEN*SIZE(CO) -+ VLD_UH c07, 4*VEC_LEN*SIZE(CO) -+ -+ vbisw c02,c06,c02 -+ vbisw c03,c07,c03 -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ -+ VMAD t02,ALPHA,c02,t02 -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VMAD t03,ALPHA,c03,t03 -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ VST_UL t02, 2*VEC_LEN*SIZE(CO) -+ VST_UH t02, 3*VEC_LEN*SIZE(CO) -+ -+ VST_UL t03, 3*VEC_LEN*SIZE(CO) -+ VST_UH t03, 4*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_16x4: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_C1_Access_16x4 -+ -+$Align_C1_Access_16x4: -+ VLD c04,0(C1) -+ VLD c05,4*SIZE(C1) -+ VLD c06,8*SIZE(C1) -+ VLD c07,12*SIZE(C1) -+ -+ VMAD t04,ALPHA,c04,t04 -+ VMAD t05,ALPHA,c05,t05 -+ VMAD t06,ALPHA,c06,t06 -+ VMAD t07,ALPHA,c07,t07 -+ -+ VST t04,0(C1) -+ VST t05,4*SIZE(C1) -+ VST t06,8*SIZE(C1) -+ VST t07,12*SIZE(C1) -+ jmp $Access_C2_16x4 -+ -+$UnAlign_C1_Access_16x4: -+ VLD_UL c04, 0*VEC_LEN*SIZE(C1) -+ VLD_UH t00, 1*VEC_LEN*SIZE(C1) -+ -+ VLD_UL c05, 1*VEC_LEN*SIZE(C1) -+ VLD_UH t01, 2*VEC_LEN*SIZE(C1) -+ -+ vbisw c04,t00,c04 -+ VLD_UL c06, 2*VEC_LEN*SIZE(C1) -+ VLD_UH t02, 3*VEC_LEN*SIZE(C1) -+ -+ vbisw c05,t01,c05 -+ VLD_UL c07, 3*VEC_LEN*SIZE(C1) -+ VLD_UH t03, 4*VEC_LEN*SIZE(C1) -+ -+ vbisw c06,t02,c06 -+ vbisw c07,t03,c07 -+ -+ VMAD t04,ALPHA,c04,t04 -+ VMAD t05,ALPHA,c05,t05 -+ -+ VMAD t06,ALPHA,c06,t06 -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ VMAD t07,ALPHA,c07,t07 -+ VST_UL t05, 1*VEC_LEN*SIZE(C1) -+ VST_UH t05, 2*VEC_LEN*SIZE(C1) -+ -+ VST_UL t06, 2*VEC_LEN*SIZE(C1) -+ VST_UH t06, 3*VEC_LEN*SIZE(C1) -+ -+ VST_UL t07, 3*VEC_LEN*SIZE(C1) -+ VST_UH t07, 4*VEC_LEN*SIZE(C1) -+ -+ -+$Access_C2_16x4: -+ and C2, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_C2_Access_16x4 -+ -+ $Align_C2_Access_16x4: -+ VLD c08,0(C2) -+ VLD c09,4*SIZE(C2) -+ VLD c10,8*SIZE(C2) -+ VLD c11,12*SIZE(C2) -+ -+ VMAD t08,ALPHA,c08,t08 -+ VMAD t09,ALPHA,c09,t09 -+ VMAD t10,ALPHA,c10,t10 -+ VMAD t11,ALPHA,c11,t11 -+ -+ VST t08,0(C2) -+ VST t09,4*SIZE(C2) -+ VST t10,8*SIZE(C2) -+ VST t11,12*SIZE(C2) -+ jmp $Access_C3_16x4 -+ -+$UnAlign_C2_Access_16x4: -+ VLD_UL c08, 0*VEC_LEN*SIZE(C2) -+ VLD_UH t00, 1*VEC_LEN*SIZE(C2) -+ -+ VLD_UL c09, 1*VEC_LEN*SIZE(C2) -+ VLD_UH t01, 2*VEC_LEN*SIZE(C2) -+ -+ vbisw c08,t00,c08 -+ VLD_UL c10, 2*VEC_LEN*SIZE(C2) -+ VLD_UH t02, 3*VEC_LEN*SIZE(C2) -+ -+ vbisw c09,t01,c09 -+ VLD_UL c11, 3*VEC_LEN*SIZE(C2) -+ VLD_UH t03, 4*VEC_LEN*SIZE(C2) -+ -+ vbisw c10,t02,c10 -+ vbisw c11,t03,c11 -+ -+ VMAD t08,ALPHA,c08,t08 -+ VMAD t09,ALPHA,c09,t09 -+ -+ VMAD t10,ALPHA,c10,t10 -+ VST_UL t08, 0*VEC_LEN*SIZE(C2) -+ VST_UH t08, 1*VEC_LEN*SIZE(C2) -+ -+ VMAD t11,ALPHA,c11,t11 -+ VST_UL t09, 1*VEC_LEN*SIZE(C2) -+ VST_UH t09, 2*VEC_LEN*SIZE(C2) -+ -+ VST_UL t10, 2*VEC_LEN*SIZE(C2) -+ VST_UH t10, 3*VEC_LEN*SIZE(C2) -+ -+ VST_UL t11, 3*VEC_LEN*SIZE(C2) -+ VST_UH t11, 4*VEC_LEN*SIZE(C2) -+ -+ -+$Access_C3_16x4: -+ and C3, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_C3_Access_16x4 -+ -+$Align_C3_Access_16x4: -+ VLD c12,0(C3) -+ VLD c13,4*SIZE(C3) -+ VLD c14,8*SIZE(C3) -+ VLD c15,12*SIZE(C3) -+ -+ VMAD t12,ALPHA,c12,t12 -+ VMAD t13,ALPHA,c13,t13 -+ VMAD t14,ALPHA,c14,t14 -+ VMAD t15,ALPHA,c15,t15 -+ -+ VST t12,0(C3) -+ VST t13,4*SIZE(C3) -+ VST t14,8*SIZE(C3) -+ VST t15,12*SIZE(C3) -+ jmp $End_NC_Unroll4 -+ -+$UnAlign_C3_Access_16x4: -+ VLD_UL c12, 0*VEC_LEN*SIZE(C3) -+ VLD_UH t04, 1*VEC_LEN*SIZE(C3) -+ -+ VLD_UL c13, 1*VEC_LEN*SIZE(C3) -+ VLD_UH t05, 2*VEC_LEN*SIZE(C3) -+ -+ vbisw c12,t04,c12 -+ VLD_UL c14, 2*VEC_LEN*SIZE(C3) -+ VLD_UH t06, 3*VEC_LEN*SIZE(C3) -+ -+ vbisw c13,t05,c13 -+ VLD_UL c15, 3*VEC_LEN*SIZE(C3) -+ VLD_UH t07, 4*VEC_LEN*SIZE(C3) -+ -+ vbisw c14,t06,c14 -+ vbisw c15,t07,c15 -+ -+ VMAD t12,ALPHA,c12,t12 -+ VMAD t13,ALPHA,c13,t13 -+ -+ VMAD t14,ALPHA,c14,t14 -+ VST_UL t12, 0*VEC_LEN*SIZE(C3) -+ VST_UH t12, 1*VEC_LEN*SIZE(C3) -+ -+ VMAD t15,ALPHA,c15,t15 -+ VST_UL t13, 1*VEC_LEN*SIZE(C3) -+ VST_UH t13, 2*VEC_LEN*SIZE(C3) -+ -+ VST_UL t14, 2*VEC_LEN*SIZE(C3) -+ VST_UH t14, 3*VEC_LEN*SIZE(C3) -+ -+ VST_UL t15, 3*VEC_LEN*SIZE(C3) -+ VST_UH t15, 4*VEC_LEN*SIZE(C3) -+ jmp $End_NC_Unroll4 -+ -+#else -+ and CO, (VEC_LEN*SIZE-1),$6 ### trmm part ### -+ bne $6,$UnAlign_CO_Access_16x4 -+ -+$Align_CO_Access_16x4: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ VMUL t02,ALPHA,t02 -+ VMUL t03,ALPHA,t03 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ VST t02,8*SIZE(CO) -+ VST t03,12*SIZE(CO) -+ jmp $Access_C1_16x4 -+ -+$UnAlign_CO_Access_16x4: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ -+ VMUL t02,ALPHA,t02 -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VMUL t03,ALPHA,t03 -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ VST_UL t02, 2*VEC_LEN*SIZE(CO) -+ VST_UH t02, 3*VEC_LEN*SIZE(CO) -+ -+ VST_UL t03, 3*VEC_LEN*SIZE(CO) -+ VST_UH t03, 4*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_16x4: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_C1_Access_16x4 -+ -+$Align_C1_Access_16x4: -+ VMUL t04,ALPHA,t04 -+ VMUL t05,ALPHA,t05 -+ VMUL t06,ALPHA,t06 -+ VMUL t07,ALPHA,t07 -+ -+ VST t04,0(C1) -+ VST t05,4*SIZE(C1) -+ VST t06,8*SIZE(C1) -+ VST t07,12*SIZE(C1) -+ jmp $Access_C2_16x4 -+ -+$UnAlign_C1_Access_16x4: -+ VMUL t04,ALPHA,t04 -+ VMUL t05,ALPHA,t05 -+ -+ VMUL t06,ALPHA,t06 -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ VMUL t07,ALPHA,t07 -+ VST_UL t05, 1*VEC_LEN*SIZE(C1) -+ VST_UH t05, 2*VEC_LEN*SIZE(C1) -+ -+ VST_UL t06, 2*VEC_LEN*SIZE(C1) -+ VST_UH t06, 3*VEC_LEN*SIZE(C1) -+ -+ VST_UL t07, 3*VEC_LEN*SIZE(C1) -+ VST_UH t07, 4*VEC_LEN*SIZE(C1) -+ -+ -+$Access_C2_16x4: -+ and C2, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_C2_Access_16x4 -+ -+$Align_C2_Access_16x4: -+ VMUL t08,ALPHA,t08 -+ VMUL t09,ALPHA,t09 -+ VMUL t10,ALPHA,t10 -+ VMUL t11,ALPHA,t11 -+ -+ VST t08,0(C2) -+ VST t09,4*SIZE(C2) -+ VST t10,8*SIZE(C2) -+ VST t11,12*SIZE(C2) -+ jmp $Access_C3_16x4 -+ -+$UnAlign_C2_Access_16x4: -+ VMUL t08,ALPHA,t08 -+ VMUL t09,ALPHA,t09 -+ -+ VMUL t10,ALPHA,t10 -+ VST_UL t08, 0*VEC_LEN*SIZE(C2) -+ VST_UH t08, 1*VEC_LEN*SIZE(C2) -+ -+ VMUL t11,ALPHA,t11 -+ VST_UL t09, 1*VEC_LEN*SIZE(C2) -+ VST_UH t09, 2*VEC_LEN*SIZE(C2) -+ -+ VST_UL t10, 2*VEC_LEN*SIZE(C2) -+ VST_UH t10, 3*VEC_LEN*SIZE(C2) -+ -+ VST_UL t11, 3*VEC_LEN*SIZE(C2) -+ VST_UH t11, 4*VEC_LEN*SIZE(C2) -+ -+ -+$Access_C3_16x4: -+ and C3, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_C3_Access_16x4 -+ -+$Align_C3_Access_16x4: -+ VMUL t12,ALPHA,t12 -+ VMUL t13,ALPHA,t13 -+ VMUL t14,ALPHA,t14 -+ VMUL t15,ALPHA,t15 -+ -+ VST t12,0(C3) -+ VST t13,4*SIZE(C3) -+ VST t14,8*SIZE(C3) -+ VST t15,12*SIZE(C3) -+ jmp $TRMMKERNEL_16x4 -+ -+$UnAlign_C3_Access_16x4: -+ VMUL t12,ALPHA,t12 -+ VMUL t13,ALPHA,t13 -+ -+ VMUL t14,ALPHA,t14 -+ VST_UL t12, 0*VEC_LEN*SIZE(C3) -+ VST_UH t12, 1*VEC_LEN*SIZE(C3) -+ -+ VMUL t15,ALPHA,t15 -+ VST_UL t13, 1*VEC_LEN*SIZE(C3) -+ VST_UH t13, 2*VEC_LEN*SIZE(C3) -+ -+ VST_UL t14, 2*VEC_LEN*SIZE(C3) -+ VST_UH t14, 3*VEC_LEN*SIZE(C3) -+ -+ VST_UL t15, 3*VEC_LEN*SIZE(C3) -+ VST_UH t15, 4*VEC_LEN*SIZE(C3) -+ -+ -+$TRMMKERNEL_16x4: -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP # nodata length -+#ifdef LEFT -+ subl TEMP, 16, TEMP # mr=16 -+#else -+ subl TEMP, 4, TEMP # nr=4 -+#endif -+ -+ sll TEMP, 4 + BASE_SHIFT,KC # mr=16 -+ sll TEMP, 2 + BASE_SHIFT,TEMP # nr=4 -+ -+ addl A, KC, A # mov A to the end of this panel -+ addl B, TEMP,B # mov B to the end of this panel -+#endif -+ -+#ifdef LEFT -+ addl KK, 16 ,KK -+#endif -+ nop -+ jmp $End_NC_Unroll4 -+#endif -+ -+ -+ .align 5 -+ -+.L15: # n=4,m=8----------------------------- -+ and MC1,8,MC -+ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc -+ nop -+ beq MC,.L16 -+ -+ addl A1,SPANA,PREA -+ subl PREA,8*SIZE,PREA # PREA-=MC -+ -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA))\ -+ || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B # set B -+ nop -+#else -+ sll KK, 3 + BASE_SHIFT,KC # mr=8 -+ sll KK, 2 + BASE_SHIFT,TEMP # nr=4 -+ -+ addl A,KC,A -+ addl B1,TEMP,B -+#endif -+ -+ vcpys $f31,$f31,t00 # clear (32 results) -+ vcpys $f31,$f31,t01 -+ vcpys $f31,$f31,t04 -+ vcpys $f31,$f31,t05 -+ -+ LDDE b0,0(B) -+ LDDE b1,1*SIZE(B) -+ LDDE b2,2*SIZE(B) -+ LDDE b3,3*SIZE(B) -+ -+ vcpys $f31,$f31,t08 -+ vcpys $f31,$f31,t09 -+ vcpys $f31,$f31,t12 -+ vcpys $f31,$f31,t13 -+ -+ VLD a0,0(A) # get 8 A -+ VLD a4,4*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+ fillcs 4*SIZE(CO) # -+ fillcs 4*SIZE(C1) -+ fillcs 4*SIZE(C2) -+ fillcs 4*SIZE(C3) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP # temp is the length of the data part -+#elif defined(LEFT) -+ addl KK, 8, TEMP # mr=8 -+#else -+ addl KK, 4, TEMP # nr=4 -+#endif -+ sra TEMP,1, KC # kc/2 -+ beq KC,$Rest_8x4x1 -+ -+#else -+ -+ mov B1,B # Reset B -+ sra KC1,1,KC # unroll kc as 2, kc=kc1/2 -+ vcpys $f31,$f31,t00 # clear (32 results) -+ vcpys $f31,$f31,t01 -+ vcpys $f31,$f31,t04 -+ vcpys $f31,$f31,t05 -+ -+ LDDE b0,0(B) -+ LDDE b1,1*SIZE(B) -+ LDDE b2,2*SIZE(B) -+ LDDE b3,3*SIZE(B) -+ -+ vcpys $f31,$f31,t08 -+ vcpys $f31,$f31,t09 -+ vcpys $f31,$f31,t12 -+ vcpys $f31,$f31,t13 -+ -+ VLD a0,0(A) # get 8 A -+ VLD a4,4*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+ fillcs 4*SIZE(CO) # -+ fillcs 4*SIZE(C1) -+ fillcs 4*SIZE(C2) -+ fillcs 4*SIZE(C3) -+ -+ beq KC,$Rest_8x4x1 -+#endif -+ -+ .align 5 -+ -+$Panel_8x4x2: -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ VMAD a0,b2,t08,t08 -+ VMAD a0,b3,t12,t12 -+ -+ LDDE nb0,4*SIZE(B) # get next 4b -+ LDDE nb1,5*SIZE(B) -+ LDDE nb2,6*SIZE(B) -+ LDDE nb3,7*SIZE(B) -+ -+ addl B,8*SIZE,B # 4n*2k -+ VMAD a4,b0,t01,t01 -+ VMAD a4,b1,t05,t05 -+ VMAD a4,b2,t09,t09 -+ VMAD a4,b3,t13,t13 -+ -+ VLD na8,8*SIZE(A) # get next 8a -+ VLD na12,12*SIZE(A) -+ -+ fillcs 0(PREA) -+ fillcs 4*SIZE(PREA) -+ subl PREA,8*SIZE,PREA # prea -= 8 -+ -+ subl KC,1,KC -+ addl A,16*SIZE,A # ### next k ###8m*2k -+ VMAD na8,nb0,t00,t00 -+ VMAD na8,nb1,t04,t04 -+ VMAD na8,nb2,t08,t08 -+ VMAD na8,nb3,t12,t12 -+ -+ LDDE b0,0(B) # get 3rd 4b -+ LDDE b1,1*SIZE(B) -+ LDDE b2,2*SIZE(B) -+ LDDE b3,3*SIZE(B) -+ -+ VMAD na12,nb0,t01,t01 -+ VMAD na12,nb1,t05,t05 -+ VMAD na12,nb2,t09,t09 -+ VMAD na12,nb3,t13,t13 -+ -+ VLD a0,0(A) # get 3rd 8a -+ VLD a4,4*SIZE(A) -+ -+ fillcs 0(PREA) -+ fillcs 4*SIZE(PREA) -+ subl PREA,8*SIZE,PREA # prea -= mc -+ bne KC,$Panel_8x4x2 # loop k-- -+ -+$Rest_8x4x1: -+ LDDE ALPHA, 192($sp) # get alpha -+#ifndef TRMMKERNEL -+ blbc KC1, $Write_8x4 -+#else -+ blbc TEMP, $Write_8x4 -+#endif -+ -+ addl A,8*SIZE,A # 8a*1k -+ addl B,4*SIZE,B # 4b*1K -+ -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ VMAD a0,b2,t08,t08 -+ VMAD a0,b3,t12,t12 -+ -+ fillcs 0(PREA) -+ fillcs 4*SIZE(PREA) -+ subl PREA,8*SIZE,PREA -+ -+ VMAD a4,b0,t01,t01 -+ VMAD a4,b1,t05,t05 -+ VMAD a4,b2,t09,t09 -+ VMAD a4,b3,t13,t13 -+ -+$Write_8x4: -+ -+#ifndef TRMMKERNEL -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_8x4 -+ -+$Align_CO_Access_8x4: -+ VLD c00,0(CO) # get 1st colum of 16c -+ VLD c01,4*SIZE(CO) -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ jmp $Access_C1_8x4 -+ -+$UnAlign_CO_Access_8x4: -+ VLD_UL c00, 0*VEC_LEN*SIZE(CO) -+ VLD_UH c02, 1*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c01, 1*VEC_LEN*SIZE(CO) -+ VLD_UH c03, 2*VEC_LEN*SIZE(CO) -+ -+ vbisw c00,c02,c00 -+ vbisw c01,c03,c01 -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_8x4: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ addl CO,8*SIZE,CO -+ nop -+ bne $6,$UnAlign_C1_Access_8x4 -+ -+$Align_C1_Access_8x4: -+ VLD c04,0(C1) -+ VLD c05,4*SIZE(C1) -+ -+ VMAD t04,ALPHA,c04,t04 -+ VMAD t05,ALPHA,c05,t05 -+ -+ VST t04,0(C1) -+ VST t05,4*SIZE(C1) -+ jmp $Access_C2_8x4 -+ -+$UnAlign_C1_Access_8x4: -+ VLD_UL c04, 0*VEC_LEN*SIZE(C1) -+ VLD_UH c06, 1*VEC_LEN*SIZE(C1) -+ -+ VLD_UL c05, 1*VEC_LEN*SIZE(C1) -+ VLD_UH c07, 2*VEC_LEN*SIZE(C1) -+ -+ vbisw c04,c06,c04 -+ vbisw c05,c07,c05 -+ -+ VMAD t04,ALPHA,c04,t04 -+ VMAD t05,ALPHA,c05,t05 -+ -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ VST_UL t05, 1*VEC_LEN*SIZE(C1) -+ VST_UH t05, 2*VEC_LEN*SIZE(C1) -+ -+ -+$Access_C2_8x4: -+ and C2, (VEC_LEN*SIZE-1),$6 -+ addl C1,8*SIZE,C1 -+ nop -+ bne $6,$UnAlign_C2_Access_8x4 -+ -+$Align_C2_Access_8x4: -+ VLD c08,0(C2) -+ VLD c09,4*SIZE(C2) -+ -+ VMAD t08,ALPHA,c08,t08 -+ VMAD t09,ALPHA,c09,t09 -+ -+ VST t08,0(C2) -+ VST t09,4*SIZE(C2) -+ jmp $Access_C3_8x4 -+ -+$UnAlign_C2_Access_8x4: -+ VLD_UL c08, 0*VEC_LEN*SIZE(C2) -+ VLD_UH c10, 1*VEC_LEN*SIZE(C2) -+ -+ VLD_UL c09, 1*VEC_LEN*SIZE(C2) -+ VLD_UH c11, 2*VEC_LEN*SIZE(C2) -+ -+ vbisw c08,c10,c08 -+ vbisw c09,c11,c09 -+ -+ VMAD t08,ALPHA,c08,t08 -+ VMAD t09,ALPHA,c09,t09 -+ -+ VST_UL t08, 0*VEC_LEN*SIZE(C2) -+ VST_UH t08, 1*VEC_LEN*SIZE(C2) -+ -+ VST_UL t09, 1*VEC_LEN*SIZE(C2) -+ VST_UH t09, 2*VEC_LEN*SIZE(C2) -+ -+ -+$Access_C3_8x4: -+ and C3, (VEC_LEN*SIZE-1),$6 -+ addl C2,8*SIZE,C2 -+ nop -+ bne $6,$UnAlign_C3_Access_8x4 -+ -+$Align_C3_Access_8x4: -+ VLD c12,0(C3) -+ VLD c13,4*SIZE(C3) -+ -+ VMAD t12,ALPHA,c12,t12 -+ VMAD t13,ALPHA,c13,t13 -+ -+ VST t12,0(C3) -+ VST t13,4*SIZE(C3) -+ addl C3,8*SIZE,C3 -+ jmp .L16 -+ -+ -+$UnAlign_C3_Access_8x4: -+ VLD_UL c12, 0*VEC_LEN*SIZE(C3) -+ VLD_UH c14, 1*VEC_LEN*SIZE(C3) -+ -+ VLD_UL c13, 1*VEC_LEN*SIZE(C3) -+ VLD_UH c15, 2*VEC_LEN*SIZE(C3) -+ -+ vbisw c12,c14,c12 -+ vbisw c13,c15,c13 -+ -+ VMAD t12,ALPHA,c12,t12 -+ VMAD t13,ALPHA,c13,t13 -+ -+ VST_UL t12, 0*VEC_LEN*SIZE(C3) -+ VST_UH t12, 1*VEC_LEN*SIZE(C3) -+ -+ VST_UL t13, 1*VEC_LEN*SIZE(C3) -+ VST_UH t13, 2*VEC_LEN*SIZE(C3) -+ addl C3,8*SIZE,C3 -+ -+#else -+ -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_8x4 -+ -+$Align_CO_Access_8x4: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ jmp $Access_C1_8x4 -+ -+$UnAlign_CO_Access_8x4: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_8x4: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ addl CO,8*SIZE,CO # 8c -+ nop -+ bne $6,$UnAlign_C1_Access_8x4 -+ -+$Align_C1_Access_8x4: -+ VMUL t04,ALPHA,t04 -+ VMUL t05,ALPHA,t05 -+ -+ VST t04,0(C1) -+ VST t05,4*SIZE(C1) -+ jmp $Access_C2_8x4 -+ -+$UnAlign_C1_Access_8x4: -+ VMUL t04,ALPHA,t04 -+ VMUL t05,ALPHA,t05 -+ -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ VST_UL t05, 1*VEC_LEN*SIZE(C1) -+ VST_UH t05, 2*VEC_LEN*SIZE(C1) -+ -+ -+$Access_C2_8x4: -+ and C2, (VEC_LEN*SIZE-1),$6 -+ addl C1,8*SIZE,C1 # 8c -+ nop -+ bne $6,$UnAlign_C2_Access_8x4 -+ -+$Align_C2_Access_8x4: -+ VMUL t08,ALPHA,t08 -+ VMUL t09,ALPHA,t09 -+ -+ VST t08,0(C2) -+ VST t09,4*SIZE(C2) -+ jmp $Access_C3_8x4 -+ -+$UnAlign_C2_Access_8x4: -+ VMUL t08,ALPHA,t08 -+ VMUL t09,ALPHA,t09 -+ -+ VST_UL t08, 0*VEC_LEN*SIZE(C2) -+ VST_UH t08, 1*VEC_LEN*SIZE(C2) -+ -+ VST_UL t09, 1*VEC_LEN*SIZE(C2) -+ VST_UH t09, 2*VEC_LEN*SIZE(C2) -+ -+ -+$Access_C3_8x4: -+ and C3, (VEC_LEN*SIZE-1),$6 -+ addl C2,8*SIZE,C2 # 8c -+ nop -+ bne $6,$UnAlign_C3_Access_8x4 -+ -+$Align_C3_Access_8x4: -+ VMUL t12,ALPHA,t12 -+ VMUL t13,ALPHA,t13 -+ -+ VST t12,0(C3) -+ VST t13,4*SIZE(C3) -+ addl C3,8*SIZE,C3 -+ jmp $TRMMKERNEL_8x4 -+ -+$UnAlign_C3_Access_8x4: -+ VMUL t12,ALPHA,t12 -+ VMUL t13,ALPHA,t13 -+ -+ VST_UL t12, 0*VEC_LEN*SIZE(C3) -+ VST_UH t12, 1*VEC_LEN*SIZE(C3) -+ -+ VST_UL t13, 1*VEC_LEN*SIZE(C3) -+ VST_UH t13, 2*VEC_LEN*SIZE(C3) -+ addl C3,8*SIZE,C3 -+ -+$TRMMKERNEL_8x4: -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 8,TEMP # mr=8 -+#else -+ subl TEMP, 4,TEMP # nr=4 -+#endif -+ -+ sll TEMP, 3 + BASE_SHIFT,KC -+ sll TEMP, 2 + BASE_SHIFT,TEMP -+ -+ addl A, KC, A # move A, B to the end of this panel -+ addl B, TEMP, B -+#endif -+ -+#ifdef LEFT -+ addl KK, 8, KK -+#endif -+#endif -+ -+ -+ -+ .align 5 -+ -+.L16: -+ and MC1,4,MC # nr=4,mr=4---------------------------- -+ sll KC1,2+BASE_SHIFT,SPANA # spana=kc1*mc -+ nop -+ beq MC,.L17 -+ -+ addl A1,SPANA,PREA -+ subl PREA,4*SIZE,PREA # PREA-=MC -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1,B # Set B -+ nop -+#else -+ sll KK, 2 + BASE_SHIFT,KC # mr=nr=4 -+ nop -+ -+ addl A, KC, A -+ addl B1,KC, B -+#endif -+ -+ vcpys $f31,$f31,t00 # clear 16 register -+ vcpys $f31,$f31,t04 -+ vcpys $f31,$f31,t08 -+ vcpys $f31,$f31,t12 -+ -+ LDDE b0,0(B) # get 4b -+ LDDE b1,1*SIZE(B) -+ LDDE b2,2*SIZE(B) -+ LDDE b3,3*SIZE(B) -+ -+ VLD a0,0(A) # get 4a -+ -+ fillcs 0(CO) # prefetch C -+ fillcs 0(C1) -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#else -+ addl KK, 4, TEMP -+#endif -+ sra TEMP,1,KC -+ nop -+ beq KC,$Rest_4x4x1 -+ -+#else -+ mov B1,B # Reset B -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ vcpys $f31,$f31,t00 # clear 16 register -+ vcpys $f31,$f31,t04 -+ vcpys $f31,$f31,t08 -+ vcpys $f31,$f31,t12 -+ -+ LDDE b0,0(B) # get 4b -+ LDDE b1,1*SIZE(B) -+ LDDE b2,2*SIZE(B) -+ LDDE b3,3*SIZE(B) -+ -+ VLD a0,0(A) # get 4a -+ -+ fillcs 0(CO) # prefetch C -+ fillcs 0(C1) -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+ beq KC,$Rest_4x4x1 -+ -+#endif -+ -+ -+$Panel_4x4x2: -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ VMAD a0,b2,t08,t08 -+ VMAD a0,b3,t12,t12 -+ -+ VLD a4,4*SIZE(A) -+ LDDE nb0,4*SIZE(B) # get next 4b and 4a -+ LDDE nb1,5*SIZE(B) -+ LDDE nb2,6*SIZE(B) -+ LDDE nb3,7*SIZE(B) -+ addl B,8*SIZE,B # 4b*2k -+ -+ fillcs 0(PREA) -+ subl PREA,4*SIZE,PREA -+ -+ subl KC,1,KC -+ VMAD a4,nb0,t00,t00 -+ VMAD a4,nb1,t04,t04 -+ VMAD a4,nb2,t08,t08 -+ VMAD a4,nb3,t12,t12 -+ -+ addl A,8*SIZE,A # 4a*2k -+ LDDE b0,0(B) # get 3rd 4b and 4a -+ LDDE b1,1*SIZE(B) -+ LDDE b2,2*SIZE(B) -+ LDDE b3,3*SIZE(B) -+ VLD a0,0(A) -+ -+ fillcs 0(PREA) -+ subl PREA,4*SIZE,PREA -+ bne KC,$Panel_4x4x2 -+ -+ -+$Rest_4x4x1: -+ LDDE ALPHA, 192($sp) # Get ALPHA -+#ifndef TRMMKERNEL -+ blbc KC1, $Write_4x4 -+#else -+ blbc TEMP, $Write_4x4 -+#endif -+ -+ addl A,4*SIZE,A # 4a*1k -+ addl B,4*SIZE,B # 4b*1K -+ -+ fillcs 0(PREA) -+ subl PREA,4*SIZE,PREA -+ -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ VMAD a0,b2,t08,t08 -+ VMAD a0,b3,t12,t12 -+ -+ -+$Write_4x4: -+ -+#ifndef TRMMKERNEL -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_4x4 -+ -+$Align_CO_Access_4x4: -+ VLD c00,0(CO) # get 1st colum of 16c -+ VMAD t00,ALPHA,c00,t00 -+ VST t00,0(CO) -+ jmp $Access_C1_4x4 -+ -+$UnAlign_CO_Access_4x4: -+ VLD_UL c00, 0*VEC_LEN*SIZE(CO) -+ VLD_UH c02, 1*VEC_LEN*SIZE(CO) -+ -+ vbisw c00,c02,c00 -+ -+ VMAD t00,ALPHA,c00,t00 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_4x4: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ addl CO,4*SIZE,CO # 4c -+ nop -+ bne $6,$UnAlign_C1_Access_4x4 -+ -+$Align_C1_Access_4x4: -+ VLD c04,0(C1) -+ VMAD t04,ALPHA,c04,t04 -+ VST t04,0(C1) -+ jmp $Access_C2_4x4 -+ -+$UnAlign_C1_Access_4x4: -+ VLD_UL c04, 0*VEC_LEN*SIZE(C1) -+ VLD_UH c06, 1*VEC_LEN*SIZE(C1) -+ -+ vbisw c04,c06,c04 -+ -+ VMAD t04,ALPHA,c04,t04 -+ -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ -+$Access_C2_4x4: -+ and C2, (VEC_LEN*SIZE-1),$6 -+ addl C1,4*SIZE,C1 # 4c -+ nop -+ bne $6,$UnAlign_C2_Access_4x4 -+ -+$Align_C2_Access_4x4: -+ VLD c08,0(C2) -+ VMAD t08,ALPHA,c08,t08 -+ VST t08,0(C2) -+ jmp $Access_C3_4x4 -+ -+$UnAlign_C2_Access_4x4: -+ VLD_UL c08, 0*VEC_LEN*SIZE(C2) -+ VLD_UH c10, 1*VEC_LEN*SIZE(C2) -+ -+ vbisw c08,c10,c08 -+ -+ VMAD t08,ALPHA,c08,t08 -+ -+ VST_UL t08, 0*VEC_LEN*SIZE(C2) -+ VST_UH t08, 1*VEC_LEN*SIZE(C2) -+ -+ -+$Access_C3_4x4: -+ and C3, (VEC_LEN*SIZE-1),$6 -+ addl C2,4*SIZE,C2 # 4c -+ nop -+ bne $6,$UnAlign_C3_Access_4x4 -+ -+$Align_C3_Access_4x4: -+ VLD c12,0(C3) -+ VMAD t12,ALPHA,c12,t12 -+ VST t12,0(C3) -+ addl C3,4*SIZE,C3 -+ jmp .L17 -+ -+$UnAlign_C3_Access_4x4: -+ VLD_UL c12, 0*VEC_LEN*SIZE(C3) -+ VLD_UH c14, 1*VEC_LEN*SIZE(C3) -+ -+ vbisw c12,c14,c12 -+ -+ VMAD t12,ALPHA,c12,t12 -+ -+ VST_UL t12, 0*VEC_LEN*SIZE(C3) -+ VST_UH t12, 1*VEC_LEN*SIZE(C3) -+ addl C3,4*SIZE,C3 -+ -+ -+#else -+ -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_4x4 -+ -+$Align_CO_Access_4x4: -+ VMUL t00,ALPHA,t00 -+ VST t00,0(CO) -+ jmp $Access_C1_4x4 -+ -+$UnAlign_CO_Access_4x4: -+ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_4x4: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ addl CO,4*SIZE,CO # 4c -+ nop -+ bne $6,$UnAlign_C1_Access_4x4 -+ -+$Align_C1_Access_4x4: -+ VMUL t04,ALPHA,t04 -+ VST t04,0(C1) -+ jmp $Access_C2_4x4 -+ -+$UnAlign_C1_Access_4x4: -+ VMUL t04,ALPHA,t04 -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ -+$Access_C2_4x4: -+ and C2, (VEC_LEN*SIZE-1),$6 -+ addl C1,4*SIZE,C1 # 4c -+ nop -+ bne $6,$UnAlign_C2_Access_4x4 -+ -+$Align_C2_Access_4x4: -+ VMUL t08,ALPHA,t08 -+ VST t08,0(C2) -+ jmp $Access_C3_4x4 -+ -+$UnAlign_C2_Access_4x4: -+ VMUL t08,ALPHA,t08 -+ VST_UL t08, 0*VEC_LEN*SIZE(C2) -+ VST_UH t08, 1*VEC_LEN*SIZE(C2) -+ -+ -+$Access_C3_4x4: -+ and C3, (VEC_LEN*SIZE-1),$6 -+ addl C2,4*SIZE,C2 # 4c -+ nop -+ bne $6,$UnAlign_C3_Access_4x4 -+ -+$Align_C3_Access_4x4: -+ VMUL t12,ALPHA,t12 -+ VST t12,0(C3) -+ addl C3,4*SIZE,C3 -+ jmp $TRMMKERNEL_4x4 -+ -+$UnAlign_C3_Access_4x4: -+ VMUL t12,ALPHA,t12 -+ VST_UL t12, 0*VEC_LEN*SIZE(C3) -+ VST_UH t12, 1*VEC_LEN*SIZE(C3) -+ addl C3,4*SIZE,C3 -+ -+$TRMMKERNEL_4x4: -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+ subl TEMP, 4, TEMP # mr=nr=4 -+ -+ sll TEMP, 2 + BASE_SHIFT,KC -+ nop -+ -+ addl A, KC, A # move A B to the end of this panel -+ addl B, KC, B -+#endif -+ -+#ifdef LEFT -+ addl KK, 4, KK -+#endif -+#endif -+ -+ -+ -+ -+ .align 5 -+.L17: # nr=4,mr=2-------------------- -+ and MC1,2,MC -+ beq MC,.L18 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA))\ -+ || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+#else -+ sll KK, 1 + BASE_SHIFT, KC # mr=2 -+ sll KK, 2 + BASE_SHIFT, TEMP # nr=4 -+ -+ addl A, KC, A -+ addl B1,TEMP, B -+#endif -+ -+ fclr t00 # CLEAR 8 register -+ fclr t01 -+ fclr t04 -+ fclr t05 -+ fclr t08 -+ fclr t09 -+ fclr t12 -+ fclr t13 -+ -+ LD b0,0(B) # get 4b -+ LD b1,1*SIZE(B) -+ LD a0,0(A) # get 2a -+ LD b2,2*SIZE(B) -+ LD b3,3*SIZE(B) -+ LD a4,1*SIZE(A) -+ -+ fillcs 0(CO) # prefetch C -+ fillcs 0(C1) -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 2, TEMP # mr=2 -+#else -+ addl KK, 4, TEMP # nr=4 -+#endif -+ sra TEMP, 1, KC -+ beq KC,$Rest_2x4x1 -+ -+#else -+ mov B1,B # reset B -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ fclr t00 # CLEAR 8 register -+ fclr t01 -+ fclr t04 -+ fclr t05 -+ fclr t08 -+ fclr t09 -+ fclr t12 -+ fclr t13 -+ -+ LD b0,0(B) # get 4b -+ LD b1,1*SIZE(B) -+ LD a0,0(A) # get 2a -+ LD b2,2*SIZE(B) -+ LD b3,3*SIZE(B) -+ LD a4,1*SIZE(A) -+ -+ fillcs 0(CO) # prefetch C -+ fillcs 0(C1) -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+ beq KC,$Rest_2x4x1 -+#endif -+ -+ -+$Panel_2x4x2: -+ MAD a0,b0,t00,t00 -+ MAD a0,b1,t04,t04 -+ MAD a0,b2,t08,t08 -+ MAD a0,b3,t12,t12 -+ -+ LD nb0,4*SIZE(B) # get next 4b and 2a -+ LD nb1,5*SIZE(B) -+ LD a8,2*SIZE(A) -+ LD nb2,6*SIZE(B) -+ LD nb3,7*SIZE(B) -+ LD a12,3*SIZE(A) -+ addl B,8*SIZE,B # 4b*2k -+ -+ MAD a4,b0,t01,t01 -+ MAD a4,b1,t05,t05 -+ MAD a4,b2,t09,t09 -+ MAD a4,b3,t13,t13 -+ -+ subl KC,1,KC -+ MAD a8,nb0,t00,t00 -+ MAD a8,nb1,t04,t04 -+ MAD a8,nb2,t08,t08 -+ MAD a8,nb3,t12,t12 -+ -+ addl A,4*SIZE,A # 2a*2k -+ LD b0,0(B) # get 3rd 4b and 2a -+ LD b1,1*SIZE(B) -+ LD a0,0(A) -+ LD b2,2*SIZE(B) -+ LD b3,3*SIZE(B) -+ LD a4,1*SIZE(A) -+ -+ MAD a12,nb0,t01,t01 -+ MAD a12,nb1,t05,t05 -+ MAD a12,nb2,t09,t09 -+ MAD a12,nb3,t13,t13 -+ -+ bne KC,$Panel_2x4x2 -+ -+ -+$Rest_2x4x1: -+ LD ALPHA, 192($sp) # get alpha -+#ifndef TRMMKERNEL -+ blbc KC1, $Write_2x4 -+#else -+ blbc TEMP, $Write_2x4 -+#endif -+ -+ addl A,2*SIZE,A # 2a*1k -+ addl B,4*SIZE,B # 4b*1K -+ -+ MAD a0,b0,t00,t00 -+ MAD a0,b1,t04,t04 -+ MAD a0,b2,t08,t08 -+ MAD a0,b3,t12,t12 -+ -+ MAD a4,b0,t01,t01 -+ MAD a4,b1,t05,t05 -+ MAD a4,b2,t09,t09 -+ MAD a4,b3,t13,t13 -+ -+$Write_2x4: -+#ifndef TRMMKERNEL -+ LD c00,0(CO) -+ LD c01,1*SIZE(CO) -+ LD c04,0(C1) -+ LD c05,1*SIZE(C1) -+ -+ MAD t00,ALPHA,c00,t00 -+ MAD t01,ALPHA,c01,t01 -+ -+ LD c08,0(C2) -+ LD c09,1*SIZE(C2) -+ -+ MAD t04,ALPHA,c04,t04 -+ MAD t05,ALPHA,c05,t05 -+ -+ LD c12,0(C3) -+ LD c13,1*SIZE(C3) -+ -+ MAD t08,ALPHA,c08,t08 -+ MAD t09,ALPHA,c09,t09 -+ -+ addl CO,2*SIZE,CO # 2c -+ addl C1,2*SIZE,C1 -+ addl C2,2*SIZE,C2 -+ addl C3,2*SIZE,C3 -+ -+ ST t00,-2*SIZE(CO) # 2c -+ ST t01,-1*SIZE(CO) -+ -+ MAD t12,ALPHA,c12,t12 -+ MAD t13,ALPHA,c13,t13 -+ -+ ST t04,-2*SIZE(C1) -+ ST t05,-1*SIZE(C1) -+ -+ ST t08,-2*SIZE(C2) -+ ST t09,-1*SIZE(C2) -+ -+ ST t12,-2*SIZE(C3) -+ ST t13,-1*SIZE(C3) -+ -+#else -+ MUL t00,ALPHA,t00 -+ MUL t01,ALPHA,t01 -+ -+ MUL t04,ALPHA,t04 -+ MUL t05,ALPHA,t05 -+ -+ MUL t08,ALPHA,t08 -+ MUL t09,ALPHA,t09 -+ -+ addl CO,2*SIZE,CO # 2c -+ addl C1,2*SIZE,C1 -+ addl C2,2*SIZE,C2 -+ addl C3,2*SIZE,C3 -+ -+ ST t00,-2*SIZE(CO) # 2c -+ ST t01,-1*SIZE(CO) -+ -+ MUL t12,ALPHA,t12 -+ MUL t13,ALPHA,t13 -+ -+ ST t04,-2*SIZE(C1) -+ ST t05,-1*SIZE(C1) -+ -+ ST t08,-2*SIZE(C2) -+ ST t09,-1*SIZE(C2) -+ -+ ST t12,-2*SIZE(C3) -+ ST t13,-1*SIZE(C3) -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 2, TEMP -+#else -+ subl TEMP, 4, TEMP -+#endif -+ -+ sll TEMP, 1 + BASE_SHIFT,KC -+ sll TEMP, 2 + BASE_SHIFT,TEMP -+ -+ addl A, KC, A -+ addl B, TEMP, B -+#endif -+ -+#ifdef LEFT -+ addl KK,2,KK -+#endif -+#endif -+ -+ -+ -+.align 5 -+.L18: # nr=4,mr=1--------------------------- -+ and MC1,1,MC -+ beq MC,$End_NC_Unroll4 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+ nop -+#else -+ sll KK, BASE_SHIFT, KC # mr=1 -+ sll KK, 2 + BASE_SHIFT,TEMP # nr=4 -+ -+ addl A, KC, A -+ addl B1,TEMP, B -+#endif -+ -+ fclr t00 # clear 4 regitster -+ fclr t04 -+ fclr t08 -+ fclr t12 -+ -+ LD b0,0(B) # get 4b -+ LD b1,1*SIZE(B) -+ LD b2,2*SIZE(B) -+ LD b3,3*SIZE(B) -+ -+ LD a0,0(A) # get 1 a -+ -+ fillcs 0(CO) # prefetch C -+ fillcs 0(C1) -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 1, TEMP # mr=1 -+#else -+ addl KK, 4,TEMP # nr=4 -+#endif -+ sra TEMP,1,KC -+ beq KC,$Rest_1x4x1 -+ -+#else -+ mov B1,B # Reset B -+ fclr t00 # clear 4 regitster -+ fclr t04 -+ fclr t08 -+ fclr t12 -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ -+ LD b0,0(B) # get 4b -+ LD b1,1*SIZE(B) -+ LD b2,2*SIZE(B) -+ LD b3,3*SIZE(B) -+ -+ LD a0,0(A) # get 1 a -+ -+ fillcs 0(CO) # prefetch C -+ fillcs 0(C1) -+ fillcs 0(C2) -+ fillcs 0(C3) -+ -+ beq KC,$Rest_1x4x1 -+ -+#endif -+ -+ -+$Panel_1x4x2: -+ MAD a0,b0,t00,t00 -+ MAD a0,b1,t04,t04 -+ MAD a0,b2,t08,t08 -+ MAD a0,b3,t12,t12 -+ -+ LD a8,1*SIZE(A) -+ LD nb0,4*SIZE(B) -+ LD nb1,5*SIZE(B) -+ LD nb2,6*SIZE(B) -+ LD nb3,7*SIZE(B) -+ -+ addl B,8*SIZE,B # 4b*2k -+ -+ subl KC,1,KC -+ MAD a8,nb0,t00,t00 -+ MAD a8,nb1,t04,t04 -+ MAD a8,nb2,t08,t08 -+ MAD a8,nb3,t12,t12 -+ -+ addl A,2*SIZE,A # 1a*2k -+ LD a0,0(A) # get 3rd 4b and 1a -+ LD b0,0(B) -+ LD b1,1*SIZE(B) -+ LD b2,2*SIZE(B) -+ LD b3,3*SIZE(B) -+ bne KC,$Panel_1x4x2 -+ -+ -+$Rest_1x4x1: -+ LD ALPHA,192($sp) # get alpha -+#ifndef TRMMKERNEL -+ blbc KC1, $Write_1x4 -+#else -+ blbc TEMP, $Write_1x4 -+#endif -+ -+ addl A,1*SIZE,A # 1m*1k*8Byte -+ addl B,4*SIZE,B # 4n*1K*8Byte -+ -+ MAD a0,b0,t00,t00 -+ MAD a0,b1,t04,t04 -+ MAD a0,b2,t08,t08 -+ MAD a0,b3,t12,t12 -+ -+ -+$Write_1x4: -+#ifndef TRMMKERNEL -+ LD c00,0(CO) -+ LD c04,0(C1) -+ MAD t00,ALPHA,c00,t00 -+ MAD t04,ALPHA,c04,t04 -+ LD c08,0(C2) -+ LD c12,0(C3) -+ MAD t08,ALPHA,c08,t08 -+ MAD t12,ALPHA,c12,t12 -+ ST t00,0(CO) -+ ST t04,0(C1) -+ ST t08,0(C2) -+ ST t12,0(C3) -+ -+#else -+ MUL t00,ALPHA,t00 -+ MUL t04,ALPHA,t04 -+ MUL t08,ALPHA,t08 -+ MUL t12,ALPHA,t12 -+ -+ ST t00,0(CO) -+ ST t04,0(C1) -+ ST t08,0(C2) -+ ST t12,0(C3) -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 1, TEMP -+#else -+ subl TEMP, 4, TEMP -+#endif -+ -+ sll TEMP, BASE_SHIFT, KC -+ sll TEMP, 2 + BASE_SHIFT, TEMP -+ -+ addl A, KC, A -+ addl B, TEMP,B -+#endif -+ -+#ifdef LEFT -+ addl KK, 1,KK -+#endif -+#endif -+ -+ -+ .align 5 -+ -+$End_NC_Unroll4: -+ subl NC,1,NC # Loop N -- -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addl KK, 4, KK -+ nop -+#endif -+ mov A1,A # Reset A -+ mov B, B1 # mov B1 to the next panel -+ bne NC,.L0 -+ -+ -+ -+ -+ .align 5 -+$Begin_NC_Unroll2: -+ -+ and NC1, 2, NC -+ beq NC, $Begin_NC_Unroll1 -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK # reset KK -+#endif -+ -+ mov C,CO -+ addl C,LDM,C1 -+ -+ sra MC1,4,MC # MC=MC1/16 -+ sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC -+ -+ addl A1,SPANA,PREA -+ subl PREA,16*SIZE,PREA -+ -+ addl C1,LDM,C # C=C1+LDM, Mov C to Next Panel -+ beq MC,.L25 # MC=0:MC1<16 -+ -+ -+ .align 5 -+.L2: # nr=2,mr=16------------------- -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA))\ -+ || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1,B -+#else -+ sll KK, 4 + BASE_SHIFT,KC # mr=16 -+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2 -+ -+ addl A,KC,A -+ addl B1,TEMP,B -+#endif -+ -+ vcpys $f31,$f31,t00 # CLEAR Results Register -+ vcpys $f31,$f31,t01 -+ vcpys $f31,$f31,t02 -+ vcpys $f31,$f31,t03 -+ -+ LDDE b0,0(B) -+ LDDE b1,1*SIZE(B) -+ -+ VLD a0,0(A) # Get 16 A and 2 B -+ VLD a4,4*SIZE(A) -+ VLD a8,8*SIZE(A) -+ VLD a12,12*SIZE(A) -+ -+ vcpys $f31,$f31,t04 -+ vcpys $f31,$f31,t06 -+ vcpys $f31,$f31,t05 -+ vcpys $f31,$f31,t07 -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ fillcs 8*SIZE(CO) -+ fillcs 8*SIZE(C1) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 16, TEMP # mr=16 -+#else -+ addl KK, 2, TEMP # nr=2 -+#endif -+ sra TEMP, 1, KC -+ nop -+ beq KC,$Rest_16x2x1 -+ -+#else -+ -+ mov B1,B # Set B -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ vcpys $f31,$f31,t00 # CLEAR Results Register -+ vcpys $f31,$f31,t01 -+ vcpys $f31,$f31,t02 -+ vcpys $f31,$f31,t03 -+ -+ LDDE b0,0(B) -+ LDDE b1,1*SIZE(B) -+ -+ VLD a0,0(A) # Get 16 A and 2 B -+ VLD a4,4*SIZE(A) -+ VLD a8,8*SIZE(A) -+ VLD a12,12*SIZE(A) -+ -+ vcpys $f31,$f31,t04 -+ vcpys $f31,$f31,t06 -+ vcpys $f31,$f31,t05 -+ vcpys $f31,$f31,t07 -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ fillcs 8*SIZE(CO) -+ fillcs 8*SIZE(C1) -+ -+ beq KC,$Rest_16x2x1 -+ -+#endif -+ -+ -+$Panel_16x2x2: -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ -+ addl A,16*SIZE,A # 16m*1k -+ LDDE nb0,2*SIZE(B) -+ LDDE nb1,3*SIZE(B) -+ -+ VMAD a4,b0,t01,t01 -+ VMAD a4,b1,t05,t05 -+ -+ addl B,4*SIZE,B # 2n*2k -+ VLD na0,0(A) -+ VLD na4,4*SIZE(A) -+ VLD na8,8*SIZE(A) -+ VLD na12,12*SIZE(A) -+ -+ VMAD a8,b0,t02,t02 -+ VMAD a8,b1,t06,t06 -+ -+ VMAD a12,b0,t03,t03 -+ VMAD a12,b1,t07,t07 -+ -+ fillcs 0(PREA) -+ fillcs 8*SIZE(PREA) -+ subl PREA,16*SIZE,PREA -+ -+ subl KC,1,KC -+ VMAD na0,nb0,t00,t00 -+ VMAD na0,nb1,t04,t04 -+ -+ addl A,16*SIZE,A # 16m*1k -+ LDDE b0,0(B) -+ LDDE b1,1*SIZE(B) -+ -+ VMAD na4,nb0,t01,t01 -+ VMAD na4,nb1,t05,t05 -+ -+ VLD a0,0(A) # get 3rd 16a -+ VLD a4,4*SIZE(A) -+ VLD a8,8*SIZE(A) -+ VLD a12,12*SIZE(A) -+ -+ VMAD na8,nb0,t02,t02 -+ VMAD na8,nb1,t06,t06 -+ -+ VMAD na12,nb0,t03,t03 -+ VMAD na12,nb1,t07,t07 -+ -+ fillcs 0(PREA) -+ fillcs 8*SIZE(PREA) -+ subl PREA,16*SIZE,PREA -+ bne KC,$Panel_16x2x2 -+ -+ -+$Rest_16x2x1: -+ LDDE ALPHA, 192($sp) # get alpha -+#ifndef TRMMKERNEL -+ blbc KC1, $Write_16x2 -+#else -+ blbc TEMP, $Write_16x2 -+#endif -+ -+ addl A,16*SIZE,A # 16m*1k -+ addl B,2*SIZE,B # 2n*1k -+ -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ -+ fillcs 0(PREA) -+ fillcs 8*SIZE(PREA) -+ subl PREA,16*SIZE,PREA -+ -+ VMAD a4,b0,t01,t01 -+ VMAD a4,b1,t05,t05 -+ VMAD a8,b0,t02,t02 -+ VMAD a8,b1,t06,t06 -+ VMAD a12,b0,t03,t03 -+ VMAD a12,b1,t07,t07 -+ -+ -+$Write_16x2: -+ -+#ifndef TRMMKERNEL -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_16x2 -+ -+$Align_CO_Access_16x2: -+ VLD c00,0(CO) # get 1st colum of 16c -+ VLD c01,4*SIZE(CO) -+ VLD c02,8*SIZE(CO) -+ VLD c03,12*SIZE(CO) -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ VMAD t02,ALPHA,c02,t02 -+ VMAD t03,ALPHA,c03,t03 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ VST t02,8*SIZE(CO) -+ VST t03,12*SIZE(CO) -+ jmp $Access_C1_16x2 -+ -+$UnAlign_CO_Access_16x2: -+ VLD_UL c00, 0*VEC_LEN*SIZE(CO) -+ VLD_UH c04, 1*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c01, 1*VEC_LEN*SIZE(CO) -+ VLD_UH c05, 2*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c02, 2*VEC_LEN*SIZE(CO) -+ VLD_UH c06, 3*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c03, 3*VEC_LEN*SIZE(CO) -+ VLD_UH c07, 4*VEC_LEN*SIZE(CO) -+ -+ vbisw c00,c04,c00 -+ vbisw c01,c05,c01 -+ vbisw c02,c06,c02 -+ vbisw c03,c07,c03 -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ VMAD t02,ALPHA,c02,t02 -+ VMAD t03,ALPHA,c03,t03 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ VST_UL t02, 2*VEC_LEN*SIZE(CO) -+ VST_UH t02, 3*VEC_LEN*SIZE(CO) -+ -+ VST_UL t03, 3*VEC_LEN*SIZE(CO) -+ VST_UH t03, 4*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_16x2: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_C1_Access_16x2 -+ -+$Align_C1_Access_16x2: -+ VLD c04,0(C1) -+ VLD c05,4*SIZE(C1) -+ VLD c06,8*SIZE(C1) -+ VLD c07,12*SIZE(C1) -+ -+ VMAD t04,ALPHA,c04,t04 -+ VMAD t05,ALPHA,c05,t05 -+ VMAD t06,ALPHA,c06,t06 -+ VMAD t07,ALPHA,c07,t07 -+ -+ VST t04,0(C1) -+ VST t05,4*SIZE(C1) -+ VST t06,8*SIZE(C1) -+ VST t07,12*SIZE(C1) -+ jmp $End_NC_Unroll2 -+ -+$UnAlign_C1_Access_16x2: -+ VLD_UL c04, 0*VEC_LEN*SIZE(C1) -+ VLD_UH t00, 1*VEC_LEN*SIZE(C1) -+ -+ VLD_UL c05, 1*VEC_LEN*SIZE(C1) -+ VLD_UH t01, 2*VEC_LEN*SIZE(C1) -+ -+ VLD_UL c06, 2*VEC_LEN*SIZE(C1) -+ VLD_UH t02, 3*VEC_LEN*SIZE(C1) -+ -+ VLD_UL c07, 3*VEC_LEN*SIZE(C1) -+ VLD_UH t03, 4*VEC_LEN*SIZE(C1) -+ -+ vbisw c04,t00,c04 -+ vbisw c05,t01,c05 -+ vbisw c06,t02,c06 -+ vbisw c07,t03,c07 -+ -+ VMAD t04,ALPHA,c04,t04 -+ VMAD t05,ALPHA,c05,t05 -+ VMAD t06,ALPHA,c06,t06 -+ VMAD t07,ALPHA,c07,t07 -+ -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ VST_UL t05, 1*VEC_LEN*SIZE(C1) -+ VST_UH t05, 2*VEC_LEN*SIZE(C1) -+ -+ VST_UL t06, 2*VEC_LEN*SIZE(C1) -+ VST_UH t06, 3*VEC_LEN*SIZE(C1) -+ -+ VST_UL t07, 3*VEC_LEN*SIZE(C1) -+ VST_UH t07, 4*VEC_LEN*SIZE(C1) -+ jmp $End_NC_Unroll2 # loop m finished -+ -+ -+#else -+ -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_16x2 -+ -+$Align_CO_Access_16x2: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ VMUL t02,ALPHA,t02 -+ VMUL t03,ALPHA,t03 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ VST t02,8*SIZE(CO) -+ VST t03,12*SIZE(CO) -+ jmp $Access_C1_16x2 -+ -+$UnAlign_CO_Access_16x2: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ VMUL t02,ALPHA,t02 -+ VMUL t03,ALPHA,t03 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ VST_UL t02, 2*VEC_LEN*SIZE(CO) -+ VST_UH t02, 3*VEC_LEN*SIZE(CO) -+ -+ VST_UL t03, 3*VEC_LEN*SIZE(CO) -+ VST_UH t03, 4*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_16x2: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_C1_Access_16x2 -+ -+$Align_C1_Access_16x2: -+ VMUL t04,ALPHA,t04 -+ VMUL t05,ALPHA,t05 -+ VMUL t06,ALPHA,t06 -+ VMUL t07,ALPHA,t07 -+ -+ VST t04,0(C1) -+ VST t05,4*SIZE(C1) -+ VST t06,8*SIZE(C1) -+ VST t07,12*SIZE(C1) -+ jmp $TRMMKERNEL_16x2 -+ -+$UnAlign_C1_Access_16x2: -+ VMUL t04,ALPHA,t04 -+ VMUL t05,ALPHA,t05 -+ VMUL t06,ALPHA,t06 -+ VMUL t07,ALPHA,t07 -+ -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ VST_UL t05, 1*VEC_LEN*SIZE(C1) -+ VST_UH t05, 2*VEC_LEN*SIZE(C1) -+ -+ VST_UL t06, 2*VEC_LEN*SIZE(C1) -+ VST_UH t06, 3*VEC_LEN*SIZE(C1) -+ -+ VST_UL t07, 3*VEC_LEN*SIZE(C1) -+ VST_UH t07, 4*VEC_LEN*SIZE(C1) -+ -+$TRMMKERNEL_16x2: -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 16, TEMP -+#else -+ subl TEMP, 2, TEMP -+#endif -+ -+ sll TEMP, 4 + BASE_SHIFT,KC -+ sll TEMP, 1 + BASE_SHIFT,TEMP -+ -+ addl A, KC, A -+ addl B, TEMP,B -+#endif -+ -+#ifdef LEFT -+ addl KK, 16, KK -+ nop -+#endif -+ -+ jmp $End_NC_Unroll2 # loop m finished -+#endif -+ -+ -+ -+ .align 5 -+ -+.L25: -+ and MC1,8,MC -+ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc -+ nop -+ beq MC,.L26 -+ -+ addl A1,SPANA,PREA -+ subl PREA,8*SIZE,PREA # PREA-=MC -+ -+ -+ .align 5 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA))\ -+ || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+#else -+ sll KK, 3 + BASE_SHIFT,KC # mr=8 -+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2 -+ -+ addl A,KC, A -+ addl B1,TEMP,B -+#endif -+ -+ vcpys $f31,$f31,t00 # clear 16 registers -+ vcpys $f31,$f31,t01 -+ -+ LDDE b0,0(B) # Get 2b -+ LDDE b1,1*SIZE(B) -+ -+ vcpys $f31,$f31,t04 -+ vcpys $f31,$f31,t05 -+ -+ VLD a0,0(A) # Get 8a -+ VLD a4,4*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ fillcs 4*SIZE(CO) -+ fillcs 4*SIZE(C1) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 8, TEMP # mr=8 -+#else -+ addl KK, 2, TEMP # nr=2 -+#endif -+ sra TEMP, 1,KC -+ nop -+ beq KC,$Rest_8x2x1 -+ -+#else -+ -+ mov B1, B -+ sra KC1,1,KC -+ vcpys $f31,$f31,t00 # clear 16 registers -+ vcpys $f31,$f31,t01 -+ -+ LDDE b0,0(B) # Get 2b -+ LDDE b1,1*SIZE(B) -+ -+ vcpys $f31,$f31,t04 -+ vcpys $f31,$f31,t05 -+ -+ VLD a0,0(A) # Get 8a -+ VLD a4,4*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ fillcs 4*SIZE(CO) -+ fillcs 4*SIZE(C1) -+ -+ beq KC,$Rest_8x2x1 -+#endif -+ -+ -+$Panel_8x2x2: -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ -+ LDDE nb0,2*SIZE(B) # get next 2b -+ LDDE nb1,3*SIZE(B) -+ -+ VMAD a4,b0,t01,t01 -+ VMAD a4,b1,t05,t05 -+ -+ addl B,4*SIZE,B # 2n*2k -+ VLD na8,8*SIZE(A) # get next 8a -+ VLD na12,12*SIZE(A) -+ -+ fillcs 0(PREA) -+ fillcs 4*SIZE(PREA) -+ subl PREA,8*SIZE,PREA -+ -+ subl KC,1,KC -+ VMAD na8,nb0,t00,t00 -+ VMAD na8,nb1,t04,t04 -+ -+ addl A,16*SIZE,A # 8m*2k -+ LDDE b0,0(B) -+ LDDE b1,1*SIZE(B) # get 3rd 2b -+ -+ VMAD na12,nb0,t01,t01 -+ VMAD na12,nb1,t05,t05 -+ -+ VLD a0,0(A) # get 3rd 8a -+ VLD a4,4*SIZE(A) -+ -+ fillcs 0(PREA) -+ fillcs 4*SIZE(PREA) -+ subl PREA,8*SIZE,PREA -+ bne KC,$Panel_8x2x2 -+ -+ -+$Rest_8x2x1: -+ LDDE ALPHA,192($sp) # get alpha -+#ifndef TRMMKERNEL -+ blbc KC1,$Write_8x2 -+#else -+ blbc TEMP,$Write_8x2 -+#endif -+ -+ addl A,8*SIZE,A # 8m*1k -+ addl B,2*SIZE,B # 2n*1K -+ -+ fillcs 0(PREA) -+ fillcs 4*SIZE(PREA) -+ subl PREA,8*SIZE,PREA -+ -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ VMAD a4,b0,t01,t01 -+ VMAD a4,b1,t05,t05 -+ -+ -+$Write_8x2: -+ -+#ifndef TRMMKERNEL -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_8x2 -+ -+$Align_CO_Access_8x2: -+ VLD c00,0(CO) # get 1st colum of 16c -+ VLD c01,4*SIZE(CO) -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ jmp $Access_C1_8x2 -+ -+$UnAlign_CO_Access_8x2: -+ VLD_UL c00, 0*VEC_LEN*SIZE(CO) -+ VLD_UH c02, 1*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c01, 1*VEC_LEN*SIZE(CO) -+ VLD_UH c03, 2*VEC_LEN*SIZE(CO) -+ -+ vbisw c00,c02,c00 -+ vbisw c01,c03,c01 -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_8x2: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ addl CO,8*SIZE,CO # 8c -+ nop -+ bne $6,$UnAlign_C1_Access_8x2 -+ -+$Align_C1_Access_8x2: -+ VLD c04,0(C1) -+ VLD c05,4*SIZE(C1) -+ -+ VMAD t04,ALPHA,c04,t04 -+ VMAD t05,ALPHA,c05,t05 -+ -+ VST t04,0(C1) -+ VST t05,4*SIZE(C1) -+ addl C1,8*SIZE,C1 -+ jmp .L26 -+ -+$UnAlign_C1_Access_8x2: -+ VLD_UL c04, 0*VEC_LEN*SIZE(C1) -+ VLD_UH c06, 1*VEC_LEN*SIZE(C1) -+ -+ VLD_UL c05, 1*VEC_LEN*SIZE(C1) -+ VLD_UH c07, 2*VEC_LEN*SIZE(C1) -+ -+ vbisw c04,c06,c04 -+ vbisw c05,c07,c05 -+ -+ VMAD t04,ALPHA,c04,t04 -+ VMAD t05,ALPHA,c05,t05 -+ -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ VST_UL t05, 1*VEC_LEN*SIZE(C1) -+ VST_UH t05, 2*VEC_LEN*SIZE(C1) -+ addl C1,8*SIZE,C1 -+ -+#else -+ -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_8x2 -+ -+$Align_CO_Access_8x2: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ jmp $Access_C1_8x2 -+ -+$UnAlign_CO_Access_8x2: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_8x2: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ addl CO,8*SIZE,CO # 8c -+ nop -+ bne $6,$UnAlign_C1_Access_8x2 -+ -+$Align_C1_Access_8x2: -+ VMUL t04,ALPHA,t04 -+ VMUL t05,ALPHA,t05 -+ -+ VST t04,0(C1) -+ VST t05,4*SIZE(C1) -+ addl C1,8*SIZE,C1 -+ jmp $TRMMKERNEL_8x2 -+ -+$UnAlign_C1_Access_8x2: -+ VMUL t04,ALPHA,t04 -+ VMUL t05,ALPHA,t05 -+ -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ -+ VST_UL t05, 1*VEC_LEN*SIZE(C1) -+ VST_UH t05, 2*VEC_LEN*SIZE(C1) -+ addl C1,8*SIZE,C1 -+ -+$TRMMKERNEL_8x2: -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK,TEMP -+#ifdef LEFT -+ subl TEMP, 8,TEMP # mr=8 -+#else -+ subl TEMP, 2,TEMP # nr=2 -+#endif -+ -+ sll TEMP, 3 + BASE_SHIFT,KC -+ sll TEMP, 1 + BASE_SHIFT,TEMP -+ -+ addl A,KC,A -+ addl B,TEMP,B -+#endif -+ -+#ifdef LEFT -+ addl KK,8,KK -+ nop -+#endif -+#endif -+ -+ -+ -+ .align 5 -+ -+.L26: # nr=2,mr=4------------------ -+ and MC1,4,MC # MC1&4 -+ beq MC,.L27 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+ nop -+#else -+ sll KK, 2 + BASE_SHIFT,KC # mr=4 -+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2 -+ -+ addl A,KC,A -+ addl B1,TEMP,B -+#endif -+ -+ vcpys $f31,$f31,t00 # clear 2vector registers -+ vcpys $f31,$f31,t04 -+ -+ LDDE b0,0(B) # get 2b -+ LDDE b1,1*SIZE(B) -+ -+ VLD a0,0(A) # Get 4 a -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 4, TEMP -+#else -+ addl KK, 2, TEMP -+#endif -+ sra TEMP,1,KC -+ beq KC,$Rest_4x2x1 -+ -+#else -+ -+ mov B1,B -+ sra KC1,1,KC -+ vcpys $f31,$f31,t00 # clear 2vector registers -+ vcpys $f31,$f31,t04 -+ -+ LDDE b0,0(B) # get 2b -+ LDDE b1,1*SIZE(B) -+ -+ VLD a0,0(A) # Get 4 a -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ -+ beq KC,$Rest_4x2x1 -+#endif -+ -+$Panel_4x2x2: -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ -+ LDDE nb0,2*SIZE(B) # get next 2b -+ LDDE nb1,3*SIZE(B) -+ -+ addl B,4*SIZE,B # 2n*2K -+ VLD a4,4*SIZE(A) # get next 4a -+ -+ subl KC,1,KC -+ VMAD a4,nb0,t00,t00 -+ VMAD a4,nb1,t04,t04 -+ -+ addl A,8*SIZE,A # 4m*2k -+ LDDE b0,0(B) # get 3rd 2b -+ LDDE b1,1*SIZE(B) -+ -+ VLD a0,0(A) # get 3rd 4a -+ bne KC,$Panel_4x2x2 -+ -+ -+$Rest_4x2x1: -+ LDDE ALPHA,192($sp) # get alpha -+#ifndef TRMMKERNEL -+ blbc KC1,$Write_4x2 -+#else -+ blbc TEMP,$Write_4x2 -+#endif -+ -+ addl A,4*SIZE,A # 4m*1k -+ addl B,2*SIZE,B # 2n*1K -+ -+ VMAD a0,b0,t00,t00 -+ VMAD a0,b1,t04,t04 -+ -+ -+$Write_4x2: -+ -+#ifndef TRMMKERNEL -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_4x2 -+ -+$Align_CO_Access_4x2: -+ VLD c00,0(CO) # get 1st colum of 16c -+ VMAD t00,ALPHA,c00,t00 -+ VST t00,0(CO) -+ jmp $Access_C1_4x2 -+ -+$UnAlign_CO_Access_4x2: -+ VLD_UL c00, 0*VEC_LEN*SIZE(CO) -+ VLD_UH c01, 1*VEC_LEN*SIZE(CO) -+ -+ vbisw c00,c01,c00 -+ -+ VMAD t00,ALPHA,c00,t00 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_4x2: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ addl CO,4*SIZE,CO # 4c -+ nop -+ bne $6,$UnAlign_C1_Access_4x2 -+ -+$Align_C1_Access_4x2: -+ VLD c04,0(C1) -+ VMAD t04,ALPHA,c04,t04 -+ VST t04,0(C1) -+ addl C1,4*SIZE,C1 -+ jmp .L27 -+ -+$UnAlign_C1_Access_4x2: -+ VLD_UL c04, 0*VEC_LEN*SIZE(C1) -+ VLD_UH c05, 1*VEC_LEN*SIZE(C1) -+ -+ vbisw c04,c05,c04 -+ -+ VMAD t04,ALPHA,c04,t04 -+ -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ addl C1,4*SIZE,C1 -+ -+#else -+ -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_4x2 -+ -+$Align_CO_Access_4x2: -+ VMUL t00,ALPHA,t00 -+ VST t00,0(CO) -+ jmp $Access_C1_4x2 -+ -+$UnAlign_CO_Access_4x2: -+ VMUL t00,ALPHA,t00 -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ -+$Access_C1_4x2: -+ and C1, (VEC_LEN*SIZE-1),$6 -+ addl CO,4*SIZE,CO # 4c -+ nop -+ bne $6,$UnAlign_C1_Access_4x2 -+ -+$Align_C1_Access_4x2: -+ VMUL t04,ALPHA,t04 -+ VST t04,0(C1) -+ addl C1,4*SIZE,C1 -+ jmp $TRMMKERNEL_4x2 -+ -+$UnAlign_C1_Access_4x2: -+ VMUL t04,ALPHA,t04 -+ VST_UL t04, 0*VEC_LEN*SIZE(C1) -+ VST_UH t04, 1*VEC_LEN*SIZE(C1) -+ addl C1,4*SIZE,C1 -+ -+$TRMMKERNEL_4x2: -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 4, TEMP -+#else -+ subl TEMP, 2, TEMP -+#endif -+ -+ sll TEMP, 2 + BASE_SHIFT,KC -+ sll TEMP, 1 + BASE_SHIFT,TEMP -+ -+ addl A, KC, A -+ addl B, TEMP, B -+#endif -+ -+#ifdef LEFT -+ addl KK, 4, KK -+ nop -+#endif -+#endif -+ -+ -+ -+ .align 5 -+ -+.L27: # nr=2,mr=2-------------- -+ and MC1,2,MC -+ beq MC,.L28 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+#else -+ sll KK, 1 + BASE_SHIFT,KC # mr=nr=2 -+ nop -+ addl A,KC,A -+ addl B1,KC,B -+#endif -+ -+ fclr t00 # clear 4 register -+ fclr t01 -+ fclr t04 -+ fclr t05 -+ -+ LD b0,0(B) # get 2b -+ LD b1,1*SIZE(B) -+ -+ LD a0,0(A) # get 2a -+ LD a4,1*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#else -+ addl KK, 2, TEMP # mr=nr=2 -+#endif -+ sra TEMP,1, KC -+ nop -+ nop -+ beq KC,$Rest_2x2x1 -+ -+#else -+ -+ mov B1,B # Reset B -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ fclr t00 # clear 4 register -+ fclr t01 -+ fclr t04 -+ fclr t05 -+ -+ LD b0,0(B) # get 2b -+ LD b1,1*SIZE(B) -+ -+ LD a0,0(A) # get 2a -+ LD a4,1*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 0(C1) -+ beq KC,$Rest_2x2x1 -+ -+#endif -+ -+ -+$Panel_2x2x2: -+ MAD a0,b0,t00,t00 -+ MAD a0,b1,t04,t04 -+ -+ LD nb0,2*SIZE(B) # get next 2b -+ LD nb1,3*SIZE(B) -+ -+ MAD a4,b0,t01,t01 -+ MAD a4,b1,t05,t05 -+ -+ addl B,4*SIZE,B # 2(n)*2(k) -+ LD a8,2*SIZE(A) # get next 2a -+ LD a12,3*SIZE(A) -+ -+ subl KC,1,KC -+ MAD a8,nb0,t00,t00 -+ MAD a8,nb1,t04,t04 -+ -+ addl A,4*SIZE,A # 2m*2k -+ LD b0,0(B) -+ LD b1,1*SIZE(B) -+ -+ MAD a12,nb0,t01,t01 -+ MAD a12,nb1,t05,t05 -+ -+ LD a0,0(A) -+ LD a4,1*SIZE(A) -+ bne KC,$Panel_2x2x2 -+ -+ -+$Rest_2x2x1: -+ LD ALPHA,192($sp) # Get ALPHA -+#ifndef TRMMKERNEL -+ blbc KC1,$Write_2x2 -+#else -+ blbc TEMP,$Write_2x2 -+#endif -+ -+ addl A,2*SIZE,A # 2m*1k -+ addl B,2*SIZE,B # 2n*1K -+ -+ MAD a0,b0,t00,t00 -+ MAD a0,b1,t04,t04 -+ MAD a4,b0,t01,t01 -+ MAD a4,b1,t05,t05 -+ -+ -+$Write_2x2: -+ -+#ifndef TRMMKERNEL -+ LD c00,0(CO) -+ LD c04,0(C1) -+ LD c01,1*SIZE(CO) -+ LD c05,1*SIZE(C1) -+ -+ MAD t00,ALPHA,c00,t00 -+ MAD t04,ALPHA,c04,t04 -+ MAD t01,ALPHA,c01,t01 -+ MAD t05,ALPHA,c05,t05 -+ -+ ST t00,0(CO) -+ ST t04,0(C1) -+ ST t01,1*SIZE(CO) -+ ST t05,1*SIZE(C1) -+ -+ addl CO,2*SIZE,CO # 2c -+ addl C1,2*SIZE,C1 -+ -+#else -+ -+ MUL t00,ALPHA,t00 -+ MUL t04,ALPHA,t04 -+ MUL t01,ALPHA,t01 -+ MUL t05,ALPHA,t05 -+ -+ ST t00,0(CO) -+ ST t04,0(C1) -+ ST t01,1*SIZE(CO) -+ ST t05,1*SIZE(C1) -+ -+ addl CO,2*SIZE,CO # 2c -+ addl C1,2*SIZE,C1 -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+ subl TEMP, 2, TEMP -+ -+ sll TEMP, 1 + BASE_SHIFT, KC -+ nop -+ -+ addl A,KC, A -+ addl B,KC, B -+#endif -+ -+#ifdef LEFT -+ addl KK, 2, KK -+#endif -+#endif -+ -+ -+ -+ .align 5 -+.L28: -+ and MC1,1,MC # nr=2,mr=1------------------- -+ beq MC,$End_NC_Unroll2 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+#else -+ sll KK, BASE_SHIFT,KC # mr=1 -+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2 -+ -+ addl A,KC,A -+ addl B1,TEMP,B -+#endif -+ -+ fclr t00 # clear 2 registers -+ fclr t04 -+ -+ LD b0,0(B) # 2b -+ LD b1,1*SIZE(B) -+ -+ LD a0,0(A) # 1a -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 1, TEMP -+#else -+ addl KK, 2, TEMP -+#endif -+ sra TEMP,1,KC -+ nop -+ beq KC,$Rest_1x2x1 -+ -+#else -+ mov B1,B # Reset B -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ fclr t00 # clear 2 registers -+ fclr t04 -+ -+ LD b0,0(B) # 2b -+ LD b1,1*SIZE(B) -+ -+ LD a0,0(A) # 1a -+ beq KC,$Rest_1x2x1 -+#endif -+ -+ -+ .align 5 -+ -+$Panel_1x2x2: -+ MAD a0,b0,t00,t00 -+ MAD a0,b1,t04,t04 -+ -+ LD nb0,2*SIZE(B) # get next 2b -+ LD nb1,3*SIZE(B) -+ -+ addl B,4*SIZE,B # 2(n)*2(k) -+ LD a8,1*SIZE(A) # get next 1a -+ -+ subl KC,1,KC -+ MAD a8,nb0,t00,t00 -+ MAD a8,nb1,t04,t04 -+ -+ addl A,2*SIZE,A # 1m*2k -+ LD b0,0(B) # get 3rd 2b -+ LD b1,1*SIZE(B) -+ -+ LD a0,0(A) # get 3rd 1a -+ bne KC,$Panel_1x2x2 -+ -+ -+$Rest_1x2x1: -+ LD ALPHA,192($sp) # Get ALPHA -+#ifndef TRMMKERNEL -+ blbc KC1,$Write_1x2 -+#else -+ blbc TEMP,$Write_1x2 -+#endif -+ -+ addl A,1*SIZE,A # 1m*1k -+ addl B,2*SIZE,B # 2n*1K -+ -+ MAD a0,b0,t00,t00 -+ MAD a0,b1,t04,t04 -+ -+ -+$Write_1x2: # Write back 2 results -+#ifndef TRMMKERNEL -+ LD c00,0(CO) -+ LD c04,0(C1) -+ -+ MAD t00,ALPHA,c00,t00 -+ MAD t04,ALPHA,c04,t04 -+ -+ ST t00,0(CO) -+ ST t04,0(C1) -+ -+#else -+ -+ MUL t00,ALPHA,t00 -+ MUL t04,ALPHA,t04 -+ -+ ST t00,0(CO) -+ ST t04,0(C1) -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 1,TEMP -+#else -+ subl TEMP, 2,TEMP -+#endif -+ -+ sll TEMP, BASE_SHIFT,KC -+ sll TEMP, 1 + BASE_SHIFT,TEMP -+ -+ addl A,KC,A -+ addl B,TEMP,B -+#endif -+ -+#ifdef LEFT -+ addl KK,1,KK -+#endif -+#endif -+ -+ -+ .align 5 -+ -+$End_NC_Unroll2: -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addl KK, 2,KK -+#endif -+ mov B, B1 -+ -+ -+ .align 5 -+$Begin_NC_Unroll1: # Nr=1 -+ and NC1,1,NC # NC=NC1&1 -+ beq NC,$Kernel_End -+ -+ mov A1,A # Reset A -+ mov C,CO # Reset C -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET,KK # reset offset -+#endif -+ -+ sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC -+ subl PREA,16*SIZE,PREA -+ -+ sra MC1,4,MC # MC=MC1/16 -+ beq MC,.L35 # MC=0:MC1<16 -+ -+ -+.L3: # nr=1,mr=16 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1,B -+#else -+ sll KK, 4 + BASE_SHIFT, KC # mr=16 -+ sll KK, BASE_SHIFT,TEMP # nr=1 -+ -+ addl A,KC,A -+ addl B1,TEMP,B -+#endif -+ -+ vcpys $f31,$f31,t00 # CLEAR 16 Register -+ vcpys $f31,$f31,t01 -+ vcpys $f31,$f31,t02 -+ vcpys $f31,$f31,t03 -+ -+ LDDE b0,0(B) # get 1b and 16a -+ -+ VLD a0,0(A) -+ VLD a4,4*SIZE(A) -+ VLD a8,8*SIZE(A) -+ VLD a12,12*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 4*SIZE(CO) -+ fillcs 8*SIZE(CO) -+ fillcs 12*SIZE(CO) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 16, TEMP -+#else -+ addl KK, 1, TEMP -+#endif -+ sra TEMP, 1, KC -+ beq KC,$Rest_16x1x1 -+ -+#else -+ -+ mov B1,B # Set B -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ vcpys $f31,$f31,t00 # CLEAR 16 Register -+ vcpys $f31,$f31,t01 -+ vcpys $f31,$f31,t02 -+ vcpys $f31,$f31,t03 -+ -+ LDDE b0,0(B) # get 1b and 16a -+ -+ VLD a0,0(A) -+ VLD a4,4*SIZE(A) -+ VLD a8,8*SIZE(A) -+ VLD a12,12*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 4*SIZE(CO) -+ fillcs 8*SIZE(CO) -+ fillcs 12*SIZE(CO) -+ -+ beq KC,$Rest_16x1x1 -+ -+#endif -+ -+$Panel_16x1x2: -+ addl A,16*SIZE,A # 16(m)*1(k) -+ LDDE b1,1*SIZE(B) # get next 1b -+ -+ VMAD a0,b0,t00,t00 -+ VMAD a4,b0,t01,t01 -+ -+ addl B,2*SIZE,B # 1(n)*2(k) -+ VLD na0,0(A) # get next 16a -+ VLD na4,4*SIZE(A) -+ VLD na8,8*SIZE(A) -+ VLD na12,12*SIZE(A) -+ -+ VMAD a8,b0,t02,t02 -+ VMAD a12,b0,t03,t03 -+ -+ subl KC,1,KC -+ addl A,16*SIZE,A # 16m*1k -+ LDDE b0,0(B) -+ -+ VMAD na0,b1,t00,t00 -+ VMAD na4,b1,t01,t01 -+ -+ VLD a0,0(A) -+ VLD a4,4*SIZE(A) -+ VLD a8,8*SIZE(A) -+ VLD a12,12*SIZE(A) -+ -+ VMAD na8,b1,t02,t02 -+ VMAD na12,b1,t03,t03 -+ bne KC,$Panel_16x1x2 -+ -+ -+$Rest_16x1x1: -+ LDDE ALPHA,192($sp) -+#ifndef TRMMKERNEL -+ blbc KC1,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1 -+#else -+ blbc TEMP,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1 -+#endif -+ -+ addl A,16*SIZE,A # 16a*1k -+ addl B,1*SIZE,B # 1b*1k -+ -+ VMAD a0,b0,t00,t00 -+ VMAD a4,b0,t01,t01 -+ VMAD a8,b0,t02,t02 -+ VMAD a12,b0,t03,t03 -+ -+ -+$Write_16x1: -+ -+#ifndef TRMMKERNEL -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_16x1 -+ -+$Align_CO_Access_16x1: -+ VLD c00,0(CO) # get 1st colum of 16c -+ VLD c01,4*SIZE(CO) -+ VLD c02,8*SIZE(CO) -+ VLD c03,12*SIZE(CO) -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ VMAD t02,ALPHA,c02,t02 -+ VMAD t03,ALPHA,c03,t03 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ VST t02,8*SIZE(CO) -+ VST t03,12*SIZE(CO) -+ jmp $Kernel_End -+ -+$UnAlign_CO_Access_16x1: -+ VLD_UL c00, 0*VEC_LEN*SIZE(CO) -+ VLD_UH c04, 1*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c01, 1*VEC_LEN*SIZE(CO) -+ VLD_UH c05, 2*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c02, 2*VEC_LEN*SIZE(CO) -+ VLD_UH c06, 3*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c03, 3*VEC_LEN*SIZE(CO) -+ VLD_UH c07, 4*VEC_LEN*SIZE(CO) -+ -+ vbisw c00,c04,c00 -+ vbisw c01,c05,c01 -+ vbisw c02,c06,c02 -+ vbisw c03,c07,c03 -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ VMAD t02,ALPHA,c02,t02 -+ VMAD t03,ALPHA,c03,t03 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ VST_UL t02, 2*VEC_LEN*SIZE(CO) -+ VST_UH t02, 3*VEC_LEN*SIZE(CO) -+ -+ VST_UL t03, 3*VEC_LEN*SIZE(CO) -+ VST_UH t03, 4*VEC_LEN*SIZE(CO) -+ jmp $Kernel_End -+ -+#else -+ -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_16x1 -+ -+$Align_CO_Access_16x1: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ VMUL t02,ALPHA,t02 -+ VMUL t03,ALPHA,t03 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ VST t02,8*SIZE(CO) -+ VST t03,12*SIZE(CO) -+ jmp $TRMMKERNEL_16x1 -+ -+$UnAlign_CO_Access_16x1: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ VMUL t02,ALPHA,t02 -+ VMUL t03,ALPHA,t03 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+ VST_UL t02, 2*VEC_LEN*SIZE(CO) -+ VST_UH t02, 3*VEC_LEN*SIZE(CO) -+ -+ VST_UL t03, 3*VEC_LEN*SIZE(CO) -+ VST_UH t03, 4*VEC_LEN*SIZE(CO) -+ -+$TRMMKERNEL_16x1: -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 16, TEMP -+#else -+ subl TEMP, 1,TEMP -+#endif -+ -+ sll TEMP, 4 + BASE_SHIFT,KC -+ sll TEMP, BASE_SHIFT, TEMP -+ -+ addl A,KC,A -+ addl B,TEMP,B -+#endif -+ -+#ifdef LEFT -+ addl KK, 16, KK -+ nop -+#endif -+ -+ jmp $Kernel_End -+#endif -+ -+ -+ -+ .align 5 -+.L35: # nr=1,mr=8------------------ -+ and MC1,8,MC -+ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc -+ nop -+ beq MC,.L36 # MC1<8 -+ -+ addl A1,SPANA,PREA -+ subl PREA,8*SIZE,PREA # PREA-=MC -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+#else -+ sll KK, 3 + BASE_SHIFT,KC # mr=8 -+ sll KK, BASE_SHIFT,TEMP # nr=1 -+ -+ addl A,KC, A -+ addl B1,TEMP,B -+#endif -+ -+ vcpys $f31,$f31,t00 # CLEAR 8Register -+ vcpys $f31,$f31,t01 -+ -+ LDDE b0,0(B) # get 1b -+ -+ VLD a0,0(A) # get 8a -+ VLD a4,4*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 4*SIZE(CO) -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK,TEMP -+#elif defined(LEFT) -+ addl KK, 8,TEMP -+#else -+ addl KK, 1,TEMP -+#endif -+ sra TEMP,1,KC -+ nop -+ beq KC,$Rest_8x1x1 -+ -+#else -+ -+ mov B1, B -+ sra KC1,1,KC -+ vcpys $f31,$f31,t00 # CLEAR 8Register -+ vcpys $f31,$f31,t01 -+ -+ LDDE b0,0(B) # get 1b -+ -+ VLD a0,0(A) # get 8a -+ VLD a4,4*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ fillcs 4*SIZE(CO) -+ beq KC,$Rest_8x1x1 -+ -+#endif -+ -+ -+$Panel_8x1x2: -+ VMAD a0,b0,t00,t00 -+ VMAD a4,b0,t01,t01 -+ -+ LDDE nb0,1*SIZE(B) # get next 1b -+ -+ addl B,2*SIZE,B # 1(n)*2k -+ VLD na8,8*SIZE(A) # get next 8a -+ VLD na12,12*SIZE(A) -+ -+ fillcs 0(PREA) -+ subl PREA,8*SIZE,PREA -+ -+ subl KC,1,KC -+ VMAD na8,nb0,t00,t00 -+ VMAD na12,nb0,t01,t01 -+ -+ addl A,16*SIZE,A # 8m*2k -+ LDDE b0,0(B) # get 3rd 1b -+ -+ VLD a0,0(A) # get 3rd 8a -+ VLD a4,4*SIZE(A) -+ -+ fillcs 0(PREA) -+ subl PREA,8*SIZE,PREA -+ bne KC,$Panel_8x1x2 -+ -+ -+$Rest_8x1x1: -+ LDDE ALPHA,192($sp) # Get ALPHA -+#ifndef TRMMKERNEL -+ blbc KC1,$Write_8x1 -+#else -+ blbc TEMP,$Write_8x1 -+#endif -+ -+ addl A,8*SIZE,A # 8m*1k -+ addl B,1*SIZE,B # 1n*1k -+ -+ VMAD a0,b0,t00,t00 -+ VMAD a4,b0,t01,t01 -+ -+ -+$Write_8x1: -+ -+#ifndef TRMMKERNEL -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_8x1 -+ -+$Align_CO_Access_8x1: -+ VLD c00,0(CO) # get 1st colum of 16c -+ VLD c01,4*SIZE(CO) -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ addl CO,8*SIZE,CO # 8c -+ jmp .L36 -+ -+$UnAlign_CO_Access_8x1: -+ VLD_UL c00, 0*VEC_LEN*SIZE(CO) -+ VLD_UH c02, 1*VEC_LEN*SIZE(CO) -+ -+ VLD_UL c01, 1*VEC_LEN*SIZE(CO) -+ VLD_UH c03, 2*VEC_LEN*SIZE(CO) -+ -+ vbisw c00,c02,c00 -+ vbisw c01,c03,c01 -+ -+ VMAD t00,ALPHA,c00,t00 -+ VMAD t01,ALPHA,c01,t01 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ addl CO,8*SIZE,CO # 8c -+ -+#else -+ -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_8x1 -+ -+$Align_CO_Access_8x1: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ -+ VST t00,0(CO) -+ VST t01,4*SIZE(CO) -+ jmp $TRMMKERNEL_8x1 -+ -+$UnAlign_CO_Access_8x1: -+ VMUL t00,ALPHA,t00 -+ VMUL t01,ALPHA,t01 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+ VST_UL t01, 1*VEC_LEN*SIZE(CO) -+ VST_UH t01, 2*VEC_LEN*SIZE(CO) -+ -+$TRMMKERNEL_8x1: -+ addl CO,8*SIZE,CO # 8c -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 8, TEMP -+#else -+ subl TEMP, 1, TEMP -+#endif -+ -+ sll TEMP, 3 + BASE_SHIFT, KC -+ sll TEMP, BASE_SHIFT,TEMP -+ -+ addl A,KC, A -+ addl B,TEMP,B -+#endif -+ -+#ifdef LEFT -+ addl KK,8, KK -+#endif -+#endif -+ -+ -+ -+ .align 5 -+.L36: # nr=1,mr=4--------------- -+ and MC1,4,MC # MC1&4 -+ beq MC,.L37 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA))\ -+ || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+#else -+ sll KK, 2 + BASE_SHIFT, KC # mr=4 -+ sll KK, BASE_SHIFT, TEMP # nr=1 -+ -+ addl A,KC,A -+ addl B1,TEMP,B -+#endif -+ -+ vcpys $f31,$f31,t00 # CLEAR 4 Register -+ -+ LDDE b0,0(B) -+ VLD a0,0(A) -+ -+ fillcs 0(CO) # fetch C -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 4, TEMP # mr=4 -+#else -+ addl KK, 1, TEMP # nr=1 -+#endif -+ sra TEMP,1, KC -+ beq KC,$Rest_4x1x1 -+ -+#else -+ -+ mov B1,B # Reset B -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ vcpys $f31,$f31,t00 # CLEAR 4 Register -+ -+ LDDE b0,0(B) -+ VLD a0,0(A) -+ -+ fillcs 0(CO) # fetch C -+ beq KC,$Rest_4x1x1 -+#endif -+ -+ -+$Panel_4x1x2: -+ VMAD a0,b0,t00,t00 -+ -+ LDDE nb0,1*SIZE(B) -+ VLD a4,4*SIZE(A) -+ addl B,2*SIZE,B # 1(n)*2(k)*8Byte -+ -+ subl KC,1,KC -+ VMAD a4,nb0,t00,t00 -+ -+ addl A,8*SIZE,A # 4m*2k -+ LDDE b0,0(B) -+ VLD a0,0(A) -+ -+ bne KC,$Panel_4x1x2 -+ -+ -+$Rest_4x1x1: -+ LDDE ALPHA,192($sp) # Get ALPHA -+#ifndef TRMMKERNEL -+ blbc KC1,$Write_4x1 -+#else -+ blbc TEMP,$Write_4x1 -+#endif -+ -+ addl A,4*SIZE,A # 4m*1k -+ addl B,1*SIZE,B # 1n*1K -+ -+ VMAD a0,b0,t00,t00 -+ -+ -+$Write_4x1: # Write back 4 results -+ -+#ifndef TRMMKERNEL -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_4x1 -+ -+$Align_CO_Access_4x1: -+ VLD c00,0(CO) # get 1st colum of 16c -+ VMAD t00,ALPHA,c00,t00 -+ VST t00,0(CO) -+ addl CO,4*SIZE,CO # 4c -+ jmp .L37 -+ -+$UnAlign_CO_Access_4x1: -+ VLD_UL c00, 0*VEC_LEN*SIZE(CO) -+ VLD_UH c01, 1*VEC_LEN*SIZE(CO) -+ -+ vbisw c00,c01,c00 -+ -+ VMAD t00,ALPHA,c00,t00 -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ addl CO,4*SIZE,CO # 4c -+ -+ -+#else -+ and CO, (VEC_LEN*SIZE-1),$6 -+ bne $6,$UnAlign_CO_Access_4x1 -+ -+$Align_CO_Access_4x1: -+ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register -+ VST t00,0(CO) -+ jmp $TRMMKERNEL_4x1 -+ -+$UnAlign_CO_Access_4x1: -+ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register -+ -+ VST_UL t00, 0*VEC_LEN*SIZE(CO) -+ VST_UH t00, 1*VEC_LEN*SIZE(CO) -+ -+$TRMMKERNEL_4x1: -+ addl CO,4*SIZE,CO # 4c -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 4, TEMP # mr=4 -+#else -+ subl TEMP, 1, TEMP -+#endif -+ -+ sll TEMP, 2 + BASE_SHIFT, KC -+ sll TEMP, BASE_SHIFT, TEMP -+ -+ addl A, KC, A -+ addl B, TEMP,B -+#endif -+ -+#ifdef LEFT -+ addl KK, 4, KK -+#endif -+#endif -+ -+ -+ -+ -+ .align 5 -+.L37: # nr=1,mr=2------------------------- -+ and MC1,2,MC -+ beq MC,.L38 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+#else -+ sll KK, 1 + BASE_SHIFT,KC # mr=2 -+ sll KK, BASE_SHIFT, TEMP # nr=1 -+ -+ addl A,KC, A -+ addl B1,TEMP,B -+#endif -+ -+ fclr t00 # CLEAR 2 Register -+ fclr t01 -+ -+ LD b0,0(B) -+ -+ LD a0,0(A) -+ LD a4,1*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 2,TEMP -+#else -+ addl KK, 1,TEMP -+#endif -+ sra TEMP,1,KC -+ beq KC,.L373 -+ -+#else -+ -+ mov B1,B # Reset B -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ fclr t00 # CLEAR 2 Register -+ fclr t01 -+ -+ LD b0,0(B) -+ -+ LD a0,0(A) -+ LD a4,1*SIZE(A) -+ -+ fillcs 0(CO) # fetch C -+ beq KC,.L373 -+ -+#endif -+ -+.L371: -+ MAD a0,b0,t00,t00 -+ MAD a4,b0,t01,t01 -+ -+ LD nb0,1*SIZE(B) -+ -+ addl B,2*SIZE,B # 1(n)*2(k) -+ LD a8,2*SIZE(A) -+ LD a12,3*SIZE(A) -+ -+ subl KC,1,KC -+ MAD a8,nb0,t00,t00 -+ MAD a12,nb0,t01,t01 -+ -+ addl A,4*SIZE,A # 2m*2k -+ LD b0,0(B) -+ -+ LD a0,0(A) -+ LD a4,1*SIZE(A) -+ bne KC,.L371 -+ -+.L373: -+ LD ALPHA,192($sp) # Get ALPHA -+#ifndef TRMMKERNEL -+ blbc KC1,.L374 -+#else -+ blbc TEMP,.L374 -+#endif -+ -+ addl A,2*SIZE,A # 2m*1k*8Byte -+ addl B,1*SIZE,B # 1n*1K*8Byte -+ -+ MAD a0,b0,t00,t00 -+ MAD a4,b0,t01,t01 -+ -+.L374: # Write back 2 results -+ -+#ifndef TRMMKERNEL -+ LD c00,0(CO) -+ LD c01,1*SIZE(CO) -+ -+ MAD t00,ALPHA,c00,t00 -+ MAD t01,ALPHA,c01,t01 -+ -+ ST t00,0(CO) -+ ST t01,1*SIZE(CO) -+ addl CO,2*SIZE,CO # 2c -+ -+#else -+ -+ MUL t00,ALPHA,t00 -+ MUL t01,ALPHA,t01 -+ -+ ST t00,0(CO) -+ ST t01,1*SIZE(CO) -+ -+ addl CO,2*SIZE,CO # 2c -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl KC1, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 2, TEMP -+#else -+ subl TEMP, 1, TEMP -+#endif -+ -+ sll TEMP, 1 + BASE_SHIFT,KC -+ sll TEMP, BASE_SHIFT,TEMP -+ -+ addl A,KC,A -+ addl B,TEMP,B -+#endif -+ -+#ifdef LEFT -+ addl KK, 2, KK -+#endif -+#endif -+ -+ -+ -+ .align 5 -+.L38: -+ and MC1,1,MC -+ beq MC,$Kernel_End -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B1, B -+#else -+ sll KK, BASE_SHIFT,KC # mr=nr=1 -+ nop -+ -+ addl A,KC,A -+ addl B1,KC,B -+#endif -+ -+ fclr t00 # CLEAR Results Register -+ -+ LD b0,0(B) -+ LD a0,0(A) # Get 16 A and 4 B -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl KC1, KK, TEMP -+#else -+ addl KK, 1, TEMP # mr=nr=1 -+#endif -+ sra TEMP,1,KC -+ nop -+ beq KC,.L383 -+ -+#else -+ -+ mov B1,B # Reset B -+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 -+ fclr t00 # CLEAR Results Register -+ -+ LD b0,0(B) -+ LD a0,0(A) # Get 16 A and 4 B -+ -+ beq KC,.L383 -+#endif -+ -+.L381: -+ MAD a0,b0,t00,t00 -+ LD nb0,1*SIZE(B) -+ -+ addl B,2*SIZE,B # 1n*2k -+ LD a8,1*SIZE(A) -+ -+ -+ subl KC,1,KC -+ MAD a8,nb0,t00,t00 -+ -+ addl A,2*SIZE,A # 1m*2k -+ LD b0,0(B) -+ -+ LD a0,0(A) -+ bne KC,.L381 -+ -+ -+.L383: -+ LD ALPHA,192($sp) # get alpha -+#ifndef TRMMKERNEL -+ blbc KC1,.L384 -+#else -+ blbc TEMP,.L384 -+#endif -+ -+ addl A,1*SIZE,A # 1m*1k -+ addl B,1*SIZE,B # 1n*1K -+ -+ MAD a0,b0,t00,t00 -+ -+ -+.L384: # Write back 1 results -+ -+#ifndef TRMMKERNEL -+ LD c00,0(CO) -+ MAD t00,ALPHA,c00,t00 -+ ST t00,0(CO) -+ -+#else -+ MUL t00,ALPHA,t00 -+ ST t00,0(CO) -+#endif -+ -+ -+ -+$Kernel_End: -+ ldl $9,328($sp) # Integer Saved Register -+ ldl $10,320($sp) -+ ldl $11,312($sp) -+ ldl $12,304($sp) -+ ldl $13,296($sp) -+ldl $14,288($sp) -+# Float Saved Register -+ LD $f2,280($sp) -+ LD $f3,272($sp) -+ LD $f4,264($sp) -+ LD $f5,256($sp) -+ LD $f6,248($sp) -+ LD $f7,240($sp) -+ LD $f8,232($sp) -+LD $f9,224($sp) -+ -+ ldi $sp,STACKSIZE($sp) # -+ ret $31,($26),1 # -+ -+ EPILOGUE -+ -+ -diff --git a/kernel/sw_64/gemv_n.S b/kernel/sw_64/gemv_n.S -new file mode 100644 -index 0000000..90284db ---- /dev/null -+++ b/kernel/sw_64/gemv_n.S -@@ -0,0 +1,1647 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define STACKSIZE 72 -+#define PREFETCHSIZE 32 -+ -+#define M $16 -+#define N $17 -+#define A $20 -+#define LDA $21 -+ -+#define X $18 -+#define INCX $19 -+#define Y $22 -+#define INCY $23 -+ -+#define BUFFER $24 -+ -+#define I $25 -+#define J $27 -+ -+#define Y1 $4 -+ -+#define A1 $5 -+#define A2 $6 -+#define A3 $7 -+#define A4 $8 -+ -+#define alpha $f19 -+ -+#define alpha1 $f0 -+#define alpha2 $f1 -+#define alpha3 $f10 -+#define alpha4 $f11 -+ -+#define y0 $f12 -+#define y1 $f13 -+#define y2 $f14 -+#define y3 $f15 -+ -+#define y4 $f16 -+#define y5 $f17 -+#define y6 $f18 -+#define y7 $f21 -+ -+#define a0 $f22 -+#define a1 $f23 -+#define a2 $f24 -+#define a3 $f25 -+#define a4 $f26 -+#define a5 $f27 -+#define a6 $f28 -+#define a7 $f29 -+ -+#define a8 $f2 -+#define a9 $f3 -+#define a10 $f4 -+#define a11 $f5 -+#define a12 $f6 -+#define a13 $f7 -+#define a14 $f8 -+#define a15 $f9 -+ -+#define tmp $f20 -+ PROLOGUE -+ -+ ldi $sp, -STACKSIZE($sp) -+ ldl X, 0 + STACKSIZE($sp) -+ ldl INCX, 8 + STACKSIZE($sp) -+ ldl Y, 16 + STACKSIZE($sp) -+ ldl INCY, 24 + STACKSIZE($sp) -+ ldl BUFFER, 32 + STACKSIZE($sp) -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ fstd tmp, 64($sp) -+ PROFCODE -+ -+ cmple M, 0, $0 -+ SXADDQ INCX, 0, INCX -+ cmple N, 0, $1 -+ SXADDQ INCY, 0, INCY -+ -+ or $0, $1, $0 -+ bne $0, $L999 -+ -+ SXADDQ LDA, 0, LDA -+ -+ cmpeq INCY, SIZE, $0 -+ bne $0, $L10 -+ -+ mov BUFFER, Y1 -+ -+ mov Y, BUFFER -+ mov Y1, Y -+ -+ sra M, 3, I -+ ble I, $L05 -+ .align 4 -+ -+$L02: -+ ST $f31, 0 * SIZE(Y1) -+ ST $f31, 1 * SIZE(Y1) -+ ST $f31, 2 * SIZE(Y1) -+ ST $f31, 3 * SIZE(Y1) -+ ST $f31, 4 * SIZE(Y1) -+ ST $f31, 5 * SIZE(Y1) -+ ST $f31, 6 * SIZE(Y1) -+ ST $f31, 7 * SIZE(Y1) -+ -+ ldi Y1, 8 * SIZE(Y1) -+ ldi I, -1(I) -+ bgt I, $L02 -+ .align 4 -+ -+$L05: -+ and M, 7, I -+ ble I, $L10 -+ .align 4 -+ -+$L06: -+ ST $f31, 0 * SIZE(Y1) -+ addl Y1, SIZE, Y1 -+ -+ ldi I, -1(I) -+ bgt I, $L06 -+ .align 4 -+ -+$L10: -+ sra N, 2, J -+ ble J, $L20 -+ .align 4 -+ -+$L11: -+ LD alpha1, 0 * SIZE(X) -+ addl X, INCX, X -+ LD alpha2, 0 * SIZE(X) -+ addl X, INCX, X -+ LD alpha3, 0 * SIZE(X) -+ addl X, INCX, X -+ LD alpha4, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ MUL alpha, alpha1, tmp -+ fmov tmp, alpha1 -+ MUL alpha, alpha2, tmp -+ fmov tmp, alpha2 -+ MUL alpha, alpha3, tmp -+ fmov tmp, alpha3 -+ MUL alpha, alpha4, tmp -+ fmov tmp, alpha4 -+ -+ mov A, A1 -+ addl A, LDA, A2 -+ addl A2, LDA, A3 -+ addl A3, LDA, A4 -+ s4addl LDA, A, A -+ -+ mov Y, Y1 -+ ldw $31, 4 * SIZE(X) -+ -+ sra M, 3, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ -+ LD a8, 0 * SIZE(A3) -+ LD a9, 1 * SIZE(A3) -+ LD a10, 2 * SIZE(A3) -+ LD a11, 3 * SIZE(A3) -+ -+ LD y4, 4 * SIZE(Y1) -+ LD y5, 5 * SIZE(Y1) -+ LD y6, 6 * SIZE(Y1) -+ LD y7, 7 * SIZE(Y1) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD a12, 0 * SIZE(A4) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ LD a13, 1 * SIZE(A4) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ LD a14, 2 * SIZE(A4) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ LD a15, 3 * SIZE(A4) -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ LD a0, 4 * SIZE(A1) -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ unop -+ -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ LD a1, 5 * SIZE(A1) -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ unop -+ -+ ADD y2, a2, tmp -+ fmov tmp, y2 -+ LD a2, 6 * SIZE(A1) -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ unop -+ -+ ADD y3, a3, tmp -+ fmov tmp, y3 -+ LD a3, 7 * SIZE(A1) -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ unop -+ -+ ADD y0, a4, tmp -+ fmov tmp, y0 -+ LD a4, 4 * SIZE(A2) -+ MUL alpha3, a8, tmp -+ fmov tmp, a8 -+ unop -+ -+ ADD y1, a5, tmp -+ fmov tmp, y1 -+ LD a5, 5 * SIZE(A2) -+ MUL alpha3, a9, tmp -+ fmov tmp, a9 -+ ldi I, -1(I) -+ -+ ADD y2, a6, tmp -+ fmov tmp, y2 -+ LD a6, 6 * SIZE(A2) -+ MUL alpha3, a10, tmp -+ fmov tmp, a10 -+ unop -+ -+ ADD y3, a7, tmp -+ fmov tmp, y3 -+ LD a7, 7 * SIZE(A2) -+ MUL alpha3, a11, tmp -+ fmov tmp, a11 -+ unop -+ -+ ADD y0, a8, tmp -+ fmov tmp, y0 -+ LD a8, 4 * SIZE(A3) -+ MUL alpha4, a12, tmp -+ fmov tmp, a12 -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ADD y1, a9, tmp -+ fmov tmp, y1 -+ LD a9, 5 * SIZE(A3) -+ MUL alpha4, a13, tmp -+ fmov tmp, a13 -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) -+ -+ ADD y2, a10, tmp -+ fmov tmp, y2 -+ LD a10, 6 * SIZE(A3) -+ MUL alpha4, a14, tmp -+ fmov tmp, a14 -+ unop -+ -+ ADD y3, a11, tmp -+ fmov tmp, y3 -+ LD a11, 7 * SIZE(A3) -+ MUL alpha4, a15, tmp -+ fmov tmp, a15 -+ ldi I, -1(I) -+ -+ ADD y0, a12, tmp -+ fmov tmp, y0 -+ LD a12, 4 * SIZE(A4) -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) -+ -+ ADD y1, a13, tmp -+ fmov tmp, y1 -+ LD a13, 5 * SIZE(A4) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ unop -+ -+ ADD y2, a14, tmp -+ fmov tmp, y2 -+ LD a14, 6 * SIZE(A4) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ unop -+ -+ ADD y3, a15, tmp -+ fmov tmp, y3 -+ LD a15, 7 * SIZE(A4) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) -+ -+ ADD y4, a0, tmp -+ fmov tmp, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD y5, a1, tmp -+ fmov tmp, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ LD a1, 9 * SIZE(A1) -+ -+ ADD y6, a2, tmp -+ fmov tmp, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ LD a2, 10 * SIZE(A1) -+ -+ ADD y7, a3, tmp -+ fmov tmp, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ LD a3, 11 * SIZE(A1) -+ -+ ADD y4, a4, tmp -+ fmov tmp, y4 -+ LD a4, 8 * SIZE(A2) -+ MUL alpha3, a8, tmp -+ fmov tmp, a8 -+ LD y0, 8 * SIZE(Y1) -+ -+ ADD y5, a5, tmp -+ fmov tmp, y5 -+ LD a5, 9 * SIZE(A2) -+ MUL alpha3, a9, tmp -+ fmov tmp, a9 -+ LD y1, 9 * SIZE(Y1) -+ -+ ADD y6, a6, tmp -+ fmov tmp, y6 -+ LD a6, 10 * SIZE(A2) -+ MUL alpha3, a10, tmp -+ fmov tmp, a10 -+ LD y2, 10 * SIZE(Y1) -+ -+ ADD y7, a7, tmp -+ fmov tmp, y7 -+ LD a7, 11 * SIZE(A2) -+ MUL alpha3, a11, tmp -+ fmov tmp, a11 -+ LD y3, 11 * SIZE(Y1) -+ -+ ADD y4, a8, tmp -+ fmov tmp, y4 -+ LD a8, 8 * SIZE(A3) -+ MUL alpha4, a12, tmp -+ fmov tmp, a12 -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A3) -+ -+ ADD y5, a9, tmp -+ fmov tmp, y5 -+ LD a9, 9 * SIZE(A3) -+ MUL alpha4, a13, tmp -+ fmov tmp, a13 -+ ldi A1, 8 * SIZE(A1) -+ -+ ADD y6, a10, tmp -+ fmov tmp, y6 -+ LD a10, 10 * SIZE(A3) -+ MUL alpha4, a14, tmp -+ fmov tmp, a14 -+ ldi A2, 8 * SIZE(A2) -+ -+ ADD y7, a11, tmp -+ fmov tmp, y7 -+ LD a11, 11 * SIZE(A3) -+ MUL alpha4, a15, tmp -+ fmov tmp, a15 -+ ldi Y1, 8 * SIZE(Y1) -+ -+ ADD y4, a12, tmp -+ fmov tmp, y4 -+ LD a12, 8 * SIZE(A4) -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ unop -+ -+ ADD y5, a13, tmp -+ fmov tmp, y5 -+ LD a13, 9 * SIZE(A4) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ ldi A3, 8 * SIZE(A3) -+ -+ ADD y6, a14, tmp -+ fmov tmp, y6 -+ LD a14, 10 * SIZE(A4) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A4) -+ -+ ADD y7, a15, tmp -+ fmov tmp, y7 -+ LD a15, 11 * SIZE(A4) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ ldi A4, 8 * SIZE(A4) -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ LD a0, 4 * SIZE(A1) -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ ST y4, -4 * SIZE(Y1) -+ -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ LD a1, 5 * SIZE(A1) -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ ST y5, -3 * SIZE(Y1) -+ -+ ADD y2, a2, tmp -+ fmov tmp, y2 -+ LD a2, 6 * SIZE(A1) -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ ST y6, -2 * SIZE(Y1) -+ -+ ADD y3, a3, tmp -+ fmov tmp, y3 -+ LD a3, 7 * SIZE(A1) -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ ST y7, -1 * SIZE(Y1) -+ -+ ADD y0, a4, tmp -+ fmov tmp, y0 -+ LD a4, 4 * SIZE(A2) -+ MUL alpha3, a8, tmp -+ fmov tmp, a8 -+ LD y4, 4 * SIZE(Y1) -+ -+ ADD y1, a5, tmp -+ fmov tmp, y1 -+ LD a5, 5 * SIZE(A2) -+ MUL alpha3, a9, tmp -+ fmov tmp, a9 -+ LD y5, 5 * SIZE(Y1) -+ -+ ADD y2, a6, tmp -+ fmov tmp, y2 -+ LD a6, 6 * SIZE(A2) -+ MUL alpha3, a10, tmp -+ fmov tmp, a10 -+ LD y6, 6 * SIZE(Y1) -+ -+ ADD y3, a7, tmp -+ fmov tmp, y3 -+ LD a7, 7 * SIZE(A2) -+ MUL alpha3, a11, tmp -+ fmov tmp, a11 -+ LD y7, 7 * SIZE(Y1) -+ -+ ADD y0, a8, tmp -+ fmov tmp, y0 -+ LD a8, 4 * SIZE(A3) -+ MUL alpha4, a12, tmp -+ fmov tmp, a12 -+ bgt I, $L12 -+ .align 4 -+ -+$L13: -+ ADD y1, a9, tmp -+ fmov tmp, y1 -+ LD a9, 5 * SIZE(A3) -+ MUL alpha4, a13, tmp -+ fmov tmp, a13 -+ unop -+ -+ ADD y2, a10, tmp -+ fmov tmp, y2 -+ LD a10, 6 * SIZE(A3) -+ MUL alpha4, a14, tmp -+ fmov tmp, a14 -+ unop -+ -+ ADD y3, a11, tmp -+ fmov tmp, y3 -+ LD a11, 7 * SIZE(A3) -+ MUL alpha4, a15, tmp -+ fmov tmp, a15 -+ unop -+ -+ ADD y0, a12, tmp -+ fmov tmp, y0 -+ LD a12, 4 * SIZE(A4) -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ unop -+ -+ ADD y1, a13, tmp -+ fmov tmp, y1 -+ LD a13, 5 * SIZE(A4) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ unop -+ -+ ADD y2, a14, tmp -+ fmov tmp, y2 -+ LD a14, 6 * SIZE(A4) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ unop -+ -+ ADD y3, a15, tmp -+ fmov tmp, y3 -+ LD a15, 7 * SIZE(A4) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ unop -+ -+ ST y0, 0 * SIZE(Y1) -+ ADD y4, a0, tmp -+ fmov tmp, y4 -+ unop -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ -+ ST y1, 1 * SIZE(Y1) -+ ADD y5, a1, tmp -+ fmov tmp, y5 -+ unop -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ -+ ST y2, 2 * SIZE(Y1) -+ ADD y6, a2, tmp -+ fmov tmp, y6 -+ unop -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ -+ ST y3, 3 * SIZE(Y1) -+ ADD y7, a3, tmp -+ fmov tmp, y7 -+ ldi Y1, 8 * SIZE(Y1) -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ -+ ADD y4, a4, tmp -+ fmov tmp, y4 -+ MUL alpha3, a8, tmp -+ fmov tmp, a8 -+ ADD y5, a5, tmp -+ fmov tmp, y5 -+ MUL alpha3, a9, tmp -+ fmov tmp, a9 -+ ADD y6, a6, tmp -+ fmov tmp, y6 -+ MUL alpha3, a10, tmp -+ fmov tmp, a10 -+ ADD y7, a7, tmp -+ fmov tmp, y7 -+ MUL alpha3, a11, tmp -+ fmov tmp, a11 -+ -+ ADD y4, a8, tmp -+ fmov tmp, y4 -+ MUL alpha4, a12, tmp -+ fmov tmp, a12 -+ ADD y5, a9, tmp -+ fmov tmp, y5 -+ MUL alpha4, a13, tmp -+ fmov tmp, a13 -+ ADD y6, a10, tmp -+ fmov tmp, y6 -+ MUL alpha4, a14, tmp -+ fmov tmp, a14 -+ ADD y7, a11, tmp -+ fmov tmp, y7 -+ MUL alpha4, a15, tmp -+ fmov tmp, a15 -+ -+ ADD y4, a12, tmp -+ fmov tmp, y4 -+ ADD y5, a13, tmp -+ fmov tmp, y5 -+ ADD y6, a14, tmp -+ fmov tmp, y6 -+ ADD y7, a15, tmp -+ fmov tmp, y7 -+ -+ ST y4, -4 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y5, -3 * SIZE(Y1) -+ ldi A2, 8 * SIZE(A2) -+ ST y6, -2 * SIZE(Y1) -+ ldi A3, 8 * SIZE(A3) -+ ST y7, -1 * SIZE(Y1) -+ ldi A4, 8 * SIZE(A4) -+ .align 4 -+ -+$L15: -+ and M, 4, I -+ ble I, $L16 -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) -+ -+ LD a8, 0 * SIZE(A3) -+ LD a9, 1 * SIZE(A3) -+ LD a10, 2 * SIZE(A3) -+ LD a11, 3 * SIZE(A3) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD a12, 0 * SIZE(A4) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ LD a13, 1 * SIZE(A4) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ LD a14, 2 * SIZE(A4) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ LD a15, 3 * SIZE(A4) -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ ADD y2, a2, tmp -+ fmov tmp, y2 -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ ADD y3, a3, tmp -+ fmov tmp, y3 -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ -+ ADD y0, a4, tmp -+ fmov tmp, y0 -+ MUL alpha3, a8, tmp -+ fmov tmp, a8 -+ ADD y1, a5, tmp -+ fmov tmp, y1 -+ MUL alpha3, a9, tmp -+ fmov tmp, a9 -+ ADD y2, a6, tmp -+ fmov tmp, y2 -+ MUL alpha3, a10, tmp -+ fmov tmp, a10 -+ ADD y3, a7, tmp -+ fmov tmp, y3 -+ MUL alpha3, a11, tmp -+ fmov tmp, a11 -+ -+ ADD y0, a8, tmp -+ fmov tmp, y0 -+ MUL alpha4, a12, tmp -+ fmov tmp, a12 -+ ADD y1, a9, tmp -+ fmov tmp, y1 -+ MUL alpha4, a13, tmp -+ fmov tmp, a13 -+ ADD y2, a10, tmp -+ fmov tmp, y2 -+ MUL alpha4, a14, tmp -+ fmov tmp, a14 -+ ADD y3, a11, tmp -+ fmov tmp, y3 -+ MUL alpha4, a15, tmp -+ fmov tmp, a15 -+ -+ ADD y0, a12, tmp -+ fmov tmp, y0 -+ ldi Y1, 4 * SIZE(Y1) -+ ADD y1, a13, tmp -+ fmov tmp, y1 -+ unop -+ -+ ADD y2, a14, tmp -+ fmov tmp, y2 -+ unop -+ ADD y3, a15, tmp -+ fmov tmp, y3 -+ unop -+ -+ ST y0, -4 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y1, -3 * SIZE(Y1) -+ ldi A2, 4 * SIZE(A2) -+ ST y2, -2 * SIZE(Y1) -+ ldi A3, 4 * SIZE(A3) -+ ST y3, -1 * SIZE(Y1) -+ ldi A4, 4 * SIZE(A4) -+ .align 4 -+ -+$L16: -+ and M, 2, I -+ ble I, $L17 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ -+ LD a4, 0 * SIZE(A3) -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD a5, 1 * SIZE(A3) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ LD a6, 0 * SIZE(A4) -+ MUL alpha2, a2, tmp -+ fmov tmp, a2 -+ LD a7, 1 * SIZE(A4) -+ MUL alpha2, a3, tmp -+ fmov tmp, a3 -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ MUL alpha3, a4, tmp -+ fmov tmp, a4 -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ MUL alpha3, a5, tmp -+ fmov tmp, a5 -+ ADD y0, a2, tmp -+ fmov tmp, y0 -+ MUL alpha4, a6, tmp -+ fmov tmp, a6 -+ ADD y1, a3, tmp -+ fmov tmp, y1 -+ MUL alpha4, a7, tmp -+ fmov tmp, a7 -+ -+ ADD y0, a4, tmp -+ fmov tmp, y0 -+ ldi A1, 2 * SIZE(A1) -+ ADD y1, a5, tmp -+ fmov tmp, y1 -+ ldi A2, 2 * SIZE(A2) -+ ADD y0, a6, tmp -+ fmov tmp, y0 -+ ldi A3, 2 * SIZE(A3) -+ ADD y1, a7, tmp -+ fmov tmp, y1 -+ ldi A4, 2 * SIZE(A4) -+ -+ ST y0, 0 * SIZE(Y1) -+ unop -+ ST y1, 1 * SIZE(Y1) -+ ldi Y1, 2 * SIZE(Y1) -+ .align 4 -+ -+$L17: -+ blbc M, $L18 -+ -+ LD y0, 0 * SIZE(Y1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD a2, 0 * SIZE(A3) -+ LD a3, 0 * SIZE(A4) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ MUL alpha2, a1, tmp -+ fmov tmp, a1 -+ MUL alpha3, a2, tmp -+ fmov tmp, a2 -+ MUL alpha4, a3, tmp -+ fmov tmp, a3 -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ ADD y0, a1, tmp -+ fmov tmp, y0 -+ ADD y0, a2, tmp -+ fmov tmp, y0 -+ ADD y0, a3, tmp -+ fmov tmp, y0 -+ -+ ST y0, 0 * SIZE(Y1) -+ .align 4 -+ -+$L18: -+ ldi J, -1(J) -+ bgt J, $L11 -+ .align 4 -+ -+$L20: -+ and N, 2, J -+ ble J, $L30 -+ -+ LD alpha1, 0 * SIZE(X) -+ addl X, INCX, X -+ LD alpha2, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ mov A, A1 -+ MUL alpha, alpha1, tmp -+ fmov tmp, alpha1 -+ addl A, LDA, A2 -+ MUL alpha, alpha2, tmp -+ fmov tmp, alpha2 -+ -+ addl A2, LDA, A -+ mov Y, Y1 -+ -+ sra M, 3, I -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD y4, 4 * SIZE(Y1) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ LD y5, 5 * SIZE(Y1) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ LD y6, 6 * SIZE(Y1) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ LD y7, 7 * SIZE(Y1) -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ LD a0, 4 * SIZE(A1) -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ LD a1, 5 * SIZE(A1) -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ -+ ADD y2, a2, tmp -+ fmov tmp, y2 -+ LD a2, 6 * SIZE(A1) -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ -+ ADD y3, a3, tmp -+ fmov tmp, y3 -+ LD a3, 7 * SIZE(A1) -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ -+ ADD y0, a4, tmp -+ fmov tmp, y0 -+ LD a4, 4 * SIZE(A2) -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ -+ ADD y1, a5, tmp -+ fmov tmp, y1 -+ LD a5, 5 * SIZE(A2) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ -+ ADD y2, a6, tmp -+ fmov tmp, y2 -+ LD a6, 6 * SIZE(A2) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ -+ ADD y3, a7, tmp -+ fmov tmp, y3 -+ LD a7, 7 * SIZE(A2) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ -+ ldi I, -1(I) -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) -+ ldi I, -1(I) -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) -+ ldi A2, 8 * SIZE(A2) -+ -+ ADD y4, a0, tmp -+ fmov tmp, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD y5, a1, tmp -+ fmov tmp, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ LD a1, 9 * SIZE(A1) -+ -+ ADD y6, a2, tmp -+ fmov tmp, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ LD a2, 10 * SIZE(A1) -+ -+ ADD y7, a3, tmp -+ fmov tmp, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ LD a3, 11 * SIZE(A1) -+ -+ ADD y4, a4, tmp -+ fmov tmp, y4 -+ LD a4, 0 * SIZE(A2) -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD y0, 8 * SIZE(Y1) -+ -+ ADD y5, a5, tmp -+ fmov tmp, y5 -+ LD a5, 1 * SIZE(A2) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ LD y1, 9 * SIZE(Y1) -+ -+ ADD y6, a6, tmp -+ fmov tmp, y6 -+ LD a6, 2 * SIZE(A2) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ LD y2, 10 * SIZE(Y1) -+ -+ ADD y7, a7, tmp -+ fmov tmp, y7 -+ LD a7, 3 * SIZE(A2) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ LD y3, 11 * SIZE(Y1) -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ ST y4, 4 * SIZE(Y1) -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ LD a0, 12 * SIZE(A1) -+ -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ ST y5, 5 * SIZE(Y1) -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ LD a1, 13 * SIZE(A1) -+ -+ ADD y2, a2, tmp -+ fmov tmp, y2 -+ ST y6, 6 * SIZE(Y1) -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ LD a2, 14 * SIZE(A1) -+ -+ ADD y3, a3, tmp -+ fmov tmp, y3 -+ ST y7, 7 * SIZE(Y1) -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ LD a3, 15 * SIZE(A1) -+ -+ ADD y0, a4, tmp -+ fmov tmp, y0 -+ LD a4, 4 * SIZE(A2) -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD y4, 12 * SIZE(Y1) -+ -+ ADD y1, a5, tmp -+ fmov tmp, y1 -+ LD a5, 5 * SIZE(A2) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ LD y5, 13 * SIZE(Y1) -+ -+ ADD y2, a6, tmp -+ fmov tmp, y2 -+ LD a6, 6 * SIZE(A2) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ LD y6, 14 * SIZE(Y1) -+ -+ ADD y3, a7, tmp -+ fmov tmp, y3 -+ LD a7, 7 * SIZE(A2) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ LD y7, 15 * SIZE(Y1) -+ -+ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ldi Y1, 8 * SIZE(Y1) -+ bgt I, $L22 -+ .align 4 -+ -+$L23: -+ ADD y4, a0, tmp -+ fmov tmp, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ unop -+ -+ ADD y5, a1, tmp -+ fmov tmp, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ unop -+ -+ ADD y6, a2, tmp -+ fmov tmp, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ unop -+ -+ ADD y7, a3, tmp -+ fmov tmp, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ unop -+ -+ ADD y4, a4, tmp -+ fmov tmp, y4 -+ ADD y5, a5, tmp -+ fmov tmp, y5 -+ ADD y6, a6, tmp -+ fmov tmp, y6 -+ ADD y7, a7, tmp -+ fmov tmp, y7 -+ -+ ST y4, 4 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y5, 5 * SIZE(Y1) -+ ldi A2, 8 * SIZE(A2) -+ -+ ST y6, 6 * SIZE(Y1) -+ unop -+ ST y7, 7 * SIZE(Y1) -+ ldi Y1, 8 * SIZE(Y1) -+ .align 4 -+ -+$L25: -+ and M, 4, I -+ ble I, $L26 -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD a4, 0 * SIZE(A2) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ LD a5, 1 * SIZE(A2) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ LD a6, 2 * SIZE(A2) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ LD a7, 3 * SIZE(A2) -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ MUL alpha2, a4, tmp -+ fmov tmp, a4 -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ MUL alpha2, a5, tmp -+ fmov tmp, a5 -+ ADD y2, a2, tmp -+ fmov tmp, y2 -+ MUL alpha2, a6, tmp -+ fmov tmp, a6 -+ ADD y3, a3, tmp -+ fmov tmp, y3 -+ MUL alpha2, a7, tmp -+ fmov tmp, a7 -+ -+ ADD y0, a4, tmp -+ fmov tmp, y0 -+ ldi Y1, 4 * SIZE(Y1) -+ ADD y1, a5, tmp -+ fmov tmp, y1 -+ unop -+ ADD y2, a6, tmp -+ fmov tmp, y2 -+ unop -+ ADD y3, a7, tmp -+ fmov tmp, y3 -+ unop -+ -+ ST y0, -4 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y1, -3 * SIZE(Y1) -+ ldi A2, 4 * SIZE(A2) -+ ST y2, -2 * SIZE(Y1) -+ ldi A3, 4 * SIZE(A3) -+ ST y3, -1 * SIZE(Y1) -+ ldi A4, 4 * SIZE(A4) -+ .align 4 -+ -+$L26: -+ and M, 2, I -+ ble I, $L27 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ MUL alpha2, a2, tmp -+ fmov tmp, a2 -+ MUL alpha2, a3, tmp -+ fmov tmp, a3 -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ ldi A1, 2 * SIZE(A1) -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ ldi A2, 2 * SIZE(A2) -+ ADD y0, a2, tmp -+ fmov tmp, y0 -+ unop -+ ADD y1, a3, tmp -+ fmov tmp, y1 -+ unop -+ -+ ST y0, 0 * SIZE(Y1) -+ unop -+ ST y1, 1 * SIZE(Y1) -+ ldi Y1, 2 * SIZE(Y1) -+ .align 4 -+ -+$L27: -+ blbc M, $L30 -+ -+ LD y0, 0 * SIZE(Y1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ MUL alpha2, a1, tmp -+ fmov tmp, a1 -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ ADD y0, a1, tmp -+ fmov tmp, y0 -+ -+ ST y0, 0 * SIZE(Y1) -+ .align 4 -+ -+$L30: -+ blbc N, $L990 -+ -+ LD alpha1, 0 * SIZE(X) -+ mov A, A1 -+ MUL alpha, alpha1, tmp -+ fmov tmp, alpha1 -+ mov Y, Y1 -+ -+ sra M, 3, I -+ ble I, $L35 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ LD a4, 4 * SIZE(A1) -+ LD a5, 5 * SIZE(A1) -+ LD a6, 6 * SIZE(A1) -+ LD a7, 7 * SIZE(A1) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ LD y4, 4 * SIZE(Y1) -+ LD y5, 5 * SIZE(Y1) -+ LD y6, 6 * SIZE(Y1) -+ LD y7, 7 * SIZE(Y1) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ -+ ldi I, -1(I) -+ ble I, $L33 -+ .align 4 -+ -+$L32: -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ LD y4, 4 * SIZE(Y1) -+ MUL alpha1, a4, tmp -+ fmov tmp, a4 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ LD y5, 5 * SIZE(Y1) -+ MUL alpha1, a5, tmp -+ fmov tmp, a5 -+ LD a1, 9 * SIZE(A1) -+ -+ ADD y2, a2, tmp -+ fmov tmp, y2 -+ LD y6, 6 * SIZE(Y1) -+ MUL alpha1, a6, tmp -+ fmov tmp, a6 -+ LD a2, 10 * SIZE(A1) -+ -+ ADD y3, a3, tmp -+ fmov tmp, y3 -+ LD y7, 7 * SIZE(Y1) -+ MUL alpha1, a7, tmp -+ fmov tmp, a7 -+ LD a3, 11 * SIZE(A1) -+ -+ ST y0, 0 * SIZE(Y1) -+ ST y1, 1 * SIZE(Y1) -+ ST y2, 2 * SIZE(Y1) -+ ST y3, 3 * SIZE(Y1) -+ -+ ADD y4, a4, tmp -+ fmov tmp, y4 -+ LD y0, 8 * SIZE(Y1) -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD a4, 12 * SIZE(A1) -+ -+ ADD y5, a5, tmp -+ fmov tmp, y5 -+ LD y1, 9 * SIZE(Y1) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ LD a5, 13 * SIZE(A1) -+ -+ ADD y6, a6, tmp -+ fmov tmp, y6 -+ LD y2, 10 * SIZE(Y1) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ LD a6, 14 * SIZE(A1) -+ -+ ADD y7, a7, tmp -+ fmov tmp, y7 -+ LD y3, 11 * SIZE(Y1) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ LD a7, 15 * SIZE(A1) -+ -+ ST y4, 4 * SIZE(Y1) -+ ldi I, -1(I) -+ ST y5, 5 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ -+ ST y6, 6 * SIZE(Y1) -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) -+ ST y7, 7 * SIZE(Y1) -+ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) -+ -+ ldi Y1, 8 * SIZE(Y1) -+ bgt I, $L32 -+ .align 4 -+ -+$L33: -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ LD y4, 4 * SIZE(Y1) -+ MUL alpha1, a4, tmp -+ fmov tmp, a4 -+ unop -+ -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ LD y5, 5 * SIZE(Y1) -+ MUL alpha1, a5, tmp -+ fmov tmp, a5 -+ unop -+ -+ ADD y2, a2, tmp -+ fmov tmp, y2 -+ LD y6, 6 * SIZE(Y1) -+ MUL alpha1, a6, tmp -+ fmov tmp, a6 -+ unop -+ -+ ADD y3, a3, tmp -+ fmov tmp, y3 -+ LD y7, 7 * SIZE(Y1) -+ MUL alpha1, a7, tmp -+ fmov tmp, a7 -+ unop -+ -+ ADD y4, a4, tmp -+ fmov tmp, y4 -+ ST y0, 0 * SIZE(Y1) -+ ADD y5, a5, tmp -+ fmov tmp, y5 -+ ST y1, 1 * SIZE(Y1) -+ ADD y6, a6, tmp -+ fmov tmp, y6 -+ ST y2, 2 * SIZE(Y1) -+ ADD y7, a7, tmp -+ fmov tmp, y7 -+ ST y3, 3 * SIZE(Y1) -+ -+ ST y4, 4 * SIZE(Y1) -+ unop -+ ST y5, 5 * SIZE(Y1) -+ unop -+ -+ ST y6, 6 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y7, 7 * SIZE(Y1) -+ ldi Y1, 8 * SIZE(Y1) -+ .align 4 -+ -+$L35: -+ and M, 4, I -+ ble I, $L36 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ LD y1, 1 * SIZE(Y1) -+ MUL alpha1, a2, tmp -+ fmov tmp, a2 -+ LD y2, 2 * SIZE(Y1) -+ MUL alpha1, a3, tmp -+ fmov tmp, a3 -+ LD y3, 3 * SIZE(Y1) -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ ADD y2, a2, tmp -+ fmov tmp, y2 -+ ADD y3, a3, tmp -+ fmov tmp, y3 -+ -+ ST y0, 0 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y1, 1 * SIZE(Y1) -+ ldi A2, 4 * SIZE(A2) -+ ST y2, 2 * SIZE(Y1) -+ unop -+ ST y3, 3 * SIZE(Y1) -+ ldi Y1, 4 * SIZE(Y1) -+ .align 4 -+ -+$L36: -+ and M, 2, I -+ ble I, $L37 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ LD y1, 1 * SIZE(Y1) -+ MUL alpha1, a1, tmp -+ fmov tmp, a1 -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ ADD y1, a1, tmp -+ fmov tmp, y1 -+ -+ ST y0, 0 * SIZE(Y1) -+ ldi A1, 2 * SIZE(A1) -+ ST y1, 1 * SIZE(Y1) -+ ldi Y1, 2 * SIZE(Y1) -+ .align 4 -+ -+$L37: -+ blbc M, $L990 -+ -+ LD y0, 0 * SIZE(Y1) -+ LD a0, 0 * SIZE(A1) -+ -+ MUL alpha1, a0, tmp -+ fmov tmp, a0 -+ -+ ADD y0, a0, tmp -+ fmov tmp, y0 -+ ST y0, 0 * SIZE(Y1) -+ .align 4 -+ -+$L990: -+ cmpeq INCY, SIZE, $0 -+ bne $0, $L999 -+ -+ mov BUFFER, Y1 -+ -+ sra M, 3, I -+ ble I, $L995 -+ .align 4 -+ -+$L992: -+ LD a0, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a1, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a2, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a3, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ -+ LD y0, 0 * SIZE(Y) -+ LD y1, 1 * SIZE(Y) -+ LD y2, 2 * SIZE(Y) -+ LD y3, 3 * SIZE(Y) -+ -+ LD a4, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a5, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a6, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a7, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ -+ LD y4, 4 * SIZE(Y) -+ LD y5, 5 * SIZE(Y) -+ LD y6, 6 * SIZE(Y) -+ LD y7, 7 * SIZE(Y) -+ -+ ADD a0, y0, tmp -+ fmov tmp, a0 -+ ADD a1, y1, tmp -+ fmov tmp, a1 -+ ADD a2, y2, tmp -+ fmov tmp, a2 -+ ADD a3, y3, tmp -+ fmov tmp, a3 -+ ADD a4, y4, tmp -+ fmov tmp, a4 -+ ADD a5, y5, tmp -+ fmov tmp, a5 -+ ADD a6, y6, tmp -+ fmov tmp, a6 -+ ADD a7, y7, tmp -+ fmov tmp, a7 -+ -+ ST a0, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a1, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a2, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a3, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ -+ ST a4, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a5, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a6, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a7, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ -+ ldi I, -1(I) -+ ldi Y, 8 * SIZE(Y) -+ bgt I, $L992 -+ .align 4 -+ -+$L995: -+ and M, 7, I -+ ble I, $L999 -+ .align 4 -+ -+$L996: -+ LD a0, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ -+ LD y0, 0 * SIZE(Y) -+ ldi Y, 1 * SIZE(Y) -+ -+ ADD a0, y0, tmp -+ fmov tmp, a0 -+ -+ ST a0, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ -+ ldi I, -1(I) -+ bgt I, $L996 -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ fldd $f20, 64($sp) -+ -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/gemv_n.S.bak b/kernel/sw_64/gemv_n.S.bak -new file mode 100644 -index 0000000..f90abdf ---- /dev/null -+++ b/kernel/sw_64/gemv_n.S.bak -@@ -0,0 +1,1307 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define STACKSIZE 64 -+#define PREFETCHSIZE 32 -+ -+#define M $16 -+#define N $17 -+#define A $20 -+#define LDA $21 -+ -+#define X $18 -+#define INCX $19 -+#define Y $22 -+#define INCY $23 -+ -+#define BUFFER $24 -+ -+#define I $25 -+#define J $27 -+ -+#define Y1 $4 -+ -+#define A1 $5 -+#define A2 $6 -+#define A3 $7 -+#define A4 $8 -+ -+#define alpha $f19 -+ -+#define alpha1 $f0 -+#define alpha2 $f1 -+#define alpha3 $f10 -+#define alpha4 $f11 -+ -+#define y0 $f12 -+#define y1 $f13 -+#define y2 $f14 -+#define y3 $f15 -+ -+#define y4 $f16 -+#define y5 $f17 -+#define y6 $f18 -+#define y7 $f21 -+ -+#define a0 $f22 -+#define a1 $f23 -+#define a2 $f24 -+#define a3 $f25 -+#define a4 $f26 -+#define a5 $f27 -+#define a6 $f28 -+#define a7 $f29 -+ -+#define a8 $f2 -+#define a9 $f3 -+#define a10 $f4 -+#define a11 $f5 -+#define a12 $f6 -+#define a13 $f7 -+#define a14 $f8 -+#define a15 $f9 -+ -+ PROLOGUE -+ -+ ldi $sp, -STACKSIZE($sp) -+ ldl X, 0 + STACKSIZE($sp) -+ ldl INCX, 8 + STACKSIZE($sp) -+ ldl Y, 16 + STACKSIZE($sp) -+ ldl INCY, 24 + STACKSIZE($sp) -+ ldl BUFFER, 32 + STACKSIZE($sp) -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ -+ PROFCODE -+ -+ cmple M, 0, $0 -+ SXADDQ INCX, 0, INCX -+ cmple N, 0, $1 -+ SXADDQ INCY, 0, INCY -+ -+ or $0, $1, $0 -+ bne $0, $L999 -+ -+ SXADDQ LDA, 0, LDA -+ -+ cmpeq INCY, SIZE, $0 -+ bne $0, $L10 -+ -+ mov BUFFER, Y1 -+ -+ mov Y, BUFFER -+ mov Y1, Y -+ -+ sra M, 3, I -+ ble I, $L05 -+ .align 4 -+ -+$L02: -+ ST $f31, 0 * SIZE(Y1) -+ ST $f31, 1 * SIZE(Y1) -+ ST $f31, 2 * SIZE(Y1) -+ ST $f31, 3 * SIZE(Y1) -+ ST $f31, 4 * SIZE(Y1) -+ ST $f31, 5 * SIZE(Y1) -+ ST $f31, 6 * SIZE(Y1) -+ ST $f31, 7 * SIZE(Y1) -+ -+ ldi Y1, 8 * SIZE(Y1) -+ ldi I, -1(I) -+ bgt I, $L02 -+ .align 4 -+ -+$L05: -+ and M, 7, I -+ ble I, $L10 -+ .align 4 -+ -+$L06: -+ ST $f31, 0 * SIZE(Y1) -+ addl Y1, SIZE, Y1 -+ -+ ldi I, -1(I) -+ bgt I, $L06 -+ .align 4 -+ -+$L10: -+ sra N, 2, J -+ ble J, $L20 -+ .align 4 -+ -+$L11: -+ LD alpha1, 0 * SIZE(X) -+ addl X, INCX, X -+ LD alpha2, 0 * SIZE(X) -+ addl X, INCX, X -+ LD alpha3, 0 * SIZE(X) -+ addl X, INCX, X -+ LD alpha4, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ MUL alpha, alpha1, alpha1 -+ MUL alpha, alpha2, alpha2 -+ MUL alpha, alpha3, alpha3 -+ MUL alpha, alpha4, alpha4 -+ -+ mov A, A1 -+ addl A, LDA, A2 -+ addl A2, LDA, A3 -+ addl A3, LDA, A4 -+ s4addl LDA, A, A -+ -+ mov Y, Y1 -+ fillcs 4 * SIZE(X) -+ -+ sra M, 3, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ -+ LD a8, 0 * SIZE(A3) -+ LD a9, 1 * SIZE(A3) -+ LD a10, 2 * SIZE(A3) -+ LD a11, 3 * SIZE(A3) -+ -+ LD y4, 4 * SIZE(Y1) -+ LD y5, 5 * SIZE(Y1) -+ LD y6, 6 * SIZE(Y1) -+ LD y7, 7 * SIZE(Y1) -+ -+ MUL alpha1, a0, a0 -+ LD a12, 0 * SIZE(A4) -+ MUL alpha1, a1, a1 -+ LD a13, 1 * SIZE(A4) -+ MUL alpha1, a2, a2 -+ LD a14, 2 * SIZE(A4) -+ MUL alpha1, a3, a3 -+ LD a15, 3 * SIZE(A4) -+ -+ ADD y0, a0, y0 -+ LD a0, 4 * SIZE(A1) -+ MUL alpha2, a4, a4 -+ unop -+ -+ ADD y1, a1, y1 -+ LD a1, 5 * SIZE(A1) -+ MUL alpha2, a5, a5 -+ unop -+ -+ ADD y2, a2, y2 -+ LD a2, 6 * SIZE(A1) -+ MUL alpha2, a6, a6 -+ unop -+ -+ ADD y3, a3, y3 -+ LD a3, 7 * SIZE(A1) -+ MUL alpha2, a7, a7 -+ unop -+ -+ ADD y0, a4, y0 -+ LD a4, 4 * SIZE(A2) -+ MUL alpha3, a8, a8 -+ unop -+ -+ ADD y1, a5, y1 -+ LD a5, 5 * SIZE(A2) -+ MUL alpha3, a9, a9 -+ ldi I, -1(I) -+ -+ ADD y2, a6, y2 -+ LD a6, 6 * SIZE(A2) -+ MUL alpha3, a10, a10 -+ unop -+ -+ ADD y3, a7, y3 -+ LD a7, 7 * SIZE(A2) -+ MUL alpha3, a11, a11 -+ unop -+ -+ ADD y0, a8, y0 -+ LD a8, 4 * SIZE(A3) -+ MUL alpha4, a12, a12 -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ADD y1, a9, y1 -+ LD a9, 5 * SIZE(A3) -+ MUL alpha4, a13, a13 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) -+ -+ ADD y2, a10, y2 -+ LD a10, 6 * SIZE(A3) -+ MUL alpha4, a14, a14 -+ unop -+ -+ ADD y3, a11, y3 -+ LD a11, 7 * SIZE(A3) -+ MUL alpha4, a15, a15 -+ ldi I, -1(I) -+ -+ ADD y0, a12, y0 -+ LD a12, 4 * SIZE(A4) -+ MUL alpha1, a0, a0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) -+ -+ ADD y1, a13, y1 -+ LD a13, 5 * SIZE(A4) -+ MUL alpha1, a1, a1 -+ unop -+ -+ ADD y2, a14, y2 -+ LD a14, 6 * SIZE(A4) -+ MUL alpha1, a2, a2 -+ unop -+ -+ ADD y3, a15, y3 -+ LD a15, 7 * SIZE(A4) -+ MUL alpha1, a3, a3 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A2) -+ -+ ADD y4, a0, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a4, a4 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD y5, a1, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a5, a5 -+ LD a1, 9 * SIZE(A1) -+ -+ ADD y6, a2, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a6, a6 -+ LD a2, 10 * SIZE(A1) -+ -+ ADD y7, a3, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a7, a7 -+ LD a3, 11 * SIZE(A1) -+ -+ ADD y4, a4, y4 -+ LD a4, 8 * SIZE(A2) -+ MUL alpha3, a8, a8 -+ LD y0, 8 * SIZE(Y1) -+ -+ ADD y5, a5, y5 -+ LD a5, 9 * SIZE(A2) -+ MUL alpha3, a9, a9 -+ LD y1, 9 * SIZE(Y1) -+ -+ ADD y6, a6, y6 -+ LD a6, 10 * SIZE(A2) -+ MUL alpha3, a10, a10 -+ LD y2, 10 * SIZE(Y1) -+ -+ ADD y7, a7, y7 -+ LD a7, 11 * SIZE(A2) -+ MUL alpha3, a11, a11 -+ LD y3, 11 * SIZE(Y1) -+ -+ ADD y4, a8, y4 -+ LD a8, 8 * SIZE(A3) -+ MUL alpha4, a12, a12 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A3) -+ -+ ADD y5, a9, y5 -+ LD a9, 9 * SIZE(A3) -+ MUL alpha4, a13, a13 -+ ldi A1, 8 * SIZE(A1) -+ -+ ADD y6, a10, y6 -+ LD a10, 10 * SIZE(A3) -+ MUL alpha4, a14, a14 -+ ldi A2, 8 * SIZE(A2) -+ -+ ADD y7, a11, y7 -+ LD a11, 11 * SIZE(A3) -+ MUL alpha4, a15, a15 -+ ldi Y1, 8 * SIZE(Y1) -+ -+ ADD y4, a12, y4 -+ LD a12, 8 * SIZE(A4) -+ MUL alpha1, a0, a0 -+ unop -+ -+ ADD y5, a13, y5 -+ LD a13, 9 * SIZE(A4) -+ MUL alpha1, a1, a1 -+ ldi A3, 8 * SIZE(A3) -+ -+ ADD y6, a14, y6 -+ LD a14, 10 * SIZE(A4) -+ MUL alpha1, a2, a2 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A4) -+ -+ ADD y7, a15, y7 -+ LD a15, 11 * SIZE(A4) -+ MUL alpha1, a3, a3 -+ ldi A4, 8 * SIZE(A4) -+ -+ ADD y0, a0, y0 -+ LD a0, 4 * SIZE(A1) -+ MUL alpha2, a4, a4 -+ ST y4, -4 * SIZE(Y1) -+ -+ ADD y1, a1, y1 -+ LD a1, 5 * SIZE(A1) -+ MUL alpha2, a5, a5 -+ ST y5, -3 * SIZE(Y1) -+ -+ ADD y2, a2, y2 -+ LD a2, 6 * SIZE(A1) -+ MUL alpha2, a6, a6 -+ ST y6, -2 * SIZE(Y1) -+ -+ ADD y3, a3, y3 -+ LD a3, 7 * SIZE(A1) -+ MUL alpha2, a7, a7 -+ ST y7, -1 * SIZE(Y1) -+ -+ ADD y0, a4, y0 -+ LD a4, 4 * SIZE(A2) -+ MUL alpha3, a8, a8 -+ LD y4, 4 * SIZE(Y1) -+ -+ ADD y1, a5, y1 -+ LD a5, 5 * SIZE(A2) -+ MUL alpha3, a9, a9 -+ LD y5, 5 * SIZE(Y1) -+ -+ ADD y2, a6, y2 -+ LD a6, 6 * SIZE(A2) -+ MUL alpha3, a10, a10 -+ LD y6, 6 * SIZE(Y1) -+ -+ ADD y3, a7, y3 -+ LD a7, 7 * SIZE(A2) -+ MUL alpha3, a11, a11 -+ LD y7, 7 * SIZE(Y1) -+ -+ ADD y0, a8, y0 -+ LD a8, 4 * SIZE(A3) -+ MUL alpha4, a12, a12 -+ bgt I, $L12 -+ .align 4 -+ -+$L13: -+ ADD y1, a9, y1 -+ LD a9, 5 * SIZE(A3) -+ MUL alpha4, a13, a13 -+ unop -+ -+ ADD y2, a10, y2 -+ LD a10, 6 * SIZE(A3) -+ MUL alpha4, a14, a14 -+ unop -+ -+ ADD y3, a11, y3 -+ LD a11, 7 * SIZE(A3) -+ MUL alpha4, a15, a15 -+ unop -+ -+ ADD y0, a12, y0 -+ LD a12, 4 * SIZE(A4) -+ MUL alpha1, a0, a0 -+ unop -+ -+ ADD y1, a13, y1 -+ LD a13, 5 * SIZE(A4) -+ MUL alpha1, a1, a1 -+ unop -+ -+ ADD y2, a14, y2 -+ LD a14, 6 * SIZE(A4) -+ MUL alpha1, a2, a2 -+ unop -+ -+ ADD y3, a15, y3 -+ LD a15, 7 * SIZE(A4) -+ MUL alpha1, a3, a3 -+ unop -+ -+ ST y0, 0 * SIZE(Y1) -+ ADD y4, a0, y4 -+ unop -+ MUL alpha2, a4, a4 -+ -+ ST y1, 1 * SIZE(Y1) -+ ADD y5, a1, y5 -+ unop -+ MUL alpha2, a5, a5 -+ -+ ST y2, 2 * SIZE(Y1) -+ ADD y6, a2, y6 -+ unop -+ MUL alpha2, a6, a6 -+ -+ ST y3, 3 * SIZE(Y1) -+ ADD y7, a3, y7 -+ ldi Y1, 8 * SIZE(Y1) -+ MUL alpha2, a7, a7 -+ -+ ADD y4, a4, y4 -+ MUL alpha3, a8, a8 -+ ADD y5, a5, y5 -+ MUL alpha3, a9, a9 -+ ADD y6, a6, y6 -+ MUL alpha3, a10, a10 -+ ADD y7, a7, y7 -+ MUL alpha3, a11, a11 -+ -+ ADD y4, a8, y4 -+ MUL alpha4, a12, a12 -+ ADD y5, a9, y5 -+ MUL alpha4, a13, a13 -+ ADD y6, a10, y6 -+ MUL alpha4, a14, a14 -+ ADD y7, a11, y7 -+ MUL alpha4, a15, a15 -+ -+ ADD y4, a12, y4 -+ ADD y5, a13, y5 -+ ADD y6, a14, y6 -+ ADD y7, a15, y7 -+ -+ ST y4, -4 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y5, -3 * SIZE(Y1) -+ ldi A2, 8 * SIZE(A2) -+ ST y6, -2 * SIZE(Y1) -+ ldi A3, 8 * SIZE(A3) -+ ST y7, -1 * SIZE(Y1) -+ ldi A4, 8 * SIZE(A4) -+ .align 4 -+ -+$L15: -+ and M, 4, I -+ ble I, $L16 -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) -+ -+ LD a8, 0 * SIZE(A3) -+ LD a9, 1 * SIZE(A3) -+ LD a10, 2 * SIZE(A3) -+ LD a11, 3 * SIZE(A3) -+ -+ MUL alpha1, a0, a0 -+ LD a12, 0 * SIZE(A4) -+ MUL alpha1, a1, a1 -+ LD a13, 1 * SIZE(A4) -+ MUL alpha1, a2, a2 -+ LD a14, 2 * SIZE(A4) -+ MUL alpha1, a3, a3 -+ LD a15, 3 * SIZE(A4) -+ -+ ADD y0, a0, y0 -+ MUL alpha2, a4, a4 -+ ADD y1, a1, y1 -+ MUL alpha2, a5, a5 -+ ADD y2, a2, y2 -+ MUL alpha2, a6, a6 -+ ADD y3, a3, y3 -+ MUL alpha2, a7, a7 -+ -+ ADD y0, a4, y0 -+ MUL alpha3, a8, a8 -+ ADD y1, a5, y1 -+ MUL alpha3, a9, a9 -+ ADD y2, a6, y2 -+ MUL alpha3, a10, a10 -+ ADD y3, a7, y3 -+ MUL alpha3, a11, a11 -+ -+ ADD y0, a8, y0 -+ MUL alpha4, a12, a12 -+ ADD y1, a9, y1 -+ MUL alpha4, a13, a13 -+ ADD y2, a10, y2 -+ MUL alpha4, a14, a14 -+ ADD y3, a11, y3 -+ MUL alpha4, a15, a15 -+ -+ ADD y0, a12, y0 -+ ldi Y1, 4 * SIZE(Y1) -+ ADD y1, a13, y1 -+ unop -+ -+ ADD y2, a14, y2 -+ unop -+ ADD y3, a15, y3 -+ unop -+ -+ ST y0, -4 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y1, -3 * SIZE(Y1) -+ ldi A2, 4 * SIZE(A2) -+ ST y2, -2 * SIZE(Y1) -+ ldi A3, 4 * SIZE(A3) -+ ST y3, -1 * SIZE(Y1) -+ ldi A4, 4 * SIZE(A4) -+ .align 4 -+ -+$L16: -+ and M, 2, I -+ ble I, $L17 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ -+ LD a4, 0 * SIZE(A3) -+ MUL alpha1, a0, a0 -+ LD a5, 1 * SIZE(A3) -+ MUL alpha1, a1, a1 -+ LD a6, 0 * SIZE(A4) -+ MUL alpha2, a2, a2 -+ LD a7, 1 * SIZE(A4) -+ MUL alpha2, a3, a3 -+ -+ ADD y0, a0, y0 -+ MUL alpha3, a4, a4 -+ ADD y1, a1, y1 -+ MUL alpha3, a5, a5 -+ ADD y0, a2, y0 -+ MUL alpha4, a6, a6 -+ ADD y1, a3, y1 -+ MUL alpha4, a7, a7 -+ -+ ADD y0, a4, y0 -+ ldi A1, 2 * SIZE(A1) -+ ADD y1, a5, y1 -+ ldi A2, 2 * SIZE(A2) -+ ADD y0, a6, y0 -+ ldi A3, 2 * SIZE(A3) -+ ADD y1, a7, y1 -+ ldi A4, 2 * SIZE(A4) -+ -+ ST y0, 0 * SIZE(Y1) -+ unop -+ ST y1, 1 * SIZE(Y1) -+ ldi Y1, 2 * SIZE(Y1) -+ .align 4 -+ -+$L17: -+ blbc M, $L18 -+ -+ LD y0, 0 * SIZE(Y1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD a2, 0 * SIZE(A3) -+ LD a3, 0 * SIZE(A4) -+ -+ MUL alpha1, a0, a0 -+ MUL alpha2, a1, a1 -+ MUL alpha3, a2, a2 -+ MUL alpha4, a3, a3 -+ -+ ADD y0, a0, y0 -+ ADD y0, a1, y0 -+ ADD y0, a2, y0 -+ ADD y0, a3, y0 -+ -+ ST y0, 0 * SIZE(Y1) -+ .align 4 -+ -+$L18: -+ ldi J, -1(J) -+ bgt J, $L11 -+ .align 4 -+ -+$L20: -+ and N, 2, J -+ ble J, $L30 -+ -+ LD alpha1, 0 * SIZE(X) -+ addl X, INCX, X -+ LD alpha2, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ mov A, A1 -+ MUL alpha, alpha1, alpha1 -+ addl A, LDA, A2 -+ MUL alpha, alpha2, alpha2 -+ -+ addl A2, LDA, A -+ mov Y, Y1 -+ -+ sra M, 3, I -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ -+ MUL alpha1, a0, a0 -+ LD y4, 4 * SIZE(Y1) -+ MUL alpha1, a1, a1 -+ LD y5, 5 * SIZE(Y1) -+ MUL alpha1, a2, a2 -+ LD y6, 6 * SIZE(Y1) -+ MUL alpha1, a3, a3 -+ LD y7, 7 * SIZE(Y1) -+ -+ ADD y0, a0, y0 -+ LD a0, 4 * SIZE(A1) -+ MUL alpha2, a4, a4 -+ -+ ADD y1, a1, y1 -+ LD a1, 5 * SIZE(A1) -+ MUL alpha2, a5, a5 -+ -+ ADD y2, a2, y2 -+ LD a2, 6 * SIZE(A1) -+ MUL alpha2, a6, a6 -+ -+ ADD y3, a3, y3 -+ LD a3, 7 * SIZE(A1) -+ MUL alpha2, a7, a7 -+ -+ ADD y0, a4, y0 -+ LD a4, 4 * SIZE(A2) -+ MUL alpha1, a0, a0 -+ -+ ADD y1, a5, y1 -+ LD a5, 5 * SIZE(A2) -+ MUL alpha1, a1, a1 -+ -+ ADD y2, a6, y2 -+ LD a6, 6 * SIZE(A2) -+ MUL alpha1, a2, a2 -+ -+ ADD y3, a7, y3 -+ LD a7, 7 * SIZE(A2) -+ MUL alpha1, a3, a3 -+ -+ ldi I, -1(I) -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) -+ ldi I, -1(I) -+ fillcs (PREFETCHSIZE + 0) * SIZE(A2) -+ ldi A2, 8 * SIZE(A2) -+ -+ ADD y4, a0, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a4, a4 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD y5, a1, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a5, a5 -+ LD a1, 9 * SIZE(A1) -+ -+ ADD y6, a2, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a6, a6 -+ LD a2, 10 * SIZE(A1) -+ -+ ADD y7, a3, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a7, a7 -+ LD a3, 11 * SIZE(A1) -+ -+ ADD y4, a4, y4 -+ LD a4, 0 * SIZE(A2) -+ MUL alpha1, a0, a0 -+ LD y0, 8 * SIZE(Y1) -+ -+ ADD y5, a5, y5 -+ LD a5, 1 * SIZE(A2) -+ MUL alpha1, a1, a1 -+ LD y1, 9 * SIZE(Y1) -+ -+ ADD y6, a6, y6 -+ LD a6, 2 * SIZE(A2) -+ MUL alpha1, a2, a2 -+ LD y2, 10 * SIZE(Y1) -+ -+ ADD y7, a7, y7 -+ LD a7, 3 * SIZE(A2) -+ MUL alpha1, a3, a3 -+ LD y3, 11 * SIZE(Y1) -+ -+ ADD y0, a0, y0 -+ ST y4, 4 * SIZE(Y1) -+ MUL alpha2, a4, a4 -+ LD a0, 12 * SIZE(A1) -+ -+ ADD y1, a1, y1 -+ ST y5, 5 * SIZE(Y1) -+ MUL alpha2, a5, a5 -+ LD a1, 13 * SIZE(A1) -+ -+ ADD y2, a2, y2 -+ ST y6, 6 * SIZE(Y1) -+ MUL alpha2, a6, a6 -+ LD a2, 14 * SIZE(A1) -+ -+ ADD y3, a3, y3 -+ ST y7, 7 * SIZE(Y1) -+ MUL alpha2, a7, a7 -+ LD a3, 15 * SIZE(A1) -+ -+ ADD y0, a4, y0 -+ LD a4, 4 * SIZE(A2) -+ MUL alpha1, a0, a0 -+ LD y4, 12 * SIZE(Y1) -+ -+ ADD y1, a5, y1 -+ LD a5, 5 * SIZE(A2) -+ MUL alpha1, a1, a1 -+ LD y5, 13 * SIZE(Y1) -+ -+ ADD y2, a6, y2 -+ LD a6, 6 * SIZE(A2) -+ MUL alpha1, a2, a2 -+ LD y6, 14 * SIZE(Y1) -+ -+ ADD y3, a7, y3 -+ LD a7, 7 * SIZE(A2) -+ MUL alpha1, a3, a3 -+ LD y7, 15 * SIZE(Y1) -+ -+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ldi Y1, 8 * SIZE(Y1) -+ bgt I, $L22 -+ .align 4 -+ -+$L23: -+ ADD y4, a0, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a4, a4 -+ unop -+ -+ ADD y5, a1, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a5, a5 -+ unop -+ -+ ADD y6, a2, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a6, a6 -+ unop -+ -+ ADD y7, a3, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a7, a7 -+ unop -+ -+ ADD y4, a4, y4 -+ ADD y5, a5, y5 -+ ADD y6, a6, y6 -+ ADD y7, a7, y7 -+ -+ ST y4, 4 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y5, 5 * SIZE(Y1) -+ ldi A2, 8 * SIZE(A2) -+ -+ ST y6, 6 * SIZE(Y1) -+ unop -+ ST y7, 7 * SIZE(Y1) -+ ldi Y1, 8 * SIZE(Y1) -+ .align 4 -+ -+$L25: -+ and M, 4, I -+ ble I, $L26 -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ MUL alpha1, a0, a0 -+ LD a4, 0 * SIZE(A2) -+ MUL alpha1, a1, a1 -+ LD a5, 1 * SIZE(A2) -+ MUL alpha1, a2, a2 -+ LD a6, 2 * SIZE(A2) -+ MUL alpha1, a3, a3 -+ LD a7, 3 * SIZE(A2) -+ -+ ADD y0, a0, y0 -+ MUL alpha2, a4, a4 -+ ADD y1, a1, y1 -+ MUL alpha2, a5, a5 -+ ADD y2, a2, y2 -+ MUL alpha2, a6, a6 -+ ADD y3, a3, y3 -+ MUL alpha2, a7, a7 -+ -+ ADD y0, a4, y0 -+ ldi Y1, 4 * SIZE(Y1) -+ ADD y1, a5, y1 -+ unop -+ ADD y2, a6, y2 -+ unop -+ ADD y3, a7, y3 -+ unop -+ -+ ST y0, -4 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y1, -3 * SIZE(Y1) -+ ldi A2, 4 * SIZE(A2) -+ ST y2, -2 * SIZE(Y1) -+ ldi A3, 4 * SIZE(A3) -+ ST y3, -1 * SIZE(Y1) -+ ldi A4, 4 * SIZE(A4) -+ .align 4 -+ -+$L26: -+ and M, 2, I -+ ble I, $L27 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ -+ MUL alpha1, a0, a0 -+ MUL alpha1, a1, a1 -+ MUL alpha2, a2, a2 -+ MUL alpha2, a3, a3 -+ -+ ADD y0, a0, y0 -+ ldi A1, 2 * SIZE(A1) -+ ADD y1, a1, y1 -+ ldi A2, 2 * SIZE(A2) -+ ADD y0, a2, y0 -+ unop -+ ADD y1, a3, y1 -+ unop -+ -+ ST y0, 0 * SIZE(Y1) -+ unop -+ ST y1, 1 * SIZE(Y1) -+ ldi Y1, 2 * SIZE(Y1) -+ .align 4 -+ -+$L27: -+ blbc M, $L30 -+ -+ LD y0, 0 * SIZE(Y1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ -+ MUL alpha1, a0, a0 -+ MUL alpha2, a1, a1 -+ -+ ADD y0, a0, y0 -+ ADD y0, a1, y0 -+ -+ ST y0, 0 * SIZE(Y1) -+ .align 4 -+ -+$L30: -+ blbc N, $L990 -+ -+ LD alpha1, 0 * SIZE(X) -+ mov A, A1 -+ MUL alpha, alpha1, alpha1 -+ mov Y, Y1 -+ -+ sra M, 3, I -+ ble I, $L35 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ LD a4, 4 * SIZE(A1) -+ LD a5, 5 * SIZE(A1) -+ LD a6, 6 * SIZE(A1) -+ LD a7, 7 * SIZE(A1) -+ -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) -+ LD y4, 4 * SIZE(Y1) -+ LD y5, 5 * SIZE(Y1) -+ LD y6, 6 * SIZE(Y1) -+ LD y7, 7 * SIZE(Y1) -+ -+ MUL alpha1, a0, a0 -+ MUL alpha1, a1, a1 -+ MUL alpha1, a2, a2 -+ MUL alpha1, a3, a3 -+ -+ ldi I, -1(I) -+ ble I, $L33 -+ .align 4 -+ -+$L32: -+ ADD y0, a0, y0 -+ LD y4, 4 * SIZE(Y1) -+ MUL alpha1, a4, a4 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD y1, a1, y1 -+ LD y5, 5 * SIZE(Y1) -+ MUL alpha1, a5, a5 -+ LD a1, 9 * SIZE(A1) -+ -+ ADD y2, a2, y2 -+ LD y6, 6 * SIZE(Y1) -+ MUL alpha1, a6, a6 -+ LD a2, 10 * SIZE(A1) -+ -+ ADD y3, a3, y3 -+ LD y7, 7 * SIZE(Y1) -+ MUL alpha1, a7, a7 -+ LD a3, 11 * SIZE(A1) -+ -+ ST y0, 0 * SIZE(Y1) -+ ST y1, 1 * SIZE(Y1) -+ ST y2, 2 * SIZE(Y1) -+ ST y3, 3 * SIZE(Y1) -+ -+ ADD y4, a4, y4 -+ LD y0, 8 * SIZE(Y1) -+ MUL alpha1, a0, a0 -+ LD a4, 12 * SIZE(A1) -+ -+ ADD y5, a5, y5 -+ LD y1, 9 * SIZE(Y1) -+ MUL alpha1, a1, a1 -+ LD a5, 13 * SIZE(A1) -+ -+ ADD y6, a6, y6 -+ LD y2, 10 * SIZE(Y1) -+ MUL alpha1, a2, a2 -+ LD a6, 14 * SIZE(A1) -+ -+ ADD y7, a7, y7 -+ LD y3, 11 * SIZE(Y1) -+ MUL alpha1, a3, a3 -+ LD a7, 15 * SIZE(A1) -+ -+ ST y4, 4 * SIZE(Y1) -+ ldi I, -1(I) -+ ST y5, 5 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ -+ ST y6, 6 * SIZE(Y1) -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) -+ ST y7, 7 * SIZE(Y1) -+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) -+ -+ ldi Y1, 8 * SIZE(Y1) -+ bgt I, $L32 -+ .align 4 -+ -+$L33: -+ ADD y0, a0, y0 -+ LD y4, 4 * SIZE(Y1) -+ MUL alpha1, a4, a4 -+ unop -+ -+ ADD y1, a1, y1 -+ LD y5, 5 * SIZE(Y1) -+ MUL alpha1, a5, a5 -+ unop -+ -+ ADD y2, a2, y2 -+ LD y6, 6 * SIZE(Y1) -+ MUL alpha1, a6, a6 -+ unop -+ -+ ADD y3, a3, y3 -+ LD y7, 7 * SIZE(Y1) -+ MUL alpha1, a7, a7 -+ unop -+ -+ ADD y4, a4, y4 -+ ST y0, 0 * SIZE(Y1) -+ ADD y5, a5, y5 -+ ST y1, 1 * SIZE(Y1) -+ ADD y6, a6, y6 -+ ST y2, 2 * SIZE(Y1) -+ ADD y7, a7, y7 -+ ST y3, 3 * SIZE(Y1) -+ -+ ST y4, 4 * SIZE(Y1) -+ unop -+ ST y5, 5 * SIZE(Y1) -+ unop -+ -+ ST y6, 6 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y7, 7 * SIZE(Y1) -+ ldi Y1, 8 * SIZE(Y1) -+ .align 4 -+ -+$L35: -+ and M, 4, I -+ ble I, $L36 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ -+ MUL alpha1, a0, a0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, a1 -+ LD y1, 1 * SIZE(Y1) -+ MUL alpha1, a2, a2 -+ LD y2, 2 * SIZE(Y1) -+ MUL alpha1, a3, a3 -+ LD y3, 3 * SIZE(Y1) -+ -+ ADD y0, a0, y0 -+ ADD y1, a1, y1 -+ ADD y2, a2, y2 -+ ADD y3, a3, y3 -+ -+ ST y0, 0 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y1, 1 * SIZE(Y1) -+ ldi A2, 4 * SIZE(A2) -+ ST y2, 2 * SIZE(Y1) -+ unop -+ ST y3, 3 * SIZE(Y1) -+ ldi Y1, 4 * SIZE(Y1) -+ .align 4 -+ -+$L36: -+ and M, 2, I -+ ble I, $L37 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a0, a0 -+ LD y1, 1 * SIZE(Y1) -+ MUL alpha1, a1, a1 -+ -+ ADD y0, a0, y0 -+ ADD y1, a1, y1 -+ -+ ST y0, 0 * SIZE(Y1) -+ ldi A1, 2 * SIZE(A1) -+ ST y1, 1 * SIZE(Y1) -+ ldi Y1, 2 * SIZE(Y1) -+ .align 4 -+ -+$L37: -+ blbc M, $L990 -+ -+ LD y0, 0 * SIZE(Y1) -+ LD a0, 0 * SIZE(A1) -+ -+ MUL alpha1, a0, a0 -+ -+ ADD y0, a0, y0 -+ ST y0, 0 * SIZE(Y1) -+ .align 4 -+ -+$L990: -+ cmpeq INCY, SIZE, $0 -+ bne $0, $L999 -+ -+ mov BUFFER, Y1 -+ -+ sra M, 3, I -+ ble I, $L995 -+ .align 4 -+ -+$L992: -+ LD a0, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a1, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a2, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a3, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ -+ LD y0, 0 * SIZE(Y) -+ LD y1, 1 * SIZE(Y) -+ LD y2, 2 * SIZE(Y) -+ LD y3, 3 * SIZE(Y) -+ -+ LD a4, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a5, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a6, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a7, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ -+ LD y4, 4 * SIZE(Y) -+ LD y5, 5 * SIZE(Y) -+ LD y6, 6 * SIZE(Y) -+ LD y7, 7 * SIZE(Y) -+ -+ ADD a0, y0, a0 -+ ADD a1, y1, a1 -+ ADD a2, y2, a2 -+ ADD a3, y3, a3 -+ ADD a4, y4, a4 -+ ADD a5, y5, a5 -+ ADD a6, y6, a6 -+ ADD a7, y7, a7 -+ -+ ST a0, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a1, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a2, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a3, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ -+ ST a4, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a5, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a6, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a7, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ -+ ldi I, -1(I) -+ ldi Y, 8 * SIZE(Y) -+ bgt I, $L992 -+ .align 4 -+ -+$L995: -+ and M, 7, I -+ ble I, $L999 -+ .align 4 -+ -+$L996: -+ LD a0, 0 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ -+ LD y0, 0 * SIZE(Y) -+ ldi Y, 1 * SIZE(Y) -+ -+ ADD a0, y0, a0 -+ -+ ST a0, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ -+ ldi I, -1(I) -+ bgt I, $L996 -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/gemv_t.S b/kernel/sw_64/gemv_t.S -new file mode 100644 -index 0000000..4d8f130 ---- /dev/null -+++ b/kernel/sw_64/gemv_t.S -@@ -0,0 +1,1222 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define STACKSIZE 72 -+#define PREFETCHSIZE 32 -+ -+#define M $16 -+#define N $17 -+#define A $20 -+#define LDA $21 -+ -+#define X $18 -+#define INCX $19 -+#define Y $22 -+#define INCY $23 -+ -+#define BUFFER $24 -+ -+#define I $25 -+#define J $27 -+ -+#define X1 $3 -+#define Y1 $4 -+ -+#define A1 $5 -+#define A2 $6 -+#define A3 $7 -+#define A4 $8 -+ -+#define alpha $f19 -+#define f20 $f20 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 -+ -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 -+ -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f21 -+ -+#define a0 $f22 -+#define a1 $f23 -+#define a2 $f24 -+#define a3 $f25 -+#define a4 $f26 -+#define a5 $f27 -+#define a6 $f28 -+#define a7 $f29 -+ -+#define a8 $f2 -+#define a9 $f3 -+#define a10 $f4 -+#define a11 $f5 -+#define a12 $f6 -+#define a13 $f7 -+#define a14 $f8 -+#define a15 $f9 -+ -+ PROLOGUE -+ -+ ldi $sp, -STACKSIZE($sp) -+ ldl X, 0 + STACKSIZE($sp) -+ ldl INCX, 8 + STACKSIZE($sp) -+ ldl Y, 16 + STACKSIZE($sp) -+ ldl INCY, 24 + STACKSIZE($sp) -+ ldl BUFFER, 32 + STACKSIZE($sp) -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ fstd f20, 64($sp) -+ -+ PROFCODE -+ -+ cmple M, 0, $0 -+ SXADDQ INCX, 0, INCX -+ cmple N, 0, $1 -+ SXADDQ INCY, 0, INCY -+ -+ or $0, $1, $0 -+ bne $0, $L999 -+ -+ cmpeq INCX, SIZE, $0 -+ mov X, X1 -+ SXADDQ LDA, 0, LDA -+ bne $0, $L10 -+ -+ sra M, 3, I -+ mov BUFFER, Y1 -+ mov BUFFER, X -+ ble I, $L05 -+ .align 4 -+ -+$L02: -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(X1) -+ ldi I, -1(I) -+ -+ LD a0, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a1, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a2, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a3, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) -+ ST a2, 2 * SIZE(Y1) -+ ST a3, 3 * SIZE(Y1) -+ -+ LD a4, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a5, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a6, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a7, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ -+ ST a4, 4 * SIZE(Y1) -+ ST a5, 5 * SIZE(Y1) -+ ST a6, 6 * SIZE(Y1) -+ ST a7, 7 * SIZE(Y1) -+ -+ ldi Y1, 8 * SIZE(Y1) -+ bgt I, $L02 -+ .align 4 -+ -+$L05: -+ and M, 7, I -+ ble I, $L10 -+ .align 4 -+ -+$L06: -+ LD a0, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ ST a0, 0 * SIZE(Y1) -+ addl Y1, SIZE, Y1 -+ -+ ldi I, -1(I) -+ bgt I, $L06 -+ .align 4 -+ -+$L10: -+ mov Y, Y1 -+ fclr t0 -+ unop -+ fclr t1 -+ -+ sra N, 2, J -+ fclr t2 -+ fclr t3 -+ ble J, $L20 -+ .align 4 -+ -+$L11: -+ mov A, A1 -+ fclr s0 -+ addl A, LDA, A2 -+ fclr s1 -+ -+ addl A2, LDA, A3 -+ fclr s2 -+ addl A3, LDA, A4 -+ fclr s3 -+ -+ s4addl LDA, A, A -+ unop -+ mov X, X1 -+ flds $f31, 3 * SIZE(Y) -+ -+ sra M, 3, I -+ ble I, $L15 -+ -+ LD x0, 0 * SIZE(X1) -+ LD x1, 1 * SIZE(X1) -+ LD x2, 2 * SIZE(X1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD a2, 0 * SIZE(A3) -+ LD a3, 0 * SIZE(A4) -+ LD a4, 1 * SIZE(A1) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 1 * SIZE(A3) -+ LD a7, 1 * SIZE(A4) -+ LD a8, 2 * SIZE(A1) -+ LD a9, 2 * SIZE(A2) -+ LD a10, 2 * SIZE(A3) -+ LD a11, 2 * SIZE(A4) -+ LD a12, 3 * SIZE(A1) -+ LD a13, 3 * SIZE(A2) -+ LD a14, 3 * SIZE(A3) -+ LD a15, 3 * SIZE(A4) -+ -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x3, 3 * SIZE(X1) -+ MUL x0, a0, t0 -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) -+ MUL x0, a1, t1 -+ LD a1, 4 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ unop -+ MUL x0, a2, t2 -+ LD a2, 4 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20, s3 -+ LD a0, 4 * SIZE(A1) -+ unop -+ MUL x0, a3, t3 -+ LD a3, 4 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x0, 4 * SIZE(X1) -+ MUL x1, a4, t0 -+ -+ ADD s1, t1, f20 -+ fmov f20, s1 -+ LD a4, 5 * SIZE(A1) -+ ldi A1, 8 * SIZE(A1) -+ MUL x1, a5, t1 -+ LD a5, 5 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ #unop -+ MUL x1, a6, t2 -+ LD a6, 5 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ #unop -+ MUL x1, a7, t3 -+ LD a7, 5 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x1, 5 * SIZE(X1) -+ MUL x2, a8, t0 -+ LD a8, -2 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) -+ MUL x2, a9, t1 -+ LD a9, 6 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ ldi A2, 8 * SIZE(A2) -+ MUL x2, a10, t2 -+ LD a10, 6 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ ldi A3, 8 * SIZE(A3) -+ MUL x2, a11, t3 -+ LD a11, 6 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x2, 6 * SIZE(X1) -+ MUL x3, a12, t0 -+ LD a12, -1 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldi A4, 8 * SIZE(A4) -+ MUL x3, a13, t1 -+ LD a13, -1 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ unop -+ MUL x3, a14, t2 -+ LD a14, -1 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ unop -+ MUL x3, a15, t3 -+ LD a15, -1 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x3, 7 * SIZE(X1) -+ MUL x0, a0, t0 -+ LD a0, 0 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldw $31, (PREFETCHSIZE - 8) * SIZE(A3) -+ MUL x0, a1, t1 -+ LD a1, 0 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ unop -+ MUL x0, a2, t2 -+ LD a2, 0 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ unop -+ MUL x0, a3, t3 -+ LD a3, 0 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x0, 8 * SIZE(X1) -+ MUL x1, a4, t0 -+ LD a4, 1 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ unop -+ MUL x1, a5, t1 -+ LD a5, 1 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ unop -+ MUL x1, a6, t2 -+ LD a6, 1 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ unop -+ MUL x1, a7, t3 -+ LD a7, 1 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x1, 9 * SIZE(X1) -+ MUL x2, a8, t0 -+ LD a8, 2 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldw $31, (PREFETCHSIZE - 8) * SIZE(A4) -+ MUL x2, a9, t1 -+ LD a9, 2 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ ldi X1, 8 * SIZE(X1) -+ MUL x2, a10, t2 -+ LD a10, 2 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ ldi I, -1(I) -+ MUL x2, a11, t3 -+ LD a11, 2 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x2, 2 * SIZE(X1) -+ MUL x3, a12, t0 -+ LD a12, 3 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldw $31, (PREFETCHSIZE - 8) * SIZE(X1) -+ MUL x3, a13, t1 -+ LD a13, 3 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ unop -+ MUL x3, a14, t2 -+ LD a14, 3 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ MUL x3, a15, t3 -+ LD a15, 3 * SIZE(A4) -+ bgt I, $L12 -+ .align 4 -+ -+$L13: -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x3, 3 * SIZE(X1) -+ MUL x0, a0, t0 -+ LD a0, 4 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ #unop -+ MUL x0, a1, t1 -+ LD a1, 4 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ #unop -+ MUL x0, a2, t2 -+ LD a2, 4 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ #unop -+ MUL x0, a3, t3 -+ LD a3, 4 * SIZE(A4) -+ -+ ADD s0, t0, x0 -+ fmov x0,s0 -+ LD x0, 4 * SIZE(X1) -+ MUL x1, a4, t0 -+ LD a4, 5 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ #unop -+ MUL x1, a5, t1 -+ LD a5, 5 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ #unop -+ MUL x1, a6, t2 -+ LD a6, 5 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ #unop -+ MUL x1, a7, t3 -+ LD a7, 5 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x1, 5 * SIZE(X1) -+ MUL x2, a8, t0 -+ LD a8, 6 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ #unop -+ MUL x2, a9, t1 -+ LD a9, 6 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ #unop -+ MUL x2, a10, t2 -+ LD a10, 6 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ #unop -+ MUL x2, a11, t3 -+ LD a11, 6 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x2, 6 * SIZE(X1) -+ MUL x3, a12, t0 -+ LD a12, 7 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldi A1, 8 * SIZE(A1) -+ MUL x3, a13, t1 -+ LD a13, 7 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ ldi A2, 8 * SIZE(A2) -+ MUL x3, a14, t2 -+ LD a14, 7 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ ldi A3, 8 * SIZE(A3) -+ MUL x3, a15, t3 -+ LD a15, 7 * SIZE(A4) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x3, 7 * SIZE(X1) -+ MUL x0, a0, t0 -+ unop -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldi X1, 8 * SIZE(X1) -+ MUL x0, a1, t1 -+ ldi A4, 8 * SIZE(A4) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ MUL x0, a2, t2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ MUL x0, a3, t3 -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ MUL x1, a4, t0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x1, a5, t1 -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ MUL x1, a6, t2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ MUL x1, a7, t3 -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ MUL x2, a8, t0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x2, a9, t1 -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ MUL x2, a10, t2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ MUL x2, a11, t3 -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ MUL x3, a12, t0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x3, a13, t1 -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ MUL x3, a14, t2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ MUL x3, a15, t3 -+ .align 4 -+ -+$L15: -+ and M, 7, I -+ ble I, $L18 -+ -+ LD x0, 0 * SIZE(X1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD a2, 0 * SIZE(A3) -+ LD a3, 0 * SIZE(A4) -+ -+ ldi I, -1(I) -+ ble I, $L17 -+ .align 4 -+ -+$L16: -+ ADD s0, t0,f20 -+ fmov f20,s0 -+ ldi A4, 1 * SIZE(A4) -+ MUL x0, a0, t0 -+ LD a0, 1 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldi A1, 1 * SIZE(A1) -+ MUL x0, a1, t1 -+ LD a1, 1 * SIZE(A2) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ ldi A2, 1 * SIZE(A2) -+ MUL x0, a2, t2 -+ LD a2, 1 * SIZE(A3) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ ldi A3, 1 * SIZE(A3) -+ MUL x0, a3, t3 -+ LD a3, 0 * SIZE(A4) -+ -+ LD x0, 1 * SIZE(X1) -+ ldi X1, 1 * SIZE(X1) -+ ldi I, -1(I) -+ bgt I, $L16 -+ .align 4 -+ -+$L17: -+ ADD s0, t0,f20 -+ fmov f20,s0 -+ MUL x0, a0, t0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x0, a1, t1 -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ MUL x0, a2, t2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ MUL x0, a3, t3 -+ .align 4 -+ -+$L18: -+ LD a0, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ LD a1, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ LD a2, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ LD a3, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ -+ ADD s0, t0,f20 -+ fmov f20,s0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ -+ MUL alpha, s0,f20 -+ fmov f20,s0 -+ MUL alpha, s1, f20 -+ fmov f20,s1 -+ MUL alpha, s2, f20 -+ fmov f20,s2 -+ MUL alpha, s3, f20 -+ fmov f20,s3 -+ -+ ADD a0, s0,f20 -+ fmov f20,a0 -+ fclr t0 -+ ADD a1, s1, f20 -+ fmov f20,a1 -+ fclr t1 -+ ADD a2, s2, f20 -+ fmov f20,a2 -+ fclr t2 -+ ADD a3, s3, f20 -+ fmov f20,a3 -+ fclr t3 -+ -+ ST a0, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a1, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a2, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a3, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ -+ ldi J, -1(J) -+ bgt J, $L11 -+ .align 4 -+ -+$L20: -+ and N, 2, J -+ ble J, $L30 -+ mov A, A1 -+ addl A, LDA, A2 -+ -+ addl A2, LDA, A -+ fclr s0 -+ mov X, X1 -+ fclr s1 -+ -+ sra M, 3, I -+ fclr s2 -+ fclr s3 -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD a2, 1 * SIZE(A1) -+ LD a3, 1 * SIZE(A2) -+ LD a4, 2 * SIZE(A1) -+ LD a5, 2 * SIZE(A2) -+ LD a6, 3 * SIZE(A1) -+ LD a7, 3 * SIZE(A2) -+ -+ LD a8, 4 * SIZE(A1) -+ LD a9, 4 * SIZE(A2) -+ LD a10, 5 * SIZE(A1) -+ LD a11, 5 * SIZE(A2) -+ LD a12, 6 * SIZE(A1) -+ LD a13, 6 * SIZE(A2) -+ LD a14, 7 * SIZE(A1) -+ LD a15, 7 * SIZE(A2) -+ -+ LD x0, 0 * SIZE(X1) -+ LD x1, 1 * SIZE(X1) -+ LD x2, 2 * SIZE(X1) -+ -+ ldi I, -1(I) -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ ADD s0, t0, x3 -+ fmov x3,s0 -+ LD x3, 3 * SIZE(X1) -+ MUL x0, a0, t0 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) -+ MUL x0, a1, t1 -+ LD a1, 8 * SIZE(A2) -+ -+ ADD s0, t2, x0 -+ fmov x0,s0 -+ LD x0, 4 * SIZE(X1) -+ MUL x1, a2, t2 -+ LD a2, 9 * SIZE(A1) -+ -+ ADD s1, t3, f20 -+ fmov f20,s1 -+ #unop -+ MUL x1, a3, t3 -+ LD a3, 9 * SIZE(A2) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x1, 5 * SIZE(X1) -+ MUL x2, a4, t0 -+ LD a4, 10 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldi I, -1(I) -+ MUL x2, a5, t1 -+ LD a5, 10 * SIZE(A2) -+ -+ ADD s0, t2, f20 -+ fmov f20,s0 -+ LD x2, 6 * SIZE(X1) -+ MUL x3, a6, t2 -+ LD a6, 11 * SIZE(A1) -+ -+ ADD s1, t3, f20 -+ fmov f20,s1 -+ ldi X1, 8 * SIZE(X1) -+ MUL x3, a7, t3 -+ LD a7, 11 * SIZE(A2) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x3, -1 * SIZE(X1) -+ MUL x0, a8, t0 -+ LD a8, 12 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) -+ MUL x0, a9, t1 -+ LD a9, 12 * SIZE(A2) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x0, 0 * SIZE(X1) -+ MUL x1, a10, t0 -+ LD a10, 13 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ldi A1, 8 * SIZE(A1) -+ MUL x1, a11, t1 -+ LD a11, 13 * SIZE(A2) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x1, 1 * SIZE(X1) -+ MUL x2, a12, t0 -+ LD a12, 6 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x2, a13, t1 -+ LD a13, 14 * SIZE(A2) -+ ldi A2, 8 * SIZE(A2) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x2, 2 * SIZE(X1) -+ MUL x3, a14, t0 -+ LD a14, 7 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x3, a15, t1 -+ LD a15, 7 * SIZE(A2) -+ bgt I, $L22 -+ .align 4 -+ -+$L23: -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x3, 3 * SIZE(X1) -+ MUL x0, a0, t0 -+ ldi A1, 8 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ unop -+ MUL x0, a1, t1 -+ unop -+ -+ ADD s0, t2, f20 -+ fmov f20,s0 -+ LD x0, 4 * SIZE(X1) -+ MUL x1, a2, t2 -+ ldi A2, 8 * SIZE(A2) -+ -+ ADD s1, t3, f20 -+ fmov f20,s1 -+ unop -+ MUL x1, a3, t3 -+ unop -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x1, 5 * SIZE(X1) -+ MUL x2, a4, t0 -+ unop -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ unop -+ MUL x2, a5, t1 -+ unop -+ -+ ADD s0, t2, f20 -+ fmov f20,s0 -+ LD x2, 6 * SIZE(X1) -+ MUL x3, a6, t2 -+ unop -+ -+ ADD s1, t3, f20 -+ fmov f20,s1 -+ unop -+ MUL x3, a7, t3 -+ unop -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD x3, 7 * SIZE(X1) -+ MUL x0, a8, t0 -+ ldi X1, 8 * SIZE(X1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ unop -+ MUL x0, a9, t1 -+ unop -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ MUL x1, a10, t0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x1, a11, t1 -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ MUL x2, a12, t0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x2, a13, t1 -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ MUL x3, a14, t0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x3, a15, t1 -+ .align 4 -+ -+$L25: -+ and M, 7, I -+ ble I, $L28 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD x0, 0 * SIZE(X1) -+ -+ ldi I, -1(I) -+ ble I, $L27 -+ .align 4 -+ -+$L26: -+ ADD s0, t0,f20 -+ fmov f20,s0 -+ ldi A2, 1 * SIZE(A2) -+ MUL x0, a0, t0 -+ LD a0, 1 * SIZE(A1) -+ -+ ADD s1, t1,f20 -+ fmov f20,s1 -+ ldi A1, 1 * SIZE(A1) -+ MUL x0, a1, t1 -+ LD a1, 0 * SIZE(A2) -+ -+ LD x0, 1 * SIZE(X1) -+ ldi X1, 1 * SIZE(X1) -+ ldi I, -1(I) -+ bgt I, $L26 -+ .align 4 -+ -+$L27: -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ MUL x0, a0, t0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL x0, a1, t1 -+ .align 4 -+ -+$L28: -+ LD a0, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ LD a1, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ -+ ADD s0, s2, f20 -+ fmov f20,s0 -+ ADD s1, s3, f20 -+ fmov f20,s1 -+ -+ MUL alpha, s0, f20 -+ fmov f20,s0 -+ MUL alpha, s1,f20 -+ fmov f20,s1 -+ -+ ADD a0, s0, f20 -+ fmov f20,a0 -+ ADD a1, s1, f20 -+ fmov f20,a1 -+ -+ ST a0, 0 * SIZE(Y1) -+ fclr t0 -+ addl Y1, INCY, Y1 -+ fclr t1 -+ -+ ST a1, 0 * SIZE(Y1) -+ fclr t2 -+ addl Y1, INCY, Y1 -+ fclr t3 -+ .align 4 -+ -+$L30: -+ blbc N, $L999 -+ -+ mov A, A1 -+ fclr s0 -+ mov X, X1 -+ fclr s1 -+ -+ sra M, 3, I -+ fclr s2 -+ fclr s3 -+ ble I, $L35 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a8, 0 * SIZE(X1) -+ LD a9, 1 * SIZE(X1) -+ -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ LD a10, 2 * SIZE(X1) -+ LD a11, 3 * SIZE(X1) -+ -+ LD a4, 4 * SIZE(A1) -+ LD a5, 5 * SIZE(A1) -+ LD a12, 4 * SIZE(X1) -+ LD a13, 5 * SIZE(X1) -+ -+ LD a6, 6 * SIZE(A1) -+ LD a7, 7 * SIZE(A1) -+ LD a14, 6 * SIZE(X1) -+ -+ ldi I, -1(I) -+ ble I, $L33 -+ .align 4 -+ -+$L32: -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD a15, 7 * SIZE(X1) -+ MUL a0, a8, f20 -+ fmov f20,t0 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ LD a8, 8 * SIZE(X1) -+ MUL a1, a9, t1 -+ LD a1, 9 * SIZE(A1) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ LD a9, 9 * SIZE(X1) -+ MUL a2, a10, t2 -+ LD a2, 10 * SIZE(A1) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ LD a10, 10 * SIZE(X1) -+ MUL a3, a11, t3 -+ LD a3, 11 * SIZE(A1) -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD a11, 11 * SIZE(X1) -+ MUL a4, a12, t0 -+ LD a4, 12 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ LD a12, 12 * SIZE(X1) -+ MUL a5, a13, t1 -+ LD a5, 13 * SIZE(A1) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ LD a13, 13 * SIZE(X1) -+ MUL a6, a14, t2 -+ LD a6, 14 * SIZE(A1) -+ -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ LD a14, 14 * SIZE(X1) -+ MUL a7, a15, t3 -+ LD a7, 15 * SIZE(A1) -+ -+ ldi A1, 8 * SIZE(A1) -+ ldi I, -1(I) -+ ldi X1, 8 * SIZE(X1) -+ bgt I, $L32 -+ .align 4 -+ -+$L33: -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ LD a15, 7 * SIZE(X1) -+ MUL a0, a8, t0 -+ ldi A1, 8 * SIZE(A1) -+ -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ unop -+ MUL a1, a9, t1 -+ ldi X1, 8 * SIZE(X1) -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ MUL a2, a10, t2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ MUL a3, a11, t3 -+ -+ ADD s0, t0, f20 -+ fmov f20,s0 -+ MUL a4, a12, t0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ MUL a5, a13, t1 -+ -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ MUL a6, a14, t2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ MUL a7, a15, t3 -+ .align 4 -+ -+$L35: -+ and M, 7, I -+ ble I, $L38 -+ -+ LD a0, 0 * SIZE(A1) -+ LD x0, 0 * SIZE(X1) -+ -+ ldi I, -1(I) -+ ble I, $L37 -+ .align 4 -+ -+$L36: -+ ADD s0, t0,f20 -+ fmov f20,s0 -+ MUL x0, a0, t0 -+ LD a0, 1 * SIZE(A1) -+ LD x0, 1 * SIZE(X1) -+ -+ ldi A1, 1 * SIZE(A1) -+ ldi X1, 1 * SIZE(X1) -+ ldi I, -1(I) -+ bgt I, $L36 -+ .align 4 -+ -+$L37: -+ ADD s0, t0,f20 -+ fmov f20,s0 -+ MUL x0, a0, t0 -+ .align 4 -+ -+$L38: -+ LD a0, 0 * SIZE(Y) -+ -+ ADD s0, t0,f20 -+ fmov f20,s0 -+ ADD s1, t1, f20 -+ fmov f20,s1 -+ ADD s2, t2, f20 -+ fmov f20,s2 -+ ADD s3, t3, f20 -+ fmov f20,s3 -+ -+ ADD s0, s2, f20 -+ fmov f20,s0 -+ ADD s1, s3, f20 -+ fmov f20,s1 -+ ADD s0, s1, f20 -+ fmov f20,s0 -+ -+ MUL alpha, s0, f20 -+ fmov f20,s0 -+ ADD a0, s0, f20 -+ fmov f20,a0 -+ -+ ST a0, 0 * SIZE(Y1) -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ fldd f20, 64($sp) -+ -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/gemv_t.S.bak b/kernel/sw_64/gemv_t.S.bak -new file mode 100644 -index 0000000..068e463 ---- /dev/null -+++ b/kernel/sw_64/gemv_t.S.bak -@@ -0,0 +1,1061 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define STACKSIZE 64 -+#define PREFETCHSIZE 32 -+ -+#define M $16 -+#define N $17 -+#define A $20 -+#define LDA $21 -+ -+#define X $18 -+#define INCX $19 -+#define Y $22 -+#define INCY $23 -+ -+#define BUFFER $24 -+ -+#define I $25 -+#define J $27 -+ -+#define X1 $3 -+#define Y1 $4 -+ -+#define A1 $5 -+#define A2 $6 -+#define A3 $7 -+#define A4 $8 -+ -+#define alpha $f19 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 -+ -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 -+ -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f21 -+ -+#define a0 $f22 -+#define a1 $f23 -+#define a2 $f24 -+#define a3 $f25 -+#define a4 $f26 -+#define a5 $f27 -+#define a6 $f28 -+#define a7 $f29 -+ -+#define a8 $f2 -+#define a9 $f3 -+#define a10 $f4 -+#define a11 $f5 -+#define a12 $f6 -+#define a13 $f7 -+#define a14 $f8 -+#define a15 $f9 -+ -+ PROLOGUE -+ -+ ldi $sp, -STACKSIZE($sp) -+ ldl X, 0 + STACKSIZE($sp) -+ ldl INCX, 8 + STACKSIZE($sp) -+ ldl Y, 16 + STACKSIZE($sp) -+ ldl INCY, 24 + STACKSIZE($sp) -+ ldl BUFFER, 32 + STACKSIZE($sp) -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ -+ PROFCODE -+ -+ cmple M, 0, $0 -+ SXADDQ INCX, 0, INCX -+ cmple N, 0, $1 -+ SXADDQ INCY, 0, INCY -+ -+ or $0, $1, $0 -+ bne $0, $L999 -+ -+ cmpeq INCX, SIZE, $0 -+ mov X, X1 -+ SXADDQ LDA, 0, LDA -+ bne $0, $L10 -+ -+ sra M, 3, I -+ mov BUFFER, Y1 -+ mov BUFFER, X -+ ble I, $L05 -+ .align 4 -+ -+$L02: -+ fillcs (PREFETCHSIZE + 0) * SIZE(X1) -+ ldi I, -1(I) -+ -+ LD a0, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a1, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a2, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a3, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) -+ ST a2, 2 * SIZE(Y1) -+ ST a3, 3 * SIZE(Y1) -+ -+ LD a4, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a5, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a6, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a7, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ -+ ST a4, 4 * SIZE(Y1) -+ ST a5, 5 * SIZE(Y1) -+ ST a6, 6 * SIZE(Y1) -+ ST a7, 7 * SIZE(Y1) -+ -+ ldi Y1, 8 * SIZE(Y1) -+ bgt I, $L02 -+ .align 4 -+ -+$L05: -+ and M, 7, I -+ ble I, $L10 -+ .align 4 -+ -+$L06: -+ LD a0, 0 * SIZE(X1) -+ addl X1, INCX, X1 -+ ST a0, 0 * SIZE(Y1) -+ addl Y1, SIZE, Y1 -+ -+ ldi I, -1(I) -+ bgt I, $L06 -+ .align 4 -+ -+$L10: -+ mov Y, Y1 -+ fclr t0 -+ unop -+ fclr t1 -+ -+ sra N, 2, J -+ fclr t2 -+ fclr t3 -+ ble J, $L20 -+ .align 4 -+ -+$L11: -+ mov A, A1 -+ fclr s0 -+ addl A, LDA, A2 -+ fclr s1 -+ -+ addl A2, LDA, A3 -+ fclr s2 -+ addl A3, LDA, A4 -+ fclr s3 -+ -+ s4addl LDA, A, A -+ unop -+ mov X, X1 -+ fillcs 3 * SIZE(Y) -+ -+ sra M, 3, I -+ ble I, $L15 -+ -+ LD x0, 0 * SIZE(X1) -+ LD x1, 1 * SIZE(X1) -+ LD x2, 2 * SIZE(X1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD a2, 0 * SIZE(A3) -+ LD a3, 0 * SIZE(A4) -+ LD a4, 1 * SIZE(A1) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 1 * SIZE(A3) -+ LD a7, 1 * SIZE(A4) -+ LD a8, 2 * SIZE(A1) -+ LD a9, 2 * SIZE(A2) -+ LD a10, 2 * SIZE(A3) -+ LD a11, 2 * SIZE(A4) -+ LD a12, 3 * SIZE(A1) -+ LD a13, 3 * SIZE(A2) -+ LD a14, 3 * SIZE(A3) -+ LD a15, 3 * SIZE(A4) -+ -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ADD s0, t0, s0 -+ LD x3, 3 * SIZE(X1) -+ MUL x0, a0, t0 -+ LD a0, 4 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) -+ MUL x0, a1, t1 -+ LD a1, 4 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ unop -+ MUL x0, a2, t2 -+ LD a2, 4 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ unop -+ MUL x0, a3, t3 -+ LD a3, 4 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x0, 4 * SIZE(X1) -+ MUL x1, a4, t0 -+ LD a4, 5 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ ldi A1, 8 * SIZE(A1) -+ MUL x1, a5, t1 -+ LD a5, 5 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ unop -+ MUL x1, a6, t2 -+ LD a6, 5 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ unop -+ MUL x1, a7, t3 -+ LD a7, 5 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x1, 5 * SIZE(X1) -+ MUL x2, a8, t0 -+ LD a8, -2 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A2) -+ MUL x2, a9, t1 -+ LD a9, 6 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ ldi A2, 8 * SIZE(A2) -+ MUL x2, a10, t2 -+ LD a10, 6 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ ldi A3, 8 * SIZE(A3) -+ MUL x2, a11, t3 -+ LD a11, 6 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x2, 6 * SIZE(X1) -+ MUL x3, a12, t0 -+ LD a12, -1 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ ldi A4, 8 * SIZE(A4) -+ MUL x3, a13, t1 -+ LD a13, -1 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ unop -+ MUL x3, a14, t2 -+ LD a14, -1 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ unop -+ MUL x3, a15, t3 -+ LD a15, -1 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x3, 7 * SIZE(X1) -+ MUL x0, a0, t0 -+ LD a0, 0 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ fillcs (PREFETCHSIZE - 8) * SIZE(A3) -+ MUL x0, a1, t1 -+ LD a1, 0 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ unop -+ MUL x0, a2, t2 -+ LD a2, 0 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ unop -+ MUL x0, a3, t3 -+ LD a3, 0 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x0, 8 * SIZE(X1) -+ MUL x1, a4, t0 -+ LD a4, 1 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ unop -+ MUL x1, a5, t1 -+ LD a5, 1 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ unop -+ MUL x1, a6, t2 -+ LD a6, 1 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ unop -+ MUL x1, a7, t3 -+ LD a7, 1 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x1, 9 * SIZE(X1) -+ MUL x2, a8, t0 -+ LD a8, 2 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ fillcs (PREFETCHSIZE - 8) * SIZE(A4) -+ MUL x2, a9, t1 -+ LD a9, 2 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ ldi X1, 8 * SIZE(X1) -+ MUL x2, a10, t2 -+ LD a10, 2 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ ldi I, -1(I) -+ MUL x2, a11, t3 -+ LD a11, 2 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x2, 2 * SIZE(X1) -+ MUL x3, a12, t0 -+ LD a12, 3 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ fillcs (PREFETCHSIZE - 8) * SIZE(X1) -+ MUL x3, a13, t1 -+ LD a13, 3 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ unop -+ MUL x3, a14, t2 -+ LD a14, 3 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ MUL x3, a15, t3 -+ LD a15, 3 * SIZE(A4) -+ bgt I, $L12 -+ .align 4 -+ -+$L13: -+ ADD s0, t0, s0 -+ LD x3, 3 * SIZE(X1) -+ MUL x0, a0, t0 -+ LD a0, 4 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ unop -+ MUL x0, a1, t1 -+ LD a1, 4 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ unop -+ MUL x0, a2, t2 -+ LD a2, 4 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ unop -+ MUL x0, a3, t3 -+ LD a3, 4 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x0, 4 * SIZE(X1) -+ MUL x1, a4, t0 -+ LD a4, 5 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ unop -+ MUL x1, a5, t1 -+ LD a5, 5 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ unop -+ MUL x1, a6, t2 -+ LD a6, 5 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ unop -+ MUL x1, a7, t3 -+ LD a7, 5 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x1, 5 * SIZE(X1) -+ MUL x2, a8, t0 -+ LD a8, 6 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ unop -+ MUL x2, a9, t1 -+ LD a9, 6 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ unop -+ MUL x2, a10, t2 -+ LD a10, 6 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ unop -+ MUL x2, a11, t3 -+ LD a11, 6 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x2, 6 * SIZE(X1) -+ MUL x3, a12, t0 -+ LD a12, 7 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ ldi A1, 8 * SIZE(A1) -+ MUL x3, a13, t1 -+ LD a13, 7 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ ldi A2, 8 * SIZE(A2) -+ MUL x3, a14, t2 -+ LD a14, 7 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ ldi A3, 8 * SIZE(A3) -+ MUL x3, a15, t3 -+ LD a15, 7 * SIZE(A4) -+ -+ ADD s0, t0, s0 -+ LD x3, 7 * SIZE(X1) -+ MUL x0, a0, t0 -+ unop -+ -+ ADD s1, t1, s1 -+ ldi X1, 8 * SIZE(X1) -+ MUL x0, a1, t1 -+ ldi A4, 8 * SIZE(A4) -+ -+ ADD s2, t2, s2 -+ MUL x0, a2, t2 -+ ADD s3, t3, s3 -+ MUL x0, a3, t3 -+ -+ ADD s0, t0, s0 -+ MUL x1, a4, t0 -+ ADD s1, t1, s1 -+ MUL x1, a5, t1 -+ -+ ADD s2, t2, s2 -+ MUL x1, a6, t2 -+ ADD s3, t3, s3 -+ MUL x1, a7, t3 -+ -+ ADD s0, t0, s0 -+ MUL x2, a8, t0 -+ ADD s1, t1, s1 -+ MUL x2, a9, t1 -+ -+ ADD s2, t2, s2 -+ MUL x2, a10, t2 -+ ADD s3, t3, s3 -+ MUL x2, a11, t3 -+ -+ ADD s0, t0, s0 -+ MUL x3, a12, t0 -+ ADD s1, t1, s1 -+ MUL x3, a13, t1 -+ -+ ADD s2, t2, s2 -+ MUL x3, a14, t2 -+ ADD s3, t3, s3 -+ MUL x3, a15, t3 -+ .align 4 -+ -+$L15: -+ and M, 7, I -+ ble I, $L18 -+ -+ LD x0, 0 * SIZE(X1) -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD a2, 0 * SIZE(A3) -+ LD a3, 0 * SIZE(A4) -+ -+ ldi I, -1(I) -+ ble I, $L17 -+ .align 4 -+ -+$L16: -+ ADD s0, t0, s0 -+ ldi A4, 1 * SIZE(A4) -+ MUL x0, a0, t0 -+ LD a0, 1 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ ldi A1, 1 * SIZE(A1) -+ MUL x0, a1, t1 -+ LD a1, 1 * SIZE(A2) -+ -+ ADD s2, t2, s2 -+ ldi A2, 1 * SIZE(A2) -+ MUL x0, a2, t2 -+ LD a2, 1 * SIZE(A3) -+ -+ ADD s3, t3, s3 -+ ldi A3, 1 * SIZE(A3) -+ MUL x0, a3, t3 -+ LD a3, 0 * SIZE(A4) -+ -+ LD x0, 1 * SIZE(X1) -+ ldi X1, 1 * SIZE(X1) -+ ldi I, -1(I) -+ bgt I, $L16 -+ .align 4 -+ -+$L17: -+ ADD s0, t0, s0 -+ MUL x0, a0, t0 -+ ADD s1, t1, s1 -+ MUL x0, a1, t1 -+ -+ ADD s2, t2, s2 -+ MUL x0, a2, t2 -+ ADD s3, t3, s3 -+ MUL x0, a3, t3 -+ .align 4 -+ -+$L18: -+ LD a0, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ LD a1, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ LD a2, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ LD a3, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ -+ ADD s0, t0, s0 -+ ADD s1, t1, s1 -+ ADD s2, t2, s2 -+ ADD s3, t3, s3 -+ -+ MUL alpha, s0, s0 -+ MUL alpha, s1, s1 -+ MUL alpha, s2, s2 -+ MUL alpha, s3, s3 -+ -+ ADD a0, s0, a0 -+ fclr t0 -+ ADD a1, s1, a1 -+ fclr t1 -+ ADD a2, s2, a2 -+ fclr t2 -+ ADD a3, s3, a3 -+ fclr t3 -+ -+ ST a0, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a1, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a2, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a3, 0 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ -+ ldi J, -1(J) -+ bgt J, $L11 -+ .align 4 -+ -+$L20: -+ and N, 2, J -+ ble J, $L30 -+ mov A, A1 -+ addl A, LDA, A2 -+ -+ addl A2, LDA, A -+ fclr s0 -+ mov X, X1 -+ fclr s1 -+ -+ sra M, 3, I -+ fclr s2 -+ fclr s3 -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD a2, 1 * SIZE(A1) -+ LD a3, 1 * SIZE(A2) -+ LD a4, 2 * SIZE(A1) -+ LD a5, 2 * SIZE(A2) -+ LD a6, 3 * SIZE(A1) -+ LD a7, 3 * SIZE(A2) -+ -+ LD a8, 4 * SIZE(A1) -+ LD a9, 4 * SIZE(A2) -+ LD a10, 5 * SIZE(A1) -+ LD a11, 5 * SIZE(A2) -+ LD a12, 6 * SIZE(A1) -+ LD a13, 6 * SIZE(A2) -+ LD a14, 7 * SIZE(A1) -+ LD a15, 7 * SIZE(A2) -+ -+ LD x0, 0 * SIZE(X1) -+ LD x1, 1 * SIZE(X1) -+ LD x2, 2 * SIZE(X1) -+ -+ ldi I, -1(I) -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ ADD s0, t0, s0 -+ LD x3, 3 * SIZE(X1) -+ MUL x0, a0, t0 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) -+ MUL x0, a1, t1 -+ LD a1, 8 * SIZE(A2) -+ -+ ADD s0, t2, s0 -+ LD x0, 4 * SIZE(X1) -+ MUL x1, a2, t2 -+ LD a2, 9 * SIZE(A1) -+ -+ ADD s1, t3, s1 -+ unop -+ MUL x1, a3, t3 -+ LD a3, 9 * SIZE(A2) -+ -+ ADD s0, t0, s0 -+ LD x1, 5 * SIZE(X1) -+ MUL x2, a4, t0 -+ LD a4, 10 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ ldi I, -1(I) -+ MUL x2, a5, t1 -+ LD a5, 10 * SIZE(A2) -+ -+ ADD s0, t2, s0 -+ LD x2, 6 * SIZE(X1) -+ MUL x3, a6, t2 -+ LD a6, 11 * SIZE(A1) -+ -+ ADD s1, t3, s1 -+ ldi X1, 8 * SIZE(X1) -+ MUL x3, a7, t3 -+ LD a7, 11 * SIZE(A2) -+ -+ ADD s0, t0, s0 -+ LD x3, -1 * SIZE(X1) -+ MUL x0, a8, t0 -+ LD a8, 12 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A2) -+ MUL x0, a9, t1 -+ LD a9, 12 * SIZE(A2) -+ -+ ADD s0, t0, s0 -+ LD x0, 0 * SIZE(X1) -+ MUL x1, a10, t0 -+ LD a10, 13 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ ldi A1, 8 * SIZE(A1) -+ MUL x1, a11, t1 -+ LD a11, 13 * SIZE(A2) -+ -+ ADD s0, t0, s0 -+ LD x1, 1 * SIZE(X1) -+ MUL x2, a12, t0 -+ LD a12, 6 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ MUL x2, a13, t1 -+ LD a13, 14 * SIZE(A2) -+ ldi A2, 8 * SIZE(A2) -+ -+ ADD s0, t0, s0 -+ LD x2, 2 * SIZE(X1) -+ MUL x3, a14, t0 -+ LD a14, 7 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ MUL x3, a15, t1 -+ LD a15, 7 * SIZE(A2) -+ bgt I, $L22 -+ .align 4 -+ -+$L23: -+ ADD s0, t0, s0 -+ LD x3, 3 * SIZE(X1) -+ MUL x0, a0, t0 -+ ldi A1, 8 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ unop -+ MUL x0, a1, t1 -+ unop -+ -+ ADD s0, t2, s0 -+ LD x0, 4 * SIZE(X1) -+ MUL x1, a2, t2 -+ ldi A2, 8 * SIZE(A2) -+ -+ ADD s1, t3, s1 -+ unop -+ MUL x1, a3, t3 -+ unop -+ -+ ADD s0, t0, s0 -+ LD x1, 5 * SIZE(X1) -+ MUL x2, a4, t0 -+ unop -+ -+ ADD s1, t1, s1 -+ unop -+ MUL x2, a5, t1 -+ unop -+ -+ ADD s0, t2, s0 -+ LD x2, 6 * SIZE(X1) -+ MUL x3, a6, t2 -+ unop -+ -+ ADD s1, t3, s1 -+ unop -+ MUL x3, a7, t3 -+ unop -+ -+ ADD s0, t0, s0 -+ LD x3, 7 * SIZE(X1) -+ MUL x0, a8, t0 -+ ldi X1, 8 * SIZE(X1) -+ -+ ADD s1, t1, s1 -+ unop -+ MUL x0, a9, t1 -+ unop -+ -+ ADD s0, t0, s0 -+ MUL x1, a10, t0 -+ ADD s1, t1, s1 -+ MUL x1, a11, t1 -+ -+ ADD s0, t0, s0 -+ MUL x2, a12, t0 -+ ADD s1, t1, s1 -+ MUL x2, a13, t1 -+ -+ ADD s0, t0, s0 -+ MUL x3, a14, t0 -+ ADD s1, t1, s1 -+ MUL x3, a15, t1 -+ .align 4 -+ -+$L25: -+ and M, 7, I -+ ble I, $L28 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 0 * SIZE(A2) -+ LD x0, 0 * SIZE(X1) -+ -+ ldi I, -1(I) -+ ble I, $L27 -+ .align 4 -+ -+$L26: -+ ADD s0, t0, s0 -+ ldi A2, 1 * SIZE(A2) -+ MUL x0, a0, t0 -+ LD a0, 1 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ ldi A1, 1 * SIZE(A1) -+ MUL x0, a1, t1 -+ LD a1, 0 * SIZE(A2) -+ -+ LD x0, 1 * SIZE(X1) -+ ldi X1, 1 * SIZE(X1) -+ ldi I, -1(I) -+ bgt I, $L26 -+ .align 4 -+ -+$L27: -+ ADD s0, t0, s0 -+ MUL x0, a0, t0 -+ ADD s1, t1, s1 -+ MUL x0, a1, t1 -+ .align 4 -+ -+$L28: -+ LD a0, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ LD a1, 0 * SIZE(Y) -+ addl Y, INCY, Y -+ -+ ADD s0, t0, s0 -+ ADD s1, t1, s1 -+ ADD s2, t2, s2 -+ ADD s3, t3, s3 -+ -+ ADD s0, s2, s0 -+ ADD s1, s3, s1 -+ -+ MUL alpha, s0, s0 -+ MUL alpha, s1, s1 -+ -+ ADD a0, s0, a0 -+ ADD a1, s1, a1 -+ -+ ST a0, 0 * SIZE(Y1) -+ fclr t0 -+ addl Y1, INCY, Y1 -+ fclr t1 -+ -+ ST a1, 0 * SIZE(Y1) -+ fclr t2 -+ addl Y1, INCY, Y1 -+ fclr t3 -+ .align 4 -+ -+$L30: -+ blbc N, $L999 -+ -+ mov A, A1 -+ fclr s0 -+ mov X, X1 -+ fclr s1 -+ -+ sra M, 3, I -+ fclr s2 -+ fclr s3 -+ ble I, $L35 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a8, 0 * SIZE(X1) -+ LD a9, 1 * SIZE(X1) -+ -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) -+ LD a10, 2 * SIZE(X1) -+ LD a11, 3 * SIZE(X1) -+ -+ LD a4, 4 * SIZE(A1) -+ LD a5, 5 * SIZE(A1) -+ LD a12, 4 * SIZE(X1) -+ LD a13, 5 * SIZE(X1) -+ -+ LD a6, 6 * SIZE(A1) -+ LD a7, 7 * SIZE(A1) -+ LD a14, 6 * SIZE(X1) -+ -+ ldi I, -1(I) -+ ble I, $L33 -+ .align 4 -+ -+$L32: -+ ADD s0, t0, s0 -+ LD a15, 7 * SIZE(X1) -+ MUL a0, a8, t0 -+ LD a0, 8 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ LD a8, 8 * SIZE(X1) -+ MUL a1, a9, t1 -+ LD a1, 9 * SIZE(A1) -+ -+ ADD s2, t2, s2 -+ LD a9, 9 * SIZE(X1) -+ MUL a2, a10, t2 -+ LD a2, 10 * SIZE(A1) -+ -+ ADD s3, t3, s3 -+ LD a10, 10 * SIZE(X1) -+ MUL a3, a11, t3 -+ LD a3, 11 * SIZE(A1) -+ -+ ADD s0, t0, s0 -+ LD a11, 11 * SIZE(X1) -+ MUL a4, a12, t0 -+ LD a4, 12 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ LD a12, 12 * SIZE(X1) -+ MUL a5, a13, t1 -+ LD a5, 13 * SIZE(A1) -+ -+ ADD s2, t2, s2 -+ LD a13, 13 * SIZE(X1) -+ MUL a6, a14, t2 -+ LD a6, 14 * SIZE(A1) -+ -+ ADD s3, t3, s3 -+ LD a14, 14 * SIZE(X1) -+ MUL a7, a15, t3 -+ LD a7, 15 * SIZE(A1) -+ -+ ldi A1, 8 * SIZE(A1) -+ ldi I, -1(I) -+ ldi X1, 8 * SIZE(X1) -+ bgt I, $L32 -+ .align 4 -+ -+$L33: -+ ADD s0, t0, s0 -+ LD a15, 7 * SIZE(X1) -+ MUL a0, a8, t0 -+ ldi A1, 8 * SIZE(A1) -+ -+ ADD s1, t1, s1 -+ unop -+ MUL a1, a9, t1 -+ ldi X1, 8 * SIZE(X1) -+ -+ ADD s2, t2, s2 -+ MUL a2, a10, t2 -+ ADD s3, t3, s3 -+ MUL a3, a11, t3 -+ -+ ADD s0, t0, s0 -+ MUL a4, a12, t0 -+ ADD s1, t1, s1 -+ MUL a5, a13, t1 -+ -+ ADD s2, t2, s2 -+ MUL a6, a14, t2 -+ ADD s3, t3, s3 -+ MUL a7, a15, t3 -+ .align 4 -+ -+$L35: -+ and M, 7, I -+ ble I, $L38 -+ -+ LD a0, 0 * SIZE(A1) -+ LD x0, 0 * SIZE(X1) -+ -+ ldi I, -1(I) -+ ble I, $L37 -+ .align 4 -+ -+$L36: -+ ADD s0, t0, s0 -+ MUL x0, a0, t0 -+ LD a0, 1 * SIZE(A1) -+ LD x0, 1 * SIZE(X1) -+ -+ ldi A1, 1 * SIZE(A1) -+ ldi X1, 1 * SIZE(X1) -+ ldi I, -1(I) -+ bgt I, $L36 -+ .align 4 -+ -+$L37: -+ ADD s0, t0, s0 -+ MUL x0, a0, t0 -+ .align 4 -+ -+$L38: -+ LD a0, 0 * SIZE(Y) -+ -+ ADD s0, t0, s0 -+ ADD s1, t1, s1 -+ ADD s2, t2, s2 -+ ADD s3, t3, s3 -+ -+ ADD s0, s2, s0 -+ ADD s1, s3, s1 -+ ADD s0, s1, s0 -+ -+ MUL alpha, s0, s0 -+ ADD a0, s0, a0 -+ -+ ST a0, 0 * SIZE(Y1) -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/iamax.S b/kernel/sw_64/iamax.S -new file mode 100644 -index 0000000..f3b2909 ---- /dev/null -+++ b/kernel/sw_64/iamax.S -@@ -0,0 +1,440 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#ifndef USE_MIN -+#define CMPLT(a, b) fcmplt a, b -+#else -+#define CMPLT(a, b) fcmplt b, a -+#endif -+ -+#define STACKSIZE 6 * 8 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+#ifdef F_INTERFACE -+ ldl N, 0(N) # n -+ ldl INCX, 0(INCX) # incx -+#endif -+ ldi $sp, -STACKSIZE($sp) -+ mov X, XX -+ .align 4 -+ -+ fstd $f2, 0($sp) -+ fclr $f16 -+ cmplt $31, N, $2 -+ unop -+ -+ fstd $f3, 8($sp) -+ fclr $f17 -+ cmplt $31, INCX, $3 -+ unop -+ -+ fstd $f4, 16($sp) -+ fclr $f18 -+ SXADDQ INCX, $31, INCX -+ unop -+ -+ fstd $f5, 24($sp) -+ fclr $f19 -+ and $2, $3, $2 -+ clr $0 -+ -+ fstd $f6, 32($sp) -+ fclr $f0 -+ sra N, 3, $1 -+ beq $2, $End # if (n <= 0) or (incx <= 0) return -+ .align 4 -+ -+ LD $f20, 0 * SIZE(X) -+ unop -+ fabs $f20, $f0 -+ ble $1, $L15 -+ .align 4 -+ -+ fabs $f20, $f1 -+ unop -+ addl X, INCX, X -+ unop -+ -+ LD $f21, 0 * SIZE(X) -+ fabs $f20, $f2 -+ addl X, INCX, X -+ unop -+ -+ LD $f22, 0 * SIZE(X) -+ fabs $f20, $f3 -+ addl X, INCX, X -+ unop -+ -+ LD $f23, 0 * SIZE(X) -+ fabs $f20, $f4 -+ addl X, INCX, X -+ unop -+ -+ LD $f24, 0 * SIZE(X) -+ addl X, INCX, X -+ fabs $f20, $f5 -+ unop -+ -+ LD $f25, 0 * SIZE(X) -+ fabs $f20, $f6 -+ addl X, INCX, X -+ unop -+ -+ LD $f26, 0 * SIZE(X) -+ fabs $f20, $f28 -+ addl X, INCX, X -+ ldi $1, -1($1) -+ -+ LD $f27, 0 * SIZE(X) -+ unop -+ addl X, INCX, X -+ ble $1, $L13 -+ .align 4 -+ -+$L12: -+ fselne $f16, $f12, $f4, $f4 -+ unop -+ fabs $f20, $f29 -+ fillcs 56 * SIZE(X) -+ -+ fselne $f17, $f13, $f5, $f5 -+ LD $f20, 0 * SIZE(X) -+ fabs $f21, $f30 -+ addl X, INCX, X -+ -+ fselne $f18, $f14, $f6, $f6 -+ LD $f21, 0 * SIZE(X) -+ fabs $f22, $f10 -+ addl X, INCX, X -+ -+ fselne $f19, $f15, $f28, $f28 -+ LD $f22, 0 * SIZE(X) -+ fabs $f23, $f11 -+ addl X, INCX, X -+ -+ fabs $f24, $f12 -+ LD $f23, 0 * SIZE(X) -+ CMPLT($f0, $f29), $f16 -+ addl X, INCX, X -+ -+ fabs $f25, $f13 -+ LD $f24, 0 * SIZE(X) -+ CMPLT($f1, $f30), $f17 -+ addl X, INCX, X -+ -+ fabs $f26, $f14 -+ LD $f25, 0 * SIZE(X) -+ CMPLT($f2, $f10), $f18 -+ addl X, INCX, X -+ -+ fabs $f27, $f15 -+ LD $f26, 0 * SIZE(X) -+ CMPLT($f3, $f11), $f19 -+ addl X, INCX, X -+ -+ fselne $f16, $f29, $f0, $f0 -+ LD $f27, 0 * SIZE(X) -+ CMPLT($f4, $f12), $f16 -+ addl X, INCX, X -+ -+ fselne $f17, $f30, $f1, $f1 -+ unop -+ CMPLT($f5, $f13), $f17 -+ ldi $1, -1($1) # i -- -+ -+ fselne $f18, $f10, $f2, $f2 -+ unop -+ CMPLT($f6, $f14), $f18 -+ unop -+ -+ fselne $f19, $f11, $f3, $f3 -+ unop -+ CMPLT($f28, $f15), $f19 -+ bgt $1,$L12 -+ .align 4 -+ -+$L13: -+ fselne $f16, $f12, $f4, $f4 -+ fabs $f20, $f29 -+ fselne $f17, $f13, $f5, $f5 -+ fabs $f21, $f30 -+ -+ fselne $f18, $f14, $f6, $f6 -+ fabs $f22, $f10 -+ fselne $f19, $f15, $f28, $f28 -+ fabs $f23, $f11 -+ -+ fabs $f24, $f12 -+ CMPLT($f0, $f29), $f16 -+ fabs $f25, $f13 -+ CMPLT($f1, $f30), $f17 -+ -+ fabs $f26, $f14 -+ CMPLT($f2, $f10), $f18 -+ fabs $f27, $f15 -+ CMPLT($f3, $f11), $f19 -+ -+ fselne $f16, $f29, $f0, $f0 -+ CMPLT($f4, $f12), $f16 -+ fselne $f17, $f30, $f1, $f1 -+ CMPLT($f5, $f13), $f17 -+ -+ fselne $f18, $f10, $f2, $f2 -+ CMPLT($f6, $f14), $f18 -+ fselne $f19, $f11, $f3, $f3 -+ CMPLT($f28, $f15), $f19 -+ -+ fselne $f16, $f12, $f4, $f4 -+ CMPLT($f0, $f1), $f16 -+ fselne $f17, $f13, $f5, $f5 -+ CMPLT($f2, $f3), $f17 -+ -+ fselne $f18, $f14, $f6, $f6 -+ CMPLT($f4, $f5), $f18 -+ fselne $f19, $f15, $f28, $f28 -+ CMPLT($f6, $f28), $f19 -+ -+ fselne $f16, $f1, $f0, $f0 -+ fselne $f17, $f3, $f2, $f2 -+ fselne $f18, $f5, $f4, $f4 -+ fselne $f19, $f28, $f6, $f6 -+ -+ CMPLT($f0, $f2), $f16 -+ CMPLT($f4, $f6), $f17 -+ -+ fselne $f16, $f2, $f0, $f0 -+ fselne $f17, $f6, $f4, $f4 -+ -+ CMPLT($f0, $f4), $f16 -+ fselne $f16, $f4, $f0, $f0 -+ .align 4 -+ -+$L15: -+ and N, 7, $1 -+ unop -+ unop -+ ble $1, $L20 -+ .align 4 -+ -+$L16: -+ LD $f20, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ fabs $f20, $f29 -+ CMPLT($f0, $f29), $f16 -+ fselne $f16, $f29, $f0, $f0 -+ -+ ldi $1, -1($1) # i -- -+ bgt $1, $L16 -+ .align 4 -+ -+$L20: -+ sra N, 3, $1 -+ ble $1, $L40 -+ .align 4 -+ -+ LD $f10, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ LD $f11, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f12, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ LD $f13, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f14, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ LD $f15, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f16, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ LD $f17, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ fabs $f10, $f18 -+ fabs $f11, $f19 -+ fabs $f12, $f20 -+ fabs $f13, $f21 -+ -+ ldi $1, -1($1) -+ ble $1, $L23 -+ .align 4 -+ -+$L22: -+ LD $f10, 0 * SIZE(XX) -+ fabs $f14, $f22 -+ addl XX, INCX, XX -+ fcmpeq $f0, $f18, $f2 -+ -+ LD $f11, 0 * SIZE(XX) -+ fabs $f15, $f23 -+ addl XX, INCX, XX -+ fcmpeq $f0, $f19, $f3 -+ -+ LD $f12, 0 * SIZE(XX) -+ fabs $f16, $f24 -+ addl XX, INCX, XX -+ fcmpeq $f0, $f20, $f4 -+ -+ LD $f13, 0 * SIZE(XX) -+ fabs $f17, $f25 -+ addl XX, INCX, XX -+ fcmpeq $f0, $f21, $f5 -+ -+ LD $f14, 0 * SIZE(XX) -+ ldi $1, -1($1) # i -- -+ fcmpeq $f0, $f22, $f26 -+ addl XX, INCX, XX -+ -+ ldi $0, 1($0) -+ fbne $f2, $End -+ -+ LD $f15, 0 * SIZE(XX) -+ fcmpeq $f0, $f23, $f27 -+ ldi $0, 1($0) -+ fbne $f3, $End -+ -+ addl XX, INCX, XX -+ fcmpeq $f0, $f24, $f28 -+ ldi $0, 1($0) -+ fbne $f4, $End -+ -+ LD $f16, 0 * SIZE(XX) -+ fcmpeq $f0, $f25, $f29 -+ ldi $0, 1($0) -+ fbne $f5, $End -+ -+ addl XX, INCX, XX -+ ldi $0, 1($0) -+ fabs $f10, $f18 -+ fbne $f26, $End -+ -+ LD $f17, 0 * SIZE(XX) -+ ldi $0, 1($0) -+ fabs $f11, $f19 -+ fbne $f27, $End -+ -+ addl XX, INCX, XX -+ ldi $0, 1($0) -+ fabs $f12, $f20 -+ fbne $f28, $End -+ -+ ldi $0, 1($0) -+ fabs $f13, $f21 -+ fbne $f29, $End -+ bgt $1, $L22 -+ .align 4 -+ -+$L23: -+ fabs $f14, $f22 -+ fcmpeq $f0, $f18, $f2 -+ fabs $f15, $f23 -+ fcmpeq $f0, $f19, $f3 -+ -+ fabs $f16, $f24 -+ fcmpeq $f0, $f20, $f4 -+ fabs $f17, $f25 -+ fcmpeq $f0, $f21, $f5 -+ -+ fcmpeq $f0, $f22, $f26 -+ ldi $0, 1($0) -+ unop -+ fbne $f2, $End -+ -+ fcmpeq $f0, $f23, $f27 -+ ldi $0, 1($0) -+ unop -+ fbne $f3, $End -+ -+ fcmpeq $f0, $f24, $f28 -+ ldi $0, 1($0) -+ unop -+ fbne $f4, $End -+ -+ fcmpeq $f0, $f25, $f29 -+ ldi $0, 1($0) -+ unop -+ fbne $f5, $End -+ -+ ldi $0, 1($0) -+ fbne $f26, $End -+ ldi $0, 1($0) -+ fbne $f27, $End -+ ldi $0, 1($0) -+ fbne $f28, $End -+ ldi $0, 1($0) -+ fbne $f29, $End -+ .align 4 -+ -+$L40: -+ LD $f20, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ fabs $f20, $f25 -+ fcmpeq $f0, $f25, $f29 -+ -+ ldi $0, 1($0) -+ fbne $f29, $End -+ br $31, $L40 -+ .align 4 -+ -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ -+ fldd $f6, 32($sp) -+ ldi $sp, STACKSIZE($sp) -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/iamax_simd.S b/kernel/sw_64/iamax_simd.S -new file mode 100644 -index 0000000..c7c6c27 ---- /dev/null -+++ b/kernel/sw_64/iamax_simd.S -@@ -0,0 +1,732 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 96 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+#define I $1 -+#define NN $22 -+ -+#ifndef USE_MIN -+#define CMPLT(a, b) fcmplt a, b -+#else -+#define CMPLT(a, b) fcmplt b, a -+#endif -+ -+#ifndef USE_MIN -+#define VCMPLT(a, b) vfcmplt a, b -+#else -+#define VCMPLT(a, b) vfcmplt b, a -+#endif -+ -+#define STACKSIZE 6 * 8 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+#ifdef F_INTERFACE -+ ldl N, 0(N) # n -+ ldl INCX, 0(INCX) # incx -+#endif -+ ldi $sp, -STACKSIZE($sp) -+ mov X, XX -+ mov N, NN -+ .align 4 -+ -+ fstd $f2, 0($sp) -+ fclr $f16 -+ cmplt $31, N, $2 -+ unop -+ -+ fstd $f3, 8($sp) -+ fclr $f17 -+ cmplt $31, INCX, $3 -+ unop -+ -+ fstd $f4, 16($sp) -+ fclr $f18 -+ SXADDQ INCX, $31, INCX -+ unop -+ -+ fstd $f5, 24($sp) -+ fclr $f19 -+ and $2, $3, $2 -+ clr $0 -+ -+ fstd $f6, 32($sp) -+ fclr $f0 -+ unop -+ beq $2, $End # if (n <= 0) or (incx <= 0) return -+ .align 4 -+ -+ cmpeq INCX, SIZE, $3 -+ beq $3, $Sub -+ .align 4 -+ -+ -+/** -+ test the address of Y -+**/ -+ -+ and X, (VEC_LEN*SIZE-1), $3 -+ LD $f10, 0*SIZE(X) -+ fabs $f10, $f0 # init temp max/min result value -+ beq $3, $Align_Access -+ .align 4 -+/** -+ process the unalign address of X -+**/ -+ -+/*if N is too small(less then unroll size), don't need process unalign X. Just jump to remain section.*/ -+ sra NN, 4, I -+ and NN, 15, $3 -+ ble I, $Remain -+ nop -+ -+ sra $3, BASE_SHIFT, $3 -+ ldi $2, VEC_LEN -+ subl $2, $3, $3 -+ nop -+$UnAlign_Y_Loop: -+ LD $f10, 0*SIZE(X) -+ addl X, SIZE, X -+ fabs $f10, $f29 -+ CMPLT($f0, $f29), $f16 -+ -+ fseleq $f16, $f0, $f29, $f0 -+ subl $3, 1, $3 -+ subl NN, 1, NN -+ bgt $3, $UnAlign_Y_Loop -+ .align 4 -+ -+ -+$Align_Access: -+/*search max or min. Unloop 16 */ -+ sra NN, 4, I -+ and NN, 15, $3 -+ ble I, $Remain -+ nop -+ -+ VLD $f10, 0*VEC_LEN*SIZE(X) -+ VLD $f11, 1*VEC_LEN*SIZE(X) -+ VLD $f12, 2*VEC_LEN*SIZE(X) -+ VLD $f13, 3*VEC_LEN*SIZE(X) -+ -+ /*vfabs*/ -+ vcpys $f31, $f10, $f22 -+ vcpys $f31, $f11, $f23 -+ vcpys $f31, $f12, $f24 -+ vcpys $f31, $f13, $f25 -+ -+ vcpyf $f0, $f0 -+ vcpys $f22, $f22, $f1 # copy $f22 -> $f1 -+ vcpys $f22, $f22, $f2 -+ vcpys $f22, $f22, $f3 -+ -+ subl I, 1, I -+ addl X, 16*SIZE, X -+ nop -+ ble I, $MainLoopEnd -+ .align 4 -+$MainLoop: -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ VCMPLT($f0, $f22), $f26 -+ subl I, 1, I -+ VCMPLT($f1, $f23), $f27 -+ -+ VLD $f10, 0*VEC_LEN*SIZE(X) -+ VLD $f11, 1*VEC_LEN*SIZE(X) -+ VLD $f12, 2*VEC_LEN*SIZE(X) -+ VLD $f13, 3*VEC_LEN*SIZE(X) -+ -+ VCMPLT($f2, $f24), $f28 -+ addl X, 16 * SIZE, X -+ nop -+ VCMPLT($f3, $f25), $f29 -+ -+ vfseleq $f26, $f0, $f22, $f0 -+ vfseleq $f27, $f1, $f23, $f1 -+ vfseleq $f28, $f2, $f24, $f2 -+ vfseleq $f29, $f3, $f25, $f3 -+ -+ vcpys $f31, $f10, $f22 -+ vcpys $f31, $f11, $f23 -+ vcpys $f31, $f12, $f24 -+ vcpys $f31, $f13, $f25 -+ -+ bne I, $MainLoop -+ .align 4 -+ -+$MainLoopEnd: -+ VCMPLT($f0, $f22), $f26 -+ VCMPLT($f1, $f23), $f27 -+ VCMPLT($f2, $f24), $f28 -+ VCMPLT($f3, $f25), $f29 -+ -+ vfseleq $f26, $f0, $f22, $f0 -+ vfseleq $f27, $f1, $f23, $f1 -+ vfseleq $f28, $f2, $f24, $f2 -+ vfseleq $f29, $f3, $f25, $f3 -+ -+ /*find the max or min among f0, f1 ,f2 and f3*/ -+ VCMPLT($f0, $f1), $f26 -+ VCMPLT($f2, $f3), $f27 -+ vfseleq $f26, $f0, $f1, $f0 -+ vfseleq $f27, $f2, $f3, $f2 -+ -+ VCMPLT($f0, $f2), $f26 -+ vfseleq $f26, $f0, $f2, $f0 -+ vextf $f0, 1, $f22 -+ vextf $f0, 2, $f23 -+ -+ vextf $f0, 3, $f24 -+ CMPLT($f0, $f22), $f16 -+ CMPLT($f23, $f24), $f17 -+ fseleq $f16, $f0, $f22, $f0 -+ -+ fseleq $f17, $f23, $f24, $f23 -+ CMPLT($f0, $f23), $f18 -+ fseleq $f18, $f0, $f23, $f0 -+ nop -+$Remain: -+ ble $3, $Continuous_FindIndex -+ .align 4 -+$RemainLoop: -+ LD $f20, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ fabs $f20, $f29 -+ CMPLT($f0, $f29), $f16 -+ fseleq $f16, $f0, $f29, $f0 -+ -+ subl $3, 1, $3 -+ bgt $3, $RemainLoop -+ .align 4 -+ /*find index*/ -+$Continuous_FindIndex: -+ sra N, 3, $1 -+ ble $1, $L40 -+ .align 4 -+ -+ LD $f10, 0 * SIZE(XX) -+ LD $f11, 1 * SIZE(XX) -+ LD $f12, 2 * SIZE(XX) -+ LD $f13, 3 * SIZE(XX) -+ -+ -+ LD $f14, 4 * SIZE(XX) -+ LD $f15, 5 * SIZE(XX) -+ LD $f16, 6 * SIZE(XX) -+ LD $f17, 7 * SIZE(XX) -+ -+ -+ fabs $f10, $f18 -+ fabs $f11, $f19 -+ fabs $f12, $f20 -+ fabs $f13, $f21 -+ -+ addl XX, 8*SIZE, XX -+ ldi $1, -1($1) -+ ble $1, $Continuous_FindIndex_Loop -+ .align 4 -+ -+$Continuous_FindIndex_Loop: -+ LD $f10, 0 * SIZE(XX) -+ fabs $f14, $f22 -+ LD $f11, 1 * SIZE(XX) -+ fcmpeq $f0, $f18, $f2 -+ -+ LD $f12, 2 * SIZE(XX) -+ fabs $f15, $f23 -+ LD $f13, 3 * SIZE(XX) -+ fcmpeq $f0, $f19, $f3 -+ -+ LD $f14, 4 * SIZE(XX) -+ fabs $f16, $f24 -+ ldi $1, -1($1) # i -- -+ fcmpeq $f0, $f20, $f4 -+ -+ LD $f15, 5 * SIZE(XX) -+ fabs $f17, $f25 -+ fcmpeq $f0, $f21, $f5 -+ fillcs PREFETCHSIZE * SIZE(X) -+ -+ LD $f16, 6 * SIZE(XX) -+ fcmpeq $f0, $f22, $f26 -+ ldi $0, 1($0) -+ fbne $f2, $End -+ -+ LD $f17, 7 * SIZE(XX) -+ fcmpeq $f0, $f23, $f27 -+ ldi $0, 1($0) -+ fbne $f3, $End -+ -+ addl XX, 8*SIZE, XX -+ fcmpeq $f0, $f24, $f28 -+ ldi $0, 1($0) -+ fbne $f4, $End -+ -+ fcmpeq $f0, $f25, $f29 -+ ldi $0, 1($0) -+ nop -+ fbne $f5, $End -+ -+ ldi $0, 1($0) -+ fabs $f10, $f18 -+ nop -+ fbne $f26, $End -+ -+ ldi $0, 1($0) -+ fabs $f11, $f19 -+ nop -+ fbne $f27, $End -+ -+ ldi $0, 1($0) -+ fabs $f12, $f20 -+ nop -+ fbne $f28, $End -+ -+ ldi $0, 1($0) -+ fabs $f13, $f21 -+ fbne $f29, $End -+ bgt $1, $Continuous_FindIndex_Loop -+ .align 4 -+ -+$Continuous_FindIndex_LoopEnd: -+ fabs $f14, $f22 -+ fcmpeq $f0, $f18, $f2 -+ fabs $f15, $f23 -+ fcmpeq $f0, $f19, $f3 -+ -+ fabs $f16, $f24 -+ fcmpeq $f0, $f20, $f4 -+ fabs $f17, $f25 -+ fcmpeq $f0, $f21, $f5 -+ -+ fcmpeq $f0, $f22, $f26 -+ ldi $0, 1($0) -+ unop -+ fbne $f2, $End -+ -+ fcmpeq $f0, $f23, $f27 -+ ldi $0, 1($0) -+ unop -+ fbne $f3, $End -+ -+ fcmpeq $f0, $f24, $f28 -+ ldi $0, 1($0) -+ unop -+ fbne $f4, $End -+ -+ fcmpeq $f0, $f25, $f29 -+ ldi $0, 1($0) -+ unop -+ fbne $f5, $End -+ -+ ldi $0, 1($0) -+ fbne $f26, $End -+ ldi $0, 1($0) -+ fbne $f27, $End -+ ldi $0, 1($0) -+ fbne $f28, $End -+ ldi $0, 1($0) -+ fbne $f29, $End -+ .align 4 -+ -+ jmp $L40 -+ .align 4 -+$Sub: -+ sra N, 3, $1 -+ LD $f20, 0 * SIZE(X) -+ fabs $f20, $f0 -+ ble $1, $L15 -+ .align 4 -+ -+ fabs $f20, $f1 -+ unop -+ addl X, INCX, X -+ unop -+ -+ LD $f21, 0 * SIZE(X) -+ fabs $f20, $f2 -+ addl X, INCX, X -+ unop -+ -+ LD $f22, 0 * SIZE(X) -+ fabs $f20, $f3 -+ addl X, INCX, X -+ unop -+ -+ LD $f23, 0 * SIZE(X) -+ fabs $f20, $f4 -+ addl X, INCX, X -+ unop -+ -+ LD $f24, 0 * SIZE(X) -+ addl X, INCX, X -+ fabs $f20, $f5 -+ unop -+ -+ LD $f25, 0 * SIZE(X) -+ fabs $f20, $f6 -+ addl X, INCX, X -+ unop -+ -+ LD $f26, 0 * SIZE(X) -+ fabs $f20, $f28 -+ addl X, INCX, X -+ ldi $1, -1($1) -+ -+ LD $f27, 0 * SIZE(X) -+ unop -+ addl X, INCX, X -+ ble $1, $L13 -+ .align 4 -+ -+$L12: -+ fselne $f16, $f12, $f4, $f4 -+ unop -+ fabs $f20, $f29 -+ fillcs 56 * SIZE(X) -+ -+ fselne $f17, $f13, $f5, $f5 -+ LD $f20, 0 * SIZE(X) -+ fabs $f21, $f30 -+ addl X, INCX, X -+ -+ fselne $f18, $f14, $f6, $f6 -+ LD $f21, 0 * SIZE(X) -+ fabs $f22, $f10 -+ addl X, INCX, X -+ -+ fselne $f19, $f15, $f28, $f28 -+ LD $f22, 0 * SIZE(X) -+ fabs $f23, $f11 -+ addl X, INCX, X -+ -+ fabs $f24, $f12 -+ LD $f23, 0 * SIZE(X) -+ CMPLT($f0, $f29), $f16 -+ addl X, INCX, X -+ -+ fabs $f25, $f13 -+ LD $f24, 0 * SIZE(X) -+ CMPLT($f1, $f30), $f17 -+ addl X, INCX, X -+ -+ fabs $f26, $f14 -+ LD $f25, 0 * SIZE(X) -+ CMPLT($f2, $f10), $f18 -+ addl X, INCX, X -+ -+ fabs $f27, $f15 -+ LD $f26, 0 * SIZE(X) -+ CMPLT($f3, $f11), $f19 -+ addl X, INCX, X -+ -+ fselne $f16, $f29, $f0, $f0 -+ LD $f27, 0 * SIZE(X) -+ CMPLT($f4, $f12), $f16 -+ addl X, INCX, X -+ -+ fselne $f17, $f30, $f1, $f1 -+ unop -+ CMPLT($f5, $f13), $f17 -+ ldi $1, -1($1) # i -- -+ -+ fselne $f18, $f10, $f2, $f2 -+ unop -+ CMPLT($f6, $f14), $f18 -+ unop -+ -+ fselne $f19, $f11, $f3, $f3 -+ unop -+ CMPLT($f28, $f15), $f19 -+ bgt $1,$L12 -+ .align 4 -+ -+$L13: -+ fselne $f16, $f12, $f4, $f4 -+ fabs $f20, $f29 -+ fselne $f17, $f13, $f5, $f5 -+ fabs $f21, $f30 -+ -+ fselne $f18, $f14, $f6, $f6 -+ fabs $f22, $f10 -+ fselne $f19, $f15, $f28, $f28 -+ fabs $f23, $f11 -+ -+ fabs $f24, $f12 -+ CMPLT($f0, $f29), $f16 -+ fabs $f25, $f13 -+ CMPLT($f1, $f30), $f17 -+ -+ fabs $f26, $f14 -+ CMPLT($f2, $f10), $f18 -+ fabs $f27, $f15 -+ CMPLT($f3, $f11), $f19 -+ -+ fselne $f16, $f29, $f0, $f0 -+ CMPLT($f4, $f12), $f16 -+ fselne $f17, $f30, $f1, $f1 -+ CMPLT($f5, $f13), $f17 -+ -+ fselne $f18, $f10, $f2, $f2 -+ CMPLT($f6, $f14), $f18 -+ fselne $f19, $f11, $f3, $f3 -+ CMPLT($f28, $f15), $f19 -+ -+ fselne $f16, $f12, $f4, $f4 -+ CMPLT($f0, $f1), $f16 -+ fselne $f17, $f13, $f5, $f5 -+ CMPLT($f2, $f3), $f17 -+ -+ fselne $f18, $f14, $f6, $f6 -+ CMPLT($f4, $f5), $f18 -+ fselne $f19, $f15, $f28, $f28 -+ CMPLT($f6, $f28), $f19 -+ -+ fselne $f16, $f1, $f0, $f0 -+ fselne $f17, $f3, $f2, $f2 -+ fselne $f18, $f5, $f4, $f4 -+ fselne $f19, $f28, $f6, $f6 -+ -+ CMPLT($f0, $f2), $f16 -+ CMPLT($f4, $f6), $f17 -+ -+ fselne $f16, $f2, $f0, $f0 -+ fselne $f17, $f6, $f4, $f4 -+ -+ CMPLT($f0, $f4), $f16 -+ fselne $f16, $f4, $f0, $f0 -+ .align 4 -+ -+$L15: -+ and N, 7, $1 -+ unop -+ unop -+ ble $1, $L20 -+ .align 4 -+ -+$L16: -+ LD $f20, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ fabs $f20, $f29 -+ CMPLT($f0, $f29), $f16 -+ fselne $f16, $f29, $f0, $f0 -+ -+ ldi $1, -1($1) # i -- -+ bgt $1, $L16 -+ .align 4 -+ -+/* -+ find the index -+*/ -+$L20: -+ sra N, 3, $1 -+ ble $1, $L40 -+ .align 4 -+ -+ LD $f10, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ LD $f11, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f12, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ LD $f13, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f14, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ LD $f15, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f16, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ LD $f17, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ fabs $f10, $f18 -+ fabs $f11, $f19 -+ fabs $f12, $f20 -+ fabs $f13, $f21 -+ -+ ldi $1, -1($1) -+ ble $1, $L23 -+ .align 4 -+ -+$L22: -+ LD $f10, 0 * SIZE(XX) -+ fabs $f14, $f22 -+ addl XX, INCX, XX -+ fcmpeq $f0, $f18, $f2 -+ -+ LD $f11, 0 * SIZE(XX) -+ fabs $f15, $f23 -+ addl XX, INCX, XX -+ fcmpeq $f0, $f19, $f3 -+ -+ LD $f12, 0 * SIZE(XX) -+ fabs $f16, $f24 -+ addl XX, INCX, XX -+ fcmpeq $f0, $f20, $f4 -+ -+ LD $f13, 0 * SIZE(XX) -+ fabs $f17, $f25 -+ addl XX, INCX, XX -+ fcmpeq $f0, $f21, $f5 -+ -+ LD $f14, 0 * SIZE(XX) -+ ldi $1, -1($1) # i -- -+ fcmpeq $f0, $f22, $f26 -+ addl XX, INCX, XX -+ -+ ldi $0, 1($0) -+ fbne $f2, $End -+ -+ LD $f15, 0 * SIZE(XX) -+ fcmpeq $f0, $f23, $f27 -+ ldi $0, 1($0) -+ fbne $f3, $End -+ -+ addl XX, INCX, XX -+ fcmpeq $f0, $f24, $f28 -+ ldi $0, 1($0) -+ fbne $f4, $End -+ -+ LD $f16, 0 * SIZE(XX) -+ fcmpeq $f0, $f25, $f29 -+ ldi $0, 1($0) -+ fbne $f5, $End -+ -+ addl XX, INCX, XX -+ ldi $0, 1($0) -+ fabs $f10, $f18 -+ fbne $f26, $End -+ -+ LD $f17, 0 * SIZE(XX) -+ ldi $0, 1($0) -+ fabs $f11, $f19 -+ fbne $f27, $End -+ -+ addl XX, INCX, XX -+ ldi $0, 1($0) -+ fabs $f12, $f20 -+ fbne $f28, $End -+ -+ ldi $0, 1($0) -+ fabs $f13, $f21 -+ fbne $f29, $End -+ bgt $1, $L22 -+ .align 4 -+ -+$L23: -+ fabs $f14, $f22 -+ fcmpeq $f0, $f18, $f2 -+ fabs $f15, $f23 -+ fcmpeq $f0, $f19, $f3 -+ -+ fabs $f16, $f24 -+ fcmpeq $f0, $f20, $f4 -+ fabs $f17, $f25 -+ fcmpeq $f0, $f21, $f5 -+ -+ fcmpeq $f0, $f22, $f26 -+ ldi $0, 1($0) -+ unop -+ fbne $f2, $End -+ -+ fcmpeq $f0, $f23, $f27 -+ ldi $0, 1($0) -+ unop -+ fbne $f3, $End -+ -+ fcmpeq $f0, $f24, $f28 -+ ldi $0, 1($0) -+ unop -+ fbne $f4, $End -+ -+ fcmpeq $f0, $f25, $f29 -+ ldi $0, 1($0) -+ unop -+ fbne $f5, $End -+ -+ ldi $0, 1($0) -+ fbne $f26, $End -+ ldi $0, 1($0) -+ fbne $f27, $End -+ ldi $0, 1($0) -+ fbne $f28, $End -+ ldi $0, 1($0) -+ fbne $f29, $End -+ .align 4 -+ -+$L40: -+ LD $f20, 0 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ fabs $f20, $f25 -+ fcmpeq $f0, $f25, $f29 -+ -+ ldi $0, 1($0) -+ fbne $f29, $End -+ br $31, $L40 -+ .align 4 -+ -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ -+ fldd $f6, 32($sp) -+ ldi $sp, STACKSIZE($sp) -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/imax.S b/kernel/sw_64/imax.S -new file mode 100644 -index 0000000..b0cf5c8 ---- /dev/null -+++ b/kernel/sw_64/imax.S -@@ -0,0 +1,351 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#ifndef USE_MIN -+#define CMPLT(a, b) cmptlt a, b -+#else -+#define CMPLT(a, b) cmptlt b, a -+#endif -+ -+#define STACKSIZE 8 * 8 -+ -+ PROLOGUE -+ PROFCODE -+ -+ clr $0 -+ mov X, XX -+ .align 4 -+ -+ cmplt $31, N, $2 -+ cmplt $31, INCX, $3 -+ SXADDQ INCX, $31, INCX -+ and $2, $3, $2 -+ -+ sra N, 3, $1 -+ fclr $f0 -+ unop -+ beq $2, $End # if (n <= 0) or (incx <= 0) return -+ .align 4 -+ -+ LD $f0, 0 * SIZE(X) -+ unop -+ unop -+ ble $1, $L15 -+ .align 4 -+ -+ fmov $f0, $f1 -+ addq X, INCX, X -+ fmov $f0, $f10 -+ lda $1, -1($1) -+ -+ LD $f21, 0 * SIZE(X) -+ fmov $f0, $f11 -+ addq X, INCX, X -+ fmov $f0, $f12 -+ -+ LD $f22, 0 * SIZE(X) -+ fmov $f0, $f13 -+ addq X, INCX, X -+ fmov $f0, $f14 -+ -+ LD $f23, 0 * SIZE(X) -+ fmov $f0, $f15 -+ addq X, INCX, X -+ fmov $f0, $f20 -+ -+ LD $f24, 0 * SIZE(X) -+ addq X, INCX, X -+ LD $f25, 0 * SIZE(X) -+ addq X, INCX, X -+ LD $f26, 0 * SIZE(X) -+ addq X, INCX, X -+ LD $f27, 0 * SIZE(X) -+ addq X, INCX, X -+ -+ CMPLT($f0, $f20), $f16 -+ CMPLT($f1, $f21), $f17 -+ CMPLT($f10, $f22), $f18 -+ CMPLT($f11, $f23), $f19 -+ -+ ble $1, $L13 -+ .align 4 -+ -+$L12: -+ fcmovne $f16, $f20, $f0 -+ LD $f20, 0 * SIZE(X) -+ CMPLT($f12, $f24), $f16 -+ addq X, INCX, X -+ -+ fcmovne $f17, $f21, $f1 -+ LD $f21, 0 * SIZE(X) -+ CMPLT($f13, $f25), $f17 -+ addq X, INCX, X -+ -+ fcmovne $f18, $f22, $f10 -+ LD $f22, 0 * SIZE(X) -+ CMPLT($f14, $f26), $f18 -+ addq X, INCX, X -+ -+ fcmovne $f19, $f23, $f11 -+ LD $f23, 0 * SIZE(X) -+ CMPLT($f15, $f27), $f19 -+ addq X, INCX, X -+ -+ fcmovne $f16, $f24, $f12 -+ LD $f24, 0 * SIZE(X) -+ CMPLT($f0, $f20), $f16 -+ addq X, INCX, X -+ -+ fcmovne $f17, $f25, $f13 -+ LD $f25, 0 * SIZE(X) -+ CMPLT($f1, $f21), $f17 -+ addq X, INCX, X -+ -+ fcmovne $f18, $f26, $f14 -+ LD $f26, 0 * SIZE(X) -+ CMPLT($f10, $f22), $f18 -+ addq X, INCX, X -+ -+ fcmovne $f19, $f27, $f15 -+ LD $f27, 0 * SIZE(X) -+ CMPLT($f11, $f23), $f19 -+ lda $1, -1($1) # i -- -+ -+ addq X, INCX, X -+ unop -+ unop -+ bgt $1,$L12 -+ .align 4 -+ -+$L13: -+ fcmovne $f16, $f20, $f0 -+ CMPLT($f12, $f24), $f16 -+ -+ fcmovne $f17, $f21, $f1 -+ CMPLT($f13, $f25), $f17 -+ -+ fcmovne $f18, $f22, $f10 -+ CMPLT($f14, $f26), $f18 -+ -+ fcmovne $f19, $f23, $f11 -+ CMPLT($f15, $f27), $f19 -+ -+ fcmovne $f16, $f24, $f12 -+ CMPLT($f0, $f1), $f16 -+ fcmovne $f17, $f25, $f13 -+ CMPLT($f10, $f11), $f17 -+ -+ fcmovne $f18, $f26, $f14 -+ CMPLT($f12, $f13), $f18 -+ fcmovne $f19, $f27, $f15 -+ CMPLT($f14, $f15), $f19 -+ -+ fcmovne $f16, $f1, $f0 -+ fcmovne $f17, $f11, $f10 -+ fcmovne $f18, $f13, $f12 -+ fcmovne $f19, $f15, $f14 -+ -+ CMPLT($f0, $f10), $f16 -+ CMPLT($f12, $f14), $f17 -+ -+ fcmovne $f16, $f10, $f0 -+ fcmovne $f17, $f14, $f12 -+ -+ CMPLT($f0, $f12), $f16 -+ fcmovne $f16, $f12, $f0 -+ .align 4 -+ -+$L15: -+ and N, 7, $1 -+ unop -+ unop -+ ble $1, $L20 -+ .align 4 -+ -+$L16: -+ LD $f20, 0 * SIZE(X) -+ addq X, INCX, X -+ -+ CMPLT($f0, $f20), $f16 -+ fcmovne $f16, $f20, $f0 -+ lda $1, -1($1) # i -- -+ bgt $1, $L16 -+ .align 4 -+ -+$L20: -+ sra N, 3, $1 -+ ble $1, $L40 -+ .align 4 -+ -+ LD $f10, 0 * SIZE(XX) -+ addq XX, INCX, XX -+ LD $f11, 0 * SIZE(XX) -+ addq XX, INCX, XX -+ -+ LD $f12, 0 * SIZE(XX) -+ addq XX, INCX, XX -+ LD $f13, 0 * SIZE(XX) -+ addq XX, INCX, XX -+ -+ LD $f14, 0 * SIZE(XX) -+ addq XX, INCX, XX -+ LD $f15, 0 * SIZE(XX) -+ addq XX, INCX, XX -+ -+ LD $f16, 0 * SIZE(XX) -+ addq XX, INCX, XX -+ LD $f17, 0 * SIZE(XX) -+ addq XX, INCX, XX -+ -+ cmpteq $f0, $f10, $f20 -+ cmpteq $f0, $f11, $f21 -+ cmpteq $f0, $f12, $f22 -+ cmpteq $f0, $f13, $f23 -+ -+ lda $1, -1($1) -+ ble $1, $L23 -+ .align 4 -+ -+$L22: -+ LD $f10, 0 * SIZE(XX) -+ cmpteq $f0, $f14, $f24 -+ lda $0, 1($0) -+ addq XX, INCX, XX -+ fbne $f20, $End -+ -+ LD $f11, 0 * SIZE(XX) -+ cmpteq $f0, $f15, $f25 -+ lda $0, 1($0) -+ addq XX, INCX, XX -+ fbne $f21, $End -+ -+ LD $f12, 0 * SIZE(XX) -+ cmpteq $f0, $f16, $f26 -+ lda $0, 1($0) -+ addq XX, INCX, XX -+ fbne $f22, $End -+ -+ LD $f13, 0 * SIZE(XX) -+ cmpteq $f0, $f17, $f27 -+ lda $0, 1($0) -+ addq XX, INCX, XX -+ fbne $f23, $End -+ -+ LD $f14, 0 * SIZE(XX) -+ cmpteq $f0, $f10, $f20 -+ lda $0, 1($0) -+ addq XX, INCX, XX -+ fbne $f24, $End -+ -+ LD $f15, 0 * SIZE(XX) -+ cmpteq $f0, $f11, $f21 -+ lda $0, 1($0) -+ addq XX, INCX, XX -+ fbne $f25, $End -+ -+ LD $f16, 0 * SIZE(XX) -+ lda $1, -1($1) # i -- -+ cmpteq $f0, $f12, $f22 -+ lda $0, 1($0) -+ addq XX, INCX, XX -+ fbne $f26, $End -+ -+ LD $f17, 0 * SIZE(XX) -+ cmpteq $f0, $f13, $f23 -+ lda $0, 1($0) -+ addq XX, INCX, XX -+ fbne $f27, $End -+ -+ bgt $1, $L22 -+ .align 4 -+ -+$L23: -+ lda $0, 1($0) -+ cmpteq $f0, $f14, $f24 -+ unop -+ fbne $f20, $End -+ -+ lda $0, 1($0) -+ cmpteq $f0, $f15, $f25 -+ unop -+ fbne $f21, $End -+ -+ lda $0, 1($0) -+ cmpteq $f0, $f16, $f26 -+ unop -+ fbne $f22, $End -+ -+ lda $0, 1($0) -+ cmpteq $f0, $f17, $f27 -+ unop -+ fbne $f23, $End -+ -+ lda $0, 1($0) -+ fbne $f24, $End -+ lda $0, 1($0) -+ fbne $f25, $End -+ lda $0, 1($0) -+ fbne $f26, $End -+ lda $0, 1($0) -+ fbne $f27, $End -+ .align 4 -+ -+$L40: -+ LD $f20, 0 * SIZE(XX) -+ addq XX, INCX, XX -+ -+ cmpteq $f0, $f20, $f29 -+ -+ lda $0, 1($0) -+ fbne $f29, $End -+ br $31, $L40 -+ .align 4 -+ -+$End: -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/imax.c b/kernel/sw_64/imax.c -new file mode 100644 -index 0000000..5072dd1 ---- /dev/null -+++ b/kernel/sw_64/imax.c -@@ -0,0 +1,69 @@ -+/*************************************************************************** -+Copyright (c) 2013, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+ -+/************************************************************************************** -+* 2013/09/14 Saar -+* BLASTEST float : NoTest -+* BLASTEST double : NoTest -+* CTEST : NoTest -+* TEST : NoTest -+* -+**************************************************************************************/ -+ -+#include "common.h" -+#include -+ -+ -+ -+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -+{ -+ BLASLONG i=0; -+ BLASLONG ix=0; -+ FLOAT maxf=0.0; -+ BLASLONG max=0; -+ -+ if (n <= 0 || inc_x <= 0) return(max); -+ -+ maxf=x[0]; -+ ix += inc_x; -+ i++; -+ -+ while(i < n) -+ { -+ if( x[ix] > maxf ) -+ { -+ max = i; -+ maxf = x[ix]; -+ } -+ ix += inc_x; -+ i++; -+ } -+ return(max+1); -+} -+ -+ -diff --git a/kernel/sw_64/imin.c b/kernel/sw_64/imin.c -new file mode 100644 -index 0000000..ffc6522 ---- /dev/null -+++ b/kernel/sw_64/imin.c -@@ -0,0 +1,67 @@ -+/*************************************************************************** -+Copyright (c) 2013, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+ -+/************************************************************************************** -+* 2013/08/19 Saar -+* BLASTEST float -+* BLASTEST double -+* -+**************************************************************************************/ -+ -+#include "common.h" -+#include -+ -+ -+ -+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -+{ -+ BLASLONG i=0; -+ BLASLONG ix=0; -+ FLOAT minf=0.0; -+ BLASLONG min=0; -+ -+ if (n <= 0 || inc_x <= 0) return(min); -+ -+ minf=x[0]; -+ ix += inc_x; -+ i++; -+ -+ while(i < n) -+ { -+ if( x[ix] < minf ) -+ { -+ min = i; -+ minf = x[ix]; -+ } -+ ix += inc_x; -+ i++; -+ } -+ return(min+1); -+} -+ -+ -diff --git a/kernel/sw_64/izamax.S b/kernel/sw_64/izamax.S -new file mode 100644 -index 0000000..5ccc60e ---- /dev/null -+++ b/kernel/sw_64/izamax.S -@@ -0,0 +1,429 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#ifndef USE_MIN -+#define CMPLT(a, b) fcmplt a, b -+#else -+#define CMPLT(a, b) fcmplt b, a -+#endif -+ -+#define STACKSIZE 8 * 8 -+ -+ PROLOGUE -+ PROFCODE -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ fstd $f2, 0($sp) -+ fclr $f16 -+ cmplt $31, N, $2 -+ unop -+ -+ fstd $f3, 8($sp) -+ fclr $f17 -+ cmplt $31, INCX, $3 -+ unop -+ -+ fstd $f4, 16($sp) -+ fclr $f18 -+ SXADDQ INCX, $31, INCX -+ unop -+ -+ fstd $f5, 24($sp) -+ fclr $f19 -+ and $2, $3, $2 -+ clr $0 -+ -+ fstd $f6, 32($sp) -+ mov X, XX -+ -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ -+ fclr $f0 -+ beq $2, $End # if (n <= 0) or (incx <= 0) return -+ .align 4 -+ -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ sra N, 2, $1 -+ addl INCX, INCX, INCX -+ -+ fabs $f20, $f20 -+ fabs $f21, $f21 -+ faddd $f20, $f21, $f0 -+ ble $1, $L15 -+ .align 4 -+ -+ ldi $1, -1($1) -+ unop -+ addl X, INCX, X -+ unop -+ -+ LD $f22, 0 * SIZE(X) -+ fmov $f0, $f1 -+ LD $f23, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ LD $f24, 0 * SIZE(X) -+ fmov $f0, $f2 -+ LD $f25, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ LD $f26, 0 * SIZE(X) -+ fmov $f0, $f3 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ fabs $f20, $f8 -+ fabs $f21, $f9 -+ fabs $f22, $f10 -+ fabs $f23, $f11 -+ -+ fabs $f24, $f12 -+ fabs $f25, $f13 -+ fabs $f26, $f14 -+ fabs $f27, $f15 -+ -+ ble $1, $L14 -+ .align 4 -+ -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ ldi $1, -1($1) -+ addl X, INCX, X -+ -+ LD $f22, 0 * SIZE(X) -+ LD $f23, 1 * SIZE(X) -+ unop -+ addl X, INCX, X -+ -+ LD $f24, 0 * SIZE(X) -+ LD $f25, 1 * SIZE(X) -+ unop -+ addl X, INCX, X -+ -+ LD $f26, 0 * SIZE(X) -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ble $1, $L13 -+ .align 4 -+ -+$L12: -+ faddd $f8, $f9, $f16 -+ unop -+ fabs $f20, $f8 -+ fillcs 64 * SIZE(X) -+ -+ faddd $f10, $f11, $f17 -+ unop -+ fabs $f21, $f9 -+ LD $f20, 0 * SIZE(X) -+ -+ faddd $f12, $f13, $f18 -+ LD $f21, 1 * SIZE(X) -+ fabs $f22, $f10 -+ addl X, INCX, X -+ -+ faddd $f14, $f15, $f19 -+ LD $f22, 0 * SIZE(X) -+ fabs $f23, $f11 -+ unop -+ -+ CMPLT($f0, $f16), $f4 -+ LD $f23, 1 * SIZE(X) -+ fabs $f24, $f12 -+ addl X, INCX, X -+ -+ CMPLT($f1, $f17), $f5 -+ LD $f24, 0 * SIZE(X) -+ fabs $f25, $f13 -+ unop -+ -+ CMPLT($f2, $f18), $f6 -+ LD $f25, 1 * SIZE(X) -+ fabs $f26, $f14 -+ addl X, INCX, X -+ -+ CMPLT($f3, $f19), $f7 -+ LD $f26, 0 * SIZE(X) -+ fabs $f27, $f15 -+ unop -+ -+ fselne $f4, $f16, $f0, $f0 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ldi $1, -1($1) # i -- -+ -+ fselne $f5, $f17, $f1, $f1 -+ fselne $f6, $f18, $f2, $f2 -+ fselne $f7, $f19, $f3, $f3 -+ bgt $1,$L12 -+ .align 4 -+ -+$L13: -+ faddd $f8, $f9, $f16 -+ fabs $f20, $f8 -+ -+ faddd $f10, $f11, $f17 -+ fabs $f21, $f9 -+ -+ faddd $f12, $f13, $f18 -+ fabs $f22, $f10 -+ -+ faddd $f14, $f15, $f19 -+ fabs $f23, $f11 -+ -+ CMPLT($f0, $f16), $f4 -+ fabs $f24, $f12 -+ -+ CMPLT($f1, $f17), $f5 -+ fabs $f25, $f13 -+ -+ CMPLT($f2, $f18), $f6 -+ fabs $f26, $f14 -+ CMPLT($f3, $f19), $f7 -+ fabs $f27, $f15 -+ -+ fselne $f4, $f16, $f0, $f0 -+ fselne $f5, $f17, $f1, $f1 -+ fselne $f6, $f18, $f2, $f2 -+ fselne $f7, $f19, $f3, $f3 -+ .align 4 -+ -+$L14: -+ faddd $f8, $f9, $f16 -+ faddd $f10, $f11, $f17 -+ faddd $f12, $f13, $f18 -+ faddd $f14, $f15, $f19 -+ -+ CMPLT($f0, $f16), $f4 -+ CMPLT($f1, $f17), $f5 -+ CMPLT($f2, $f18), $f6 -+ CMPLT($f3, $f19), $f7 -+ -+ fselne $f4, $f16, $f0, $f0 -+ fselne $f5, $f17, $f1, $f1 -+ fselne $f6, $f18, $f2, $f2 -+ fselne $f7, $f19, $f3, $f3 -+ -+ CMPLT($f0, $f1), $f16 -+ CMPLT($f2, $f3), $f17 -+ -+ fselne $f16, $f1, $f0, $f0 -+ fselne $f17, $f3, $f2, $f2 -+ -+ CMPLT($f0, $f2), $f16 -+ fselne $f16, $f2, $f0, $f0 -+ .align 4 -+ -+$L15: -+ and N, 3, $1 -+ unop -+ unop -+ ble $1, $L20 -+ .align 4 -+ -+$L16: -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ unop -+ addl X, INCX, X -+ -+ fabs $f20, $f29 -+ fabs $f21, $f30 -+ faddd $f29, $f30, $f24 -+ fmov $f24,$f29 -+ -+ CMPLT($f0, $f29), $f16 -+ fselne $f16, $f29, $f0, $f0 -+ -+ ldi $1, -1($1) # i -- -+ bgt $1, $L16 -+ .align 4 -+ -+$L20: -+ sra N, 2, $1 -+ ble $1, $L40 -+ .align 4 -+ -+ LD $f10, 0 * SIZE(XX) -+ LD $f11, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f12, 0 * SIZE(XX) -+ LD $f13, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f14, 0 * SIZE(XX) -+ LD $f15, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f16, 0 * SIZE(XX) -+ LD $f17, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ fabs $f10, $f18 -+ fabs $f11, $f19 -+ fabs $f12, $f20 -+ fabs $f13, $f21 -+ -+ ldi $1, -1($1) -+ ble $1, $L23 -+ .align 4 -+ -+$L22: -+ LD $f10, 0 * SIZE(XX) -+ fabs $f14, $f22 -+ LD $f11, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f12, 0 * SIZE(XX) -+ fabs $f15, $f23 -+ LD $f13, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f14, 0 * SIZE(XX) -+ fabs $f16, $f24 -+ LD $f15, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f16, 0 * SIZE(XX) -+ fabs $f17, $f25 -+ LD $f17, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ faddd $f18, $f19, $f4 -+ faddd $f20, $f21, $f5 -+ faddd $f22, $f23, $f6 -+ faddd $f24, $f25, $f7 -+ -+ fcmpeq $f0, $f4, $f26 -+ fcmpeq $f0, $f5, $f27 -+ fcmpeq $f0, $f6, $f28 -+ fcmpeq $f0, $f7, $f29 -+ -+ fabs $f10, $f18 -+ ldi $0, 1($0) -+ ldi $1, -1($1) # i -- -+ fbne $f26, $End -+ -+ fabs $f11, $f19 -+ ldi $0, 1($0) -+ unop -+ fbne $f27, $End -+ -+ fabs $f12, $f20 -+ ldi $0, 1($0) -+ unop -+ fbne $f28, $End -+ -+ fabs $f13, $f21 -+ ldi $0, 1($0) -+ fbne $f29, $End -+ bgt $1, $L22 -+ .align 4 -+ -+$L23: -+ fabs $f14, $f22 -+ fabs $f15, $f23 -+ fabs $f16, $f24 -+ fabs $f17, $f25 -+ -+ faddd $f18, $f19, $f4 -+ faddd $f20, $f21, $f5 -+ faddd $f22, $f23, $f6 -+ faddd $f24, $f25, $f7 -+ -+ fcmpeq $f0, $f4, $f26 -+ fcmpeq $f0, $f5, $f27 -+ fcmpeq $f0, $f6, $f28 -+ fcmpeq $f0, $f7, $f29 -+ -+ ldi $0, 1($0) -+ fbne $f26, $End -+ ldi $0, 1($0) -+ fbne $f27, $End -+ ldi $0, 1($0) -+ fbne $f28, $End -+ ldi $0, 1($0) -+ fbne $f29, $End -+ .align 4 -+ -+$L40: -+ LD $f10, 0 * SIZE(XX) -+ LD $f11, 1 * SIZE(XX) -+ -+ addl XX, INCX, XX -+ -+ fabs $f10, $f18 -+ fabs $f11, $f19 -+ -+ faddd $f18, $f19, $f2 -+ fmov $f2,$f18 -+ fcmpeq $f0, $f18, $f2 -+ -+ ldi $0, 1($0) -+ fbne $f2, $End -+ br $31, $L40 -+ .align 4 -+ -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldi $sp, STACKSIZE($sp) -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/izamax.S.bak b/kernel/sw_64/izamax.S.bak -new file mode 100644 -index 0000000..34e4c88 ---- /dev/null -+++ b/kernel/sw_64/izamax.S.bak -@@ -0,0 +1,427 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#ifndef USE_MIN -+#define CMPLT(a, b) fcmplt a, b -+#else -+#define CMPLT(a, b) fcmplt b, a -+#endif -+ -+#define STACKSIZE 8 * 8 -+ -+ PROLOGUE -+ PROFCODE -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ fstd $f2, 0($sp) -+ fclr $f16 -+ cmplt $31, N, $2 -+ unop -+ -+ fstd $f3, 8($sp) -+ fclr $f17 -+ cmplt $31, INCX, $3 -+ unop -+ -+ fstd $f4, 16($sp) -+ fclr $f18 -+ SXADDQ INCX, $31, INCX -+ unop -+ -+ fstd $f5, 24($sp) -+ fclr $f19 -+ and $2, $3, $2 -+ clr $0 -+ -+ fstd $f6, 32($sp) -+ mov X, XX -+ -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ -+ fclr $f0 -+ beq $2, $End # if (n <= 0) or (incx <= 0) return -+ .align 4 -+ -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ sra N, 2, $1 -+ addl INCX, INCX, INCX -+ -+ fabs $f20, $f20 -+ fabs $f21, $f21 -+ faddd $f20, $f21, $f0 -+ ble $1, $L15 -+ .align 4 -+ -+ ldi $1, -1($1) -+ unop -+ addl X, INCX, X -+ unop -+ -+ LD $f22, 0 * SIZE(X) -+ fmov $f0, $f1 -+ LD $f23, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ LD $f24, 0 * SIZE(X) -+ fmov $f0, $f2 -+ LD $f25, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ LD $f26, 0 * SIZE(X) -+ fmov $f0, $f3 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ fabs $f20, $f8 -+ fabs $f21, $f9 -+ fabs $f22, $f10 -+ fabs $f23, $f11 -+ -+ fabs $f24, $f12 -+ fabs $f25, $f13 -+ fabs $f26, $f14 -+ fabs $f27, $f15 -+ -+ ble $1, $L14 -+ .align 4 -+ -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ ldi $1, -1($1) -+ addl X, INCX, X -+ -+ LD $f22, 0 * SIZE(X) -+ LD $f23, 1 * SIZE(X) -+ unop -+ addl X, INCX, X -+ -+ LD $f24, 0 * SIZE(X) -+ LD $f25, 1 * SIZE(X) -+ unop -+ addl X, INCX, X -+ -+ LD $f26, 0 * SIZE(X) -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ble $1, $L13 -+ .align 4 -+ -+$L12: -+ faddd $f8, $f9, $f16 -+ unop -+ fabs $f20, $f8 -+ fillcs 64 * SIZE(X) -+ -+ faddd $f10, $f11, $f17 -+ unop -+ fabs $f21, $f9 -+ LD $f20, 0 * SIZE(X) -+ -+ faddd $f12, $f13, $f18 -+ LD $f21, 1 * SIZE(X) -+ fabs $f22, $f10 -+ addl X, INCX, X -+ -+ faddd $f14, $f15, $f19 -+ LD $f22, 0 * SIZE(X) -+ fabs $f23, $f11 -+ unop -+ -+ CMPLT($f0, $f16), $f4 -+ LD $f23, 1 * SIZE(X) -+ fabs $f24, $f12 -+ addl X, INCX, X -+ -+ CMPLT($f1, $f17), $f5 -+ LD $f24, 0 * SIZE(X) -+ fabs $f25, $f13 -+ unop -+ -+ CMPLT($f2, $f18), $f6 -+ LD $f25, 1 * SIZE(X) -+ fabs $f26, $f14 -+ addl X, INCX, X -+ -+ CMPLT($f3, $f19), $f7 -+ LD $f26, 0 * SIZE(X) -+ fabs $f27, $f15 -+ unop -+ -+fselne $f4,$f16,$f0, $f0 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ldi $1, -1($1) # i -- -+ -+fselne $f5,$f17,$f1, $f1 -+fselne $f6,$f18,$f2, $f2 -+fselne $f7,$f19,$f3, $f3 -+ bgt $1,$L12 -+ .align 4 -+ -+$L13: -+ faddd $f8, $f9, $f16 -+ fabs $f20, $f8 -+ -+ faddd $f10, $f11, $f17 -+ fabs $f21, $f9 -+ -+ faddd $f12, $f13, $f18 -+ fabs $f22, $f10 -+ -+ faddd $f14, $f15, $f19 -+ fabs $f23, $f11 -+ -+ CMPLT($f0, $f16), $f4 -+ fabs $f24, $f12 -+ -+ CMPLT($f1, $f17), $f5 -+ fabs $f25, $f13 -+ -+ CMPLT($f2, $f18), $f6 -+ fabs $f26, $f14 -+ CMPLT($f3, $f19), $f7 -+ fabs $f27, $f15 -+ -+fselne $f4,$f16,$f0, $f0 -+fselne $f5,$f17,$f1, $f1 -+fselne $f6,$f18,$f2, $f2 -+fselne $f7,$f19,$f3, $f3 -+ .align 4 -+ -+$L14: -+ faddd $f8, $f9, $f16 -+ faddd $f10, $f11, $f17 -+ faddd $f12, $f13, $f18 -+ faddd $f14, $f15, $f19 -+ -+ CMPLT($f0, $f16), $f4 -+ CMPLT($f1, $f17), $f5 -+ CMPLT($f2, $f18), $f6 -+ CMPLT($f3, $f19), $f7 -+ -+fselne $f4,$f16,$f0, $f0 -+fselne $f5,$f17,$f1, $f1 -+fselne $f6,$f18,$f2, $f2 -+fselne $f7,$f19,$f3, $f3 -+ -+ CMPLT($f0, $f1), $f16 -+ CMPLT($f2, $f3), $f17 -+ -+fselne $f16,$f1,$f0, $f0 -+fselne $f17,$f3,$f2, $f2 -+ -+ CMPLT($f0, $f2), $f16 -+fselne $f16,$f2,$f0, $f0 -+ .align 4 -+ -+$L15: -+ and N, 3, $1 -+ unop -+ unop -+ ble $1, $L20 -+ .align 4 -+ -+$L16: -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ unop -+ addl X, INCX, X -+ -+ fabs $f20, $f29 -+ fabs $f21, $f30 -+ faddd $f29, $f30, $f29 -+ -+ CMPLT($f0, $f29), $f16 -+fselne $f16,$f29,$f0, $f0 -+ -+ ldi $1, -1($1) # i -- -+ bgt $1, $L16 -+ .align 4 -+ -+$L20: -+ sra N, 2, $1 -+ ble $1, $L40 -+ .align 4 -+ -+ LD $f10, 0 * SIZE(XX) -+ LD $f11, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f12, 0 * SIZE(XX) -+ LD $f13, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f14, 0 * SIZE(XX) -+ LD $f15, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f16, 0 * SIZE(XX) -+ LD $f17, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ fabs $f10, $f18 -+ fabs $f11, $f19 -+ fabs $f12, $f20 -+ fabs $f13, $f21 -+ -+ ldi $1, -1($1) -+ ble $1, $L23 -+ .align 4 -+ -+$L22: -+ LD $f10, 0 * SIZE(XX) -+ fabs $f14, $f22 -+ LD $f11, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f12, 0 * SIZE(XX) -+ fabs $f15, $f23 -+ LD $f13, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f14, 0 * SIZE(XX) -+ fabs $f16, $f24 -+ LD $f15, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f16, 0 * SIZE(XX) -+ fabs $f17, $f25 -+ LD $f17, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ faddd $f18, $f19, $f4 -+ faddd $f20, $f21, $f5 -+ faddd $f22, $f23, $f6 -+ faddd $f24, $f25, $f7 -+ -+ fcmpeq $f0, $f4, $f26 -+ fcmpeq $f0, $f5, $f27 -+ fcmpeq $f0, $f6, $f28 -+ fcmpeq $f0, $f7, $f29 -+ -+ fabs $f10, $f18 -+ ldi $0, 1($0) -+ ldi $1, -1($1) # i -- -+ fbne $f26, $End -+ -+ fabs $f11, $f19 -+ ldi $0, 1($0) -+ unop -+ fbne $f27, $End -+ -+ fabs $f12, $f20 -+ ldi $0, 1($0) -+ unop -+ fbne $f28, $End -+ -+ fabs $f13, $f21 -+ ldi $0, 1($0) -+ fbne $f29, $End -+ bgt $1, $L22 -+ .align 4 -+ -+$L23: -+ fabs $f14, $f22 -+ fabs $f15, $f23 -+ fabs $f16, $f24 -+ fabs $f17, $f25 -+ -+ faddd $f18, $f19, $f4 -+ faddd $f20, $f21, $f5 -+ faddd $f22, $f23, $f6 -+ faddd $f24, $f25, $f7 -+ -+ fcmpeq $f0, $f4, $f26 -+ fcmpeq $f0, $f5, $f27 -+ fcmpeq $f0, $f6, $f28 -+ fcmpeq $f0, $f7, $f29 -+ -+ ldi $0, 1($0) -+ fbne $f26, $End -+ ldi $0, 1($0) -+ fbne $f27, $End -+ ldi $0, 1($0) -+ fbne $f28, $End -+ ldi $0, 1($0) -+ fbne $f29, $End -+ .align 4 -+ -+$L40: -+ LD $f10, 0 * SIZE(XX) -+ LD $f11, 1 * SIZE(XX) -+ -+ addl XX, INCX, XX -+ -+ fabs $f10, $f18 -+ fabs $f11, $f19 -+ -+ faddd $f18, $f19, $f18 -+ fcmpeq $f0, $f18, $f2 -+ -+ ldi $0, 1($0) -+ fbne $f2, $End -+ br $31, $L40 -+ .align 4 -+ -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldi $sp, STACKSIZE($sp) -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/izamax_simd.S b/kernel/sw_64/izamax_simd.S -new file mode 100644 -index 0000000..8b00f60 ---- /dev/null -+++ b/kernel/sw_64/izamax_simd.S -@@ -0,0 +1,609 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 96 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#define I $2 -+ -+#ifndef USE_MIN -+#define CMPLT(a, b) fcmplt a, b -+#define VCMPLT(a, b) vfcmplt a, b -+#else -+#define CMPLT(a, b) fcmplt b, a -+#define VCMPLT(a, b) vfcmplt b, a -+#endif -+ -+#define STACKSIZE 8 * 8 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ fstd $f2, 0($sp) -+ fclr $f16 -+ cmplt $31, N, $2 -+ unop -+ -+ fstd $f3, 8($sp) -+ fclr $f17 -+ cmplt $31, INCX, $3 -+ unop -+ -+ fstd $f4, 16($sp) -+ fclr $f18 -+ SXADDQ INCX, $31, INCX -+ unop -+ -+ fstd $f5, 24($sp) -+ fclr $f19 -+ and $2, $3, $2 -+ clr $0 -+ -+ fstd $f6, 32($sp) -+ mov X, XX -+ -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ -+ fclr $f0 -+ cmpeq INCX, SIZE, $3 -+ and X, (VEC_LEN*SIZE-1), $4 # test the address of X (aligment) -+ beq $2, $End # if (n <= 0) or (incx <= 0) return -+ .align 4 -+ -+ bic $3, $4, $3 -+ nop -+ nop -+ beq $3, $Sub -+ .align 4 -+ -+$Align_Access: -+/* -+ Unloop 8*2=16 reals -+*/ -+#ifdef USE_MIN -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ fabs $f20, $f20 -+ fabs $f21, $f21 -+ ADD $f20, $f21, $f0 # init temp min result value -+#endif -+ sra N, 3, I -+ and N, 7, $3 -+ addl INCX, INCX, INCX -+ ble I, $Remain -+ .align 4 -+/* -+ Init max or min value -+*/ -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ fabs $f20, $f20 -+ fabs $f21, $f21 -+ -+ ADD $f20, $f21, $f4 -+ nop -+ vcpyf $f4, $f0 -+ vcpyf $f4, $f1 -+ -+ -+ VLD $f22, 0*VEC_LEN*SIZE(X) -+ VLD $f23, 1*VEC_LEN*SIZE(X) -+ VLD $f24, 2*VEC_LEN*SIZE(X) -+ VLD $f25, 3*VEC_LEN*SIZE(X) -+ -+ /*vfabs*/ -+ vcpys $f31, $f22, $f10 -+ subl I, 1, I -+ vcpys $f31, $f23, $f11 -+ addl X, 16*SIZE, X -+ -+ vcpys $f31, $f24, $f12 -+ nop -+ vcpys $f31, $f25, $f13 -+ ble I, $MainLoopEnd -+ .align 4 -+ -+$MainLoop: -+ vextf $f10, 1, $f4 -+ VLD $f22, 0*VEC_LEN*SIZE(X) -+ vextf $f10, 3, $f5 -+ VLD $f23, 1*VEC_LEN*SIZE(X) -+ -+ vextf $f11, 0, $f6 -+ VLD $f24, 2*VEC_LEN*SIZE(X) -+ vextf $f11, 2, $f7 -+ VLD $f25, 3*VEC_LEN*SIZE(X) -+ -+ vextf $f12, 1, $f14 -+ vextf $f12, 3, $f15 -+ vextf $f13, 0, $f16 -+ vextf $f13, 2, $f17 -+ -+ vinsf $f4, $f11, 0, $f11 -+ vinsf $f6, $f10, 1, $f10 -+ vinsf $f14, $f13, 0, $f13 -+ vinsf $f16, $f12, 1, $f12 -+ -+ vinsf $f5, $f11, 2, $f11 -+ vinsf $f7, $f10, 3, $f10 -+ vinsf $f15, $f13, 2, $f13 -+ vinsf $f17, $f12, 3, $f12 -+ -+ VADD $f10, $f11, $f2 -+ addl X, 16*SIZE, X -+ VADD $f12, $f13, $f3 -+ subl I, 1, I -+ -+ vcpys $f31, $f22, $f10 -+ vcpys $f31, $f23, $f11 -+ VCMPLT($f0, $f2), $f18 -+ VCMPLT($f1, $f3), $f19 -+ -+ vcpys $f31, $f24, $f12 -+ fillcs PREFETCHSIZE * SIZE(X) -+ vcpys $f31, $f25, $f13 -+ nop -+ -+ vfseleq $f18, $f0, $f2, $f0 -+ vfseleq $f19, $f1, $f3, $f1 -+ nop -+ bgt I, $MainLoop -+ .align 4 -+ -+$MainLoopEnd: -+/*spilt the complex vector to real vector($f10,$f12) and image vector ($f11,$f13)*/ -+ vextf $f10, 1, $f4 -+ vextf $f10, 3, $f5 -+ vextf $f11, 0, $f6 -+ vextf $f11, 2, $f7 -+ -+ vextf $f12, 1, $f14 -+ vextf $f12, 3, $f15 -+ vextf $f13, 0, $f16 -+ vextf $f13, 2, $f17 -+ -+ vinsf $f4, $f11, 0, $f11 -+ vinsf $f6, $f10, 1, $f10 -+ vinsf $f14, $f13, 0, $f13 -+ vinsf $f16, $f12, 1, $f12 -+ -+ vinsf $f5, $f11, 2, $f11 -+ vinsf $f7, $f10, 3, $f10 -+ vinsf $f15, $f13, 2, $f13 -+ vinsf $f17, $f12, 3, $f12 -+ -+ VADD $f10, $f11, $f2 -+ VADD $f12, $f13, $f3 -+ VCMPLT($f0, $f2), $f18 -+ VCMPLT($f1, $f3), $f19 -+ -+ vfseleq $f18, $f0, $f2, $f0 -+ vfseleq $f19, $f1, $f3, $f1 -+/*find the max or min between f0 and f1*/ -+ VCMPLT($f0, $f1), $f18 -+ vfseleq $f18, $f0, $f1, $f0 -+ -+ -+ vextf $f0, 1, $f22 -+ vextf $f0, 2, $f23 -+ vextf $f0, 3, $f24 -+ CMPLT($f0, $f22), $f16 -+ -+ CMPLT($f23, $f24), $f17 -+ fseleq $f16, $f0, $f22, $f0 -+ fseleq $f17, $f23, $f24, $f23 -+ CMPLT($f0, $f23), $f18 -+ -+ fseleq $f18, $f0, $f23, $f0 -+ nop -+ .align 4 -+$Remain: -+ ble $3, $Continuous_FindIndex -+ .align 4 -+$RemainLoop: -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ unop -+ addl X, 2*SIZE, X -+ -+ fabs $f20, $f29 -+ fabs $f21, $f30 -+ ADD $f29, $f30, $f29 -+ -+ CMPLT($f0, $f29), $f16 -+ fselne $f16,$f29,$f0, $f0 -+ -+ subl $3, 1, $3 -+ bgt $3, $RemainLoop -+ .align 4 -+ -+ /*find index*/ -+$Continuous_FindIndex: -+ -+ jmp $L20 -+ -+$Sub: -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ sra N, 2, $1 -+ addl INCX, INCX, INCX -+ -+ fabs $f20, $f20 -+ fabs $f21, $f21 -+ ADD $f20, $f21, $f0 -+ ble $1, $L15 -+ .align 4 -+ -+ ldi $1, -1($1) -+ unop -+ addl X, INCX, X -+ unop -+ -+ LD $f22, 0 * SIZE(X) -+ fmov $f0, $f1 -+ LD $f23, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ LD $f24, 0 * SIZE(X) -+ fmov $f0, $f2 -+ LD $f25, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ LD $f26, 0 * SIZE(X) -+ fmov $f0, $f3 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ -+ fabs $f20, $f8 -+ fabs $f21, $f9 -+ fabs $f22, $f10 -+ fabs $f23, $f11 -+ -+ fabs $f24, $f12 -+ fabs $f25, $f13 -+ fabs $f26, $f14 -+ fabs $f27, $f15 -+ -+ ble $1, $L14 -+ .align 4 -+ -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ ldi $1, -1($1) -+ addl X, INCX, X -+ -+ LD $f22, 0 * SIZE(X) -+ LD $f23, 1 * SIZE(X) -+ unop -+ addl X, INCX, X -+ -+ LD $f24, 0 * SIZE(X) -+ LD $f25, 1 * SIZE(X) -+ unop -+ addl X, INCX, X -+ -+ LD $f26, 0 * SIZE(X) -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ble $1, $L13 -+ .align 4 -+ -+$L12: -+ ADD $f8, $f9, $f16 -+ fillcs PREFETCHSIZE * SIZE(X) -+ fabs $f20, $f8 -+ fillcs 64 * SIZE(X) -+ -+ ADD $f10, $f11, $f17 -+ unop -+ fabs $f21, $f9 -+ LD $f20, 0 * SIZE(X) -+ -+ ADD $f12, $f13, $f18 -+ LD $f21, 1 * SIZE(X) -+ fabs $f22, $f10 -+ addl X, INCX, X -+ -+ ADD $f14, $f15, $f19 -+ LD $f22, 0 * SIZE(X) -+ fabs $f23, $f11 -+ unop -+ -+ CMPLT($f0, $f16), $f4 -+ LD $f23, 1 * SIZE(X) -+ fabs $f24, $f12 -+ addl X, INCX, X -+ -+ CMPLT($f1, $f17), $f5 -+ LD $f24, 0 * SIZE(X) -+ fabs $f25, $f13 -+ unop -+ -+ CMPLT($f2, $f18), $f6 -+ LD $f25, 1 * SIZE(X) -+ fabs $f26, $f14 -+ addl X, INCX, X -+ -+ CMPLT($f3, $f19), $f7 -+ LD $f26, 0 * SIZE(X) -+ fabs $f27, $f15 -+ unop -+ -+ fselne $f4,$f16,$f0, $f0 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ldi $1, -1($1) # i -- -+ -+ fselne $f5,$f17,$f1, $f1 -+ fselne $f6,$f18,$f2, $f2 -+ fselne $f7,$f19,$f3, $f3 -+ bgt $1,$L12 -+ .align 4 -+ -+$L13: -+ ADD $f8, $f9, $f16 -+ fabs $f20, $f8 -+ -+ ADD $f10, $f11, $f17 -+ fabs $f21, $f9 -+ -+ ADD $f12, $f13, $f18 -+ fabs $f22, $f10 -+ -+ ADD $f14, $f15, $f19 -+ fabs $f23, $f11 -+ -+ CMPLT($f0, $f16), $f4 -+ fabs $f24, $f12 -+ -+ CMPLT($f1, $f17), $f5 -+ fabs $f25, $f13 -+ -+ CMPLT($f2, $f18), $f6 -+ fabs $f26, $f14 -+ CMPLT($f3, $f19), $f7 -+ fabs $f27, $f15 -+ -+ fselne $f4,$f16,$f0, $f0 -+ fselne $f5,$f17,$f1, $f1 -+ fselne $f6,$f18,$f2, $f2 -+ fselne $f7,$f19,$f3, $f3 -+ .align 4 -+ -+$L14: -+ ADD $f8, $f9, $f16 -+ ADD $f10, $f11, $f17 -+ ADD $f12, $f13, $f18 -+ ADD $f14, $f15, $f19 -+ -+ CMPLT($f0, $f16), $f4 -+ CMPLT($f1, $f17), $f5 -+ CMPLT($f2, $f18), $f6 -+ CMPLT($f3, $f19), $f7 -+ -+ fselne $f4,$f16,$f0, $f0 -+ fselne $f5,$f17,$f1, $f1 -+ fselne $f6,$f18,$f2, $f2 -+ fselne $f7,$f19,$f3, $f3 -+ -+ CMPLT($f0, $f1), $f16 -+ CMPLT($f2, $f3), $f17 -+ -+ fselne $f16,$f1,$f0, $f0 -+ fselne $f17,$f3,$f2, $f2 -+ -+ CMPLT($f0, $f2), $f16 -+ fselne $f16,$f2,$f0, $f0 -+ .align 4 -+ -+$L15: -+ and N, 3, $1 -+ unop -+ unop -+ ble $1, $L20 -+ .align 4 -+ -+$L16: -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ unop -+ addl X, INCX, X -+ -+ fabs $f20, $f29 -+ fabs $f21, $f30 -+ ADD $f29, $f30, $f29 -+ -+ CMPLT($f0, $f29), $f16 -+ fselne $f16,$f29,$f0, $f0 -+ -+ ldi $1, -1($1) # i -- -+ bgt $1, $L16 -+ .align 4 -+ -+$L20: -+ sra N, 2, $1 -+ ble $1, $L40 -+ .align 4 -+ -+ LD $f10, 0 * SIZE(XX) -+ LD $f11, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f12, 0 * SIZE(XX) -+ LD $f13, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f14, 0 * SIZE(XX) -+ LD $f15, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f16, 0 * SIZE(XX) -+ LD $f17, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ fabs $f10, $f18 -+ fabs $f11, $f19 -+ fabs $f12, $f20 -+ fabs $f13, $f21 -+ -+ ldi $1, -1($1) -+ ble $1, $L23 -+ .align 4 -+ -+$L22: -+ LD $f10, 0 * SIZE(XX) -+ fabs $f14, $f22 -+ LD $f11, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f12, 0 * SIZE(XX) -+ fabs $f15, $f23 -+ LD $f13, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f14, 0 * SIZE(XX) -+ fabs $f16, $f24 -+ LD $f15, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ LD $f16, 0 * SIZE(XX) -+ fabs $f17, $f25 -+ LD $f17, 1 * SIZE(XX) -+ addl XX, INCX, XX -+ -+ ADD $f18, $f19, $f4 -+ ADD $f20, $f21, $f5 -+ ADD $f22, $f23, $f6 -+ ADD $f24, $f25, $f7 -+ -+ fcmpeq $f0, $f4, $f26 -+ fcmpeq $f0, $f5, $f27 -+ fcmpeq $f0, $f6, $f28 -+ fcmpeq $f0, $f7, $f29 -+ -+ fabs $f10, $f18 -+ ldi $0, 1($0) -+ ldi $1, -1($1) # i -- -+ fbne $f26, $End -+ -+ fabs $f11, $f19 -+ ldi $0, 1($0) -+ unop -+ fbne $f27, $End -+ -+ fabs $f12, $f20 -+ ldi $0, 1($0) -+ fillcs PREFETCHSIZE * SIZE(X) -+ fbne $f28, $End -+ -+ fabs $f13, $f21 -+ ldi $0, 1($0) -+ fbne $f29, $End -+ bgt $1, $L22 -+ .align 4 -+ -+$L23: -+ fabs $f14, $f22 -+ fabs $f15, $f23 -+ fabs $f16, $f24 -+ fabs $f17, $f25 -+ -+ ADD $f18, $f19, $f4 -+ ADD $f20, $f21, $f5 -+ ADD $f22, $f23, $f6 -+ ADD $f24, $f25, $f7 -+ -+ fcmpeq $f0, $f4, $f26 -+ fcmpeq $f0, $f5, $f27 -+ fcmpeq $f0, $f6, $f28 -+ fcmpeq $f0, $f7, $f29 -+ -+ ldi $0, 1($0) -+ fbne $f26, $End -+ ldi $0, 1($0) -+ fbne $f27, $End -+ ldi $0, 1($0) -+ fbne $f28, $End -+ ldi $0, 1($0) -+ fbne $f29, $End -+ .align 4 -+ -+$L40: -+ LD $f10, 0 * SIZE(XX) -+ LD $f11, 1 * SIZE(XX) -+ -+ addl XX, INCX, XX -+ -+ fabs $f10, $f18 -+ fabs $f11, $f19 -+ -+ ADD $f18, $f19, $f18 -+ fcmpeq $f0, $f18, $f2 -+ -+ ldi $0, 1($0) -+ fbne $f2, $End -+ br $31, $L40 -+ .align 4 -+ -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldi $sp, STACKSIZE($sp) -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/lsame.S b/kernel/sw_64/lsame.S -new file mode 100644 -index 0000000..c2c0863 ---- /dev/null -+++ b/kernel/sw_64/lsame.S -@@ -0,0 +1,77 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#include "version.h" -+ -+ .set noat -+ .set noreorder -+.text -+ .align 5 -+ .globl lsame_ -+ .ent lsame_ -+lsame_: -+ .frame $sp,0,$26,0 -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ lda $28, _mcount -+ jsr $28, ($28), _mcount -+ .prologue 1 -+#else -+ .prologue 0 -+#endif -+ -+ ldbu $5, 0($16) -+ ldbu $6, 0($17) -+// extb $2, $5 -+// extbl $3, $6 -+ -+ subl $5, 96, $1 -+ subl $6, 96, $2 -+ subl $5, 32, $3 -+ subl $6, 32, $4 -+ -+ -+ selgt $1, $3, $5, $5 -+ selgt $2, $4, $6, $6 -+ cmpeq $5, $6, $0 -+ .align 4 -+ -+$End: -+ ret -+ .end lsame_ -+ .ident VERSION -diff --git a/kernel/sw_64/max.S b/kernel/sw_64/max.S -new file mode 100644 -index 0000000..07925d1 ---- /dev/null -+++ b/kernel/sw_64/max.S -@@ -0,0 +1,227 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+ -+#ifndef USE_MIN -+#define CMPLT(a, b) fcmplt a, b -+#else -+#define CMPLT(a, b) fcmplt b, a -+#endif -+ -+#define STACKSIZE 8 * 8 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+#ifdef F_INTERFACE -+ ldl N, 0(N) # n -+ ldl INCX, 0(INCX) # incx -+#endif -+ ldi $sp, -STACKSIZE($sp) -+ nop -+ .align 4 -+ -+ cmplt $31, N, $2 -+ cmplt $31, INCX, $3 -+ SXADDQ INCX, $31, INCX -+ and $2, $3, $0 -+ -+ sra N, 3, $1 -+ fclr $f0 -+ unop -+ beq $0, $End # if (n <= 0) or (incx <= 0) return -+ .align 4 -+ -+ LD $f0, 0 * SIZE(X) -+ unop -+ unop -+ ble $1, $L15 -+ .align 4 -+ -+ fmov $f0, $f1 -+ addl X, INCX, X -+ fmov $f0, $f10 -+ ldi $1, -1($1) -+ -+ LD $f21, 0 * SIZE(X) -+ fmov $f0, $f11 -+ addl X, INCX, X -+ fmov $f0, $f12 -+ -+ LD $f22, 0 * SIZE(X) -+ fmov $f0, $f13 -+ addl X, INCX, X -+ fmov $f0, $f14 -+ -+ LD $f23, 0 * SIZE(X) -+ fmov $f0, $f15 -+ addl X, INCX, X -+ fmov $f0, $f20 -+ -+ LD $f24, 0 * SIZE(X) -+ addl X, INCX, X -+ LD $f25, 0 * SIZE(X) -+ addl X, INCX, X -+ LD $f26, 0 * SIZE(X) -+ addl X, INCX, X -+ LD $f27, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ CMPLT($f0, $f20), $f16 -+ CMPLT($f1, $f21), $f17 -+ CMPLT($f10, $f22), $f18 -+ CMPLT($f11, $f23), $f19 -+ -+ ble $1, $L13 -+ .align 4 -+ -+$L12: -+ fselne $f16, $f20, $f0, $f0 -+ LD $f20, 0 * SIZE(X) -+ CMPLT($f12, $f24), $f16 -+ addl X, INCX, X -+ -+ fselne $f17, $f21, $f1, $f1 -+ LD $f21, 0 * SIZE(X) -+ CMPLT($f13, $f25), $f17 -+ addl X, INCX, X -+ -+ fselne $f18, $f22, $f10, $f10 -+ LD $f22, 0 * SIZE(X) -+ CMPLT($f14, $f26), $f18 -+ addl X, INCX, X -+ -+ fselne $f19, $f23, $f11, $f11 -+ LD $f23, 0 * SIZE(X) -+ CMPLT($f15, $f27), $f19 -+ addl X, INCX, X -+ -+ fselne $f16, $f24, $f12, $f12 -+ LD $f24, 0 * SIZE(X) -+ CMPLT($f0, $f20), $f16 -+ addl X, INCX, X -+ -+ fselne $f17, $f25, $f13, $f13 -+ LD $f25, 0 * SIZE(X) -+ CMPLT($f1, $f21), $f17 -+ addl X, INCX, X -+ -+ fselne $f18, $f26, $f14, $f14 -+ LD $f26, 0 * SIZE(X) -+ CMPLT($f10, $f22), $f18 -+ addl X, INCX, X -+ -+ fselne $f19, $f27, $f15, $f15 -+ LD $f27, 0 * SIZE(X) -+ CMPLT($f11, $f23), $f19 -+ ldi $1, -1($1) # i -- -+ -+ addl X, INCX, X -+ unop -+ unop -+ bgt $1,$L12 -+ .align 4 -+ -+$L13: -+ fselne $f16, $f20, $f0, $f0 -+ CMPLT($f12, $f24), $f16 -+ -+ fselne $f17, $f21, $f1, $f1 -+ CMPLT($f13, $f25), $f17 -+ -+ fselne $f18, $f22, $f10, $f10 -+ CMPLT($f14, $f26), $f18 -+ -+ fselne $f19, $f23, $f11, $f11 -+ CMPLT($f15, $f27), $f19 -+ -+ fselne $f16, $f24, $f12, $f12 -+ CMPLT($f0, $f1), $f16 -+ fselne $f17, $f25, $f13, $f13 -+ CMPLT($f10, $f11), $f17 -+ -+ fselne $f18, $f26, $f14, $f14 -+ CMPLT($f12, $f13), $f18 -+ fselne $f19, $f27, $f15, $f15 -+ CMPLT($f14, $f15), $f19 -+ -+ fselne $f16, $f1, $f0, $f0 -+ fselne $f17, $f11, $f10, $f10 -+ fselne $f18, $f13, $f12, $f12 -+ fselne $f19, $f15, $f14, $f14 -+ -+ CMPLT($f0, $f10), $f16 -+ CMPLT($f12, $f14), $f17 -+ -+ fselne $f16, $f10, $f0, $f0 -+ fselne $f17, $f14, $f12, $f12 -+ -+ CMPLT($f0, $f12), $f16 -+ fselne $f16, $f12, $f0, $f0 -+ .align 4 -+ -+$L15: -+ and N, 7, $1 -+ unop -+ unop -+ ble $1, $End -+ .align 4 -+ -+$L16: -+ LD $f20, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ CMPLT($f0, $f20), $f16 -+ fselne $f16, $f20, $f0, $f0 -+ ldi $1, -1($1) # i -- -+ bgt $1, $L16 -+ .align 4 -+ -+$End: -+ ldi $sp, STACKSIZE($sp) -+ ret -+ -+ EPILOGUE -diff --git a/kernel/sw_64/nrm2_simd.S b/kernel/sw_64/nrm2_simd.S -new file mode 100644 -index 0000000..0888454 ---- /dev/null -+++ b/kernel/sw_64/nrm2_simd.S -@@ -0,0 +1,493 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 80 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#define I $0 -+ -+#define a0 $f0 -+#define a1 $f1 -+#define a2 $f10 -+#define a3 $f11 -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 -+ -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f19 -+#define x4 $f20 -+#define x5 $f21 -+#define x6 $f22 -+#define x7 $f23 -+ -+ PROLOGUE -+ -+ -+ PROFCODE -+ -+ -+ fclr a0 -+ SXADDQ INCX, 0, INCX -+ fclr a1 -+ ble N, $L999 -+ -+ fclr a2 -+ cmpeq INCX, SIZE, $0 -+ fclr a3 -+ beq $0, $L20 #stride access -+ -+/* test the address of X */ -+ and X, (VEC_LEN*SIZE-1), $3 -+ fclr t0 -+ nop -+ bne $3, $UnAlign_ACCESS -+/*Align access. Use simd instructions.*/ -+ sra N, 4, I -+ ble I, $Remain -+ -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t0 #clear s0 vector -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t1 -+ -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t2 -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t3 -+ -+ addl X, 16 * SIZE, X -+ subl I, 1, I -+ nop -+ ble I, $MainLoopEnd -+$MainLoop: -+ fillcs PREFETCHSIZE * SIZE(X) -+ VMAD a0, a0, t0, t0 -+ subl I, 1, I -+ VMAD a1, a1, t1, t1 -+ -+ addl X, 16 * SIZE, X -+ VMAD a2, a2, t2, t2 -+ nop -+ VMAD a3, a3, t3, t3 -+ -+ VLD a0, -4*VEC_LEN*SIZE(X) -+ VLD a1, -3*VEC_LEN*SIZE(X) -+ VLD a2, -2*VEC_LEN*SIZE(X) -+ VLD a3, -1*VEC_LEN*SIZE(X) -+ -+ bgt I, $MainLoop -+ .align 4 -+$MainLoopEnd: -+ VMAD a0, a0, t0, t0 -+ VMAD a1, a1, t1, t1 -+ VMAD a2, a2, t2, t2 -+ VMAD a3, a3, t3, t3 -+ -+ VADD t0, t1, a0 -+ VADD t2, t3, a1 -+ nop -+ VADD a0, a1, t0 -+ -+ vextf t0, 1, t1 -+ vextf t0, 2, t2 -+ vextf t0, 3, t3 -+ nop -+ -+ ADD t0, t1, a2 -+ ADD t2, t3, a3 -+ nop -+ ADD a2, a3, t0 -+ -+ .align 4 -+$Remain: -+ and N, 15, I -+ ble I, $End -+ .align 4 -+$RemainLoop: -+ LD a0, 0 * SIZE(X) -+ addl X, SIZE, X -+ MAD a0, a0, t0, t0 -+ subl I, 1, I -+ -+ bgt I, $RemainLoop -+ .align 4 -+$End: -+ SQRT t0, a0 -+ ret -+ .align 4 -+ -+/*Don't use simd*/ -+ -+$UnAlign_ACCESS: -+ -+ fclr t0 -+ sra N, 4, I -+ fclr t1 -+ ble I, $L15 -+ -+ fclr t2 -+ LD x0, 0 * SIZE(X) -+ fclr t3 -+ LD x1, 1 * SIZE(X) -+ -+ LD x2, 2 * SIZE(X) -+ LD x3, 3 * SIZE(X) -+ LD x4, 4 * SIZE(X) -+ LD x5, 5 * SIZE(X) -+ LD x6, 6 * SIZE(X) -+ LD x7, 7 * SIZE(X) -+ -+ ldi I, -1(I) -+ ble I, $L12 -+ .align 4 -+ -+$L11: -+ ADD a0, t0, a0 -+ fillcs (PREFETCHSIZE) * SIZE(X) -+ MUL x0, x0, t0 -+ LD x0, 8 * SIZE(X) -+ -+ ADD a1, t1, a1 -+ mov X, XX -+ MUL x1, x1, t1 -+ LD x1, 9 * SIZE(X) -+ -+ ADD a2, t2, a2 -+ unop -+ MUL x2, x2, t2 -+ LD x2, 10 * SIZE(X) -+ -+ ADD a3, t3, a3 -+ unop -+ MUL x3, x3, t3 -+ LD x3, 11 * SIZE(X) -+ -+ ADD a0, t0, a0 -+ unop -+ MUL x4, x4, t0 -+ LD x4, 12 * SIZE(X) -+ -+ ADD a1, t1, a1 -+ unop -+ MUL x5, x5, t1 -+ LD x5, 13 * SIZE(X) -+ -+ ADD a2, t2, a2 -+ unop -+ MUL x6, x6, t2 -+ LD x6, 14 * SIZE(X) -+ -+ ADD a3, t3, a3 -+ unop -+ MUL x7, x7, t3 -+ LD x7, 15 * SIZE(X) -+ -+ ADD a0, t0, a0 -+ unop -+ MUL x0, x0, t0 -+ LD x0, 16 * SIZE(X) -+ -+ ADD a1, t1, a1 -+ ldi X, 16 * SIZE(X) -+ MUL x1, x1, t1 -+ LD x1, 17 * SIZE(XX) -+ -+ ADD a2, t2, a2 -+ unop -+ MUL x2, x2, t2 -+ LD x2, 18 * SIZE(XX) -+ -+ ADD a3, t3, a3 -+ unop -+ MUL x3, x3, t3 -+ LD x3, 19 * SIZE(XX) -+ -+ ADD a0, t0, a0 -+ unop -+ MUL x4, x4, t0 -+ LD x4, 20 * SIZE(XX) -+ -+ ADD a1, t1, a1 -+ ldi I, -1(I) -+ MUL x5, x5, t1 -+ LD x5, 21 * SIZE(XX) -+ -+ ADD a2, t2, a2 -+ unop -+ MUL x6, x6, t2 -+ LD x6, 22 * SIZE(XX) -+ -+ ADD a3, t3, a3 -+ MUL x7, x7, t3 -+ LD x7, 23 * SIZE(XX) -+ bgt I, $L11 -+ .align 4 -+ -+$L12: -+ ADD a0, t0, a0 -+ mov X, XX -+ MUL x0, x0, t0 -+ LD x0, 8 * SIZE(X) -+ -+ ADD a1, t1, a1 -+ unop -+ MUL x1, x1, t1 -+ LD x1, 9 * SIZE(X) -+ -+ ADD a2, t2, a2 -+ unop -+ MUL x2, x2, t2 -+ LD x2, 10 * SIZE(X) -+ -+ ADD a3, t3, a3 -+ unop -+ MUL x3, x3, t3 -+ LD x3, 11 * SIZE(X) -+ -+ ADD a0, t0, a0 -+ unop -+ MUL x4, x4, t0 -+ LD x4, 12 * SIZE(XX) -+ -+ ADD a1, t1, a1 -+ unop -+ MUL x5, x5, t1 -+ LD x5, 13 * SIZE(XX) -+ -+ ADD a2, t2, a2 -+ unop -+ MUL x6, x6, t2 -+ LD x6, 14 * SIZE(XX) -+ -+ ADD a3, t3, a3 -+ ldi X, 16 * SIZE(X) -+ MUL x7, x7, t3 -+ LD x7, 15 * SIZE(XX) -+ -+ ADD a0, t0, a0 -+ MUL x0, x0, t0 -+ ADD a1, t1, a1 -+ MUL x1, x1, t1 -+ -+ ADD a2, t2, a2 -+ MUL x2, x2, t2 -+ ADD a3, t3, a3 -+ MUL x3, x3, t3 -+ -+ ADD a0, t0, a0 -+ MUL x4, x4, t0 -+ ADD a1, t1, a1 -+ MUL x5, x5, t1 -+ -+ ADD a2, t2, a2 -+ MUL x6, x6, t2 -+ ADD a3, t3, a3 -+ MUL x7, x7, t3 -+ -+ ADD a1, t1, a1 -+ ADD a2, t2, a2 -+ ADD a3, t3, a3 -+ .align 4 -+ -+$L15: -+ and N, 15, I -+ ble I, $L998 -+ .align 4 -+ -+$L16: -+ LD x0, 0 * SIZE(X) -+ ldi X, 1 * SIZE(X) -+ -+ ADD a0, t0, a0 -+ MUL x0, x0, t0 -+ -+ ldi I, -1(I) -+ bgt I, $L16 -+ bsr $31, $L998 -+ .align 4 -+ -+$L20: -+ fclr t0 -+ sra N, 3, I -+ fclr t1 -+ ble I, $L25 -+ -+ fclr t2 -+ fclr t3 -+ -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x1, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x2, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x3, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ LD x4, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x5, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x6, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ ldi I, -1(I) -+ ble I, $L22 -+ .align 4 -+ -+$L21: -+ ADD a0, t0, a0 -+ LD x7, 0 * SIZE(X) -+ MUL x0, x0, t0 -+ addl X, INCX, X -+ -+ ADD a1, t1, a1 -+ LD x0, 0 * SIZE(X) -+ MUL x1, x1, t1 -+ addl X, INCX, X -+ -+ ADD a2, t2, a2 -+ LD x1, 0 * SIZE(X) -+ MUL x2, x2, t2 -+ addl X, INCX, X -+ -+ ADD a3, t3, a3 -+ LD x2, 0 * SIZE(X) -+ MUL x3, x3, t3 -+ addl X, INCX, X -+ -+ ADD a0, t0, a0 -+ LD x3, 0 * SIZE(X) -+ MUL x4, x4, t0 -+ addl X, INCX, X -+ -+ ADD a1, t1, a1 -+ LD x4, 0 * SIZE(X) -+ MUL x5, x5, t1 -+ addl X, INCX, X -+ -+ ADD a2, t2, a2 -+ LD x5, 0 * SIZE(X) -+ MUL x6, x6, t2 -+ addl X, INCX, X -+ -+ ADD a3, t3, a3 -+ LD x6, 0 * SIZE(X) -+ MUL x7, x7, t3 -+ addl X, INCX, X -+ -+ ldi I, -1(I) -+ bgt I, $L21 -+ .align 4 -+ -+$L22: -+ ADD a0, t0, a0 -+ LD x7, 0 * SIZE(X) -+ MUL x0, x0, t0 -+ addl X, INCX, X -+ -+ ADD a1, t1, a1 -+ unop -+ MUL x1, x1, t1 -+ unop -+ -+ ADD a2, t2, a2 -+ MUL x2, x2, t2 -+ ADD a3, t3, a3 -+ MUL x3, x3, t3 -+ -+ ADD a0, t0, a0 -+ MUL x4, x4, t0 -+ ADD a1, t1, a1 -+ MUL x5, x5, t1 -+ -+ ADD a2, t2, a2 -+ MUL x6, x6, t2 -+ ADD a3, t3, a3 -+ MUL x7, x7, t3 -+ -+ ADD a1, t1, a1 -+ ADD a2, t2, a2 -+ ADD a3, t3, a3 -+ .align 4 -+ -+$L25: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 -+ -+$L26: -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ ADD a0, t0, a0 -+ MUL x0, x0, t0 -+ -+ ldi I, -1(I) -+ bgt I, $L26 -+ .align 4 -+ -+ -+$L998: -+ ADD a0, t0, a0 -+ -+ ADD a0, a1, a0 -+ ADD a2, a3, a2 -+ -+ -+ ADD a0, a2, a0 -+ SQRT a0, a0 -+ -+ .align 4 -+ -+$L999: -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/rot.S b/kernel/sw_64/rot.S -new file mode 100644 -index 0000000..3c8624e ---- /dev/null -+++ b/kernel/sw_64/rot.S -@@ -0,0 +1,680 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 -+#define I $21 -+#define XX $23 -+#define YY $24 -+ -+#define C $f10 -+#define S $f11 -+ -+#define PREFETCH_SIZE 80 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 16, $26, 0 -+ -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif -+ ldi $sp, -16($sp) -+ fstd $f20, 8($sp) -+ -+ fmov $f21, C -+ LD S, 16($sp) -+ cmpeq INCX, 1, $23 -+ cmpeq INCY, 1, $24 -+ ble N, $L998 -+ -+ -+ and $23, $24, $23 -+ beq $23, $L50 -+ -+ sra N, 3, I -+ ble I, $L15 -+ -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) -+ -+ LD $f16, 2*SIZE(X) -+ LD $f17, 2*SIZE(Y) -+ LD $f18, 3*SIZE(X) -+ LD $f19, 3*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ unop -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ -+ LD $f13, 4*SIZE(Y) -+ MUL S, $f12, $f24 -+ LD $f12, 4*SIZE(X) -+ MUL C, $f14, $f25 -+ -+ ldi I, -1(I) -+ MUL S, $f15, $f26 -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ MUL C, $f15, $f27 -+ -+ LD $f15, 5*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ MUL C, $f16, $f21 -+ flds $f31, (PREFETCH_SIZE) * SIZE(X) -+ unop -+ LD $f14, 5*SIZE(X) -+ -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ -+ MUL C, $f17, $f23 -+ flds $f31, (PREFETCH_SIZE) * SIZE(Y) -+ unop -+ LD $f17, 6*SIZE(Y) -+ -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ MUL C, $f18, $f25 -+ LD $f16, 6*SIZE(X) -+ unop -+ unop -+ -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) -+ -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ -+ MUL C, $f12, $f21 -+ LD $f18, 7*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ -+ MUL C, $f13, $f23 -+ LD $f13, 8*SIZE(Y) -+ unop -+ unop -+ -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ MUL C, $f14, $f25 -+ LD $f12, 8*SIZE(X) -+ unop -+ unop -+ -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ -+ MUL C, $f15, $f27 -+ LD $f15, 9*SIZE(Y) -+ unop -+ unop -+ -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ -+ MUL C, $f16, $f21 -+ LD $f14, 9*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ -+ MUL C, $f17, $f23 -+ LD $f17, 10*SIZE(Y) -+ unop -+ unop -+ -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ MUL C, $f18, $f25 -+ LD $f16, 10*SIZE(X) -+ unop -+ unop -+ -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ -+ MUL C, $f19, $f27 -+ LD $f19, 11*SIZE(Y) -+ unop -+ unop -+ -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ ldi I, -1(I) -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ -+ MUL C, $f12, $f21 -+ LD $f18, 11*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 6*SIZE(X) -+ MUL S, $f13, $f22 -+ unop -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ -+ MUL C, $f13, $f23 -+ LD $f13, 12*SIZE(Y) -+ ldi X, 8*SIZE(X) -+ unop -+ -+ ST $f24, 6*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ MUL C, $f14, $f25 -+ LD $f12, 4*SIZE(X) -+ ldi Y, 8*SIZE(Y) -+ unop -+ -+ ST $f26, -1*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ -+ MUL C, $f15, $f27 -+ LD $f15, 5*SIZE(Y) -+ unop -+ unop -+ -+ ST $f28, -1*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ bgt I, $L12 -+ .align 4 -+ -+$L13: -+ MUL C, $f16, $f21 -+ LD $f14, 5*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ -+ MUL C, $f17, $f23 -+ unop -+ unop -+ LD $f17, 6*SIZE(Y) -+ -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ LD $f16, 6*SIZE(X) -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop -+ -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) -+ -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ SUB $f23, $f24, $f18 -+ fmov $f18,$f24 -+ LD $f18, 7*SIZE(X) -+ -+ MUL C, $f12, $f21 -+ unop -+ unop -+ unop -+ -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ -+ MUL C, $f13, $f23 -+ unop -+ unop -+ unop -+ -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ MUL C, $f14, $f25 -+ unop -+ unop -+ unop -+ -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ -+ MUL C, $f15, $f27 -+ unop -+ unop -+ unop -+ -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ -+ MUL C, $f16, $f21 -+ unop -+ unop -+ unop -+ -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ -+ MUL C, $f17, $f23 -+ unop -+ unop -+ unop -+ -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop -+ -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ -+ MUL C, $f19, $f27 -+ unop -+ unop -+ unop -+ -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ -+ ST $f22, 6*SIZE(X) -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ ST $f24, 6*SIZE(Y) -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ ST $f26, 7*SIZE(X) -+ ldi X, 8*SIZE(X) -+ ST $f28, 7*SIZE(Y) -+ ldi Y, 8*SIZE(Y) -+ .align 4 -+ -+ -+$L15: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 -+ -+$L16: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f25 -+ SUB $f23, $f24, $f26 -+ ldi I, -1(I) -+ -+ ST $f25, 0*SIZE(X) -+ ldi X, 1 * SIZE(X) -+ ST $f26, 0*SIZE(Y) -+ ldi Y, 1 * SIZE(Y) -+ -+ bgt I, $L16 -+ .align 4 -+ -+$L998: -+ clr $0 -+ fldd $f20, 8($sp) -+ ldi $sp, 16($sp) -+ ret -+ .align 4 -+ -+$L50: -+ mov X, XX -+ mov Y, YY -+ -+ sra N, 3, I -+ ble I, $L55 -+ .align 4 -+ -+$L51: -+ LD $f12, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f13, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f14, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f16, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f17, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f18, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f19, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 -+ -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f16, $f21 -+ MUL S, $f17, $f22 -+ MUL C, $f17, $f23 -+ MUL S, $f16, $f24 -+ -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f18, $f25 -+ MUL S, $f19, $f26 -+ MUL C, $f19, $f27 -+ MUL S, $f18, $f28 -+ -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ -+ LD $f12, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f13, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f14, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f16, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f17, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f18, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f19, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 -+ -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f16, $f21 -+ MUL S, $f17, $f22 -+ MUL C, $f17, $f23 -+ MUL S, $f16, $f24 -+ -+ ADD $f21, $f22, $f20 -+ fmov $f20,$f22 -+ SUB $f23, $f24, $f20 -+ fmov $f20,$f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f18, $f25 -+ MUL S, $f19, $f26 -+ MUL C, $f19, $f27 -+ MUL S, $f18, $f28 -+ -+ ADD $f25, $f26, $f20 -+ fmov $f20,$f26 -+ SUB $f27, $f28, $f20 -+ fmov $f20,$f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ ldi I, -1(I) -+ bgt I, $L51 -+ .align 4 -+ -+$L55: -+ and N, 7, I -+ ble I, $L999 -+ .align 4 -+ -+$L56: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f25 -+ SUB $f23, $f24, $f26 -+ ldi I, -1(I) -+ -+ ST $f25, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ ST $f26, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ bgt I, $L56 -+ .align 4 -+ -+$L999: -+ fldd $f20, 8($sp) -+ ldi $sp, 16($sp) -+ -+ clr $0 -+# fldd $f20, 8($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/rot.S.bak b/kernel/sw_64/rot.S.bak -new file mode 100644 -index 0000000..62e9ff9 ---- /dev/null -+++ b/kernel/sw_64/rot.S.bak -@@ -0,0 +1,624 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 -+#define I $21 -+#define XX $23 -+#define YY $24 -+ -+#define C $f10 -+#define S $f11 -+ -+#define PREFETCH_SIZE 80 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 -+ -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif -+ -+ fmov $f21, C -+ LD S, 0($sp) -+ -+ cmpeq INCX, 1, $23 -+ cmpeq INCY, 1, $24 -+ ble N, $L998 -+ -+ and $23, $24, $23 -+ beq $23, $L50 -+ -+ sra N, 3, I -+ ble I, $L15 -+ -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) -+ -+ LD $f16, 2*SIZE(X) -+ LD $f17, 2*SIZE(Y) -+ LD $f18, 3*SIZE(X) -+ LD $f19, 3*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ unop -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ -+ LD $f13, 4*SIZE(Y) -+ MUL S, $f12, $f24 -+ LD $f12, 4*SIZE(X) -+ MUL C, $f14, $f25 -+ -+ ldi I, -1(I) -+ MUL S, $f15, $f26 -+ ADD $f21, $f22, $f22 -+ MUL C, $f15, $f27 -+ -+ LD $f15, 5*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f24 -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ MUL C, $f16, $f21 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ unop -+ LD $f14, 5*SIZE(X) -+ -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f17, $f23 -+ fillcs (PREFETCH_SIZE) * SIZE(Y) -+ unop -+ LD $f17, 6*SIZE(Y) -+ -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f18, $f25 -+ LD $f16, 6*SIZE(X) -+ unop -+ unop -+ -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) -+ -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f12, $f21 -+ LD $f18, 7*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f13, $f23 -+ LD $f13, 8*SIZE(Y) -+ unop -+ unop -+ -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f14, $f25 -+ LD $f12, 8*SIZE(X) -+ unop -+ unop -+ -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f15, $f27 -+ LD $f15, 9*SIZE(Y) -+ unop -+ unop -+ -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f16, $f21 -+ LD $f14, 9*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f17, $f23 -+ LD $f17, 10*SIZE(Y) -+ unop -+ unop -+ -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f18, $f25 -+ LD $f16, 10*SIZE(X) -+ unop -+ unop -+ -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f19, $f27 -+ LD $f19, 11*SIZE(Y) -+ unop -+ unop -+ -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ ldi I, -1(I) -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f12, $f21 -+ LD $f18, 11*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 6*SIZE(X) -+ MUL S, $f13, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f13, $f23 -+ LD $f13, 12*SIZE(Y) -+ ldi X, 8*SIZE(X) -+ unop -+ -+ ST $f24, 6*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f14, $f25 -+ LD $f12, 4*SIZE(X) -+ ldi Y, 8*SIZE(Y) -+ unop -+ -+ ST $f26, -1*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f15, $f27 -+ LD $f15, 5*SIZE(Y) -+ unop -+ unop -+ -+ ST $f28, -1*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f24 -+ bgt I, $L12 -+ .align 4 -+ -+$L13: -+ MUL C, $f16, $f21 -+ LD $f14, 5*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f17, $f23 -+ unop -+ unop -+ LD $f17, 6*SIZE(Y) -+ -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ LD $f16, 6*SIZE(X) -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop -+ -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) -+ -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ LD $f18, 7*SIZE(X) -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f12, $f21 -+ unop -+ unop -+ unop -+ -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f13, $f23 -+ unop -+ unop -+ unop -+ -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f14, $f25 -+ unop -+ unop -+ unop -+ -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f15, $f27 -+ unop -+ unop -+ unop -+ -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f16, $f21 -+ unop -+ unop -+ unop -+ -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f17, $f23 -+ unop -+ unop -+ unop -+ -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop -+ -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f19, $f27 -+ unop -+ unop -+ unop -+ -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 6*SIZE(X) -+ ADD $f25, $f26, $f26 -+ ST $f24, 6*SIZE(Y) -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 7*SIZE(X) -+ ldi X, 8*SIZE(X) -+ ST $f28, 7*SIZE(Y) -+ ldi Y, 8*SIZE(Y) -+ .align 4 -+ -+ -+$L15: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 -+ -+$L16: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f25 -+ SUB $f23, $f24, $f26 -+ ldi I, -1(I) -+ -+ ST $f25, 0*SIZE(X) -+ ldi X, 1 * SIZE(X) -+ ST $f26, 0*SIZE(Y) -+ ldi Y, 1 * SIZE(Y) -+ -+ bgt I, $L16 -+ .align 4 -+ -+$L998: -+ clr $0 -+ ret -+ .align 4 -+ -+$L50: -+ mov X, XX -+ mov Y, YY -+ -+ sra N, 3, I -+ ble I, $L55 -+ .align 4 -+ -+$L51: -+ LD $f12, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f13, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f14, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f16, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f17, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f18, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f19, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 -+ -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f16, $f21 -+ MUL S, $f17, $f22 -+ MUL C, $f17, $f23 -+ MUL S, $f16, $f24 -+ -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f18, $f25 -+ MUL S, $f19, $f26 -+ MUL C, $f19, $f27 -+ MUL S, $f18, $f28 -+ -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ -+ LD $f12, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f13, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f14, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f16, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f17, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f18, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f19, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 -+ -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f16, $f21 -+ MUL S, $f17, $f22 -+ MUL C, $f17, $f23 -+ MUL S, $f16, $f24 -+ -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f18, $f25 -+ MUL S, $f19, $f26 -+ MUL C, $f19, $f27 -+ MUL S, $f18, $f28 -+ -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ ldi I, -1(I) -+ bgt I, $L51 -+ .align 4 -+ -+$L55: -+ and N, 7, I -+ ble I, $L999 -+ .align 4 -+ -+$L56: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f25 -+ SUB $f23, $f24, $f26 -+ ldi I, -1(I) -+ -+ ST $f25, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ ST $f26, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ bgt I, $L56 -+ .align 4 -+ -+$L999: -+ clr $0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/rot_simd.S b/kernel/sw_64/rot_simd.S -new file mode 100644 -index 0000000..99f3e05 ---- /dev/null -+++ b/kernel/sw_64/rot_simd.S -@@ -0,0 +1,783 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 -+#define I $21 -+#define XX $23 -+#define YY $24 -+ -+#define C $f10 -+#define S $f11 -+ -+#define x0 $f12 -+#define x1 $f14 -+#define x2 $f16 -+#define x3 $f18 -+ -+#define y0 $f13 -+#define y1 $f15 -+#define y2 $f17 -+#define y3 $f19 -+ -+#define t0 $f20 -+#define t1 $f21 -+#define t2 $f22 -+#define t3 $f23 -+#define t4 $f24 -+#define t5 $f25 -+#define t6 $f26 -+#define t7 $f27 -+ -+#define PREFETCHSIZE 80 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 -+ -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif -+ -+ fmov $f21, C -+ LD S, 0($sp) -+ -+ cmpeq INCX, 1, $23 -+ cmpeq INCY, 1, $24 -+ ble N, $L998 -+ -+ and $23, $24, $23 -+ beq $23, $L50 #incx!=1 or incy !=1 -+ -+/* test the address of X */ -+ and X, (VEC_LEN*SIZE-1), $3 -+ and Y, (VEC_LEN*SIZE-1), $4 -+ or $3, $4, $4 -+ bne $4, $UnAlign_ACCESS -+ -+/*Align Accessing*/ -+ sra N, 4, I -+ ble I, $Remain -+ -+ vcpyf C, C -+ vcpyf S, S -+ -+ VLD x0, 0*VEC_LEN*SIZE(X) -+ VLD x1, 1*VEC_LEN*SIZE(X) -+ VLD x2, 2*VEC_LEN*SIZE(X) -+ VLD x3, 3*VEC_LEN*SIZE(X) -+ -+ VLD y0, 0*VEC_LEN*SIZE(Y) -+ VLD y1, 1*VEC_LEN*SIZE(Y) -+ VLD y2, 2*VEC_LEN*SIZE(Y) -+ VLD y3, 3*VEC_LEN*SIZE(Y) -+ -+ addl X, 16 * SIZE, X -+ addl Y, 16 * SIZE, Y -+ subl I, 1, I -+ ble I, $MainLoopEnd -+ .align 4 -+$MainLoop: -+ VMUL C, x0, t0 -+ fillcs (PREFETCHSIZE) * SIZE(X) -+ VMUL C, x1, t1 -+ fillcs (PREFETCHSIZE) * SIZE(Y) -+ -+ VMUL C, x2, t2 -+ subl I, 1, I -+ VMUL C, x3, t3 -+ nop -+ -+ VMUL S, x0, t4 -+ VLD x0, 0*VEC_LEN*SIZE(X) -+ VMUL S, x1, t5 -+ VLD x1, 1*VEC_LEN*SIZE(X) -+ -+ VMUL S, x2, t6 -+ VLD x2, 2*VEC_LEN*SIZE(X) -+ VMUL S, x3, t7 -+ VLD x3, 3*VEC_LEN*SIZE(X) -+ -+ VMAD S, y0, t0, t0 -+ VMAD S, y1, t1, t1 -+ VMAD S, y2, t2, t2 -+ VMAD S, y3, t3, t3 -+ -+ VMSUB C, y0, t4, t4 -+ VLD y0, 0*VEC_LEN*SIZE(Y) -+ VMSUB C, y1, t5, t5 -+ VLD y1, 1*VEC_LEN*SIZE(Y) -+ -+ VMSUB C, y2, t6, t6 -+ VLD y2, 2*VEC_LEN*SIZE(Y) -+ VMSUB C, y3, t7, t7 -+ VLD y3, 3*VEC_LEN*SIZE(Y) -+ -+ VST t0, -4*VEC_LEN*SIZE(X) -+ VST t1, -3*VEC_LEN*SIZE(X) -+ VST t2, -2*VEC_LEN*SIZE(X) -+ VST t3, -1*VEC_LEN*SIZE(X) -+ -+ VST t4, -4*VEC_LEN*SIZE(Y) -+ VST t5, -3*VEC_LEN*SIZE(Y) -+ VST t6, -2*VEC_LEN*SIZE(Y) -+ VST t7, -1*VEC_LEN*SIZE(Y) -+ -+ addl X, 16 * SIZE, X -+ addl Y, 16 * SIZE, Y -+ nop -+ bgt I, $MainLoop -+ .align 4 -+$MainLoopEnd: -+ VMUL C, x0, t0 -+ VMUL C, x1, t1 -+ VMUL C, x2, t2 -+ VMUL C, x3, t3 -+ -+ VMUL S, x0, t4 -+ VMUL S, x1, t5 -+ VMUL S, x2, t6 -+ VMUL S, x3, t7 -+ -+ VMAD S, y0, t0, t0 -+ VMAD S, y1, t1, t1 -+ VMAD S, y2, t2, t2 -+ VMAD S, y3, t3, t3 -+ -+ VMSUB C, y0, t4, t4 -+ VMSUB C, y1, t5, t5 -+ VMSUB C, y2, t6, t6 -+ VMSUB C, y3, t7, t7 -+ -+ VST t0, -4*VEC_LEN*SIZE(X) -+ VST t1, -3*VEC_LEN*SIZE(X) -+ VST t2, -2*VEC_LEN*SIZE(X) -+ VST t3, -1*VEC_LEN*SIZE(X) -+ -+ VST t4, -4*VEC_LEN*SIZE(Y) -+ VST t5, -3*VEC_LEN*SIZE(Y) -+ VST t6, -2*VEC_LEN*SIZE(Y) -+ VST t7, -1*VEC_LEN*SIZE(Y) -+ -+ .align 4 -+$Remain: -+ and N, 15, I -+ ble I, $End -+$RemainLoop: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ MUL S, $f12, $f24 -+ MAD S, $f13, $f21, $f25 -+ MSUB C, $f13, $f24, $f26 -+ -+ -+ ldi I, -1(I) -+ ST $f25, 0*SIZE(X) -+ ldi X, 1 * SIZE(X) -+ ST $f26, 0*SIZE(Y) -+ -+ ldi Y, 1 * SIZE(Y) -+ bgt I, $RemainLoop -+ -+ .align 4 -+$End: -+ clr $0 -+ ret -+ .align 4 -+ -+$UnAlign_ACCESS: -+ -+ sra N, 3, I -+ ble I, $L15 -+ -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) -+ -+ LD $f16, 2*SIZE(X) -+ LD $f17, 2*SIZE(Y) -+ LD $f18, 3*SIZE(X) -+ LD $f19, 3*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ unop -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ -+ LD $f13, 4*SIZE(Y) -+ MUL S, $f12, $f24 -+ LD $f12, 4*SIZE(X) -+ MUL C, $f14, $f25 -+ -+ ldi I, -1(I) -+ MUL S, $f15, $f26 -+ ADD $f21, $f22, $f22 -+ MUL C, $f15, $f27 -+ -+ LD $f15, 5*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f24 -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ MUL C, $f16, $f21 -+ fillcs (PREFETCHSIZE) * SIZE(X) -+ unop -+ LD $f14, 5*SIZE(X) -+ -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f17, $f23 -+ fillcs (PREFETCHSIZE) * SIZE(Y) -+ unop -+ LD $f17, 6*SIZE(Y) -+ -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f18, $f25 -+ LD $f16, 6*SIZE(X) -+ unop -+ unop -+ -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) -+ -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f12, $f21 -+ LD $f18, 7*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f13, $f23 -+ LD $f13, 8*SIZE(Y) -+ unop -+ unop -+ -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f14, $f25 -+ LD $f12, 8*SIZE(X) -+ unop -+ unop -+ -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f15, $f27 -+ LD $f15, 9*SIZE(Y) -+ unop -+ unop -+ -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f16, $f21 -+ LD $f14, 9*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f17, $f23 -+ LD $f17, 10*SIZE(Y) -+ unop -+ unop -+ -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f18, $f25 -+ LD $f16, 10*SIZE(X) -+ unop -+ unop -+ -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f19, $f27 -+ LD $f19, 11*SIZE(Y) -+ unop -+ unop -+ -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ ldi I, -1(I) -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f12, $f21 -+ LD $f18, 11*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 6*SIZE(X) -+ MUL S, $f13, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f13, $f23 -+ LD $f13, 12*SIZE(Y) -+ ldi X, 8*SIZE(X) -+ unop -+ -+ ST $f24, 6*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f14, $f25 -+ LD $f12, 4*SIZE(X) -+ ldi Y, 8*SIZE(Y) -+ unop -+ -+ ST $f26, -1*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f15, $f27 -+ LD $f15, 5*SIZE(Y) -+ unop -+ unop -+ -+ ST $f28, -1*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f24 -+ bgt I, $L12 -+ .align 4 -+ -+$L13: -+ MUL C, $f16, $f21 -+ LD $f14, 5*SIZE(X) -+ unop -+ unop -+ -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f17, $f23 -+ unop -+ unop -+ LD $f17, 6*SIZE(Y) -+ -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ LD $f16, 6*SIZE(X) -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop -+ -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) -+ -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ LD $f18, 7*SIZE(X) -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f12, $f21 -+ unop -+ unop -+ unop -+ -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f13, $f23 -+ unop -+ unop -+ unop -+ -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f14, $f25 -+ unop -+ unop -+ unop -+ -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f15, $f27 -+ unop -+ unop -+ unop -+ -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f24 -+ -+ MUL C, $f16, $f21 -+ unop -+ unop -+ unop -+ -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 -+ -+ MUL C, $f17, $f23 -+ unop -+ unop -+ unop -+ -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 -+ -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop -+ -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 -+ -+ MUL C, $f19, $f27 -+ unop -+ unop -+ unop -+ -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 6*SIZE(X) -+ ADD $f25, $f26, $f26 -+ ST $f24, 6*SIZE(Y) -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 7*SIZE(X) -+ ldi X, 8*SIZE(X) -+ ST $f28, 7*SIZE(Y) -+ ldi Y, 8*SIZE(Y) -+ .align 4 -+ -+ -+$L15: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 -+ -+$L16: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f25 -+ SUB $f23, $f24, $f26 -+ ldi I, -1(I) -+ -+ ST $f25, 0*SIZE(X) -+ ldi X, 1 * SIZE(X) -+ ST $f26, 0*SIZE(Y) -+ ldi Y, 1 * SIZE(Y) -+ -+ bgt I, $L16 -+ .align 4 -+ -+$L998: -+ clr $0 -+ ret -+ .align 4 -+ -+$L50: -+ mov X, XX -+ mov Y, YY -+ -+ sra N, 3, I -+ ble I, $L55 -+ .align 4 -+ -+$L51: -+ LD $f12, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f13, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f14, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f16, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f17, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f18, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f19, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 -+ -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f16, $f21 -+ MUL S, $f17, $f22 -+ MUL C, $f17, $f23 -+ MUL S, $f16, $f24 -+ -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f18, $f25 -+ MUL S, $f19, $f26 -+ MUL C, $f19, $f27 -+ MUL S, $f18, $f28 -+ -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ -+ LD $f12, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f13, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f14, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f16, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f17, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ LD $f18, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f19, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 -+ -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f16, $f21 -+ MUL S, $f17, $f22 -+ MUL C, $f17, $f23 -+ MUL S, $f16, $f24 -+ -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 -+ -+ ST $f22, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f24, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ MUL C, $f18, $f25 -+ MUL S, $f19, $f26 -+ MUL C, $f19, $f27 -+ MUL S, $f18, $f28 -+ -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 -+ -+ ST $f26, 0*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 0*SIZE(YY) -+ SXADDQ INCY, YY, YY -+ -+ ldi I, -1(I) -+ bgt I, $L51 -+ .align 4 -+ -+$L55: -+ and N, 7, I -+ ble I, $L999 -+ .align 4 -+ -+$L56: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, $f25 -+ SUB $f23, $f24, $f26 -+ ldi I, -1(I) -+ -+ ST $f25, 0*SIZE(X) -+ SXADDQ INCX, X, X -+ ST $f26, 0*SIZE(Y) -+ SXADDQ INCY, Y, Y -+ -+ bgt I, $L56 -+ .align 4 -+ -+$L999: -+ clr $0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/scal-sw.S.bak b/kernel/sw_64/scal-sw.S.bak -new file mode 100644 -index 0000000..f8da324 ---- /dev/null -+++ b/kernel/sw_64/scal-sw.S.bak -@@ -0,0 +1,480 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 88 -+ -+#define N $16 -+#define X $20 -+#define INCX $21 -+ -+#define XX $18 -+#define I $19 -+ -+#define ALPHA $f19 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 -+ -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f21 -+ -+#define t0 $f22 -+#define t1 $f23 -+#define t2 $f24 -+#define t3 $f25 -+ -+ PROLOGUE -+ PROFCODE -+ -+ mov X, XX -+ ble N, $L999 -+ -+ cmpeq INCX, 1, $0 -+ beq $0, $L20 -+ -+#ifndef DOUBLE -+ sra N, 4, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ -+ LD a4, 4 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ LD a5, 5 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ LD a6, 6 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ LD a7, 7 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ ST t0, 0 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 1 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 2 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 3 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ LD a0, 8 * SIZE(X) -+ LD a1, 9 * SIZE(X) -+ LD a2, 10 * SIZE(X) -+ LD a3, 11 * SIZE(X) -+ -+ ST t0, 4 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ST t1, 5 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ ST t2, 6 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ ST t3, 7 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ LD a4, 12 * SIZE(X) -+ LD a5, 13 * SIZE(X) -+ LD a6, 14 * SIZE(X) -+ LD a7, 15 * SIZE(X) -+ -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ST t0, 8 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 9 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 10 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 11 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ LD a0, 16 * SIZE(X) -+ LD a1, 17 * SIZE(X) -+ LD a2, 18 * SIZE(X) -+ LD a3, 19 * SIZE(X) -+ -+ ST t0, 12 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ST t1, 13 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ ST t2, 14 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ ST t3, 15 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ LD a4, 20 * SIZE(X) -+ LD a5, 21 * SIZE(X) -+ LD a6, 22 * SIZE(X) -+ LD a7, 23 * SIZE(X) -+ -+ ST t0, 16 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 17 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 18 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 19 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ LD a0, 24 * SIZE(X) -+ LD a1, 25 * SIZE(X) -+ LD a2, 26 * SIZE(X) -+ LD a3, 27 * SIZE(X) -+ -+ ST t0, 20 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ST t1, 21 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ ST t2, 22 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ ST t3, 23 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ LD a4, 28 * SIZE(X) -+ LD a5, 29 * SIZE(X) -+ LD a6, 30 * SIZE(X) -+ LD a7, 31 * SIZE(X) -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ ldi I, -1(I) -+ addl X, 16 * SIZE, X -+ bne I, $L12 -+ .align 4 -+ -+$L13: -+ ST t0, 8 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 9 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 10 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 11 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ ST t0, 12 * SIZE(X) -+ ST t1, 13 * SIZE(X) -+ ST t2, 14 * SIZE(X) -+ ST t3, 15 * SIZE(X) -+ addl X, 16 * SIZE, X -+ .align 4 -+ -+$L15: -+ and N, 15, I -+ -+#else -+ -+ sra N, 3, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ -+ LD a4, 4 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ LD a5, 5 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ LD a6, 6 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ LD a7, 7 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ST t0, 0 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 1 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 2 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 3 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ LD a0, 8 * SIZE(X) -+ ldi I, -1(I) -+ LD a1, 9 * SIZE(X) -+ addl X, 8 * SIZE, X -+ -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ -+ ST t0, -4 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ST t1, -3 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ ST t2, -2 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ ST t3, -1 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ LD a4, 4 * SIZE(X) -+ LD a5, 5 * SIZE(X) -+ -+ LD a6, 6 * SIZE(X) -+ LD a7, 7 * SIZE(X) -+ fillcs PREFETCHSIZE * SIZE(X) -+ bne I, $L12 -+ .align 4 -+ -+$L13: -+ ST t0, 0 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 1 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 2 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 3 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ ST t0, 4 * SIZE(X) -+ ST t1, 5 * SIZE(X) -+ ST t2, 6 * SIZE(X) -+ ST t3, 7 * SIZE(X) -+ addl X, 8 * SIZE, X -+ .align 4 -+ -+$L15: -+ and N, 7, I -+ -+#endif -+ -+ unop -+ unop -+ ble I, $L999 -+ .align 4 -+ -+$L17: -+ LD a0, 0 * SIZE(X) -+ -+ MUL a0, ALPHA, t0 -+ -+ ST t0, 0 * SIZE(X) -+ -+ addl X, SIZE, X -+ -+ ldi I, -1(I) -+ bne I, $L17 -+ ret -+ .align 4 -+ -+$L20: -+ sra N, 3, I -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD a4, 0 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ldi I, -1(I) -+ SXADDQ INCX, X, X -+ -+ LD a5, 0 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ SXADDQ INCX, X, X -+ unop -+ -+ LD a6, 0 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ SXADDQ INCX, X, X -+ unop -+ -+ LD a7, 0 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ SXADDQ INCX, X, X -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ ST t0, 0 * SIZE(XX) -+ MUL a4, ALPHA, t0 -+ fillcs PREFETCHSIZE * SIZE(X) -+ SXADDQ INCX, XX, XX -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ ldi I, -1(I) -+ unop -+ -+ ST t1, 0 * SIZE(XX) -+ MUL a5, ALPHA, t1 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t2, 0 * SIZE(XX) -+ MUL a6, ALPHA, t2 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t3, 0 * SIZE(XX) -+ MUL a7, ALPHA, t3 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t0, 0 * SIZE(XX) -+ MUL a0, ALPHA, t0 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a4, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t1, 0 * SIZE(XX) -+ MUL a1, ALPHA, t1 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a5, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t2, 0 * SIZE(XX) -+ MUL a2, ALPHA, t2 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a6, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t3, 0 * SIZE(XX) -+ MUL a3, ALPHA, t3 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a7, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ unop -+ bne I, $L22 -+ .align 4 -+ -+$L23: -+ ST t0, 0 * SIZE(XX) -+ MUL a4, ALPHA, t0 -+ SXADDQ INCX, XX, XX -+ -+ ST t1, 0 * SIZE(XX) -+ MUL a5, ALPHA, t1 -+ SXADDQ INCX, XX, XX -+ -+ ST t2, 0 * SIZE(XX) -+ MUL a6, ALPHA, t2 -+ SXADDQ INCX, XX, XX -+ -+ ST t3, 0 * SIZE(XX) -+ MUL a7, ALPHA, t3 -+ SXADDQ INCX, XX, XX -+ -+ ST t0, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST t1, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST t2, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST t3, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ .align 4 -+ -+$L25: -+ and N, 7, I -+ unop -+ unop -+ ble I, $L999 -+ .align 4 -+ -+$L27: -+ LD a0, 0 * SIZE(X) -+ -+ MUL a0, ALPHA, t0 -+ -+ ST t0, 0 * SIZE(XX) -+ -+ SXADDQ INCX, X, X -+ SXADDQ INCX, XX, XX -+ -+ ldi I, -1(I) -+ bne I, $L27 -+ .align 4 -+ -+$L999: -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/scal.S b/kernel/sw_64/scal.S -new file mode 100644 -index 0000000..87b89c9 ---- /dev/null -+++ b/kernel/sw_64/scal.S -@@ -0,0 +1,480 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 88 -+ -+#define N $16 -+#define X $20 -+#define INCX $21 -+ -+#define XX $18 -+#define I $19 -+ -+#define ALPHA $f19 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 -+ -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f21 -+ -+#define t0 $f22 -+#define t1 $f23 -+#define t2 $f24 -+#define t3 $f25 -+ -+ PROLOGUE -+ PROFCODE -+ -+ mov X, XX -+ ble N, $L999 -+ -+ cmpeq INCX, 1, $0 -+ beq $0, $L20 -+ -+#ifndef DOUBLE -+ sra N, 4, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ -+ LD a4, 4 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ LD a5, 5 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ LD a6, 6 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ LD a7, 7 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ ST t0, 0 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 1 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 2 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 3 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ LD a0, 8 * SIZE(X) -+ LD a1, 9 * SIZE(X) -+ LD a2, 10 * SIZE(X) -+ LD a3, 11 * SIZE(X) -+ -+ ST t0, 4 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ST t1, 5 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ ST t2, 6 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ ST t3, 7 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ LD a4, 12 * SIZE(X) -+ LD a5, 13 * SIZE(X) -+ LD a6, 14 * SIZE(X) -+ LD a7, 15 * SIZE(X) -+ -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ST t0, 8 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 9 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 10 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 11 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ LD a0, 16 * SIZE(X) -+ LD a1, 17 * SIZE(X) -+ LD a2, 18 * SIZE(X) -+ LD a3, 19 * SIZE(X) -+ -+ ST t0, 12 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ST t1, 13 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ ST t2, 14 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ ST t3, 15 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ LD a4, 20 * SIZE(X) -+ LD a5, 21 * SIZE(X) -+ LD a6, 22 * SIZE(X) -+ LD a7, 23 * SIZE(X) -+ -+ ST t0, 16 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 17 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 18 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 19 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ LD a0, 24 * SIZE(X) -+ LD a1, 25 * SIZE(X) -+ LD a2, 26 * SIZE(X) -+ LD a3, 27 * SIZE(X) -+ -+ ST t0, 20 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ST t1, 21 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ ST t2, 22 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ ST t3, 23 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ LD a4, 28 * SIZE(X) -+ LD a5, 29 * SIZE(X) -+ LD a6, 30 * SIZE(X) -+ LD a7, 31 * SIZE(X) -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ ldi I, -1(I) -+ addl X, 16 * SIZE, X -+ bne I, $L12 -+ .align 4 -+ -+$L13: -+ ST t0, 8 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 9 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 10 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 11 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ ST t0, 12 * SIZE(X) -+ ST t1, 13 * SIZE(X) -+ ST t2, 14 * SIZE(X) -+ ST t3, 15 * SIZE(X) -+ addl X, 16 * SIZE, X -+ .align 4 -+ -+$L15: -+ and N, 15, I -+ -+#else -+ -+ sra N, 3, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ -+ LD a4, 4 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ LD a5, 5 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ LD a6, 6 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ LD a7, 7 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ST t0, 0 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 1 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 2 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 3 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ LD a0, 8 * SIZE(X) -+ ldi I, -1(I) -+ LD a1, 9 * SIZE(X) -+ addl X, 8 * SIZE, X -+ -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ -+ ST t0, -4 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ST t1, -3 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ -+ ST t2, -2 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ ST t3, -1 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ -+ LD a4, 4 * SIZE(X) -+ LD a5, 5 * SIZE(X) -+ -+ LD a6, 6 * SIZE(X) -+ LD a7, 7 * SIZE(X) -+ fillcs PREFETCHSIZE * SIZE(X) -+ bne I, $L12 -+ .align 4 -+ -+$L13: -+ ST t0, 0 * SIZE(X) -+ MUL a4, ALPHA, t0 -+ ST t1, 1 * SIZE(X) -+ MUL a5, ALPHA, t1 -+ -+ ST t2, 2 * SIZE(X) -+ MUL a6, ALPHA, t2 -+ ST t3, 3 * SIZE(X) -+ MUL a7, ALPHA, t3 -+ -+ ST t0, 4 * SIZE(X) -+ ST t1, 5 * SIZE(X) -+ ST t2, 6 * SIZE(X) -+ ST t3, 7 * SIZE(X) -+ addl X, 8 * SIZE, X -+ .align 4 -+ -+$L15: -+ and N, 7, I -+ -+#endif -+ -+ unop -+ unop -+ ble I, $L999 -+ .align 4 -+ -+$L17: -+ LD a0, 0 * SIZE(X) -+ -+ MUL a0, ALPHA, t0 -+ -+ ST t0, 0 * SIZE(X) -+ -+ addl X, SIZE, X -+ -+ ldi I, -1(I) -+ bne I, $L17 -+ ret -+ .align 4 -+ -+$L20: -+ sra N, 3, I -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD a4, 0 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ldi I, -1(I) -+ SXADDQ INCX, X, X -+ -+ LD a5, 0 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ SXADDQ INCX, X, X -+ unop -+ -+ LD a6, 0 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ SXADDQ INCX, X, X -+ unop -+ -+ LD a7, 0 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ SXADDQ INCX, X, X -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ ST t0, 0 * SIZE(XX) -+ MUL a4, ALPHA, t0 -+ fillcs PREFETCHSIZE * SIZE(X) -+ SXADDQ INCX, XX, XX -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ ldi I, -1(I) -+ unop -+ -+ ST t1, 0 * SIZE(XX) -+ MUL a5, ALPHA, t1 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t2, 0 * SIZE(XX) -+ MUL a6, ALPHA, t2 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t3, 0 * SIZE(XX) -+ MUL a7, ALPHA, t3 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t0, 0 * SIZE(XX) -+ MUL a0, ALPHA, t0 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a4, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t1, 0 * SIZE(XX) -+ MUL a1, ALPHA, t1 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a5, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t2, 0 * SIZE(XX) -+ MUL a2, ALPHA, t2 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a6, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t3, 0 * SIZE(XX) -+ MUL a3, ALPHA, t3 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a7, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ unop -+ bne I, $L22 -+ .align 4 -+ -+$L23: -+ ST t0, 0 * SIZE(XX) -+ MUL a4, ALPHA, t0 -+ SXADDQ INCX, XX, XX -+ -+ ST t1, 0 * SIZE(XX) -+ MUL a5, ALPHA, t1 -+ SXADDQ INCX, XX, XX -+ -+ ST t2, 0 * SIZE(XX) -+ MUL a6, ALPHA, t2 -+ SXADDQ INCX, XX, XX -+ -+ ST t3, 0 * SIZE(XX) -+ MUL a7, ALPHA, t3 -+ SXADDQ INCX, XX, XX -+ -+ ST t0, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST t1, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST t2, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST t3, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ .align 4 -+ -+$L25: -+ and N, 7, I -+ unop -+ unop -+ ble I, $L999 -+ .align 4 -+ -+$L27: -+ LD a0, 0 * SIZE(X) -+ -+ MUL a0, ALPHA, t0 -+ -+ ST t0, 0 * SIZE(XX) -+ -+ SXADDQ INCX, X, X -+ SXADDQ INCX, XX, XX -+ -+ ldi I, -1(I) -+ bne I, $L27 -+ .align 4 -+ -+$L999: -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/scal_simd.S b/kernel/sw_64/scal_simd.S -new file mode 100644 -index 0000000..7462e99 ---- /dev/null -+++ b/kernel/sw_64/scal_simd.S -@@ -0,0 +1,344 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 144 -+ -+#define N $16 -+#define X $20 -+#define INCX $21 -+ -+#define XX $18 -+#define I $19 -+ -+#define ALPHA $f19 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 -+ -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f21 -+ -+#define t0 $f22 -+#define t1 $f23 -+#define t2 $f24 -+#define t3 $f25 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 -+ -+ mov X, XX -+ ble N, $L999 -+ -+ cmpeq INCX, 1, $0 -+ beq $0, $L20 -+ -+/** -+ test the address of X -+**/ -+ and X, (VEC_LEN*SIZE-1), $4 -+ beq $4, $Align_X_Access -+ -+ .align 5 -+/** -+ process the unalign address of X -+**/ -+ sra N, 4, I -+ ble I, $Remain /*if N is too small(less then unroll size), don't need process unalign X. Just jump to remain section.*/ -+ -+ sra $4, BASE_SHIFT, $4 -+ ldi $3, VEC_LEN -+ subl $3, $4, $4 -+ subl N, $4, N -+ -+$UnAlign_X_Loop: -+ LD a0, 0*SIZE(X) -+ MUL a0, ALPHA, t0 -+ ST t0, 0*SIZE(X) -+ addl X, SIZE, X -+ -+ -+ -+ subl $4, 1, $4 -+ bgt $4, $UnAlign_X_Loop -+ .align 5 -+ -+$Align_X_Access: -+ -+/* -+ Unloop 16 -+*/ -+ sra N, 4, I -+ vcpyf ALPHA, ALPHA -+ ble I, $Remain -+ -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ -+ ldi I, -1(I) -+ ble I, $MainLoop_End -+ .align 5 -+$MainLoop: -+ VMUL a0, ALPHA, t0 -+ VLD a0, 4*VEC_LEN*SIZE(X) -+ VMUL a1, ALPHA, t1 -+ VLD a1, 5*VEC_LEN*SIZE(X) -+ -+ VMUL a2, ALPHA, t2 -+ VLD a2, 6*VEC_LEN*SIZE(X) -+ VMUL a3, ALPHA, t3 -+ VLD a3, 7*VEC_LEN*SIZE(X) -+ -+ VST t0, 0*VEC_LEN*SIZE(X) -+ VST t1, 1*VEC_LEN*SIZE(X) -+ VST t2, 2*VEC_LEN*SIZE(X) -+ VST t3, 3*VEC_LEN*SIZE(X) -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ ldi I, -1(I) -+ addl X, 16 * SIZE, X -+ bne I, $MainLoop -+ .align 5 -+ -+$MainLoop_End: -+ VMUL a0, ALPHA, t0 -+ VST t0, 0*VEC_LEN*SIZE(X) -+ VMUL a1, ALPHA, t1 -+ VST t1, 1*VEC_LEN*SIZE(X) -+ -+ VMUL a2, ALPHA, t2 -+ VST t2, 2*VEC_LEN*SIZE(X) -+ VMUL a3, ALPHA, t3 -+ VST t3, 3*VEC_LEN*SIZE(X) -+ -+ addl X, 16 * SIZE, X -+ .align 5 -+ -+$Remain: -+ and N, 15, I -+ unop -+ unop -+ ble I, $L999 -+ .align 5 -+ -+$L17: -+ LD a0, 0 * SIZE(X) -+ -+ MUL a0, ALPHA, t0 -+ -+ ST t0, 0 * SIZE(X) -+ -+ addl X, SIZE, X -+ -+ ldi I, -1(I) -+ bne I, $L17 -+ ret -+ .align 5 -+ -+$L20: -+ sra N, 3, I -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD a4, 0 * SIZE(X) -+ MUL a0, ALPHA, t0 -+ ldi I, -1(I) -+ SXADDQ INCX, X, X -+ -+ LD a5, 0 * SIZE(X) -+ MUL a1, ALPHA, t1 -+ SXADDQ INCX, X, X -+ unop -+ -+ LD a6, 0 * SIZE(X) -+ MUL a2, ALPHA, t2 -+ SXADDQ INCX, X, X -+ unop -+ -+ LD a7, 0 * SIZE(X) -+ MUL a3, ALPHA, t3 -+ SXADDQ INCX, X, X -+ ble I, $L23 -+ .align 5 -+ -+$L22: -+ ST t0, 0 * SIZE(XX) -+ MUL a4, ALPHA, t0 -+/* -+ fillcs PREFETCHSIZE * SIZE(X) -+*/ -+ fillcs PREFETCHSIZE * SIZE(X) -+ SXADDQ INCX, XX, XX -+ -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ ldi I, -1(I) -+ unop -+ -+ ST t1, 0 * SIZE(XX) -+ MUL a5, ALPHA, t1 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a1, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t2, 0 * SIZE(XX) -+ MUL a6, ALPHA, t2 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t3, 0 * SIZE(XX) -+ MUL a7, ALPHA, t3 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t0, 0 * SIZE(XX) -+ MUL a0, ALPHA, t0 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a4, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t1, 0 * SIZE(XX) -+ MUL a1, ALPHA, t1 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a5, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t2, 0 * SIZE(XX) -+ MUL a2, ALPHA, t2 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a6, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ST t3, 0 * SIZE(XX) -+ MUL a3, ALPHA, t3 -+ SXADDQ INCX, XX, XX -+ unop -+ -+ LD a7, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ unop -+ bne I, $L22 -+ .align 5 -+ -+$L23: -+ ST t0, 0 * SIZE(XX) -+ MUL a4, ALPHA, t0 -+ SXADDQ INCX, XX, XX -+ -+ ST t1, 0 * SIZE(XX) -+ MUL a5, ALPHA, t1 -+ SXADDQ INCX, XX, XX -+ -+ ST t2, 0 * SIZE(XX) -+ MUL a6, ALPHA, t2 -+ SXADDQ INCX, XX, XX -+ -+ ST t3, 0 * SIZE(XX) -+ MUL a7, ALPHA, t3 -+ SXADDQ INCX, XX, XX -+ -+ ST t0, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST t1, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST t2, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST t3, 0 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ .align 5 -+ -+$L25: -+ and N, 7, I -+ unop -+ unop -+ ble I, $L999 -+ .align 5 -+ -+$L27: -+ LD a0, 0 * SIZE(X) -+ -+ MUL a0, ALPHA, t0 -+ -+ ST t0, 0 * SIZE(XX) -+ -+ SXADDQ INCX, X, X -+ SXADDQ INCX, XX, XX -+ -+ ldi I, -1(I) -+ bne I, $L27 -+ .align 5 -+ -+$L999: -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/snrm2.S b/kernel/sw_64/snrm2.S -new file mode 100644 -index 0000000..ff1ec57 ---- /dev/null -+++ b/kernel/sw_64/snrm2.S -@@ -0,0 +1,491 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCH_SIZE 80 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#define I $0 -+ -+#define a0 $f0 -+#define a1 $f1 -+#define a2 $f10 -+#define a3 $f11 -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 -+ -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f19 -+#define x4 $f20 -+#define x5 $f21 -+#define x6 $f22 -+#define x7 $f23 -+#define x8 $f24 -+ -+ PROLOGUE -+ -+#if defined(EV4) || defined(EV5) -+ .frame $30,16,$26,0 -+ .mask 0x4000000,-16 -+ ldih $29, 0($27) !gpdisp!1 -+ ldi $29, 0($29) !gpdisp!1 -+ -+ ldi $sp, -16($sp) -+ ldl $27, sqrt($29) !literal!2 -+ stl $26, 0($sp) -+ -+ PROFCODE -+ .prologue 1 -+#else -+ PROFCODE -+#endif -+ -+ fclr a0 -+ SXADDQ INCX, 0, INCX -+ fclr a1 -+ ble N, $L999 -+ -+ fclr a2 -+ cmpeq INCX, SIZE, $0 -+ fclr a3 -+ beq $0, $L20 -+ -+ fclr t0 -+ sra N, 4, I -+ fclr t1 -+ ble I, $L15 -+ -+ fclr t2 -+ LD x0, 0 * SIZE(X) -+ fclr t3 -+ LD x1, 1 * SIZE(X) -+ -+ LD x2, 2 * SIZE(X) -+ LD x3, 3 * SIZE(X) -+ LD x4, 4 * SIZE(X) -+ LD x5, 5 * SIZE(X) -+ LD x6, 6 * SIZE(X) -+ LD x7, 7 * SIZE(X) -+ -+ ldi I, -1(I) -+ ble I, $L12 -+ .align 4 -+ -+$L11: -+ faddd a0, t0, x8 -+ fmov x8,a0 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) -+ -+ faddd a1, t1,x8 -+ fmov x8,a1 -+ mov X, XX -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) -+ -+ faddd a2, t2,x8 -+ fmov x8,a2 -+ #unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) -+ -+ faddd a3, t3,x8 -+ fmov x8,a3 -+ #unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) -+ -+ faddd a0, t0, x8 -+ fmov x8,a0 -+ #unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(X) -+ -+ faddd a1, t1, x8 -+ fmov x8,a1 -+ #unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(X) -+ -+ faddd a2, t2, x8 -+ fmov x8,a2 -+ #unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(X) -+ -+ faddd a3, t3, x8 -+ fmov x8,a3 -+ #unop -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(X) -+ -+ faddd a0, t0, x8 -+ fmov x8,a0 -+ #unop -+ fmuld x0, x0, t0 -+ LD x0, 16 * SIZE(X) -+ -+ faddd a1, t1,x8 -+ fmov x8,a1 -+ ldi X, 16 * SIZE(X) -+ fmuld x1, x1, t1 -+ LD x1, 17 * SIZE(XX) -+ -+ faddd a2, t2, x8 -+ fmov x8,a2 -+ #unop -+ fmuld x2, x2, t2 -+ LD x2, 18 * SIZE(XX) -+ -+ faddd a3, t3,x8 -+ fmov x8,a3 -+ #unop -+ fmuld x3, x3, t3 -+ LD x3, 19 * SIZE(XX) -+ -+ faddd a0, t0, x8 -+ fmov x8,a0 -+ #unop -+ fmuld x4, x4, t0 -+ LD x4, 20 * SIZE(XX) -+ -+ faddd a1, t1,x8 -+ fmov x8,a1 -+ ldi I, -1(I) -+ fmuld x5, x5, t1 -+ LD x5, 21 * SIZE(XX) -+ -+ faddd a2, t2, x8 -+ fmov x8,a2 -+ #unop -+ fmuld x6, x6, t2 -+ LD x6, 22 * SIZE(XX) -+ -+ faddd a3, t3,x8 -+ fmov x8,a3 -+ fmuld x7, x7, t3 -+ LD x7, 23 * SIZE(XX) -+ bgt I, $L11 -+ .align 4 -+ -+$L12: -+ faddd a0, t0,x8 -+ fmov x8,a0 -+ mov X, XX -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) -+ -+ faddd a1, t1,x8 -+ fmov x8,a1 -+ #unop -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) -+ -+ faddd a2, t2,x8 -+ fmov x8,a2 -+ #unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) -+ -+ faddd a3, t3, x8 -+ fmov x8,a3 -+ #unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) -+ -+ faddd a0, t0, x8 -+ fmov x8,a0 -+ #unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(XX) -+ -+ faddd a1, t1, x8 -+ fmov x8,a1 -+ #unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(XX) -+ -+ faddd a2, t2, x8 -+ fmov x8,a2 -+ #unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(XX) -+ -+ faddd a3, t3,x8 -+ fmov x8,a3 -+ ldi X, 16 * SIZE(X) -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(XX) -+ -+ faddd a0, t0,x8 -+ fmov x8,a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, x8 -+ fmov x8,a1 -+ fmuld x1, x1, t1 -+ -+ faddd a2, t2, x8 -+ fmov x8,a2 -+ fmuld x2, x2, t2 -+ faddd a3, t3, x8 -+ fmov x8,a3 -+ fmuld x3, x3, t3 -+ -+ faddd a0, t0, x8 -+ fmov x8,a0 -+ fmuld x4, x4, t0 -+ faddd a1, t1, x8 -+ fmov x8,a1 -+ fmuld x5, x5, t1 -+ -+ faddd a2, t2, x8 -+ fmov x8,a2 -+ fmuld x6, x6, t2 -+ faddd a3, t3, x8 -+ fmov x8,a3 -+ fmuld x7, x7, t3 -+ -+ faddd a1, t1, x8 -+ fmov x8,a1 -+ faddd a2, t2, x8 -+ fmov x8,a2 -+ faddd a3, t3, x8 -+ fmov x8,a3 -+ .align 4 -+ -+$L15: -+ and N, 15, I -+ ble I, $L998 -+ .align 4 -+ -+$L16: -+ LD x0, 0 * SIZE(X) -+ ldi X, 1 * SIZE(X) -+ -+ faddd a0, t0,x8 -+ fmov x8,a0 -+ fmuld x0, x0, t0 -+ -+ ldi I, -1(I) -+ bgt I, $L16 -+ bsr $31, $L998 -+ .align 4 -+ -+$L20: -+ fclr t0 -+ sra N, 3, I -+ fclr t1 -+ ble I, $L25 -+ -+ fclr t2 -+ fclr t3 -+ -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x1, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x2, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x3, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ LD x4, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x5, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x6, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ ldi I, -1(I) -+ ble I, $L22 -+ .align 4 -+ -+$L21: -+ faddd a0, t0, x8 -+ fmov x8,a0 -+ LD x7, 0 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X -+ -+ faddd a1, t1,x8 -+ fmov x8,a1 -+ LD x0, 0 * SIZE(X) -+ fmuld x1, x1, t1 -+ addl X, INCX, X -+ -+ faddd a2, t2,x8 -+ fmov x8,a2 -+ LD x1, 0 * SIZE(X) -+ fmuld x2, x2, t2 -+ addl X, INCX, X -+ -+ faddd a3, t3,x8 -+ fmov x8,a3 -+ LD x2, 0 * SIZE(X) -+ fmuld x3, x3, t3 -+ addl X, INCX, X -+ -+ faddd a0, t0,x8 -+ fmov x8,a0 -+ LD x3, 0 * SIZE(X) -+ fmuld x4, x4, t0 -+ addl X, INCX, X -+ -+ faddd a1, t1,x8 -+ fmov x8,a1 -+ LD x4, 0 * SIZE(X) -+ fmuld x5, x5, t1 -+ addl X, INCX, X -+ -+ faddd a2, t2,x8 -+ fmov x8,a2 -+ LD x5, 0 * SIZE(X) -+ fmuld x6, x6, t2 -+ addl X, INCX, X -+ -+ faddd a3, t3, x8 -+ fmov x8,a3 -+ LD x6, 0 * SIZE(X) -+ fmuld x7, x7, t3 -+ addl X, INCX, X -+ -+ ldi I, -1(I) -+ bgt I, $L21 -+ .align 4 -+ -+$L22: -+ faddd a0, t0,x8 -+ fmov x8,a0 -+ LD x7, 0 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X -+ -+ faddd a1, t1, x8 -+ fmov x8,a1 -+ unop -+ fmuld x1, x1, t1 -+ unop -+ -+ faddd a2, t2,x8 -+ fmov x8,a2 -+ fmuld x2, x2, t2 -+ faddd a3, t3, x8 -+ fmov x8,a3 -+ fmuld x3, x3, t3 -+ -+ faddd a0, t0, x8 -+ fmov x8,a0 -+ fmuld x4, x4, t0 -+ faddd a1, t1, x8 -+ fmov x8,a1 -+ fmuld x5, x5, t1 -+ -+ faddd a2, t2, x8 -+ fmov x8,a2 -+ fmuld x6, x6, t2 -+ faddd a3, t3, x8 -+ fmov x8,a3 -+ fmuld x7, x7, t3 -+ -+ faddd a1, t1, x8 -+ fmov x8,a1 -+ faddd a2, t2, x8 -+ fmov x8,a2 -+ faddd a3, t3, x8 -+ fmov x8,a3 -+ .align 4 -+ -+$L25: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 -+ -+$L26: -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ faddd a0, t0,x8 -+ fmov x8,a0 -+ fmuld x0, x0, t0 -+ -+ ldi I, -1(I) -+ bgt I, $L26 -+ .align 4 -+ -+ -+$L998: -+ faddd a0, t0,x8 -+ fmov x8,a0 -+ -+ faddd a0, a1, x8 -+ fmov x8,a1 -+ faddd a2, a3, x8 -+ fmov x8,a2 -+ -+#if defined(EV4) || defined(EV5) -+ faddd a0, a2, $f16 -+ jsr $26, ($27), sqrt !lituse_jsr!2 -+ -+ ldih $29, 0($26) !gpdisp!3 -+ ldi $29, 0($29) !gpdisp!3 -+#else -+ faddd a0, a2,x8 -+ fsqrtd x8, a0 -+#endif -+ .align 4 -+ -+$L999: -+#if defined(EV4) || defined(EV5) -+ ldl $26, 0($sp) -+ ldi $sp, 16($sp) -+#endif -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/snrm2.S.bak b/kernel/sw_64/snrm2.S.bak -new file mode 100644 -index 0000000..753c90b ---- /dev/null -+++ b/kernel/sw_64/snrm2.S.bak -@@ -0,0 +1,431 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCH_SIZE 80 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 -+ -+#define I $0 -+ -+#define a0 $f0 -+#define a1 $f1 -+#define a2 $f10 -+#define a3 $f11 -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 -+ -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f19 -+#define x4 $f20 -+#define x5 $f21 -+#define x6 $f22 -+#define x7 $f23 -+ -+ PROLOGUE -+ -+#if defined(EV4) || defined(EV5) -+ .frame $30,16,$26,0 -+ .mask 0x4000000,-16 -+ ldih $29, 0($27) !gpdisp!1 -+ ldi $29, 0($29) !gpdisp!1 -+ -+ ldi $sp, -16($sp) -+ ldl $27, sqrt($29) !literal!2 -+ stq $26, 0($sp) -+ -+ PROFCODE -+ .prologue 1 -+#else -+ PROFCODE -+#endif -+ -+ fclr a0 -+ SXADDQ INCX, 0, INCX -+ fclr a1 -+ ble N, $L999 -+ -+ fclr a2 -+ cmpeq INCX, SIZE, $0 -+ fclr a3 -+ beq $0, $L20 -+ -+ fclr t0 -+ sra N, 4, I -+ fclr t1 -+ ble I, $L15 -+ -+ fclr t2 -+ LD x0, 0 * SIZE(X) -+ fclr t3 -+ LD x1, 1 * SIZE(X) -+ -+ LD x2, 2 * SIZE(X) -+ LD x3, 3 * SIZE(X) -+ LD x4, 4 * SIZE(X) -+ LD x5, 5 * SIZE(X) -+ LD x6, 6 * SIZE(X) -+ LD x7, 7 * SIZE(X) -+ -+ ldi I, -1(I) -+ ble I, $L12 -+ .align 4 -+ -+$L11: -+ faddd a0, t0, a0 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) -+ -+ faddd a1, t1, a1 -+ mov X, XX -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) -+ -+ faddd a2, t2, a2 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) -+ -+ faddd a3, t3, a3 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) -+ -+ faddd a0, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(X) -+ -+ faddd a1, t1, a1 -+ unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(X) -+ -+ faddd a2, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(X) -+ -+ faddd a3, t3, a3 -+ unop -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(X) -+ -+ faddd a0, t0, a0 -+ unop -+ fmuld x0, x0, t0 -+ LD x0, 16 * SIZE(X) -+ -+ faddd a1, t1, a1 -+ ldi X, 16 * SIZE(X) -+ fmuld x1, x1, t1 -+ LD x1, 17 * SIZE(XX) -+ -+ faddd a2, t2, a2 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 18 * SIZE(XX) -+ -+ faddd a3, t3, a3 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 19 * SIZE(XX) -+ -+ faddd a0, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 20 * SIZE(XX) -+ -+ faddd a1, t1, a1 -+ ldi I, -1(I) -+ fmuld x5, x5, t1 -+ LD x5, 21 * SIZE(XX) -+ -+ faddd a2, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 22 * SIZE(XX) -+ -+ faddd a3, t3, a3 -+ fmuld x7, x7, t3 -+ LD x7, 23 * SIZE(XX) -+ bgt I, $L11 -+ .align 4 -+ -+$L12: -+ faddd a0, t0, a0 -+ mov X, XX -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) -+ -+ faddd a1, t1, a1 -+ unop -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) -+ -+ faddd a2, t2, a2 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) -+ -+ faddd a3, t3, a3 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) -+ -+ faddd a0, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(XX) -+ -+ faddd a1, t1, a1 -+ unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(XX) -+ -+ faddd a2, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(XX) -+ -+ faddd a3, t3, a3 -+ ldi X, 16 * SIZE(X) -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(XX) -+ -+ faddd a0, t0, a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, a1 -+ fmuld x1, x1, t1 -+ -+ faddd a2, t2, a2 -+ fmuld x2, x2, t2 -+ faddd a3, t3, a3 -+ fmuld x3, x3, t3 -+ -+ faddd a0, t0, a0 -+ fmuld x4, x4, t0 -+ faddd a1, t1, a1 -+ fmuld x5, x5, t1 -+ -+ faddd a2, t2, a2 -+ fmuld x6, x6, t2 -+ faddd a3, t3, a3 -+ fmuld x7, x7, t3 -+ -+ faddd a1, t1, a1 -+ faddd a2, t2, a2 -+ faddd a3, t3, a3 -+ .align 4 -+ -+$L15: -+ and N, 15, I -+ ble I, $L998 -+ .align 4 -+ -+$L16: -+ LD x0, 0 * SIZE(X) -+ ldi X, 1 * SIZE(X) -+ -+ faddd a0, t0, a0 -+ fmuld x0, x0, t0 -+ -+ ldi I, -1(I) -+ bgt I, $L16 -+ bsr $31, $L998 -+ .align 4 -+ -+$L20: -+ fclr t0 -+ sra N, 3, I -+ fclr t1 -+ ble I, $L25 -+ -+ fclr t2 -+ fclr t3 -+ -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x1, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x2, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x3, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ LD x4, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x5, 0 * SIZE(X) -+ addl X, INCX, X -+ LD x6, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ ldi I, -1(I) -+ ble I, $L22 -+ .align 4 -+ -+$L21: -+ faddd a0, t0, a0 -+ LD x7, 0 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X -+ -+ faddd a1, t1, a1 -+ LD x0, 0 * SIZE(X) -+ fmuld x1, x1, t1 -+ addl X, INCX, X -+ -+ faddd a2, t2, a2 -+ LD x1, 0 * SIZE(X) -+ fmuld x2, x2, t2 -+ addl X, INCX, X -+ -+ faddd a3, t3, a3 -+ LD x2, 0 * SIZE(X) -+ fmuld x3, x3, t3 -+ addl X, INCX, X -+ -+ faddd a0, t0, a0 -+ LD x3, 0 * SIZE(X) -+ fmuld x4, x4, t0 -+ addl X, INCX, X -+ -+ faddd a1, t1, a1 -+ LD x4, 0 * SIZE(X) -+ fmuld x5, x5, t1 -+ addl X, INCX, X -+ -+ faddd a2, t2, a2 -+ LD x5, 0 * SIZE(X) -+ fmuld x6, x6, t2 -+ addl X, INCX, X -+ -+ faddd a3, t3, a3 -+ LD x6, 0 * SIZE(X) -+ fmuld x7, x7, t3 -+ addl X, INCX, X -+ -+ ldi I, -1(I) -+ bgt I, $L21 -+ .align 4 -+ -+$L22: -+ faddd a0, t0, a0 -+ LD x7, 0 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X -+ -+ faddd a1, t1, a1 -+ unop -+ fmuld x1, x1, t1 -+ unop -+ -+ faddd a2, t2, a2 -+ fmuld x2, x2, t2 -+ faddd a3, t3, a3 -+ fmuld x3, x3, t3 -+ -+ faddd a0, t0, a0 -+ fmuld x4, x4, t0 -+ faddd a1, t1, a1 -+ fmuld x5, x5, t1 -+ -+ faddd a2, t2, a2 -+ fmuld x6, x6, t2 -+ faddd a3, t3, a3 -+ fmuld x7, x7, t3 -+ -+ faddd a1, t1, a1 -+ faddd a2, t2, a2 -+ faddd a3, t3, a3 -+ .align 4 -+ -+$L25: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 -+ -+$L26: -+ LD x0, 0 * SIZE(X) -+ addl X, INCX, X -+ -+ faddd a0, t0, a0 -+ fmuld x0, x0, t0 -+ -+ ldi I, -1(I) -+ bgt I, $L26 -+ .align 4 -+ -+ -+$L998: -+ faddd a0, t0, a0 -+ -+ faddd a0, a1, a0 -+ faddd a2, a3, a2 -+ -+#if defined(EV4) || defined(EV5) -+ faddd a0, a2, $f16 -+ jsr $26, ($27), sqrt !lituse_jsr!2 -+ -+ ldih $29, 0($26) !gpdisp!3 -+ ldi $29, 0($29) !gpdisp!3 -+#else -+ faddd a0, a2, a0 -+ fsqrtd a0, a0 -+#endif -+ .align 4 -+ -+$L999: -+#if defined(EV4) || defined(EV5) -+ ldl $26, 0($sp) -+ ldi $sp, 16($sp) -+#endif -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/staticbuffer.S b/kernel/sw_64/staticbuffer.S -new file mode 100644 -index 0000000..7bbd23d ---- /dev/null -+++ b/kernel/sw_64/staticbuffer.S -@@ -0,0 +1,45 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+ -+#ifdef ALLOC_STATIC -+ .align 8 -+ .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 -+#endif -diff --git a/kernel/sw_64/sum.S b/kernel/sw_64/sum.S -new file mode 100644 -index 0000000..0be6d53 ---- /dev/null -+++ b/kernel/sw_64/sum.S -@@ -0,0 +1,230 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 88 -+ -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define I $19 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 -+ -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f19 -+ -+#define t0 $f20 -+#define t1 $f21 -+#define t2 $f22 -+#define t3 $f23 -+ -+ PROLOGUE -+ PROFCODE -+ -+ fclr s0 -+ unop -+ fclr t0 -+ ble N, $L999 -+ -+ sra N, 3, I -+ fclr s1 -+ fclr s2 -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(X) -+ fclr t1 -+ SXADDQ INCX, X, X -+ fclr t2 -+ -+ LD a1, 0 * SIZE(X) -+ fclr t3 -+ SXADDQ INCX, X, X -+ fclr s3 -+ -+ LD a2, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a3, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD a4, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a5, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 -+ -+$L12: -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ ldw $31, PREFETCHSIZE * 2 * SIZE(X) -+ fmov a0, t0 -+ ldi I, -1(I) -+ -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ LD a6, 0 * SIZE(X) -+ fmov a1, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ LD a7, 0 * SIZE(X) -+ fmov a2, t2 -+ SXADDQ INCX, X, X -+ -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ LD a0, 0 * SIZE(X) -+ fmov a3, t3 -+ SXADDQ INCX, X, X -+ -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ LD a1, 0 * SIZE(X) -+ fmov a4, t0 -+ SXADDQ INCX, X, X -+ -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ LD a2, 0 * SIZE(X) -+ fmov a5, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ LD a3, 0 * SIZE(X) -+ fmov a6, t2 -+ SXADDQ INCX, X, X -+ -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ LD a4, 0 * SIZE(X) -+ fmov a7, t3 -+ SXADDQ INCX, X, X -+ -+ LD a5, 0 * SIZE(X) -+ unop -+ SXADDQ INCX, X, X -+ bne I, $L12 -+ .align 4 -+ -+$L13: -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ LD a6, 0 * SIZE(X) -+ fmov a0, t0 -+ SXADDQ INCX, X, X -+ -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ LD a7, 0 * SIZE(X) -+ fmov a1, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ fmov a2, t2 -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ fmov a3, t3 -+ -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ fmov a4, t0 -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ fmov a5, t1 -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ fmov a6, t2 -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ fmov a7, t3 -+ -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ -+ ADD s0, s1, $f24 -+ fmov $f24,s0 -+ ADD s2, s3, $f24 -+ fmov $f24,s2 -+ .align 4 -+ -+$L15: -+ and N, 7, I -+ ADD s0, s2, $f24 -+ fmov $f24,s0 -+ unop -+ ble I, $L999 -+ .align 4 -+ -+$L17: -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ LD a0, 0 * SIZE(X) -+ SXADDQ INCX, X, X -+ fmov a0, t0 -+ -+ ldi I, -1(I) -+ bne I, $L17 -+ .align 4 -+ -+$L999: -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/sw_fpcr.S b/kernel/sw_64/sw_fpcr.S -new file mode 100644 -index 0000000..5dee238 ---- /dev/null -+++ b/kernel/sw_64/sw_fpcr.S -@@ -0,0 +1,39 @@ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+ .arch sw2b -+ .set noat -+ .set noreorder -+.text -+ .align 5 -+ .globl read_fpcr -+ .ent read_fpcr -+read_fpcr: -+ .frame $sp, 0, $26, 0 -+ RFPCR $f10 -+ fstd $f10, 0($16) -+ ret -+ .end read_fpcr -+ -+ .globl write_fpcr -+ .ent write_fpcr -+write_fpcr: -+ .frame $sp, 0, $26, 0 -+ fldd $f10, 0($16) -+ WFPCR $f10 -+ ret -+ .end write_fpcr -+/** -+ .globl fadd_test -+ .ent fadd_test -+ -+fadd_test: -+ .frame $sp, 0, $26, 0 -+ faddd $f16, $f17, $f16 -+ fmov $f16, $f0 -+ ret -+ .end fadd_test -+**/ -+ .ident VERSION -+ -diff --git a/kernel/sw_64/sw_fpcr_inline.c b/kernel/sw_64/sw_fpcr_inline.c -new file mode 100644 -index 0000000..1943e3e ---- /dev/null -+++ b/kernel/sw_64/sw_fpcr_inline.c -@@ -0,0 +1,13 @@ -+#include "common.h" -+ -+void read_fpcr(long * test){ -+ -+ __asm__("rfpcr $f10 \n fstd $f10, %0":"=m"(*test):); -+ return; -+} -+ -+void write_fpcr(long * test){ -+ -+ __asm__("fldd $f10, %0\nwfpcr $f10"::"m"(*test)); -+ return; -+} -diff --git a/kernel/sw_64/swap.S b/kernel/sw_64/swap.S -new file mode 100644 -index 0000000..5c8b679 ---- /dev/null -+++ b/kernel/sw_64/swap.S -@@ -0,0 +1,249 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 -+ -+ mov $20, $17 -+ mov $21, $18 -+ ldl $19, 0($sp) -+ ldl $20, 8($sp) -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif -+ -+ subl $18, 1, $1 -+ subl $20, 1, $2 -+ ble $16, $SubEnd # if n <= 0 goto $End -+ or $1, $2, $1 -+ -+ sra $16, 3, $21 -+ -+ and $16, 7, $22 -+ bne $1, $Sub -+ ble $21, $MainRemain -+ .align 4 -+ -+$MainLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f11, 1*SIZE($19) -+ LD $f12, 2*SIZE($19) -+ LD $f13, 3*SIZE($19) -+ LD $f14, 4*SIZE($19) -+ LD $f15, 5*SIZE($19) -+ LD $f16, 6*SIZE($19) -+ LD $f17, 7*SIZE($19) -+ -+ LD $f20, 0*SIZE($17) -+ LD $f21, 1*SIZE($17) -+ LD $f22, 2*SIZE($17) -+ LD $f23, 3*SIZE($17) -+ LD $f24, 4*SIZE($17) -+ LD $f25, 5*SIZE($17) -+ LD $f26, 6*SIZE($17) -+ LD $f27, 7*SIZE($17) -+ -+ fillcs 32*SIZE($17) -+ unop -+ fillcs 32*SIZE($19) -+ subl $21, 1, $21 -+ -+ ST $f10, 0*SIZE($17) -+ ST $f11, 1*SIZE($17) -+ ST $f12, 2*SIZE($17) -+ ST $f13, 3*SIZE($17) -+ ST $f14, 4*SIZE($17) -+ ST $f15, 5*SIZE($17) -+ ST $f16, 6*SIZE($17) -+ ST $f17, 7*SIZE($17) -+ -+ ST $f20, 0*SIZE($19) -+ ST $f21, 1*SIZE($19) -+ ST $f22, 2*SIZE($19) -+ ST $f23, 3*SIZE($19) -+ ST $f24, 4*SIZE($19) -+ ST $f25, 5*SIZE($19) -+ ST $f26, 6*SIZE($19) -+ ST $f27, 7*SIZE($19) -+ -+ ldi $17, 8*SIZE($17) -+ ldi $19, 8*SIZE($19) -+ bgt $21, $MainLoop -+ .align 4 -+ -+$MainRemain: -+ ble $22, $MainEnd -+ .align 4 -+ -+$MainRemainLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f20, 0*SIZE($17) -+ ldi $17, 1*SIZE($17) -+ ldi $19, 1*SIZE($19) -+ subl $22, 1, $22 -+ ST $f10, -1*SIZE($17) -+ ST $f20, -1*SIZE($19) -+ bgt $22, $MainRemainLoop -+ .align 4 -+ -+$MainEnd: -+ clr $0 -+ ret -+ .align 4 -+ -+$Sub: -+ mov $17, $23 -+ mov $19, $24 -+ -+ ble $21, $SubRemain -+ .align 4 -+ -+$SubLoop: -+ LD $f10, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ LD $f11, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f12, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ LD $f13, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f14, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ LD $f15, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f16, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ LD $f17, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f20, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ LD $f21, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ LD $f22, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ LD $f23, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ LD $f24, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ LD $f25, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ LD $f26, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ LD $f27, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ ST $f10, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ ST $f11, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ -+ ST $f12, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ ST $f13, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ -+ ST $f14, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ ST $f15, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ -+ ST $f16, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ ST $f17, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ -+ ST $f20, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ ST $f21, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ -+ ST $f22, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ ST $f23, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ -+ ST $f24, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ ST $f25, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ -+ ST $f26, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ ST $f27, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ -+ subl $21, 1, $21 -+ bgt $21, $SubLoop -+ .align 4 -+ -+$SubRemain: -+ ble $22, $SubEnd -+ .align 4 -+ -+$SubRemainLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f20, 0*SIZE($17) -+ -+ subl $22, 1, $22 -+ -+ ST $f10, 0*SIZE($17) -+ ST $f20, 0*SIZE($19) -+ -+ SXADDQ $18, $17, $17 -+ SXADDQ $20, $19, $19 -+ bgt $22, $SubRemainLoop -+ .align 4 -+ -+$SubEnd: -+ clr $0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/swap_simd.S b/kernel/sw_64/swap_simd.S -new file mode 100644 -index 0000000..8a6141d ---- /dev/null -+++ b/kernel/sw_64/swap_simd.S -@@ -0,0 +1,327 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#define PREFETCHSIZE 64 -+#define X $17 -+#define Y $19 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 -+ -+ mov $20, $17 -+ mov $21, $18 -+ ldl $19, 0($sp) -+ ldl $20, 8($sp) -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif -+ -+ subl $18, 1, $1 -+ subl $20, 1, $2 -+ ble $16, $SubEnd # if n <= 0 goto $End -+ or $1, $2, $1 -+ -+/* -+ Unloop 16 -+*/ -+ sra $16, 4, $21 -+ and $16, 15, $22 -+ bne $1, $Sub -+ ble $21, $MainRemain -+ .align 4 -+ -+/* -+ test the address of Y & X -+*/ -+ and Y, (VEC_LEN*SIZE-1), $4 -+ and X, (VEC_LEN*SIZE-1), $3 -+ or $3, $4, $4 -+ bne $4, $UnAlign_ACCESS -+ -+/* align access*/ -+ -+$MainLoop: -+ VLD $f10, 0*VEC_LEN*SIZE(Y) -+ VLD $f11, 1*VEC_LEN*SIZE(Y) -+ VLD $f12, 2*VEC_LEN*SIZE(Y) -+ VLD $f13, 3*VEC_LEN*SIZE(Y) -+ -+ -+ VLD $f20, 0*VEC_LEN*SIZE(X) -+ VLD $f21, 1*VEC_LEN*SIZE(X) -+ VLD $f22, 2*VEC_LEN*SIZE(X) -+ VLD $f23, 3*VEC_LEN*SIZE(X) -+ -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ unop -+ fillcs PREFETCHSIZE * SIZE(Y) -+ subl $21, 1, $21 -+ -+ VST $f10, 0*VEC_LEN*SIZE(X) -+ VST $f11, 1*VEC_LEN*SIZE(X) -+ VST $f12, 2*VEC_LEN*SIZE(X) -+ VST $f13, 3*VEC_LEN*SIZE(X) -+ -+ VST $f20, 0*VEC_LEN*SIZE(Y) -+ VST $f21, 1*VEC_LEN*SIZE(Y) -+ VST $f22, 2*VEC_LEN*SIZE(Y) -+ VST $f23, 3*VEC_LEN*SIZE(Y) -+ -+ ldi $17, 16*SIZE(X) -+ ldi $19, 16*SIZE(Y) -+ bgt $21, $MainLoop -+ .align 4 -+ -+$MainRemain: -+ ble $22, $MainEnd -+ .align 4 -+ -+$MainRemainLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f20, 0*SIZE($17) -+ ldi $17, 1*SIZE($17) -+ ldi $19, 1*SIZE($19) -+ subl $22, 1, $22 -+ ST $f10, -1*SIZE($17) -+ ST $f20, -1*SIZE($19) -+ bgt $22, $MainRemainLoop -+ .align 4 -+ -+$MainEnd: -+ clr $0 -+ ret -+ .align 4 -+ -+$UnAlign_ACCESS: -+ sra $16, 3, $21 -+ and $16, 7, $22 -+ nop -+ ble $21, $UnAlign_ACCESS_MainRemain -+ .align 4 -+$UnAlign_ACCESS_MainLoop: -+ LD $f10, 0*SIZE(Y) -+ LD $f11, 1*SIZE(Y) -+ LD $f12, 2*SIZE(Y) -+ LD $f13, 3*SIZE(Y) -+ LD $f14, 4*SIZE(Y) -+ LD $f15, 5*SIZE(Y) -+ LD $f16, 6*SIZE(Y) -+ LD $f17, 7*SIZE(Y) -+ -+ LD $f20, 0*SIZE(X) -+ LD $f21, 1*SIZE(X) -+ LD $f22, 2*SIZE(X) -+ LD $f23, 3*SIZE(X) -+ LD $f24, 4*SIZE(X) -+ LD $f25, 5*SIZE(X) -+ LD $f26, 6*SIZE(X) -+ LD $f27, 7*SIZE(X) -+ -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ unop -+ fillcs PREFETCHSIZE * SIZE(Y) -+ subl $21, 1, $21 -+ -+ ST $f10, 0*SIZE(X) -+ ST $f11, 1*SIZE(X) -+ ST $f12, 2*SIZE(X) -+ ST $f13, 3*SIZE(X) -+ ST $f14, 4*SIZE(X) -+ ST $f15, 5*SIZE(X) -+ ST $f16, 6*SIZE(X) -+ ST $f17, 7*SIZE(X) -+ -+ ST $f20, 0*SIZE(Y) -+ ST $f21, 1*SIZE(Y) -+ ST $f22, 2*SIZE(Y) -+ ST $f23, 3*SIZE(Y) -+ ST $f24, 4*SIZE(Y) -+ ST $f25, 5*SIZE(Y) -+ ST $f26, 6*SIZE(Y) -+ ST $f27, 7*SIZE(Y) -+ -+ ldi X, 8*SIZE(X) -+ ldi Y, 8*SIZE(Y) -+ bgt $21, $UnAlign_ACCESS_MainLoop -+ .align 4 -+ -+$UnAlign_ACCESS_MainRemain: -+ ble $22, $UnAlign_ACCESS_MainEnd -+ .align 4 -+ -+$UnAlign_ACCESS_MainRemainLoop: -+ LD $f10, 0*SIZE(Y) -+ LD $f20, 0*SIZE(X) -+ ldi X, 1*SIZE(X) -+ ldi Y, 1*SIZE(Y) -+ subl $22, 1, $22 -+ ST $f10, -1*SIZE(X) -+ ST $f20, -1*SIZE(Y) -+ bgt $22, $UnAlign_ACCESS_MainRemainLoop -+ .align 4 -+ -+$UnAlign_ACCESS_MainEnd: -+ clr $0 -+ ret -+ .align 4 -+ -+$Sub: -+ sra $16, 3, $21 -+ and $16, 7, $22 -+ mov $17, $23 -+ mov $19, $24 -+ -+ ble $21, $SubRemain -+ .align 4 -+ -+$SubLoop: -+ LD $f10, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ LD $f11, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f12, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ LD $f13, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f14, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ LD $f15, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f16, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ LD $f17, 0*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f20, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ LD $f21, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ LD $f22, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ LD $f23, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ LD $f24, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ LD $f25, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ LD $f26, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ LD $f27, 0*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ ST $f10, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ ST $f11, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ -+ ST $f12, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ ST $f13, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ -+ ST $f14, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ ST $f15, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ -+ ST $f16, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ ST $f17, 0*SIZE($23) -+ SXADDQ $18, $23, $23 -+ -+ ST $f20, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ ST $f21, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ -+ ST $f22, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ ST $f23, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ -+ ST $f24, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ ST $f25, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ -+ ST $f26, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ ST $f27, 0*SIZE($24) -+ SXADDQ $20, $24, $24 -+ -+ subl $21, 1, $21 -+ bgt $21, $SubLoop -+ .align 4 -+ -+$SubRemain: -+ ble $22, $SubEnd -+ .align 4 -+ -+$SubRemainLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f20, 0*SIZE($17) -+ -+ subl $22, 1, $22 -+ -+ ST $f10, 0*SIZE($17) -+ ST $f20, 0*SIZE($19) -+ -+ SXADDQ $18, $17, $17 -+ SXADDQ $20, $19, $19 -+ bgt $22, $SubRemainLoop -+ .align 4 -+ -+$SubEnd: -+ clr $0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S b/kernel/sw_64/trsm_kernel_4x4_LN.S -new file mode 100644 -index 0000000..109c471 ---- /dev/null -+++ b/kernel/sw_64/trsm_kernel_4x4_LN.S -@@ -0,0 +1,5144 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#if !defined(EV4) && !defined(EV5) && !defined(SW6) -+#error "Architecture is not specified." -+#endif -+ -+#ifdef SW6 -+#define PREFETCHSIZE 56 -+#define UNOP unop -+#endif -+ -+#ifdef EV5 -+#define PREFETCHSIZE 56 -+#define UNOP -+#endif -+ -+#ifdef EV4 -+#define UNOP -+#endif -+ -+#define STACKSIZE 80 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $20 -+#define B $21 -+#define C $22 -+#define LDC $23 -+ -+#define C1 $19 -+#define C2 $24 -+#define C3 $25 -+#define C4 $27 -+ -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define alpha $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 -+ -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 -+ -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 -+ -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 -+ -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define AORIG $3 -+#define OFFSET $4 -+#define tmp $9 -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl C, 0 + STACKSIZE($sp) -+ ldl LDC, 8 + STACKSIZE($sp) -+ ldl OFFSET, 16 + STACKSIZE($sp) -+ -+ SXADDQ LDC, 0, LDC -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ stl tmp, 64($sp) -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 -+ -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 -+ -+#ifdef LN -+ mull M, K, TMP1 -+ SXADDQ TMP1, A, A -+ SXADDQ M, C, C -+#endif -+ -+#ifdef RN -+ negq OFFSET, KK -+#endif -+ -+#ifdef RT -+ mulq N, K, TMP1 -+ SXADDQ TMP1, B, B -+ -+ mulq N, LDC, TMP1 -+ addl TMP1, C, C -+ -+ subl N, OFFSET, KK -+#endif -+ -+ sra N, 2, J -+ ble J, $L40 -+ .align 4 -+ -+$L01: -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ s4addl LDC, 0, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C3 -+#ifndef RT -+ s4addl LDC, C, C -+#endif -+ -+ fclr t1 -+ addl C3, LDC, C4 -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ fclr t3 -+ fclr t4 -+ -+ and M, 1, I -+ ble I, $L20 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c09 -+ LD b4, 3 * SIZE(B) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(B) -+ ble KK, $L38 -+ -+ ble L, $L35 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c09 -+ LD b4, 3 * SIZE(BO) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(BO) -+ ble TMP1, $L38 -+ -+ ble L, $L35 -+#endif -+ .align 4 -+ -+$L32: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ ldi AO, 2 * SIZE(AO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ LD b5, 3 * SIZE(BO) -+ FIMOVD b5, tmp -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL a1, b4, t4 -+ LD a1, -1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a2, b1, t1 -+ LD b1, 4 * SIZE(BO) -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a2, b2, t2 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ LD b4, -1 * SIZE(BO) -+ MUL a2, b3, t3 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ IFMOVD tmp, b5 -+ MUL a2, b5, t4 -+ LD a2, 0 * SIZE(AO) -+ bgt L, $L32 -+ .align 4 -+ -+$L35: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L37 -+#else -+ blbs TMP1, $L37 -+#endif -+ .align 4 -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ LD b1, 0 * SIZE(BO) -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ MUL a1, b3, b5 -+ fmov b5, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL a1, b4, b5 -+ fmov b5, t4 -+ LD a1, 0 * SIZE(AO) -+ ldi AO, 1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L37: -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ MUL a1, b3, b5 -+ fmov b5, t3 -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b4, b5 -+ fmov b5, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ -+$L38: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, b5 -+ fmov b5, c05 -+ MUL b2, c05, b5 -+ fmov b5, t1 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ MUL b3, c05, b5 -+ fmov b5, t1 -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a2, c09, b5 -+ fmov b5, t1 -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ MUL a3, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ MUL a2, c13, b5 -+ fmov b5, t1 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ MUL a3, c13, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL a4, c13, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, b5 -+ fmov b5, c09 -+ MUL b2, c09, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL b3, c09, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c13, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+ ldi C3, -1 * SIZE(C3) -+ ldi C4, -1 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c09, 0 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L20: -+ and M, 2, I -+ ble I, $L30 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c01 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(B) -+ fclr c02 -+ fclr c06 -+ ble KK, $L28 -+ -+ ble L, $L25 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c01 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(BO) -+ fclr c02 -+ fclr c06 -+ ble TMP1, $L28 -+ -+ ble L, $L25 -+#endif -+ .align 4 -+ -+$L22: -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ FIMOVD b5, tmp -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ IFMOVD tmp, b5 -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ IFMOVD tmp, b5 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 -+ -+$L25: -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 -+#else -+ blbs TMP1, $L27 -+#endif -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ unop -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, b5 -+ fmov b5, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b4, b5 -+ fmov b5, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L27: -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b4, b5 -+ fmov b5, t3 -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, b5 -+ fmov b5, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ .align 4 -+ -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+ -+ SUB b1, c02, b5 -+ fmov b5, c02 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c10, b5 -+ fmov b5, c10 -+ SUB b4, c14, b5 -+ fmov b5, c14 -+ -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c05, b5 -+ fmov b5, c05 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+ -+ SUB b1, c09, b5 -+ fmov b5, c09 -+ SUB b2, c10, b5 -+ fmov b5, c10 -+ SUB b3, c13, b5 -+ fmov b5, c13 -+ SUB b4, c14, b5 -+ fmov b5, c14 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c10, b5 -+ fmov b5, t3 -+ MUL a2, c14, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c09, b5 -+ fmov b5, c09 -+ MUL a3, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ MUL a2, c09, b5 -+ fmov b5, t3 -+ MUL a2, c13, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+ MUL a3, c10, b5 -+ fmov b5, c10 -+ MUL a3, c14, b5 -+ fmov b5, c14 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c02, b5 -+ fmov b5, t2 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c02, b5 -+ fmov b5, t2 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, b5 -+ fmov b5, c05 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ -+ MUL b2, c05, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL b3, c05, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ MUL a2, c09, b5 -+ fmov b5, t1 -+ MUL a2, c10, b5 -+ fmov b5, t2 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ -+ MUL a3, c13, b5 -+ fmov b5, c13 -+ MUL a3, c14, b5 -+ fmov b5, c14 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ -+ MUL a2, c13, b5 -+ fmov b5, t1 -+ MUL a2, c14, b5 -+ fmov b5, t2 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL a3, c13, b5 -+ fmov b5, t1 -+ MUL a3, c14, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a4, c13, b5 -+ fmov b5, t1 -+ MUL a4, c14, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, b5 -+ fmov b5, c09 -+ MUL b1, c10, b5 -+ fmov b5, c10 -+ -+ MUL b2, c09, b5 -+ fmov b5, t1 -+ MUL b2, c10, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL b3, c09, b5 -+ fmov b5, t1 -+ MUL b3, c10, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+ -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c13, 6 * SIZE(AO) -+ ST c14, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+ ldi C3, -2 * SIZE(C3) -+ ldi C4, -2 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+ ldi C3, 2 * SIZE(C3) -+ ldi C4, 2 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L30: -+ sra M, 2, I -+ ble I, $L39 -+ .align 4 -+ -+$L11: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(B) -+ fclr c06 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(KK) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(B) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble KK, $L18 -+#else -+ -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c06 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(TMP1) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(BO) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble TMP1, $L18 -+#endif -+ -+ ble L, $L15 -+ .align 5 -+ -+$L12: -+/* 1 */ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ FIMOVD b5, tmp -+ -+/* 2 */ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+/* 3 */ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+/* 4 */ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) -+ -+/* 5 */ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ ldi L, -2(L) -+ IFMOVD tmp, b5 -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a5, t4 -+ unop -+ -+/* 6 */ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a6, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a4, t2 -+ unop -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a5, t4 -+ unop -+ -+/* 7 */ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) -+ -+/* 8 */ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 -+ -+$L15: -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ MUL b1, a1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L17 -+#else -+ blbs TMP1, $L17 -+#endif -+ .align 4 -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, b5 -+ fmov b5, t2 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, b5 -+ fmov b5, t3 -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL b1, a4, b5 -+ fmov b5, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, b5 -+ fmov b5, t3 -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, b5 -+ fmov b5, t4 -+ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, b5 -+ fmov b5, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, b5 -+ fmov b5, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, b5 -+ fmov b5, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, b5 -+ fmov b5, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, b5 -+ fmov b5, t1 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, b5 -+ fmov b5, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, b5 -+ fmov b5, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, b5 -+ fmov b5, t4 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L17: -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, b5 -+ fmov b5, t2 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, b5 -+ fmov b5, t3 -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL b1, a4, b5 -+ fmov b5, t2 -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, b5 -+ fmov b5, t3 -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, b5 -+ fmov b5, t4 -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ MUL b3, a1, b5 -+ fmov b5, t1 -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ MUL b3, a2, b5 -+ fmov b5, t2 -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ MUL b4, a2, b5 -+ fmov b5, t3 -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL b2, a3, b5 -+ fmov b5, t4 -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ MUL b3, a3, b5 -+ fmov b5, t1 -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ MUL b3, a4, b5 -+ fmov b5, t2 -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ MUL b4, a4, b5 -+ fmov b5, t3 -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, b5 -+ fmov b5, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ .align 4 -+ -+$L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+ -+ SUB b1, c02, b5 -+ fmov b5, c02 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c10, b5 -+ fmov b5, c10 -+ SUB b4, c14, b5 -+ fmov b5, c14 -+ -+ LD a1, 8 * SIZE(BO) -+ LD a2, 9 * SIZE(BO) -+ LD a3, 10 * SIZE(BO) -+ LD a4, 11 * SIZE(BO) -+ -+ LD b1, 12 * SIZE(BO) -+ LD b2, 13 * SIZE(BO) -+ LD b3, 14 * SIZE(BO) -+ LD b4, 15 * SIZE(BO) -+ -+ SUB a1, c03, b5 -+ fmov b5, c03 -+ SUB a2, c07, b5 -+ fmov b5, c07 -+ SUB a3, c11, b5 -+ fmov b5, c11 -+ SUB a4, c15, b5 -+ fmov b5, c15 -+ -+ SUB b1, c04, b5 -+ fmov b5, c04 -+ SUB b2, c08, b5 -+ fmov b5, c08 -+ SUB b3, c12, b5 -+ fmov b5, c12 -+ SUB b4, c16, b5 -+ fmov b5, c16 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+ -+ SUB b1, c05, b5 -+ fmov b5, c05 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c07, b5 -+ fmov b5, c07 -+ SUB b4, c08, b5 -+ fmov b5, c08 -+ -+ LD a1, 8 * SIZE(AO) -+ LD a2, 9 * SIZE(AO) -+ LD a3, 10 * SIZE(AO) -+ LD a4, 11 * SIZE(AO) -+ -+ LD b1, 12 * SIZE(AO) -+ LD b2, 13 * SIZE(AO) -+ LD b3, 14 * SIZE(AO) -+ LD b4, 15 * SIZE(AO) -+ -+ SUB a1, c09, b5 -+ fmov b5, c09 -+ SUB a2, c10, b5 -+ fmov b5, c10 -+ SUB a3, c11, b5 -+ fmov b5, c11 -+ SUB a4, c12, b5 -+ fmov b5, c12 -+ -+ SUB b1, c13, b5 -+ fmov b5, c13 -+ SUB b2, c14, b5 -+ fmov b5, c14 -+ SUB b3, c15, b5 -+ fmov b5, c15 -+ SUB b4, c16, b5 -+ fmov b5, c16 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ MUL a1, c16, b5 -+ fmov b5, c16 -+ -+ MUL a2, c04, b5 -+ fmov b5, t1 -+ MUL a2, c08, b5 -+ fmov b5, t2 -+ MUL a2, c12, b5 -+ fmov b5, t3 -+ MUL a2, c16, b5 -+ fmov b5, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c15, t4, b5 -+ fmov b5, c15 -+ -+ MUL a3, c04, b5 -+ fmov b5, t1 -+ MUL a3, c08, b5 -+ fmov b5, t2 -+ MUL a3, c12, b5 -+ fmov b5, t3 -+ MUL a3, c16, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL a4, c04, b5 -+ fmov b5, t1 -+ MUL a4, c08, b5 -+ fmov b5, t2 -+ MUL a4, c12, b5 -+ fmov b5, t3 -+ MUL a4, c16, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, b5 -+ fmov b5, c03 -+ MUL b1, c07, b5 -+ fmov b5, c07 -+ MUL b1, c11, b5 -+ fmov b5, c11 -+ MUL b1, c15, b5 -+ fmov b5, c15 -+ -+ MUL b2, c03, b5 -+ fmov b5, t1 -+ MUL b2, c07, b5 -+ fmov b5, t2 -+ MUL b2, c11, b5 -+ fmov b5, t3 -+ MUL b2, c15, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL b3, c03, b5 -+ fmov b5, t1 -+ MUL b3, c07, b5 -+ fmov b5, t2 -+ MUL b3, c11, b5 -+ fmov b5, t3 -+ MUL b3, c15, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c10, b5 -+ fmov b5, t3 -+ MUL a2, c14, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c09, b5 -+ fmov b5, c09 -+ MUL a3, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ MUL a2, c09, b5 -+ fmov b5, t3 -+ MUL a2, c13, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c05, b5 -+ fmov b5, t2 -+ MUL a3, c09, b5 -+ fmov b5, t3 -+ MUL a3, c13, b5 -+ fmov b5, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c15, t4, b5 -+ fmov b5, c15 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c05, b5 -+ fmov b5, t2 -+ MUL a4, c09, b5 -+ fmov b5, t3 -+ MUL a4, c13, b5 -+ fmov b5, t4 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ SUB c12, t3, b5 -+ fmov b5, c12 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, b5 -+ fmov b5, c02 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ MUL b1, c10, b5 -+ fmov b5, c10 -+ MUL b1, c14, b5 -+ fmov b5, c14 -+ -+ MUL b2, c02, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ MUL b2, c10, b5 -+ fmov b5, t3 -+ MUL b2, c14, b5 -+ fmov b5, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c15, t4, b5 -+ fmov b5, c15 -+ -+ MUL b3, c02, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ MUL b3, c10, b5 -+ fmov b5, t3 -+ MUL b3, c14, b5 -+ fmov b5, t4 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ SUB c12, t3, b5 -+ fmov b5, c12 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c15, b5 -+ fmov b5, c15 -+ -+ MUL a2, c03, b5 -+ fmov b5, t1 -+ MUL a2, c07, b5 -+ fmov b5, t2 -+ MUL a2, c11, b5 -+ fmov b5, t3 -+ MUL a2, c15, b5 -+ fmov b5, t4 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ SUB c12, t3, b5 -+ fmov b5, c12 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ MUL a3, c04, b5 -+ fmov b5, c04 -+ MUL a3, c08, b5 -+ fmov b5, c08 -+ MUL a3, c12, b5 -+ fmov b5, c12 -+ MUL a3, c16, b5 -+ fmov b5, c16 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ MUL a2, c03, b5 -+ fmov b5, t3 -+ MUL a2, c04, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c02, b5 -+ fmov b5, t2 -+ MUL a3, c03, b5 -+ fmov b5, t3 -+ MUL a3, c04, b5 -+ fmov b5, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c02, b5 -+ fmov b5, t2 -+ MUL a4, c03, b5 -+ fmov b5, t3 -+ MUL a4, c04, b5 -+ fmov b5, t4 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ SUB c15, t3, b5 -+ fmov b5, c15 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, b5 -+ fmov b5, c05 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ MUL b1, c07, b5 -+ fmov b5, c07 -+ MUL b1, c08, b5 -+ fmov b5, c08 -+ -+ MUL b2, c05, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ MUL b2, c07, b5 -+ fmov b5, t3 -+ MUL b2, c08, b5 -+ fmov b5, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL b3, c05, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ MUL b3, c07, b5 -+ fmov b5, t3 -+ MUL b3, c08, b5 -+ fmov b5, t4 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ SUB c15, t3, b5 -+ fmov b5, c15 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ MUL a2, c09, b5 -+ fmov b5, t1 -+ MUL a2, c10, b5 -+ fmov b5, t2 -+ MUL a2, c11, b5 -+ fmov b5, t3 -+ MUL a2, c12, b5 -+ fmov b5, t4 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ SUB c15, t3, b5 -+ fmov b5, c15 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ MUL a3, c13, b5 -+ fmov b5, c13 -+ MUL a3, c14, b5 -+ fmov b5, c14 -+ MUL a3, c15, b5 -+ fmov b5, c15 -+ MUL a3, c16, b5 -+ fmov b5, c16 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ MUL a1, c15, b5 -+ fmov b5, c15 -+ MUL a1, c16, b5 -+ fmov b5, c16 -+ -+ MUL a2, c13, b5 -+ fmov b5, t1 -+ MUL a2, c14, b5 -+ fmov b5, t2 -+ MUL a2, c15, b5 -+ fmov b5, t3 -+ MUL a2, c16, b5 -+ fmov b5, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a3, c13, b5 -+ fmov b5, t1 -+ MUL a3, c14, b5 -+ fmov b5, t2 -+ MUL a3, c15, b5 -+ fmov b5, t3 -+ MUL a3, c16, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL a4, c13, b5 -+ fmov b5, t1 -+ MUL a4, c14, b5 -+ fmov b5, t2 -+ MUL a4, c15, b5 -+ fmov b5, t3 -+ MUL a4, c16, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, b5 -+ fmov b5, c09 -+ MUL b1, c10, b5 -+ fmov b5, c10 -+ MUL b1, c11, b5 -+ fmov b5, c11 -+ MUL b1, c12, b5 -+ fmov b5, c12 -+ -+ MUL b2, c09, b5 -+ fmov b5, t1 -+ MUL b2, c10, b5 -+ fmov b5, t2 -+ MUL b2, c11, b5 -+ fmov b5, t3 -+ MUL b2, c12, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL b3, c09, b5 -+ fmov b5, t1 -+ MUL b3, c10, b5 -+ fmov b5, t2 -+ MUL b3, c11, b5 -+ fmov b5, t3 -+ MUL b3, c12, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c07, b5 -+ fmov b5, t3 -+ MUL a2, c08, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c03, b5 -+ fmov b5, c03 -+ MUL a3, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+ -+ ST c03, 8 * SIZE(BO) -+ ST c07, 9 * SIZE(BO) -+ ST c11, 10 * SIZE(BO) -+ ST c15, 11 * SIZE(BO) -+ -+ ST c04, 12 * SIZE(BO) -+ ST c08, 13 * SIZE(BO) -+ ST c12, 14 * SIZE(BO) -+ ST c16, 15 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+ -+ ST c09, 8 * SIZE(AO) -+ ST c10, 9 * SIZE(AO) -+ ST c11, 10 * SIZE(AO) -+ ST c12, 11 * SIZE(AO) -+ -+ ST c13, 12 * SIZE(AO) -+ ST c14, 13 * SIZE(AO) -+ ST c15, 14 * SIZE(AO) -+ ST c16, 15 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+ ldi C3, -4 * SIZE(C3) -+ ldi C4, -4 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c11, 2 * SIZE(C3) -+ ST c12, 3 * SIZE(C3) -+ -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ ST c15, 2 * SIZE(C4) -+ ST c16, 3 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+ ldi C3, 4 * SIZE(C3) -+ ldi C4, 4 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L11 -+ .align 4 -+ -+$L39: -+#ifdef LN -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 4, KK -+#endif -+ -+#ifdef RT -+ subl KK, 4, KK -+#endif -+ ldi J, -1(J) -+ bgt J, $L01 -+ .align 4 -+ -+$L40: -+ and N, 2, J -+ ble J, $L80 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ addl LDC, LDC, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ fclr t1 -+#ifndef RT -+ addl C2, LDC, C -+#endif -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ fclr t3 -+ fclr t4 -+ -+ and M, 1, I -+ ble I, $L60 -+ -+#if defined(LT) || defined(RN) -+ -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c02 -+ LD b2, 1 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ LD b3, 2 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L78 -+ -+ ble L, $L75 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c02 -+ LD b2, 1 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L78 -+ -+ ble L, $L75 -+#endif -+ .align 4 -+ -+$L72: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ LD a1, 1 * SIZE(AO) -+ LD b2, 3 * SIZE(BO) -+ -+ ADD c02, t3, b5 -+ fmov b5, c02 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b3, b5 -+ fmov b5, t3 -+ LD b3, 4 * SIZE(BO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, b5 -+ fmov b5, t4 -+ LD a2, 0 * SIZE(AO) -+ LD b4, 5 * SIZE(BO) -+ -+ ldi BO, 4 * SIZE(BO) -+ unop -+ unop -+ bgt L, $L72 -+ .align 4 -+ -+$L75: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L77 -+#else -+ blbs TMP1, $L77 -+#endif -+ .align 4 -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ LD a1, 0 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L77: -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ ADD c02, t3, b5 -+ fmov b5, c02 -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ -+ ADD c01, c02, b5 -+ fmov b5, c01 -+ ldi AO, 1 * SIZE(AO) -+ ADD c05, c06, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ -+ .align 4 -+ -+$L78: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L60: -+ and M, 2, I -+ ble I, $L70 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L68 -+ -+ ble L, $L65 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L68 -+ -+ ble L, $L65 -+#endif -+ .align 4 -+ -+$L62: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ldi L, -2(L) -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a4, b3, b5 -+ fmov b5, t2 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a4, b4, b5 -+ fmov b5, t4 -+ LD b4, 1 * SIZE(BO) -+ unop -+ -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L62 -+ .align 4 -+ -+$L65: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L67 -+#else -+ blbs TMP1, $L67 -+#endif -+ .align 4 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi AO, 2 * SIZE(AO) -+ .align 4 -+ -+$L67: -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ .align 4 -+ -+$L68: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c02, b5 -+ fmov b5, c02 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c05, b5 -+ fmov b5, c05 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L70: -+ sra M, 2, I -+ ble I, $L79 -+ .align 4 -+ -+$L51: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ ldi BO, 2 * SIZE(B) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble KK, $L58 -+ -+ ble L, $L55 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ ldi BO, 2 * SIZE(BO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble TMP1, $L58 -+ -+ ble L, $L55 -+#endif -+ .align 4 -+ -+$L52: -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ unop -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ ldi L, -2(L) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ unop -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ unop -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ unop -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a5, 3 * SIZE(AO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ LD b2, -1 * SIZE(BO) -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ unop -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b3, b5 -+ fmov b5, t3 -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a5, b3, b5 -+ fmov b5, t4 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b4, b5 -+ fmov b5, t1 -+ LD a1, -4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b4, b5 -+ fmov b5, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, b5 -+ fmov b5, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L52 -+ .align 4 -+ -+$L55: -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L57 -+#else -+ blbs TMP1, $L57 -+#endif -+ .align 4 -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L57: -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ .align 4 -+ -+$L58: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c02, b5 -+ fmov b5, c02 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+ -+ SUB b1, c03, b5 -+ fmov b5, c03 -+ SUB b2, c07, b5 -+ fmov b5, c07 -+ SUB b3, c04, b5 -+ fmov b5, c04 -+ SUB b4, c08, b5 -+ fmov b5, c08 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+ -+ SUB b1, c05, b5 -+ fmov b5, c05 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c07, b5 -+ fmov b5, c07 -+ SUB b4, c08, b5 -+ fmov b5, c08 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ -+ MUL a2, c04, b5 -+ fmov b5, t1 -+ MUL a2, c08, b5 -+ fmov b5, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ -+ MUL a3, c04, b5 -+ fmov b5, t1 -+ MUL a3, c08, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a4, c04, b5 -+ fmov b5, t1 -+ MUL a4, c08, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, b5 -+ fmov b5, c03 -+ MUL b1, c07, b5 -+ fmov b5, c07 -+ -+ MUL b2, c03, b5 -+ fmov b5, t1 -+ MUL b2, c07, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL b3, c03, b5 -+ fmov b5, t1 -+ MUL b3, c07, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c05, b5 -+ fmov b5, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c05, b5 -+ fmov b5, t2 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, b5 -+ fmov b5, c02 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ -+ MUL b2, c02, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ -+ MUL b3, c02, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ -+ MUL a2, c03, b5 -+ fmov b5, t1 -+ MUL a2, c07, b5 -+ fmov b5, t2 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ -+ MUL a3, c04, b5 -+ fmov b5, c04 -+ MUL a3, c08, b5 -+ fmov b5, c08 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ MUL a2, c03, b5 -+ fmov b5, t3 -+ MUL a2, c04, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+ MUL a3, c07, b5 -+ fmov b5, c07 -+ MUL a3, c08, b5 -+ fmov b5, c08 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c07, b5 -+ fmov b5, t3 -+ MUL a2, c08, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c03, b5 -+ fmov b5, c03 -+ MUL a3, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+ -+ ST c03, 4 * SIZE(BO) -+ ST c07, 5 * SIZE(BO) -+ ST c04, 6 * SIZE(BO) -+ ST c08, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L51 -+ .align 4 -+ -+$L79: -+#ifdef LN -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 2, KK -+#endif -+ -+#ifdef RT -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L80: -+ and N, 1, J -+ ble J, $L999 -+ -+#ifdef RT -+ sll K, BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ subl C, LDC, C -+#endif -+ -+ mov C, C1 -+#ifndef RT -+ addl C, LDC, C -+#endif -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ and M, 1, I -+ ble I, $L100 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ unop -+ ble L, $L115 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L115 -+#endif -+ .align 4 -+ -+$L112: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b3, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b4, b5 -+ fmov b5, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b4, 7 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 4 * SIZE(AO) -+ ldi BO, 4 * SIZE(BO) -+ bgt L, $L112 -+ .align 4 -+ -+$L115: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L118 -+ .align 4 -+ -+$L116: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 1 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 1 * SIZE(AO) -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L116 -+ .align 4 -+ -+$L118: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+ ADD c01, c02, b5 -+ fmov b5, c01 -+ ADD c03, c04, b5 -+ fmov b5, c03 -+ ADD c01, c03, b5 -+ fmov b5, c01 -+ -+#if defined(LN) || defined(RT) -+ subl KK, 1, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+#else -+ LD a1, 0 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 1 * SIZE(C1) -+#endif -+ -+#ifdef RT -+ SXADDQ K, AORIG, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L100: -+ and M, 2, I -+ ble I, $L110 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L105 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ ble L, $L105 -+#endif -+ .align 5 -+ -+$L102: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ldi BO, 4 * SIZE(BO) -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a5, 7 * SIZE(AO) -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ LD a1, 8 * SIZE(AO) -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ LD b3, 2 * SIZE(BO) -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, b5 -+ fmov b5, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L102 -+ .align 4 -+ -+$L105: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L108 -+ .align 4 -+ -+$L106: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 3 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi AO, 2 * SIZE(AO) -+ unop -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L106 -+ .align 4 -+ -+$L108: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+ ADD c01, c03, b5 -+ fmov b5, c01 -+ ADD c02, c04, b5 -+ fmov b5, c02 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L110: -+ sra M, 2, I -+ ble I, $L119 -+ .align 4 -+ -+$L91: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L95 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L95 -+#endif -+ .align 5 -+ -+$L92: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi L, -1(L) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ LD a1, 8 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 9 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 10 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a4, 11 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ LD a1, 12 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ LD a2, 13 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b3, b5 -+ fmov b5, t3 -+ LD a3, 14 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b3, b5 -+ fmov b5, t4 -+ LD a5, 15 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b4, b5 -+ fmov b5, t1 -+ LD a1, 16 * SIZE(AO) -+ ldi AO, 16 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b4, b5 -+ fmov b5, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, b5 -+ fmov b5, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L92 -+ .align 4 -+ -+$L95: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ unop -+ ble L, $L98 -+ .align 4 -+ -+$L96: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 1 * SIZE(BO) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ldi AO, 4 * SIZE(AO) -+ bgt L, $L96 -+ .align 4 -+ -+$L98: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a2, c04, b5 -+ fmov b5, t1 -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ MUL a3, c04, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL a4, c04, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, b5 -+ fmov b5, c03 -+ MUL b2, c03, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL b3, c03, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, b5 -+ fmov b5, c02 -+ MUL b2, c02, b5 -+ fmov b5, t1 -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ MUL b3, c02, b5 -+ fmov b5, t1 -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a2, c03, b5 -+ fmov b5, t1 -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ MUL a3, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ bgt I, $L91 -+ .align 4 -+ -+$L119: -+#ifdef LN -+ SXADDQ K, B, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 1, KK -+#endif -+ -+#ifdef RT -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldl tmp, 64($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S.bak b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak -new file mode 100644 -index 0000000..8405570 ---- /dev/null -+++ b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak -@@ -0,0 +1,4073 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#if !defined(SW2B) -+#error "Architecture is not specified." -+#endif -+ -+#ifdef SW2B -+#define PREFETCHSIZE 56 -+#define UNOP nop -+#endif -+ -+#ifdef EV6 -+#define PREFETCHSIZE 56 -+#define UNOP unop -+#endif -+ -+#ifdef EV5 -+#define PREFETCHSIZE 56 -+#define UNOP -+#endif -+ -+#ifdef EV4 -+#define UNOP -+#endif -+ -+#define STACKSIZE 80 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $20 -+#define B $21 -+#define C $22 -+#define LDC $23 -+ -+#define C1 $19 -+#define C2 $24 -+#define C3 $25 -+#define C4 $27 -+ -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define alpha $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 -+ -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 -+ -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 -+ -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 -+ -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define AORIG $3 -+#define OFFSET $4 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl C, 0 + STACKSIZE($sp) -+ ldl LDC, 8 + STACKSIZE($sp) -+ ldl OFFSET, 16 + STACKSIZE($sp) -+ -+ SXADDQ LDC, 0, LDC -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 -+ -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 -+ -+#ifdef LN -+ mull M, K, TMP1 -+ SXADDQ TMP1, A, A -+ SXADDQ M, C, C -+#endif -+ -+#ifdef RN -+ negq OFFSET, KK -+#endif -+ -+#ifdef RT -+ mull N, K, TMP1 -+ SXADDQ TMP1, B, B -+ -+ mull N, LDC, TMP1 -+ addl TMP1, C, C -+ -+ subl N, OFFSET, KK -+#endif -+ -+ sra N, 2, J -+ ble J, $L40 -+ .align 4 -+ -+$L01: -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ s4addl LDC, 0, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C3 -+#ifndef RT -+ s4addl LDC, C, C -+#endif -+ -+ fclr t1 -+ addl C3, LDC, C4 -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ fclr t3 -+ fclr t4 -+ -+ and M, 1, I -+ ble I, $L20 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c09 -+ LD b4, 3 * SIZE(B) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(B) -+ ble KK, $L38 -+ -+ ble L, $L35 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c09 -+ LD b4, 3 * SIZE(BO) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(BO) -+ ble TMP1, $L38 -+ -+ ble L, $L35 -+#endif -+ .align 4 -+ -+$L32: -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t2, c05 -+ ldi AO, 2 * SIZE(AO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, c09 -+ LD b5, 3 * SIZE(BO) -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, c13 -+ MUL a1, b4, t4 -+ LD a1, -1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ MUL a2, b1, t1 -+ LD b1, 4 * SIZE(BO) -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c05, t2, c05 -+ MUL a2, b2, t2 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c09, t3, c09 -+ LD b4, -1 * SIZE(BO) -+ MUL a2, b3, t3 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c13, t4, c13 -+ MUL a2, b5, t4 -+ LD a2, 0 * SIZE(AO) -+ bgt L, $L32 -+ .align 4 -+ -+$L35: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L37 -+#else -+ blbs TMP1, $L37 -+#endif -+ .align 4 -+ -+ ADD c05, t2, c05 -+ LD b1, 0 * SIZE(BO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, c09 -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, c13 -+ MUL a1, b4, t4 -+ LD a1, 0 * SIZE(AO) -+ ldi AO, 1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L37: -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ ADD c09, t3, c09 -+ MUL a1, b3, t3 -+ -+ ADD c13, t4, c13 -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b4, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ADD c05, t2, c05 -+ ADD c09, t3, c09 -+ ADD c13, t4, c13 -+ -+$L38: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ MUL a1, c09, c09 -+ MUL a1, c13, c13 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c05, t1, c05 -+ MUL a3, c01, t1 -+ SUB c09, t1, c09 -+ MUL a4, c01, t1 -+ SUB c13, t1, c13 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, c05 -+ MUL b2, c05, t1 -+ SUB c09, t1, c09 -+ MUL b3, c05, t1 -+ SUB c13, t1, c13 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, c09 -+ MUL a2, c09, t1 -+ SUB c13, t1, c13 -+ MUL a3, c13, c13 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, c13 -+ MUL a2, c13, t1 -+ SUB c09, t1, c09 -+ MUL a3, c13, t1 -+ SUB c05, t1, c05 -+ MUL a4, c13, t1 -+ SUB c01, t1, c01 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, c09 -+ MUL b2, c09, t1 -+ SUB c05, t1, c05 -+ MUL b3, c09, t1 -+ SUB c01, t1, c01 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a2, c05, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c13, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+ ldi C3, -1 * SIZE(C3) -+ ldi C4, -1 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c09, 0 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L20: -+ and M, 2, I -+ ble I, $L30 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c01 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(B) -+ fclr c02 -+ fclr c06 -+ ble KK, $L28 -+ -+ ble L, $L25 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c01 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(BO) -+ fclr c02 -+ fclr c06 -+ ble TMP1, $L28 -+ -+ ble L, $L25 -+#endif -+ .align 4 -+ -+$L22: -+ ADD c09, t1, c09 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ -+ ADD c09, t1, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c14, t4, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 -+ -+$L25: -+ ADD c09, t1, c09 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 -+#else -+ blbs TMP1, $L27 -+#endif -+ -+ ADD c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ unop -+ -+ ADD c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c09, t1, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L27: -+ ADD c10, t2, c10 -+ MUL a2, b1, t2 -+ ADD c13, t3, c13 -+ MUL a1, b2, t3 -+ -+ ADD c14, t4, c14 -+ MUL a2, b2, t4 -+ ADD c01, t1, c01 -+ MUL a1, b3, t1 -+ -+ ADD c02, t2, c02 -+ MUL a2, b3, t2 -+ ADD c05, t3, c05 -+ MUL a1, b4, t3 -+ -+ ADD c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c09, t1, c09 -+ ADD c10, t2, c10 -+ ADD c13, t3, c13 -+ ADD c14, t4, c14 -+ .align 4 -+ -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+ -+ SUB b1, c02, c02 -+ SUB b2, c06, c06 -+ SUB b3, c10, c10 -+ SUB b4, c14, c14 -+ -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c05, c05 -+ SUB a4, c06, c06 -+ -+ SUB b1, c09, c09 -+ SUB b2, c10, c10 -+ SUB b3, c13, c13 -+ SUB b4, c14, c14 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ MUL a1, c10, c10 -+ MUL a1, c14, c14 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ MUL a2, c10, t3 -+ MUL a2, c14, t4 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+ MUL a3, c09, c09 -+ MUL a3, c13, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ MUL a1, c09, c09 -+ MUL a1, c13, c13 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ MUL a2, c09, t3 -+ MUL a2, c13, t4 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 -+ -+ MUL a3, c02, c02 -+ MUL a3, c06, c06 -+ MUL a3, c10, c10 -+ MUL a3, c14, c14 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ -+ MUL a4, c01, t1 -+ MUL a4, c02, t2 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, c05 -+ MUL b1, c06, c06 -+ -+ MUL b2, c05, t1 -+ MUL b2, c06, t2 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ -+ MUL b3, c05, t1 -+ MUL b3, c06, t2 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ -+ MUL a2, c09, t1 -+ MUL a2, c10, t2 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ -+ MUL a3, c13, c13 -+ MUL a3, c14, c14 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, c13 -+ MUL a1, c14, c14 -+ -+ MUL a2, c13, t1 -+ MUL a2, c14, t2 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ -+ MUL a3, c13, t1 -+ MUL a3, c14, t2 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ -+ MUL a4, c13, t1 -+ MUL a4, c14, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, c09 -+ MUL b1, c10, c10 -+ -+ MUL b2, c09, t1 -+ MUL b2, c10, t2 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ -+ MUL b3, c09, t1 -+ MUL b3, c10, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+ -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c13, 6 * SIZE(AO) -+ ST c14, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+ ldi C3, -2 * SIZE(C3) -+ ldi C4, -2 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+ ldi C3, 2 * SIZE(C3) -+ ldi C4, 2 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L30: -+ sra M, 2, I -+ ble I, $L39 -+ .align 4 -+ -+$L11: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(B) -+ fclr c06 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(KK) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(B) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble KK, $L18 -+#else -+ -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c06 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(TMP1) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(BO) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble TMP1, $L18 -+#endif -+ -+ ble L, $L15 -+ .align 5 -+ -+$L12: -+/* 1 */ -+ ADD c11, t1, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif -+ -+ ADD c12, t2, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD c15, t4, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ -+/* 2 */ -+ ADD c01, t1, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD c02, t2, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP -+ -+ ADD c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+/* 3 */ -+ ADD c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD c04, t2, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+/* 4 */ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) -+ -+ ADD c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) -+ -+/* 5 */ -+ ADD c11, t1, c11 -+ unop -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c12, t2, c12 -+ ldi L, -2(L) -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD c15, t4, c15 -+ unop -+ MUL b2, a5, t4 -+ unop -+ -+/* 6 */ -+ ADD c01, t1, c01 -+ unop -+ MUL b5, a6, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ unop -+ MUL b5, a4, t2 -+ unop -+ -+ ADD c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, c05 -+ unop -+ MUL b4, a5, t4 -+ unop -+ -+/* 7 */ -+ ADD c03, t1, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop -+ -+ ADD c04, t2, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) -+ -+/* 8 */ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 -+ -+$L15: -+ ADD c11, t1, c11 -+ MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L17 -+#else -+ blbs TMP1, $L17 -+#endif -+ .align 4 -+ -+ ADD c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD c16, t3, c16 -+ MUL b2, a2, t3 -+ -+ ADD c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD c01, t1, c01 -+ MUL b1, a3, t1 -+ -+ ADD c02, t2, c02 -+ unop -+ MUL b1, a4, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c06, t3, c06 -+ MUL b2, a4, t3 -+ ADD c05, t4, c05 -+ MUL b4, a1, t4 -+ -+ ADD c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c04, t2, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a3, t1 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c11, t1, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L17: -+ ADD c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD c16, t3, c16 -+ MUL b2, a2, t3 -+ -+ ADD c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD c01, t1, c01 -+ MUL b1, a3, t1 -+ -+ ADD c02, t2, c02 -+ MUL b1, a4, t2 -+ ADD c06, t3, c06 -+ MUL b2, a4, t3 -+ -+ ADD c05, t4, c05 -+ MUL b4, a1, t4 -+ ADD c03, t1, c03 -+ MUL b3, a1, t1 -+ -+ ADD c04, t2, c04 -+ MUL b3, a2, t2 -+ ADD c08, t3, c08 -+ MUL b4, a2, t3 -+ -+ ADD c13, t4, c13 -+ MUL b2, a3, t4 -+ ADD c09, t1, c09 -+ MUL b3, a3, t1 -+ -+ ADD c10, t2, c10 -+ MUL b3, a4, t2 -+ ADD c14, t3, c14 -+ MUL b4, a4, t3 -+ -+ ADD c07, t4, c07 -+ ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c11, t1, c11 -+ ADD c12, t2, c12 -+ ADD c16, t3, c16 -+ ADD c15, t4, c15 -+ .align 4 -+ -+$L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+ -+ SUB b1, c02, c02 -+ SUB b2, c06, c06 -+ SUB b3, c10, c10 -+ SUB b4, c14, c14 -+ -+ LD a1, 8 * SIZE(BO) -+ LD a2, 9 * SIZE(BO) -+ LD a3, 10 * SIZE(BO) -+ LD a4, 11 * SIZE(BO) -+ -+ LD b1, 12 * SIZE(BO) -+ LD b2, 13 * SIZE(BO) -+ LD b3, 14 * SIZE(BO) -+ LD b4, 15 * SIZE(BO) -+ -+ SUB a1, c03, c03 -+ SUB a2, c07, c07 -+ SUB a3, c11, c11 -+ SUB a4, c15, c15 -+ -+ SUB b1, c04, c04 -+ SUB b2, c08, c08 -+ SUB b3, c12, c12 -+ SUB b4, c16, c16 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+ -+ SUB b1, c05, c05 -+ SUB b2, c06, c06 -+ SUB b3, c07, c07 -+ SUB b4, c08, c08 -+ -+ LD a1, 8 * SIZE(AO) -+ LD a2, 9 * SIZE(AO) -+ LD a3, 10 * SIZE(AO) -+ LD a4, 11 * SIZE(AO) -+ -+ LD b1, 12 * SIZE(AO) -+ LD b2, 13 * SIZE(AO) -+ LD b3, 14 * SIZE(AO) -+ LD b4, 15 * SIZE(AO) -+ -+ SUB a1, c09, c09 -+ SUB a2, c10, c10 -+ SUB a3, c11, c11 -+ SUB a4, c12, c12 -+ -+ SUB b1, c13, c13 -+ SUB b2, c14, c14 -+ SUB b3, c15, c15 -+ SUB b4, c16, c16 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, c04 -+ MUL a1, c08, c08 -+ MUL a1, c12, c12 -+ MUL a1, c16, c16 -+ -+ MUL a2, c04, t1 -+ MUL a2, c08, t2 -+ MUL a2, c12, t3 -+ MUL a2, c16, t4 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ SUB c11, t3, c11 -+ SUB c15, t4, c15 -+ -+ MUL a3, c04, t1 -+ MUL a3, c08, t2 -+ MUL a3, c12, t3 -+ MUL a3, c16, t4 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 -+ -+ MUL a4, c04, t1 -+ MUL a4, c08, t2 -+ MUL a4, c12, t3 -+ MUL a4, c16, t4 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, c03 -+ MUL b1, c07, c07 -+ MUL b1, c11, c11 -+ MUL b1, c15, c15 -+ -+ MUL b2, c03, t1 -+ MUL b2, c07, t2 -+ MUL b2, c11, t3 -+ MUL b2, c15, t4 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 -+ -+ MUL b3, c03, t1 -+ MUL b3, c07, t2 -+ MUL b3, c11, t3 -+ MUL b3, c15, t4 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ MUL a1, c10, c10 -+ MUL a1, c14, c14 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ MUL a2, c10, t3 -+ MUL a2, c14, t4 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+ MUL a3, c09, c09 -+ MUL a3, c13, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ MUL a1, c09, c09 -+ MUL a1, c13, c13 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ MUL a2, c09, t3 -+ MUL a2, c13, t4 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 -+ -+ MUL a3, c01, t1 -+ MUL a3, c05, t2 -+ MUL a3, c09, t3 -+ MUL a3, c13, t4 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ SUB c11, t3, c11 -+ SUB c15, t4, c15 -+ -+ MUL a4, c01, t1 -+ MUL a4, c05, t2 -+ MUL a4, c09, t3 -+ MUL a4, c13, t4 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ SUB c12, t3, c12 -+ SUB c16, t4, c16 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, c02 -+ MUL b1, c06, c06 -+ MUL b1, c10, c10 -+ MUL b1, c14, c14 -+ -+ MUL b2, c02, t1 -+ MUL b2, c06, t2 -+ MUL b2, c10, t3 -+ MUL b2, c14, t4 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ SUB c11, t3, c11 -+ SUB c15, t4, c15 -+ -+ MUL b3, c02, t1 -+ MUL b3, c06, t2 -+ MUL b3, c10, t3 -+ MUL b3, c14, t4 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ SUB c12, t3, c12 -+ SUB c16, t4, c16 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, c03 -+ MUL a1, c07, c07 -+ MUL a1, c11, c11 -+ MUL a1, c15, c15 -+ -+ MUL a2, c03, t1 -+ MUL a2, c07, t2 -+ MUL a2, c11, t3 -+ MUL a2, c15, t4 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ SUB c12, t3, c12 -+ SUB c16, t4, c16 -+ -+ MUL a3, c04, c04 -+ MUL a3, c08, c08 -+ MUL a3, c12, c12 -+ MUL a3, c16, c16 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ MUL a2, c03, t3 -+ MUL a2, c04, t4 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 -+ -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c03, t3 -+ MUL a3, c04, t4 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 -+ -+ MUL a4, c01, t1 -+ MUL a4, c02, t2 -+ MUL a4, c03, t3 -+ MUL a4, c04, t4 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ SUB c15, t3, c15 -+ SUB c16, t4, c16 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, c05 -+ MUL b1, c06, c06 -+ MUL b1, c07, c07 -+ MUL b1, c08, c08 -+ -+ MUL b2, c05, t1 -+ MUL b2, c06, t2 -+ MUL b2, c07, t3 -+ MUL b2, c08, t4 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 -+ -+ MUL b3, c05, t1 -+ MUL b3, c06, t2 -+ MUL b3, c07, t3 -+ MUL b3, c08, t4 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ SUB c15, t3, c15 -+ SUB c16, t4, c16 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 -+ -+ MUL a2, c09, t1 -+ MUL a2, c10, t2 -+ MUL a2, c11, t3 -+ MUL a2, c12, t4 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ SUB c15, t3, c15 -+ SUB c16, t4, c16 -+ -+ MUL a3, c13, c13 -+ MUL a3, c14, c14 -+ MUL a3, c15, c15 -+ MUL a3, c16, c16 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, c13 -+ MUL a1, c14, c14 -+ MUL a1, c15, c15 -+ MUL a1, c16, c16 -+ -+ MUL a2, c13, t1 -+ MUL a2, c14, t2 -+ MUL a2, c15, t3 -+ MUL a2, c16, t4 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 -+ -+ MUL a3, c13, t1 -+ MUL a3, c14, t2 -+ MUL a3, c15, t3 -+ MUL a3, c16, t4 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 -+ -+ MUL a4, c13, t1 -+ MUL a4, c14, t2 -+ MUL a4, c15, t3 -+ MUL a4, c16, t4 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, c09 -+ MUL b1, c10, c10 -+ MUL b1, c11, c11 -+ MUL b1, c12, c12 -+ -+ MUL b2, c09, t1 -+ MUL b2, c10, t2 -+ MUL b2, c11, t3 -+ MUL b2, c12, t4 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 -+ -+ MUL b3, c09, t1 -+ MUL b3, c10, t2 -+ MUL b3, c11, t3 -+ MUL b3, c12, t4 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ MUL a1, c07, c07 -+ MUL a1, c08, c08 -+ -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ MUL a2, c07, t3 -+ MUL a2, c08, t4 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 -+ -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+ MUL a3, c03, c03 -+ MUL a3, c04, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+ -+ ST c03, 8 * SIZE(BO) -+ ST c07, 9 * SIZE(BO) -+ ST c11, 10 * SIZE(BO) -+ ST c15, 11 * SIZE(BO) -+ -+ ST c04, 12 * SIZE(BO) -+ ST c08, 13 * SIZE(BO) -+ ST c12, 14 * SIZE(BO) -+ ST c16, 15 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+ -+ ST c09, 8 * SIZE(AO) -+ ST c10, 9 * SIZE(AO) -+ ST c11, 10 * SIZE(AO) -+ ST c12, 11 * SIZE(AO) -+ -+ ST c13, 12 * SIZE(AO) -+ ST c14, 13 * SIZE(AO) -+ ST c15, 14 * SIZE(AO) -+ ST c16, 15 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+ ldi C3, -4 * SIZE(C3) -+ ldi C4, -4 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c11, 2 * SIZE(C3) -+ ST c12, 3 * SIZE(C3) -+ -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ ST c15, 2 * SIZE(C4) -+ ST c16, 3 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+ ldi C3, 4 * SIZE(C3) -+ ldi C4, 4 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L11 -+ .align 4 -+ -+$L39: -+#ifdef LN -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 4, KK -+#endif -+ -+#ifdef RT -+ subl KK, 4, KK -+#endif -+ ldi J, -1(J) -+ bgt J, $L01 -+ .align 4 -+ -+$L40: -+ and N, 2, J -+ ble J, $L80 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ addl LDC, LDC, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ fclr t1 -+#ifndef RT -+ addl C2, LDC, C -+#endif -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ fclr t3 -+ fclr t4 -+ -+ and M, 1, I -+ ble I, $L60 -+ -+#if defined(LT) || defined(RN) -+ -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c02 -+ LD b2, 1 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ LD b3, 2 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L78 -+ -+ ble L, $L75 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c02 -+ LD b2, 1 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L78 -+ -+ ble L, $L75 -+#endif -+ .align 4 -+ -+$L72: -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ LD a1, 1 * SIZE(AO) -+ LD b2, 3 * SIZE(BO) -+ -+ ADD c02, t3, c02 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b3, t3 -+ LD b3, 4 * SIZE(BO) -+ -+ ADD c06, t4, c06 -+ MUL a2, b4, t4 -+ LD a2, 0 * SIZE(AO) -+ LD b4, 5 * SIZE(BO) -+ -+ ldi BO, 4 * SIZE(BO) -+ unop -+ unop -+ bgt L, $L72 -+ .align 4 -+ -+$L75: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L77 -+#else -+ blbs TMP1, $L77 -+#endif -+ .align 4 -+ -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ LD a1, 0 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L77: -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ ADD c02, t3, c02 -+ ADD c06, t4, c06 -+ -+ ADD c01, c02, c01 -+ ldi AO, 1 * SIZE(AO) -+ ADD c05, c06, c05 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ADD c05, t2, c05 -+ -+ .align 4 -+ -+$L78: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c05, t1, c05 -+ MUL a3, c05, c05 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a2, c05, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L60: -+ and M, 2, I -+ ble I, $L70 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L68 -+ -+ ble L, $L65 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L68 -+ -+ ble L, $L65 -+#endif -+ .align 4 -+ -+$L62: -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ ldi L, -2(L) -+ MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ unop -+ -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L62 -+ .align 4 -+ -+$L65: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L67 -+#else -+ blbs TMP1, $L67 -+#endif -+ .align 4 -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi AO, 2 * SIZE(AO) -+ .align 4 -+ -+$L67: -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ ADD c05, t3, c05 -+ MUL a1, b2, t3 -+ -+ ADD c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c05, t3, c05 -+ ADD c06, t4, c06 -+ .align 4 -+ -+$L68: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c02, c02 -+ SUB a4, c06, c06 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c05, c05 -+ SUB a4, c06, c06 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL a3, c02, c02 -+ MUL a3, c06, c06 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ -+ MUL a3, c05, c05 -+ MUL a3, c06, c06 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L70: -+ sra M, 2, I -+ ble I, $L79 -+ .align 4 -+ -+$L51: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ ldi BO, 2 * SIZE(B) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble KK, $L58 -+ -+ ble L, $L55 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ ldi BO, 2 * SIZE(BO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble TMP1, $L58 -+ -+ ble L, $L55 -+#endif -+ .align 4 -+ -+$L52: -+ ADD c05, t1, c05 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c06, t2, c06 -+ ldi L, -2(L) -+ MUL a2, b1, t2 -+ unop -+ -+ ADD c07, t3, c07 -+ unop -+ MUL a3, b1, t3 -+ unop -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ unop -+ MUL a4, b2, t4 -+ LD a5, 3 * SIZE(AO) -+ -+ ADD c05, t1, c05 -+ unop -+ MUL a1, b3, t1 -+ LD b2, -1 * SIZE(BO) -+ -+ ADD c06, t2, c06 -+ unop -+ MUL a2, b3, t2 -+ unop -+ -+ ADD c07, t3, c07 -+ unop -+ MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b4, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L52 -+ .align 4 -+ -+$L55: -+ ADD c05, t1, c05 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L57 -+#else -+ blbs TMP1, $L57 -+#endif -+ .align 4 -+ -+ ADD c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD c07, t3, c07 -+ MUL a3, b1, t3 -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c05, t1, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L57: -+ ADD c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD c07, t3, c07 -+ MUL a3, b1, t3 -+ -+ ADD c08, t4, c08 -+ MUL a4, b1, t4 -+ ADD c01, t1, c01 -+ MUL a1, b2, t1 -+ -+ ADD c02, t2, c02 -+ MUL a2, b2, t2 -+ ADD c03, t3, c03 -+ MUL a3, b2, t3 -+ -+ ADD c04, t4, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c05, t1, c05 -+ ADD c06, t2, c06 -+ ADD c07, t3, c07 -+ ADD c08, t4, c08 -+ .align 4 -+ -+$L58: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c02, c02 -+ SUB a4, c06, c06 -+ -+ SUB b1, c03, c03 -+ SUB b2, c07, c07 -+ SUB b3, c04, c04 -+ SUB b4, c08, c08 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+ -+ SUB b1, c05, c05 -+ SUB b2, c06, c06 -+ SUB b3, c07, c07 -+ SUB b4, c08, c08 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, c04 -+ MUL a1, c08, c08 -+ -+ MUL a2, c04, t1 -+ MUL a2, c08, t2 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ -+ MUL a3, c04, t1 -+ MUL a3, c08, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL a4, c04, t1 -+ MUL a4, c08, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, c03 -+ MUL b1, c07, c07 -+ -+ MUL b2, c03, t1 -+ MUL b2, c07, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL b3, c03, t1 -+ MUL b3, c07, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL a3, c01, t1 -+ MUL a3, c05, t2 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ -+ MUL a4, c01, t1 -+ MUL a4, c05, t2 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, c02 -+ MUL b1, c06, c06 -+ -+ MUL b2, c02, t1 -+ MUL b2, c06, t2 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ -+ MUL b3, c02, t1 -+ MUL b3, c06, t2 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, c03 -+ MUL a1, c07, c07 -+ -+ MUL a2, c03, t1 -+ MUL a2, c07, t2 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ -+ MUL a3, c04, c04 -+ MUL a3, c08, c08 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ MUL a2, c03, t3 -+ MUL a2, c04, t4 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 -+ -+ MUL a3, c05, c05 -+ MUL a3, c06, c06 -+ MUL a3, c07, c07 -+ MUL a3, c08, c08 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ MUL a1, c07, c07 -+ MUL a1, c08, c08 -+ -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ MUL a2, c07, t3 -+ MUL a2, c08, t4 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 -+ -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+ MUL a3, c03, c03 -+ MUL a3, c04, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+ -+ ST c03, 4 * SIZE(BO) -+ ST c07, 5 * SIZE(BO) -+ ST c04, 6 * SIZE(BO) -+ ST c08, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L51 -+ .align 4 -+ -+$L79: -+#ifdef LN -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 2, KK -+#endif -+ -+#ifdef RT -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L80: -+ and N, 1, J -+ ble J, $L999 -+ -+#ifdef RT -+ sll K, BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ subl C, LDC, C -+#endif -+ -+ mov C, C1 -+#ifndef RT -+ addl C, LDC, C -+#endif -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ and M, 1, I -+ ble I, $L100 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ unop -+ ble L, $L115 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L115 -+#endif -+ .align 4 -+ -+$L112: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b2, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c03, t3, c03 -+ MUL a3, b3, t3 -+ LD a3, 6 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b4, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b4, 7 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 4 * SIZE(AO) -+ ldi BO, 4 * SIZE(BO) -+ bgt L, $L112 -+ .align 4 -+ -+$L115: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L118 -+ .align 4 -+ -+$L116: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+ LD a1, 1 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 1 * SIZE(AO) -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L116 -+ .align 4 -+ -+$L118: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+ ADD c01, c02, c01 -+ ADD c03, c04, c03 -+ ADD c01, c03, c01 -+ -+#if defined(LN) || defined(RT) -+ subl KK, 1, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+#else -+ LD a1, 0 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 1 * SIZE(C1) -+#endif -+ -+#ifdef RT -+ SXADDQ K, AORIG, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L100: -+ and M, 2, I -+ ble I, $L110 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L105 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ ble L, $L105 -+#endif -+ .align 5 -+ -+$L102: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c03, t3, c03 -+ ldi BO, 4 * SIZE(BO) -+ MUL a3, b2, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a5, 7 * SIZE(AO) -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ MUL a1, b3, t1 -+ LD a1, 8 * SIZE(AO) -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L102 -+ .align 4 -+ -+$L105: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L108 -+ .align 4 -+ -+$L106: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ LD a2, 3 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi AO, 2 * SIZE(AO) -+ unop -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L106 -+ .align 4 -+ -+$L108: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+ ADD c01, c03, c01 -+ ADD c02, c04, c02 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a2, c02, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c02, t1, c02 -+ MUL a3, c02, c02 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L110: -+ sra M, 2, I -+ ble I, $L119 -+ .align 4 -+ -+$L91: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L95 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L95 -+#endif -+ .align 5 -+ -+$L92: -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi L, -1(L) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 8 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 9 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 10 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 11 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ LD a1, 12 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD a2, 13 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b3, t3 -+ LD a3, 14 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b3, t4 -+ LD a5, 15 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ MUL a1, b4, t1 -+ LD a1, 16 * SIZE(AO) -+ ldi AO, 16 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b4, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L92 -+ .align 4 -+ -+$L95: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ unop -+ ble L, $L98 -+ .align 4 -+ -+$L96: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 1 * SIZE(BO) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ldi AO, 4 * SIZE(AO) -+ bgt L, $L96 -+ .align 4 -+ -+$L98: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, c04 -+ MUL a2, c04, t1 -+ SUB c03, t1, c03 -+ MUL a3, c04, t1 -+ SUB c02, t1, c02 -+ MUL a4, c04, t1 -+ SUB c01, t1, c01 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, c03 -+ MUL b2, c03, t1 -+ SUB c02, t1, c02 -+ MUL b3, c03, t1 -+ SUB c01, t1, c01 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a2, c02, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c02, t1, c02 -+ MUL a3, c01, t1 -+ SUB c03, t1, c03 -+ MUL a4, c01, t1 -+ SUB c04, t1, c04 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, c02 -+ MUL b2, c02, t1 -+ SUB c03, t1, c03 -+ MUL b3, c02, t1 -+ SUB c04, t1, c04 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, c03 -+ MUL a2, c03, t1 -+ SUB c04, t1, c04 -+ MUL a3, c04, c04 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ bgt I, $L91 -+ .align 4 -+ -+$L119: -+#ifdef LN -+ SXADDQ K, B, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 1, KK -+#endif -+ -+#ifdef RT -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S b/kernel/sw_64/trsm_kernel_4x4_LT.S -new file mode 100644 -index 0000000..54f8a51 ---- /dev/null -+++ b/kernel/sw_64/trsm_kernel_4x4_LT.S -@@ -0,0 +1,5145 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#if !defined(EV4) && !defined(EV5) && !defined(SW6) -+#error "Architecture is not specified." -+#endif -+ -+#ifdef SW6 -+#define PREFETCHSIZE 56 -+#define UNOP unop -+#endif -+ -+#ifdef EV5 -+#define PREFETCHSIZE 56 -+#define UNOP -+#endif -+ -+#ifdef EV4 -+#define UNOP -+#endif -+ -+#define STACKSIZE 88 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $20 -+#define B $21 -+#define C $22 -+#define LDC $23 -+ -+#define C1 $19 -+#define C2 $24 -+#define C3 $25 -+#define C4 $27 -+ -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define tmp $9 -+ -+#define alpha $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 -+ -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 -+ -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 -+ -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 -+ -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define AORIG $3 -+#define OFFSET $4 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl C, 0 + STACKSIZE($sp) -+ ldl LDC, 8 + STACKSIZE($sp) -+ ldl OFFSET, 16 + STACKSIZE($sp) -+ -+ SXADDQ LDC, 0, LDC -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ stl $9, 64($sp) -+ -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 -+ -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 -+ -+#ifdef LN -+ mulq M, K, TMP1 -+ SXADDQ TMP1, A, A -+ SXADDQ M, C, C -+#endif -+ -+#ifdef RN -+ negl OFFSET, KK -+#endif -+ -+#ifdef RT -+ mulq N, K, TMP1 -+ SXADDQ TMP1, B, B -+ -+ mulq N, LDC, TMP1 -+ addl TMP1, C, C -+ -+ subl N, OFFSET, KK -+#endif -+ -+ sra N, 2, J -+ ble J, $L40 -+ .align 4 -+ -+$L01: -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ s4addl LDC, 0, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C3 -+#ifndef RT -+ s4addl LDC, C, C -+#endif -+ -+ fclr t1 -+ addl C3, LDC, C4 -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L20 -+ .align 4 -+ -+$L11: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(B) -+ fclr c06 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(KK) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(B) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ flds $f31, 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble KK, $L18 -+#else -+ -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c06 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(TMP1) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(BO) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble TMP1, $L18 -+#endif -+ -+ ble L, $L15 -+ .align 5 -+ -+$L12: -+/* 1 */ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ FIMOVD b5, tmp -+/* 2 */ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+/* 3 */ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+/* 4 */ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) -+ -+/* 5 */ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ ldi L, -2(L) -+ IFMOVD tmp, b5 -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a5, t4 -+ unop -+ -+/* 6 */ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a6, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a4, t2 -+ unop -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a5, t4 -+ unop -+ -+/* 7 */ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) -+ -+/* 8 */ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 -+ -+$L15: -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ MUL b1, a1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L17 -+#else -+ blbs TMP1, $L17 -+#endif -+ .align 4 -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, b5 -+ fmov b5, t2 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, b5 -+ fmov b5, t3 -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL b1, a4, b5 -+ fmov b5, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, b5 -+ fmov b5, t3 -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, b5 -+ fmov b5, t4 -+ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, b5 -+ fmov b5, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, b5 -+ fmov b5, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, b5 -+ fmov b5, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, b5 -+ fmov b5, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, b5 -+ fmov b5, t1 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, b5 -+ fmov b5, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, b5 -+ fmov b5, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, b5 -+ fmov b5, t4 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L17: -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, b5 -+ fmov b5, t2 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, b5 -+ fmov b5, t3 -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL b1, a4, b5 -+ fmov b5, t2 -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, b5 -+ fmov b5, t3 -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, b5 -+ fmov b5, t4 -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ MUL b3, a1, b5 -+ fmov b5, t1 -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ MUL b3, a2, b5 -+ fmov b5, t2 -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ MUL b4, a2, b5 -+ fmov b5, t3 -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL b2, a3, b5 -+ fmov b5, t4 -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ MUL b3, a3, b5 -+ fmov b5, t1 -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ MUL b3, a4, b5 -+ fmov b5, t2 -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ MUL b4, a4, b5 -+ fmov b5, t3 -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, b5 -+ fmov b5, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ .align 4 -+ -+$L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+ -+ SUB b1, c02, b5 -+ fmov b5, c02 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c10, b5 -+ fmov b5, c10 -+ SUB b4, c14, b5 -+ fmov b5, c14 -+ -+ LD a1, 8 * SIZE(BO) -+ LD a2, 9 * SIZE(BO) -+ LD a3, 10 * SIZE(BO) -+ LD a4, 11 * SIZE(BO) -+ -+ LD b1, 12 * SIZE(BO) -+ LD b2, 13 * SIZE(BO) -+ LD b3, 14 * SIZE(BO) -+ LD b4, 15 * SIZE(BO) -+ -+ SUB a1, c03, b5 -+ fmov b5, c03 -+ SUB a2, c07, b5 -+ fmov b5, c07 -+ SUB a3, c11, b5 -+ fmov b5, c11 -+ SUB a4, c15, b5 -+ fmov b5, c15 -+ -+ SUB b1, c04, b5 -+ fmov b5, c04 -+ SUB b2, c08, b5 -+ fmov b5, c08 -+ SUB b3, c12, b5 -+ fmov b5, c12 -+ SUB b4, c16, b5 -+ fmov b5, c16 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+ -+ SUB b1, c05, b5 -+ fmov b5, c05 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c07, b5 -+ fmov b5, c07 -+ SUB b4, c08, b5 -+ fmov b5, c08 -+ -+ LD a1, 8 * SIZE(AO) -+ LD a2, 9 * SIZE(AO) -+ LD a3, 10 * SIZE(AO) -+ LD a4, 11 * SIZE(AO) -+ -+ LD b1, 12 * SIZE(AO) -+ LD b2, 13 * SIZE(AO) -+ LD b3, 14 * SIZE(AO) -+ LD b4, 15 * SIZE(AO) -+ -+ SUB a1, c09, b5 -+ fmov b5, c09 -+ SUB a2, c10, b5 -+ fmov b5, c10 -+ SUB a3, c11, b5 -+ fmov b5, c11 -+ SUB a4, c12, b5 -+ fmov b5, c12 -+ -+ SUB b1, c13, b5 -+ fmov b5, c13 -+ SUB b2, c14, b5 -+ fmov b5, c14 -+ SUB b3, c15, b5 -+ fmov b5, c15 -+ SUB b4, c16, b5 -+ fmov b5, c16 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ MUL a1, c16, b5 -+ fmov b5, c16 -+ -+ MUL a2, c04, b5 -+ fmov b5, t1 -+ MUL a2, c08, b5 -+ fmov b5, t2 -+ MUL a2, c12, b5 -+ fmov b5, t3 -+ MUL a2, c16, b5 -+ fmov b5, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c15, t4, b5 -+ fmov b5, c15 -+ -+ MUL a3, c04, b5 -+ fmov b5, t1 -+ MUL a3, c08, b5 -+ fmov b5, t2 -+ MUL a3, c12, b5 -+ fmov b5, t3 -+ MUL a3, c16, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL a4, c04, b5 -+ fmov b5, t1 -+ MUL a4, c08, b5 -+ fmov b5, t2 -+ MUL a4, c12, b5 -+ fmov b5, t3 -+ MUL a4, c16, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, b5 -+ fmov b5, c03 -+ MUL b1, c07, b5 -+ fmov b5, c07 -+ MUL b1, c11, b5 -+ fmov b5, c11 -+ MUL b1, c15, b5 -+ fmov b5, c15 -+ -+ MUL b2, c03, b5 -+ fmov b5, t1 -+ MUL b2, c07, b5 -+ fmov b5, t2 -+ MUL b2, c11, b5 -+ fmov b5, t3 -+ MUL b2, c15, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL b3, c03, b5 -+ fmov b5, t1 -+ MUL b3, c07, b5 -+ fmov b5, t2 -+ MUL b3, c11, b5 -+ fmov b5, t3 -+ MUL b3, c15, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c10, b5 -+ fmov b5, t3 -+ MUL a2, c14, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c09, b5 -+ fmov b5, c09 -+ MUL a3, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ MUL a2, c09, b5 -+ fmov b5, t3 -+ MUL a2, c13, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c05, b5 -+ fmov b5, t2 -+ MUL a3, c09, b5 -+ fmov b5, t3 -+ MUL a3, c13, b5 -+ fmov b5, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c15, t4, b5 -+ fmov b5, c15 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c05, b5 -+ fmov b5, t2 -+ MUL a4, c09, b5 -+ fmov b5, t3 -+ MUL a4, c13, b5 -+ fmov b5, t4 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ SUB c12, t3, b5 -+ fmov b5, c12 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, b5 -+ fmov b5, c02 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ MUL b1, c10, b5 -+ fmov b5, c10 -+ MUL b1, c14, b5 -+ fmov b5, c14 -+ -+ MUL b2, c02, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ MUL b2, c10, b5 -+ fmov b5, t3 -+ MUL b2, c14, b5 -+ fmov b5, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c15, t4, b5 -+ fmov b5, c15 -+ -+ MUL b3, c02, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ MUL b3, c10, b5 -+ fmov b5, t3 -+ MUL b3, c14, b5 -+ fmov b5, t4 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ SUB c12, t3, b5 -+ fmov b5, c12 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c15, b5 -+ fmov b5, c15 -+ -+ MUL a2, c03, b5 -+ fmov b5, t1 -+ MUL a2, c07, b5 -+ fmov b5, t2 -+ MUL a2, c11, b5 -+ fmov b5, t3 -+ MUL a2, c15, b5 -+ fmov b5, t4 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ SUB c12, t3, b5 -+ fmov b5, c12 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ MUL a3, c04, b5 -+ fmov b5, c04 -+ MUL a3, c08, b5 -+ fmov b5, c08 -+ MUL a3, c12, b5 -+ fmov b5, c12 -+ MUL a3, c16, b5 -+ fmov b5, c16 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ MUL a2, c03, b5 -+ fmov b5, t3 -+ MUL a2, c04, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c02, b5 -+ fmov b5, t2 -+ MUL a3, c03, b5 -+ fmov b5, t3 -+ MUL a3, c04, b5 -+ fmov b5, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c02, b5 -+ fmov b5, t2 -+ MUL a4, c03, b5 -+ fmov b5, t3 -+ MUL a4, c04, b5 -+ fmov b5, t4 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ SUB c15, t3, b5 -+ fmov b5, c15 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, b5 -+ fmov b5, c05 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ MUL b1, c07, b5 -+ fmov b5, c07 -+ MUL b1, c08, b5 -+ fmov b5, c08 -+ -+ MUL b2, c05, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ MUL b2, c07, b5 -+ fmov b5, t3 -+ MUL b2, c08, b5 -+ fmov b5, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL b3, c05, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ MUL b3, c07, b5 -+ fmov b5, t3 -+ MUL b3, c08, b5 -+ fmov b5, t4 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ SUB c15, t3, b5 -+ fmov b5, c15 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ MUL a2, c09, b5 -+ fmov b5, t1 -+ MUL a2, c10, b5 -+ fmov b5, t2 -+ MUL a2, c11, b5 -+ fmov b5, t3 -+ MUL a2, c12, b5 -+ fmov b5, t4 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ SUB c15, t3, b5 -+ fmov b5, c15 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ MUL a3, c13, b5 -+ fmov b5, c13 -+ MUL a3, c14, b5 -+ fmov b5, c14 -+ MUL a3, c15, b5 -+ fmov b5, c15 -+ MUL a3, c16, b5 -+ fmov b5, c16 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ MUL a1, c15, b5 -+ fmov b5, c15 -+ MUL a1, c16, b5 -+ fmov b5, c16 -+ -+ MUL a2, c13, b5 -+ fmov b5, t1 -+ MUL a2, c14, b5 -+ fmov b5, t2 -+ MUL a2, c15, b5 -+ fmov b5, t3 -+ MUL a2, c16, b5 -+ fmov b5, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a3, c13, b5 -+ fmov b5, t1 -+ MUL a3, c14, b5 -+ fmov b5, t2 -+ MUL a3, c15, b5 -+ fmov b5, t3 -+ MUL a3, c16, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL a4, c13, b5 -+ fmov b5, t1 -+ MUL a4, c14, b5 -+ fmov b5, t2 -+ MUL a4, c15, b5 -+ fmov b5, t3 -+ MUL a4, c16, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, b5 -+ fmov b5, c09 -+ MUL b1, c10, b5 -+ fmov b5, c10 -+ MUL b1, c11, b5 -+ fmov b5, c11 -+ MUL b1, c12, b5 -+ fmov b5, c12 -+ -+ MUL b2, c09, b5 -+ fmov b5, t1 -+ MUL b2, c10, b5 -+ fmov b5, t2 -+ MUL b2, c11, b5 -+ fmov b5, t3 -+ MUL b2, c12, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL b3, c09, b5 -+ fmov b5, t1 -+ MUL b3, c10, b5 -+ fmov b5, t2 -+ MUL b3, c11, b5 -+ fmov b5, t3 -+ MUL b3, c12, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c07, b5 -+ fmov b5, t3 -+ MUL a2, c08, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c03, b5 -+ fmov b5, c03 -+ MUL a3, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+ -+ ST c03, 8 * SIZE(BO) -+ ST c07, 9 * SIZE(BO) -+ ST c11, 10 * SIZE(BO) -+ ST c15, 11 * SIZE(BO) -+ -+ ST c04, 12 * SIZE(BO) -+ ST c08, 13 * SIZE(BO) -+ ST c12, 14 * SIZE(BO) -+ ST c16, 15 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+ -+ ST c09, 8 * SIZE(AO) -+ ST c10, 9 * SIZE(AO) -+ ST c11, 10 * SIZE(AO) -+ ST c12, 11 * SIZE(AO) -+ -+ ST c13, 12 * SIZE(AO) -+ ST c14, 13 * SIZE(AO) -+ ST c15, 14 * SIZE(AO) -+ ST c16, 15 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+ ldi C3, -4 * SIZE(C3) -+ ldi C4, -4 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c11, 2 * SIZE(C3) -+ ST c12, 3 * SIZE(C3) -+ -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ ST c15, 2 * SIZE(C4) -+ ST c16, 3 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+ ldi C3, 4 * SIZE(C3) -+ ldi C4, 4 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L11 -+ .align 4 -+ -+$L20: -+ and M, 2, I -+ ble I, $L30 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c01 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(B) -+ fclr c02 -+ fclr c06 -+ ble KK, $L28 -+ -+ ble L, $L25 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c01 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(BO) -+ fclr c02 -+ fclr c06 -+ ble TMP1, $L28 -+ -+ ble L, $L25 -+#endif -+ .align 4 -+ -+$L22: -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ FIMOVD b5, tmp -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ IFMOVD tmp, b5 -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ IFMOVD tmp, b5 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 -+ -+$L25: -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 -+#else -+ blbs TMP1, $L27 -+#endif -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ unop -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, b5 -+ fmov b5, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b4, b5 -+ fmov b5, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L27: -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b4, b5 -+ fmov b5, t3 -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, b5 -+ fmov b5, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ .align 4 -+ -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+ -+ SUB b1, c02, b5 -+ fmov b5, c02 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c10, b5 -+ fmov b5, c10 -+ SUB b4, c14, b5 -+ fmov b5, c14 -+ -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c05, b5 -+ fmov b5, c05 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+ -+ SUB b1, c09, b5 -+ fmov b5, c09 -+ SUB b2, c10, b5 -+ fmov b5, c10 -+ SUB b3, c13, b5 -+ fmov b5, c13 -+ SUB b4, c14, b5 -+ fmov b5, c14 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c10, b5 -+ fmov b5, t3 -+ MUL a2, c14, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c09, b5 -+ fmov b5, c09 -+ MUL a3, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ MUL a2, c09, b5 -+ fmov b5, t3 -+ MUL a2, c13, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+ MUL a3, c10, b5 -+ fmov b5, c10 -+ MUL a3, c14, b5 -+ fmov b5, c14 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c02, b5 -+ fmov b5, t2 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c02, b5 -+ fmov b5, t2 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, b5 -+ fmov b5, c05 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ -+ MUL b2, c05, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL b3, c05, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ MUL a2, c09, b5 -+ fmov b5, t1 -+ MUL a2, c10, b5 -+ fmov b5, t2 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ -+ MUL a3, c13, b5 -+ fmov b5, c13 -+ MUL a3, c14, b5 -+ fmov b5, c14 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ -+ MUL a2, c13, b5 -+ fmov b5, t1 -+ MUL a2, c14, b5 -+ fmov b5, t2 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL a3, c13, b5 -+ fmov b5, t1 -+ MUL a3, c14, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a4, c13, b5 -+ fmov b5, t1 -+ MUL a4, c14, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, b5 -+ fmov b5, c09 -+ MUL b1, c10, b5 -+ fmov b5, c10 -+ -+ MUL b2, c09, b5 -+ fmov b5, t1 -+ MUL b2, c10, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL b3, c09, b5 -+ fmov b5, t1 -+ MUL b3, c10, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+ -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c13, 6 * SIZE(AO) -+ ST c14, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+ ldi C3, -2 * SIZE(C3) -+ ldi C4, -2 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+ ldi C3, 2 * SIZE(C3) -+ ldi C4, 2 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L30: -+ and M, 1, I -+ ble I, $L39 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c09 -+ LD b4, 3 * SIZE(B) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(B) -+ ble KK, $L38 -+ -+ ble L, $L35 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c09 -+ LD b4, 3 * SIZE(BO) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(BO) -+ ble TMP1, $L38 -+ -+ ble L, $L35 -+#endif -+ .align 4 -+ -+$L32: -+ ADD c01, t1, b5 -+ fmov b5,c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ ldi AO, 2 * SIZE(AO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ LD b5, 3 * SIZE(BO) -+ FIMOVD b5, tmp -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL a1, b4, t4 -+ LD a1, -1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a2, b1, t1 -+ LD b1, 4 * SIZE(BO) -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a2, b2, t2 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ LD b4, -1 * SIZE(BO) -+ MUL a2, b3, t3 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ IFMOVD tmp, b5 -+ MUL a2, b5, t4 -+ LD a2, 0 * SIZE(AO) -+ bgt L, $L32 -+ .align 4 -+ -+$L35: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L37 -+#else -+ blbs TMP1, $L37 -+#endif -+ .align 4 -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ LD b1, 0 * SIZE(BO) -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ MUL a1, b3, b5 -+ fmov b5, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL a1, b4, b5 -+ fmov b5, t4 -+ LD a1, 0 * SIZE(AO) -+ ldi AO, 1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L37: -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ MUL a1, b3, b5 -+ fmov b5, t3 -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b4, b5 -+ fmov b5, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ -+$L38: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, b5 -+ fmov b5, c05 -+ MUL b2, c05, b5 -+ fmov b5, t1 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ MUL b3, c05, b5 -+ fmov b5, t1 -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a2, c09, b5 -+ fmov b5, t1 -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ MUL a3, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ MUL a2, c13, b5 -+ fmov b5, t1 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ MUL a3, c13, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL a4, c13, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, b5 -+ fmov b5, c09 -+ MUL b2, c09, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL b3, c09, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c13, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+ ldi C3, -1 * SIZE(C3) -+ ldi C4, -1 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c09, 0 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L39: -+#ifdef LN -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 4, KK -+#endif -+ -+#ifdef RT -+ subl KK, 4, KK -+#endif -+ ldi J, -1(J) -+ bgt J, $L01 -+ .align 4 -+ -+$L40: -+ and N, 2, J -+ ble J, $L80 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ addl LDC, LDC, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ fclr t1 -+#ifndef RT -+ addl C2, LDC, C -+#endif -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L60 -+ .align 4 -+ -+$L51: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ ldi BO, 2 * SIZE(B) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble KK, $L58 -+ -+ ble L, $L55 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ ldi BO, 2 * SIZE(BO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble TMP1, $L58 -+ -+ ble L, $L55 -+#endif -+ .align 4 -+ -+$L52: -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ unop -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ ldi L, -2(L) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ unop -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ unop -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ unop -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a5, 3 * SIZE(AO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ LD b2, -1 * SIZE(BO) -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ unop -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b3, b5 -+ fmov b5, t3 -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a5, b3, b5 -+ fmov b5, t4 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b4, b5 -+ fmov b5, t1 -+ LD a1, -4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b4, b5 -+ fmov b5, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, b5 -+ fmov b5, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L52 -+ .align 4 -+ -+$L55: -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L57 -+#else -+ blbs TMP1, $L57 -+#endif -+ .align 4 -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L57: -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ .align 4 -+ -+$L58: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c02, b5 -+ fmov b5, c02 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+ -+ SUB b1, c03, b5 -+ fmov b5, c03 -+ SUB b2, c07, b5 -+ fmov b5, c07 -+ SUB b3, c04, b5 -+ fmov b5, c04 -+ SUB b4, c08, b5 -+ fmov b5, c08 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+ -+ SUB b1, c05, b5 -+ fmov b5, c05 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c07, b5 -+ fmov b5, c07 -+ SUB b4, c08, b5 -+ fmov b5, c08 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ -+ MUL a2, c04, b5 -+ fmov b5, t1 -+ MUL a2, c08, b5 -+ fmov b5, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ -+ MUL a3, c04, b5 -+ fmov b5, t1 -+ MUL a3, c08, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a4, c04, b5 -+ fmov b5, t1 -+ MUL a4, c08, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, b5 -+ fmov b5, c03 -+ MUL b1, c07, b5 -+ fmov b5, c07 -+ -+ MUL b2, c03, b5 -+ fmov b5, t1 -+ MUL b2, c07, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL b3, c03, b5 -+ fmov b5, t1 -+ MUL b3, c07, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c05, b5 -+ fmov b5, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c05, b5 -+ fmov b5, t2 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, b5 -+ fmov b5, c02 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ -+ MUL b2, c02, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ -+ MUL b3, c02, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ -+ MUL a2, c03, b5 -+ fmov b5, t1 -+ MUL a2, c07, b5 -+ fmov b5, t2 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ -+ MUL a3, c04, b5 -+ fmov b5, c04 -+ MUL a3, c08, b5 -+ fmov b5, c08 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ MUL a2, c03, b5 -+ fmov b5, t3 -+ MUL a2, c04, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+ MUL a3, c07, b5 -+ fmov b5, c07 -+ MUL a3, c08, b5 -+ fmov b5, c08 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c07, b5 -+ fmov b5, t3 -+ MUL a2, c08, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c03, b5 -+ fmov b5, c03 -+ MUL a3, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+ -+ ST c03, 4 * SIZE(BO) -+ ST c07, 5 * SIZE(BO) -+ ST c04, 6 * SIZE(BO) -+ ST c08, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L51 -+ .align 4 -+ -+$L60: -+ and M, 2, I -+ ble I, $L70 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L68 -+ -+ ble L, $L65 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L68 -+ -+ ble L, $L65 -+#endif -+ .align 4 -+ -+$L62: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ldi L, -2(L) -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a4, b3, b5 -+ fmov b5, t2 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a4, b4, b5 -+ fmov b5, t4 -+ LD b4, 1 * SIZE(BO) -+ unop -+ -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L62 -+ .align 4 -+ -+$L65: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L67 -+#else -+ blbs TMP1, $L67 -+#endif -+ .align 4 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi AO, 2 * SIZE(AO) -+ .align 4 -+ -+$L67: -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ .align 4 -+ -+$L68: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c02, b5 -+ fmov b5, c02 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c05, b5 -+ fmov b5, c05 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L70: -+ and M, 1, I -+ ble I, $L79 -+ -+#if defined(LT) || defined(RN) -+ -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c02 -+ LD b2, 1 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ LD b3, 2 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L78 -+ -+ ble L, $L75 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c02 -+ LD b2, 1 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L78 -+ -+ ble L, $L75 -+#endif -+ .align 4 -+ -+$L72: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ LD a1, 1 * SIZE(AO) -+ LD b2, 3 * SIZE(BO) -+ -+ ADD c02, t3, b5 -+ fmov b5, c02 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b3, b5 -+ fmov b5, t3 -+ LD b3, 4 * SIZE(BO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, b5 -+ fmov b5, t4 -+ LD a2, 0 * SIZE(AO) -+ LD b4, 5 * SIZE(BO) -+ -+ ldi BO, 4 * SIZE(BO) -+ unop -+ unop -+ bgt L, $L72 -+ .align 4 -+ -+$L75: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L77 -+#else -+ blbs TMP1, $L77 -+#endif -+ .align 4 -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ LD a1, 0 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L77: -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ ADD c02, t3, b5 -+ fmov b5, c02 -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ -+ ADD c01, c02, b5 -+ fmov b5, c01 -+ ldi AO, 1 * SIZE(AO) -+ ADD c05, c06, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ -+ .align 4 -+ -+$L78: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L79: -+#ifdef LN -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 2, KK -+#endif -+ -+#ifdef RT -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L80: -+ and N, 1, J -+ ble J, $L999 -+ -+#ifdef RT -+ sll K, BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ subl C, LDC, C -+#endif -+ -+ mov C, C1 -+#ifndef RT -+ addl C, LDC, C -+#endif -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ ble I, $L100 -+ .align 4 -+ -+$L91: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L95 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L95 -+#endif -+ .align 5 -+ -+$L92: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi L, -1(L) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ LD a1, 8 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 9 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 10 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a4, 11 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ LD a1, 12 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ LD a2, 13 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b3, b5 -+ fmov b5, t3 -+ LD a3, 14 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b3, b5 -+ fmov b5, t4 -+ LD a5, 15 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b4, b5 -+ fmov b5, t1 -+ LD a1, 16 * SIZE(AO) -+ ldi AO, 16 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b4, b5 -+ fmov b5, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, b5 -+ fmov b5, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L92 -+ .align 4 -+ -+$L95: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ unop -+ ble L, $L98 -+ .align 4 -+ -+$L96: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 1 * SIZE(BO) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ldi AO, 4 * SIZE(AO) -+ bgt L, $L96 -+ .align 4 -+ -+$L98: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a2, c04, b5 -+ fmov b5, t1 -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ MUL a3, c04, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL a4, c04, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, b5 -+ fmov b5, c03 -+ MUL b2, c03, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL b3, c03, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, b5 -+ fmov b5, c02 -+ MUL b2, c02, b5 -+ fmov b5, t1 -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ MUL b3, c02, b5 -+ fmov b5, t1 -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a2, c03, b5 -+ fmov b5, t1 -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ MUL a3, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ bgt I, $L91 -+ .align 4 -+ -+$L100: -+ and M, 2, I -+ ble I, $L110 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L105 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ ble L, $L105 -+#endif -+ .align 5 -+ -+$L102: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ldi BO, 4 * SIZE(BO) -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a5, 7 * SIZE(AO) -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ LD a1, 8 * SIZE(AO) -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ LD b3, 2 * SIZE(BO) -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, b5 -+ fmov b5, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L102 -+ .align 4 -+ -+$L105: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L108 -+ .align 4 -+ -+$L106: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 3 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi AO, 2 * SIZE(AO) -+ unop -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L106 -+ .align 4 -+ -+$L108: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+ ADD c01, c03, b5 -+ fmov b5, c01 -+ ADD c02, c04, b5 -+ fmov b5, c02 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L110: -+ and M, 1, I -+ ble I, $L119 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ unop -+ ble L, $L115 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L115 -+#endif -+ .align 4 -+ -+$L112: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b3, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b4, b5 -+ fmov b5, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b4, 7 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 4 * SIZE(AO) -+ ldi BO, 4 * SIZE(BO) -+ bgt L, $L112 -+ .align 4 -+ -+$L115: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L118 -+ .align 4 -+ -+$L116: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 1 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 1 * SIZE(AO) -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L116 -+ .align 4 -+ -+$L118: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+ ADD c01, c02, b5 -+ fmov b5, c01 -+ ADD c03, c04, b5 -+ fmov b5, c03 -+ ADD c01, c03, b5 -+ fmov b5, c01 -+ -+#if defined(LN) || defined(RT) -+ subl KK, 1, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+#else -+ LD a1, 0 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 1 * SIZE(C1) -+#endif -+ -+#ifdef RT -+ SXADDQ K, AORIG, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L119: -+#ifdef LN -+ SXADDQ K, B, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 1, KK -+#endif -+ -+#ifdef RT -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldl $9, 64($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S.bak b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak -new file mode 100644 -index 0000000..86136ae ---- /dev/null -+++ b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak -@@ -0,0 +1,4072 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+ -+#if !defined(SW2B) -+#error "Architecture is not specified." -+#endif -+ -+#ifdef SW2B -+#define PREFETCHSIZE 56 -+#define UNOP nop -+#endif -+ -+#ifdef EV6 -+#define PREFETCHSIZE 56 -+#define UNOP unop -+#endif -+ -+#ifdef EV5 -+#define PREFETCHSIZE 56 -+#define UNOP -+#endif -+ -+#ifdef EV4 -+#define UNOP -+#endif -+ -+#define STACKSIZE 80 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $20 -+#define B $21 -+#define C $22 -+#define LDC $23 -+ -+#define C1 $19 -+#define C2 $24 -+#define C3 $25 -+#define C4 $27 -+ -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define alpha $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 -+ -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 -+ -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 -+ -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 -+ -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define AORIG $3 -+#define OFFSET $4 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl C, 0 + STACKSIZE($sp) -+ ldl LDC, 8 + STACKSIZE($sp) -+ ldl OFFSET, 16 + STACKSIZE($sp) -+ -+ SXADDQ LDC, 0, LDC -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 -+ -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 -+ -+#ifdef LN -+ mull M, K, TMP1 -+ SXADDQ TMP1, A, A -+ SXADDQ M, C, C -+#endif -+ -+#ifdef RN -+ negl OFFSET, KK -+#endif -+ -+#ifdef RT -+ mull N, K, TMP1 -+ SXADDQ TMP1, B, B -+ -+ mull N, LDC, TMP1 -+ addl TMP1, C, C -+ -+ subl N, OFFSET, KK -+#endif -+ -+ sra N, 2, J -+ ble J, $L40 -+ .align 4 -+ -+$L01: -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ s4addl LDC, 0, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C3 -+#ifndef RT -+ s4addl LDC, C, C -+#endif -+ -+ fclr t1 -+ addl C3, LDC, C4 -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L20 -+ .align 4 -+ -+$L11: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(B) -+ fclr c06 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(KK) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(B) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble KK, $L18 -+#else -+ -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c06 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(TMP1) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(BO) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble TMP1, $L18 -+#endif -+ -+ ble L, $L15 -+ .align 5 -+ -+$L12: -+/* 1 */ -+ ADD c11, t1, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif -+ -+ ADD c12, t2, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD c15, t4, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ -+/* 2 */ -+ ADD c01, t1, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD c02, t2, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP -+ -+ ADD c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+/* 3 */ -+ ADD c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD c04, t2, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+/* 4 */ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) -+ -+ ADD c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) -+ -+/* 5 */ -+ ADD c11, t1, c11 -+ unop -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c12, t2, c12 -+ ldi L, -2(L) -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD c15, t4, c15 -+ unop -+ MUL b2, a5, t4 -+ unop -+ -+/* 6 */ -+ ADD c01, t1, c01 -+ unop -+ MUL b5, a6, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ unop -+ MUL b5, a4, t2 -+ unop -+ -+ ADD c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, c05 -+ unop -+ MUL b4, a5, t4 -+ unop -+ -+/* 7 */ -+ ADD c03, t1, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop -+ -+ ADD c04, t2, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) -+ -+/* 8 */ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 -+ -+$L15: -+ ADD c11, t1, c11 -+ MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L17 -+#else -+ blbs TMP1, $L17 -+#endif -+ .align 4 -+ -+ ADD c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD c16, t3, c16 -+ MUL b2, a2, t3 -+ -+ ADD c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD c01, t1, c01 -+ MUL b1, a3, t1 -+ -+ ADD c02, t2, c02 -+ unop -+ MUL b1, a4, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c06, t3, c06 -+ MUL b2, a4, t3 -+ ADD c05, t4, c05 -+ MUL b4, a1, t4 -+ -+ ADD c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c04, t2, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a3, t1 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c11, t1, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L17: -+ ADD c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD c16, t3, c16 -+ MUL b2, a2, t3 -+ -+ ADD c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD c01, t1, c01 -+ MUL b1, a3, t1 -+ -+ ADD c02, t2, c02 -+ MUL b1, a4, t2 -+ ADD c06, t3, c06 -+ MUL b2, a4, t3 -+ -+ ADD c05, t4, c05 -+ MUL b4, a1, t4 -+ ADD c03, t1, c03 -+ MUL b3, a1, t1 -+ -+ ADD c04, t2, c04 -+ MUL b3, a2, t2 -+ ADD c08, t3, c08 -+ MUL b4, a2, t3 -+ -+ ADD c13, t4, c13 -+ MUL b2, a3, t4 -+ ADD c09, t1, c09 -+ MUL b3, a3, t1 -+ -+ ADD c10, t2, c10 -+ MUL b3, a4, t2 -+ ADD c14, t3, c14 -+ MUL b4, a4, t3 -+ -+ ADD c07, t4, c07 -+ ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c11, t1, c11 -+ ADD c12, t2, c12 -+ ADD c16, t3, c16 -+ ADD c15, t4, c15 -+ .align 4 -+ -+$L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+ -+ SUB b1, c02, c02 -+ SUB b2, c06, c06 -+ SUB b3, c10, c10 -+ SUB b4, c14, c14 -+ -+ LD a1, 8 * SIZE(BO) -+ LD a2, 9 * SIZE(BO) -+ LD a3, 10 * SIZE(BO) -+ LD a4, 11 * SIZE(BO) -+ -+ LD b1, 12 * SIZE(BO) -+ LD b2, 13 * SIZE(BO) -+ LD b3, 14 * SIZE(BO) -+ LD b4, 15 * SIZE(BO) -+ -+ SUB a1, c03, c03 -+ SUB a2, c07, c07 -+ SUB a3, c11, c11 -+ SUB a4, c15, c15 -+ -+ SUB b1, c04, c04 -+ SUB b2, c08, c08 -+ SUB b3, c12, c12 -+ SUB b4, c16, c16 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+ -+ SUB b1, c05, c05 -+ SUB b2, c06, c06 -+ SUB b3, c07, c07 -+ SUB b4, c08, c08 -+ -+ LD a1, 8 * SIZE(AO) -+ LD a2, 9 * SIZE(AO) -+ LD a3, 10 * SIZE(AO) -+ LD a4, 11 * SIZE(AO) -+ -+ LD b1, 12 * SIZE(AO) -+ LD b2, 13 * SIZE(AO) -+ LD b3, 14 * SIZE(AO) -+ LD b4, 15 * SIZE(AO) -+ -+ SUB a1, c09, c09 -+ SUB a2, c10, c10 -+ SUB a3, c11, c11 -+ SUB a4, c12, c12 -+ -+ SUB b1, c13, c13 -+ SUB b2, c14, c14 -+ SUB b3, c15, c15 -+ SUB b4, c16, c16 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, c04 -+ MUL a1, c08, c08 -+ MUL a1, c12, c12 -+ MUL a1, c16, c16 -+ -+ MUL a2, c04, t1 -+ MUL a2, c08, t2 -+ MUL a2, c12, t3 -+ MUL a2, c16, t4 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ SUB c11, t3, c11 -+ SUB c15, t4, c15 -+ -+ MUL a3, c04, t1 -+ MUL a3, c08, t2 -+ MUL a3, c12, t3 -+ MUL a3, c16, t4 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 -+ -+ MUL a4, c04, t1 -+ MUL a4, c08, t2 -+ MUL a4, c12, t3 -+ MUL a4, c16, t4 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, c03 -+ MUL b1, c07, c07 -+ MUL b1, c11, c11 -+ MUL b1, c15, c15 -+ -+ MUL b2, c03, t1 -+ MUL b2, c07, t2 -+ MUL b2, c11, t3 -+ MUL b2, c15, t4 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 -+ -+ MUL b3, c03, t1 -+ MUL b3, c07, t2 -+ MUL b3, c11, t3 -+ MUL b3, c15, t4 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ MUL a1, c10, c10 -+ MUL a1, c14, c14 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ MUL a2, c10, t3 -+ MUL a2, c14, t4 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+ MUL a3, c09, c09 -+ MUL a3, c13, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ MUL a1, c09, c09 -+ MUL a1, c13, c13 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ MUL a2, c09, t3 -+ MUL a2, c13, t4 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 -+ -+ MUL a3, c01, t1 -+ MUL a3, c05, t2 -+ MUL a3, c09, t3 -+ MUL a3, c13, t4 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ SUB c11, t3, c11 -+ SUB c15, t4, c15 -+ -+ MUL a4, c01, t1 -+ MUL a4, c05, t2 -+ MUL a4, c09, t3 -+ MUL a4, c13, t4 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ SUB c12, t3, c12 -+ SUB c16, t4, c16 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, c02 -+ MUL b1, c06, c06 -+ MUL b1, c10, c10 -+ MUL b1, c14, c14 -+ -+ MUL b2, c02, t1 -+ MUL b2, c06, t2 -+ MUL b2, c10, t3 -+ MUL b2, c14, t4 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ SUB c11, t3, c11 -+ SUB c15, t4, c15 -+ -+ MUL b3, c02, t1 -+ MUL b3, c06, t2 -+ MUL b3, c10, t3 -+ MUL b3, c14, t4 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ SUB c12, t3, c12 -+ SUB c16, t4, c16 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, c03 -+ MUL a1, c07, c07 -+ MUL a1, c11, c11 -+ MUL a1, c15, c15 -+ -+ MUL a2, c03, t1 -+ MUL a2, c07, t2 -+ MUL a2, c11, t3 -+ MUL a2, c15, t4 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ SUB c12, t3, c12 -+ SUB c16, t4, c16 -+ -+ MUL a3, c04, c04 -+ MUL a3, c08, c08 -+ MUL a3, c12, c12 -+ MUL a3, c16, c16 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ MUL a2, c03, t3 -+ MUL a2, c04, t4 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 -+ -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c03, t3 -+ MUL a3, c04, t4 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 -+ -+ MUL a4, c01, t1 -+ MUL a4, c02, t2 -+ MUL a4, c03, t3 -+ MUL a4, c04, t4 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ SUB c15, t3, c15 -+ SUB c16, t4, c16 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, c05 -+ MUL b1, c06, c06 -+ MUL b1, c07, c07 -+ MUL b1, c08, c08 -+ -+ MUL b2, c05, t1 -+ MUL b2, c06, t2 -+ MUL b2, c07, t3 -+ MUL b2, c08, t4 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 -+ -+ MUL b3, c05, t1 -+ MUL b3, c06, t2 -+ MUL b3, c07, t3 -+ MUL b3, c08, t4 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ SUB c15, t3, c15 -+ SUB c16, t4, c16 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 -+ -+ MUL a2, c09, t1 -+ MUL a2, c10, t2 -+ MUL a2, c11, t3 -+ MUL a2, c12, t4 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ SUB c15, t3, c15 -+ SUB c16, t4, c16 -+ -+ MUL a3, c13, c13 -+ MUL a3, c14, c14 -+ MUL a3, c15, c15 -+ MUL a3, c16, c16 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, c13 -+ MUL a1, c14, c14 -+ MUL a1, c15, c15 -+ MUL a1, c16, c16 -+ -+ MUL a2, c13, t1 -+ MUL a2, c14, t2 -+ MUL a2, c15, t3 -+ MUL a2, c16, t4 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 -+ -+ MUL a3, c13, t1 -+ MUL a3, c14, t2 -+ MUL a3, c15, t3 -+ MUL a3, c16, t4 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 -+ -+ MUL a4, c13, t1 -+ MUL a4, c14, t2 -+ MUL a4, c15, t3 -+ MUL a4, c16, t4 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, c09 -+ MUL b1, c10, c10 -+ MUL b1, c11, c11 -+ MUL b1, c12, c12 -+ -+ MUL b2, c09, t1 -+ MUL b2, c10, t2 -+ MUL b2, c11, t3 -+ MUL b2, c12, t4 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 -+ -+ MUL b3, c09, t1 -+ MUL b3, c10, t2 -+ MUL b3, c11, t3 -+ MUL b3, c12, t4 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ MUL a1, c07, c07 -+ MUL a1, c08, c08 -+ -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ MUL a2, c07, t3 -+ MUL a2, c08, t4 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 -+ -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+ MUL a3, c03, c03 -+ MUL a3, c04, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+ -+ ST c03, 8 * SIZE(BO) -+ ST c07, 9 * SIZE(BO) -+ ST c11, 10 * SIZE(BO) -+ ST c15, 11 * SIZE(BO) -+ -+ ST c04, 12 * SIZE(BO) -+ ST c08, 13 * SIZE(BO) -+ ST c12, 14 * SIZE(BO) -+ ST c16, 15 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+ -+ ST c09, 8 * SIZE(AO) -+ ST c10, 9 * SIZE(AO) -+ ST c11, 10 * SIZE(AO) -+ ST c12, 11 * SIZE(AO) -+ -+ ST c13, 12 * SIZE(AO) -+ ST c14, 13 * SIZE(AO) -+ ST c15, 14 * SIZE(AO) -+ ST c16, 15 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+ ldi C3, -4 * SIZE(C3) -+ ldi C4, -4 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c11, 2 * SIZE(C3) -+ ST c12, 3 * SIZE(C3) -+ -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ ST c15, 2 * SIZE(C4) -+ ST c16, 3 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+ ldi C3, 4 * SIZE(C3) -+ ldi C4, 4 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L11 -+ .align 4 -+ -+$L20: -+ and M, 2, I -+ ble I, $L30 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c01 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(B) -+ fclr c02 -+ fclr c06 -+ ble KK, $L28 -+ -+ ble L, $L25 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c01 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(BO) -+ fclr c02 -+ fclr c06 -+ ble TMP1, $L28 -+ -+ ble L, $L25 -+#endif -+ .align 4 -+ -+$L22: -+ ADD c09, t1, c09 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ -+ ADD c09, t1, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD c10, t2, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c14, t4, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 -+ -+$L25: -+ ADD c09, t1, c09 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 -+#else -+ blbs TMP1, $L27 -+#endif -+ -+ ADD c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ unop -+ -+ ADD c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c09, t1, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L27: -+ ADD c10, t2, c10 -+ MUL a2, b1, t2 -+ ADD c13, t3, c13 -+ MUL a1, b2, t3 -+ -+ ADD c14, t4, c14 -+ MUL a2, b2, t4 -+ ADD c01, t1, c01 -+ MUL a1, b3, t1 -+ -+ ADD c02, t2, c02 -+ MUL a2, b3, t2 -+ ADD c05, t3, c05 -+ MUL a1, b4, t3 -+ -+ ADD c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c09, t1, c09 -+ ADD c10, t2, c10 -+ ADD c13, t3, c13 -+ ADD c14, t4, c14 -+ .align 4 -+ -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+ -+ SUB b1, c02, c02 -+ SUB b2, c06, c06 -+ SUB b3, c10, c10 -+ SUB b4, c14, c14 -+ -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c05, c05 -+ SUB a4, c06, c06 -+ -+ SUB b1, c09, c09 -+ SUB b2, c10, c10 -+ SUB b3, c13, c13 -+ SUB b4, c14, c14 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ MUL a1, c10, c10 -+ MUL a1, c14, c14 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ MUL a2, c10, t3 -+ MUL a2, c14, t4 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+ MUL a3, c09, c09 -+ MUL a3, c13, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ MUL a1, c09, c09 -+ MUL a1, c13, c13 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ MUL a2, c09, t3 -+ MUL a2, c13, t4 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 -+ -+ MUL a3, c02, c02 -+ MUL a3, c06, c06 -+ MUL a3, c10, c10 -+ MUL a3, c14, c14 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ -+ MUL a4, c01, t1 -+ MUL a4, c02, t2 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, c05 -+ MUL b1, c06, c06 -+ -+ MUL b2, c05, t1 -+ MUL b2, c06, t2 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ -+ MUL b3, c05, t1 -+ MUL b3, c06, t2 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ -+ MUL a2, c09, t1 -+ MUL a2, c10, t2 -+ -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ -+ MUL a3, c13, c13 -+ MUL a3, c14, c14 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, c13 -+ MUL a1, c14, c14 -+ -+ MUL a2, c13, t1 -+ MUL a2, c14, t2 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ -+ MUL a3, c13, t1 -+ MUL a3, c14, t2 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ -+ MUL a4, c13, t1 -+ MUL a4, c14, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, c09 -+ MUL b1, c10, c10 -+ -+ MUL b2, c09, t1 -+ MUL b2, c10, t2 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ -+ MUL b3, c09, t1 -+ MUL b3, c10, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+ -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c13, 6 * SIZE(AO) -+ ST c14, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+ ldi C3, -2 * SIZE(C3) -+ ldi C4, -2 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+ ldi C3, 2 * SIZE(C3) -+ ldi C4, 2 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L30: -+ and M, 1, I -+ ble I, $L39 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c09 -+ LD b4, 3 * SIZE(B) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(B) -+ ble KK, $L38 -+ -+ ble L, $L35 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c09 -+ LD b4, 3 * SIZE(BO) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(BO) -+ ble TMP1, $L38 -+ -+ ble L, $L35 -+#endif -+ .align 4 -+ -+$L32: -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t2, c05 -+ ldi AO, 2 * SIZE(AO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, c09 -+ LD b5, 3 * SIZE(BO) -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, c13 -+ MUL a1, b4, t4 -+ LD a1, -1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ MUL a2, b1, t1 -+ LD b1, 4 * SIZE(BO) -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c05, t2, c05 -+ MUL a2, b2, t2 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c09, t3, c09 -+ LD b4, -1 * SIZE(BO) -+ MUL a2, b3, t3 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c13, t4, c13 -+ MUL a2, b5, t4 -+ LD a2, 0 * SIZE(AO) -+ bgt L, $L32 -+ .align 4 -+ -+$L35: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L37 -+#else -+ blbs TMP1, $L37 -+#endif -+ .align 4 -+ -+ ADD c05, t2, c05 -+ LD b1, 0 * SIZE(BO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, c09 -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, c13 -+ MUL a1, b4, t4 -+ LD a1, 0 * SIZE(AO) -+ ldi AO, 1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L37: -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ ADD c09, t3, c09 -+ MUL a1, b3, t3 -+ -+ ADD c13, t4, c13 -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b4, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ADD c05, t2, c05 -+ ADD c09, t3, c09 -+ ADD c13, t4, c13 -+ -+$L38: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ MUL a1, c09, c09 -+ MUL a1, c13, c13 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c05, t1, c05 -+ MUL a3, c01, t1 -+ SUB c09, t1, c09 -+ MUL a4, c01, t1 -+ SUB c13, t1, c13 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, c05 -+ MUL b2, c05, t1 -+ SUB c09, t1, c09 -+ MUL b3, c05, t1 -+ SUB c13, t1, c13 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, c09 -+ MUL a2, c09, t1 -+ SUB c13, t1, c13 -+ MUL a3, c13, c13 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, c13 -+ MUL a2, c13, t1 -+ SUB c09, t1, c09 -+ MUL a3, c13, t1 -+ SUB c05, t1, c05 -+ MUL a4, c13, t1 -+ SUB c01, t1, c01 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, c09 -+ MUL b2, c09, t1 -+ SUB c05, t1, c05 -+ MUL b3, c09, t1 -+ SUB c01, t1, c01 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a2, c05, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c13, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+ ldi C3, -1 * SIZE(C3) -+ ldi C4, -1 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c09, 0 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L39: -+#ifdef LN -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 4, KK -+#endif -+ -+#ifdef RT -+ subl KK, 4, KK -+#endif -+ ldi J, -1(J) -+ bgt J, $L01 -+ .align 4 -+ -+$L40: -+ and N, 2, J -+ ble J, $L80 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ addl LDC, LDC, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ fclr t1 -+#ifndef RT -+ addl C2, LDC, C -+#endif -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L60 -+ .align 4 -+ -+$L51: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ ldi BO, 2 * SIZE(B) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble KK, $L58 -+ -+ ble L, $L55 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ ldi BO, 2 * SIZE(BO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble TMP1, $L58 -+ -+ ble L, $L55 -+#endif -+ .align 4 -+ -+$L52: -+ ADD c05, t1, c05 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c06, t2, c06 -+ ldi L, -2(L) -+ MUL a2, b1, t2 -+ unop -+ -+ ADD c07, t3, c07 -+ unop -+ MUL a3, b1, t3 -+ unop -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ unop -+ MUL a4, b2, t4 -+ LD a5, 3 * SIZE(AO) -+ -+ ADD c05, t1, c05 -+ unop -+ MUL a1, b3, t1 -+ LD b2, -1 * SIZE(BO) -+ -+ ADD c06, t2, c06 -+ unop -+ MUL a2, b3, t2 -+ unop -+ -+ ADD c07, t3, c07 -+ unop -+ MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b4, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L52 -+ .align 4 -+ -+$L55: -+ ADD c05, t1, c05 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L57 -+#else -+ blbs TMP1, $L57 -+#endif -+ .align 4 -+ -+ ADD c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD c07, t3, c07 -+ MUL a3, b1, t3 -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c05, t1, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L57: -+ ADD c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD c07, t3, c07 -+ MUL a3, b1, t3 -+ -+ ADD c08, t4, c08 -+ MUL a4, b1, t4 -+ ADD c01, t1, c01 -+ MUL a1, b2, t1 -+ -+ ADD c02, t2, c02 -+ MUL a2, b2, t2 -+ ADD c03, t3, c03 -+ MUL a3, b2, t3 -+ -+ ADD c04, t4, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c05, t1, c05 -+ ADD c06, t2, c06 -+ ADD c07, t3, c07 -+ ADD c08, t4, c08 -+ .align 4 -+ -+$L58: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c02, c02 -+ SUB a4, c06, c06 -+ -+ SUB b1, c03, c03 -+ SUB b2, c07, c07 -+ SUB b3, c04, c04 -+ SUB b4, c08, c08 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+ -+ SUB b1, c05, c05 -+ SUB b2, c06, c06 -+ SUB b3, c07, c07 -+ SUB b4, c08, c08 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, c04 -+ MUL a1, c08, c08 -+ -+ MUL a2, c04, t1 -+ MUL a2, c08, t2 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ -+ MUL a3, c04, t1 -+ MUL a3, c08, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL a4, c04, t1 -+ MUL a4, c08, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, c03 -+ MUL b1, c07, c07 -+ -+ MUL b2, c03, t1 -+ MUL b2, c07, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL b3, c03, t1 -+ MUL b3, c07, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL a3, c01, t1 -+ MUL a3, c05, t2 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ -+ MUL a4, c01, t1 -+ MUL a4, c05, t2 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, c02 -+ MUL b1, c06, c06 -+ -+ MUL b2, c02, t1 -+ MUL b2, c06, t2 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ -+ MUL b3, c02, t1 -+ MUL b3, c06, t2 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, c03 -+ MUL a1, c07, c07 -+ -+ MUL a2, c03, t1 -+ MUL a2, c07, t2 -+ -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ -+ MUL a3, c04, c04 -+ MUL a3, c08, c08 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ MUL a2, c03, t3 -+ MUL a2, c04, t4 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 -+ -+ MUL a3, c05, c05 -+ MUL a3, c06, c06 -+ MUL a3, c07, c07 -+ MUL a3, c08, c08 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ MUL a1, c07, c07 -+ MUL a1, c08, c08 -+ -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ MUL a2, c07, t3 -+ MUL a2, c08, t4 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 -+ -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+ MUL a3, c03, c03 -+ MUL a3, c04, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+ -+ ST c03, 4 * SIZE(BO) -+ ST c07, 5 * SIZE(BO) -+ ST c04, 6 * SIZE(BO) -+ ST c08, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L51 -+ .align 4 -+ -+$L60: -+ and M, 2, I -+ ble I, $L70 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L68 -+ -+ ble L, $L65 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L68 -+ -+ ble L, $L65 -+#endif -+ .align 4 -+ -+$L62: -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c02, t2, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ ldi L, -2(L) -+ MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ unop -+ MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ unop -+ -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L62 -+ .align 4 -+ -+$L65: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L67 -+#else -+ blbs TMP1, $L67 -+#endif -+ .align 4 -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t3, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c01, t1, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi AO, 2 * SIZE(AO) -+ .align 4 -+ -+$L67: -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ ADD c05, t3, c05 -+ MUL a1, b2, t3 -+ -+ ADD c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c05, t3, c05 -+ ADD c06, t4, c06 -+ .align 4 -+ -+$L68: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c02, c02 -+ SUB a4, c06, c06 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c05, c05 -+ SUB a4, c06, c06 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL a3, c02, c02 -+ MUL a3, c06, c06 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ -+ MUL a3, c05, c05 -+ MUL a3, c06, c06 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L70: -+ and M, 1, I -+ ble I, $L79 -+ -+#if defined(LT) || defined(RN) -+ -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c02 -+ LD b2, 1 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ LD b3, 2 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L78 -+ -+ ble L, $L75 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c02 -+ LD b2, 1 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L78 -+ -+ ble L, $L75 -+#endif -+ .align 4 -+ -+$L72: -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ LD a1, 1 * SIZE(AO) -+ LD b2, 3 * SIZE(BO) -+ -+ ADD c02, t3, c02 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b3, t3 -+ LD b3, 4 * SIZE(BO) -+ -+ ADD c06, t4, c06 -+ MUL a2, b4, t4 -+ LD a2, 0 * SIZE(AO) -+ LD b4, 5 * SIZE(BO) -+ -+ ldi BO, 4 * SIZE(BO) -+ unop -+ unop -+ bgt L, $L72 -+ .align 4 -+ -+$L75: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L77 -+#else -+ blbs TMP1, $L77 -+#endif -+ .align 4 -+ -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ LD a1, 0 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L77: -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ ADD c02, t3, c02 -+ ADD c06, t4, c06 -+ -+ ADD c01, c02, c01 -+ ldi AO, 1 * SIZE(AO) -+ ADD c05, c06, c05 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ADD c05, t2, c05 -+ -+ .align 4 -+ -+$L78: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c05, t1, c05 -+ MUL a3, c05, c05 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a2, c05, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L79: -+#ifdef LN -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 2, KK -+#endif -+ -+#ifdef RT -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L80: -+ and N, 1, J -+ ble J, $L999 -+ -+#ifdef RT -+ sll K, BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ subl C, LDC, C -+#endif -+ -+ mov C, C1 -+#ifndef RT -+ addl C, LDC, C -+#endif -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ ble I, $L100 -+ .align 4 -+ -+$L91: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L95 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L95 -+#endif -+ .align 5 -+ -+$L92: -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi L, -1(L) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 8 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 9 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 10 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 11 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ LD a1, 12 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD a2, 13 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b3, t3 -+ LD a3, 14 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b3, t4 -+ LD a5, 15 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ MUL a1, b4, t1 -+ LD a1, 16 * SIZE(AO) -+ ldi AO, 16 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b4, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L92 -+ .align 4 -+ -+$L95: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ unop -+ ble L, $L98 -+ .align 4 -+ -+$L96: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 1 * SIZE(BO) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ldi AO, 4 * SIZE(AO) -+ bgt L, $L96 -+ .align 4 -+ -+$L98: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, c04 -+ MUL a2, c04, t1 -+ SUB c03, t1, c03 -+ MUL a3, c04, t1 -+ SUB c02, t1, c02 -+ MUL a4, c04, t1 -+ SUB c01, t1, c01 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, c03 -+ MUL b2, c03, t1 -+ SUB c02, t1, c02 -+ MUL b3, c03, t1 -+ SUB c01, t1, c01 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a2, c02, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c02, t1, c02 -+ MUL a3, c01, t1 -+ SUB c03, t1, c03 -+ MUL a4, c01, t1 -+ SUB c04, t1, c04 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, c02 -+ MUL b2, c02, t1 -+ SUB c03, t1, c03 -+ MUL b3, c02, t1 -+ SUB c04, t1, c04 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, c03 -+ MUL a2, c03, t1 -+ SUB c04, t1, c04 -+ MUL a3, c04, c04 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ bgt I, $L91 -+ .align 4 -+ -+$L100: -+ and M, 2, I -+ ble I, $L110 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L105 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ ble L, $L105 -+#endif -+ .align 5 -+ -+$L102: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c03, t3, c03 -+ ldi BO, 4 * SIZE(BO) -+ MUL a3, b2, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a5, 7 * SIZE(AO) -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ MUL a1, b3, t1 -+ LD a1, 8 * SIZE(AO) -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L102 -+ .align 4 -+ -+$L105: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L108 -+ .align 4 -+ -+$L106: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ LD a2, 3 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi AO, 2 * SIZE(AO) -+ unop -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L106 -+ .align 4 -+ -+$L108: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+ ADD c01, c03, c01 -+ ADD c02, c04, c02 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a2, c02, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c02, t1, c02 -+ MUL a3, c02, c02 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L110: -+ and M, 1, I -+ ble I, $L119 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ unop -+ ble L, $L115 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L115 -+#endif -+ .align 4 -+ -+$L112: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b2, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c03, t3, c03 -+ MUL a3, b3, t3 -+ LD a3, 6 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b4, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b4, 7 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 4 * SIZE(AO) -+ ldi BO, 4 * SIZE(BO) -+ bgt L, $L112 -+ .align 4 -+ -+$L115: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L118 -+ .align 4 -+ -+$L116: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+ LD a1, 1 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 1 * SIZE(AO) -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L116 -+ .align 4 -+ -+$L118: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+ ADD c01, c02, c01 -+ ADD c03, c04, c03 -+ ADD c01, c03, c01 -+ -+#if defined(LN) || defined(RT) -+ subl KK, 1, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+#else -+ LD a1, 0 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 1 * SIZE(C1) -+#endif -+ -+#ifdef RT -+ SXADDQ K, AORIG, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L119: -+#ifdef LN -+ SXADDQ K, B, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 1, KK -+#endif -+ -+#ifdef RT -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S b/kernel/sw_64/trsm_kernel_4x4_RT.S -new file mode 100644 -index 0000000..b9a1975 ---- /dev/null -+++ b/kernel/sw_64/trsm_kernel_4x4_RT.S -@@ -0,0 +1,5148 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+#if !defined(EV4) && !defined(EV5) && !defined(SW6) -+#error "Architecture is not specified." -+#endif -+ -+#ifdef SW6 -+#define PREFETCHSIZE 56 -+#define UNOP unop -+#endif -+ -+#ifdef EV5 -+#define PREFETCHSIZE 56 -+#define UNOP -+#endif -+ -+#ifdef EV4 -+#define UNOP -+#endif -+ -+#define STACKSIZE 88 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $20 -+#define B $21 -+#define C $22 -+#define LDC $23 -+ -+#define C1 $19 -+#define C2 $24 -+#define C3 $25 -+#define C4 $27 -+ -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define alpha $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 -+ -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 -+ -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 -+ -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 -+ -+#define tmp $9 -+ -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define AORIG $3 -+#define OFFSET $4 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl C, 0 + STACKSIZE($sp) -+ ldl LDC, 8 + STACKSIZE($sp) -+ ldl OFFSET, 16 + STACKSIZE($sp) -+ -+ SXADDQ LDC, 0, LDC -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ stl $9, 64($sp) -+ -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 -+ -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 -+ -+#ifdef LN -+ mulq M, K, TMP1 -+ SXADDQ TMP1, A, A -+ SXADDQ M, C, C -+#endif -+ -+#ifdef RN -+ negq OFFSET, KK -+#endif -+ -+#ifdef RT -+ mull N, K, TMP1 -+ SXADDQ TMP1, B, B -+ -+ mull N, LDC, TMP1 -+ addl TMP1, C, C -+ -+ subl N, OFFSET, KK -+#endif -+ -+ and N, 1, J -+ ble J, $L40 -+ -+#ifdef RT -+ sll K, BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ subl C, LDC, C -+#endif -+ -+ mov C, C1 -+#ifndef RT -+ addl C, LDC, C -+#endif -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ ble I, $L100 -+ .align 4 -+ -+$L91: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L95 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L95 -+#endif -+ .align 5 -+ -+$L92: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi L, -1(L) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ LD a1, 8 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 9 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 10 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a4, 11 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ LD a1, 12 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ LD a2, 13 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b3, b5 -+ fmov b5, t3 -+ LD a3, 14 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b3, b5 -+ fmov b5, t4 -+ LD a5, 15 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b4, b5 -+ fmov b5, t1 -+ LD a1, 16 * SIZE(AO) -+ ldi AO, 16 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b4, b5 -+ fmov b5, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, b5 -+ fmov b5, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L92 -+ .align 4 -+ -+$L95: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ unop -+ ble L, $L98 -+ .align 4 -+ -+$L96: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 1 * SIZE(BO) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ldi AO, 4 * SIZE(AO) -+ bgt L, $L96 -+ .align 4 -+ -+$L98: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a2, c04, b5 -+ fmov b5, t1 -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ MUL a3, c04, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL a4, c04, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, b5 -+ fmov b5, c03 -+ MUL b2, c03, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL b3, c03, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, b5 -+ fmov b5, c02 -+ MUL b2, c02, b5 -+ fmov b5, t1 -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ MUL b3, c02, b5 -+ fmov b5, t1 -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a2, c03, b5 -+ fmov b5, t1 -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ MUL a3, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ bgt I, $L91 -+ .align 4 -+ -+$L100: -+ and M, 2, I -+ ble I, $L110 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L105 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ ble L, $L105 -+#endif -+ .align 5 -+ -+$L102: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ldi BO, 4 * SIZE(BO) -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a5, 7 * SIZE(AO) -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ LD a1, 8 * SIZE(AO) -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ LD b3, 2 * SIZE(BO) -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, b5 -+ fmov b5, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L102 -+ .align 4 -+ -+$L105: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L108 -+ .align 4 -+ -+$L106: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -1(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD a2, 3 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi AO, 2 * SIZE(AO) -+ unop -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L106 -+ .align 4 -+ -+$L108: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+ ADD c01, c03, b5 -+ fmov b5, c01 -+ ADD c02, c04, b5 -+ fmov b5, c02 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L110: -+ and M, 1, I -+ ble I, $L119 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ unop -+ ble L, $L115 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L115 -+#endif -+ .align 4 -+ -+$L112: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 4 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b3, b5 -+ fmov b5, t3 -+ LD a3, 6 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b4, b5 -+ fmov b5, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b4, 7 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 4 * SIZE(AO) -+ ldi BO, 4 * SIZE(BO) -+ bgt L, $L112 -+ .align 4 -+ -+$L115: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L118 -+ .align 4 -+ -+$L116: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD a1, 1 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 1 * SIZE(AO) -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L116 -+ .align 4 -+ -+$L118: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ -+ ADD c01, c02, b5 -+ fmov b5, c01 -+ ADD c03, c04, b5 -+ fmov b5, c03 -+ ADD c01, c03, b5 -+ fmov b5, c01 -+ -+#if defined(LN) || defined(RT) -+ subl KK, 1, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+#else -+ LD a1, 0 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 1 * SIZE(C1) -+#endif -+ -+#ifdef RT -+ SXADDQ K, AORIG, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L119: -+#ifdef LN -+ SXADDQ K, B, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 1, KK -+#endif -+ -+#ifdef RT -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L40: -+ and N, 2, J -+ ble J, $L80 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ addl LDC, LDC, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ fclr t1 -+#ifndef RT -+ addl C2, LDC, C -+#endif -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L60 -+ .align 4 -+ -+$L51: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ ldi BO, 2 * SIZE(B) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble KK, $L58 -+ -+ ble L, $L55 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ ldi BO, 2 * SIZE(BO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ble TMP1, $L58 -+ -+ ble L, $L55 -+#endif -+ .align 4 -+ -+$L52: -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ unop -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ ldi L, -2(L) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ unop -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ unop -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ unop -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a5, 3 * SIZE(AO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ LD b2, -1 * SIZE(BO) -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ unop -+ -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b3, b5 -+ fmov b5, t3 -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a5, b3, b5 -+ fmov b5, t4 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b4, b5 -+ fmov b5, t1 -+ LD a1, -4 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b4, b5 -+ fmov b5, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, b5 -+ fmov b5, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L52 -+ .align 4 -+ -+$L55: -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L57 -+#else -+ blbs TMP1, $L57 -+#endif -+ .align 4 -+ -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L57: -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, b5 -+ fmov b5, t3 -+ -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ MUL a4, b1, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b2, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, b5 -+ fmov b5, t2 -+ ADD c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b2, b5 -+ fmov b5, t3 -+ -+ ADD c04, t4, b5 -+ fmov b5, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, b5 -+ fmov b5, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c05, t1, b5 -+ fmov b5, c05 -+ ADD c06, t2, b5 -+ fmov b5, c06 -+ ADD c07, t3, b5 -+ fmov b5, c07 -+ ADD c08, t4, b5 -+ fmov b5, c08 -+ .align 4 -+ -+$L58: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c02, b5 -+ fmov b5, c02 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+ -+ SUB b1, c03, b5 -+ fmov b5, c03 -+ SUB b2, c07, b5 -+ fmov b5, c07 -+ SUB b3, c04, b5 -+ fmov b5, c04 -+ SUB b4, c08, b5 -+ fmov b5, c08 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+ -+ SUB b1, c05, b5 -+ fmov b5, c05 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c07, b5 -+ fmov b5, c07 -+ SUB b4, c08, b5 -+ fmov b5, c08 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ -+ MUL a2, c04, b5 -+ fmov b5, t1 -+ MUL a2, c08, b5 -+ fmov b5, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ -+ MUL a3, c04, b5 -+ fmov b5, t1 -+ MUL a3, c08, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a4, c04, b5 -+ fmov b5, t1 -+ MUL a4, c08, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, b5 -+ fmov b5, c03 -+ MUL b1, c07, b5 -+ fmov b5, c07 -+ -+ MUL b2, c03, b5 -+ fmov b5, t1 -+ MUL b2, c07, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL b3, c03, b5 -+ fmov b5, t1 -+ MUL b3, c07, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c05, b5 -+ fmov b5, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c05, b5 -+ fmov b5, t2 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, b5 -+ fmov b5, c02 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ -+ MUL b2, c02, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ -+ MUL b3, c02, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ -+ MUL a2, c03, b5 -+ fmov b5, t1 -+ MUL a2, c07, b5 -+ fmov b5, t2 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ -+ MUL a3, c04, b5 -+ fmov b5, c04 -+ MUL a3, c08, b5 -+ fmov b5, c08 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ MUL a2, c03, b5 -+ fmov b5, t3 -+ MUL a2, c04, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+ MUL a3, c07, b5 -+ fmov b5, c07 -+ MUL a3, c08, b5 -+ fmov b5, c08 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c07, b5 -+ fmov b5, t3 -+ MUL a2, c08, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c03, b5 -+ fmov b5, c03 -+ MUL a3, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+ -+ ST c03, 4 * SIZE(BO) -+ ST c07, 5 * SIZE(BO) -+ ST c04, 6 * SIZE(BO) -+ ST c08, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L51 -+ .align 4 -+ -+$L60: -+ and M, 2, I -+ ble I, $L70 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L68 -+ -+ ble L, $L65 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L68 -+ -+ ble L, $L65 -+#endif -+ .align 4 -+ -+$L62: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ldi L, -2(L) -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a4, b3, b5 -+ fmov b5, t2 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a3, b4, b5 -+ fmov b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a4, b4, b5 -+ fmov b5, t4 -+ LD b4, 1 * SIZE(BO) -+ unop -+ -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L62 -+ .align 4 -+ -+$L65: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L67 -+#else -+ blbs TMP1, $L67 -+#endif -+ .align 4 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi AO, 2 * SIZE(AO) -+ .align 4 -+ -+$L67: -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ .align 4 -+ -+$L68: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c02, b5 -+ fmov b5, c02 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c05, b5 -+ fmov b5, c05 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L70: -+ and M, 1, I -+ ble I, $L79 -+ -+#if defined(LT) || defined(RN) -+ -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c02 -+ LD b2, 1 * SIZE(B) -+ fclr c06 -+ -+ ldi L, -2(KK) -+ -+ LD b3, 2 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L78 -+ -+ ble L, $L75 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c02 -+ LD b2, 1 * SIZE(BO) -+ fclr c06 -+ -+ ldi L, -2(TMP1) -+ -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L78 -+ -+ ble L, $L75 -+#endif -+ .align 4 -+ -+$L72: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ LD a1, 1 * SIZE(AO) -+ LD b2, 3 * SIZE(BO) -+ -+ ADD c02, t3, b5 -+ fmov b5, c02 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b3, b5 -+ fmov b5, t3 -+ LD b3, 4 * SIZE(BO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, b5 -+ fmov b5, t4 -+ LD a2, 0 * SIZE(AO) -+ LD b4, 5 * SIZE(BO) -+ -+ ldi BO, 4 * SIZE(BO) -+ unop -+ unop -+ bgt L, $L72 -+ .align 4 -+ -+$L75: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L77 -+#else -+ blbs TMP1, $L77 -+#endif -+ .align 4 -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ LD a1, 0 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L77: -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ ADD c02, t3, b5 -+ fmov b5, c02 -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ -+ ADD c01, c02, b5 -+ fmov b5, c01 -+ ldi AO, 1 * SIZE(AO) -+ ADD c05, c06, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ -+ .align 4 -+ -+$L78: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L79: -+#ifdef LN -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 2, KK -+#endif -+ -+#ifdef RT -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L80: -+ sra N, 2, J -+ ble J, $L999 -+ .align 4 -+ -+$L01: -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ s4addl LDC, 0, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C3 -+#ifndef RT -+ s4addl LDC, C, C -+#endif -+ -+ fclr t1 -+ addl C3, LDC, C4 -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L20 -+ .align 4 -+ -+$L11: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(B) -+ fclr c06 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(KK) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ flds $f31, 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(B) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble KK, $L18 -+#else -+ -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c06 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(TMP1) -+ fclr c04 -+ -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(BO) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble TMP1, $L18 -+#endif -+ -+ ble L, $L15 -+ .align 5 -+ -+$L12: -+/* 1 */ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, b5 -+ fmov b5, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ FIMOVD b5, tmp -+ -+/* 2 */ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+/* 3 */ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+/* 4 */ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) -+ -+/* 5 */ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ ldi L, -2(L) -+ IFMOVD tmp, b5 -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a5, t4 -+ unop -+ -+/* 6 */ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a6, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a4, t2 -+ unop -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a5, t4 -+ unop -+ -+/* 7 */ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) -+ -+/* 8 */ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 -+ -+$L15: -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ MUL b1, a1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L17 -+#else -+ blbs TMP1, $L17 -+#endif -+ .align 4 -+ -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, b5 -+ fmov b5, t2 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, b5 -+ fmov b5, t3 -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL b1, a4, b5 -+ fmov b5, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, b5 -+ fmov b5, t3 -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, b5 -+ fmov b5, t4 -+ -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, b5 -+ fmov b5, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, b5 -+ fmov b5, t2 -+ unop -+ -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, b5 -+ fmov b5, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, b5 -+ fmov b5, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, b5 -+ fmov b5, t1 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, b5 -+ fmov b5, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, b5 -+ fmov b5, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, b5 -+ fmov b5, t4 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L17: -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, b5 -+ fmov b5, t2 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, b5 -+ fmov b5, t3 -+ -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL b1, a4, b5 -+ fmov b5, t2 -+ ADD c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, b5 -+ fmov b5, t3 -+ -+ ADD c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, b5 -+ fmov b5, t4 -+ ADD c03, t1, b5 -+ fmov b5, c03 -+ MUL b3, a1, b5 -+ fmov b5, t1 -+ -+ ADD c04, t2, b5 -+ fmov b5, c04 -+ MUL b3, a2, b5 -+ fmov b5, t2 -+ ADD c08, t3, b5 -+ fmov b5, c08 -+ MUL b4, a2, b5 -+ fmov b5, t3 -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL b2, a3, b5 -+ fmov b5, t4 -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ MUL b3, a3, b5 -+ fmov b5, t1 -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ MUL b3, a4, b5 -+ fmov b5, t2 -+ ADD c14, t3, b5 -+ fmov b5, c14 -+ MUL b4, a4, b5 -+ fmov b5, t3 -+ -+ ADD c07, t4, b5 -+ fmov b5, c07 -+ ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, b5 -+ fmov b5, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c11, t1, b5 -+ fmov b5, c11 -+ ADD c12, t2, b5 -+ fmov b5, c12 -+ ADD c16, t3, b5 -+ fmov b5, c16 -+ ADD c15, t4, b5 -+ fmov b5, c15 -+ .align 4 -+ -+$L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+ -+ SUB b1, c02, b5 -+ fmov b5, c02 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c10, b5 -+ fmov b5, c10 -+ SUB b4, c14, b5 -+ fmov b5, c14 -+ -+ LD a1, 8 * SIZE(BO) -+ LD a2, 9 * SIZE(BO) -+ LD a3, 10 * SIZE(BO) -+ LD a4, 11 * SIZE(BO) -+ -+ LD b1, 12 * SIZE(BO) -+ LD b2, 13 * SIZE(BO) -+ LD b3, 14 * SIZE(BO) -+ LD b4, 15 * SIZE(BO) -+ -+ SUB a1, c03, b5 -+ fmov b5, c03 -+ SUB a2, c07, b5 -+ fmov b5, c07 -+ SUB a3, c11, b5 -+ fmov b5, c11 -+ SUB a4, c15, b5 -+ fmov b5, c15 -+ -+ SUB b1, c04, b5 -+ fmov b5, c04 -+ SUB b2, c08, b5 -+ fmov b5, c08 -+ SUB b3, c12, b5 -+ fmov b5, c12 -+ SUB b4, c16, b5 -+ fmov b5, c16 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+ -+ SUB b1, c05, b5 -+ fmov b5, c05 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c07, b5 -+ fmov b5, c07 -+ SUB b4, c08, b5 -+ fmov b5, c08 -+ -+ LD a1, 8 * SIZE(AO) -+ LD a2, 9 * SIZE(AO) -+ LD a3, 10 * SIZE(AO) -+ LD a4, 11 * SIZE(AO) -+ -+ LD b1, 12 * SIZE(AO) -+ LD b2, 13 * SIZE(AO) -+ LD b3, 14 * SIZE(AO) -+ LD b4, 15 * SIZE(AO) -+ -+ SUB a1, c09, b5 -+ fmov b5, c09 -+ SUB a2, c10, b5 -+ fmov b5, c10 -+ SUB a3, c11, b5 -+ fmov b5, c11 -+ SUB a4, c12, b5 -+ fmov b5, c12 -+ -+ SUB b1, c13, b5 -+ fmov b5, c13 -+ SUB b2, c14, b5 -+ fmov b5, c14 -+ SUB b3, c15, b5 -+ fmov b5, c15 -+ SUB b4, c16, b5 -+ fmov b5, c16 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ MUL a1, c16, b5 -+ fmov b5, c16 -+ -+ MUL a2, c04, b5 -+ fmov b5, t1 -+ MUL a2, c08, b5 -+ fmov b5, t2 -+ MUL a2, c12, b5 -+ fmov b5, t3 -+ MUL a2, c16, b5 -+ fmov b5, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c15, t4, b5 -+ fmov b5, c15 -+ -+ MUL a3, c04, b5 -+ fmov b5, t1 -+ MUL a3, c08, b5 -+ fmov b5, t2 -+ MUL a3, c12, b5 -+ fmov b5, t3 -+ MUL a3, c16, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL a4, c04, b5 -+ fmov b5, t1 -+ MUL a4, c08, b5 -+ fmov b5, t2 -+ MUL a4, c12, b5 -+ fmov b5, t3 -+ MUL a4, c16, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, b5 -+ fmov b5, c03 -+ MUL b1, c07, b5 -+ fmov b5, c07 -+ MUL b1, c11, b5 -+ fmov b5, c11 -+ MUL b1, c15, b5 -+ fmov b5, c15 -+ -+ MUL b2, c03, b5 -+ fmov b5, t1 -+ MUL b2, c07, b5 -+ fmov b5, t2 -+ MUL b2, c11, b5 -+ fmov b5, t3 -+ MUL b2, c15, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL b3, c03, b5 -+ fmov b5, t1 -+ MUL b3, c07, b5 -+ fmov b5, t2 -+ MUL b3, c11, b5 -+ fmov b5, t3 -+ MUL b3, c15, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c10, b5 -+ fmov b5, t3 -+ MUL a2, c14, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c09, b5 -+ fmov b5, c09 -+ MUL a3, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ MUL a2, c09, b5 -+ fmov b5, t3 -+ MUL a2, c13, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c05, b5 -+ fmov b5, t2 -+ MUL a3, c09, b5 -+ fmov b5, t3 -+ MUL a3, c13, b5 -+ fmov b5, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c15, t4, b5 -+ fmov b5, c15 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c05, b5 -+ fmov b5, t2 -+ MUL a4, c09, b5 -+ fmov b5, t3 -+ MUL a4, c13, b5 -+ fmov b5, t4 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ SUB c12, t3, b5 -+ fmov b5, c12 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, b5 -+ fmov b5, c02 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ MUL b1, c10, b5 -+ fmov b5, c10 -+ MUL b1, c14, b5 -+ fmov b5, c14 -+ -+ MUL b2, c02, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ MUL b2, c10, b5 -+ fmov b5, t3 -+ MUL b2, c14, b5 -+ fmov b5, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c07, t2, b5 -+ fmov b5, c07 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c15, t4, b5 -+ fmov b5, c15 -+ -+ MUL b3, c02, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ MUL b3, c10, b5 -+ fmov b5, t3 -+ MUL b3, c14, b5 -+ fmov b5, t4 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ SUB c12, t3, b5 -+ fmov b5, c12 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c15, b5 -+ fmov b5, c15 -+ -+ MUL a2, c03, b5 -+ fmov b5, t1 -+ MUL a2, c07, b5 -+ fmov b5, t2 -+ MUL a2, c11, b5 -+ fmov b5, t3 -+ MUL a2, c15, b5 -+ fmov b5, t4 -+ -+ SUB c04, t1, b5 -+ fmov b5, c04 -+ SUB c08, t2, b5 -+ fmov b5, c08 -+ SUB c12, t3, b5 -+ fmov b5, c12 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ MUL a3, c04, b5 -+ fmov b5, c04 -+ MUL a3, c08, b5 -+ fmov b5, c08 -+ MUL a3, c12, b5 -+ fmov b5, c12 -+ MUL a3, c16, b5 -+ fmov b5, c16 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ MUL a2, c03, b5 -+ fmov b5, t3 -+ MUL a2, c04, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c02, b5 -+ fmov b5, t2 -+ MUL a3, c03, b5 -+ fmov b5, t3 -+ MUL a3, c04, b5 -+ fmov b5, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c02, b5 -+ fmov b5, t2 -+ MUL a4, c03, b5 -+ fmov b5, t3 -+ MUL a4, c04, b5 -+ fmov b5, t4 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ SUB c15, t3, b5 -+ fmov b5, c15 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, b5 -+ fmov b5, c05 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ MUL b1, c07, b5 -+ fmov b5, c07 -+ MUL b1, c08, b5 -+ fmov b5, c08 -+ -+ MUL b2, c05, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ MUL b2, c07, b5 -+ fmov b5, t3 -+ MUL b2, c08, b5 -+ fmov b5, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL b3, c05, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ MUL b3, c07, b5 -+ fmov b5, t3 -+ MUL b3, c08, b5 -+ fmov b5, t4 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ SUB c15, t3, b5 -+ fmov b5, c15 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ MUL a2, c09, b5 -+ fmov b5, t1 -+ MUL a2, c10, b5 -+ fmov b5, t2 -+ MUL a2, c11, b5 -+ fmov b5, t3 -+ MUL a2, c12, b5 -+ fmov b5, t4 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ SUB c15, t3, b5 -+ fmov b5, c15 -+ SUB c16, t4, b5 -+ fmov b5, c16 -+ -+ MUL a3, c13, b5 -+ fmov b5, c13 -+ MUL a3, c14, b5 -+ fmov b5, c14 -+ MUL a3, c15, b5 -+ fmov b5, c15 -+ MUL a3, c16, b5 -+ fmov b5, c16 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ MUL a1, c15, b5 -+ fmov b5, c15 -+ MUL a1, c16, b5 -+ fmov b5, c16 -+ -+ MUL a2, c13, b5 -+ fmov b5, t1 -+ MUL a2, c14, b5 -+ fmov b5, t2 -+ MUL a2, c15, b5 -+ fmov b5, t3 -+ MUL a2, c16, b5 -+ fmov b5, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a3, c13, b5 -+ fmov b5, t1 -+ MUL a3, c14, b5 -+ fmov b5, t2 -+ MUL a3, c15, b5 -+ fmov b5, t3 -+ MUL a3, c16, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL a4, c13, b5 -+ fmov b5, t1 -+ MUL a4, c14, b5 -+ fmov b5, t2 -+ MUL a4, c15, b5 -+ fmov b5, t3 -+ MUL a4, c16, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, b5 -+ fmov b5, c09 -+ MUL b1, c10, b5 -+ fmov b5, c10 -+ MUL b1, c11, b5 -+ fmov b5, c11 -+ MUL b1, c12, b5 -+ fmov b5, c12 -+ -+ MUL b2, c09, b5 -+ fmov b5, t1 -+ MUL b2, c10, b5 -+ fmov b5, t2 -+ MUL b2, c11, b5 -+ fmov b5, t3 -+ MUL b2, c12, b5 -+ fmov b5, t4 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c07, t3, b5 -+ fmov b5, c07 -+ SUB c08, t4, b5 -+ fmov b5, c08 -+ -+ MUL b3, c09, b5 -+ fmov b5, t1 -+ MUL b3, c10, b5 -+ fmov b5, t2 -+ MUL b3, c11, b5 -+ fmov b5, t3 -+ MUL b3, c12, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c07, b5 -+ fmov b5, c07 -+ MUL a1, c08, b5 -+ fmov b5, c08 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c07, b5 -+ fmov b5, t3 -+ MUL a2, c08, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c03, b5 -+ fmov b5, c03 -+ MUL a3, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+ -+ ST c03, 8 * SIZE(BO) -+ ST c07, 9 * SIZE(BO) -+ ST c11, 10 * SIZE(BO) -+ ST c15, 11 * SIZE(BO) -+ -+ ST c04, 12 * SIZE(BO) -+ ST c08, 13 * SIZE(BO) -+ ST c12, 14 * SIZE(BO) -+ ST c16, 15 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) -+ -+ ST c09, 8 * SIZE(AO) -+ ST c10, 9 * SIZE(AO) -+ ST c11, 10 * SIZE(AO) -+ ST c12, 11 * SIZE(AO) -+ -+ ST c13, 12 * SIZE(AO) -+ ST c14, 13 * SIZE(AO) -+ ST c15, 14 * SIZE(AO) -+ ST c16, 15 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+ ldi C3, -4 * SIZE(C3) -+ ldi C4, -4 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c11, 2 * SIZE(C3) -+ ST c12, 3 * SIZE(C3) -+ -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ ST c15, 2 * SIZE(C4) -+ ST c16, 3 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+ ldi C3, 4 * SIZE(C3) -+ ldi C4, 4 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ -+ bgt I, $L11 -+ .align 4 -+ -+$L20: -+ and M, 2, I -+ ble I, $L30 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c01 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(B) -+ fclr c02 -+ fclr c06 -+ ble KK, $L28 -+ -+ ble L, $L25 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c01 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ ldi BO, 4 * SIZE(BO) -+ fclr c02 -+ fclr c06 -+ ble TMP1, $L28 -+ -+ ble L, $L25 -+#endif -+ .align 4 -+ -+$L22: -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ FIMOVD b5, tmp -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ IFMOVD tmp, b5 -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ IFMOVD tmp, b5 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 -+ -+ -+$L25: -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 -+#else -+ blbs TMP1, $L27 -+#endif -+ -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ unop -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ unop -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, b5 -+ fmov b5, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b4, b5 -+ fmov b5, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L27: -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ MUL a2, b1, b5 -+ fmov b5, t2 -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ MUL a1, b2, b5 -+ fmov b5, t3 -+ -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ MUL a2, b2, b5 -+ fmov b5, t4 -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, b5 -+ fmov b5, t1 -+ -+ ADD c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, b5 -+ fmov b5, t2 -+ ADD c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b4, b5 -+ fmov b5, t3 -+ -+ ADD c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, b5 -+ fmov b5, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c09, t1, b5 -+ fmov b5, c09 -+ ADD c10, t2, b5 -+ fmov b5, c10 -+ ADD c13, t3, b5 -+ fmov b5, c13 -+ ADD c14, t4, b5 -+ fmov b5, c14 -+ .align 4 -+ -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+ -+ SUB b1, c02, b5 -+ fmov b5, c02 -+ SUB b2, c06, b5 -+ fmov b5, c06 -+ SUB b3, c10, b5 -+ fmov b5, c10 -+ SUB b4, c14, b5 -+ fmov b5, c14 -+ -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c05, b5 -+ fmov b5, c05 -+ SUB a4, c06, b5 -+ fmov b5, c06 -+ -+ SUB b1, c09, b5 -+ fmov b5, c09 -+ SUB b2, c10, b5 -+ fmov b5, c10 -+ SUB b3, c13, b5 -+ fmov b5, c13 -+ SUB b4, c14, b5 -+ fmov b5, c14 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ -+ MUL a2, c02, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ MUL a2, c10, b5 -+ fmov b5, t3 -+ MUL a2, c14, b5 -+ fmov b5, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c05, t2, b5 -+ fmov b5, c05 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c13, t4, b5 -+ fmov b5, c13 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c05, b5 -+ fmov b5, c05 -+ MUL a3, c09, b5 -+ fmov b5, c09 -+ MUL a3, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c05, b5 -+ fmov b5, t2 -+ MUL a2, c09, b5 -+ fmov b5, t3 -+ MUL a2, c13, b5 -+ fmov b5, t4 -+ -+ SUB c02, t1, b5 -+ fmov b5, c02 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ SUB c10, t3, b5 -+ fmov b5, c10 -+ SUB c14, t4, b5 -+ fmov b5, c14 -+ -+ MUL a3, c02, b5 -+ fmov b5, c02 -+ MUL a3, c06, b5 -+ fmov b5, c06 -+ MUL a3, c10, b5 -+ fmov b5, c10 -+ MUL a3, c14, b5 -+ fmov b5, c14 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ MUL a2, c02, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ MUL a3, c02, b5 -+ fmov b5, t2 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ MUL a4, c02, b5 -+ fmov b5, t2 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, b5 -+ fmov b5, c05 -+ MUL b1, c06, b5 -+ fmov b5, c06 -+ -+ MUL b2, c05, b5 -+ fmov b5, t1 -+ MUL b2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL b3, c05, b5 -+ fmov b5, t1 -+ MUL b3, c06, b5 -+ fmov b5, t2 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ MUL a2, c09, b5 -+ fmov b5, t1 -+ MUL a2, c10, b5 -+ fmov b5, t2 -+ -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ SUB c14, t2, b5 -+ fmov b5, c14 -+ -+ MUL a3, c13, b5 -+ fmov b5, c13 -+ MUL a3, c14, b5 -+ fmov b5, c14 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ MUL a1, c14, b5 -+ fmov b5, c14 -+ -+ MUL a2, c13, b5 -+ fmov b5, t1 -+ MUL a2, c14, b5 -+ fmov b5, t2 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL a3, c13, b5 -+ fmov b5, t1 -+ MUL a3, c14, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL a4, c13, b5 -+ fmov b5, t1 -+ MUL a4, c14, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, b5 -+ fmov b5, c09 -+ MUL b1, c10, b5 -+ fmov b5, c10 -+ -+ MUL b2, c09, b5 -+ fmov b5, t1 -+ MUL b2, c10, b5 -+ fmov b5, t2 -+ -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ SUB c06, t2, b5 -+ fmov b5, c06 -+ -+ MUL b3, c09, b5 -+ fmov b5, t1 -+ MUL b3, c10, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c06, b5 -+ fmov b5, c06 -+ -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ MUL a2, c06, b5 -+ fmov b5, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ MUL a3, c01, b5 -+ fmov b5, c01 -+ MUL a3, c02, b5 -+ fmov b5, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+ -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) -+ -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c13, 6 * SIZE(AO) -+ ST c14, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+ ldi C3, -2 * SIZE(C3) -+ ldi C4, -2 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+ ldi C3, 2 * SIZE(C3) -+ ldi C4, 2 * SIZE(C4) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L30: -+ and M, 1, I -+ ble I, $L39 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(B) -+ fclr c09 -+ LD b4, 3 * SIZE(B) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(B) -+ ble KK, $L38 -+ -+ ble L, $L35 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 -+ -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c09 -+ LD b4, 3 * SIZE(BO) -+ fclr c13 -+ -+ ldi BO, 4 * SIZE(BO) -+ ble TMP1, $L38 -+ -+ ble L, $L35 -+#endif -+ .align 4 -+ -+$L32: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ ldi AO, 2 * SIZE(AO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ LD b5, 3 * SIZE(BO) -+ FIMOVD b5, tmp -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL a1, b4, t4 -+ LD a1, -1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a2, b1, t1 -+ LD b1, 4 * SIZE(BO) -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a2, b2, t2 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ LD b4, -1 * SIZE(BO) -+ MUL a2, b3, t3 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ IFMOVD tmp, b5 -+ MUL a2, b5, t4 -+ LD a2, 0 * SIZE(AO) -+ bgt L, $L32 -+ .align 4 -+ -+$L35: -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, b5 -+ fmov b5, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L37 -+#else -+ blbs TMP1, $L37 -+#endif -+ .align 4 -+ -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ LD b1, 0 * SIZE(BO) -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ MUL a1, b3, b5 -+ fmov b5, t3 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ MUL a1, b4, b5 -+ fmov b5, t4 -+ LD a1, 0 * SIZE(AO) -+ ldi AO, 1 * SIZE(AO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, b5 -+ fmov b5, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L37: -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ MUL a1, b2, b5 -+ fmov b5, t2 -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ MUL a1, b3, b5 -+ fmov b5, t3 -+ -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b4, b5 -+ fmov b5, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD c01, t1, b5 -+ fmov b5, c01 -+ ADD c05, t2, b5 -+ fmov b5, c05 -+ ADD c09, t3, b5 -+ fmov b5, c09 -+ ADD c13, t4, b5 -+ fmov b5, c13 -+ -+$L38: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c05, b5 -+ fmov b5, c05 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a2, c01, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL a3, c01, b5 -+ fmov b5, t1 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ MUL a4, c01, b5 -+ fmov b5, t1 -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, b5 -+ fmov b5, c05 -+ MUL b2, c05, b5 -+ fmov b5, t1 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ MUL b3, c05, b5 -+ fmov b5, t1 -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a2, c09, b5 -+ fmov b5, t1 -+ SUB c13, t1, b5 -+ fmov b5, c13 -+ MUL a3, c13, b5 -+ fmov b5, c13 -+#endif -+ -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, b5 -+ fmov b5, c13 -+ MUL a2, c13, b5 -+ fmov b5, t1 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ MUL a3, c13, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL a4, c13, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, b5 -+ fmov b5, c09 -+ MUL b2, c09, b5 -+ fmov b5, t1 -+ SUB c05, t1, b5 -+ fmov b5, c05 -+ MUL b3, c09, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, b5 -+ fmov b5, c05 -+ MUL a2, c05, b5 -+ fmov b5, t1 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ MUL a3, c01, b5 -+ fmov b5, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c13, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+ ldi C3, -1 * SIZE(C3) -+ ldi C4, -1 * SIZE(C4) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c09, 0 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L39: -+#ifdef LN -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 4, KK -+#endif -+ -+#ifdef RT -+ subl KK, 4, KK -+#endif -+ ldi J, -1(J) -+ bgt J, $L01 -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldl $9, 64($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S.bak b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak -new file mode 100644 -index 0000000..af57279 ---- /dev/null -+++ b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak -@@ -0,0 +1,4072 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+ -+#if !defined(SW2B) -+#error "Architecture is not specified." -+#endif -+ -+#ifdef SW2B -+#define PREFETCHSIZE 56 -+#define UNOP nop -+#endif -+ -+#ifdef EV6 -+#define PREFETCHSIZE 56 -+#define UNOP unop -+#endif -+ -+#ifdef EV5 -+#define PREFETCHSIZE 56 -+#define UNOP -+#endif -+ -+#ifdef EV4 -+#define UNOP -+#endif -+ -+#define STACKSIZE 80 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $20 -+#define B $21 -+#define C $22 -+#define LDC $23 -+ -+#define C1 $19 -+#define C2 $24 -+#define C3 $25 -+#define C4 $27 -+ -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define alpha $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 -+ -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 -+ -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 -+ -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 -+ -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define AORIG $3 -+#define OFFSET $4 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl C, 0 + STACKSIZE($sp) -+ ldl LDC, 8 + STACKSIZE($sp) -+ ldl OFFSET, 16 + STACKSIZE($sp) -+ -+ SXADDQ LDC, 0, LDC -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 -+ -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 -+ -+#ifdef LN -+ mull M, K, TMP1 -+ SXADDQ TMP1, A, A -+ SXADDQ M, C, C -+#endif -+ -+#ifdef RN -+ negq OFFSET, KK -+#endif -+ -+#ifdef RT -+ mull N, K, TMP1 -+ SXADDQ TMP1, B, B -+ -+ mull N, LDC, TMP1 -+ addl TMP1, C, C -+ -+ subl N, OFFSET, KK -+#endif -+ -+ and N, 1, J -+ ble J, $L40 -+ -+#ifdef RT -+ sll K, BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ subl C, LDC, C -+#endif -+ -+ mov C, C1 -+#ifndef RT -+ addl C, LDC, C -+#endif -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ ble I, $L100 -+ .align 4 -+ -+$L91: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L95 -+ -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L95 -+#endif -+ .align 5 -+ -+$L92: -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi L, -1(L) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 8 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 9 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 10 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 11 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ LD a1, 12 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD a2, 13 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b3, t3 -+ LD a3, 14 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b3, t4 -+ LD a5, 15 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ MUL a1, b4, t1 -+ LD a1, 16 * SIZE(AO) -+ ldi AO, 16 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b4, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L92 -+ .align 4 -+ -+$L95: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ unop -+ ble L, $L98 -+ .align 4 -+ -+$L96: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ ldi BO, 1 * SIZE(BO) -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b1, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b1, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b1, 0 * SIZE(BO) -+ -+ ldi AO, 4 * SIZE(AO) -+ bgt L, $L96 -+ .align 4 -+ -+$L98: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, c04 -+ MUL a2, c04, t1 -+ SUB c03, t1, c03 -+ MUL a3, c04, t1 -+ SUB c02, t1, c02 -+ MUL a4, c04, t1 -+ SUB c01, t1, c01 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, c03 -+ MUL b2, c03, t1 -+ SUB c02, t1, c02 -+ MUL b3, c03, t1 -+ SUB c01, t1, c01 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a2, c02, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c02, t1, c02 -+ MUL a3, c01, t1 -+ SUB c03, t1, c03 -+ MUL a4, c01, t1 -+ SUB c04, t1, c04 -+ -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, c02 -+ MUL b2, c02, t1 -+ SUB c03, t1, c03 -+ MUL b3, c02, t1 -+ SUB c04, t1, c04 -+ -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, c03 -+ MUL a2, c03, t1 -+ SUB c04, t1, c04 -+ MUL a3, c04, c04 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 4, KK -+#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ bgt I, $L91 -+ .align 4 -+ -+$L100: -+ and M, 2, I -+ ble I, $L110 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ ble L, $L105 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ ble L, $L105 -+#endif -+ .align 5 -+ -+$L102: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c03, t3, c03 -+ ldi BO, 4 * SIZE(BO) -+ MUL a3, b2, t3 -+ LD a3, 6 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a5, 7 * SIZE(AO) -+ LD b2, 1 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ MUL a1, b3, t1 -+ LD a1, 8 * SIZE(AO) -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, 3 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 3 * SIZE(BO) -+ bgt L, $L102 -+ .align 4 -+ -+$L105: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L108 -+ .align 4 -+ -+$L106: -+ ADD c01, t1, c01 -+ ldi L, -1(L) -+ MUL a1, b1, t1 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b1, t2 -+ LD a2, 3 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi AO, 2 * SIZE(AO) -+ unop -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L106 -+ .align 4 -+ -+$L108: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+ ADD c01, c03, c01 -+ ADD c02, c04, c02 -+ -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a2, c02, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c02, t1, c02 -+ MUL a3, c02, c02 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 -+ -+$L110: -+ and M, 1, I -+ ble I, $L119 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c02 -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c04 -+ -+ sra KK, 2, L -+ mov B, BO -+ unop -+ ble L, $L115 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c02 -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c04 -+ -+ sra TMP1, 2, L -+ unop -+ ble L, $L115 -+#endif -+ .align 4 -+ -+$L112: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+ LD a1, 4 * SIZE(AO) -+ LD b1, 4 * SIZE(BO) -+ -+ ADD c02, t2, c02 -+ MUL a2, b2, t2 -+ LD a2, 5 * SIZE(AO) -+ LD b2, 5 * SIZE(BO) -+ -+ ADD c03, t3, c03 -+ MUL a3, b3, t3 -+ LD a3, 6 * SIZE(AO) -+ LD b3, 6 * SIZE(BO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b4, t4 -+ LD a4, 7 * SIZE(AO) -+ LD b4, 7 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 4 * SIZE(AO) -+ ldi BO, 4 * SIZE(BO) -+ bgt L, $L112 -+ .align 4 -+ -+$L115: -+#if defined(LT) || defined(RN) -+ and KK, 3, L -+#else -+ and TMP1, 3, L -+#endif -+ ble L, $L118 -+ .align 4 -+ -+$L116: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+ LD a1, 1 * SIZE(AO) -+ LD b1, 1 * SIZE(BO) -+ -+ ldi L, -1(L) -+ ldi AO, 1 * SIZE(AO) -+ ldi BO, 1 * SIZE(BO) -+ bgt L, $L116 -+ .align 4 -+ -+$L118: -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c03, t3, c03 -+ ADD c04, t4, c04 -+ -+ ADD c01, c02, c01 -+ ADD c03, c04, c03 -+ ADD c01, c03, c01 -+ -+#if defined(LN) || defined(RT) -+ subl KK, 1, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ addl B, TMP2, BO -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+#else -+ LD a1, 0 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 1 * SIZE(C1) -+#endif -+ -+#ifdef RT -+ SXADDQ K, AORIG, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 1, KK -+#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L119: -+#ifdef LN -+ SXADDQ K, B, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 1, KK -+#endif -+ -+#ifdef RT -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L40: -+ and N, 2, J -+ ble J, $L80 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ addl LDC, LDC, TMP1 -+ subl C, TMP1, C -+#endif -+ -+ mov C, C1 -+ addl C, LDC, C2 -+ fclr t1 -+#ifndef RT -+ addl C2, LDC, C -+#endif -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L60 -+ .align 4 -+ -+$L51: -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c03 -+ LD a2, 1 * SIZE(AO) -+ fclr c07 -+ LD a3, 2 * SIZE(AO) -+ fclr c04 -+ LD a4, 3 * SIZE(AO) -+ fclr c08 ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 @@ -59843,25 +5221,19 @@ index 0000000..af57279 + LD b4, 3 * SIZE(B) + fclr c06 + -+ ldi L, -2(KK) -+ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif + ldi BO, 2 * SIZE(B) + ldi AO, 4 * SIZE(AO) -+ -+ ble KK, $L58 -+ + ble L, $L55 +#else -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ + sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) @@ -59885,9 +5257,6 @@ index 0000000..af57279 + ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) + ldi AO, 4 * SIZE(AO) -+ -+ ble TMP1, $L58 -+ + ble L, $L55 +#endif + .align 4 @@ -59946,413 +5315,228 @@ index 0000000..af57279 + ADD c07, t3, c07 + unop + MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b4, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L52 -+ .align 4 -+ -+$L55: -+ ADD c05, t1, c05 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L57 -+#else -+ blbs TMP1, $L57 -+#endif -+ .align 4 -+ -+ ADD c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD c07, t3, c07 -+ MUL a3, b1, t3 -+ -+ ADD c08, t4, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD c05, t1, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L57: -+ ADD c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD c07, t3, c07 -+ MUL a3, b1, t3 -+ -+ ADD c08, t4, c08 -+ MUL a4, b1, t4 -+ ADD c01, t1, c01 -+ MUL a1, b2, t1 -+ -+ ADD c02, t2, c02 -+ MUL a2, b2, t2 -+ ADD c03, t3, c03 -+ MUL a3, b2, t3 -+ -+ ADD c04, t4, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c05, t1, c05 -+ ADD c06, t2, c06 -+ ADD c07, t3, c07 -+ ADD c08, t4, c08 -+ .align 4 -+ -+$L58: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c02, c02 -+ SUB a4, c06, c06 -+ -+ SUB b1, c03, c03 -+ SUB b2, c07, c07 -+ SUB b3, c04, c04 -+ SUB b4, c08, c08 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+ -+ SUB b1, c05, c05 -+ SUB b2, c06, c06 -+ SUB b3, c07, c07 -+ SUB b4, c08, c08 -+#endif -+ -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, c04 -+ MUL a1, c08, c08 -+ -+ MUL a2, c04, t1 -+ MUL a2, c08, t2 -+ -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ -+ MUL a3, c04, t1 -+ MUL a3, c08, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL a4, c04, t1 -+ MUL a4, c08, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, c03 -+ MUL b1, c07, c07 -+ -+ MUL b2, c03, t1 -+ MUL b2, c07, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL b3, c03, t1 -+ MUL b3, c07, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 ++ ldi AO, 8 * SIZE(AO) + -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) + -+ MUL a3, c01, t1 -+ MUL a3, c05, t2 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) + -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) + -+ MUL a4, c01, t1 -+ MUL a4, c05, t2 ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) + -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 + -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, c02 -+ MUL b1, c06, c06 ++$L55: ++ ADD c05, t1, c05 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L58 ++#else ++ blbs TMP1, $L58 ++#endif ++ .align 4 + -+ MUL b2, c02, t1 -+ MUL b2, c06, t2 ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 + -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) + -+ MUL b3, c02, t1 -+ MUL b3, c06, t2 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) + -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) + -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, c03 -+ MUL a1, c07, c07 ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) + -+ MUL a2, c03, t1 -+ MUL a2, c07, t2 ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) + -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 + -+ MUL a3, c04, c04 -+ MUL a3, c08, c08 ++$L58: ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop +#endif + -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ MUL a2, c03, t3 -+ MUL a2, c04, t4 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif + -+ MUL a3, c05, c05 -+ MUL a3, c06, c06 -+ MUL a3, c07, c07 -+ MUL a3, c08, c08 ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++#ifndef TRMMKERNEL ++ LD c11, 2 * SIZE(C1) ++#else ++ unop +#endif + -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++#ifndef TRMMKERNEL ++ LD c12, 3 * SIZE(C1) ++#else ++ unop ++#endif + -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ MUL a1, c07, c07 -+ MUL a1, c08, c08 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++#ifndef TRMMKERNEL ++ LD c13, 0 * SIZE(C2) ++ unop ++#endif + -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ MUL a2, c07, t3 -+ MUL a2, c08, t4 ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++#ifndef TRMMKERNEL ++ LD c14, 1 * SIZE(C2) ++#else ++ unop ++#endif + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++#ifndef TRMMKERNEL ++ LD c15, 2 * SIZE(C2) ++#else ++ unop ++#endif + -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+ MUL a3, c03, c03 -+ MUL a3, c04, c04 ++ ADD c05, t1, c05 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD c16, 3 * SIZE(C2) ++#else ++ unop +#endif + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) ++ ADD c06, t2, c06 ++ ldi I, -1(I) ++ MUL alpha, c02, c02 ++ unop + -+ ST c03, 4 * SIZE(BO) -+ ST c07, 5 * SIZE(BO) -+ ST c04, 6 * SIZE(BO) -+ ST c08, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) ++ ADD c07, t3, c07 ++ MUL alpha, c03, c03 ++ ADD c08, t4, c08 ++ MUL alpha, c04, c04 + -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) ++ MUL alpha, c05, c05 ++#ifndef TRMMKERNEL ++ ADD c01, c09, c01 ++#endif ++ MUL alpha, c06, c06 ++#ifndef TRMMKERNEL ++ ADD c02, c10, c02 +#endif + -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) ++ MUL alpha, c07, c07 ++#ifndef TRMMKERNEL ++ ADD c03, c11, c03 ++#endif ++ MUL alpha, c08, c08 ++#ifndef TRMMKERNEL ++ ADD c04, c12, c04 +#endif + ++#ifndef TRMMKERNEL ++ ADD c05, c13, c05 ++#endif + ST c01, 0 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c06, c14, c06 ++#endif + ST c02, 1 * SIZE(C1) ++ ++#ifndef TRMMKERNEL ++ ADD c07, c15, c07 ++#endif + ST c03, 2 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c08, c16, c08 ++#endif + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+#endif -+ + fclr t1 ++ ST c06, 1 * SIZE(C2) + fclr t2 ++ ST c07, 2 * SIZE(C2) + fclr t3 ++ ST c08, 3 * SIZE(C2) + fclr t4 + -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) + -+#if defined(LT) || defined(RN) ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + -+#ifdef LT ++#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 4, KK +#endif -+ -+#ifdef LN -+ subl KK, 4, KK -+#endif -+ -+ ldi I, -1(I) -+ + bgt I, $L51 + .align 4 + @@ -60360,7 +5544,17 @@ index 0000000..af57279 + and M, 2, I + ble I, $L70 + -+#if defined(LT) || defined(RN) ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif + + LD a1, 0 * SIZE(AO) + fclr c01 @@ -60372,28 +5566,23 @@ index 0000000..af57279 + fclr c06 + + LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif + LD b2, 1 * SIZE(B) + ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L68 -+ + ble L, $L65 +#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) @@ -60413,9 +5602,6 @@ index 0000000..af57279 + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L68 -+ + ble L, $L65 +#endif + .align 4 @@ -60469,11 +5655,12 @@ index 0000000..af57279 + +$L65: + ADD c01, t1, c01 ++ fldd alpha, ALPHA + MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L67 ++#ifndef TRMMKERNEL ++ blbs K, $L68 +#else -+ blbs TMP1, $L67 ++ blbs TMP1, $L68 +#endif + .align 4 + @@ -60498,193 +5685,103 @@ index 0000000..af57279 + ldi AO, 2 * SIZE(AO) + .align 4 + -+$L67: ++$L68: + ADD c02, t2, c02 ++ unop + MUL a2, b1, t2 -+ ADD c05, t3, c05 -+ MUL a1, b2, t3 -+ -+ ADD c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD c01, t1, c01 -+ ADD c02, t2, c02 -+ ADD c05, t3, c05 -+ ADD c06, t4, c06 -+ .align 4 -+ -+$L68: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) +#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) ++ unop +#endif + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c02, c02 -+ SUB a4, c06, c06 ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) +#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c05, c05 -+ SUB a4, c06, c06 -+#endif -+ -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) -+ -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ -+ MUL a3, c02, c02 -+ MUL a3, c06, c06 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ -+ MUL a3, c05, c05 -+ MUL a3, c06, c06 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 ++ unop +#endif + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c02, 2 * SIZE(BO) -+ ST c06, 3 * SIZE(BO) ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c11, 0 * SIZE(C2) +#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) ++ unop +#endif + -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) ++ ADD c01, t1, c01 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD c12, 1 * SIZE(C2) ++#else ++ unop +#endif + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ -+#ifndef LN ++ ADD c02, t2, c02 + ldi C1, 2 * SIZE(C1) ++ MUL alpha, c02, c02 + ldi C2, 2 * SIZE(C2) ++ ++ ADD c05, t3, c05 ++ MUL alpha, c05, c05 ++ ADD c06, t4, c06 ++ MUL alpha, c06, c06 ++ ++#ifndef TRMMKERNEL ++ ADD c01, c09, c01 ++ ADD c02, c10, c02 ++ ADD c05, c11, c05 ++ ADD c06, c12, c06 +#endif + ++ ST c01, -2 * SIZE(C1) + fclr t1 ++ ST c02, -1 * SIZE(C1) + fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ST c05, -2 * SIZE(C2) ++ fclr t3 ++ ST c06, -1 * SIZE(C2) ++ fclr t4 + -+#if defined(LT) || defined(RN) ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + -+#ifdef LT ++#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + -+#if defined(LT) || defined(RN) ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif + + LD a1, 0 * SIZE(AO) + fclr c01 @@ -60696,27 +5793,22 @@ index 0000000..af57279 + LD b2, 1 * SIZE(B) + fclr c06 + -+ ldi L, -2(KK) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif + + LD b3, 2 * SIZE(B) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + ldi BO, 2 * SIZE(B) -+ -+ ble KK, $L78 -+ + ble L, $L75 +#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ + sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) @@ -60729,15 +5821,16 @@ index 0000000..af57279 + LD b2, 1 * SIZE(BO) + fclr c06 + ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else + ldi L, -2(TMP1) ++#endif + + LD b3, 2 * SIZE(BO) + ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + ldi BO, 2 * SIZE(BO) -+ -+ ble TMP1, $L78 -+ + ble L, $L75 +#endif + .align 4 @@ -60771,11 +5864,12 @@ index 0000000..af57279 + +$L75: + ADD c01, t1, c01 ++ fldd alpha, ALPHA + MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L77 ++#ifndef TRMMKERNEL ++ blbs K, $L78 +#else -+ blbs TMP1, $L77 ++ blbs TMP1, $L78 +#endif + .align 4 + @@ -60791,2061 +5885,2013 @@ index 0000000..af57279 + ldi BO, 2 * SIZE(BO) + .align 4 + -+$L77: ++$L78: + ADD c05, t2, c05 + MUL a1, b2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ + ADD c02, t3, c02 + ADD c06, t4, c06 ++#ifndef TRMMKERNEL ++ LD b5, 0 * SIZE(C2) ++#else ++ unop ++#endif + + ADD c01, c02, c01 -+ ldi AO, 1 * SIZE(AO) + ADD c05, c06, c05 -+ ldi BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + -+ .align 4 -+ -+$L78: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c05, t1, c05 -+ MUL a3, c05, c05 -+#endif -+ -+#ifdef RT -+ LD a1, 3 * SIZE(BO) -+ LD a2, 2 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) -+ -+ MUL a1, c05, c05 -+ MUL a2, c05, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+#endif ++ MUL alpha, c01, c01 ++ MUL alpha, c05, c05 + -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c01, a5, c01 ++ ADD c05, b5, c05 +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + -+#ifdef LT ++#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 1, KK +#endif -+ -+#ifdef LN -+ subl KK, 1, KK -+#endif + .align 4 + +$L79: -+#ifdef LN -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN ++ mov BO, B ++#if defined(TRMMKERNEL) && !defined(LEFT) + addl KK, 2, KK ++#else ++ unop +#endif -+ -+#ifdef RT -+ subl KK, 2, KK -+#endif ++ unop ++ unop + .align 4 + +$L80: -+ sra N, 2, J ++ and N, 1, J + ble J, $L999 -+ .align 4 -+ -+$L01: -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ subl B, TMP1, B -+ -+ s4addl LDC, 0, TMP1 -+ subl C, TMP1, C -+#endif + + mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C3 -+#ifndef RT -+ s4addl LDC, C, C -+#endif -+ -+ fclr t1 -+ addl C3, LDC, C4 -+ fclr t2 -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif ++ mov A, AO + -+#ifdef LT ++#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ + sra M, 2, I -+ fclr t3 -+ fclr t4 -+ ble I, $L20 ++ ble I, $L100 + .align 4 + -+$L11: -+#if defined(LT) || defined(RN) ++$L91: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif + -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ LD b1, 0 * SIZE(B) ++ LD b1, 0 * SIZE(B) + fclr c01 -+ LD b2, 1 * SIZE(B) ++ LD b2, 1 * SIZE(B) + fclr c02 -+ -+ LD b3, 2 * SIZE(B) -+ fclr c06 -+ LD b4, 3 * SIZE(B) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(KK) ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) + fclr c04 + -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(B) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble KK, $L18 ++#ifndef TRMMKERNEL ++ sra K, 2, L +#else -+ -+#ifdef LN -+ sll K, BASE_SHIFT + 2, TMP1 -+ subl AORIG, TMP1, AORIG ++ sra TMP1, 2, L +#endif -+ ++ mov B, BO ++ unop ++ ble L, $L95 ++#else + sll KK, BASE_SHIFT + 2, TMP1 -+ addl AORIG, TMP1, AO -+ addl B, TMP1, BO -+ ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO + subl K, KK, TMP1 + -+ LD a1, 0 * SIZE(AO) -+ fclr c11 -+ LD a2, 1 * SIZE(AO) -+ fclr c12 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c16 -+ LD a4, 3 * SIZE(AO) -+ fclr c15 ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ LD b1, 0 * SIZE(BO) ++ LD b1, 0 * SIZE(BO) + fclr c01 -+ LD b2, 1 * SIZE(BO) ++ LD b2, 1 * SIZE(BO) + fclr c02 -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c06 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c03 -+ ldi L, -2(TMP1) ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) + fclr c04 + -+ fillcs 7 * SIZE(C2) -+ fclr c08 -+ ldi BO, 4 * SIZE(BO) -+ fclr c13 -+ -+ fillcs 4 * SIZE(C3) -+ fclr c09 -+ ldi AO, 4 * SIZE(AO) -+ fclr c10 -+ -+ fillcs 7 * SIZE(C4) -+ fclr c14 -+ fclr c07 -+ ble TMP1, $L18 -+#endif -+ -+ ble L, $L15 -+ .align 5 -+ -+$L12: -+/* 1 */ -+ ADD c11, t1, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) ++#ifndef TRMMKERNEL ++ sra K, 2, L +#else -+ unop ++ sra TMP1, 2, L +#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else + unop ++ ble L, $L95 +#endif ++ .align 5 + -+ ADD c12, t2, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD c15, t4, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ -+/* 2 */ ++$L92: + ADD c01, t1, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD c02, t2, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP -+ -+ ADD c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, c05 -+ unop -+ MUL b4, a1, t4 + unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+/* 3 */ -+ ADD c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ unop ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) + -+ ADD c04, t2, c04 -+ unop -+ MUL b3, a2, t2 ++ ADD c03, t3, c03 + unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) + -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+ ADD c13, t4, c13 ++ ADD c01, t1, c01 + unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) + -+/* 4 */ -+ ADD c09, t1, c09 ++ ADD c02, t2, c02 + unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) + -+ ADD c10, t2, c10 ++ ADD c03, t3, c03 + unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) + -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) + -+ ADD c07, t4, c07 ++ ADD c01, t1, c01 + unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) + -+/* 5 */ -+ ADD c11, t1, c11 ++ ADD c02, t2, c02 + unop -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD c12, t2, c12 -+ ldi L, -2(L) -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) + -+ ADD c16, t3, c16 -+ unop -+ MUL b2, a2, t3 ++ ADD c03, t3, c03 + unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) + -+ ADD c15, t4, c15 -+ unop -+ MUL b2, a5, t4 -+ unop ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) + -+/* 6 */ + ADD c01, t1, c01 -+ unop -+ MUL b5, a6, t1 -+ unop ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) + + ADD c02, t2, c02 -+ unop -+ MUL b5, a4, t2 -+ unop -+ -+ ADD c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD c05, t4, c05 -+ unop -+ MUL b4, a5, t4 -+ unop ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) + -+/* 7 */ -+ ADD c03, t1, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) + -+ ADD c04, t2, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 + -+ ADD c08, t3, c08 ++$L95: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA + unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) ++ ble L, $L98 ++ .align 4 + -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+/* 8 */ -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) + -+ ADD c10, t2, c10 ++ ADD c03, t3, c03 + unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) + -+ ADD c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) + -+ ADD c07, t4, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 + .align 4 + -+$L15: -+ ADD c11, t1, c11 -+ MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L17 ++$L98: ++#ifndef TRMMKERNEL ++ ADD c01, t1, c01 ++ LD c05, 0 * SIZE(C1) ++ ADD c02, t2, c02 ++ LD c06, 1 * SIZE(C1) ++ ADD c03, t3, c03 ++ LD c07, 2 * SIZE(C1) ++ ADD c04, t4, c04 ++ LD c08, 3 * SIZE(C1) +#else -+ blbs TMP1, $L17 ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 +#endif -+ .align 4 + -+ ADD c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD c16, t3, c16 -+ MUL b2, a2, t3 ++ MUL alpha, c01, c01 ++ MUL alpha, c02, c02 ++ MUL alpha, c03, c03 ++ MUL alpha, c04, c04 + -+ ADD c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD c01, t1, c01 -+ MUL b1, a3, t1 ++#ifndef TRMMKERNEL ++ ADD c01, c05, c01 ++ ADD c02, c06, c02 ++ ADD c03, c07, c03 ++ ADD c04, c08, c04 ++#endif + -+ ADD c02, t2, c02 -+ unop -+ MUL b1, a4, t2 -+ LD b1, 0 * SIZE(BO) ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + -+ ADD c06, t3, c06 -+ MUL b2, a4, t3 -+ ADD c05, t4, c05 -+ MUL b4, a1, t4 ++ ldi C1, 4 * SIZE(C1) + -+ ADD c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ LD a1, 0 * SIZE(AO) ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ADD c04, t2, c04 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I + unop -+ MUL b3, a2, t2 + unop ++ ble I, $L110 ++ .align 4 + -+ ADD c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) ++$L101: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+ ADD c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif + -+ ADD c09, t1, c09 -+ unop -+ MUL b3, a3, t1 -+ ldi AO, 4 * SIZE(AO) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+ ADD c14, t3, c14 ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO + unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) ++ ble L, $L105 ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 + -+ ADD c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD c11, t1, c11 ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 + LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 ++ fclr c04 + -+$L17: -+ ADD c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD c16, t3, c16 -+ MUL b2, a2, t3 ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L105 ++#endif ++ .align 5 + -+ ADD c15, t4, c15 -+ MUL b2, a1, t4 ++$L102: + ADD c01, t1, c01 -+ MUL b1, a3, t1 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 -+ MUL b1, a4, t2 -+ ADD c06, t3, c06 -+ MUL b2, a4, t3 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+ ADD c05, t4, c05 -+ MUL b4, a1, t4 -+ ADD c03, t1, c03 -+ MUL b3, a1, t1 ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) + -+ ADD c04, t2, c04 -+ MUL b3, a2, t2 -+ ADD c08, t3, c08 -+ MUL b4, a2, t3 ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) + -+ ADD c13, t4, c13 -+ MUL b2, a3, t4 -+ ADD c09, t1, c09 -+ MUL b3, a3, t1 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) + -+ ADD c10, t2, c10 -+ MUL b3, a4, t2 -+ ADD c14, t3, c14 -+ MUL b4, a4, t3 ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) + -+ ADD c07, t4, c07 -+ ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, t4 -+ ldi BO, 4 * SIZE(BO) ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) + -+ ADD c11, t1, c11 -+ ADD c12, t2, c12 -+ ADD c16, t3, c16 -+ ADD c15, t4, c15 ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 + .align 4 + -+$L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 4, TMP1 ++$L105: ++#ifndef TRMMKERNEL ++ and K, 3, L +#else -+ subl KK, 4, TMP1 ++ and TMP1, 3, L +#endif -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) ++ fldd alpha, ALPHA ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C1) ++ LD a4, 1 * SIZE(C1) +#endif ++ ble L, $L108 ++ .align 4 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) + -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) + -+ SUB b1, c02, c02 -+ SUB b2, c06, c06 -+ SUB b3, c10, c10 -+ SUB b4, c14, c14 ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 + -+ LD a1, 8 * SIZE(BO) -+ LD a2, 9 * SIZE(BO) -+ LD a3, 10 * SIZE(BO) -+ LD a4, 11 * SIZE(BO) ++$L108: ++ ADD c01, t1, c01 ++ fclr t1 ++ ADD c02, t2, c02 ++ fclr t2 ++ ADD c03, t3, c03 ++ fclr t3 ++ ADD c04, t4, c04 ++ fclr t4 + -+ LD b1, 12 * SIZE(BO) -+ LD b2, 13 * SIZE(BO) -+ LD b3, 14 * SIZE(BO) -+ LD b4, 15 * SIZE(BO) ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 + -+ SUB a1, c03, c03 -+ SUB a2, c07, c07 -+ SUB a3, c11, c11 -+ SUB a4, c15, c15 ++ MUL alpha, c01, c01 ++ MUL alpha, c02, c02 + -+ SUB b1, c04, c04 -+ SUB b2, c08, c08 -+ SUB b3, c12, c12 -+ SUB b4, c16, c16 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) ++#ifndef TRMMKERNEL ++ ADD c01, a3, c01 ++ ADD c02, a4, c02 ++#endif + -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ldi C1, 2 * SIZE(C1) + -+ SUB b1, c05, c05 -+ SUB b2, c06, c06 -+ SUB b3, c07, c07 -+ SUB b4, c08, c08 ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ LD a1, 8 * SIZE(AO) -+ LD a2, 9 * SIZE(AO) -+ LD a3, 10 * SIZE(AO) -+ LD a4, 11 * SIZE(AO) ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 + -+ LD b1, 12 * SIZE(AO) -+ LD b2, 13 * SIZE(AO) -+ LD b3, 14 * SIZE(AO) -+ LD b4, 15 * SIZE(AO) ++$L110: ++ and M, 1, I ++ ble I, $L999 ++ .align 4 + -+ SUB a1, c09, c09 -+ SUB a2, c10, c10 -+ SUB a3, c11, c11 -+ SUB a4, c12, c12 ++$L111: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+ SUB b1, c13, c13 -+ SUB b2, c14, c14 -+ SUB b3, c15, c15 -+ SUB b4, c16, c16 ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif +#endif + -+#ifdef LN -+ LD a1, 15 * SIZE(AO) -+ LD a2, 14 * SIZE(AO) -+ LD a3, 13 * SIZE(AO) -+ LD a4, 12 * SIZE(AO) -+ -+ MUL a1, c04, c04 -+ MUL a1, c08, c08 -+ MUL a1, c12, c12 -+ MUL a1, c16, c16 ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ MUL a2, c04, t1 -+ MUL a2, c08, t2 -+ MUL a2, c12, t3 -+ MUL a2, c16, t4 ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ SUB c11, t3, c11 -+ SUB c15, t4, c15 ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 + -+ MUL a3, c04, t1 -+ MUL a3, c08, t2 -+ MUL a3, c12, t3 -+ MUL a3, c16, t4 ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 + -+ MUL a4, c04, t1 -+ MUL a4, c08, t2 -+ MUL a4, c12, t3 -+ MUL a4, c16, t4 ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L115 ++#endif ++ .align 4 + -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+ LD b1, 10 * SIZE(AO) -+ LD b2, 9 * SIZE(AO) -+ LD b3, 8 * SIZE(AO) -+ -+ MUL b1, c03, c03 -+ MUL b1, c07, c07 -+ MUL b1, c11, c11 -+ MUL b1, c15, c15 ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) + -+ MUL b2, c03, t1 -+ MUL b2, c07, t2 -+ MUL b2, c11, t3 -+ MUL b2, c15, t4 ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) + -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) + -+ MUL b3, c03, t1 -+ MUL b3, c07, t2 -+ MUL b3, c11, t3 -+ MUL b3, c15, t4 ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 + -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 ++$L115: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++#ifndef TRMMKERNEL ++ LD a2, 0 * SIZE(C1) ++#endif ++ ble L, $L118 ++ .align 4 + -+ LD a1, 5 * SIZE(AO) -+ LD a2, 4 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) + -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ MUL a1, c10, c10 -+ MUL a1, c14, c14 ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 + -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ MUL a2, c10, t3 -+ MUL a2, c14, t4 ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 + -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 + -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+ MUL a3, c09, c09 -+ MUL a3, c13, c13 ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ ADD c01, a2, c01 +#endif ++ ST c01, 0 * SIZE(C1) ++ .align 4 + -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ MUL a1, c09, c09 -+ MUL a1, c13, c13 ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/gemv_n.S b/kernel/sw_64/gemv_n.S +new file mode 100644 +index 000000000..1d9f65493 +--- /dev/null ++++ b/kernel/sw_64/gemv_n.S +@@ -0,0 +1,1307 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ MUL a2, c09, t3 -+ MUL a2, c13, t4 ++#define ASSEMBLER ++#include "common.h" + -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 + -+ MUL a3, c01, t1 -+ MUL a3, c05, t2 -+ MUL a3, c09, t3 -+ MUL a3, c13, t4 ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 + -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ SUB c11, t3, c11 -+ SUB c15, t4, c15 ++#define M $16 ++#define N $17 ++#define A $20 ++#define LDA $21 + -+ MUL a4, c01, t1 -+ MUL a4, c05, t2 -+ MUL a4, c09, t3 -+ MUL a4, c13, t4 ++#define X $18 ++#define INCX $19 ++#define Y $22 ++#define INCY $23 + -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ SUB c12, t3, c12 -+ SUB c16, t4, c16 ++#define BUFFER $24 + -+ LD b1, 5 * SIZE(AO) -+ LD b2, 6 * SIZE(AO) -+ LD b3, 7 * SIZE(AO) -+ -+ MUL b1, c02, c02 -+ MUL b1, c06, c06 -+ MUL b1, c10, c10 -+ MUL b1, c14, c14 ++#define I $25 ++#define J $27 + -+ MUL b2, c02, t1 -+ MUL b2, c06, t2 -+ MUL b2, c10, t3 -+ MUL b2, c14, t4 ++#define Y1 $4 + -+ SUB c03, t1, c03 -+ SUB c07, t2, c07 -+ SUB c11, t3, c11 -+ SUB c15, t4, c15 ++#define A1 $5 ++#define A2 $6 ++#define A3 $7 ++#define A4 $8 + -+ MUL b3, c02, t1 -+ MUL b3, c06, t2 -+ MUL b3, c10, t3 -+ MUL b3, c14, t4 ++#define alpha $f19 + -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ SUB c12, t3, c12 -+ SUB c16, t4, c16 ++#define alpha1 $f0 ++#define alpha2 $f1 ++#define alpha3 $f10 ++#define alpha4 $f11 + -+ LD a1, 10 * SIZE(AO) -+ LD a2, 11 * SIZE(AO) -+ LD a3, 15 * SIZE(AO) -+ -+ MUL a1, c03, c03 -+ MUL a1, c07, c07 -+ MUL a1, c11, c11 -+ MUL a1, c15, c15 ++#define y0 $f12 ++#define y1 $f13 ++#define y2 $f14 ++#define y3 $f15 + -+ MUL a2, c03, t1 -+ MUL a2, c07, t2 -+ MUL a2, c11, t3 -+ MUL a2, c15, t4 ++#define y4 $f16 ++#define y5 $f17 ++#define y6 $f18 ++#define y7 $f21 + -+ SUB c04, t1, c04 -+ SUB c08, t2, c08 -+ SUB c12, t3, c12 -+ SUB c16, t4, c16 ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 + -+ MUL a3, c04, c04 -+ MUL a3, c08, c08 -+ MUL a3, c12, c12 -+ MUL a3, c16, c16 -+#endif ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 + -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 ++ PROLOGUE + -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 -+ MUL a2, c03, t3 -+ MUL a2, c04, t4 ++ ldi $sp, -STACKSIZE($sp) ++ ldl X, 0 + STACKSIZE($sp) ++ ldl INCX, 8 + STACKSIZE($sp) ++ ldl Y, 16 + STACKSIZE($sp) ++ ldl INCY, 24 + STACKSIZE($sp) ++ ldl BUFFER, 32 + STACKSIZE($sp) + -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) + -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c03, t3 -+ MUL a3, c04, t4 ++ PROFCODE + -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 ++ cmple M, 0, $0 ++ SXADDQ INCX, 0, INCX ++ cmple N, 0, $1 ++ SXADDQ INCY, 0, INCY + -+ MUL a4, c01, t1 -+ MUL a4, c02, t2 -+ MUL a4, c03, t3 -+ MUL a4, c04, t4 ++ or $0, $1, $0 ++ bne $0, $L999 + -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ SUB c15, t3, c15 -+ SUB c16, t4, c16 ++ SXADDQ LDA, 0, LDA + -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, c05 -+ MUL b1, c06, c06 -+ MUL b1, c07, c07 -+ MUL b1, c08, c08 ++ cmpeq INCY, SIZE, $0 ++ bne $0, $L10 + -+ MUL b2, c05, t1 -+ MUL b2, c06, t2 -+ MUL b2, c07, t3 -+ MUL b2, c08, t4 ++ mov BUFFER, Y1 + -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 ++ mov Y, BUFFER ++ mov Y1, Y + -+ MUL b3, c05, t1 -+ MUL b3, c06, t2 -+ MUL b3, c07, t3 -+ MUL b3, c08, t4 ++ sra M, 3, I ++ ble I, $L05 ++ .align 4 + -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ SUB c15, t3, c15 -+ SUB c16, t4, c16 ++$L02: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ ST $f31, 2 * SIZE(Y1) ++ ST $f31, 3 * SIZE(Y1) ++ ST $f31, 4 * SIZE(Y1) ++ ST $f31, 5 * SIZE(Y1) ++ ST $f31, 6 * SIZE(Y1) ++ ST $f31, 7 * SIZE(Y1) + -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 ++ ldi Y1, 8 * SIZE(Y1) ++ ldi I, -1(I) ++ bgt I, $L02 ++ .align 4 + -+ MUL a2, c09, t1 -+ MUL a2, c10, t2 -+ MUL a2, c11, t3 -+ MUL a2, c12, t4 ++$L05: ++ and M, 7, I ++ ble I, $L10 ++ .align 4 + -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 -+ SUB c15, t3, c15 -+ SUB c16, t4, c16 ++$L06: ++ ST $f31, 0 * SIZE(Y1) ++ addl Y1, SIZE, Y1 + -+ MUL a3, c13, c13 -+ MUL a3, c14, c14 -+ MUL a3, c15, c15 -+ MUL a3, c16, c16 -+#endif ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 + -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, c13 -+ MUL a1, c14, c14 -+ MUL a1, c15, c15 -+ MUL a1, c16, c16 ++$L10: ++ sra N, 2, J ++ ble J, $L20 ++ .align 4 + -+ MUL a2, c13, t1 -+ MUL a2, c14, t2 -+ MUL a2, c15, t3 -+ MUL a2, c16, t4 ++$L11: ++ LD alpha1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha3, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha4, 0 * SIZE(X) ++ addl X, INCX, X + -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 ++ MUL alpha, alpha1, alpha1 ++ MUL alpha, alpha2, alpha2 ++ MUL alpha, alpha3, alpha3 ++ MUL alpha, alpha4, alpha4 + -+ MUL a3, c13, t1 -+ MUL a3, c14, t2 -+ MUL a3, c15, t3 -+ MUL a3, c16, t4 ++ mov A, A1 ++ addl A, LDA, A2 ++ addl A2, LDA, A3 ++ addl A3, LDA, A4 ++ s4addl LDA, A, A + -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 ++ mov Y, Y1 ++ s_fillcs 4 * SIZE(X) + -+ MUL a4, c13, t1 -+ MUL a4, c14, t2 -+ MUL a4, c15, t3 -+ MUL a4, c16, t4 ++ sra M, 3, I ++ ble I, $L15 + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) + -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, c09 -+ MUL b1, c10, c10 -+ MUL b1, c11, c11 -+ MUL b1, c12, c12 ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) + -+ MUL b2, c09, t1 -+ MUL b2, c10, t2 -+ MUL b2, c11, t3 -+ MUL b2, c12, t4 ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) + -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 -+ SUB c07, t3, c07 -+ SUB c08, t4, c08 ++ LD a8, 0 * SIZE(A3) ++ LD a9, 1 * SIZE(A3) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 3 * SIZE(A3) + -+ MUL b3, c09, t1 -+ MUL b3, c10, t2 -+ MUL b3, c11, t3 -+ MUL b3, c12, t4 ++ LD y4, 4 * SIZE(Y1) ++ LD y5, 5 * SIZE(Y1) ++ LD y6, 6 * SIZE(Y1) ++ LD y7, 7 * SIZE(Y1) + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 ++ MUL alpha1, a0, a0 ++ LD a12, 0 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ LD a13, 1 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ LD a14, 2 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ LD a15, 3 * SIZE(A4) + -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) ++ ADD y0, a0, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, a4 ++ unop + -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 -+ MUL a1, c07, c07 -+ MUL a1, c08, c08 ++ ADD y1, a1, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, a5 ++ unop + -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 -+ MUL a2, c07, t3 -+ MUL a2, c08, t4 ++ ADD y2, a2, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, a6 ++ unop + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 ++ ADD y3, a3, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, a7 ++ unop + -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+ MUL a3, c03, c03 -+ MUL a3, c04, c04 -+#endif ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha3, a8, a8 ++ unop + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha3, a9, a9 ++ ldi I, -1(I) + -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha3, a10, a10 ++ unop + -+ ST c03, 8 * SIZE(BO) -+ ST c07, 9 * SIZE(BO) -+ ST c11, 10 * SIZE(BO) -+ ST c15, 11 * SIZE(BO) ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha3, a11, a11 ++ unop + -+ ST c04, 12 * SIZE(BO) -+ ST c08, 13 * SIZE(BO) -+ ST c12, 14 * SIZE(BO) -+ ST c16, 15 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) ++ ADD y0, a8, y0 ++ LD a8, 4 * SIZE(A3) ++ MUL alpha4, a12, a12 ++ ble I, $L13 ++ .align 4 + -+ ST c05, 4 * SIZE(AO) -+ ST c06, 5 * SIZE(AO) -+ ST c07, 6 * SIZE(AO) -+ ST c08, 7 * SIZE(AO) ++$L12: ++ ADD y1, a9, y1 ++ LD a9, 5 * SIZE(A3) ++ MUL alpha4, a13, a13 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) + -+ ST c09, 8 * SIZE(AO) -+ ST c10, 9 * SIZE(AO) -+ ST c11, 10 * SIZE(AO) -+ ST c12, 11 * SIZE(AO) ++ ADD y2, a10, y2 ++ LD a10, 6 * SIZE(A3) ++ MUL alpha4, a14, a14 ++ unop + -+ ST c13, 12 * SIZE(AO) -+ ST c14, 13 * SIZE(AO) -+ ST c15, 14 * SIZE(AO) -+ ST c16, 15 * SIZE(AO) -+#endif ++ ADD y3, a11, y3 ++ LD a11, 7 * SIZE(A3) ++ MUL alpha4, a15, a15 ++ ldi I, -1(I) + -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+ ldi C3, -4 * SIZE(C3) -+ ldi C4, -4 * SIZE(C4) -+#endif ++ ADD y0, a12, y0 ++ LD a12, 4 * SIZE(A4) ++ MUL alpha1, a0, a0 ++ fillde (PREFETCHSIZE + 0) * SIZE(Y1) + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) ++ ADD y1, a13, y1 ++ LD a13, 5 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ unop + -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) -+ ST c07, 2 * SIZE(C2) -+ ST c08, 3 * SIZE(C2) ++ ADD y2, a14, y2 ++ LD a14, 6 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ unop + -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c11, 2 * SIZE(C3) -+ ST c12, 3 * SIZE(C3) ++ ADD y3, a15, y3 ++ LD a15, 7 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) + -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) -+ ST c15, 2 * SIZE(C4) -+ ST c16, 3 * SIZE(C4) ++ ADD y4, a0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ LD a0, 8 * SIZE(A1) + -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+ ldi C3, 4 * SIZE(C3) -+ ldi C4, 4 * SIZE(C4) -+#endif ++ ADD y5, a1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ LD a1, 9 * SIZE(A1) + -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 ++ ADD y6, a2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ LD a2, 10 * SIZE(A1) + -+#ifdef RT -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ADD y7, a3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ LD a3, 11 * SIZE(A1) + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 2, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif ++ ADD y4, a4, y4 ++ LD a4, 8 * SIZE(A2) ++ MUL alpha3, a8, a8 ++ LD y0, 8 * SIZE(Y1) + -+#ifdef LT -+ addl KK, 4, KK -+#endif ++ ADD y5, a5, y5 ++ LD a5, 9 * SIZE(A2) ++ MUL alpha3, a9, a9 ++ LD y1, 9 * SIZE(Y1) + -+#ifdef LN -+ subl KK, 4, KK -+#endif ++ ADD y6, a6, y6 ++ LD a6, 10 * SIZE(A2) ++ MUL alpha3, a10, a10 ++ LD y2, 10 * SIZE(Y1) + -+ ldi I, -1(I) ++ ADD y7, a7, y7 ++ LD a7, 11 * SIZE(A2) ++ MUL alpha3, a11, a11 ++ LD y3, 11 * SIZE(Y1) + -+ bgt I, $L11 -+ .align 4 ++ ADD y4, a8, y4 ++ LD a8, 8 * SIZE(A3) ++ MUL alpha4, a12, a12 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A3) + -+$L20: -+ and M, 2, I -+ ble I, $L30 ++ ADD y5, a9, y5 ++ LD a9, 9 * SIZE(A3) ++ MUL alpha4, a13, a13 ++ ldi A1, 8 * SIZE(A1) + -+#if defined(LT) || defined(RN) ++ ADD y6, a10, y6 ++ LD a10, 10 * SIZE(A3) ++ MUL alpha4, a14, a14 ++ ldi A2, 8 * SIZE(A2) + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++ ADD y7, a11, y7 ++ LD a11, 11 * SIZE(A3) ++ MUL alpha4, a15, a15 ++ ldi Y1, 8 * SIZE(Y1) + -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 ++ ADD y4, a12, y4 ++ LD a12, 8 * SIZE(A4) ++ MUL alpha1, a0, a0 ++ unop + -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) ++ ADD y5, a13, y5 ++ LD a13, 9 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ ldi A3, 8 * SIZE(A3) + -+ LD b3, 2 * SIZE(B) -+ fclr c01 -+ LD b4, 3 * SIZE(B) -+ fclr c05 ++ ADD y6, a14, y6 ++ LD a14, 10 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A4) + -+ ldi BO, 4 * SIZE(B) -+ fclr c02 -+ fclr c06 -+ ble KK, $L28 ++ ADD y7, a15, y7 ++ LD a15, 11 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ ldi A4, 8 * SIZE(A4) + -+ ble L, $L25 ++ ADD y0, a0, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, a4 ++ ST y4, -4 * SIZE(Y1) + -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif ++ ADD y1, a1, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, a5 ++ ST y5, -3 * SIZE(Y1) + -+ sll KK, BASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO ++ ADD y2, a2, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, a6 ++ ST y6, -2 * SIZE(Y1) + -+ subl K, KK, TMP1 ++ ADD y3, a3, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, a7 ++ ST y7, -1 * SIZE(Y1) + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha3, a8, a8 ++ LD y4, 4 * SIZE(Y1) + -+ LD a3, 2 * SIZE(AO) -+ fclr c10 -+ LD a4, 3 * SIZE(AO) -+ fclr c14 ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha3, a9, a9 ++ LD y5, 5 * SIZE(Y1) + -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha3, a10, a10 ++ LD y6, 6 * SIZE(Y1) + -+ LD b3, 2 * SIZE(BO) -+ fclr c01 -+ LD b4, 3 * SIZE(BO) -+ fclr c05 ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha3, a11, a11 ++ LD y7, 7 * SIZE(Y1) + -+ ldi BO, 4 * SIZE(BO) -+ fclr c02 -+ fclr c06 -+ ble TMP1, $L28 ++ ADD y0, a8, y0 ++ LD a8, 4 * SIZE(A3) ++ MUL alpha4, a12, a12 ++ bgt I, $L12 ++ .align 4 + -+ ble L, $L25 -+#endif -+ .align 4 ++$L13: ++ ADD y1, a9, y1 ++ LD a9, 5 * SIZE(A3) ++ MUL alpha4, a13, a13 ++ unop + -+$L22: -+ ADD c09, t1, c09 ++ ADD y2, a10, y2 ++ LD a10, 6 * SIZE(A3) ++ MUL alpha4, a14, a14 + unop -+ MUL a1, b1, t1 ++ ++ ADD y3, a11, y3 ++ LD a11, 7 * SIZE(A3) ++ MUL alpha4, a15, a15 + unop + -+ ADD c10, t2, c10 ++ ADD y0, a12, y0 ++ LD a12, 4 * SIZE(A4) ++ MUL alpha1, a0, a0 + unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) + -+ ADD c13, t3, c13 ++ ADD y1, a13, y1 ++ LD a13, 5 * SIZE(A4) ++ MUL alpha1, a1, a1 + unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) + -+ ADD c14, t4, c14 ++ ADD y2, a14, y2 ++ LD a14, 6 * SIZE(A4) ++ MUL alpha1, a2, a2 + unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) + -+ ADD c01, t1, c01 ++ ADD y3, a15, y3 ++ LD a15, 7 * SIZE(A4) ++ MUL alpha1, a3, a3 + unop -+ MUL a1, b3, t1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ADD y4, a0, y4 + unop ++ MUL alpha2, a4, a4 + -+ ADD c02, t2, c02 ++ ST y1, 1 * SIZE(Y1) ++ ADD y5, a1, y5 + unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) ++ MUL alpha2, a5, a5 ++ ++ ST y2, 2 * SIZE(Y1) ++ ADD y6, a2, y6 ++ unop ++ MUL alpha2, a6, a6 ++ ++ ST y3, 3 * SIZE(Y1) ++ ADD y7, a3, y7 ++ ldi Y1, 8 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ ++ ADD y4, a4, y4 ++ MUL alpha3, a8, a8 ++ ADD y5, a5, y5 ++ MUL alpha3, a9, a9 ++ ADD y6, a6, y6 ++ MUL alpha3, a10, a10 ++ ADD y7, a7, y7 ++ MUL alpha3, a11, a11 ++ ++ ADD y4, a8, y4 ++ MUL alpha4, a12, a12 ++ ADD y5, a9, y5 ++ MUL alpha4, a13, a13 ++ ADD y6, a10, y6 ++ MUL alpha4, a14, a14 ++ ADD y7, a11, y7 ++ MUL alpha4, a15, a15 ++ ++ ADD y4, a12, y4 ++ ADD y5, a13, y5 ++ ADD y6, a14, y6 ++ ADD y7, a15, y7 ++ ++ ST y4, -4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, -3 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) ++ ST y6, -2 * SIZE(Y1) ++ ldi A3, 8 * SIZE(A3) ++ ST y7, -1 * SIZE(Y1) ++ ldi A4, 8 * SIZE(A4) ++ .align 4 ++ ++$L15: ++ and M, 4, I ++ ble I, $L16 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 0 * SIZE(A3) ++ LD a9, 1 * SIZE(A3) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 3 * SIZE(A3) ++ ++ MUL alpha1, a0, a0 ++ LD a12, 0 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ LD a13, 1 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ LD a14, 2 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ LD a15, 3 * SIZE(A4) ++ ++ ADD y0, a0, y0 ++ MUL alpha2, a4, a4 ++ ADD y1, a1, y1 ++ MUL alpha2, a5, a5 ++ ADD y2, a2, y2 ++ MUL alpha2, a6, a6 ++ ADD y3, a3, y3 ++ MUL alpha2, a7, a7 ++ ++ ADD y0, a4, y0 ++ MUL alpha3, a8, a8 ++ ADD y1, a5, y1 ++ MUL alpha3, a9, a9 ++ ADD y2, a6, y2 ++ MUL alpha3, a10, a10 ++ ADD y3, a7, y3 ++ MUL alpha3, a11, a11 ++ ++ ADD y0, a8, y0 ++ MUL alpha4, a12, a12 ++ ADD y1, a9, y1 ++ MUL alpha4, a13, a13 ++ ADD y2, a10, y2 ++ MUL alpha4, a14, a14 ++ ADD y3, a11, y3 ++ MUL alpha4, a15, a15 + -+ ADD c05, t3, c05 ++ ADD y0, a12, y0 ++ ldi Y1, 4 * SIZE(Y1) ++ ADD y1, a13, y1 + unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) + -+ ADD c06, t4, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ -+ ADD c09, t1, c09 ++ ADD y2, a14, y2 + unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD c10, t2, c10 ++ ADD y3, a15, y3 + unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) + -+ ADD c13, t3, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) ++ ST y0, -4 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, -3 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, -2 * SIZE(Y1) ++ ldi A3, 4 * SIZE(A3) ++ ST y3, -1 * SIZE(Y1) ++ ldi A4, 4 * SIZE(A4) ++ .align 4 + -+ ADD c14, t4, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) ++$L16: ++ and M, 2, I ++ ble I, $L17 + -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) + -+ ADD c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) + -+ ADD c05, t3, c05 -+ unop -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) ++ LD a4, 0 * SIZE(A3) ++ MUL alpha1, a0, a0 ++ LD a5, 1 * SIZE(A3) ++ MUL alpha1, a1, a1 ++ LD a6, 0 * SIZE(A4) ++ MUL alpha2, a2, a2 ++ LD a7, 1 * SIZE(A4) ++ MUL alpha2, a3, a3 + -+ ADD c06, t4, c06 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 ++ ADD y0, a0, y0 ++ MUL alpha3, a4, a4 ++ ADD y1, a1, y1 ++ MUL alpha3, a5, a5 ++ ADD y0, a2, y0 ++ MUL alpha4, a6, a6 ++ ADD y1, a3, y1 ++ MUL alpha4, a7, a7 + -+$L25: -+ ADD c09, t1, c09 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 -+#else -+ blbs TMP1, $L27 -+#endif ++ ADD y0, a4, y0 ++ ldi A1, 2 * SIZE(A1) ++ ADD y1, a5, y1 ++ ldi A2, 2 * SIZE(A2) ++ ADD y0, a6, y0 ++ ldi A3, 2 * SIZE(A3) ++ ADD y1, a7, y1 ++ ldi A4, 2 * SIZE(A4) + -+ ADD c10, t2, c10 ++ ST y0, 0 * SIZE(Y1) + unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 + -+ ADD c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ unop ++$L17: ++ blbc M, $L18 + -+ ADD c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) ++ LD y0, 0 * SIZE(Y1) + -+ ADD c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) + -+ ADD c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) ++ MUL alpha1, a0, a0 ++ MUL alpha2, a1, a1 ++ MUL alpha3, a2, a2 ++ MUL alpha4, a3, a3 + -+ ADD c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) ++ ADD y0, a0, y0 ++ ADD y0, a1, y0 ++ ADD y0, a2, y0 ++ ADD y0, a3, y0 + -+ ADD c06, t4, c06 -+ unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) ++ ST y0, 0 * SIZE(Y1) ++ .align 4 + -+ ADD c09, t1, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) ++$L18: ++ ldi J, -1(J) ++ bgt J, $L11 + .align 4 + -+$L27: -+ ADD c10, t2, c10 -+ MUL a2, b1, t2 -+ ADD c13, t3, c13 -+ MUL a1, b2, t3 ++$L20: ++ and N, 2, J ++ ble J, $L30 + -+ ADD c14, t4, c14 -+ MUL a2, b2, t4 -+ ADD c01, t1, c01 -+ MUL a1, b3, t1 ++ LD alpha1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha2, 0 * SIZE(X) ++ addl X, INCX, X + -+ ADD c02, t2, c02 -+ MUL a2, b3, t2 -+ ADD c05, t3, c05 -+ MUL a1, b4, t3 ++ mov A, A1 ++ MUL alpha, alpha1, alpha1 ++ addl A, LDA, A2 ++ MUL alpha, alpha2, alpha2 + -+ ADD c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, t4 -+ ldi BO, 4 * SIZE(BO) ++ addl A2, LDA, A ++ mov Y, Y1 + -+ ADD c09, t1, c09 -+ ADD c10, t2, c10 -+ ADD c13, t3, c13 -+ ADD c14, t4, c14 -+ .align 4 ++ sra M, 3, I ++ ble I, $L25 + -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) + -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) + -+ SUB b1, c02, c02 -+ SUB b2, c06, c06 -+ SUB b3, c10, c10 -+ SUB b4, c14, c14 ++ MUL alpha1, a0, a0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a2, a2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a3, a3 ++ LD y7, 7 * SIZE(Y1) + -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) ++ ADD y0, a0, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, a4 + -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c05, c05 -+ SUB a4, c06, c06 ++ ADD y1, a1, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, a5 + -+ SUB b1, c09, c09 -+ SUB b2, c10, c10 -+ SUB b3, c13, c13 -+ SUB b4, c14, c14 -+#endif ++ ADD y2, a2, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, a6 + -+#ifdef LN -+ LD a1, 3 * SIZE(AO) -+ LD a2, 2 * SIZE(AO) -+ LD a3, 0 * SIZE(AO) ++ ADD y3, a3, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, a7 + -+ MUL a1, c02, c02 -+ MUL a1, c06, c06 -+ MUL a1, c10, c10 -+ MUL a1, c14, c14 ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha1, a0, a0 + -+ MUL a2, c02, t1 -+ MUL a2, c06, t2 -+ MUL a2, c10, t3 -+ MUL a2, c14, t4 ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha1, a1, a1 + -+ SUB c01, t1, c01 -+ SUB c05, t2, c05 -+ SUB c09, t3, c09 -+ SUB c13, t4, c13 ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha1, a2, a2 + -+ MUL a3, c01, c01 -+ MUL a3, c05, c05 -+ MUL a3, c09, c09 -+ MUL a3, c13, c13 -+#endif ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha1, a3, a3 + -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 3 * SIZE(AO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ MUL a1, c09, c09 -+ MUL a1, c13, c13 ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 + -+ MUL a2, c01, t1 -+ MUL a2, c05, t2 -+ MUL a2, c09, t3 -+ MUL a2, c13, t4 ++$L22: ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ldi I, -1(I) ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ ldi A2, 8 * SIZE(A2) + -+ SUB c02, t1, c02 -+ SUB c06, t2, c06 -+ SUB c10, t3, c10 -+ SUB c14, t4, c14 ++ ADD y4, a0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ LD a0, 8 * SIZE(A1) + -+ MUL a3, c02, c02 -+ MUL a3, c06, c06 -+ MUL a3, c10, c10 -+ MUL a3, c14, c14 -+#endif ++ ADD y5, a1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ LD a1, 9 * SIZE(A1) + -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 ++ ADD y6, a2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ LD a2, 10 * SIZE(A1) + -+ MUL a2, c01, t1 -+ MUL a2, c02, t2 ++ ADD y7, a3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ LD a3, 11 * SIZE(A1) + -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 ++ ADD y4, a4, y4 ++ LD a4, 0 * SIZE(A2) ++ MUL alpha1, a0, a0 ++ LD y0, 8 * SIZE(Y1) + -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 ++ ADD y5, a5, y5 ++ LD a5, 1 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ LD y1, 9 * SIZE(Y1) + -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 ++ ADD y6, a6, y6 ++ LD a6, 2 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ LD y2, 10 * SIZE(Y1) + -+ MUL a4, c01, t1 -+ MUL a4, c02, t2 ++ ADD y7, a7, y7 ++ LD a7, 3 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ LD y3, 11 * SIZE(Y1) + -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 ++ ADD y0, a0, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ LD a0, 12 * SIZE(A1) + -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, c05 -+ MUL b1, c06, c06 ++ ADD y1, a1, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ LD a1, 13 * SIZE(A1) + -+ MUL b2, c05, t1 -+ MUL b2, c06, t2 ++ ADD y2, a2, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ LD a2, 14 * SIZE(A1) + -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 ++ ADD y3, a3, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ LD a3, 15 * SIZE(A1) + -+ MUL b3, c05, t1 -+ MUL b3, c06, t2 ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha1, a0, a0 ++ LD y4, 12 * SIZE(Y1) + -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ LD y5, 13 * SIZE(Y1) + -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ LD y6, 14 * SIZE(Y1) + -+ MUL a2, c09, t1 -+ MUL a2, c10, t2 ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ LD y7, 15 * SIZE(Y1) + -+ SUB c13, t1, c13 -+ SUB c14, t2, c14 ++ fillde (PREFETCHSIZE + 0) * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L22 ++ .align 4 + -+ MUL a3, c13, c13 -+ MUL a3, c14, c14 -+#endif ++$L23: ++ ADD y4, a0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ unop + -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, c13 -+ MUL a1, c14, c14 ++ ADD y5, a1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ unop + -+ MUL a2, c13, t1 -+ MUL a2, c14, t2 ++ ADD y6, a2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ unop + -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 ++ ADD y7, a3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ unop + -+ MUL a3, c13, t1 -+ MUL a3, c14, t2 ++ ADD y4, a4, y4 ++ ADD y5, a5, y5 ++ ADD y6, a6, y6 ++ ADD y7, a7, y7 + -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 ++ ST y4, 4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, 5 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) + -+ MUL a4, c13, t1 -+ MUL a4, c14, t2 ++ ST y6, 6 * SIZE(Y1) ++ unop ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 ++$L25: ++ and M, 4, I ++ ble I, $L26 + -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, c09 -+ MUL b1, c10, c10 ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) + -+ MUL b2, c09, t1 -+ MUL b2, c10, t2 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) + -+ SUB c05, t1, c05 -+ SUB c06, t2, c06 ++ MUL alpha1, a0, a0 ++ LD a4, 0 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ LD a5, 1 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ LD a6, 2 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ LD a7, 3 * SIZE(A2) + -+ MUL b3, c09, t1 -+ MUL b3, c10, t2 ++ ADD y0, a0, y0 ++ MUL alpha2, a4, a4 ++ ADD y1, a1, y1 ++ MUL alpha2, a5, a5 ++ ADD y2, a2, y2 ++ MUL alpha2, a6, a6 ++ ADD y3, a3, y3 ++ MUL alpha2, a7, a7 + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 ++ ADD y0, a4, y0 ++ ldi Y1, 4 * SIZE(Y1) ++ ADD y1, a5, y1 ++ unop ++ ADD y2, a6, y2 ++ unop ++ ADD y3, a7, y3 ++ unop + -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) ++ ST y0, -4 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, -3 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, -2 * SIZE(Y1) ++ ldi A3, 4 * SIZE(A3) ++ ST y3, -1 * SIZE(Y1) ++ ldi A4, 4 * SIZE(A4) ++ .align 4 + -+ MUL a1, c05, c05 -+ MUL a1, c06, c06 ++$L26: ++ and M, 2, I ++ ble I, $L27 + -+ MUL a2, c05, t1 -+ MUL a2, c06, t2 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) + -+ MUL a3, c01, c01 -+ MUL a3, c02, c02 -+#endif ++ MUL alpha1, a0, a0 ++ MUL alpha1, a1, a1 ++ MUL alpha2, a2, a2 ++ MUL alpha2, a3, a3 + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) ++ ADD y0, a0, y0 ++ ldi A1, 2 * SIZE(A1) ++ ADD y1, a1, y1 ++ ldi A2, 2 * SIZE(A2) ++ ADD y0, a2, y0 ++ unop ++ ADD y1, a3, y1 ++ unop + -+ ST c02, 4 * SIZE(BO) -+ ST c06, 5 * SIZE(BO) -+ ST c10, 6 * SIZE(BO) -+ ST c14, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c05, 2 * SIZE(AO) -+ ST c06, 3 * SIZE(AO) ++ ST y0, 0 * SIZE(Y1) ++ unop ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 + -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c13, 6 * SIZE(AO) -+ ST c14, 7 * SIZE(AO) -+#endif ++$L27: ++ blbc M, $L30 + -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+ ldi C3, -2 * SIZE(C3) -+ ldi C4, -2 * SIZE(C4) -+#endif ++ LD y0, 0 * SIZE(Y1) + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c06, 1 * SIZE(C2) ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) + -+ ST c09, 0 * SIZE(C3) -+ ST c10, 1 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) -+ ST c14, 1 * SIZE(C4) ++ MUL alpha1, a0, a0 ++ MUL alpha2, a1, a1 + -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+ ldi C3, 2 * SIZE(C3) -+ ldi C4, 2 * SIZE(C4) -+#endif ++ ADD y0, a0, y0 ++ ADD y0, a1, y0 + -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 ++ ST y0, 0 * SIZE(Y1) ++ .align 4 + -+#ifdef RT -+ sll K, 1 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++$L30: ++ blbc N, $L990 + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif ++ LD alpha1, 0 * SIZE(X) ++ mov A, A1 ++ MUL alpha, alpha1, alpha1 ++ mov Y, Y1 + -+#ifdef LT -+ addl KK, 2, KK -+#endif ++ sra M, 3, I ++ ble I, $L35 + -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ .align 4 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ LD a4, 4 * SIZE(A1) ++ LD a5, 5 * SIZE(A1) ++ LD a6, 6 * SIZE(A1) ++ LD a7, 7 * SIZE(A1) + -+$L30: -+ and M, 1, I -+ ble I, $L39 ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ LD y4, 4 * SIZE(Y1) ++ LD y5, 5 * SIZE(Y1) ++ LD y6, 6 * SIZE(Y1) ++ LD y7, 7 * SIZE(Y1) + -+#if defined(LT) || defined(RN) ++ MUL alpha1, a0, a0 ++ MUL alpha1, a1, a1 ++ MUL alpha1, a2, a2 ++ MUL alpha1, a3, a3 + -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 ++ ldi I, -1(I) ++ ble I, $L33 ++ .align 4 + -+ LD b1, 0 * SIZE(B) -+ ldi L, -2(KK) -+ LD b2, 1 * SIZE(B) -+ ldi AO, 1 * SIZE(AO) ++$L32: ++ ADD y0, a0, y0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, a4 ++ LD a0, 8 * SIZE(A1) + -+ LD b3, 2 * SIZE(B) -+ fclr c09 -+ LD b4, 3 * SIZE(B) -+ fclr c13 ++ ADD y1, a1, y1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, a5 ++ LD a1, 9 * SIZE(A1) + -+ ldi BO, 4 * SIZE(B) -+ ble KK, $L38 ++ ADD y2, a2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, a6 ++ LD a2, 10 * SIZE(A1) + -+ ble L, $L35 -+#else -+#ifdef LN -+ sll K, BASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif ++ ADD y3, a3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, a7 ++ LD a3, 11 * SIZE(A1) + -+ sll KK, BASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ ST y2, 2 * SIZE(Y1) ++ ST y3, 3 * SIZE(Y1) + -+ subl K, KK, TMP1 ++ ADD y4, a4, y4 ++ LD y0, 8 * SIZE(Y1) ++ MUL alpha1, a0, a0 ++ LD a4, 12 * SIZE(A1) + -+ LD a1, 0 * SIZE(AO) -+ fclr c01 -+ LD a2, 1 * SIZE(AO) -+ fclr c05 ++ ADD y5, a5, y5 ++ LD y1, 9 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ LD a5, 13 * SIZE(A1) + -+ LD b1, 0 * SIZE(BO) -+ ldi L, -2(TMP1) -+ LD b2, 1 * SIZE(BO) -+ ldi AO, 1 * SIZE(AO) ++ ADD y6, a6, y6 ++ LD y2, 10 * SIZE(Y1) ++ MUL alpha1, a2, a2 ++ LD a6, 14 * SIZE(A1) + -+ LD b3, 2 * SIZE(BO) -+ fclr c09 -+ LD b4, 3 * SIZE(BO) -+ fclr c13 ++ ADD y7, a7, y7 ++ LD y3, 11 * SIZE(Y1) ++ MUL alpha1, a3, a3 ++ LD a7, 15 * SIZE(A1) + -+ ldi BO, 4 * SIZE(BO) -+ ble TMP1, $L38 ++ ST y4, 4 * SIZE(Y1) ++ ldi I, -1(I) ++ ST y5, 5 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) + -+ ble L, $L35 -+#endif -+ .align 4 ++ ST y6, 6 * SIZE(Y1) ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ fillde (PREFETCHSIZE + 0) * SIZE(Y1) + -+$L32: -+ ADD c01, t1, c01 -+ ldi L, -2(L) -+ MUL a1, b1, t1 -+ LD b1, 0 * SIZE(BO) ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L32 ++ .align 4 + -+ ADD c05, t2, c05 -+ ldi AO, 2 * SIZE(AO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) ++$L33: ++ ADD y0, a0, y0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, a4 ++ unop + -+ ADD c09, t3, c09 -+ LD b5, 3 * SIZE(BO) -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) ++ ADD y1, a1, y1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, a5 ++ unop + -+ ADD c13, t4, c13 -+ MUL a1, b4, t4 -+ LD a1, -1 * SIZE(AO) ++ ADD y2, a2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, a6 ++ unop + -+ ADD c01, t1, c01 -+ MUL a2, b1, t1 -+ LD b1, 4 * SIZE(BO) -+ ldi BO, 8 * SIZE(BO) ++ ADD y3, a3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, a7 ++ unop + -+ ADD c05, t2, c05 -+ MUL a2, b2, t2 -+ LD b2, -3 * SIZE(BO) ++ ADD y4, a4, y4 ++ ST y0, 0 * SIZE(Y1) ++ ADD y5, a5, y5 ++ ST y1, 1 * SIZE(Y1) ++ ADD y6, a6, y6 ++ ST y2, 2 * SIZE(Y1) ++ ADD y7, a7, y7 ++ ST y3, 3 * SIZE(Y1) + -+ ADD c09, t3, c09 -+ LD b4, -1 * SIZE(BO) -+ MUL a2, b3, t3 -+ LD b3, -2 * SIZE(BO) ++ ST y4, 4 * SIZE(Y1) ++ unop ++ ST y5, 5 * SIZE(Y1) ++ unop + -+ ADD c13, t4, c13 -+ MUL a2, b5, t4 -+ LD a2, 0 * SIZE(AO) -+ bgt L, $L32 ++ ST y6, 6 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) + .align 4 + +$L35: -+ ADD c01, t1, c01 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L37 -+#else -+ blbs TMP1, $L37 -+#endif -+ .align 4 ++ and M, 4, I ++ ble I, $L36 + -+ ADD c05, t2, c05 -+ LD b1, 0 * SIZE(BO) -+ MUL a1, b2, t2 -+ LD b2, 1 * SIZE(BO) ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) + -+ ADD c09, t3, c09 -+ MUL a1, b3, t3 -+ LD b3, 2 * SIZE(BO) ++ MUL alpha1, a0, a0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, a2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, a3 ++ LD y3, 3 * SIZE(Y1) + -+ ADD c13, t4, c13 -+ MUL a1, b4, t4 -+ LD a1, 0 * SIZE(AO) -+ ldi AO, 1 * SIZE(AO) ++ ADD y0, a0, y0 ++ ADD y1, a1, y1 ++ ADD y2, a2, y2 ++ ADD y3, a3, y3 + -+ ADD c01, t1, c01 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, 2 * SIZE(Y1) ++ unop ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) + .align 4 + -+$L37: -+ ADD c05, t2, c05 -+ MUL a1, b2, t2 -+ ADD c09, t3, c09 -+ MUL a1, b3, t3 ++$L36: ++ and M, 2, I ++ ble I, $L37 + -+ ADD c13, t4, c13 -+ ldi AO, 1 * SIZE(AO) -+ MUL a1, b4, t4 -+ ldi BO, 4 * SIZE(BO) ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) + -+ ADD c01, t1, c01 -+ ADD c05, t2, c05 -+ ADD c09, t3, c09 -+ ADD c13, t4, c13 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a0, a0 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a1, a1 + -+$L38: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 4, TMP1 -+#endif -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -1 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif ++ ADD y0, a0, y0 ++ ADD y1, a1, y1 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c05, c05 -+ SUB a3, c09, c09 -+ SUB a4, c13, c13 -+#endif ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 2 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) ++$L37: ++ blbc M, $L990 + -+ MUL a1, c01, c01 -+ MUL a1, c05, c05 -+ MUL a1, c09, c09 -+ MUL a1, c13, c13 -+#endif ++ LD y0, 0 * SIZE(Y1) ++ LD a0, 0 * SIZE(A1) + -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a1, c01, c01 -+ MUL a2, c01, t1 -+ SUB c05, t1, c05 -+ MUL a3, c01, t1 -+ SUB c09, t1, c09 -+ MUL a4, c01, t1 -+ SUB c13, t1, c13 ++ MUL alpha1, a0, a0 + -+ LD b1, 5 * SIZE(BO) -+ LD b2, 6 * SIZE(BO) -+ LD b3, 7 * SIZE(BO) -+ -+ MUL b1, c05, c05 -+ MUL b2, c05, t1 -+ SUB c09, t1, c09 -+ MUL b3, c05, t1 -+ SUB c13, t1, c13 ++ ADD y0, a0, y0 ++ ST y0, 0 * SIZE(Y1) ++ .align 4 + -+ LD a1, 10 * SIZE(BO) -+ LD a2, 11 * SIZE(BO) -+ LD a3, 15 * SIZE(BO) -+ -+ MUL a1, c09, c09 -+ MUL a2, c09, t1 -+ SUB c13, t1, c13 -+ MUL a3, c13, c13 -+#endif ++$L990: ++ cmpeq INCY, SIZE, $0 ++ bne $0, $L999 + -+#ifdef RT -+ LD a1, 15 * SIZE(BO) -+ LD a2, 14 * SIZE(BO) -+ LD a3, 13 * SIZE(BO) -+ LD a4, 12 * SIZE(BO) -+ -+ MUL a1, c13, c13 -+ MUL a2, c13, t1 -+ SUB c09, t1, c09 -+ MUL a3, c13, t1 -+ SUB c05, t1, c05 -+ MUL a4, c13, t1 -+ SUB c01, t1, c01 ++ mov BUFFER, Y1 + -+ LD b1, 10 * SIZE(BO) -+ LD b2, 9 * SIZE(BO) -+ LD b3, 8 * SIZE(BO) -+ -+ MUL b1, c09, c09 -+ MUL b2, c09, t1 -+ SUB c05, t1, c05 -+ MUL b3, c09, t1 -+ SUB c01, t1, c01 ++ sra M, 3, I ++ ble I, $L995 ++ .align 4 + -+ LD a1, 5 * SIZE(BO) -+ LD a2, 4 * SIZE(BO) -+ LD a3, 0 * SIZE(BO) ++$L992: ++ LD a0, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a1, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a2, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a3, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER + -+ MUL a1, c05, c05 -+ MUL a2, c05, t1 -+ SUB c01, t1, c01 -+ MUL a3, c01, c01 -+#endif ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ LD y2, 2 * SIZE(Y) ++ LD y3, 3 * SIZE(Y) + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c05, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c13, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c05, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c13, 3 * SIZE(AO) -+#endif ++ LD a4, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a5, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a6, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a7, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER + -+#ifdef LN -+ ldi C1, -1 * SIZE(C1) -+ ldi C2, -1 * SIZE(C2) -+ ldi C3, -1 * SIZE(C3) -+ ldi C4, -1 * SIZE(C4) -+#endif ++ LD y4, 4 * SIZE(Y) ++ LD y5, 5 * SIZE(Y) ++ LD y6, 6 * SIZE(Y) ++ LD y7, 7 * SIZE(Y) + -+ ST c01, 0 * SIZE(C1) -+ ST c05, 0 * SIZE(C2) -+ ST c09, 0 * SIZE(C3) -+ ST c13, 0 * SIZE(C4) ++ ADD a0, y0, a0 ++ ADD a1, y1, a1 ++ ADD a2, y2, a2 ++ ADD a3, y3, a3 ++ ADD a4, y4, a4 ++ ADD a5, y5, a5 ++ ADD a6, y6, a6 ++ ADD a7, y7, a7 + -+#ifdef RT -+ sll K, 0 + BASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a1, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a2, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a3, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, BASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, BASE_SHIFT + 2, TMP2 -+ addl BO, TMP2, BO -+#endif ++ ST a4, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a5, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a6, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a7, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 + -+#ifdef LT -+ addl KK, 1, KK -+#endif ++ ldi I, -1(I) ++ ldi Y, 8 * SIZE(Y) ++ bgt I, $L992 ++ .align 4 + -+#ifdef LN -+ subl KK, 1, KK -+#endif ++$L995: ++ and M, 7, I ++ ble I, $L999 + .align 4 + -+$L39: -+#ifdef LN -+ sll K, 2 + BASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif ++$L996: ++ LD a0, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER + -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif ++ LD y0, 0 * SIZE(Y) ++ ldi Y, 1 * SIZE(Y) + -+#ifdef RN -+ addl KK, 4, KK -+#endif ++ ADD a0, y0, a0 + -+#ifdef RT -+ subl KK, 4, KK -+#endif -+ ldi J, -1(J) -+ bgt J, $L01 ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L996 + .align 4 + +$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) + ret + EPILOGUE -diff --git a/kernel/sw_64/zamax.S b/kernel/sw_64/zamax.S +diff --git a/kernel/sw_64/gemv_t.S b/kernel/sw_64/gemv_t.S new file mode 100644 -index 0000000..c453e9d +index 000000000..68bce3fe5 --- /dev/null -+++ b/kernel/sw_64/zamax.S -@@ -0,0 +1,302 @@ ++++ b/kernel/sw_64/gemv_t.S +@@ -0,0 +1,1061 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -62886,818 +7932,1033 @@ index 0000000..c453e9d + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#define N $16 -+#define X $17 -+#define INCX $18 + -+#ifndef USE_MIN -+#define CMPLT(a, b) fcmplt a, b -+#else -+#define CMPLT(a, b) fcmplt b, a -+#endif ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 + -+#define STACKSIZE 8 * 8 ++#define M $16 ++#define N $17 ++#define A $20 ++#define LDA $21 ++ ++#define X $18 ++#define INCX $19 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define X1 $3 ++#define Y1 $4 ++ ++#define A1 $5 ++#define A2 $6 ++#define A3 $7 ++#define A4 $8 ++ ++#define alpha $f19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 + + PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl X, 0 + STACKSIZE($sp) ++ ldl INCX, 8 + STACKSIZE($sp) ++ ldl Y, 16 + STACKSIZE($sp) ++ ldl INCY, 24 + STACKSIZE($sp) ++ ldl BUFFER, 32 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ + PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 + -+ ldi $sp, -STACKSIZE($sp) ++ cmple M, 0, $0 ++ SXADDQ INCX, 0, INCX ++ cmple N, 0, $1 ++ SXADDQ INCY, 0, INCY + -+ fstd $f2, 0($sp) -+ fclr $f16 -+ cmplt $31, N, $2 ++ or $0, $1, $0 ++ bne $0, $L999 + -+ fstd $f3, 8($sp) -+ fclr $f17 -+ cmplt $31, INCX, $3 ++ cmpeq INCX, SIZE, $0 ++ mov X, X1 ++ SXADDQ LDA, 0, LDA ++ bne $0, $L10 ++ ++ sra M, 3, I ++ mov BUFFER, Y1 ++ mov BUFFER, X ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ ldi I, -1(I) ++ ++ LD a0, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a1, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a2, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a3, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ST a2, 2 * SIZE(Y1) ++ ST a3, 3 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a5, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a6, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a7, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a4, 4 * SIZE(Y1) ++ ST a5, 5 * SIZE(Y1) ++ ST a6, 6 * SIZE(Y1) ++ ST a7, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 7, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ LD a0, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ mov Y, Y1 ++ fclr t0 + unop ++ fclr t1 + -+ fstd $f4, 16($sp) -+ fclr $f18 -+ SXADDQ INCX, $31, INCX ++ sra N, 2, J ++ fclr t2 ++ fclr t3 ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ mov A, A1 ++ fclr s0 ++ addl A, LDA, A2 ++ fclr s1 ++ ++ addl A2, LDA, A3 ++ fclr s2 ++ addl A3, LDA, A4 ++ fclr s3 ++ ++ s4addl LDA, A, A + unop ++ mov X, X1 ++ fillde 3 * SIZE(Y) + -+ fstd $f5, 24($sp) -+ fclr $f19 -+ and $2, $3, $0 ++ sra M, 3, I ++ ble I, $L15 ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ LD a4, 1 * SIZE(A1) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 1 * SIZE(A3) ++ LD a7, 1 * SIZE(A4) ++ LD a8, 2 * SIZE(A1) ++ LD a9, 2 * SIZE(A2) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 2 * SIZE(A4) ++ LD a12, 3 * SIZE(A1) ++ LD a13, 3 * SIZE(A2) ++ LD a14, 3 * SIZE(A3) ++ LD a15, 3 * SIZE(A4) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 4 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 4 * SIZE(A2) ++ ++ ADD s2, t2, s2 + unop ++ MUL x0, a2, t2 ++ LD a2, 4 * SIZE(A3) + -+ fstd $f6, 32($sp) ++ ADD s3, t3, s3 + unop ++ MUL x0, a3, t3 ++ LD a3, 4 * SIZE(A4) + -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) ++ ADD s0, t0, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 5 * SIZE(A1) + -+ fclr $f0 -+ beq $0, $End # if (n <= 0) or (incx <= 0) return -+ .align 4 ++ ADD s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a5, t1 ++ LD a5, 5 * SIZE(A2) + -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ sra N, 2, $1 -+ addl INCX, INCX, INCX ++ ADD s2, t2, s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 5 * SIZE(A3) + -+ fabs $f20, $f20 -+ fabs $f21, $f21 -+ faddd $f20, $f21, $f0 -+ ble $1, $L15 -+ .align 4 ++ ADD s3, t3, s3 ++ unop ++ MUL x1, a7, t3 ++ LD a7, 5 * SIZE(A4) + -+ ldi $1, -1($1) ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, -2 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x2, a9, t1 ++ LD a9, 6 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi A2, 8 * SIZE(A2) ++ MUL x2, a10, t2 ++ LD a10, 6 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ ldi A3, 8 * SIZE(A3) ++ MUL x2, a11, t3 ++ LD a11, 6 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, -1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A4, 8 * SIZE(A4) ++ MUL x3, a13, t1 ++ LD a13, -1 * SIZE(A2) ++ ++ ADD s2, t2, s2 + unop -+ addl X, INCX, X ++ MUL x3, a14, t2 ++ LD a14, -1 * SIZE(A3) ++ ++ ADD s3, t3, s3 + unop ++ MUL x3, a15, t3 ++ LD a15, -1 * SIZE(A4) + -+ LD $f22, 0 * SIZE(X) -+ fmov $f0, $f1 -+ LD $f23, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD s0, t0, s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 0 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE - 8) * SIZE(A3) ++ MUL x0, a1, t1 ++ LD a1, 0 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x0, a2, t2 ++ LD a2, 0 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD a3, 0 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x0, 8 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x1, a5, t1 ++ LD a5, 1 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 1 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x1, a7, t3 ++ LD a7, 1 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x1, 9 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, 2 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE - 8) * SIZE(A4) ++ MUL x2, a9, t1 ++ LD a9, 2 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi X1, 8 * SIZE(X1) ++ MUL x2, a10, t2 ++ LD a10, 2 * SIZE(A3) + -+ LD $f24, 0 * SIZE(X) -+ fmov $f0, $f2 -+ LD $f25, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD s3, t3, s3 ++ ldi I, -1(I) ++ MUL x2, a11, t3 ++ LD a11, 2 * SIZE(A4) + -+ LD $f26, 0 * SIZE(X) -+ fmov $f0, $f3 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD s0, t0, s0 ++ LD x2, 2 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, 3 * SIZE(A1) + -+ fabs $f20, $f8 -+ fabs $f21, $f9 -+ fabs $f22, $f10 -+ fabs $f23, $f11 ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE - 8) * SIZE(X1) ++ MUL x3, a13, t1 ++ LD a13, 3 * SIZE(A2) + -+ fabs $f24, $f12 -+ fabs $f25, $f13 -+ fabs $f26, $f14 -+ fabs $f27, $f15 ++ ADD s2, t2, s2 ++ unop ++ MUL x3, a14, t2 ++ LD a14, 3 * SIZE(A3) + -+ ble $1, $L14 ++ ADD s3, t3, s3 ++ MUL x3, a15, t3 ++ LD a15, 3 * SIZE(A4) ++ bgt I, $L12 + .align 4 + -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ ldi $1, -1($1) -+ addl X, INCX, X ++$L13: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 4 * SIZE(A1) + -+ LD $f22, 0 * SIZE(X) -+ LD $f23, 1 * SIZE(X) ++ ADD s1, t1, s1 + unop -+ addl X, INCX, X ++ MUL x0, a1, t1 ++ LD a1, 4 * SIZE(A2) + -+ LD $f24, 0 * SIZE(X) -+ LD $f25, 1 * SIZE(X) ++ ADD s2, t2, s2 + unop -+ addl X, INCX, X -+ -+ LD $f26, 0 * SIZE(X) -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ble $1, $L13 -+ .align 4 ++ MUL x0, a2, t2 ++ LD a2, 4 * SIZE(A3) + -+$L12: -+ faddd $f8, $f9, $f16 ++ ADD s3, t3, s3 + unop -+ fabs $f20, $f8 -+ fillcs 64 * SIZE(X) ++ MUL x0, a3, t3 ++ LD a3, 4 * SIZE(A4) + -+ faddd $f10, $f11, $f17 ++ ADD s0, t0, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 5 * SIZE(A1) ++ ++ ADD s1, t1, s1 + unop -+ fabs $f21, $f9 -+ LD $f20, 0 * SIZE(X) ++ MUL x1, a5, t1 ++ LD a5, 5 * SIZE(A2) + -+ faddd $f12, $f13, $f18 -+ LD $f21, 1 * SIZE(X) -+ fabs $f22, $f10 -+ addl X, INCX, X ++ ADD s2, t2, s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 5 * SIZE(A3) + -+ faddd $f14, $f15, $f19 -+ LD $f22, 0 * SIZE(X) -+ fabs $f23, $f11 ++ ADD s3, t3, s3 + unop ++ MUL x1, a7, t3 ++ LD a7, 5 * SIZE(A4) + -+ CMPLT($f0, $f16), $f4 -+ LD $f23, 1 * SIZE(X) -+ fabs $f24, $f12 -+ addl X, INCX, X ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, 6 * SIZE(A1) + -+ CMPLT($f1, $f17), $f5 -+ LD $f24, 0 * SIZE(X) -+ fabs $f25, $f13 ++ ADD s1, t1, s1 + unop ++ MUL x2, a9, t1 ++ LD a9, 6 * SIZE(A2) + -+ CMPLT($f2, $f18), $f6 -+ LD $f25, 1 * SIZE(X) -+ fabs $f26, $f14 -+ addl X, INCX, X ++ ADD s2, t2, s2 ++ unop ++ MUL x2, a10, t2 ++ LD a10, 6 * SIZE(A3) + -+ CMPLT($f3, $f19), $f7 -+ LD $f26, 0 * SIZE(X) -+ fabs $f27, $f15 ++ ADD s3, t3, s3 + unop ++ MUL x2, a11, t3 ++ LD a11, 6 * SIZE(A4) + -+ fselne $f4, $f16, $f0, $f0 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ldi $1, -1($1) # i -- ++ ADD s0, t0, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, 7 * SIZE(A1) + -+ fselne $f5, $f17, $f1, $f1 -+ fselne $f6, $f18, $f2, $f2 -+ fselne $f7, $f19, $f3, $f3 -+ bgt $1,$L12 -+ .align 4 ++ ADD s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x3, a13, t1 ++ LD a13, 7 * SIZE(A2) + -+$L13: -+ faddd $f8, $f9, $f16 -+ fabs $f20, $f8 ++ ADD s2, t2, s2 ++ ldi A2, 8 * SIZE(A2) ++ MUL x3, a14, t2 ++ LD a14, 7 * SIZE(A3) + -+ faddd $f10, $f11, $f17 -+ fabs $f21, $f9 ++ ADD s3, t3, s3 ++ ldi A3, 8 * SIZE(A3) ++ MUL x3, a15, t3 ++ LD a15, 7 * SIZE(A4) + -+ faddd $f12, $f13, $f18 -+ fabs $f22, $f10 ++ ADD s0, t0, s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a0, t0 ++ unop + -+ faddd $f14, $f15, $f19 -+ fabs $f23, $f11 ++ ADD s1, t1, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x0, a1, t1 ++ ldi A4, 8 * SIZE(A4) + -+ CMPLT($f0, $f16), $f4 -+ fabs $f24, $f12 ++ ADD s2, t2, s2 ++ MUL x0, a2, t2 ++ ADD s3, t3, s3 ++ MUL x0, a3, t3 + -+ CMPLT($f1, $f17), $f5 -+ fabs $f25, $f13 ++ ADD s0, t0, s0 ++ MUL x1, a4, t0 ++ ADD s1, t1, s1 ++ MUL x1, a5, t1 + -+ CMPLT($f2, $f18), $f6 -+ fabs $f26, $f14 -+ CMPLT($f3, $f19), $f7 -+ fabs $f27, $f15 ++ ADD s2, t2, s2 ++ MUL x1, a6, t2 ++ ADD s3, t3, s3 ++ MUL x1, a7, t3 + -+ fselne $f4, $f16, $f0, $f0 -+ fselne $f5, $f17, $f1, $f1 -+ fselne $f6, $f18, $f2, $f2 -+ fselne $f7, $f19, $f3, $f3 -+ .align 4 ++ ADD s0, t0, s0 ++ MUL x2, a8, t0 ++ ADD s1, t1, s1 ++ MUL x2, a9, t1 + -+$L14: -+ faddd $f8, $f9, $f16 -+ faddd $f10, $f11, $f17 -+ faddd $f12, $f13, $f18 -+ faddd $f14, $f15, $f19 ++ ADD s2, t2, s2 ++ MUL x2, a10, t2 ++ ADD s3, t3, s3 ++ MUL x2, a11, t3 + -+ CMPLT($f0, $f16), $f4 -+ CMPLT($f1, $f17), $f5 -+ CMPLT($f2, $f18), $f6 -+ CMPLT($f3, $f19), $f7 ++ ADD s0, t0, s0 ++ MUL x3, a12, t0 ++ ADD s1, t1, s1 ++ MUL x3, a13, t1 + -+ fselne $f4, $f16, $f0, $f0 -+ fselne $f5, $f17, $f1, $f1 -+ fselne $f6, $f18, $f2, $f2 -+ fselne $f7, $f19, $f3, $f3 ++ ADD s2, t2, s2 ++ MUL x3, a14, t2 ++ ADD s3, t3, s3 ++ MUL x3, a15, t3 ++ .align 4 + -+ CMPLT($f0, $f1), $f16 -+ CMPLT($f2, $f3), $f17 ++$L15: ++ and M, 7, I ++ ble I, $L18 + -+ fselne $f16, $f1, $f0, $f0 -+ fselne $f17, $f3, $f2, $f2 ++ LD x0, 0 * SIZE(X1) + -+ CMPLT($f0, $f2), $f16 -+ fselne $f16, $f2, $f0, $f0 -+ .align 4 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) + -+$L15: -+ and N, 3, $1 -+ unop -+ unop -+ ble $1, $End ++ ldi I, -1(I) ++ ble I, $L17 + .align 4 + +$L16: -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ unop -+ addl X, INCX, X ++ ADD s0, t0, s0 ++ ldi A4, 1 * SIZE(A4) ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) + -+ fabs $f20, $f29 -+ fabs $f21, $f30 -+ faddd $f29, $f30, $f20 -+ fmov $f20,$f29 ++ ADD s1, t1, s1 ++ ldi A1, 1 * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 1 * SIZE(A2) + -+ CMPLT($f0, $f29), $f16 -+ fselne $f16, $f29, $f0, $f0 ++ ADD s2, t2, s2 ++ ldi A2, 1 * SIZE(A2) ++ MUL x0, a2, t2 ++ LD a2, 1 * SIZE(A3) + -+ ldi $1, -1($1) # i -- -+ bgt $1, $L16 -+ .align 4 ++ ADD s3, t3, s3 ++ ldi A3, 1 * SIZE(A3) ++ MUL x0, a3, t3 ++ LD a3, 0 * SIZE(A4) + -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) ++ LD x0, 1 * SIZE(X1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L16 ++ .align 4 + -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldi $sp, STACKSIZE($sp) -+ ret ++$L17: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ ADD s1, t1, s1 ++ MUL x0, a1, t1 + -+ EPILOGUE -diff --git a/kernel/sw_64/zamax.S.bak b/kernel/sw_64/zamax.S.bak -new file mode 100644 -index 0000000..74b9331 ---- /dev/null -+++ b/kernel/sw_64/zamax.S.bak -@@ -0,0 +1,301 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ ADD s2, t2, s2 ++ MUL x0, a2, t2 ++ ADD s3, t3, s3 ++ MUL x0, a3, t3 ++ .align 4 + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++$L18: ++ LD a0, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a1, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a2, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a3, 0 * SIZE(Y) ++ addl Y, INCY, Y + -+#define N $16 -+#define X $17 -+#define INCX $18 ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 + -+#ifndef USE_MIN -+#define CMPLT(a, b) fcmplt a, b -+#else -+#define CMPLT(a, b) fcmplt b, a -+#endif ++ MUL alpha, s0, s0 ++ MUL alpha, s1, s1 ++ MUL alpha, s2, s2 ++ MUL alpha, s3, s3 + -+#define STACKSIZE 8 * 8 ++ ADD a0, s0, a0 ++ fclr t0 ++ ADD a1, s1, a1 ++ fclr t1 ++ ADD a2, s2, a2 ++ fclr t2 ++ ADD a3, s3, a3 ++ fclr t3 + -+ PROLOGUE -+ PROFCODE -+ .frame $sp, STACKSIZE, $26, 0 ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a1, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a2, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a3, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 + -+ ldi $sp, -STACKSIZE($sp) ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 + -+ fstd $f2, 0($sp) -+ fclr $f16 -+ cmplt $31, N, $2 ++$L20: ++ and N, 2, J ++ ble J, $L30 ++ mov A, A1 ++ addl A, LDA, A2 + -+ fstd $f3, 8($sp) -+ fclr $f17 -+ cmplt $31, INCX, $3 -+ unop ++ addl A2, LDA, A ++ fclr s0 ++ mov X, X1 ++ fclr s1 + -+ fstd $f4, 16($sp) -+ fclr $f18 -+ SXADDQ INCX, $31, INCX -+ unop ++ sra M, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $L25 + -+ fstd $f5, 24($sp) -+ fclr $f19 -+ and $2, $3, $0 -+ unop ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 1 * SIZE(A1) ++ LD a3, 1 * SIZE(A2) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 2 * SIZE(A2) ++ LD a6, 3 * SIZE(A1) ++ LD a7, 3 * SIZE(A2) + -+ fstd $f6, 32($sp) -+ unop ++ LD a8, 4 * SIZE(A1) ++ LD a9, 4 * SIZE(A2) ++ LD a10, 5 * SIZE(A1) ++ LD a11, 5 * SIZE(A2) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 6 * SIZE(A2) ++ LD a14, 7 * SIZE(A1) ++ LD a15, 7 * SIZE(A2) + -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) + -+ fclr $f0 -+ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ ldi I, -1(I) ++ ble I, $L23 + .align 4 + -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ sra N, 2, $1 -+ addl INCX, INCX, INCX ++$L22: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 8 * SIZE(A1) + -+ fabs $f20, $f20 -+ fabs $f21, $f21 -+ faddd $f20, $f21, $f0 -+ ble $1, $L15 -+ .align 4 ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 8 * SIZE(A2) + -+ ldi $1, -1($1) -+ unop -+ addl X, INCX, X ++ ADD s0, t2, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a2, t2 ++ LD a2, 9 * SIZE(A1) ++ ++ ADD s1, t3, s1 + unop ++ MUL x1, a3, t3 ++ LD a3, 9 * SIZE(A2) + -+ LD $f22, 0 * SIZE(X) -+ fmov $f0, $f1 -+ LD $f23, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a4, t0 ++ LD a4, 10 * SIZE(A1) + -+ LD $f24, 0 * SIZE(X) -+ fmov $f0, $f2 -+ LD $f25, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD s1, t1, s1 ++ ldi I, -1(I) ++ MUL x2, a5, t1 ++ LD a5, 10 * SIZE(A2) ++ ++ ADD s0, t2, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a6, t2 ++ LD a6, 11 * SIZE(A1) ++ ++ ADD s1, t3, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a7, t3 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x3, -1 * SIZE(X1) ++ MUL x0, a8, t0 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x0, a9, t1 ++ LD a9, 12 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x0, 0 * SIZE(X1) ++ MUL x1, a10, t0 ++ LD a10, 13 * SIZE(A1) + -+ LD $f26, 0 * SIZE(X) -+ fmov $f0, $f3 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a11, t1 ++ LD a11, 13 * SIZE(A2) + -+ fabs $f20, $f8 -+ fabs $f21, $f9 -+ fabs $f22, $f10 -+ fabs $f23, $f11 ++ ADD s0, t0, s0 ++ LD x1, 1 * SIZE(X1) ++ MUL x2, a12, t0 ++ LD a12, 6 * SIZE(A1) + -+ fabs $f24, $f12 -+ fabs $f25, $f13 -+ fabs $f26, $f14 -+ fabs $f27, $f15 ++ ADD s1, t1, s1 ++ MUL x2, a13, t1 ++ LD a13, 14 * SIZE(A2) ++ ldi A2, 8 * SIZE(A2) + -+ ble $1, $L14 ++ ADD s0, t0, s0 ++ LD x2, 2 * SIZE(X1) ++ MUL x3, a14, t0 ++ LD a14, 7 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ MUL x3, a15, t1 ++ LD a15, 7 * SIZE(A2) ++ bgt I, $L22 + .align 4 + -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ ldi $1, -1($1) -+ addl X, INCX, X ++$L23: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ ldi A1, 8 * SIZE(A1) + -+ LD $f22, 0 * SIZE(X) -+ LD $f23, 1 * SIZE(X) ++ ADD s1, t1, s1 + unop -+ addl X, INCX, X -+ -+ LD $f24, 0 * SIZE(X) -+ LD $f25, 1 * SIZE(X) ++ MUL x0, a1, t1 + unop -+ addl X, INCX, X + -+ LD $f26, 0 * SIZE(X) -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ble $1, $L13 -+ .align 4 ++ ADD s0, t2, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a2, t2 ++ ldi A2, 8 * SIZE(A2) + -+$L12: -+ faddd $f8, $f9, $f16 ++ ADD s1, t3, s1 + unop -+ fabs $f20, $f8 -+ fillcs 64 * SIZE(X) -+ -+ faddd $f10, $f11, $f17 ++ MUL x1, a3, t3 + unop -+ fabs $f21, $f9 -+ LD $f20, 0 * SIZE(X) -+ -+ faddd $f12, $f13, $f18 -+ LD $f21, 1 * SIZE(X) -+ fabs $f22, $f10 -+ addl X, INCX, X + -+ faddd $f14, $f15, $f19 -+ LD $f22, 0 * SIZE(X) -+ fabs $f23, $f11 ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a4, t0 + unop + -+ CMPLT($f0, $f16), $f4 -+ LD $f23, 1 * SIZE(X) -+ fabs $f24, $f12 -+ addl X, INCX, X -+ -+ CMPLT($f1, $f17), $f5 -+ LD $f24, 0 * SIZE(X) -+ fabs $f25, $f13 ++ ADD s1, t1, s1 + unop -+ -+ CMPLT($f2, $f18), $f6 -+ LD $f25, 1 * SIZE(X) -+ fabs $f26, $f14 -+ addl X, INCX, X -+ -+ CMPLT($f3, $f19), $f7 -+ LD $f26, 0 * SIZE(X) -+ fabs $f27, $f15 ++ MUL x2, a5, t1 + unop + -+fselne $f4,$f16,$f0, $f0 -+ LD $f27, 1 * SIZE(X) -+ addl X, INCX, X -+ ldi $1, -1($1) # i -- ++ ADD s0, t2, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a6, t2 ++ unop + -+fselne $f5,$f17,$f1, $f1 -+fselne $f6,$f18,$f2, $f2 -+fselne $f7,$f19,$f3, $f3 -+ bgt $1,$L12 -+ .align 4 ++ ADD s1, t3, s1 ++ unop ++ MUL x3, a7, t3 ++ unop + -+$L13: -+ faddd $f8, $f9, $f16 -+ fabs $f20, $f8 ++ ADD s0, t0, s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a8, t0 ++ ldi X1, 8 * SIZE(X1) + -+ faddd $f10, $f11, $f17 -+ fabs $f21, $f9 ++ ADD s1, t1, s1 ++ unop ++ MUL x0, a9, t1 ++ unop + -+ faddd $f12, $f13, $f18 -+ fabs $f22, $f10 ++ ADD s0, t0, s0 ++ MUL x1, a10, t0 ++ ADD s1, t1, s1 ++ MUL x1, a11, t1 + -+ faddd $f14, $f15, $f19 -+ fabs $f23, $f11 ++ ADD s0, t0, s0 ++ MUL x2, a12, t0 ++ ADD s1, t1, s1 ++ MUL x2, a13, t1 + -+ CMPLT($f0, $f16), $f4 -+ fabs $f24, $f12 ++ ADD s0, t0, s0 ++ MUL x3, a14, t0 ++ ADD s1, t1, s1 ++ MUL x3, a15, t1 ++ .align 4 + -+ CMPLT($f1, $f17), $f5 -+ fabs $f25, $f13 ++$L25: ++ and M, 7, I ++ ble I, $L28 + -+ CMPLT($f2, $f18), $f6 -+ fabs $f26, $f14 -+ CMPLT($f3, $f19), $f7 -+ fabs $f27, $f15 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD x0, 0 * SIZE(X1) + -+fselne $f4,$f16,$f0, $f0 -+fselne $f5,$f17,$f1, $f1 -+fselne $f6,$f18,$f2, $f2 -+fselne $f7,$f19,$f3, $f3 ++ ldi I, -1(I) ++ ble I, $L27 + .align 4 -+ -+$L14: -+ faddd $f8, $f9, $f16 -+ faddd $f10, $f11, $f17 -+ faddd $f12, $f13, $f18 -+ faddd $f14, $f15, $f19 -+ -+ CMPLT($f0, $f16), $f4 -+ CMPLT($f1, $f17), $f5 -+ CMPLT($f2, $f18), $f6 -+ CMPLT($f3, $f19), $f7 + -+fselne $f4,$f16,$f0, $f0 -+fselne $f5,$f17,$f1, $f1 -+fselne $f6,$f18,$f2, $f2 -+fselne $f7,$f19,$f3, $f3 -+ -+ CMPLT($f0, $f1), $f16 -+ CMPLT($f2, $f3), $f17 ++$L26: ++ ADD s0, t0, s0 ++ ldi A2, 1 * SIZE(A2) ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) + -+fselne $f16,$f1,$f0, $f0 -+fselne $f17,$f3,$f2, $f2 ++ ADD s1, t1, s1 ++ ldi A1, 1 * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 0 * SIZE(A2) + -+ CMPLT($f0, $f2), $f16 -+fselne $f16,$f2,$f0, $f0 ++ LD x0, 1 * SIZE(X1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L26 + .align 4 + -+$L15: -+ and N, 3, $1 -+ unop -+ unop -+ ble $1, $End ++$L27: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ ADD s1, t1, s1 ++ MUL x0, a1, t1 + .align 4 + -+$L16: -+ LD $f20, 0 * SIZE(X) -+ LD $f21, 1 * SIZE(X) -+ unop -+ addl X, INCX, X ++$L28: ++ LD a0, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a1, 0 * SIZE(Y) ++ addl Y, INCY, Y + -+ fabs $f20, $f29 -+ fabs $f21, $f30 -+ faddd $f29, $f30, $f29 ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 + -+ CMPLT($f0, $f29), $f16 -+fselne $f16,$f29,$f0, $f0 ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 + -+ ldi $1, -1($1) # i -- -+ bgt $1, $L16 -+ .align 4 ++ MUL alpha, s0, s0 ++ MUL alpha, s1, s1 + -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) ++ ADD a0, s0, a0 ++ ADD a1, s1, a1 + -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldi $sp, STACKSIZE($sp) -+ ret ++ ST a0, 0 * SIZE(Y1) ++ fclr t0 ++ addl Y1, INCY, Y1 ++ fclr t1 + -+ EPILOGUE -diff --git a/kernel/sw_64/zasum.S b/kernel/sw_64/zasum.S -new file mode 100644 -index 0000000..72e120c ---- /dev/null -+++ b/kernel/sw_64/zasum.S -@@ -0,0 +1,231 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ ST a1, 0 * SIZE(Y1) ++ fclr t2 ++ addl Y1, INCY, Y1 ++ fclr t3 ++ .align 4 + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++$L30: ++ blbc N, $L999 + -+#define PREFETCHSIZE 88 ++ mov A, A1 ++ fclr s0 ++ mov X, X1 ++ fclr s1 + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define I $19 ++ sra M, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $L35 + -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a8, 0 * SIZE(X1) ++ LD a9, 1 * SIZE(X1) + -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f19 ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ LD a10, 2 * SIZE(X1) ++ LD a11, 3 * SIZE(X1) + -+#define t0 $f20 -+#define t1 $f21 -+#define t2 $f22 -+#define t3 $f23 -+#define t4 $f24 -+#define s4 $f27 -+ PROLOGUE -+ PROFCODE ++ LD a4, 4 * SIZE(A1) ++ LD a5, 5 * SIZE(A1) ++ LD a12, 4 * SIZE(X1) ++ LD a13, 5 * SIZE(X1) + -+ fclr s0 -+ unop -+ fclr t0 -+ addl INCX, INCX, INCX ++ LD a6, 6 * SIZE(A1) ++ LD a7, 7 * SIZE(A1) ++ LD a14, 6 * SIZE(X1) + -+ fclr s1 -+ unop -+ fclr t1 -+ ble N, $L999 ++ ldi I, -1(I) ++ ble I, $L33 ++ .align 4 + -+ fclr s2 -+ sra N, 2, I -+ fclr s3 -+ ble I, $L15 ++$L32: ++ ADD s0, t0, s0 ++ LD a15, 7 * SIZE(X1) ++ MUL a0, a8, t0 ++ LD a0, 8 * SIZE(A1) + -+ LD a0, 0 * SIZE(X) -+ fclr t2 -+ LD a1, 1 * SIZE(X) -+ SXADDQ INCX, X, X ++ ADD s1, t1, s1 ++ LD a8, 8 * SIZE(X1) ++ MUL a1, a9, t1 ++ LD a1, 9 * SIZE(A1) + -+ LD a2, 0 * SIZE(X) -+ fclr t3 -+ LD a3, 1 * SIZE(X) -+ SXADDQ INCX, X, X ++ ADD s2, t2, s2 ++ LD a9, 9 * SIZE(X1) ++ MUL a2, a10, t2 ++ LD a2, 10 * SIZE(A1) + -+ LD a4, 0 * SIZE(X) -+ LD a5, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ ldi I, -1(I) ++ ADD s3, t3, s3 ++ LD a10, 10 * SIZE(X1) ++ MUL a3, a11, t3 ++ LD a3, 11 * SIZE(A1) + -+ ble I, $L13 -+ .align 4 ++ ADD s0, t0, s0 ++ LD a11, 11 * SIZE(X1) ++ MUL a4, a12, t0 ++ LD a4, 12 * SIZE(A1) + -+$L12: -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ fillcs PREFETCHSIZE * SIZE(X) -+ fabs a0, t0 -+ ldi I, -1(I) ++ ADD s1, t1, s1 ++ LD a12, 12 * SIZE(X1) ++ MUL a5, a13, t1 ++ LD a5, 13 * SIZE(A1) + -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ LD a6, 0 * SIZE(X) -+ fabs a1, t1 -+ unop ++ ADD s2, t2, s2 ++ LD a13, 13 * SIZE(X1) ++ MUL a6, a14, t2 ++ LD a6, 14 * SIZE(A1) + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ LD a7, 1 * SIZE(X) -+ fabs a2, t2 -+ SXADDQ INCX, X, X ++ ADD s3, t3, s3 ++ LD a14, 14 * SIZE(X1) ++ MUL a7, a15, t3 ++ LD a7, 15 * SIZE(A1) + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a0, 0 * SIZE(X) -+ fabs a3, t3 -+ unop ++ ldi A1, 8 * SIZE(A1) ++ ldi I, -1(I) ++ ldi X1, 8 * SIZE(X1) ++ bgt I, $L32 ++ .align 4 + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD a1, 1 * SIZE(X) -+ fabs a4, t0 -+ SXADDQ INCX, X, X ++$L33: ++ ADD s0, t0, s0 ++ LD a15, 7 * SIZE(X1) ++ MUL a0, a8, t0 ++ ldi A1, 8 * SIZE(A1) + -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ LD a2, 0 * SIZE(X) -+ fabs a5, t1 ++ ADD s1, t1, s1 + unop ++ MUL a1, a9, t1 ++ ldi X1, 8 * SIZE(X1) + -+ fadds s2, t2, s4 -+ fmov s4,s2 -+ LD a3, 1 * SIZE(X) -+ fabs a6, t2 -+ SXADDQ INCX, X, X ++ ADD s2, t2, s2 ++ MUL a2, a10, t2 ++ ADD s3, t3, s3 ++ MUL a3, a11, t3 + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a4, 0 * SIZE(X) -+ fabs a7, t3 -+ unop ++ ADD s0, t0, s0 ++ MUL a4, a12, t0 ++ ADD s1, t1, s1 ++ MUL a5, a13, t1 + -+ LD a5, 1 * SIZE(X) -+ unop -+ SXADDQ INCX, X, X -+ bne I, $L12 ++ ADD s2, t2, s2 ++ MUL a6, a14, t2 ++ ADD s3, t3, s3 ++ MUL a7, a15, t3 + .align 4 + -+$L13: -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD a6, 0 * SIZE(X) -+ fabs a0, t0 -+ -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ LD a7, 1 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X ++$L35: ++ and M, 7, I ++ ble I, $L38 + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ fabs a2, t2 -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ fabs a3, t3 ++ LD a0, 0 * SIZE(A1) ++ LD x0, 0 * SIZE(X1) + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ fabs a4, t0 -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ fabs a5, t1 -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ fabs a6, t2 -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ fabs a7, t3 ++ ldi I, -1(I) ++ ble I, $L37 ++ .align 4 + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ ADD s3, t3, s4 -+ fmov s4,s3 ++$L36: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ LD x0, 1 * SIZE(X1) + ++ ldi A1, 1 * SIZE(A1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L36 + .align 4 + -+$L15: -+ ADD s0, s2, $f25 -+ fmov $f25, s0 -+ and N, 3, I -+ ADD s1, s3, $f25 -+ fmov $f25, s1 -+ ble I, $L999 ++$L37: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 + .align 4 + -+$L17: -+ ADD s0, t0, $f25 -+ fmov $f25, s0 -+ LD a0, 0 * SIZE(X) -+ fabs a0, t0 -+ ldi I, -1(I) ++$L38: ++ LD a0, 0 * SIZE(Y) + -+ ADD s1, t1, $f25 -+ fmov $f25, s1 -+ LD a1, 1 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 + -+ bne I, $L17 ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 ++ ADD s0, s1, s0 ++ ++ MUL alpha, s0, s0 ++ ADD a0, s0, a0 ++ ++ ST a0, 0 * SIZE(Y1) + .align 4 + +$L999: -+ ADD s0, t0, $f25 -+ ADD s1, t1, $f26 ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) + -+ ADD $f25, $f26, s0 ++ ldi $sp, STACKSIZE($sp) + ret + EPILOGUE -diff --git a/kernel/sw_64/zasum.S.bak b/kernel/sw_64/zasum.S.bak +diff --git a/kernel/sw_64/iamax.S b/kernel/sw_64/iamax.S new file mode 100644 -index 0000000..db79771 +index 000000000..662dc8292 --- /dev/null -+++ b/kernel/sw_64/zasum.S.bak -@@ -0,0 +1,208 @@ ++++ b/kernel/sw_64/iamax.S +@@ -0,0 +1,440 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -63738,571 +8999,412 @@ index 0000000..db79771 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 -+#define I $19 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 ++#define XX $19 + -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f19 ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif + -+#define t0 $f20 -+#define t1 $f21 -+#define t2 $f22 -+#define t3 $f23 ++#define STACKSIZE 6 * 8 + + PROLOGUE + PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 + -+ fclr s0 -+ unop -+ fclr t0 -+ addl INCX, INCX, INCX ++#ifdef F_INTERFACE ++ ldl N, 0(N) # n ++ ldl INCX, 0(INCX) # incx ++#endif ++ ldi $sp, -STACKSIZE($sp) ++ mov X, XX ++ .align 4 + -+ fclr s1 ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 + unop -+ fclr t1 -+ ble N, $L999 + -+ fclr s2 -+ sra N, 2, I -+ fclr s3 -+ ble I, $L15 ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop + -+ LD a0, 0 * SIZE(X) -+ fclr t2 -+ LD a1, 1 * SIZE(X) -+ SXADDQ INCX, X, X ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop + -+ LD a2, 0 * SIZE(X) -+ fclr t3 -+ LD a3, 1 * SIZE(X) -+ SXADDQ INCX, X, X ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $2 ++ clr $0 + -+ LD a4, 0 * SIZE(X) -+ LD a5, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ ldi I, -1(I) ++ fstd $f6, 32($sp) ++ fclr $f0 ++ sra N, 3, $1 ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 + -+ ble I, $L13 ++ LD $f20, 0 * SIZE(X) ++ unop ++ fabs $f20, $f0 ++ ble $1, $L15 + .align 4 + -+$L12: -+ ADD s0, t0, s0 -+ fillcs PREFETCHSIZE * SIZE(X) -+ fabs a0, t0 -+ ldi I, -1(I) ++ fabs $f20, $f1 ++ unop ++ addl X, INCX, X ++ unop + -+ ADD s1, t1, s1 -+ LD a6, 0 * SIZE(X) -+ fabs a1, t1 ++ LD $f21, 0 * SIZE(X) ++ fabs $f20, $f2 ++ addl X, INCX, X + unop + -+ ADD s2, t2, s2 -+ LD a7, 1 * SIZE(X) -+ fabs a2, t2 -+ SXADDQ INCX, X, X ++ LD $f22, 0 * SIZE(X) ++ fabs $f20, $f3 ++ addl X, INCX, X ++ unop + -+ ADD s3, t3, s3 -+ LD a0, 0 * SIZE(X) -+ fabs a3, t3 ++ LD $f23, 0 * SIZE(X) ++ fabs $f20, $f4 ++ addl X, INCX, X + unop + -+ ADD s0, t0, s0 -+ LD a1, 1 * SIZE(X) -+ fabs a4, t0 -+ SXADDQ INCX, X, X ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ fabs $f20, $f5 ++ unop + -+ ADD s1, t1, s1 -+ LD a2, 0 * SIZE(X) -+ fabs a5, t1 ++ LD $f25, 0 * SIZE(X) ++ fabs $f20, $f6 ++ addl X, INCX, X + unop + -+ ADD s2, t2, s2 -+ LD a3, 1 * SIZE(X) -+ fabs a6, t2 -+ SXADDQ INCX, X, X ++ LD $f26, 0 * SIZE(X) ++ fabs $f20, $f28 ++ addl X, INCX, X ++ ldi $1, -1($1) + -+ ADD s3, t3, s3 -+ LD a4, 0 * SIZE(X) -+ fabs a7, t3 ++ LD $f27, 0 * SIZE(X) + unop ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 + -+ LD a5, 1 * SIZE(X) ++$L12: ++ fselne $f16, $f12, $f4, $f4 + unop -+ SXADDQ INCX, X, X -+ bne I, $L12 -+ .align 4 ++ fabs $f20, $f29 ++ s_fillcs 56 * SIZE(X) + -+$L13: -+ ADD s0, t0, s0 -+ LD a6, 0 * SIZE(X) -+ fabs a0, t0 ++ fselne $f17, $f13, $f5, $f5 ++ LD $f20, 0 * SIZE(X) ++ fabs $f21, $f30 ++ addl X, INCX, X + -+ ADD s1, t1, s1 -+ LD a7, 1 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X ++ fselne $f18, $f14, $f6, $f6 ++ LD $f21, 0 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X + -+ ADD s2, t2, s2 -+ fabs a2, t2 -+ ADD s3, t3, s3 -+ fabs a3, t3 ++ fselne $f19, $f15, $f28, $f28 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ addl X, INCX, X + -+ ADD s0, t0, s0 -+ fabs a4, t0 -+ ADD s1, t1, s1 -+ fabs a5, t1 -+ ADD s2, t2, s2 -+ fabs a6, t2 -+ ADD s3, t3, s3 -+ fabs a7, t3 ++ fabs $f24, $f12 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f0, $f29), $f16 ++ addl X, INCX, X + -+ ADD s2, t2, s2 -+ ADD s3, t3, s3 ++ fabs $f25, $f13 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f1, $f30), $f17 ++ addl X, INCX, X + -+ .align 4 ++ fabs $f26, $f14 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f2, $f10), $f18 ++ addl X, INCX, X + -+$L15: -+ ADD s0, s2, s0 -+ and N, 3, I -+ ADD s1, s3, s1 -+ ble I, $L999 -+ .align 4 ++ fabs $f27, $f15 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f3, $f11), $f19 ++ addl X, INCX, X + -+$L17: -+ ADD s0, t0, s0 -+ LD a0, 0 * SIZE(X) -+ fabs a0, t0 -+ ldi I, -1(I) ++ fselne $f16, $f29, $f0, $f0 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f4, $f12), $f16 ++ addl X, INCX, X + -+ ADD s1, t1, s1 -+ LD a1, 1 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X ++ fselne $f17, $f30, $f1, $f1 ++ unop ++ CMPLT($f5, $f13), $f17 ++ ldi $1, -1($1) # i -- + -+ bne I, $L17 ++ fselne $f18, $f10, $f2, $f2 ++ unop ++ CMPLT($f6, $f14), $f18 ++ unop ++ ++ fselne $f19, $f11, $f3, $f3 ++ unop ++ CMPLT($f28, $f15), $f19 ++ bgt $1,$L12 + .align 4 + -+$L999: -+ ADD s0, t0, s0 -+ ADD s1, t1, s1 ++$L13: ++ fselne $f16, $f12, $f4, $f4 ++ fabs $f20, $f29 ++ fselne $f17, $f13, $f5, $f5 ++ fabs $f21, $f30 + -+ ADD s0, s1, s0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/zasum_simd.S b/kernel/sw_64/zasum_simd.S -new file mode 100644 -index 0000000..5606fdf ---- /dev/null -+++ b/kernel/sw_64/zasum_simd.S -@@ -0,0 +1,385 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ fselne $f18, $f14, $f6, $f6 ++ fabs $f22, $f10 ++ fselne $f19, $f15, $f28, $f28 ++ fabs $f23, $f11 + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++ fabs $f24, $f12 ++ CMPLT($f0, $f29), $f16 ++ fabs $f25, $f13 ++ CMPLT($f1, $f30), $f17 + -+#define PREFETCHSIZE 96 ++ fabs $f26, $f14 ++ CMPLT($f2, $f10), $f18 ++ fabs $f27, $f15 ++ CMPLT($f3, $f11), $f19 + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define I $19 ++ fselne $f16, $f29, $f0, $f0 ++ CMPLT($f4, $f12), $f16 ++ fselne $f17, $f30, $f1, $f1 ++ CMPLT($f5, $f13), $f17 + -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 ++ fselne $f18, $f10, $f2, $f2 ++ CMPLT($f6, $f14), $f18 ++ fselne $f19, $f11, $f3, $f3 ++ CMPLT($f28, $f15), $f19 + -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f19 ++ fselne $f16, $f12, $f4, $f4 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f13, $f5, $f5 ++ CMPLT($f2, $f3), $f17 + -+#define t0 $f20 -+#define t1 $f21 -+#define t2 $f22 -+#define t3 $f23 ++ fselne $f18, $f14, $f6, $f6 ++ CMPLT($f4, $f5), $f18 ++ fselne $f19, $f15, $f28, $f28 ++ CMPLT($f6, $f28), $f19 + -+#define t4 $f24 -+#define t5 $f25 -+#define t6 $f26 -+#define t7 $f27 ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ fselne $f18, $f5, $f4, $f4 ++ fselne $f19, $f28, $f6, $f6 + -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 16, $26, 0 -+ -+ fclr s0 -+ unop -+ fclr t0 -+ addl INCX, INCX, INCX ++ CMPLT($f0, $f2), $f16 ++ CMPLT($f4, $f6), $f17 + -+ fclr s1 -+ unop -+ fclr t1 -+ ble N, $L999 ++ fselne $f16, $f2, $f0, $f0 ++ fselne $f17, $f6, $f4, $f4 + -+ cmpeq INCX, 2, $3 -+ beq $3, $Sub ++ CMPLT($f0, $f4), $f16 ++ fselne $f16, $f4, $f0, $f0 + .align 4 + -+ and X, (VEC_LEN*SIZE-1), $6 -+ bgt $6, $UnAlign_X_ACCESS ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $L20 + .align 4 -+$Align_Access: + -+/* -+ Unloop 8*2= 16 reals -+*/ -+ sra N, 3, I -+ fclr s2 -+ fclr s3 -+ ble I, $Remain -+ -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t0 -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t1 -+ -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t2 -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t3 -+ -+ subl I, 1, I -+ addl X, 16*SIZE, X -+ unop -+ ble I, $MainLoopEnd ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X + -+$MainLoop: -+ vcpys $f31, a0, a4 -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ vcpys $f31, a1, a5 -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ -+ vcpys $f31, a2, a6 -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ vcpys $f31, a3, a7 -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ -+ VADD t0, a4, t0 -+ subl I, 1, I -+ VADD t1, a5, t1 -+ fillcs PREFETCHSIZE * SIZE(X) -+ -+ VADD t2, a6, t2 -+ addl X, 16*SIZE, X -+ VADD t3, a7, t3 -+ bgt I, $MainLoop ++ fabs $f20, $f29 ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 + -+$MainLoopEnd: -+ /*fabs*/ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 + -+ vcpys $f31, a0, a4 -+ vcpys $f31, a1, a5 -+ vcpys $f31, a2, a6 -+ vcpys $f31, a3, a7 ++$L20: ++ sra N, 3, $1 ++ ble $1, $L40 ++ .align 4 + -+ VADD t0, a4, t0 -+ VADD t1, a5, t1 -+ VADD t2, a6, t2 -+ VADD t3, a7, t3 ++ LD $f10, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f11, 0 * SIZE(XX) ++ addl XX, INCX, XX + -+ VADD t0, t1, t0 -+ VADD t2, t3, t2 -+ VADD t0, t2, t0 -+ nop -+ -+ vextf t0, 0, s0 -+ vextf t0, 1, s1 -+ vextf t0, 2, s2 -+ vextf t0, 3, s3 -+ -+$Remain: -+ and N, 7, I -+ ADD s0, s2, s0 -+ ADD s1, s3, s1 -+ ble I, $End -+ .align 4 ++ LD $f12, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f13, 0 * SIZE(XX) ++ addl XX, INCX, XX + -+$RemainLoop: -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ fabs a0, t0 -+ addl X, 2*SIZE, X ++ LD $f14, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f15, 0 * SIZE(XX) ++ addl XX, INCX, XX + -+ fabs a1, t1 -+ ldi I, -1(I) -+ ADD s0, t0, s0 -+ ADD s1, t1, s1 -+ -+ bne I, $RemainLoop -+ .align 4 -+$End: -+ ADD s0, s1, s0 -+ ret -+ .align 4 -+ -+$UnAlign_X_ACCESS: -+ sra N, 3, I -+ fclr s2 -+ fclr s3 -+ ble I, $Remain -+ -+ VLD_UL a0, 0*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t0 -+ VLD_UH t4, 1*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t1 -+ -+ VLD_UL a1, 1*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t2 -+ VLD_UH t5, 2*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t3 -+ -+ VLD_UL a2, 2*VEC_LEN*SIZE(X) -+ VLD_UH t6, 3*VEC_LEN*SIZE(X) -+ VLD_UL a3, 3*VEC_LEN*SIZE(X) -+ VLD_UH t7, 4*VEC_LEN*SIZE(X) -+ -+ vbisw a0, t4, a0 -+ subl I, 1, I -+ vbisw a1, t5, a1 -+ addl X, 16*SIZE, X -+ -+ vbisw a2, t6, a2 -+ unop -+ vbisw a3, t7, a3 -+ ble I, $MainLoopEnd -+ -+$UnAlign_X_ACCESS_MainLoop: -+/*fabs*/ -+ vcpys $f31, a0, a4 -+ VLD_UL a0, 0*VEC_LEN*SIZE(X) -+ vcpys $f31, a1, a5 -+ VLD_UH t4, 1*VEC_LEN*SIZE(X) -+ -+ vcpys $f31, a2, a6 -+ VLD_UL a1, 1*VEC_LEN*SIZE(X) -+ vcpys $f31, a3, a7 -+ VLD_UH t5, 2*VEC_LEN*SIZE(X) -+ -+ VADD t0, a4, t0 -+ VLD_UL a2, 2*VEC_LEN*SIZE(X) -+ VADD t1, a5, t1 -+ VLD_UH t6, 3*VEC_LEN*SIZE(X) -+ -+ VADD t2, a6, t2 -+ VLD_UL a3, 3*VEC_LEN*SIZE(X) -+ VADD t3, a7, t3 -+ VLD_UH t7, 4*VEC_LEN*SIZE(X) -+ -+ -+ vbisw a0, t4, a0 -+ subl I, 1, I -+ vbisw a1, t5, a1 -+ fillcs PREFETCHSIZE * SIZE(X) ++ LD $f16, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f17, 0 * SIZE(XX) ++ addl XX, INCX, XX + -+ vbisw a2, t6, a2 -+ addl X, 16*SIZE, X -+ vbisw a3, t7, a3 -+ bgt I, $UnAlign_X_ACCESS_MainLoop ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 + -+ jmp $MainLoopEnd ++ ldi $1, -1($1) ++ ble $1, $L23 + .align 4 -+ -+ -+$Sub: -+ fclr s2 -+ sra N, 2, I -+ fclr s3 -+ ble I, $L15 + -+ LD a0, 0 * SIZE(X) -+ fclr t2 -+ LD a1, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ LD a2, 0 * SIZE(X) -+ fclr t3 -+ LD a3, 1 * SIZE(X) -+ SXADDQ INCX, X, X ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f18, $f2 + -+ LD a4, 0 * SIZE(X) -+ LD a5, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ ldi I, -1(I) ++ LD $f11, 0 * SIZE(XX) ++ fabs $f15, $f23 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f19, $f3 + -+ ble I, $L13 -+ .align 4 ++ LD $f12, 0 * SIZE(XX) ++ fabs $f16, $f24 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f20, $f4 + -+$L12: -+ ADD s0, t0, s0 -+ fillcs PREFETCHSIZE * SIZE(X) -+ fabs a0, t0 -+ ldi I, -1(I) ++ LD $f13, 0 * SIZE(XX) ++ fabs $f17, $f25 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f21, $f5 + -+ ADD s1, t1, s1 -+ LD a6, 0 * SIZE(X) -+ fabs a1, t1 -+ unop ++ LD $f14, 0 * SIZE(XX) ++ ldi $1, -1($1) # i -- ++ fcmpeq $f0, $f22, $f26 ++ addl XX, INCX, XX + -+ ADD s2, t2, s2 -+ LD a7, 1 * SIZE(X) -+ fabs a2, t2 -+ SXADDQ INCX, X, X ++ ldi $0, 1($0) ++ fbne $f2, $End + -+ ADD s3, t3, s3 -+ LD a0, 0 * SIZE(X) -+ fabs a3, t3 -+ unop ++ LD $f15, 0 * SIZE(XX) ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ fbne $f3, $End + -+ ADD s0, t0, s0 -+ LD a1, 1 * SIZE(X) -+ fabs a4, t0 -+ SXADDQ INCX, X, X ++ addl XX, INCX, XX ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ fbne $f4, $End + -+ ADD s1, t1, s1 -+ LD a2, 0 * SIZE(X) -+ fabs a5, t1 -+ unop ++ LD $f16, 0 * SIZE(XX) ++ fcmpeq $f0, $f25, $f29 ++ ldi $0, 1($0) ++ fbne $f5, $End + -+ ADD s2, t2, s2 -+ LD a3, 1 * SIZE(X) -+ fabs a6, t2 -+ SXADDQ INCX, X, X ++ addl XX, INCX, XX ++ ldi $0, 1($0) ++ fabs $f10, $f18 ++ fbne $f26, $End + -+ ADD s3, t3, s3 -+ LD a4, 0 * SIZE(X) -+ fabs a7, t3 -+ unop ++ LD $f17, 0 * SIZE(XX) ++ ldi $0, 1($0) ++ fabs $f11, $f19 ++ fbne $f27, $End + -+ LD a5, 1 * SIZE(X) -+ unop -+ SXADDQ INCX, X, X -+ bne I, $L12 ++ addl XX, INCX, XX ++ ldi $0, 1($0) ++ fabs $f12, $f20 ++ fbne $f28, $End ++ ++ ldi $0, 1($0) ++ fabs $f13, $f21 ++ fbne $f29, $End ++ bgt $1, $L22 + .align 4 + -+$L13: -+ ADD s0, t0, s0 -+ LD a6, 0 * SIZE(X) -+ fabs a0, t0 ++$L23: ++ fabs $f14, $f22 ++ fcmpeq $f0, $f18, $f2 ++ fabs $f15, $f23 ++ fcmpeq $f0, $f19, $f3 + -+ ADD s1, t1, s1 -+ LD a7, 1 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X ++ fabs $f16, $f24 ++ fcmpeq $f0, $f20, $f4 ++ fabs $f17, $f25 ++ fcmpeq $f0, $f21, $f5 + -+ ADD s2, t2, s2 -+ fabs a2, t2 -+ ADD s3, t3, s3 -+ fabs a3, t3 ++ fcmpeq $f0, $f22, $f26 ++ ldi $0, 1($0) ++ unop ++ fbne $f2, $End + -+ ADD s0, t0, s0 -+ fabs a4, t0 -+ ADD s1, t1, s1 -+ fabs a5, t1 -+ ADD s2, t2, s2 -+ fabs a6, t2 -+ ADD s3, t3, s3 -+ fabs a7, t3 ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ unop ++ fbne $f3, $End + -+ ADD s2, t2, s2 -+ ADD s3, t3, s3 ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ unop ++ fbne $f4, $End + -+ .align 4 ++ fcmpeq $f0, $f25, $f29 ++ ldi $0, 1($0) ++ unop ++ fbne $f5, $End + -+$L15: -+ ADD s0, s2, s0 -+ and N, 3, I -+ ADD s1, s3, s1 -+ ble I, $L999 ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End + .align 4 + -+$L17: -+ ADD s0, t0, s0 -+ LD a0, 0 * SIZE(X) -+ fabs a0, t0 -+ ldi I, -1(I) ++$L40: ++ LD $f20, 0 * SIZE(XX) ++ addl XX, INCX, XX + -+ ADD s1, t1, s1 -+ LD a1, 1 * SIZE(X) -+ fabs a1, t1 -+ SXADDQ INCX, X, X ++ fabs $f20, $f25 ++ fcmpeq $f0, $f25, $f29 + -+ bne I, $L17 ++ ldi $0, 1($0) ++ fbne $f29, $End ++ br $31, $L40 + .align 4 + -+$L999: -+ ADD s0, t0, s0 -+ ADD s1, t1, s1 ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) + -+ ADD s0, s1, s0 ++ fldd $f6, 32($sp) ++ ldi $sp, STACKSIZE($sp) + ret ++ + EPILOGUE -diff --git a/kernel/sw_64/zaxpy.S b/kernel/sw_64/zaxpy.S +diff --git a/kernel/sw_64/imax.S b/kernel/sw_64/imax.S new file mode 100644 -index 0000000..19b6398 +index 000000000..025a10911 --- /dev/null -+++ b/kernel/sw_64/zaxpy.S -@@ -0,0 +1,654 @@ ++++ b/kernel/sw_64/imax.S +@@ -0,0 +1,351 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -64343,626 +9445,756 @@ index 0000000..19b6398 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#define PREFETCHSIZE 40 + -+#ifndef CONJ -+#define ADD1 SUB -+#define ADD2 ADD ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b +#else -+#define ADD1 ADD -+#define ADD2 SUB ++#define CMPLT(a, b) fcmplt b, a +#endif + -+#define tmp $f9 ++#define STACKSIZE 8 * 8 ++ + PROLOGUE + PROFCODE -+ .frame $sp, 16, $26, 0 + -+ ldw $19, 0($sp) -+ fmov $f19, $f29 -+ ldl $20, 8($sp) -+ fmov $f20, $f30 ++ clr $0 ++ mov X, XX ++ .align 4 + -+ mov $21, $18 -+ ldw $21, 16($sp) -+ ldi $sp, -64($sp) -+ nop ++ cmplt $31, N, $2 ++ cmplt $31, INCX, $3 ++ SXADDQ INCX, $31, INCX ++ and $2, $3, $2 + -+ fstd $f2, 0($sp) -+ cmpeq $19, 1, $1 -+ fstd $f3, 8($sp) -+ cmpeq $21, 1, $2 ++ sra N, 3, $1 ++ fclr $f0 ++ unop ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 + -+ fstd $f4, 16($sp) -+ and $16, 3, $5 -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) ++ LD $f0, 0 * SIZE(X) ++ unop ++ unop ++ ble $1, $L15 ++ .align 4 + -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd tmp, 56($sp) -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif ++ fmov $f0, $f1 ++ addl X, INCX, X ++ fmov $f0, $f10 ++ ldi $1, -1($1) + -+ and $1, $2, $1 -+ ble $16, $End -+ sra $16, 2, $4 -+ beq $1, $Sub ++ LD $f21, 0 * SIZE(X) ++ fmov $f0, $f11 ++ addl X, INCX, X ++ fmov $f0, $f12 + -+ ble $4, $Remain -+ subl $4, 1, $4 ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f13 ++ addl X, INCX, X ++ fmov $f0, $f14 + -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) -+ LD $f2, 2*SIZE($18) -+ LD $f3, 3*SIZE($18) -+ LD $f4, 4*SIZE($18) -+ LD $f5, 5*SIZE($18) -+ LD $f6, 6*SIZE($18) -+ LD $f7, 7*SIZE($18) ++ LD $f23, 0 * SIZE(X) ++ fmov $f0, $f15 ++ addl X, INCX, X ++ fmov $f0, $f20 + -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ LD $f10, 2*SIZE($20) -+ LD $f11, 3*SIZE($20) -+ LD $f12, 4*SIZE($20) -+ LD $f13, 5*SIZE($20) -+ LD $f14, 6*SIZE($20) -+ LD $f15, 7*SIZE($20) ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f25, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f26, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f27, 0 * SIZE(X) ++ addl X, INCX, X + -+ addl $18, 8*SIZE, $18 -+ ble $4, $MainLoopEnd ++ CMPLT($f0, $f20), $f16 ++ CMPLT($f1, $f21), $f17 ++ CMPLT($f10, $f22), $f18 ++ CMPLT($f11, $f23), $f19 ++ ++ ble $1, $L13 + .align 4 + -+$MainLoop: -+ fillcs PREFETCHSIZE * SIZE($20) -+ fillcs PREFETCHSIZE * SIZE($18) ++$L12: ++ fselne $f16, $f20, $f0, $f0 ++ LD $f20, 0 * SIZE(X) ++ CMPLT($f12, $f24), $f16 ++ addl X, INCX, X + -+ MUL $f29, $f0, $f20 -+ fillcs 9*SIZE($18) -+ MUL $f30, $f1, $f21 ++ fselne $f17, $f21, $f1, $f1 ++ LD $f21, 0 * SIZE(X) ++ CMPLT($f13, $f25), $f17 ++ addl X, INCX, X ++ ++ fselne $f18, $f22, $f10, $f10 ++ LD $f22, 0 * SIZE(X) ++ CMPLT($f14, $f26), $f18 ++ addl X, INCX, X ++ ++ fselne $f19, $f23, $f11, $f11 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f15, $f27), $f19 ++ addl X, INCX, X ++ ++ fselne $f16, $f24, $f12, $f12 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f0, $f20), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f25, $f13, $f13 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f1, $f21), $f17 ++ addl X, INCX, X ++ ++ fselne $f18, $f26, $f14, $f14 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f10, $f22), $f18 ++ addl X, INCX, X ++ ++ fselne $f19, $f27, $f15, $f15 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f11, $f23), $f19 ++ ldi $1, -1($1) # i -- ++ ++ addl X, INCX, X + unop ++ unop ++ bgt $1,$L12 ++ .align 4 + -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) ++$L13: ++ fselne $f16, $f20, $f0, $f0 ++ CMPLT($f12, $f24), $f16 + -+ MUL $f29, $f2, $f24 ++ fselne $f17, $f21, $f1, $f1 ++ CMPLT($f13, $f25), $f17 ++ ++ fselne $f18, $f22, $f10, $f10 ++ CMPLT($f14, $f26), $f18 ++ ++ fselne $f19, $f23, $f11, $f11 ++ CMPLT($f15, $f27), $f19 ++ ++ fselne $f16, $f24, $f12, $f12 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f25, $f13, $f13 ++ CMPLT($f10, $f11), $f17 ++ ++ fselne $f18, $f26, $f14, $f14 ++ CMPLT($f12, $f13), $f18 ++ fselne $f19, $f27, $f15, $f15 ++ CMPLT($f14, $f15), $f19 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f11, $f10, $f10 ++ fselne $f18, $f13, $f12, $f12 ++ fselne $f19, $f15, $f14, $f14 ++ ++ CMPLT($f0, $f10), $f16 ++ CMPLT($f12, $f14), $f17 ++ ++ fselne $f16, $f10, $f0, $f0 ++ fselne $f17, $f14, $f12, $f12 ++ ++ CMPLT($f0, $f12), $f16 ++ fselne $f16, $f12, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 + unop -+ MUL $f30, $f3, $f25 -+ nop ++ unop ++ ble $1, $L20 ++ .align 4 + -+ MUL $f30, $f2, $f26 -+ LD $f2, 2*SIZE($18) -+ MUL $f29, $f3, $f27 -+ LD $f3, 3*SIZE($18) ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X + -+ ADD1 $f20, $f21, $f16 -+ MUL $f29, $f4, $f20 -+ ADD2 $f22, $f23, $f17 -+ MUL $f30, $f5, $f21 ++ CMPLT($f0, $f20), $f16 ++ fselne $f16, $f20, $f0, $f0 ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 + -+ ADD1 $f24, $f25, $f18 ++$L20: ++ sra N, 3, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f11, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f13, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f15, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f17, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fcmpeq $f0, $f10, $f20 ++ fcmpeq $f0, $f11, $f21 ++ fcmpeq $f0, $f12, $f22 ++ fcmpeq $f0, $f13, $f23 ++ ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fcmpeq $f0, $f14, $f24 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f20, $End ++ ++ LD $f11, 0 * SIZE(XX) ++ fcmpeq $f0, $f15, $f25 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f21, $End ++ ++ LD $f12, 0 * SIZE(XX) ++ fcmpeq $f0, $f16, $f26 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f22, $End ++ ++ LD $f13, 0 * SIZE(XX) ++ fcmpeq $f0, $f17, $f27 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f23, $End ++ ++ LD $f14, 0 * SIZE(XX) ++ fcmpeq $f0, $f10, $f20 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f24, $End ++ ++ LD $f15, 0 * SIZE(XX) ++ fcmpeq $f0, $f11, $f21 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f25, $End ++ ++ LD $f16, 0 * SIZE(XX) ++ ldi $1, -1($1) # i -- ++ fcmpeq $f0, $f12, $f22 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f26, $End ++ ++ LD $f17, 0 * SIZE(XX) ++ fcmpeq $f0, $f13, $f23 ++ ldi $0, 1($0) ++ addl XX, INCX, XX ++ fbne $f27, $End ++ ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ ldi $0, 1($0) ++ fcmpeq $f0, $f14, $f24 + unop -+ MUL $f30, $f4, $f22 -+ LD $f4, 4*SIZE($18) ++ fbne $f20, $End + -+ ADD2 $f26, $f27, $f19 -+ addl $20, 8*SIZE, $20 -+ MUL $f29, $f5, $f23 -+ LD $f5, 5*SIZE($18) ++ ldi $0, 1($0) ++ fcmpeq $f0, $f15, $f25 ++ unop ++ fbne $f21, $End + -+ ADD $f16, $f8, tmp -+ fmov tmp, $f16 -+ LD $f8, 0*SIZE($20) -+ MUL $f29, $f6, $f24 ++ ldi $0, 1($0) ++ fcmpeq $f0, $f16, $f26 + unop ++ fbne $f22, $End + -+ ADD $f17, $f28, tmp -+ fmov tmp, $f17 -+ LD $f28, 1*SIZE($20) -+ MUL $f30, $f7, $f25 ++ ldi $0, 1($0) ++ fcmpeq $f0, $f17, $f27 + unop ++ fbne $f23, $End + -+ ADD $f18, $f10, tmp -+ fmov tmp, $f18 -+ LD $f10, 2*SIZE($20) -+ MUL $f30, $f6, $f26 -+ LD $f6, 6*SIZE($18) ++ ldi $0, 1($0) ++ fbne $f24, $End ++ ldi $0, 1($0) ++ fbne $f25, $End ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ .align 4 + -+ ADD $f19, $f11, tmp -+ fmov tmp, $f19 -+ LD $f11, 3*SIZE($20) -+ MUL $f29, $f7, $f27 -+ LD $f7, 7*SIZE($18) ++$L40: ++ LD $f20, 0 * SIZE(XX) ++ addl XX, INCX, XX + -+ ST $f16,-8*SIZE($20) -+ ADD1 $f20, $f21, $f16 -+ ST $f17,-7*SIZE($20) -+ ADD2 $f22, $f23, $f17 ++ fcmpeq $f0, $f20, $f29 + -+ ST $f18,-6*SIZE($20) -+ ADD1 $f24, $f25, $f18 -+ ST $f19,-5*SIZE($20) -+ ADD2 $f26, $f27, $f19 ++ ldi $0, 1($0) ++ fbne $f29, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/izamax.S b/kernel/sw_64/izamax.S +new file mode 100644 +index 000000000..bbb2ff4d7 +--- /dev/null ++++ b/kernel/sw_64/izamax.S +@@ -0,0 +1,427 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+ ADD $f16, $f12, tmp -+ fmov tmp, $f16 -+ LD $f12, 4*SIZE($20) -+ ADD $f17, $f13, tmp -+ fmov tmp, $f17 -+ LD $f13, 5*SIZE($20) -+ ADD $f18, $f14, tmp -+ fmov tmp, $f18 -+ LD $f14, 6*SIZE($20) -+ ADD $f19, $f15, tmp -+ fmov tmp, $f19 -+ LD $f15, 7*SIZE($20) ++#define ASSEMBLER ++#include "common.h" + -+ ST $f16,-4*SIZE($20) -+ addl $18, 8*SIZE, $18 -+ ST $f17,-3*SIZE($20) -+ subl $4, 1, $4 + -+ ST $f18,-2*SIZE($20) -+ nop -+ ST $f19,-1*SIZE($20) -+ bgt $4, $MainLoop -+ .align 4 ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 + -+$MainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif + -+ MUL $f29, $f2, $f24 -+ MUL $f30, $f3, $f25 -+ MUL $f30, $f2, $f26 -+ MUL $f29, $f3, $f27 ++#define STACKSIZE 8 * 8 + -+ ADD1 $f20, $f21, $f16 -+ MUL $f29, $f4, $f20 -+ ADD2 $f22, $f23, $f17 -+ MUL $f30, $f5, $f21 ++ PROLOGUE ++ PROFCODE + -+ ADD1 $f24, $f25, $f18 -+ MUL $f30, $f4, $f22 -+ ADD2 $f26, $f27, $f19 -+ MUL $f29, $f5, $f23 ++ ldi $sp, -STACKSIZE($sp) + -+ ADD $f16, $f8, tmp -+ fmov tmp, $f16 -+ MUL $f29, $f6, $f24 -+ ADD $f17, $f28, tmp -+ fmov tmp, $f17 -+ MUL $f30, $f7, $f25 ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop + -+ ADD $f18, $f10, tmp -+ fmov tmp, $f18 -+ MUL $f30, $f6, $f26 -+ ADD $f19, $f11, tmp -+ fmov tmp, $f19 -+ MUL $f29, $f7, $f27 ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop + -+ ST $f16, 0*SIZE($20) -+ ADD1 $f20, $f21, $f16 -+ ST $f17, 1*SIZE($20) -+ ADD2 $f22, $f23, $f17 ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop + -+ ST $f18, 2*SIZE($20) -+ ADD1 $f24, $f25, $f18 -+ ST $f19, 3*SIZE($20) -+ ADD2 $f26, $f27, $f19 ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $2 ++ clr $0 + -+ ADD $f16, $f12, tmp -+ fmov tmp, $f16 -+ ADD $f17, $f13, tmp -+ fmov tmp, $f17 -+ ADD $f18, $f14, tmp -+ fmov tmp, $f18 -+ ADD $f19, $f15, tmp -+ fmov tmp, $f19 ++ fstd $f6, 32($sp) ++ mov X, XX + -+ ST $f16, 4*SIZE($20) -+ ST $f17, 5*SIZE($20) -+ ST $f18, 6*SIZE($20) -+ ST $f19, 7*SIZE($20) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) + -+ unop -+ addl $20, 8*SIZE, $20 -+ unop -+ ble $5, $End ++ fclr $f0 ++ beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + -+$Remain: -+ subl $5, 1, $6 -+ ble $5, $End -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ sra N, 2, $1 ++ addl INCX, INCX, INCX + -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ addl $18, 2*SIZE, $18 -+ ble $6, $RemainLoopEnd ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ faddd $f20, $f21, $f0 ++ ble $1, $L15 + .align 4 + -+$RemainLoop: -+ MUL $f29, $f0, $f20 -+ subl $6, 1, $6 -+ MUL $f30, $f1, $f21 -+ addl $20, 2*SIZE, $20 -+ -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) ++ ldi $1, -1($1) ++ unop ++ addl X, INCX, X ++ unop + -+ ADD1 $f20, $f21, $f16 -+ ADD2 $f22, $f23, $f17 -+ ADD $f16, $f8, tmp -+ fmov tmp, $f16 -+ LD $f8, 0*SIZE($20) -+ ADD $f17, $f28, tmp -+ fmov tmp, $f17 -+ LD $f28, 1*SIZE($20) ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f1 ++ LD $f23, 1 * SIZE(X) ++ addl X, INCX, X + -+ ST $f16,-2*SIZE($20) -+ addl $18, 2*SIZE, $18 -+ ST $f17,-1*SIZE($20) -+ bgt $6, $RemainLoop -+ .align 4 ++ LD $f24, 0 * SIZE(X) ++ fmov $f0, $f2 ++ LD $f25, 1 * SIZE(X) ++ addl X, INCX, X + -+$RemainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 ++ LD $f26, 0 * SIZE(X) ++ fmov $f0, $f3 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X + -+ ADD1 $f20, $f21, $f16 -+ ADD2 $f22, $f23, $f17 -+ ADD $f16, $f8, tmp -+ fmov tmp, $f16 -+ ADD $f17, $f28, tmp -+ fmov tmp, $f17 ++ fabs $f20, $f8 ++ fabs $f21, $f9 ++ fabs $f22, $f10 ++ fabs $f23, $f11 + -+ ST $f16, 0*SIZE($20) -+ nop -+ ST $f17, 1*SIZE($20) -+ nop -+ .align 4 ++ fabs $f24, $f12 ++ fabs $f25, $f13 ++ fabs $f26, $f14 ++ fabs $f27, $f15 + -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd tmp, 56($sp) -+ ldi $sp, 64($sp) -+ ret ++ ble $1, $L14 + .align 4 + -+$Sub: -+ SXSUBL $16, SIZE, $22 -+ addl $22, $22, $22 # Complex -+ .align 4 ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ ldi $1, -1($1) ++ addl X, INCX, X + -+ addl $19, $19, $19 # Complex -+ addl $21, $21, $21 # Complex ++ LD $f22, 0 * SIZE(X) ++ LD $f23, 1 * SIZE(X) ++ unop ++ addl X, INCX, X + -+ ble $4, $SubRemain -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) -+ SXADDQ $19, $18, $18 ++ LD $f24, 0 * SIZE(X) ++ LD $f25, 1 * SIZE(X) ++ unop ++ addl X, INCX, X + -+ LD $f2, 0*SIZE($18) -+ LD $f3, 1*SIZE($18) -+ SXADDQ $19, $18, $18 ++ LD $f26, 0 * SIZE(X) ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 + -+ LD $f4, 0*SIZE($18) -+ LD $f5, 1*SIZE($18) -+ SXADDQ $19, $18, $18 ++$L12: ++ faddd $f8, $f9, $f16 ++ unop ++ fabs $f20, $f8 ++ s_fillcs 64 * SIZE(X) + -+ LD $f6, 0*SIZE($18) -+ LD $f7, 1*SIZE($18) -+ SXADDQ $19, $18, $18 ++ faddd $f10, $f11, $f17 ++ unop ++ fabs $f21, $f9 ++ LD $f20, 0 * SIZE(X) + -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ SXADDQ $21, $20, $24 ++ faddd $f12, $f13, $f18 ++ LD $f21, 1 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X + -+ LD $f10, 0*SIZE($24) -+ LD $f11, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ faddd $f14, $f15, $f19 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ unop + -+ LD $f12, 0*SIZE($24) -+ LD $f13, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ CMPLT($f0, $f16), $f4 ++ LD $f23, 1 * SIZE(X) ++ fabs $f24, $f12 ++ addl X, INCX, X + -+ LD $f14, 0*SIZE($24) -+ LD $f15, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ CMPLT($f1, $f17), $f5 ++ LD $f24, 0 * SIZE(X) ++ fabs $f25, $f13 ++ unop + -+ subl $4, 1, $4 -+ ble $4, $SubMainLoopEnd -+ .align 4 ++ CMPLT($f2, $f18), $f6 ++ LD $f25, 1 * SIZE(X) ++ fabs $f26, $f14 ++ addl X, INCX, X + -+$SubMainLoop: -+ MUL $f29, $f0, $f20 -+ unop -+ MUL $f30, $f1, $f21 ++ CMPLT($f3, $f19), $f7 ++ LD $f26, 0 * SIZE(X) ++ fabs $f27, $f15 + unop + -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) ++ fselne $f4, $f16, $f0, $f0 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ldi $1, -1($1) # i -- + -+ MUL $f29, $f2, $f24 -+ SXADDQ $19, $18, $18 -+ MUL $f30, $f3, $f25 -+ unop ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ bgt $1,$L12 ++ .align 4 + -+ MUL $f30, $f2, $f26 -+ LD $f2, 0*SIZE($18) -+ MUL $f29, $f3, $f27 -+ LD $f3, 1*SIZE($18) ++$L13: ++ faddd $f8, $f9, $f16 ++ fabs $f20, $f8 + -+ ADD1 $f20, $f21, $f16 -+ SXADDQ $19, $18, $18 -+ MUL $f29, $f4, $f20 -+ unop ++ faddd $f10, $f11, $f17 ++ fabs $f21, $f9 + -+ ADD2 $f22, $f23, $f17 -+ unop -+ MUL $f30, $f5, $f21 -+ unop ++ faddd $f12, $f13, $f18 ++ fabs $f22, $f10 + -+ ADD1 $f24, $f25, $f18 -+ unop -+ MUL $f30, $f4, $f22 -+ LD $f4, 0*SIZE($18) ++ faddd $f14, $f15, $f19 ++ fabs $f23, $f11 + -+ ADD2 $f26, $f27, $f19 -+ unop -+ MUL $f29, $f5, $f23 -+ LD $f5, 1*SIZE($18) ++ CMPLT($f0, $f16), $f4 ++ fabs $f24, $f12 + -+ ADD $f16, $f8, tmp -+ fmov tmp, $f16 -+ LD $f8, 0*SIZE($24) -+ MUL $f29, $f6, $f24 -+ SXADDQ $19, $18, $18 ++ CMPLT($f1, $f17), $f5 ++ fabs $f25, $f13 + -+ ADD $f17, $f28, tmp -+ fmov tmp, $f17 -+ LD $f28, 1*SIZE($24) -+ MUL $f30, $f7, $f25 -+ SXADDQ $21, $24, $24 ++ CMPLT($f2, $f18), $f6 ++ fabs $f26, $f14 ++ CMPLT($f3, $f19), $f7 ++ fabs $f27, $f15 + -+ ADD $f18, $f10, tmp -+ fmov tmp, $f18 -+ LD $f10, 0*SIZE($24) -+ MUL $f30, $f6, $f26 -+ LD $f6, 0*SIZE($18) ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ .align 4 + -+ ADD $f19, $f11, tmp -+ fmov tmp, $f19 -+ LD $f11, 1*SIZE($24) -+ MUL $f29, $f7, $f27 -+ LD $f7, 1*SIZE($18) ++$L14: ++ faddd $f8, $f9, $f16 ++ faddd $f10, $f11, $f17 ++ faddd $f12, $f13, $f18 ++ faddd $f14, $f15, $f19 + -+ ST $f16, 0*SIZE($20) -+ SXADDQ $19, $18, $18 -+ ADD1 $f20, $f21, $f16 -+ unop ++ CMPLT($f0, $f16), $f4 ++ CMPLT($f1, $f17), $f5 ++ CMPLT($f2, $f18), $f6 ++ CMPLT($f3, $f19), $f7 + -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ ADD2 $f22, $f23, $f17 -+ unop ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 + -+ ST $f18, 0*SIZE($20) -+ SXADDQ $21, $24, $24 -+ ADD1 $f24, $f25, $f18 -+ unop ++ CMPLT($f0, $f1), $f16 ++ CMPLT($f2, $f3), $f17 + -+ ST $f19, 1*SIZE($20) -+ unop -+ ADD2 $f26, $f27, $f19 -+ SXADDQ $21, $20, $20 ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ ++ CMPLT($f0, $f2), $f16 ++ fselne $f16, $f2, $f0, $f0 ++ .align 4 + -+ ADD $f16, $f12, tmp -+ fmov tmp, $f16 ++$L15: ++ and N, 3, $1 + unop -+ LD $f12, 0*SIZE($24) + unop ++ ble $1, $L20 ++ .align 4 + -+ ADD $f17, $f13, tmp -+ fmov tmp, $f17 ++$L16: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) + unop -+ LD $f13, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ addl X, INCX, X + -+ ADD $f18, $f14, tmp -+ fmov tmp, $f18 -+ subl $4, 1, $4 -+ LD $f14, 0*SIZE($24) -+ unop ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ faddd $f29, $f30, $f29 + -+ ADD $f19, $f15, tmp -+ fmov tmp, $f19 -+ unop -+ LD $f15, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 + -+ ST $f16, 0*SIZE($20) -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ unop ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 + -+ ST $f18, 0*SIZE($20) -+ ST $f19, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ bgt $4, $SubMainLoop ++$L20: ++ sra N, 2, $1 ++ ble $1, $L40 + .align 4 + -+$SubMainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX + -+ MUL $f29, $f2, $f24 -+ MUL $f30, $f3, $f25 -+ MUL $f30, $f2, $f26 -+ MUL $f29, $f3, $f27 ++ LD $f12, 0 * SIZE(XX) ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX + -+ ADD1 $f20, $f21, $f16 -+ MUL $f29, $f4, $f20 -+ ADD2 $f22, $f23, $f17 -+ MUL $f30, $f5, $f21 ++ LD $f14, 0 * SIZE(XX) ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX + -+ ADD1 $f24, $f25, $f18 -+ MUL $f30, $f4, $f22 -+ ADD2 $f26, $f27, $f19 -+ MUL $f29, $f5, $f23 ++ LD $f16, 0 * SIZE(XX) ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX + -+ ADD $f16, $f8, tmp -+ fmov tmp, $f16 -+ MUL $f29, $f6, $f24 -+ ADD $f17, $f28, tmp -+ fmov tmp, $f17 -+ MUL $f30, $f7, $f25 ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 + -+ ADD $f18, $f10, tmp -+ fmov tmp, $f18 -+ MUL $f30, $f6, $f26 -+ ADD $f19, $f11, tmp -+ fmov tmp, $f19 -+ MUL $f29, $f7, $f27 ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 + -+ ST $f16, 0*SIZE($20) -+ ADD1 $f20, $f21, $f16 -+ ST $f17, 1*SIZE($20) -+ ADD2 $f22, $f23, $f17 ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX + -+ SXADDQ $21, $20, $20 -+ nop -+ ST $f18, 0*SIZE($20) -+ ADD1 $f24, $f25, $f18 ++ LD $f12, 0 * SIZE(XX) ++ fabs $f15, $f23 ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX + -+ ST $f19, 1*SIZE($20) -+ ADD2 $f26, $f27, $f19 -+ SXADDQ $21, $20, $20 -+ ADD $f16, $f12, tmp -+ fmov tmp, $f16 ++ LD $f14, 0 * SIZE(XX) ++ fabs $f16, $f24 ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX + -+ ADD $f17, $f13, tmp -+ fmov tmp, $f17 -+ ADD $f18, $f14, tmp -+ fmov tmp, $f18 -+ ADD $f19, $f15, tmp -+ fmov tmp, $f19 ++ LD $f16, 0 * SIZE(XX) ++ fabs $f17, $f25 ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX + -+ ST $f16, 0*SIZE($20) -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 ++ faddd $f18, $f19, $f4 ++ faddd $f20, $f21, $f5 ++ faddd $f22, $f23, $f6 ++ faddd $f24, $f25, $f7 + -+ ST $f18, 0*SIZE($20) -+ ST $f19, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ ble $5, $SubEnd -+ .align 4 ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 + -+$SubRemain: -+ subl $5, 1, $6 -+ ble $5, $SubEnd -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) ++ fabs $f10, $f18 ++ ldi $0, 1($0) ++ ldi $1, -1($1) # i -- ++ fbne $f26, $End + -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ SXADDQ $19, $18, $18 -+ SXADDQ $21, $20, $24 -+ ble $6, $SubRemainLoopEnd ++ fabs $f11, $f19 ++ ldi $0, 1($0) ++ unop ++ fbne $f27, $End ++ ++ fabs $f12, $f20 ++ ldi $0, 1($0) ++ unop ++ fbne $f28, $End ++ ++ fabs $f13, $f21 ++ ldi $0, 1($0) ++ fbne $f29, $End ++ bgt $1, $L22 + .align 4 + -+$SubRemainLoop: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) -+ ADD1 $f20, $f21, $f16 -+ SXADDQ $19, $18, $18 ++$L23: ++ fabs $f14, $f22 ++ fabs $f15, $f23 ++ fabs $f16, $f24 ++ fabs $f17, $f25 + -+ ADD2 $f22, $f23, $f17 -+ nop -+ ADD $f16, $f8, tmp -+ fmov tmp, $f16 -+ LD $f8, 0*SIZE($24) ++ faddd $f18, $f19, $f4 ++ faddd $f20, $f21, $f5 ++ faddd $f22, $f23, $f6 ++ faddd $f24, $f25, $f7 + -+ ADD $f17, $f28, tmp -+ fmov tmp, $f17 -+ LD $f28, 1*SIZE($24) -+ SXADDQ $21, $24, $24 -+ subl $6, 1, $6 ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 + -+ ST $f16, 0*SIZE($20) -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ bgt $6, $SubRemainLoop ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End + .align 4 + -+$SubRemainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 ++$L40: ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) + -+ ADD1 $f20, $f21, $f16 -+ ADD2 $f22, $f23, $f17 -+ ADD $f16, $f8, tmp -+ fmov tmp, $f16 -+ ADD $f17, $f28, tmp -+ fmov tmp, $f17 ++ addl XX, INCX, XX + -+ ST $f16, 0*SIZE($20) -+ nop -+ ST $f17, 1*SIZE($20) -+ nop ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ ++ faddd $f18, $f19, $f18 ++ fcmpeq $f0, $f18, $f2 ++ ++ ldi $0, 1($0) ++ fbne $f2, $End ++ br $31, $L40 + .align 4 + -+$SubEnd: ++$End: + fldd $f2, 0($sp) + fldd $f3, 8($sp) + fldd $f4, 16($sp) + fldd $f5, 24($sp) ++ + fldd $f6, 32($sp) + fldd $f7, 40($sp) + fldd $f8, 48($sp) -+ fldd tmp, 56($sp) -+ ldi $sp, 64($sp) ++ fldd $f9, 56($sp) ++ ldi $sp, STACKSIZE($sp) + ret ++ + EPILOGUE -diff --git a/kernel/sw_64/zaxpy.S.bak b/kernel/sw_64/zaxpy.S.bak +diff --git a/kernel/sw_64/lsame.S b/kernel/sw_64/lsame.S new file mode 100644 -index 0000000..c6cd44b +index 000000000..94dc5495d --- /dev/null -+++ b/kernel/sw_64/zaxpy.S.bak -@@ -0,0 +1,611 @@ ++++ b/kernel/sw_64/lsame.S +@@ -0,0 +1,76 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -65001,585 +10233,283 @@ index 0000000..c6cd44b +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" + -+#define PREFETCHSIZE 40 + -+#ifndef CONJ -+#define ADD1 SUB -+#define ADD2 ADD ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl lsame_ ++ .ent lsame_ ++lsame_: ++ .frame $sp,0,$26,0 ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++ .prologue 1 +#else -+#define ADD1 ADD -+#define ADD2 SUB -+#endif -+ -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 16, $26, 0 -+ -+ ldl $19, 0($sp) -+ fmov $f19, $f29 -+ ldl $20, 8($sp) -+ fmov $f20, $f30 -+ -+ mov $21, $18 -+ ldl $21, 16($sp) -+ ldi $sp, -64($sp) -+ nop -+ -+ fstd $f2, 0($sp) -+ cmpeq $19, 1, $1 -+ fstd $f3, 8($sp) -+ cmpeq $21, 1, $2 -+ -+ fstd $f4, 16($sp) -+ and $16, 3, $5 -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+#ifndef PROFILE + .prologue 0 -+#else -+ .prologue 1 +#endif + -+ and $1, $2, $1 -+ ble $16, $End -+ sra $16, 2, $4 -+ beq $1, $Sub -+ -+ ble $4, $Remain -+ subl $4, 1, $4 -+ -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) -+ LD $f2, 2*SIZE($18) -+ LD $f3, 3*SIZE($18) -+ LD $f4, 4*SIZE($18) -+ LD $f5, 5*SIZE($18) -+ LD $f6, 6*SIZE($18) -+ LD $f7, 7*SIZE($18) -+ -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ LD $f10, 2*SIZE($20) -+ LD $f11, 3*SIZE($20) -+ LD $f12, 4*SIZE($20) -+ LD $f13, 5*SIZE($20) -+ LD $f14, 6*SIZE($20) -+ LD $f15, 7*SIZE($20) -+ -+ addl $18, 8*SIZE, $18 -+ ble $4, $MainLoopEnd -+ .align 4 -+ -+$MainLoop: -+ fillcs PREFETCHSIZE * SIZE($20) -+ fillcs PREFETCHSIZE * SIZE($18) -+ -+ MUL $f29, $f0, $f20 -+ fillcs 9*SIZE($18) -+ MUL $f30, $f1, $f21 -+ unop -+ -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) -+ -+ MUL $f29, $f2, $f24 -+ unop -+ MUL $f30, $f3, $f25 -+ nop -+ -+ MUL $f30, $f2, $f26 -+ LD $f2, 2*SIZE($18) -+ MUL $f29, $f3, $f27 -+ LD $f3, 3*SIZE($18) -+ -+ ADD1 $f20, $f21, $f16 -+ MUL $f29, $f4, $f20 -+ ADD2 $f22, $f23, $f17 -+ MUL $f30, $f5, $f21 -+ -+ ADD1 $f24, $f25, $f18 -+ unop -+ MUL $f30, $f4, $f22 -+ LD $f4, 4*SIZE($18) -+ -+ ADD2 $f26, $f27, $f19 -+ addl $20, 8*SIZE, $20 -+ MUL $f29, $f5, $f23 -+ LD $f5, 5*SIZE($18) -+ -+ ADD $f16, $f8, $f16 -+ LD $f8, 0*SIZE($20) -+ MUL $f29, $f6, $f24 -+ unop -+ -+ ADD $f17, $f28, $f17 -+ LD $f28, 1*SIZE($20) -+ MUL $f30, $f7, $f25 -+ unop -+ -+ ADD $f18, $f10, $f18 -+ LD $f10, 2*SIZE($20) -+ MUL $f30, $f6, $f26 -+ LD $f6, 6*SIZE($18) -+ -+ ADD $f19, $f11, $f19 -+ LD $f11, 3*SIZE($20) -+ MUL $f29, $f7, $f27 -+ LD $f7, 7*SIZE($18) -+ -+ ST $f16,-8*SIZE($20) -+ ADD1 $f20, $f21, $f16 -+ ST $f17,-7*SIZE($20) -+ ADD2 $f22, $f23, $f17 -+ -+ ST $f18,-6*SIZE($20) -+ ADD1 $f24, $f25, $f18 -+ ST $f19,-5*SIZE($20) -+ ADD2 $f26, $f27, $f19 -+ -+ ADD $f16, $f12, $f16 -+ LD $f12, 4*SIZE($20) -+ ADD $f17, $f13, $f17 -+ LD $f13, 5*SIZE($20) -+ ADD $f18, $f14, $f18 -+ LD $f14, 6*SIZE($20) -+ ADD $f19, $f15, $f19 -+ LD $f15, 7*SIZE($20) -+ -+ ST $f16,-4*SIZE($20) -+ addl $18, 8*SIZE, $18 -+ ST $f17,-3*SIZE($20) -+ subl $4, 1, $4 -+ -+ ST $f18,-2*SIZE($20) -+ nop -+ ST $f19,-1*SIZE($20) -+ bgt $4, $MainLoop -+ .align 4 -+ -+$MainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 -+ -+ MUL $f29, $f2, $f24 -+ MUL $f30, $f3, $f25 -+ MUL $f30, $f2, $f26 -+ MUL $f29, $f3, $f27 -+ -+ ADD1 $f20, $f21, $f16 -+ MUL $f29, $f4, $f20 -+ ADD2 $f22, $f23, $f17 -+ MUL $f30, $f5, $f21 -+ -+ ADD1 $f24, $f25, $f18 -+ MUL $f30, $f4, $f22 -+ ADD2 $f26, $f27, $f19 -+ MUL $f29, $f5, $f23 -+ -+ ADD $f16, $f8, $f16 -+ MUL $f29, $f6, $f24 -+ ADD $f17, $f28, $f17 -+ MUL $f30, $f7, $f25 -+ -+ ADD $f18, $f10, $f18 -+ MUL $f30, $f6, $f26 -+ ADD $f19, $f11, $f19 -+ MUL $f29, $f7, $f27 -+ -+ ST $f16, 0*SIZE($20) -+ ADD1 $f20, $f21, $f16 -+ ST $f17, 1*SIZE($20) -+ ADD2 $f22, $f23, $f17 -+ -+ ST $f18, 2*SIZE($20) -+ ADD1 $f24, $f25, $f18 -+ ST $f19, 3*SIZE($20) -+ ADD2 $f26, $f27, $f19 -+ -+ ADD $f16, $f12, $f16 -+ ADD $f17, $f13, $f17 -+ ADD $f18, $f14, $f18 -+ ADD $f19, $f15, $f19 -+ -+ ST $f16, 4*SIZE($20) -+ ST $f17, 5*SIZE($20) -+ ST $f18, 6*SIZE($20) -+ ST $f19, 7*SIZE($20) -+ -+ unop -+ addl $20, 8*SIZE, $20 -+ unop -+ ble $5, $End -+ .align 4 -+ -+$Remain: -+ subl $5, 1, $6 -+ ble $5, $End -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) -+ -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ addl $18, 2*SIZE, $18 -+ ble $6, $RemainLoopEnd -+ .align 4 -+ -+$RemainLoop: -+ MUL $f29, $f0, $f20 -+ subl $6, 1, $6 -+ MUL $f30, $f1, $f21 -+ addl $20, 2*SIZE, $20 -+ -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) -+ -+ ADD1 $f20, $f21, $f16 -+ ADD2 $f22, $f23, $f17 -+ ADD $f16, $f8, $f16 -+ LD $f8, 0*SIZE($20) -+ ADD $f17, $f28, $f17 -+ LD $f28, 1*SIZE($20) -+ -+ ST $f16,-2*SIZE($20) -+ addl $18, 2*SIZE, $18 -+ ST $f17,-1*SIZE($20) -+ bgt $6, $RemainLoop -+ .align 4 ++ ldbu $5, 0($16) ++ ldbu $6, 0($17) ++ // extbl $5, $16, $5 ++ // extbl $6, $17, $6 + -+$RemainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 -+ -+ ADD1 $f20, $f21, $f16 -+ ADD2 $f22, $f23, $f17 -+ ADD $f16, $f8, $f16 -+ ADD $f17, $f28, $f17 ++ subl $5, 96, $1 ++ subl $6, 96, $2 ++ subl $5, 32, $3 ++ subl $6, 32, $4 + -+ ST $f16, 0*SIZE($20) -+ nop -+ ST $f17, 1*SIZE($20) -+ nop ++ selgt $1, $3, $5, $5 ++ selgt $2, $4, $6, $6 ++ cmpeq $5, $6, $0 + .align 4 + +$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ ldi $sp, 64($sp) + ret -+ .align 4 -+ -+$Sub: -+ SXSUBL $16, SIZE, $22 -+ addl $22, $22, $22 # Complex -+ .align 4 -+ -+ addl $19, $19, $19 # Complex -+ addl $21, $21, $21 # Complex -+ -+ ble $4, $SubRemain -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) -+ SXADDQ $19, $18, $18 -+ -+ LD $f2, 0*SIZE($18) -+ LD $f3, 1*SIZE($18) -+ SXADDQ $19, $18, $18 -+ -+ LD $f4, 0*SIZE($18) -+ LD $f5, 1*SIZE($18) -+ SXADDQ $19, $18, $18 -+ -+ LD $f6, 0*SIZE($18) -+ LD $f7, 1*SIZE($18) -+ SXADDQ $19, $18, $18 -+ -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ SXADDQ $21, $20, $24 -+ -+ LD $f10, 0*SIZE($24) -+ LD $f11, 1*SIZE($24) -+ SXADDQ $21, $24, $24 -+ -+ LD $f12, 0*SIZE($24) -+ LD $f13, 1*SIZE($24) -+ SXADDQ $21, $24, $24 -+ -+ LD $f14, 0*SIZE($24) -+ LD $f15, 1*SIZE($24) -+ SXADDQ $21, $24, $24 -+ -+ subl $4, 1, $4 -+ ble $4, $SubMainLoopEnd -+ .align 4 -+ -+$SubMainLoop: -+ MUL $f29, $f0, $f20 -+ unop -+ MUL $f30, $f1, $f21 -+ unop -+ -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) -+ -+ MUL $f29, $f2, $f24 -+ SXADDQ $19, $18, $18 -+ MUL $f30, $f3, $f25 -+ unop ++ .end lsame_ ++ .ident VERSION +diff --git a/kernel/sw_64/max.S b/kernel/sw_64/max.S +new file mode 100644 +index 000000000..d4e4bf261 +--- /dev/null ++++ b/kernel/sw_64/max.S +@@ -0,0 +1,227 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+ MUL $f30, $f2, $f26 -+ LD $f2, 0*SIZE($18) -+ MUL $f29, $f3, $f27 -+ LD $f3, 1*SIZE($18) ++#define ASSEMBLER ++#include "common.h" + -+ ADD1 $f20, $f21, $f16 -+ SXADDQ $19, $18, $18 -+ MUL $f29, $f4, $f20 -+ unop + -+ ADD2 $f22, $f23, $f17 -+ unop -+ MUL $f30, $f5, $f21 -+ unop ++#define N $16 ++#define X $17 ++#define INCX $18 + -+ ADD1 $f24, $f25, $f18 -+ unop -+ MUL $f30, $f4, $f22 -+ LD $f4, 0*SIZE($18) ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif + -+ ADD2 $f26, $f27, $f19 -+ unop -+ MUL $f29, $f5, $f23 -+ LD $f5, 1*SIZE($18) -+ -+ ADD $f16, $f8, $f16 -+ LD $f8, 0*SIZE($24) -+ MUL $f29, $f6, $f24 -+ SXADDQ $19, $18, $18 ++#define STACKSIZE 8 * 8 + -+ ADD $f17, $f28, $f17 -+ LD $f28, 1*SIZE($24) -+ MUL $f30, $f7, $f25 -+ SXADDQ $21, $24, $24 ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 + -+ ADD $f18, $f10, $f18 -+ LD $f10, 0*SIZE($24) -+ MUL $f30, $f6, $f26 -+ LD $f6, 0*SIZE($18) ++#ifdef F_INTERFACE ++ ldl N, 0(N) # n ++ ldl INCX, 0(INCX) # incx ++#endif ++ ldi $sp, -STACKSIZE($sp) ++ nop ++ .align 4 + -+ ADD $f19, $f11, $f19 -+ LD $f11, 1*SIZE($24) -+ MUL $f29, $f7, $f27 -+ LD $f7, 1*SIZE($18) ++ cmplt $31, N, $2 ++ cmplt $31, INCX, $3 ++ SXADDQ INCX, $31, INCX ++ and $2, $3, $0 + -+ ST $f16, 0*SIZE($20) -+ SXADDQ $19, $18, $18 -+ ADD1 $f20, $f21, $f16 ++ sra N, 3, $1 ++ fclr $f0 + unop ++ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 + -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ ADD2 $f22, $f23, $f17 ++ LD $f0, 0 * SIZE(X) + unop -+ -+ ST $f18, 0*SIZE($20) -+ SXADDQ $21, $24, $24 -+ ADD1 $f24, $f25, $f18 + unop ++ ble $1, $L15 ++ .align 4 + -+ ST $f19, 1*SIZE($20) -+ unop -+ ADD2 $f26, $f27, $f19 -+ SXADDQ $21, $20, $20 ++ fmov $f0, $f1 ++ addl X, INCX, X ++ fmov $f0, $f10 ++ ldi $1, -1($1) + -+ ADD $f16, $f12, $f16 -+ unop -+ LD $f12, 0*SIZE($24) -+ unop ++ LD $f21, 0 * SIZE(X) ++ fmov $f0, $f11 ++ addl X, INCX, X ++ fmov $f0, $f12 + -+ ADD $f17, $f13, $f17 -+ unop -+ LD $f13, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f13 ++ addl X, INCX, X ++ fmov $f0, $f14 + -+ ADD $f18, $f14, $f18 -+ subl $4, 1, $4 -+ LD $f14, 0*SIZE($24) -+ unop ++ LD $f23, 0 * SIZE(X) ++ fmov $f0, $f15 ++ addl X, INCX, X ++ fmov $f0, $f20 + -+ ADD $f19, $f15, $f19 -+ unop -+ LD $f15, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f25, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f26, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f27, 0 * SIZE(X) ++ addl X, INCX, X + -+ ST $f16, 0*SIZE($20) -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ unop ++ CMPLT($f0, $f20), $f16 ++ CMPLT($f1, $f21), $f17 ++ CMPLT($f10, $f22), $f18 ++ CMPLT($f11, $f23), $f19 + -+ ST $f18, 0*SIZE($20) -+ ST $f19, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ bgt $4, $SubMainLoop ++ ble $1, $L13 + .align 4 + -+$SubMainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 -+ -+ MUL $f29, $f2, $f24 -+ MUL $f30, $f3, $f25 -+ MUL $f30, $f2, $f26 -+ MUL $f29, $f3, $f27 -+ -+ ADD1 $f20, $f21, $f16 -+ MUL $f29, $f4, $f20 -+ ADD2 $f22, $f23, $f17 -+ MUL $f30, $f5, $f21 ++$L12: ++ fselne $f16, $f20, $f0, $f0 ++ LD $f20, 0 * SIZE(X) ++ CMPLT($f12, $f24), $f16 ++ addl X, INCX, X + -+ ADD1 $f24, $f25, $f18 -+ MUL $f30, $f4, $f22 -+ ADD2 $f26, $f27, $f19 -+ MUL $f29, $f5, $f23 -+ -+ ADD $f16, $f8, $f16 -+ MUL $f29, $f6, $f24 -+ ADD $f17, $f28, $f17 -+ MUL $f30, $f7, $f25 ++ fselne $f17, $f21, $f1, $f1 ++ LD $f21, 0 * SIZE(X) ++ CMPLT($f13, $f25), $f17 ++ addl X, INCX, X + -+ ADD $f18, $f10, $f18 -+ MUL $f30, $f6, $f26 -+ ADD $f19, $f11, $f19 -+ MUL $f29, $f7, $f27 ++ fselne $f18, $f22, $f10, $f10 ++ LD $f22, 0 * SIZE(X) ++ CMPLT($f14, $f26), $f18 ++ addl X, INCX, X + -+ ST $f16, 0*SIZE($20) -+ ADD1 $f20, $f21, $f16 -+ ST $f17, 1*SIZE($20) -+ ADD2 $f22, $f23, $f17 ++ fselne $f19, $f23, $f11, $f11 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f15, $f27), $f19 ++ addl X, INCX, X + -+ SXADDQ $21, $20, $20 -+ nop -+ ST $f18, 0*SIZE($20) -+ ADD1 $f24, $f25, $f18 ++ fselne $f16, $f24, $f12, $f12 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f0, $f20), $f16 ++ addl X, INCX, X + -+ ST $f19, 1*SIZE($20) -+ ADD2 $f26, $f27, $f19 -+ SXADDQ $21, $20, $20 -+ ADD $f16, $f12, $f16 ++ fselne $f17, $f25, $f13, $f13 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f1, $f21), $f17 ++ addl X, INCX, X + -+ ADD $f17, $f13, $f17 -+ ADD $f18, $f14, $f18 -+ ADD $f19, $f15, $f19 ++ fselne $f18, $f26, $f14, $f14 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f10, $f22), $f18 ++ addl X, INCX, X + -+ ST $f16, 0*SIZE($20) -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 ++ fselne $f19, $f27, $f15, $f15 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f11, $f23), $f19 ++ ldi $1, -1($1) # i -- + -+ ST $f18, 0*SIZE($20) -+ ST $f19, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ ble $5, $SubEnd ++ addl X, INCX, X ++ unop ++ unop ++ bgt $1,$L12 + .align 4 + -+$SubRemain: -+ subl $5, 1, $6 -+ ble $5, $SubEnd -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) ++$L13: ++ fselne $f16, $f20, $f0, $f0 ++ CMPLT($f12, $f24), $f16 + -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ SXADDQ $19, $18, $18 -+ SXADDQ $21, $20, $24 -+ ble $6, $SubRemainLoopEnd -+ .align 4 ++ fselne $f17, $f21, $f1, $f1 ++ CMPLT($f13, $f25), $f17 + -+$SubRemainLoop: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) ++ fselne $f18, $f22, $f10, $f10 ++ CMPLT($f14, $f26), $f18 + -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) -+ ADD1 $f20, $f21, $f16 -+ SXADDQ $19, $18, $18 ++ fselne $f19, $f23, $f11, $f11 ++ CMPLT($f15, $f27), $f19 + -+ ADD2 $f22, $f23, $f17 -+ nop -+ ADD $f16, $f8, $f16 -+ LD $f8, 0*SIZE($24) ++ fselne $f16, $f24, $f12, $f12 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f25, $f13, $f13 ++ CMPLT($f10, $f11), $f17 + -+ ADD $f17, $f28, $f17 -+ LD $f28, 1*SIZE($24) -+ SXADDQ $21, $24, $24 -+ subl $6, 1, $6 ++ fselne $f18, $f26, $f14, $f14 ++ CMPLT($f12, $f13), $f18 ++ fselne $f19, $f27, $f15, $f15 ++ CMPLT($f14, $f15), $f19 + -+ ST $f16, 0*SIZE($20) -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ bgt $6, $SubRemainLoop ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f11, $f10, $f10 ++ fselne $f18, $f13, $f12, $f12 ++ fselne $f19, $f15, $f14, $f14 ++ ++ CMPLT($f0, $f10), $f16 ++ CMPLT($f12, $f14), $f17 ++ ++ fselne $f16, $f10, $f0, $f0 ++ fselne $f17, $f14, $f12, $f12 ++ ++ CMPLT($f0, $f12), $f16 ++ fselne $f16, $f12, $f0, $f0 + .align 4 + -+$SubRemainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 -+ -+ ADD1 $f20, $f21, $f16 -+ ADD2 $f22, $f23, $f17 -+ ADD $f16, $f8, $f16 -+ ADD $f17, $f28, $f17 ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $End ++ .align 4 + -+ ST $f16, 0*SIZE($20) -+ nop -+ ST $f17, 1*SIZE($20) -+ nop ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ CMPLT($f0, $f20), $f16 ++ fselne $f16, $f20, $f0, $f0 ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 + .align 4 + -+$SubEnd: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ ldi $sp, 64($sp) ++$End: ++ ldi $sp, STACKSIZE($sp) + ret ++ + EPILOGUE -diff --git a/kernel/sw_64/zaxpy_simd.S b/kernel/sw_64/zaxpy_simd.S +diff --git a/kernel/sw_64/rot.S b/kernel/sw_64/rot.S new file mode 100644 -index 0000000..a823ebf +index 000000000..6680a7e73 --- /dev/null -+++ b/kernel/sw_64/zaxpy_simd.S -@@ -0,0 +1,1479 @@ ++++ b/kernel/sw_64/rot.S +@@ -0,0 +1,624 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -65620,1451 +10550,596 @@ index 0000000..a823ebf + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#define PREFETCHSIZE 128 + -+#ifndef CONJ -+#define ADD1 SUB -+#define ADD2 ADD ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 + -+#define VADD1 VSUB -+#define VADD2 VADD -+#define VMAD1 VNMAD -+#define VMAD2 VMAD -+ -+#else -+#define ADD1 ADD -+#define ADD2 SUB -+ -+#define VADD1 VADD -+#define VADD2 VSUB -+#define VMAD1 VMAD -+#define VMAD2 VNMAD -+ -+#endif ++#define C $f10 ++#define S $f11 + ++#define PREFETCH_SIZE 80 + + PROLOGUE + PROFCODE -+ .frame $sp, 64, $26, 0 -+ -+ ldl $19, 0($sp) -+ fmov $f19, $f29 -+ ldl $20, 8($sp) -+ fmov $f20, $f30 -+ -+ mov $21, $18 -+ ldl $21, 16($sp) -+ ldi $sp, -64($sp) -+ nop -+ -+ fstd $f2, 0($sp) -+ cmpeq $19, 1, $1 -+ fstd $f3, 8($sp) -+ cmpeq $21, 1, $2 -+ -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ nop ++ .frame $sp, 0, $26, 0 + -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif -+/* -+ unloop 8: process 8 complex=16 float/double -+*/ -+ and $1, $2, $1 -+ ble $16, $End -+ sra $16, 3, $4 -+ and $16, 7, $5 + -+ beq $1, $Sub -+ ble $4, $Remain -+ subl $4, 1, $4 -+ nop -+/*extern alpha_r alpha_i to vector*/ ++ fmov $f21, C ++ LD S, 0($sp) + -+ vcpyf $f29, $f29 -+ vcpyf $f30, $f30 ++ cmpeq INCX, 1, $23 ++ cmpeq INCY, 1, $24 ++ ble N, $L998 + -+/** -+ align ? -+ test the address of Y & X -+**/ -+ and $20, (VEC_LEN*SIZE-1), $6 -+ bgt $6, $UnAlign_Y_ACCESS ++ and $23, $24, $23 ++ beq $23, $L50 + -+ and $18, (VEC_LEN*SIZE-1), $7 -+ nop -+ nop -+ bgt $7, $UnAlign_X_ACCESS ++ sra N, 3, I ++ ble I, $L15 + -+ .align 4 ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) + -+ VLD $f0, 0*VEC_LEN*SIZE($18) -+ VLD $f1, 1*VEC_LEN*SIZE($18) -+ VLD $f2, 2*VEC_LEN*SIZE($18) -+ VLD $f3, 3*VEC_LEN*SIZE($18) ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) + -+/* -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) -+ LD $f2, 2*SIZE($18) -+ LD $f3, 3*SIZE($18) -+ -+ LD $f4, 4*SIZE($18) -+ LD $f5, 5*SIZE($18) -+ LD $f6, 6*SIZE($18) -+ LD $f7, 7*SIZE($18) -+*/ ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 + -+ VLD $f8, 0*VEC_LEN*SIZE($20) -+ VLD $f28, 1*VEC_LEN*SIZE($20) -+ VLD $f10, 2*VEC_LEN*SIZE($20) -+ VLD $f11, 3*VEC_LEN*SIZE($20) ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 + -+/* -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ LD $f10, 2*SIZE($20) -+ LD $f11, 3*SIZE($20) -+ LD $f12, 4*SIZE($20) -+ LD $f13, 5*SIZE($20) -+ LD $f14, 6*SIZE($20) -+ LD $f15, 7*SIZE($20) -+*/ -+ addl $18, 16*SIZE, $18 -+ ble $4, $MainLoopEnd -+ .align 4 ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, $f22 ++ MUL C, $f15, $f27 + -+$MainLoop: -+/* -+ fillcs PREFETCHSIZE * SIZE($20) -+ fillcs PREFETCHSIZE * SIZE($18) -+*/ -+ fillcs PREFETCHSIZE * SIZE($20) -+ fillcs PREFETCHSIZE * SIZE($18) -+ -+/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ -+ vextf $f0, 1, $f4 -+ vextf $f0, 3, $f5 -+ vextf $f1, 0, $f6 -+ vextf $f1, 2, $f7 -+ -+ vextf $f2, 1, $f12 -+ vextf $f2, 3, $f13 -+ vextf $f3, 0, $f14 -+ vextf $f3, 2, $f15 -+ -+ vinsf $f4, $f1, 0, $f1 -+ vinsf $f5, $f1, 2, $f1 -+ vinsf $f6, $f0, 1, $f0 -+ vinsf $f7, $f0, 3, $f0 -+ -+ vinsf $f12, $f3, 0, $f3 -+ vinsf $f13, $f3, 2, $f3 -+ vinsf $f14, $f2, 1, $f2 -+ vinsf $f15, $f2, 3, $f2 -+ -+/*Compute*/ -+ VMUL $f29, $f0, $f20 -+ VMUL $f30, $f0, $f21 -+ VMUL $f29, $f2, $f22 -+ VMUL $f30, $f2, $f23 -+ -+ -+ VMAD1 $f30, $f1, $f20, $f16 -+ VMAD2 $f29, $f1, $f21, $f17 -+ VMAD1 $f30, $f3, $f22, $f18 -+ VMAD2 $f29, $f3, $f23, $f19 -+ -+ VLD $f0, 0*VEC_LEN*SIZE($18) -+ VLD $f1, 1*VEC_LEN*SIZE($18) -+ VLD $f2, 2*VEC_LEN*SIZE($18) -+ VLD $f3, 3*VEC_LEN*SIZE($18) -+ -+/*combine the real & image vector to complex vector*/ -+ vextf $f16, 1, $f24 -+ vextf $f16, 3, $f25 -+ vextf $f17, 0, $f26 -+ vextf $f17, 2, $f27 -+ -+ vextf $f18, 1, $f12 -+ vextf $f18, 3, $f13 -+ vextf $f19, 0, $f14 -+ vextf $f19, 2, $f15 -+ -+ vinsf $f24, $f17, 0, $f17 -+ addl $20, 16*SIZE, $20 -+ vinsf $f25, $f17, 2, $f17 -+ addl $18, 16*SIZE, $18 -+ -+ vinsf $f26, $f16, 1, $f16 -+ subl $4, 1, $4 -+ vinsf $f27, $f16, 3, $f16 -+ nop ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ ble I, $L13 ++ .align 4 + -+ vinsf $f12, $f19, 0, $f19 -+ vinsf $f13, $f19, 2, $f19 -+ vinsf $f14, $f18, 1, $f18 -+ vinsf $f15, $f18, 3, $f18 -+ -+ VADD $f16, $f8, $f16 -+ VLD $f8, 0*VEC_LEN*SIZE($20) -+ VADD $f17, $f28, $f17 -+ VLD $f28, 1*VEC_LEN*SIZE($20) -+ -+ VADD $f18, $f10, $f18 -+ VLD $f10, 2*VEC_LEN*SIZE($20) -+ VADD $f19, $f11, $f19 -+ VLD $f11, 3*VEC_LEN*SIZE($20) -+ -+ VST $f16, -4*VEC_LEN*SIZE($20) -+ VST $f17, -3*VEC_LEN*SIZE($20) -+ VST $f18, -2*VEC_LEN*SIZE($20) -+ VST $f19, -1*VEC_LEN*SIZE($20) -+ -+/* -+ MUL $f29, $f0, $f20 -+ fillcs 9*SIZE($18) -+ MUL $f30, $f1, $f21 ++$L12: ++ MUL C, $f16, $f21 ++ fillde (PREFETCH_SIZE) * SIZE(X) + unop ++ LD $f14, 5*SIZE(X) + -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) -+ -+ MUL $f29, $f2, $f24 ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 + unop -+ MUL $f30, $f3, $f25 -+ nop -+ -+ MUL $f30, $f2, $f26 -+ LD $f2, 2*SIZE($18) -+ MUL $f29, $f3, $f27 -+ LD $f3, 3*SIZE($18) -+ -+ ADD1 $f20, $f21, $f16 -+ MUL $f29, $f4, $f20 -+ ADD2 $f22, $f23, $f17 -+ MUL $f30, $f5, $f21 ++ ADD $f25, $f26, $f26 + -+ ADD1 $f24, $f25, $f18 ++ MUL C, $f17, $f23 ++ fillde (PREFETCH_SIZE) * SIZE(Y) + unop -+ MUL $f30, $f4, $f22 -+ LD $f4, 4*SIZE($18) ++ LD $f17, 6*SIZE(Y) + -+ ADD2 $f26, $f27, $f19 -+ addl $20, 8*SIZE, $20 -+ MUL $f29, $f5, $f23 -+ LD $f5, 5*SIZE($18) -+ -+ ADD $f16, $f8, $f16 -+ LD $f8, 0*SIZE($20) -+ MUL $f29, $f6, $f24 ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 + unop ++ SUB $f27, $f28, $f28 + -+ ADD $f17, $f28, $f17 -+ LD $f28, 1*SIZE($20) -+ MUL $f30, $f7, $f25 ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop + unop + -+ ADD $f18, $f10, $f18 -+ LD $f10, 2*SIZE($20) -+ MUL $f30, $f6, $f26 -+ LD $f6, 6*SIZE($18) -+ -+ ADD $f19, $f11, $f19 -+ LD $f11, 3*SIZE($20) -+ MUL $f29, $f7, $f27 -+ LD $f7, 7*SIZE($18) -+ -+ ST $f16,-8*SIZE($20) -+ ADD1 $f20, $f21, $f16 -+ ST $f17,-7*SIZE($20) -+ ADD2 $f22, $f23, $f17 -+ -+ ST $f18,-6*SIZE($20) -+ ADD1 $f24, $f25, $f18 -+ ST $f19,-5*SIZE($20) -+ ADD2 $f26, $f27, $f19 -+ -+ ADD $f16, $f12, $f16 -+ LD $f12, 4*SIZE($20) -+ ADD $f17, $f13, $f17 -+ LD $f13, 5*SIZE($20) -+ ADD $f18, $f14, $f18 -+ LD $f14, 6*SIZE($20) -+ ADD $f19, $f15, $f19 -+ LD $f15, 7*SIZE($20) -+ -+ ST $f16,-4*SIZE($20) -+ -+ ST $f17,-3*SIZE($20) -+ -+ -+ ST $f18,-2*SIZE($20) -+ nop -+ ST $f19,-1*SIZE($20) -+*/ -+ bgt $4, $MainLoop -+ .align 4 -+ -+$MainLoopEnd: -+ -+/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ -+ vextf $f0, 1, $f4 -+ vextf $f0, 3, $f5 -+ vextf $f1, 0, $f6 -+ vextf $f1, 2, $f7 -+ -+ vextf $f2, 1, $f12 -+ vextf $f2, 3, $f13 -+ vextf $f3, 0, $f14 -+ vextf $f3, 2, $f15 -+ -+ vinsf $f4, $f1, 0, $f1 -+ vinsf $f5, $f1, 2, $f1 -+ vinsf $f6, $f0, 1, $f0 -+ vinsf $f7, $f0, 3, $f0 -+ -+ vinsf $f12, $f3, 0, $f3 -+ vinsf $f13, $f3, 2, $f3 -+ vinsf $f14, $f2, 1, $f2 -+ vinsf $f15, $f2, 3, $f2 -+ -+ VMUL $f29, $f0, $f20 -+ VMUL $f30, $f0, $f21 -+ VMUL $f29, $f2, $f22 -+ VMUL $f30, $f2, $f23 -+ -+ VMAD1 $f30, $f1, $f20, $f16 -+ VMAD2 $f29, $f1, $f21, $f17 -+ VMAD1 $f30, $f3, $f22, $f18 -+ VMAD2 $f29, $f3, $f23, $f19 -+ -+/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ -+ vextf $f16, 1, $f24 -+ vextf $f16, 3, $f25 -+ vextf $f17, 0, $f26 -+ vextf $f17, 2, $f27 -+ -+ vextf $f18, 1, $f12 -+ vextf $f18, 3, $f13 -+ vextf $f19, 0, $f14 -+ vextf $f19, 2, $f15 -+ -+ vinsf $f24, $f17, 0, $f17 -+ vinsf $f25, $f17, 2, $f17 -+ vinsf $f26, $f16, 1, $f16 -+ vinsf $f27, $f16, 3, $f16 -+ -+ vinsf $f12, $f19, 0, $f19 -+ vinsf $f13, $f19, 2, $f19 -+ vinsf $f14, $f18, 1, $f18 -+ vinsf $f15, $f18, 3, $f18 -+ -+ VADD $f16, $f8, $f16 -+ VADD $f17, $f28, $f17 -+ VADD $f18, $f10, $f18 -+ VADD $f19, $f11, $f19 -+ -+ VST $f16, 0*VEC_LEN*SIZE($20) -+ VST $f17, 1*VEC_LEN*SIZE($20) -+ VST $f18, 2*VEC_LEN*SIZE($20) -+ VST $f19, 3*VEC_LEN*SIZE($20) -+ -+ addl $20, 16*SIZE, $20 -+ ble $5, $End -+ -+/* MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 -+ -+ MUL $f29, $f2, $f24 -+ MUL $f30, $f3, $f25 -+ MUL $f30, $f2, $f26 -+ MUL $f29, $f3, $f27 -+ -+ ADD1 $f20, $f21, $f16 -+ MUL $f29, $f4, $f20 -+ ADD2 $f22, $f23, $f17 -+ MUL $f30, $f5, $f21 -+ -+ ADD1 $f24, $f25, $f18 -+ MUL $f30, $f4, $f22 -+ ADD2 $f26, $f27, $f19 -+ MUL $f29, $f5, $f23 -+ -+ ADD $f16, $f8, $f16 -+ MUL $f29, $f6, $f24 -+ ADD $f17, $f28, $f17 -+ MUL $f30, $f7, $f25 -+ -+ ADD $f18, $f10, $f18 -+ MUL $f30, $f6, $f26 -+ ADD $f19, $f11, $f19 -+ MUL $f29, $f7, $f27 -+ -+ ST $f16, 0*SIZE($20) -+ ADD1 $f20, $f21, $f16 -+ ST $f17, 1*SIZE($20) -+ ADD2 $f22, $f23, $f17 -+ -+ ST $f18, 2*SIZE($20) -+ ADD1 $f24, $f25, $f18 -+ ST $f19, 3*SIZE($20) -+ ADD2 $f26, $f27, $f19 -+ -+ ADD $f16, $f12, $f16 -+ ADD $f17, $f13, $f17 -+ ADD $f18, $f14, $f18 -+ ADD $f19, $f15, $f19 -+ -+ ST $f16, 4*SIZE($20) -+ ST $f17, 5*SIZE($20) -+ ST $f18, 6*SIZE($20) -+ ST $f19, 7*SIZE($20) -+ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 + unop + unop -+*/ -+ .align 4 ++ LD $f19, 7*SIZE(Y) + -+$Remain: -+ subl $5, 1, $6 -+ ble $5, $End -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 + -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ addl $18, 2*SIZE, $18 -+ ble $6, $RemainLoopEnd -+ .align 4 ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop + -+$RemainLoop: -+ MUL $f29, $f0, $f20 -+ subl $6, 1, $6 -+ MUL $f30, $f1, $f21 -+ addl $20, 2*SIZE, $20 ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 + -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) -+ -+ ADD1 $f20, $f21, $f16 -+ ADD2 $f22, $f23, $f17 -+ ADD $f16, $f8, $f16 -+ LD $f8, 0*SIZE($20) -+ ADD $f17, $f28, $f17 -+ LD $f28, 1*SIZE($20) ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop + -+ ST $f16,-2*SIZE($20) -+ addl $18, 2*SIZE, $18 -+ ST $f17,-1*SIZE($20) -+ bgt $6, $RemainLoop -+ .align 4 ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 + -+$RemainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 -+ -+ ADD1 $f20, $f21, $f16 -+ ADD2 $f22, $f23, $f17 -+ ADD $f16, $f8, $f16 -+ ADD $f17, $f28, $f17 ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop + -+ ST $f16, 0*SIZE($20) -+ nop -+ ST $f17, 1*SIZE($20) -+ nop -+ .align 4 ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 + -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ ldi $sp, 64($sp) -+ ret -+ .align 4 ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop + -+$UnAlign_Y_ACCESS: -+ and $18, (VEC_LEN*SIZE-1), $7 -+ nop -+ nop -+ bgt $7, $UnAlign_XY_ACCESS -+ .align 4 -+/* -+ Unalign access Y, Align access X -+*/ -+ -+ VLD_UL $f8, 0*VEC_LEN*SIZE($20) -+ VLD_UH $f12, 1*VEC_LEN*SIZE($20) -+ -+ VLD_UL $f28, 1*VEC_LEN*SIZE($20) -+ VLD_UH $f13, 2*VEC_LEN*SIZE($20) -+ -+ VLD_UL $f10, 2*VEC_LEN*SIZE($20) -+ VLD_UH $f14, 3*VEC_LEN*SIZE($20) -+ -+ VLD_UL $f11, 3*VEC_LEN*SIZE($20) -+ VLD_UH $f15, 4*VEC_LEN*SIZE($20) -+ -+ VLD $f0, 0*VEC_LEN*SIZE($18) -+ VLD $f1, 1*VEC_LEN*SIZE($18) -+ VLD $f2, 2*VEC_LEN*SIZE($18) -+ VLD $f3, 3*VEC_LEN*SIZE($18) -+ -+ vbisw $f8, $f12, $f8 -+ vbisw $f28, $f13, $f28 -+ vbisw $f10, $f14, $f10 -+ vbisw $f11, $f15, $f11 -+ -+ addl $18, 16*SIZE, $18 -+ ble $4, $UnAlign_Y_MainLoopEnd -+ .align 4 -+$UnAlign_Y_MainLoop: -+ fillcs PREFETCHSIZE * SIZE($20) -+ fillcs PREFETCHSIZE * SIZE($18) -+ -+/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ -+ vextf $f0, 1, $f4 -+ vextf $f0, 3, $f5 -+ vextf $f1, 0, $f6 -+ vextf $f1, 2, $f7 -+ -+ vextf $f2, 1, $f12 -+ vextf $f2, 3, $f13 -+ vextf $f3, 0, $f14 -+ vextf $f3, 2, $f15 -+ -+ vinsf $f4, $f1, 0, $f1 -+ vinsf $f5, $f1, 2, $f1 -+ vinsf $f6, $f0, 1, $f0 -+ vinsf $f7, $f0, 3, $f0 -+ -+ vinsf $f12, $f3, 0, $f3 -+ vinsf $f13, $f3, 2, $f3 -+ vinsf $f14, $f2, 1, $f2 -+ vinsf $f15, $f2, 3, $f2 -+ -+/*Compute*/ -+ VMUL $f29, $f0, $f20 -+ VMUL $f30, $f0, $f21 -+ VMUL $f29, $f2, $f22 -+ VMUL $f30, $f2, $f23 -+ -+ -+ VMAD1 $f30, $f1, $f20, $f16 -+ VMAD2 $f29, $f1, $f21, $f17 -+ VMAD1 $f30, $f3, $f22, $f18 -+ VMAD2 $f29, $f3, $f23, $f19 -+ -+ VLD $f0, 0*VEC_LEN*SIZE($18) -+ VLD $f1, 1*VEC_LEN*SIZE($18) -+ VLD $f2, 2*VEC_LEN*SIZE($18) -+ VLD $f3, 3*VEC_LEN*SIZE($18) -+ -+ -+/*combine the real & image vector to complex vector*/ -+ vextf $f16, 1, $f24 -+ vextf $f16, 3, $f25 -+ vextf $f17, 0, $f26 -+ vextf $f17, 2, $f27 -+ -+ vextf $f18, 1, $f12 -+ vextf $f18, 3, $f13 -+ vextf $f19, 0, $f14 -+ vextf $f19, 2, $f15 -+ -+ vinsf $f24, $f17, 0, $f17 -+ addl $20, 16*SIZE, $20 -+ vinsf $f25, $f17, 2, $f17 -+ addl $18, 16*SIZE, $18 -+ -+ vinsf $f26, $f16, 1, $f16 -+ subl $4, 1, $4 -+ vinsf $f27, $f16, 3, $f16 -+ nop ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 + -+ vinsf $f12, $f19, 0, $f19 -+ vinsf $f13, $f19, 2, $f19 -+ vinsf $f14, $f18, 1, $f18 -+ vinsf $f15, $f18, 3, $f18 -+ -+ VADD $f16, $f8, $f16 -+ VLD_UL $f8, 0*VEC_LEN*SIZE($20) -+ VLD_UH $f12, 1*VEC_LEN*SIZE($20) -+ -+ VADD $f17, $f28, $f17 -+ VLD_UL $f28, 1*VEC_LEN*SIZE($20) -+ VLD_UH $f13, 2*VEC_LEN*SIZE($20) -+ -+ -+ VADD $f18, $f10, $f18 -+ VLD_UL $f10, 2*VEC_LEN*SIZE($20) -+ VLD_UH $f14, 3*VEC_LEN*SIZE($20) -+ -+ VADD $f19, $f11, $f19 -+ VLD_UL $f11, 3*VEC_LEN*SIZE($20) -+ VLD_UH $f15, 4*VEC_LEN*SIZE($20) -+ -+ -+ vbisw $f8, $f12, $f8 -+ VST_UL $f16, -4*VEC_LEN*SIZE($20) -+ VST_UH $f16, -3*VEC_LEN*SIZE($20) -+ -+ vbisw $f28, $f13, $f28 -+ VST_UL $f17, -3*VEC_LEN*SIZE($20) -+ VST_UH $f17, -2*VEC_LEN*SIZE($20) -+ -+ vbisw $f10, $f14, $f10 -+ VST_UL $f18, -2*VEC_LEN*SIZE($20) -+ VST_UH $f18, -1*VEC_LEN*SIZE($20) -+ -+ vbisw $f11, $f15, $f11 -+ VST_UL $f19, -1*VEC_LEN*SIZE($20) -+ VST_UH $f19, 0*VEC_LEN*SIZE($20) -+ -+ bgt $4, $UnAlign_Y_MainLoop -+ -+$UnAlign_Y_MainLoopEnd: -+/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ -+ vextf $f0, 1, $f4 -+ vextf $f0, 3, $f5 -+ vextf $f1, 0, $f6 -+ vextf $f1, 2, $f7 -+ -+ vextf $f2, 1, $f12 -+ vextf $f2, 3, $f13 -+ vextf $f3, 0, $f14 -+ vextf $f3, 2, $f15 -+ -+ vinsf $f4, $f1, 0, $f1 -+ vinsf $f5, $f1, 2, $f1 -+ vinsf $f6, $f0, 1, $f0 -+ vinsf $f7, $f0, 3, $f0 -+ -+ vinsf $f12, $f3, 0, $f3 -+ vinsf $f13, $f3, 2, $f3 -+ vinsf $f14, $f2, 1, $f2 -+ vinsf $f15, $f2, 3, $f2 -+ -+ VMUL $f29, $f0, $f20 -+ VMUL $f30, $f0, $f21 -+ VMUL $f29, $f2, $f22 -+ VMUL $f30, $f2, $f23 -+ -+ VMAD1 $f30, $f1, $f20, $f16 -+ VMAD2 $f29, $f1, $f21, $f17 -+ VMAD1 $f30, $f3, $f22, $f18 -+ VMAD2 $f29, $f3, $f23, $f19 -+ -+/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ -+ vextf $f16, 1, $f24 -+ vextf $f16, 3, $f25 -+ vextf $f17, 0, $f26 -+ vextf $f17, 2, $f27 -+ -+ vextf $f18, 1, $f12 -+ vextf $f18, 3, $f13 -+ vextf $f19, 0, $f14 -+ vextf $f19, 2, $f15 -+ -+ vinsf $f24, $f17, 0, $f17 -+ vinsf $f25, $f17, 2, $f17 -+ vinsf $f26, $f16, 1, $f16 -+ vinsf $f27, $f16, 3, $f16 -+ -+ vinsf $f12, $f19, 0, $f19 -+ vinsf $f13, $f19, 2, $f19 -+ vinsf $f14, $f18, 1, $f18 -+ vinsf $f15, $f18, 3, $f18 -+ -+ VADD $f16, $f8, $f16 -+ VADD $f17, $f28, $f17 -+ VADD $f18, $f10, $f18 -+ VADD $f19, $f11, $f19 -+ -+ VST_UL $f16, 0*VEC_LEN*SIZE($20) -+ VST_UH $f16, 1*VEC_LEN*SIZE($20) -+ VST_UL $f17, 1*VEC_LEN*SIZE($20) -+ VST_UH $f17, 2*VEC_LEN*SIZE($20) -+ -+ VST_UL $f18, 2*VEC_LEN*SIZE($20) -+ VST_UH $f18, 3*VEC_LEN*SIZE($20) -+ VST_UL $f19, 3*VEC_LEN*SIZE($20) -+ VST_UH $f19, 4*VEC_LEN*SIZE($20) -+ -+ addl $20, 16*SIZE, $20 -+ ble $5, $End -+ -+ jmp $Remain ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop + -+ .align 4 ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 + -+ -+$UnAlign_X_ACCESS: -+ and $20, (VEC_LEN*SIZE-1), $6 -+ nop -+ nop -+ bgt $6, $UnAlign_XY_ACCESS -+ -+ .align 4 -+/* -+ Unalign access X, Align access Y -+*/ -+ VLD_UL $f0, 0*VEC_LEN*SIZE($18) -+ VLD_UH $f4, 1*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f1, 1*VEC_LEN*SIZE($18) -+ VLD_UH $f5, 2*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f2, 2*VEC_LEN*SIZE($18) -+ VLD_UH $f6, 3*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f3, 3*VEC_LEN*SIZE($18) -+ VLD_UH $f7, 4*VEC_LEN*SIZE($18) -+ -+ VLD $f8, 0*VEC_LEN*SIZE($20) -+ VLD $f28, 1*VEC_LEN*SIZE($20) -+ VLD $f10, 2*VEC_LEN*SIZE($20) -+ VLD $f11, 3*VEC_LEN*SIZE($20) -+ -+ vbisw $f0, $f4, $f0 -+ vbisw $f1, $f5, $f1 -+ vbisw $f2, $f6, $f2 -+ vbisw $f3, $f7, $f3 -+ -+ addl $18, 16*SIZE, $18 -+ ble $4, $UnAlign_X_MainLoopEnd -+ .align 4 -+$UnAlign_X_MainLoop: -+ fillcs PREFETCHSIZE * SIZE($20) -+ fillcs PREFETCHSIZE * SIZE($18) -+ -+/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ -+ vextf $f0, 1, $f4 -+ vextf $f0, 3, $f5 -+ vextf $f1, 0, $f6 -+ vextf $f1, 2, $f7 -+ -+ vextf $f2, 1, $f12 -+ vextf $f2, 3, $f13 -+ vextf $f3, 0, $f14 -+ vextf $f3, 2, $f15 -+ -+ vinsf $f4, $f1, 0, $f1 -+ vinsf $f5, $f1, 2, $f1 -+ vinsf $f6, $f0, 1, $f0 -+ vinsf $f7, $f0, 3, $f0 -+ -+ vinsf $f12, $f3, 0, $f3 -+ vinsf $f13, $f3, 2, $f3 -+ vinsf $f14, $f2, 1, $f2 -+ vinsf $f15, $f2, 3, $f2 -+ -+/*Compute*/ -+ VMUL $f29, $f0, $f20 -+ VMUL $f30, $f0, $f21 -+ VMUL $f29, $f2, $f22 -+ VMUL $f30, $f2, $f23 -+ -+ -+ VMAD1 $f30, $f1, $f20, $f16 -+ VMAD2 $f29, $f1, $f21, $f17 -+ VMAD1 $f30, $f3, $f22, $f18 -+ VMAD2 $f29, $f3, $f23, $f19 -+/* -+ VLD $f0, 0*VEC_LEN*SIZE($18) -+ VLD $f1, 1*VEC_LEN*SIZE($18) -+ VLD $f2, 2*VEC_LEN*SIZE($18) -+ VLD $f3, 3*VEC_LEN*SIZE($18) -+*/ -+ VLD_UL $f0, 0*VEC_LEN*SIZE($18) -+ VLD_UH $f4, 1*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f1, 1*VEC_LEN*SIZE($18) -+ VLD_UH $f5, 2*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f2, 2*VEC_LEN*SIZE($18) -+ VLD_UH $f6, 3*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f3, 3*VEC_LEN*SIZE($18) -+ VLD_UH $f7, 4*VEC_LEN*SIZE($18) -+ -+/*combine the real & image vector to complex vector*/ -+ vextf $f16, 1, $f24 -+ vextf $f16, 3, $f25 -+ vextf $f17, 0, $f26 -+ vextf $f17, 2, $f27 -+ -+ vextf $f18, 1, $f12 -+ vextf $f18, 3, $f13 -+ vextf $f19, 0, $f14 -+ vextf $f19, 2, $f15 -+ -+ vbisw $f0, $f4, $f0 -+ vbisw $f1, $f5, $f1 -+ vbisw $f2, $f6, $f2 -+ vbisw $f3, $f7, $f3 -+ -+ vinsf $f24, $f17, 0, $f17 -+ addl $20, 16*SIZE, $20 -+ vinsf $f25, $f17, 2, $f17 -+ addl $18, 16*SIZE, $18 -+ -+ vinsf $f26, $f16, 1, $f16 -+ subl $4, 1, $4 -+ vinsf $f27, $f16, 3, $f16 -+ nop ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop + -+ vinsf $f12, $f19, 0, $f19 -+ vinsf $f13, $f19, 2, $f19 -+ vinsf $f14, $f18, 1, $f18 -+ vinsf $f15, $f18, 3, $f18 -+ -+ VADD $f16, $f8, $f16 -+ VLD $f8, 0*VEC_LEN*SIZE($20) -+ VADD $f17, $f28, $f17 -+ VLD $f28, 1*VEC_LEN*SIZE($20) -+ -+ VADD $f18, $f10, $f18 -+ VLD $f10, 2*VEC_LEN*SIZE($20) -+ VADD $f19, $f11, $f19 -+ VLD $f11, 3*VEC_LEN*SIZE($20) -+ -+ VST $f16, -4*VEC_LEN*SIZE($20) -+ VST $f17, -3*VEC_LEN*SIZE($20) -+ VST $f18, -2*VEC_LEN*SIZE($20) -+ VST $f19, -1*VEC_LEN*SIZE($20) -+ -+ bgt $4, $UnAlign_X_MainLoop -+ .align 4 -+ -+$UnAlign_X_MainLoopEnd: -+/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ -+ vextf $f0, 1, $f4 -+ vextf $f0, 3, $f5 -+ vextf $f1, 0, $f6 -+ vextf $f1, 2, $f7 -+ -+ vextf $f2, 1, $f12 -+ vextf $f2, 3, $f13 -+ vextf $f3, 0, $f14 -+ vextf $f3, 2, $f15 -+ -+ vinsf $f4, $f1, 0, $f1 -+ vinsf $f5, $f1, 2, $f1 -+ vinsf $f6, $f0, 1, $f0 -+ vinsf $f7, $f0, 3, $f0 -+ -+ vinsf $f12, $f3, 0, $f3 -+ vinsf $f13, $f3, 2, $f3 -+ vinsf $f14, $f2, 1, $f2 -+ vinsf $f15, $f2, 3, $f2 -+ -+ VMUL $f29, $f0, $f20 -+ VMUL $f30, $f0, $f21 -+ VMUL $f29, $f2, $f22 -+ VMUL $f30, $f2, $f23 -+ -+ VMAD1 $f30, $f1, $f20, $f16 -+ VMAD2 $f29, $f1, $f21, $f17 -+ VMAD1 $f30, $f3, $f22, $f18 -+ VMAD2 $f29, $f3, $f23, $f19 -+ -+/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ -+ vextf $f16, 1, $f24 -+ vextf $f16, 3, $f25 -+ vextf $f17, 0, $f26 -+ vextf $f17, 2, $f27 -+ -+ vextf $f18, 1, $f12 -+ vextf $f18, 3, $f13 -+ vextf $f19, 0, $f14 -+ vextf $f19, 2, $f15 -+ -+ vinsf $f24, $f17, 0, $f17 -+ vinsf $f25, $f17, 2, $f17 -+ vinsf $f26, $f16, 1, $f16 -+ vinsf $f27, $f16, 3, $f16 -+ -+ vinsf $f12, $f19, 0, $f19 -+ vinsf $f13, $f19, 2, $f19 -+ vinsf $f14, $f18, 1, $f18 -+ vinsf $f15, $f18, 3, $f18 -+ -+ VADD $f16, $f8, $f16 -+ VADD $f17, $f28, $f17 -+ VADD $f18, $f10, $f18 -+ VADD $f19, $f11, $f19 -+ -+ VST $f16, 0*VEC_LEN*SIZE($20) -+ VST $f17, 1*VEC_LEN*SIZE($20) -+ VST $f18, 2*VEC_LEN*SIZE($20) -+ VST $f19, 3*VEC_LEN*SIZE($20) -+ -+ addl $20, 16*SIZE, $20 -+ ble $5, $End ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 + -+ jmp $Remain -+ .align 4 -+ -+$UnAlign_XY_ACCESS: -+/* -+ Unalign access X & Y -+*/ -+ VLD_UL $f0, 0*VEC_LEN*SIZE($18) -+ VLD_UH $f4, 1*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f1, 1*VEC_LEN*SIZE($18) -+ VLD_UH $f5, 2*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f2, 2*VEC_LEN*SIZE($18) -+ VLD_UH $f6, 3*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f3, 3*VEC_LEN*SIZE($18) -+ VLD_UH $f7, 4*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f8, 0*VEC_LEN*SIZE($20) -+ VLD_UH $f12, 1*VEC_LEN*SIZE($20) -+ -+ VLD_UL $f28, 1*VEC_LEN*SIZE($20) -+ VLD_UH $f13, 2*VEC_LEN*SIZE($20) -+ -+ VLD_UL $f10, 2*VEC_LEN*SIZE($20) -+ VLD_UH $f14, 3*VEC_LEN*SIZE($20) -+ -+ VLD_UL $f11, 3*VEC_LEN*SIZE($20) -+ VLD_UH $f15, 4*VEC_LEN*SIZE($20) -+ -+ vbisw $f0, $f4, $f0 -+ vbisw $f1, $f5, $f1 -+ vbisw $f2, $f6, $f2 -+ vbisw $f3, $f7, $f3 -+ -+ vbisw $f8, $f12, $f8 -+ vbisw $f28, $f13, $f28 -+ vbisw $f10, $f14, $f10 -+ vbisw $f11, $f15, $f11 -+ -+ addl $18, 16*SIZE, $18 -+ ble $4, $UnAlign_MainLoopEnd -+ .align 4 -+ -+$UnAlign_MainLoop: -+ fillcs PREFETCHSIZE * SIZE($20) -+ fillcs PREFETCHSIZE * SIZE($18) -+ -+/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ -+ vextf $f0, 1, $f4 -+ vextf $f0, 3, $f5 -+ vextf $f1, 0, $f6 -+ vextf $f1, 2, $f7 -+ -+ vextf $f2, 1, $f12 -+ vextf $f2, 3, $f13 -+ vextf $f3, 0, $f14 -+ vextf $f3, 2, $f15 -+ -+ vinsf $f4, $f1, 0, $f1 -+ vinsf $f5, $f1, 2, $f1 -+ vinsf $f6, $f0, 1, $f0 -+ vinsf $f7, $f0, 3, $f0 -+ -+ vinsf $f12, $f3, 0, $f3 -+ vinsf $f13, $f3, 2, $f3 -+ vinsf $f14, $f2, 1, $f2 -+ vinsf $f15, $f2, 3, $f2 -+ -+/*Compute*/ -+ VMUL $f29, $f0, $f20 -+ VMUL $f30, $f0, $f21 -+ VMUL $f29, $f2, $f22 -+ VMUL $f30, $f2, $f23 -+ -+ -+ VMAD1 $f30, $f1, $f20, $f16 -+ VMAD2 $f29, $f1, $f21, $f17 -+ VMAD1 $f30, $f3, $f22, $f18 -+ VMAD2 $f29, $f3, $f23, $f19 -+/* -+ VLD $f0, 0*VEC_LEN*SIZE($18) -+ VLD $f1, 1*VEC_LEN*SIZE($18) -+ VLD $f2, 2*VEC_LEN*SIZE($18) -+ VLD $f3, 3*VEC_LEN*SIZE($18) -+*/ -+ VLD_UL $f0, 0*VEC_LEN*SIZE($18) -+ VLD_UH $f4, 1*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f1, 1*VEC_LEN*SIZE($18) -+ VLD_UH $f5, 2*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f2, 2*VEC_LEN*SIZE($18) -+ VLD_UH $f6, 3*VEC_LEN*SIZE($18) -+ -+ VLD_UL $f3, 3*VEC_LEN*SIZE($18) -+ VLD_UH $f7, 4*VEC_LEN*SIZE($18) -+ -+/*combine the real & image vector to complex vector*/ -+ vextf $f16, 1, $f24 -+ vextf $f16, 3, $f25 -+ vextf $f17, 0, $f26 -+ vextf $f17, 2, $f27 -+ -+ vextf $f18, 1, $f12 -+ vextf $f18, 3, $f13 -+ vextf $f19, 0, $f14 -+ vextf $f19, 2, $f15 -+ -+ vbisw $f0, $f4, $f0 -+ vbisw $f1, $f5, $f1 -+ vbisw $f2, $f6, $f2 -+ vbisw $f3, $f7, $f3 -+ -+ vinsf $f24, $f17, 0, $f17 -+ addl $20, 16*SIZE, $20 -+ vinsf $f25, $f17, 2, $f17 -+ addl $18, 16*SIZE, $18 -+ -+ vinsf $f26, $f16, 1, $f16 -+ subl $4, 1, $4 -+ vinsf $f27, $f16, 3, $f16 -+ nop ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop + -+ vinsf $f12, $f19, 0, $f19 -+ vinsf $f13, $f19, 2, $f19 -+ vinsf $f14, $f18, 1, $f18 -+ vinsf $f15, $f18, 3, $f18 -+ -+ VADD $f16, $f8, $f16 -+ VLD_UL $f8, 0*VEC_LEN*SIZE($20) -+ VLD_UH $f12, 1*VEC_LEN*SIZE($20) -+ -+ VADD $f17, $f28, $f17 -+ VLD_UL $f28, 1*VEC_LEN*SIZE($20) -+ VLD_UH $f13, 2*VEC_LEN*SIZE($20) -+ -+ -+ VADD $f18, $f10, $f18 -+ VLD_UL $f10, 2*VEC_LEN*SIZE($20) -+ VLD_UH $f14, 3*VEC_LEN*SIZE($20) -+ -+ VADD $f19, $f11, $f19 -+ VLD_UL $f11, 3*VEC_LEN*SIZE($20) -+ VLD_UH $f15, 4*VEC_LEN*SIZE($20) -+ -+/* -+ VST $f16, -4*VEC_LEN*SIZE($20) -+ VST $f17, -3*VEC_LEN*SIZE($20) -+ VST $f18, -2*VEC_LEN*SIZE($20) -+ VST $f19, -1*VEC_LEN*SIZE($20) -+*/ -+ -+ vbisw $f8, $f12, $f8 -+ VST_UL $f16, -4*VEC_LEN*SIZE($20) -+ VST_UH $f16, -3*VEC_LEN*SIZE($20) -+ -+ vbisw $f28, $f13, $f28 -+ VST_UL $f17, -3*VEC_LEN*SIZE($20) -+ VST_UH $f17, -2*VEC_LEN*SIZE($20) -+ -+ vbisw $f10, $f14, $f10 -+ VST_UL $f18, -2*VEC_LEN*SIZE($20) -+ VST_UH $f18, -1*VEC_LEN*SIZE($20) -+ -+ vbisw $f11, $f15, $f11 -+ VST_UL $f19, -1*VEC_LEN*SIZE($20) -+ VST_UH $f19, 0*VEC_LEN*SIZE($20) -+ -+ bgt $4, $UnAlign_MainLoop -+ .align 4 -+ -+$UnAlign_MainLoopEnd: -+ -+/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ -+ vextf $f0, 1, $f4 -+ vextf $f0, 3, $f5 -+ vextf $f1, 0, $f6 -+ vextf $f1, 2, $f7 -+ -+ vextf $f2, 1, $f12 -+ vextf $f2, 3, $f13 -+ vextf $f3, 0, $f14 -+ vextf $f3, 2, $f15 -+ -+ vinsf $f4, $f1, 0, $f1 -+ vinsf $f5, $f1, 2, $f1 -+ vinsf $f6, $f0, 1, $f0 -+ vinsf $f7, $f0, 3, $f0 -+ -+ vinsf $f12, $f3, 0, $f3 -+ vinsf $f13, $f3, 2, $f3 -+ vinsf $f14, $f2, 1, $f2 -+ vinsf $f15, $f2, 3, $f2 -+ -+ VMUL $f29, $f0, $f20 -+ VMUL $f30, $f0, $f21 -+ VMUL $f29, $f2, $f22 -+ VMUL $f30, $f2, $f23 -+ -+ VMAD1 $f30, $f1, $f20, $f16 -+ VMAD2 $f29, $f1, $f21, $f17 -+ VMAD1 $f30, $f3, $f22, $f18 -+ VMAD2 $f29, $f3, $f23, $f19 -+ -+/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ -+ vextf $f16, 1, $f24 -+ vextf $f16, 3, $f25 -+ vextf $f17, 0, $f26 -+ vextf $f17, 2, $f27 -+ -+ vextf $f18, 1, $f12 -+ vextf $f18, 3, $f13 -+ vextf $f19, 0, $f14 -+ vextf $f19, 2, $f15 -+ -+ vinsf $f24, $f17, 0, $f17 -+ vinsf $f25, $f17, 2, $f17 -+ vinsf $f26, $f16, 1, $f16 -+ vinsf $f27, $f16, 3, $f16 -+ -+ vinsf $f12, $f19, 0, $f19 -+ vinsf $f13, $f19, 2, $f19 -+ vinsf $f14, $f18, 1, $f18 -+ vinsf $f15, $f18, 3, $f18 -+ -+ VADD $f16, $f8, $f16 -+ VADD $f17, $f28, $f17 -+ VADD $f18, $f10, $f18 -+ VADD $f19, $f11, $f19 -+ -+ VST_UL $f16, 0*VEC_LEN*SIZE($20) -+ VST_UH $f16, 1*VEC_LEN*SIZE($20) -+ VST_UL $f17, 1*VEC_LEN*SIZE($20) -+ VST_UH $f17, 2*VEC_LEN*SIZE($20) -+ -+ VST_UL $f18, 2*VEC_LEN*SIZE($20) -+ VST_UH $f18, 3*VEC_LEN*SIZE($20) -+ VST_UL $f19, 3*VEC_LEN*SIZE($20) -+ VST_UH $f19, 4*VEC_LEN*SIZE($20) -+ -+ addl $20, 16*SIZE, $20 -+ ble $5, $End -+ -+ jmp $Remain -+ .align 4 -+/*Unloop 4 complex = 8 float/double*/ -+$Sub: -+ sra $16, 2, $4 -+ and $16, 3, $5 -+ SXSUBL $16, SIZE, $22 -+ addl $22, $22, $22 # Complex -+ .align 4 ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 + -+ addl $19, $19, $19 # Complex -+ addl $21, $21, $21 # Complex ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop + -+ ble $4, $SubRemain -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) -+ SXADDQ $19, $18, $18 ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, $f24 + -+ LD $f2, 0*SIZE($18) -+ LD $f3, 1*SIZE($18) -+ SXADDQ $19, $18, $18 ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop + -+ LD $f4, 0*SIZE($18) -+ LD $f5, 1*SIZE($18) -+ SXADDQ $19, $18, $18 ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, $f26 + -+ LD $f6, 0*SIZE($18) -+ LD $f7, 1*SIZE($18) -+ SXADDQ $19, $18, $18 ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop + -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ SXADDQ $21, $20, $24 ++ ST $f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 + -+ LD $f10, 0*SIZE($24) -+ LD $f11, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop + -+ LD $f12, 0*SIZE($24) -+ LD $f13, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ ST $f26, -1*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 + -+ LD $f14, 0*SIZE($24) -+ LD $f15, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) ++ unop ++ unop + -+ subl $4, 1, $4 -+ ble $4, $SubMainLoopEnd ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ bgt I, $L12 + .align 4 + -+$SubMainLoop: -+ MUL $f29, $f0, $f20 ++$L13: ++ MUL C, $f16, $f21 ++ LD $f14, 5*SIZE(X) + unop -+ MUL $f30, $f1, $f21 + unop + -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 + -+ MUL $f29, $f2, $f24 -+ SXADDQ $19, $18, $18 -+ MUL $f30, $f3, $f25 ++ MUL C, $f17, $f23 ++ unop + unop ++ LD $f17, 6*SIZE(Y) + -+ MUL $f30, $f2, $f26 -+ LD $f2, 0*SIZE($18) -+ MUL $f29, $f3, $f27 -+ LD $f3, 1*SIZE($18) ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, $f28 + -+ ADD1 $f20, $f21, $f16 -+ SXADDQ $19, $18, $18 -+ MUL $f29, $f4, $f20 ++ MUL C, $f18, $f25 ++ unop ++ unop + unop + -+ ADD2 $f22, $f23, $f17 ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 + unop -+ MUL $f30, $f5, $f21 ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 + unop ++ unop ++ LD $f19, 7*SIZE(Y) + -+ ADD1 $f24, $f25, $f18 ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ LD $f18, 7*SIZE(X) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ unop ++ unop + unop -+ MUL $f30, $f4, $f22 -+ LD $f4, 0*SIZE($18) + -+ ADD2 $f26, $f27, $f19 ++ ST $f22, 2*SIZE(X) + unop -+ MUL $f29, $f5, $f23 -+ LD $f5, 1*SIZE($18) -+ -+ ADD $f16, $f8, $f16 -+ LD $f8, 0*SIZE($24) -+ MUL $f29, $f6, $f24 -+ SXADDQ $19, $18, $18 ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 + -+ ADD $f17, $f28, $f17 -+ LD $f28, 1*SIZE($24) -+ MUL $f30, $f7, $f25 -+ SXADDQ $21, $24, $24 ++ MUL C, $f13, $f23 ++ unop ++ unop ++ unop + -+ ADD $f18, $f10, $f18 -+ LD $f10, 0*SIZE($24) -+ MUL $f30, $f6, $f26 -+ LD $f6, 0*SIZE($18) ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 + -+ ADD $f19, $f11, $f19 -+ LD $f11, 1*SIZE($24) -+ MUL $f29, $f7, $f27 -+ LD $f7, 1*SIZE($18) ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop + -+ ST $f16, 0*SIZE($20) -+ SXADDQ $19, $18, $18 -+ ADD1 $f20, $f21, $f16 ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 + unop ++ ADD $f21, $f22, $f22 + -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ ADD2 $f22, $f23, $f17 ++ MUL C, $f15, $f27 ++ unop ++ unop + unop + -+ ST $f18, 0*SIZE($20) -+ SXADDQ $21, $24, $24 -+ ADD1 $f24, $f25, $f18 ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 + unop ++ SUB $f23, $f24, $f24 + -+ ST $f19, 1*SIZE($20) ++ MUL C, $f16, $f21 ++ unop ++ unop + unop -+ ADD2 $f26, $f27, $f19 -+ SXADDQ $21, $20, $20 + -+ ADD $f16, $f12, $f16 ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop + unop -+ LD $f12, 0*SIZE($24) + unop + -+ ADD $f17, $f13, $f17 ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 + unop -+ LD $f13, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ SUB $f27, $f28, $f28 + -+ ADD $f18, $f14, $f18 -+ subl $4, 1, $4 -+ LD $f14, 0*SIZE($24) ++ MUL C, $f18, $f25 ++ unop ++ unop + unop + -+ ADD $f19, $f15, $f19 ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 + unop -+ LD $f15, 1*SIZE($24) -+ SXADDQ $21, $24, $24 ++ ADD $f21, $f22, $f22 + -+ ST $f16, 0*SIZE($20) -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 ++ MUL C, $f19, $f27 ++ unop ++ unop + unop + -+ ST $f18, 0*SIZE($20) -+ ST $f19, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ bgt $4, $SubMainLoop ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, $f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) + .align 4 + -+$SubMainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 -+ -+ MUL $f29, $f2, $f24 -+ MUL $f30, $f3, $f25 -+ MUL $f30, $f2, $f26 -+ MUL $f29, $f3, $f27 + -+ ADD1 $f20, $f21, $f16 -+ MUL $f29, $f4, $f20 -+ ADD2 $f22, $f23, $f17 -+ MUL $f30, $f5, $f21 ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ST $f26, 0*SIZE(Y) ++ ldi Y, 1 * SIZE(Y) ++ ++ bgt I, $L16 ++ .align 4 ++ ++$L998: ++ clr $0 ++ ret ++ .align 4 ++ ++$L50: ++ mov X, XX ++ mov Y, YY ++ ++ sra N, 3, I ++ ble I, $L55 ++ .align 4 ++ ++$L51: ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y + -+ ADD1 $f24, $f25, $f18 -+ MUL $f30, $f4, $f22 -+ ADD2 $f26, $f27, $f19 -+ MUL $f29, $f5, $f23 -+ -+ ADD $f16, $f8, $f16 -+ MUL $f29, $f6, $f24 -+ ADD $f17, $f28, $f17 -+ MUL $f30, $f7, $f25 ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y + -+ ADD $f18, $f10, $f18 -+ MUL $f30, $f6, $f26 -+ ADD $f19, $f11, $f19 -+ MUL $f29, $f7, $f27 ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 + -+ ST $f16, 0*SIZE($20) -+ ADD1 $f20, $f21, $f16 -+ ST $f17, 1*SIZE($20) -+ ADD2 $f22, $f23, $f17 ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 + -+ SXADDQ $21, $20, $20 -+ nop -+ ST $f18, 0*SIZE($20) -+ ADD1 $f24, $f25, $f18 ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY + -+ ST $f19, 1*SIZE($20) -+ ADD2 $f26, $f27, $f19 -+ SXADDQ $21, $20, $20 -+ ADD $f16, $f12, $f16 ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 + -+ ADD $f17, $f13, $f17 -+ ADD $f18, $f14, $f18 -+ ADD $f19, $f15, $f19 ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 + -+ ST $f16, 0*SIZE($20) -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY + -+ ST $f18, 0*SIZE($20) -+ ST $f19, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ ble $5, $SubEnd -+ .align 4 ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 + -+$SubRemain: -+ subl $5, 1, $6 -+ ble $5, $SubEnd -+ LD $f0, 0*SIZE($18) -+ LD $f1, 1*SIZE($18) ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 + -+ LD $f8, 0*SIZE($20) -+ LD $f28, 1*SIZE($20) -+ SXADDQ $19, $18, $18 -+ SXADDQ $21, $20, $24 -+ ble $6, $SubRemainLoopEnd -+ .align 4 ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY + -+$SubRemainLoop: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ LD $f0, 0*SIZE($18) ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 + -+ MUL $f29, $f1, $f23 -+ LD $f1, 1*SIZE($18) -+ ADD1 $f20, $f21, $f16 -+ SXADDQ $19, $18, $18 ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 + -+ ADD2 $f22, $f23, $f17 -+ nop -+ ADD $f16, $f8, $f16 -+ LD $f8, 0*SIZE($24) ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY + -+ ADD $f17, $f28, $f17 -+ LD $f28, 1*SIZE($24) -+ SXADDQ $21, $24, $24 -+ subl $6, 1, $6 ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 + -+ ST $f16, 0*SIZE($20) -+ ST $f17, 1*SIZE($20) -+ SXADDQ $21, $20, $20 -+ bgt $6, $SubRemainLoop ++$L55: ++ and N, 7, I ++ ble I, $L999 + .align 4 + -+$SubRemainLoopEnd: -+ MUL $f29, $f0, $f20 -+ MUL $f30, $f1, $f21 -+ MUL $f30, $f0, $f22 -+ MUL $f29, $f1, $f23 -+ -+ ADD1 $f20, $f21, $f16 -+ ADD2 $f22, $f23, $f17 -+ ADD $f16, $f8, $f16 -+ ADD $f17, $f28, $f17 ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) + -+ ST $f16, 0*SIZE($20) -+ nop -+ ST $f17, 1*SIZE($20) -+ nop ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ ST $f26, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 + .align 4 + -+$SubEnd: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ ldi $sp, 64($sp) ++$L999: ++ clr $0 + ret + EPILOGUE -diff --git a/kernel/sw_64/zdot.S b/kernel/sw_64/zdot.S +diff --git a/kernel/sw_64/scal.S b/kernel/sw_64/scal.S new file mode 100644 -index 0000000..114a7a3 +index 000000000..39ab08810 --- /dev/null -+++ b/kernel/sw_64/zdot.S -@@ -0,0 +1,583 @@ ++++ b/kernel/sw_64/scal.S +@@ -0,0 +1,693 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -67105,555 +11180,665 @@ index 0000000..114a7a3 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" ++ + +#define PREFETCHSIZE 88 + +#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 -+#define XX $21 -+#define YY $23 ++#define X $20 ++#define INCX $21 + -+#define I $5 ++#define XX $18 ++#define I $19 ++ ++#define ALPHA $f19 + +#define s0 $f0 +#define s1 $f1 -+#define s2 $f2 -+#define s3 $f30 -+#define s4 $f3 -+ -+#define a0 $f10 -+#define a1 $f11 -+#define a2 $f12 -+#define a3 $f13 -+#define a4 $f14 -+#define a5 $f15 -+#define a6 $f16 -+#define a7 $f17 ++#define s2 $f10 ++#define s3 $f11 + -+#define b0 $f18 -+#define b1 $f19 -+#define b2 $f20 -+#define b3 $f21 -+#define b4 $f22 -+#define b5 $f23 -+#define b6 $f24 -+#define b7 $f25 ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 + -+#define t0 $f26 -+#define t1 $f27 -+#define t2 $f28 -+#define t3 $f29 ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 + + PROLOGUE + PROFCODE -+ .frame $sp, 24, $26, 0 -+ -+ ldi $sp, -24($sp) -+ fclr s0 -+ fstd $f2, 0($sp) -+ fstd $f3, 16($sp) -+ fclr s1 + -+ fclr s2 -+ addl INCX, INCX, INCX -+ fclr s3 ++ mov X, XX + ble N, $L999 + -+ addl INCY, INCY, INCY -+ fclr t0 -+ fclr t1 -+ fclr t2 -+ fclr t3 ++ ldl $0, 24($sp) ++ bne $0, $L11 ++ ++ fbne ALPHA, $L11 ++ cmpeq INCX, 1, $0 ++ beq $0, $L020 ++ ++#ifndef DOUBLE ++ sra N, 4, I ++ ble I, $L015 ++ ++ fmov $f31, t0 ++ fmov $f31, t1 ++ fmov $f31, t2 ++ fmov $f31, t3 ++ ++ ST t0, 0 * SIZE(X) ++ ST t1, 1 * SIZE(X) ++ ST t2, 2 * SIZE(X) ++ ST t3, 3 * SIZE(X) ++ ++ ST t0, 4 * SIZE(X) ++ ST t1, 5 * SIZE(X) ++ ST t2, 6 * SIZE(X) ++ ST t3, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L013 ++ .align 4 ++ ++$L012: ++ ST t0, 8 * SIZE(X) ++ ST t1, 9 * SIZE(X) ++ ST t2, 10 * SIZE(X) ++ ST t3, 11 * SIZE(X) ++ ++ ST t0, 12 * SIZE(X) ++ ST t1, 13 * SIZE(X) ++ ST t2, 14 * SIZE(X) ++ ST t3, 15 * SIZE(X) ++ ++ ST t0, 16 * SIZE(X) ++ ST t1, 17 * SIZE(X) ++ ST t2, 18 * SIZE(X) ++ ST t3, 19 * SIZE(X) ++ ++ ST t0, 20 * SIZE(X) ++ ST t1, 21 * SIZE(X) ++ ST t2, 22 * SIZE(X) ++ ST t3, 23 * SIZE(X) ++ ++ fillde PREFETCHSIZE * SIZE(X) ++ ldi I, -1(I) ++ addl X, 16 * SIZE, X ++ bne I, $L012 ++ .align 4 ++ ++$L013: ++ ST t0, 8 * SIZE(X) ++ ST t1, 9 * SIZE(X) ++ ST t2, 10 * SIZE(X) ++ ST t3, 11 * SIZE(X) ++ ++ ST t0, 12 * SIZE(X) ++ ST t1, 13 * SIZE(X) ++ ST t2, 14 * SIZE(X) ++ ST t3, 15 * SIZE(X) ++ ++ addl X, 16 * SIZE, X ++ .align 4 ++ ++$L015: ++ and N, 15, I ++ ++#else ++ sra N, 3, I ++ ble I, $L015 ++ ++ fmov $f31, t0 ++ fmov $f31, t1 ++ fmov $f31, t2 ++ fmov $f31, t3 ++ ++ ldi I, -1(I) ++ ble I, $L013 ++ .align 4 ++ ++$L012: ++ ST t0, 0 * SIZE(X) ++ ST t1, 1 * SIZE(X) ++ ST t2, 2 * SIZE(X) ++ ST t3, 3 * SIZE(X) ++ ++ ldi I, -1(I) ++ addl X, 8 * SIZE, X ++ ++ ST t0, -4 * SIZE(X) ++ ST t1, -3 * SIZE(X) ++ ST t2, -2 * SIZE(X) ++ ST t3, -1 * SIZE(X) ++ ++ fillde PREFETCHSIZE * SIZE(X) ++ bne I, $L012 ++ .align 4 ++ ++$L013: ++ ST t0, 0 * SIZE(X) ++ ST t1, 1 * SIZE(X) ++ ST t2, 2 * SIZE(X) ++ ST t3, 3 * SIZE(X) ++ ++ ST t0, 4 * SIZE(X) ++ ST t1, 5 * SIZE(X) ++ ST t2, 6 * SIZE(X) ++ ST t3, 7 * SIZE(X) ++ ++ addl X, 8 * SIZE, X ++ .align 4 ++ ++$L015: ++ and N, 7, I ++ ++#endif ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L017: ++ ST $f31, 0 * SIZE(X) ++ addl X, SIZE, X ++ ++ ldi I, -1(I) ++ bne I, $L017 ++ ret ++ .align 4 ++ ++$L020: ++ sra N, 3, I ++ ble I, $L025 ++ ++ fmov $f31, t0 ++ fmov $f31, t1 ++ fmov $f31, t2 ++ fmov $f31, t3 ++ ++ ldi I, -1(I) ++ ble I, $L023 ++ .align 4 ++ ++$L022: ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L022 ++ .align 4 ++ ++$L023: ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ .align 4 ++ ++$L025: ++ and N, 7, I ++ ble I, $L999 ++ .align 4 ++ ++$L027: ++ ST $f31, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L027 ++ ret ++ .align 4 ++ ++$L11: ++ cmpeq INCX, 1, $0 ++ beq $0, $L20 ++ ++#ifndef DOUBLE ++ sra N, 4, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ LD a4, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ LD a5, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ LD a6, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ LD a7, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 8 * SIZE(X) ++ LD a1, 9 * SIZE(X) ++ LD a2, 10 * SIZE(X) ++ LD a3, 11 * SIZE(X) ++ ++ ST t0, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 12 * SIZE(X) ++ LD a5, 13 * SIZE(X) ++ LD a6, 14 * SIZE(X) ++ LD a7, 15 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t0, 8 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 9 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 10 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 11 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 16 * SIZE(X) ++ LD a1, 17 * SIZE(X) ++ LD a2, 18 * SIZE(X) ++ LD a3, 19 * SIZE(X) ++ ++ ST t0, 12 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 13 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 14 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 15 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 20 * SIZE(X) ++ LD a5, 21 * SIZE(X) ++ LD a6, 22 * SIZE(X) ++ LD a7, 23 * SIZE(X) ++ ++ ST t0, 16 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 17 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 18 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 19 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 24 * SIZE(X) ++ LD a1, 25 * SIZE(X) ++ LD a2, 26 * SIZE(X) ++ LD a3, 27 * SIZE(X) ++ ++ ST t0, 20 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 21 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 22 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 23 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 28 * SIZE(X) ++ LD a5, 29 * SIZE(X) ++ LD a6, 30 * SIZE(X) ++ LD a7, 31 * SIZE(X) ++ ++ fillde PREFETCHSIZE * SIZE(X) ++ ldi I, -1(I) ++ addl X, 16 * SIZE, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ST t0, 8 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 9 * SIZE(X) ++ MUL a5, ALPHA, t1 + -+ srl N, 3, I -+ ble I, $L25 ++ ST t2, 10 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 11 * SIZE(X) ++ MUL a7, ALPHA, t3 + -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) ++ ST t0, 12 * SIZE(X) ++ ST t1, 13 * SIZE(X) ++ ST t2, 14 * SIZE(X) ++ ST t3, 15 * SIZE(X) ++ addl X, 16 * SIZE, X ++ .align 4 + -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y ++$L15: ++ and N, 15, I + -+ LD a2, 0 * SIZE(X) -+ LD a3, 1 * SIZE(X) -+ LD b2, 0 * SIZE(Y) -+ LD b3, 1 * SIZE(Y) ++#else + -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y ++ sra N, 3, I ++ ble I, $L15 + -+ LD a4, 0 * SIZE(X) -+ LD a5, 1 * SIZE(X) -+ LD b4, 0 * SIZE(Y) -+ LD b5, 1 * SIZE(Y) ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) + -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y ++ LD a4, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ LD a5, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 + -+ LD a6, 0 * SIZE(X) -+ LD b6, 0 * SIZE(Y) ++ LD a6, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ LD a7, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 + -+ subl I, 1, I -+ ble I, $L23 ++ ldi I, -1(I) ++ ble I, $L13 + .align 4 + -+$L22: -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD a7, 1 * SIZE(X) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) -+ -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ fillcs PREFETCHSIZE * SIZE(X) -+ MUL a0, b1, t1 -+ SXADDQ INCX, X, X ++$L12: ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ fillcs PREFETCHSIZE * SIZE(Y) -+ MUL a1, b0, t2 -+ SXADDQ INCY, Y, Y ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a0, 0 * SIZE(X) -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(X) ++ LD a0, 8 * SIZE(X) ++ ldi I, -1(I) ++ LD a1, 9 * SIZE(X) ++ addl X, 8 * SIZE, X + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t0 -+ LD b1, 1 * SIZE(Y) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) + -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ SXADDQ INCX, X, X -+ MUL a2, b3, t1 -+ SXADDQ INCY, Y, Y ++ ST t0, -4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, -3 * SIZE(X) ++ MUL a1, ALPHA, t1 + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ #unop -+ MUL a3, b2, t2 -+ unop ++ ST t2, -2 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, -1 * SIZE(X) ++ MUL a3, ALPHA, t3 + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a2, 0 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, 1 * SIZE(X) ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b2, 0 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 1 * SIZE(Y) ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ fillde PREFETCHSIZE * SIZE(X) ++ bne I, $L12 ++ .align 4 + -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ SXADDQ INCX, X, X -+ MUL a4, b5, t1 -+ SXADDQ INCY, Y, Y ++$L13: ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ unop -+ MUL a5, b4, t2 -+ unop ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a4, 0 * SIZE(X) -+ MUL a5, b5, t3 -+ LD a5, 1 * SIZE(X) ++ ST t0, 4 * SIZE(X) ++ ST t1, 5 * SIZE(X) ++ ST t2, 6 * SIZE(X) ++ ST t3, 7 * SIZE(X) ++ addl X, 8 * SIZE, X ++ .align 4 + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b4, 0 * SIZE(Y) -+ MUL a6, b6, t0 -+ LD b5, 1 * SIZE(Y) ++$L15: ++ and N, 7, I + -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ SXADDQ INCX, X, X -+ MUL a6, b7, t1 -+ SXADDQ INCY, Y, Y ++#endif + -+ ADD s2, t2, s4 -+ fmov s4,s2 + unop -+ MUL a7, b6, t2 + unop ++ ble I, $L999 ++ .align 4 + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a6, 0 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, 1 * SIZE(X) ++$L17: ++ LD a0, 0 * SIZE(X) + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b6, 0 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) ++ MUL a0, ALPHA, t0 + -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y ++ ST t0, 0 * SIZE(X) + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ unop -+ MUL a1, b0, t2 -+ unop ++ addl X, SIZE, X + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a0, 0 * SIZE(X) -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(X) ++ ldi I, -1(I) ++ bne I, $L17 ++ ret ++ .align 4 + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t0 -+ LD b1, 1 * SIZE(Y) ++$L20: ++ sra N, 3, I ++ ble I, $L25 + -+ ADD s1, t1, s4 -+ fmov s4,s1 ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a2, b3, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ unop -+ MUL a3, b2, t2 -+ unop -+ -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a2, 0 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, 1 * SIZE(X) -+ -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b2, 0 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 1 * SIZE(Y) + -+ ADD s1, t1, s4 -+ fmov s4,s1 ++ LD a4, 0 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ldi I, -1(I) + SXADDQ INCX, X, X -+ MUL a4, b5, t1 -+ SXADDQ INCY, Y, Y + -+ ADD s2, t2, s4 -+ fmov s4,s2 ++ LD a5, 0 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, X, X + unop -+ MUL a5, b4, t2 -+ subl I, 1, I -+ -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a4, 0 * SIZE(X) -+ MUL a5, b5, t3 -+ LD a5, 1 * SIZE(X) -+ -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b4, 0 * SIZE(Y) -+ MUL a6, b6, t0 -+ LD b5, 1 * SIZE(Y) + -+ ADD s1, t1, s4 -+ fmov s4,s1 ++ LD a6, 0 * SIZE(X) ++ MUL a2, ALPHA, t2 + SXADDQ INCX, X, X -+ MUL a6, b7, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ LD a6, 0 * SIZE(X) -+ MUL a7, b6, t2 + unop + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD b6, 0 * SIZE(Y) -+ MUL a7, b7, t3 -+ bgt I, $L22 ++ LD a7, 0 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ SXADDQ INCX, X, X ++ ble I, $L23 + .align 4 + -+$L23: -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD a7, 1 * SIZE(X) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) ++$L22: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++ fillde PREFETCHSIZE * SIZE(X) ++ SXADDQ INCX, XX, XX + -+ ADD s1, t1, s4 -+ fmov s4,s1 ++ LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ unop -+ MUL a1, b0, t2 ++ ldi I, -1(I) + unop + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a0, 0 * SIZE(X) -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(X) -+ -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t0 -+ LD b1, 1 * SIZE(Y) ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ unop + -+ ADD s1, t1, s4 -+ fmov s4,s1 ++ LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a2, b3, t1 -+ SXADDQ INCY, Y, Y + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ unop -+ MUL a3, b2, t2 ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX + unop + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a2, 0 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, 1 * SIZE(X) -+ -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b2, 0 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 1 * SIZE(Y) -+ -+ ADD s1, t1, s4 -+ fmov s4,s1 ++ LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a4, b5, t1 -+ SXADDQ INCY, Y, Y + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ unop -+ MUL a5, b4, t2 ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX + unop + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a4, 0 * SIZE(X) -+ MUL a5, b5, t3 -+ LD a5, 1 * SIZE(X) ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b4, 0 * SIZE(Y) -+ MUL a6, b6, t0 -+ LD b5, 1 * SIZE(Y) ++ ST t0, 0 * SIZE(XX) ++ MUL a0, ALPHA, t0 ++ SXADDQ INCX, XX, XX ++ unop + -+ ADD s1, t1, s4 -+ fmov s4,s1 ++ LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a6, b7, t1 -+ SXADDQ INCY, Y, Y + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ unop -+ MUL a7, b6, t2 ++ ST t1, 0 * SIZE(XX) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, XX, XX + unop + -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ LD a6, 0 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, 1 * SIZE(X) ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ LD b6, 0 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) ++ ST t2, 0 * SIZE(XX) ++ MUL a2, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ unop + -+ ADD s1, t1, s4 -+ fmov s4,s1 ++ LD a6, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ MUL a1, b0, t2 -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ MUL a1, b1, t3 ++ ST t3, 0 * SIZE(XX) ++ MUL a3, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ unop + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ MUL a2, b2, t0 -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ MUL a2, b3, t1 ++ LD a7, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ unop ++ bne I, $L22 ++ .align 4 + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ MUL a3, b2, t2 -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ MUL a3, b3, t3 ++$L23: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++ SXADDQ INCX, XX, XX + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ MUL a4, b4, t0 -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ MUL a4, b5, t1 ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ MUL a5, b4, t2 -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ MUL a5, b5, t3 ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX + -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ MUL a6, b6, t0 -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ MUL a6, b7, t1 ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ MUL a7, b6, t2 -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ MUL a7, b7, t3 ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX + .align 4 + +$L25: + and N, 7, I + unop + unop -+ ble I, $L998 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ -+ SXADDQ INCX, X, X -+ subl I, 1, I -+ SXADDQ INCY, Y, Y -+ ble I, $L28 ++ ble I, $L999 + .align 4 + -+$L26: -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ mov X, XX -+ MUL a0, b0, t0 -+ mov Y, YY -+ -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ LD a0, 0 * SIZE(XX) -+ MUL a1, b0, t2 -+ LD b0, 0 * SIZE(YY) -+ -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ subl I, 1, I -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(XX) -+ -+ LD b1, 1 * SIZE(YY) -+ bgt I, $L26 -+ .align 4 ++$L27: ++ LD a0, 0 * SIZE(X) + -+$L28: -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ MUL a0, b0, t0 -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ MUL a0, b1, t1 ++ MUL a0, ALPHA, t0 + -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ MUL a1, b0, t2 -+ ADD s3, t3, s4 -+ fmov s4,s3 -+ MUL a1, b1, t3 -+ .align 4 ++ ST t0, 0 * SIZE(XX) + -+$L998: -+ ADD s0, t0, s4 -+ fmov s4,s0 -+ ADD s1, t1, s4 -+ fmov s4,s1 -+ ADD s2, t2, s4 -+ fmov s4,s2 -+ ADD s3, t3, s4 -+ fmov s4,s3 ++ SXADDQ INCX, X, X ++ SXADDQ INCX, XX, XX + -+#ifndef CONJ -+ SUB s0, s3, s4 -+ fmov s4,s0 -+ ADD s1, s2, s4 -+ fmov s4,s1 -+#else -+ ADD s0, s3, s4 -+ fmov s4,s0 -+ SUB s1, s2, s4 -+ fmov s4,s1 -+#endif ++ ldi I, -1(I) ++ bne I, $L27 + .align 4 + +$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 16($sp) -+ ldi $sp, 24($sp) + ret -+ + EPILOGUE -diff --git a/kernel/sw_64/zdot.S.bak b/kernel/sw_64/zdot.S.bak +diff --git a/kernel/sw_64/snrm2.S b/kernel/sw_64/snrm2.S new file mode 100644 -index 0000000..d10673c +index 000000000..2752e831d --- /dev/null -+++ b/kernel/sw_64/zdot.S.bak -@@ -0,0 +1,500 @@ ++++ b/kernel/sw_64/snrm2.S +@@ -0,0 +1,431 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -67693,473 +11878,455 @@ index 0000000..d10673c +/*********************************************************************/ + +#define ASSEMBLER ++ +#include "common.h" -+#include "version.h" + -+#define PREFETCHSIZE 88 ++ ++#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 -+#define Y $19 -+#define INCY $20 -+#define XX $21 -+#define YY $23 -+ -+#define I $5 -+ -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f2 -+#define s3 $f30 -+ -+#define a0 $f10 -+#define a1 $f11 -+#define a2 $f12 -+#define a3 $f13 -+#define a4 $f14 -+#define a5 $f15 -+#define a6 $f16 -+#define a7 $f17 -+ -+#define b0 $f18 -+#define b1 $f19 -+#define b2 $f20 -+#define b3 $f21 -+#define b4 $f22 -+#define b5 $f23 -+#define b6 $f24 -+#define b7 $f25 -+ -+#define t0 $f26 -+#define t1 $f27 -+#define t2 $f28 -+#define t3 $f29 -+ -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 16, $26, 0 -+ -+ ldi $sp, -16($sp) -+ fclr s0 -+ fstd $f2, 0($sp) -+ fclr s1 -+ -+ fclr s2 -+ addl INCX, INCX, INCX -+ fclr s3 -+ ble N, $L999 -+ -+ addl INCY, INCY, INCY -+ fclr t0 -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ -+ srl N, 3, I -+ ble I, $L25 ++#define XX $19 + -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) ++#define I $0 + -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 + -+ LD a2, 0 * SIZE(X) -+ LD a3, 1 * SIZE(X) -+ LD b2, 0 * SIZE(Y) -+ LD b3, 1 * SIZE(Y) ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 + -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y ++ PROLOGUE + -+ LD a4, 0 * SIZE(X) -+ LD a5, 1 * SIZE(X) -+ LD b4, 0 * SIZE(Y) -+ LD b5, 1 * SIZE(Y) ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 + -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) + -+ LD a6, 0 * SIZE(X) -+ LD b6, 0 * SIZE(Y) ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif + -+ subl I, 1, I -+ ble I, $L23 -+ .align 4 ++ fclr a0 ++ SXADDQ INCX, 0, INCX ++ fclr a1 ++ ble N, $L999 + -+$L22: -+ ADD s0, t0, s0 -+ LD a7, 1 * SIZE(X) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) ++ fclr a2 ++ cmpeq INCX, SIZE, $0 ++ fclr a3 ++ beq $0, $L20 + -+ ADD s1, t1, s1 -+ fillcs PREFETCHSIZE * SIZE(X) -+ MUL a0, b1, t1 -+ SXADDQ INCX, X, X ++ fclr t0 ++ sra N, 4, I ++ fclr t1 ++ ble I, $L15 + -+ ADD s2, t2, s2 -+ fillcs PREFETCHSIZE * SIZE(Y) -+ MUL a1, b0, t2 -+ SXADDQ INCY, Y, Y ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) + -+ ADD s3, t3, s3 -+ LD a0, 0 * SIZE(X) -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(X) ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) + -+ ADD s0, t0, s0 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t0 -+ LD b1, 1 * SIZE(Y) ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a2, b3, t1 -+ SXADDQ INCY, Y, Y ++$L11: ++ faddd a0, t0, a0 ++ s_fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) + -+ ADD s2, t2, s2 -+ unop -+ MUL a3, b2, t2 -+ unop ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) + -+ ADD s3, t3, s3 -+ LD a2, 0 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, 1 * SIZE(X) ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) + -+ ADD s0, t0, s0 -+ LD b2, 0 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 1 * SIZE(Y) ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a4, b5, t1 -+ SXADDQ INCY, Y, Y ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) + -+ ADD s2, t2, s2 ++ faddd a1, t1, a1 + unop -+ MUL a5, b4, t2 ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, a2 + unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) + -+ ADD s3, t3, s3 -+ LD a4, 0 * SIZE(X) -+ MUL a5, b5, t3 -+ LD a5, 1 * SIZE(X) ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) + -+ ADD s0, t0, s0 -+ LD b4, 0 * SIZE(Y) -+ MUL a6, b6, t0 -+ LD b5, 1 * SIZE(Y) ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a6, b7, t1 -+ SXADDQ INCY, Y, Y ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) + -+ ADD s2, t2, s2 -+ unop -+ MUL a7, b6, t2 ++ faddd a2, t2, a2 + unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) + -+ ADD s3, t3, s3 -+ LD a6, 0 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, 1 * SIZE(X) ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) + -+ ADD s0, t0, s0 -+ LD b6, 0 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) + -+ ADD s2, t2, s2 -+ unop -+ MUL a1, b0, t2 ++ faddd a2, t2, a2 + unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) + -+ ADD s3, t3, s3 -+ LD a0, 0 * SIZE(X) -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t0 -+ LD b1, 1 * SIZE(Y) ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a2, b3, t1 -+ SXADDQ INCY, Y, Y ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) + -+ ADD s2, t2, s2 ++ faddd a1, t1, a1 + unop -+ MUL a3, b2, t2 ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 + unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) + -+ ADD s3, t3, s3 -+ LD a2, 0 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, 1 * SIZE(X) ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) + -+ ADD s0, t0, s0 -+ LD b2, 0 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 1 * SIZE(Y) ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a4, b5, t1 -+ SXADDQ INCY, Y, Y ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) + -+ ADD s2, t2, s2 ++ faddd a2, t2, a2 + unop -+ MUL a5, b4, t2 -+ subl I, 1, I ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) + -+ ADD s3, t3, s3 -+ LD a4, 0 * SIZE(X) -+ MUL a5, b5, t3 -+ LD a5, 1 * SIZE(X) ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) + -+ ADD s0, t0, s0 -+ LD b4, 0 * SIZE(Y) -+ MUL a6, b6, t0 -+ LD b5, 1 * SIZE(Y) ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a6, b7, t1 -+ SXADDQ INCY, Y, Y ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 + -+ ADD s2, t2, s2 -+ LD a6, 0 * SIZE(X) -+ MUL a7, b6, t2 -+ unop ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 + -+ ADD s3, t3, s3 -+ LD b6, 0 * SIZE(Y) -+ MUL a7, b7, t3 -+ bgt I, $L22 -+ .align 4 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 + -+$L23: -+ ADD s0, t0, s0 -+ LD a7, 1 * SIZE(X) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) ++ faddd a1, t1, a1 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y ++$L15: ++ and N, 15, I ++ ble I, $L998 ++ .align 4 + -+ ADD s2, t2, s2 -+ unop -+ MUL a1, b0, t2 -+ unop ++$L16: ++ LD x0, 0 * SIZE(X) ++ ldi X, 1 * SIZE(X) + -+ ADD s3, t3, s3 -+ LD a0, 0 * SIZE(X) -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(X) ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 + -+ ADD s0, t0, s0 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t0 -+ LD b1, 1 * SIZE(Y) ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a2, b3, t1 -+ SXADDQ INCY, Y, Y ++$L20: ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L25 + -+ ADD s2, t2, s2 -+ unop -+ MUL a3, b2, t2 -+ unop ++ fclr t2 ++ fclr t3 + -+ ADD s3, t3, s3 -+ LD a2, 0 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, 1 * SIZE(X) ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x3, 0 * SIZE(X) ++ addl X, INCX, X + -+ ADD s0, t0, s0 -+ LD b2, 0 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 1 * SIZE(Y) ++ LD x4, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x5, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x6, 0 * SIZE(X) ++ addl X, INCX, X + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a4, b5, t1 -+ SXADDQ INCY, Y, Y ++ ldi I, -1(I) ++ ble I, $L22 ++ .align 4 + -+ ADD s2, t2, s2 -+ unop -+ MUL a5, b4, t2 -+ unop ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X + -+ ADD s3, t3, s3 -+ LD a4, 0 * SIZE(X) -+ MUL a5, b5, t3 -+ LD a5, 1 * SIZE(X) ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ addl X, INCX, X + -+ ADD s0, t0, s0 -+ LD b4, 0 * SIZE(Y) -+ MUL a6, b6, t0 -+ LD b5, 1 * SIZE(Y) ++ faddd a2, t2, a2 ++ LD x1, 0 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a6, b7, t1 -+ SXADDQ INCY, Y, Y ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ addl X, INCX, X + -+ ADD s2, t2, s2 -+ unop -+ MUL a7, b6, t2 -+ unop ++ faddd a0, t0, a0 ++ LD x3, 0 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X + -+ ADD s3, t3, s3 -+ LD a6, 0 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, 1 * SIZE(X) ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ addl X, INCX, X + -+ ADD s0, t0, s0 -+ LD b6, 0 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) ++ faddd a2, t2, a2 ++ LD x5, 0 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X + -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ addl X, INCX, X + -+ ADD s2, t2, s2 -+ MUL a1, b0, t2 -+ ADD s3, t3, s3 -+ MUL a1, b1, t3 ++ ldi I, -1(I) ++ bgt I, $L21 ++ .align 4 + -+ ADD s0, t0, s0 -+ MUL a2, b2, t0 -+ ADD s1, t1, s1 -+ MUL a2, b3, t1 ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X + -+ ADD s2, t2, s2 -+ MUL a3, b2, t2 -+ ADD s3, t3, s3 -+ MUL a3, b3, t3 ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ unop + -+ ADD s0, t0, s0 -+ MUL a4, b4, t0 -+ ADD s1, t1, s1 -+ MUL a4, b5, t1 ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 + -+ ADD s2, t2, s2 -+ MUL a5, b4, t2 -+ ADD s3, t3, s3 -+ MUL a5, b5, t3 ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 + -+ ADD s0, t0, s0 -+ MUL a6, b6, t0 -+ ADD s1, t1, s1 -+ MUL a6, b7, t1 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 + -+ ADD s2, t2, s2 -+ MUL a7, b6, t2 -+ ADD s3, t3, s3 -+ MUL a7, b7, t3 ++ faddd a1, t1, a1 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 + .align 4 + +$L25: + and N, 7, I -+ unop -+ unop + ble I, $L998 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ -+ SXADDQ INCX, X, X -+ subl I, 1, I -+ SXADDQ INCY, Y, Y -+ ble I, $L28 + .align 4 + +$L26: -+ ADD s0, t0, s0 -+ mov X, XX -+ MUL a0, b0, t0 -+ mov Y, YY -+ -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ LD a0, 0 * SIZE(XX) -+ MUL a1, b0, t2 -+ LD b0, 0 * SIZE(YY) ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X + -+ ADD s3, t3, s3 -+ subl I, 1, I -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(XX) ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 + -+ LD b1, 1 * SIZE(YY) ++ ldi I, -1(I) + bgt I, $L26 + .align 4 + -+$L28: -+ ADD s0, t0, s0 -+ MUL a0, b0, t0 -+ ADD s1, t1, s1 -+ MUL a0, b1, t1 -+ -+ ADD s2, t2, s2 -+ MUL a1, b0, t2 -+ ADD s3, t3, s3 -+ MUL a1, b1, t3 -+ .align 4 + +$L998: -+ ADD s0, t0, s0 -+ ADD s1, t1, s1 -+ ADD s2, t2, s2 -+ ADD s3, t3, s3 ++ faddd a0, t0, a0 + -+#ifndef CONJ -+ SUB s0, s3, s0 -+ ADD s1, s2, s1 ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 +#else -+ ADD s0, s3, s0 -+ SUB s1, s2, s1 ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 +#endif + .align 4 + +$L999: -+ fldd $f2, 0($sp) -+ ldi $sp, 16($sp) ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif + ret -+ + EPILOGUE -diff --git a/kernel/sw_64/zdot_simd.S b/kernel/sw_64/zdot_simd.S +diff --git a/kernel/sw_64/staticbuffer.S b/kernel/sw_64/staticbuffer.S +new file mode 100644 +index 000000000..7bbd23d89 +--- /dev/null ++++ b/kernel/sw_64/staticbuffer.S +@@ -0,0 +1,45 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++#ifdef ALLOC_STATIC ++ .align 8 ++ .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 ++#endif +diff --git a/kernel/sw_64/sum.S b/kernel/sw_64/sum.S new file mode 100644 -index 0000000..ed775e6 +index 000000000..fe51d2493 --- /dev/null -+++ b/kernel/sw_64/zdot_simd.S -@@ -0,0 +1,699 @@ ++++ b/kernel/sw_64/sum.S +@@ -0,0 +1,206 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -68200,671 +12367,178 @@ index 0000000..ed775e6 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#define PREFETCHSIZE 80 ++ ++#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 -+#define Y $19 -+#define INCY $20 -+#define XX $21 -+#define YY $23 -+ -+#define I $5 ++#define I $19 + +#define s0 $f0 +#define s1 $f1 -+#define s2 $f2 -+#define s3 $f30 -+ -+#define a0 $f10 -+#define a1 $f11 -+#define a2 $f12 -+#define a3 $f13 -+#define a4 $f14 -+#define a5 $f15 -+#define a6 $f16 -+#define a7 $f17 ++#define s2 $f10 ++#define s3 $f11 + -+#define b0 $f18 -+#define b1 $f19 -+#define b2 $f20 -+#define b3 $f21 -+#define b4 $f22 -+#define b5 $f23 -+#define b6 $f24 -+#define b7 $f25 ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 + -+#define t0 $f26 -+#define t1 $f27 -+#define t2 $f28 -+#define t3 $f29 ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 + -+#define t4 $f3 -+#define t5 $f4 -+#define t6 $f5 -+#define t7 $f6 -+ + PROLOGUE + PROFCODE -+ .frame $sp, 40, $26, 0 + -+ ldi $sp, -40($sp) + fclr s0 -+ fstd $f2, 0($sp) -+ fclr s1 ++ unop ++ fclr t0 ++ ble N, $L999 + -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ ++ sra N, 3, I ++ fclr s1 + fclr s2 -+ addl INCX, INCX, INCX -+ fclr s3 -+ ble N, $L999 ++ ble I, $L15 + -+ addl INCY, INCY, INCY -+ fclr t0 ++ LD a0, 0 * SIZE(X) + fclr t1 -+ fclr t2 -+ fclr t3 -+ -+ cmpeq INCX, 2, $21 -+ cmpeq INCY, 2, $22 -+ and $21, $22, $22 -+ beq $22, $Sub -+ -+/* -+ test the address of Y & X -+*/ -+ and Y, (VEC_LEN*SIZE-1), $4 -+ and X, (VEC_LEN*SIZE-1), $3 -+ or $3, $4, $4 -+ bne $4, $UnAlign_ACCESS -+ -+/*Align access*/ -+/*UnLoop 8*/ -+ srl N, 3, I -+ ble I, $Remain -+ .align 4 -+ vcpys $f31, $f31, s0 #clear s0 vector -+ vcpys $f31, $f31, s1 #clear s0 vector -+ vcpys $f31, $f31, s2 #clear s0 vector -+ vcpys $f31, $f31, s3 #clear s0 vector -+ -+ vcpys $f31, $f31, t0 -+ vcpys $f31, $f31, t1 -+ vcpys $f31, $f31, t2 -+ vcpys $f31, $f31, t3 -+ -+$MainLoop: -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ -+ VLD b0, 0*VEC_LEN*SIZE(Y) -+ VADD s0, t0, s0 -+ VLD b1, 1*VEC_LEN*SIZE(Y) -+ VADD s1, t1, s1 -+ -+ VLD b2, 2*VEC_LEN*SIZE(Y) -+ VADD s2, t2, s2 -+ VLD b3, 3*VEC_LEN*SIZE(Y) -+ VADD s3, t3, s3 -+ -+/*spilt the X complex vector to real vector(a0, a2) and image vector (a1, a3) -+ Y complex vectory to real vector(b0, b2) and image vector (b1, b3) -+*/ -+ vextf a0, 1, a4 -+ vextf a0, 3, a5 -+ vextf a1, 0, a6 -+ vextf a1, 2, a7 -+ -+ vextf a2, 1, t0 -+ vextf a2, 3, t1 -+ vextf a3, 0, t2 -+ vextf a3, 2, t3 -+ -+ vextf b0, 1, b4 -+ vextf b0, 3, b5 -+ vextf b1, 0, b6 -+ vextf b1, 2, b7 -+ -+ vextf b2, 1, t4 -+ vextf b2, 3, t5 -+ vextf b3, 0, t6 -+ vextf b3, 2, t7 -+ -+ vinsf a4, a1, 0, a1 -+ vinsf a6, a0, 1, a0 -+ vinsf t0, a3, 0, a3 -+ vinsf t2, a2, 1, a2 -+ -+ vinsf b4, b1, 0, b1 -+ addl X, 16 * SIZE, X -+ vinsf b6, b0, 1, b0 -+ addl Y, 16 * SIZE, Y -+ -+ vinsf t4, b3, 0, b3 -+ subl I, 1, I -+ vinsf t6, b2, 1, b2 -+ nop -+ -+ vinsf a5, a1, 2, a1 -+ vinsf a7, a0, 3, a0 -+ vinsf t1, a3, 2, a3 -+ vinsf t3, a2, 3, a2 -+ -+ vinsf b5, b1, 2, b1 -+ vinsf b7, b0, 3, b0 -+ vinsf t5, b3, 2, b3 -+ vinsf t7, b2, 3, b2 -+ -+ /*Computing*/ -+ -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ VMAD a0, b0, s0, s0 -+ fillcs PREFETCHSIZE * SIZE(Y) -+ VMAD a0, b1, s1, s1 -+ -+ VMAD a1, b0, s2, s2 -+ VMAD a1, b1, s3, s3 -+ VMUL a2, b2, t0 /*Just multiply. Add it in next loop.*/ -+ VMUL a2, b3, t1 -+ -+ VMUL a3, b2, t2 -+ VMUL a3, b3, t3 -+ nop -+ bgt I, $MainLoop -+ .align 4 -+$MainLoopEnd: -+ VADD s0, t0, s0 -+ VADD s1, t1, s1 -+ VADD s2, t2, s2 -+ VADD s3, t3, s3 -+ -+#ifndef CONJ -+ VSUB s0, s3, s0 -+ VADD s1, s2, s1 -+#else -+ VADD s0, s3, s0 -+ VSUB s1, s2, s1 -+#endif -+ vcpys $f31, $f31, s2 #clear s0 vector -+ vcpys $f31, $f31, s3 #clear s0 vector -+ -+ vextf s0, 1, t1 -+ vextf s0, 2, t2 -+ vextf s0, 3, t3 -+ vextf s1, 1, t5 -+ -+ vextf s1, 2, t6 -+ vextf s1, 3, t7 -+ ADD s0, t1, s0 -+ ADD t2, t3, t0 -+ -+ ADD s1, t5, s1 -+ ADD t6, t7, t4 -+ ADD s0, t0, s0 -+ ADD s1, t4, s1 -+$Remain: -+ and N, 7, I -+ ble I, $End -+ .align 4 -+$RemainLoop: -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ -+ SXADDQ INCX, X, X -+ subl I, 1, I -+ SXADDQ INCY, Y, Y -+ MAD a0, b0, s0, s0 -+ -+ MAD a0, b1, s1, s1 -+ MAD a1, b0, s2, s2 -+ MAD a1, b1, s3, s3 -+ bgt I, $RemainLoop -+ .align 4 -+ -+#ifndef CONJ -+ SUB s0, s3, s0 -+ ADD s1, s2, s1 -+#else -+ ADD s0, s3, s0 -+ SUB s1, s2, s1 -+#endif -+ -+$End: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ -+ fldd $f6, 32($sp) -+ ldi $sp, 40($sp) -+ ret -+ -+ .align 4 -+ -+$UnAlign_ACCESS: -+$Sub: -+ srl N, 3, I -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y -+ -+ LD a2, 0 * SIZE(X) -+ LD a3, 1 * SIZE(X) -+ LD b2, 0 * SIZE(Y) -+ LD b3, 1 * SIZE(Y) -+ -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y -+ -+ LD a4, 0 * SIZE(X) -+ LD a5, 1 * SIZE(X) -+ LD b4, 0 * SIZE(Y) -+ LD b5, 1 * SIZE(Y) -+ -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y -+ -+ LD a6, 0 * SIZE(X) -+ LD b6, 0 * SIZE(Y) -+ -+ subl I, 1, I -+ ble I, $L23 -+ .align 4 -+ -+$L22: -+ ADD s0, t0, s0 -+ LD a7, 1 * SIZE(X) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ fillcs PREFETCHSIZE * SIZE(X) -+ MUL a0, b1, t1 -+ SXADDQ INCX, X, X -+ -+ ADD s2, t2, s2 -+ fillcs PREFETCHSIZE * SIZE(Y) -+ MUL a1, b0, t2 -+ SXADDQ INCY, Y, Y -+ -+ ADD s3, t3, s3 -+ LD a0, 0 * SIZE(X) -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t0 -+ LD b1, 1 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a2, b3, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a3, b2, t2 -+ unop -+ -+ ADD s3, t3, s3 -+ LD a2, 0 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b2, 0 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 1 * SIZE(Y) -+ -+ ADD s1, t1, s1 + SXADDQ INCX, X, X -+ MUL a4, b5, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a5, b4, t2 -+ unop -+ -+ ADD s3, t3, s3 -+ LD a4, 0 * SIZE(X) -+ MUL a5, b5, t3 -+ LD a5, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b4, 0 * SIZE(Y) -+ MUL a6, b6, t0 -+ LD b5, 1 * SIZE(Y) ++ fclr t2 + -+ ADD s1, t1, s1 ++ LD a1, 0 * SIZE(X) ++ fclr t3 + SXADDQ INCX, X, X -+ MUL a6, b7, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a7, b6, t2 -+ unop -+ -+ ADD s3, t3, s3 -+ LD a6, 0 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b6, 0 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) ++ fclr s3 + -+ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a1, b0, t2 -+ unop -+ -+ ADD s3, t3, s3 -+ LD a0, 0 * SIZE(X) -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t0 -+ LD b1, 1 * SIZE(Y) -+ -+ ADD s1, t1, s1 ++ LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a2, b3, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a3, b2, t2 -+ unop -+ -+ ADD s3, t3, s3 -+ LD a2, 0 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b2, 0 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 1 * SIZE(Y) + -+ ADD s1, t1, s1 ++ LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a4, b5, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a5, b4, t2 -+ subl I, 1, I -+ -+ ADD s3, t3, s3 -+ LD a4, 0 * SIZE(X) -+ MUL a5, b5, t3 -+ LD a5, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b4, 0 * SIZE(Y) -+ MUL a6, b6, t0 -+ LD b5, 1 * SIZE(Y) -+ -+ ADD s1, t1, s1 ++ LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X -+ MUL a6, b7, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ LD a6, 0 * SIZE(X) -+ MUL a7, b6, t2 -+ unop + -+ ADD s3, t3, s3 -+ LD b6, 0 * SIZE(Y) -+ MUL a7, b7, t3 -+ bgt I, $L22 ++ ldi I, -1(I) ++ ble I, $L13 + .align 4 + -+$L23: -+ ADD s0, t0, s0 -+ LD a7, 1 * SIZE(X) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a1, b0, t2 -+ unop -+ -+ ADD s3, t3, s3 -+ LD a0, 0 * SIZE(X) -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b0, 0 * SIZE(Y) -+ MUL a2, b2, t0 -+ LD b1, 1 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a2, b3, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a3, b2, t2 -+ unop -+ -+ ADD s3, t3, s3 -+ LD a2, 0 * SIZE(X) -+ MUL a3, b3, t3 -+ LD a3, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b2, 0 * SIZE(Y) -+ MUL a4, b4, t0 -+ LD b3, 1 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a4, b5, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a5, b4, t2 -+ unop -+ -+ ADD s3, t3, s3 -+ LD a4, 0 * SIZE(X) -+ MUL a5, b5, t3 -+ LD a5, 1 * SIZE(X) -+ -+ ADD s0, t0, s0 -+ LD b4, 0 * SIZE(Y) -+ MUL a6, b6, t0 -+ LD b5, 1 * SIZE(Y) -+ -+ ADD s1, t1, s1 -+ SXADDQ INCX, X, X -+ MUL a6, b7, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ unop -+ MUL a7, b6, t2 -+ unop -+ -+ ADD s3, t3, s3 -+ LD a6, 0 * SIZE(X) -+ MUL a7, b7, t3 -+ LD a7, 1 * SIZE(X) -+ ++$L12: + ADD s0, t0, s0 -+ LD b6, 0 * SIZE(Y) -+ MUL a0, b0, t0 -+ LD b7, 1 * SIZE(Y) ++ s_fillcs PREFETCHSIZE * 2 * SIZE(X) ++ fmov a0, t0 ++ ldi I, -1(I) + + ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fmov a1, t1 + SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y -+ -+ ADD s2, t2, s2 -+ MUL a1, b0, t2 -+ ADD s3, t3, s3 -+ MUL a1, b1, t3 -+ -+ ADD s0, t0, s0 -+ MUL a2, b2, t0 -+ ADD s1, t1, s1 -+ MUL a2, b3, t1 + + ADD s2, t2, s2 -+ MUL a3, b2, t2 -+ ADD s3, t3, s3 -+ MUL a3, b3, t3 -+ -+ ADD s0, t0, s0 -+ MUL a4, b4, t0 -+ ADD s1, t1, s1 -+ MUL a4, b5, t1 ++ LD a7, 0 * SIZE(X) ++ fmov a2, t2 ++ SXADDQ INCX, X, X + -+ ADD s2, t2, s2 -+ MUL a5, b4, t2 + ADD s3, t3, s3 -+ MUL a5, b5, t3 ++ LD a0, 0 * SIZE(X) ++ fmov a3, t3 ++ SXADDQ INCX, X, X + + ADD s0, t0, s0 -+ MUL a6, b6, t0 ++ LD a1, 0 * SIZE(X) ++ fmov a4, t0 ++ SXADDQ INCX, X, X ++ + ADD s1, t1, s1 -+ MUL a6, b7, t1 ++ LD a2, 0 * SIZE(X) ++ fmov a5, t1 ++ SXADDQ INCX, X, X + + ADD s2, t2, s2 -+ MUL a7, b6, t2 ++ LD a3, 0 * SIZE(X) ++ fmov a6, t2 ++ SXADDQ INCX, X, X ++ + ADD s3, t3, s3 -+ MUL a7, b7, t3 -+ .align 4 ++ LD a4, 0 * SIZE(X) ++ fmov a7, t3 ++ SXADDQ INCX, X, X + -+$L25: -+ and N, 7, I -+ unop ++ LD a5, 0 * SIZE(X) + unop -+ ble I, $L998 -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ LD b0, 0 * SIZE(Y) -+ LD b1, 1 * SIZE(Y) -+ + SXADDQ INCX, X, X -+ subl I, 1, I -+ SXADDQ INCY, Y, Y -+ ble I, $L28 ++ bne I, $L12 + .align 4 + -+$L26: ++$L13: + ADD s0, t0, s0 -+ mov X, XX -+ MUL a0, b0, t0 -+ mov Y, YY ++ LD a6, 0 * SIZE(X) ++ fmov a0, t0 ++ SXADDQ INCX, X, X + + ADD s1, t1, s1 ++ LD a7, 0 * SIZE(X) ++ fmov a1, t1 + SXADDQ INCX, X, X -+ MUL a0, b1, t1 -+ SXADDQ INCY, Y, Y + + ADD s2, t2, s2 -+ LD a0, 0 * SIZE(XX) -+ MUL a1, b0, t2 -+ LD b0, 0 * SIZE(YY) -+ ++ fmov a2, t2 + ADD s3, t3, s3 -+ subl I, 1, I -+ MUL a1, b1, t3 -+ LD a1, 1 * SIZE(XX) -+ -+ LD b1, 1 * SIZE(YY) -+ bgt I, $L26 -+ .align 4 ++ fmov a3, t3 + -+$L28: + ADD s0, t0, s0 -+ MUL a0, b0, t0 ++ fmov a4, t0 + ADD s1, t1, s1 -+ MUL a0, b1, t1 -+ ++ fmov a5, t1 + ADD s2, t2, s2 -+ MUL a1, b0, t2 ++ fmov a6, t2 + ADD s3, t3, s3 -+ MUL a1, b1, t3 -+ .align 4 ++ fmov a7, t3 + -+$L998: -+ ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + -+#ifndef CONJ -+ SUB s0, s3, s0 -+ ADD s1, s2, s1 -+#else -+ ADD s0, s3, s0 -+ SUB s1, s2, s1 -+#endif ++ ADD s0, s1, s0 ++ ADD s2, s3, s2 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ADD s0, s2, s0 ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ fmov a0, t0 ++ ++ ldi I, -1(I) ++ bne I, $L17 + .align 4 + +$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ -+ fldd $f6, 32($sp) -+ ldi $sp, 40($sp) ++ ADD s0, t0, s0 + ret -+ + EPILOGUE -diff --git a/kernel/sw_64/zgemm_beta.S b/kernel/sw_64/zgemm_beta.S +diff --git a/kernel/sw_64/swap.S b/kernel/sw_64/swap.S new file mode 100644 -index 0000000..18f845c +index 000000000..431d526c9 --- /dev/null -+++ b/kernel/sw_64/zgemm_beta.S -@@ -0,0 +1,192 @@ ++++ b/kernel/sw_64/swap.S +@@ -0,0 +1,252 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -68905,164 +12579,224 @@ index 0000000..18f845c + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+ .set noat -+ .set noreorder -+.text -+ .align 5 -+ .globl CNAME -+ .ent CNAME -+CNAME: ++ ++ PROLOGUE ++ PROFCODE + .frame $sp, 0, $26, 0 + -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ ldi $28, _mcount -+ jsr $28, ($28), _mcount -+ .prologue 1 -+#else ++ mov $20, $17 ++ mov $21, $18 ++ ldl $19, 0($sp) ++ ldl $20, 8($sp) ++#ifndef PROFILE + .prologue 0 ++#else ++ .prologue 1 +#endif + -+ ldl $18, 24($sp) -+ ble $16, $End -+ ldl $19, 32($sp) -+ ble $17, $End ++ beq $18, $SubEnd ++ beq $20, $SubEnd + -+ addl $19, $19, $19 -+ fbne $f19,$Main -+ fbne $f20,$Main ++ subl $18, 1, $1 ++ subl $20, 1, $2 ++ ble $16, $SubEnd # if n <= 0 goto $End ++ or $1, $2, $1 ++ ++ sra $16, 3, $21 ++ ++ and $16, 7, $22 ++ bne $1, $Sub ++ ble $21, $MainRemain + .align 4 + -+$L13: -+ mov $18, $1 -+ ldi $17, -1($17) -+ SXADDQ $19, $18, $18 -+ mov $16, $2 ++$MainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f12, 2*SIZE($19) ++ LD $f13, 3*SIZE($19) ++ LD $f14, 4*SIZE($19) ++ LD $f15, 5*SIZE($19) ++ LD $f16, 6*SIZE($19) ++ LD $f17, 7*SIZE($19) ++ ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ LD $f22, 2*SIZE($17) ++ LD $f23, 3*SIZE($17) ++ LD $f24, 4*SIZE($17) ++ LD $f25, 5*SIZE($17) ++ LD $f26, 6*SIZE($17) ++ LD $f27, 7*SIZE($17) ++ ++ fillde 32*SIZE($17) ++ unop ++ fillde 32*SIZE($19) ++ subl $21, 1, $21 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f12, 2*SIZE($17) ++ ST $f13, 3*SIZE($17) ++ ST $f14, 4*SIZE($17) ++ ST $f15, 5*SIZE($17) ++ ST $f16, 6*SIZE($17) ++ ST $f17, 7*SIZE($17) ++ ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) ++ ST $f22, 2*SIZE($19) ++ ST $f23, 3*SIZE($19) ++ ST $f24, 4*SIZE($19) ++ ST $f25, 5*SIZE($19) ++ ST $f26, 6*SIZE($19) ++ ST $f27, 7*SIZE($19) ++ ++ ldi $17, 8*SIZE($17) ++ ldi $19, 8*SIZE($19) ++ bgt $21, $MainLoop + .align 4 + -+$L12: -+ ST $f31, 0*SIZE($1) -+ ST $f31, 1*SIZE($1) -+ ldi $2, -1($2) -+ ldi $1, 2*SIZE($1) -+ bgt $2, $L12 -+ bgt $17,$L13 ++$MainRemain: ++ ble $22, $MainEnd ++ .align 4 ++ ++$MainRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ ldi $17, 1*SIZE($17) ++ ldi $19, 1*SIZE($19) ++ subl $22, 1, $22 ++ ST $f10, -1*SIZE($17) ++ ST $f20, -1*SIZE($19) ++ bgt $22, $MainRemainLoop ++ .align 4 ++ ++$MainEnd: + clr $0 + ret + .align 4 + -+/* Main Routine */ -+$Main: -+ sra $16, 1, $2 # $2 = (m >> 1) -+ mov $18, $1 # c_offset = c -+ ldi $17, -1($17) # n -- -+ SXADDQ $19, $18, $18 # c += ldc -+ beq $2, $L18 ++$Sub: ++ mov $17, $23 ++ mov $19, $24 + -+ LD $f14, 0*SIZE($1) -+ LD $f15, 1*SIZE($1) -+ LD $f24, 2*SIZE($1) -+ LD $f25, 3*SIZE($1) -+ ldi $2, -1($2) # $2 -- -+ ble $2, $L19 ++ ble $21, $SubRemain + .align 4 + ++$SubLoop: ++ LD $f10, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f11, 0*SIZE($19) ++ SXADDQ $20, $19, $19 + -+$L23: -+ MUL $f19, $f14, $f10 -+ fillcs 9*SIZE($1) -+ MUL $f20, $f15, $f11 -+ ldi $2, -1($2) ++ LD $f12, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f13, 0*SIZE($19) ++ SXADDQ $20, $19, $19 + -+ MUL $f19, $f15, $f12 -+ LD $f15, 5*SIZE($1) -+ MUL $f20, $f14, $f13 -+ LD $f14, 4*SIZE($1) ++ LD $f14, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f15, 0*SIZE($19) ++ SXADDQ $20, $19, $19 + -+ MUL $f19, $f24, $f16 -+ unop -+ MUL $f20, $f25, $f17 -+ unop ++ LD $f16, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f17, 0*SIZE($19) ++ SXADDQ $20, $19, $19 + -+ MUL $f19, $f25, $f18 -+ LD $f25, 7*SIZE($1) -+ SUB $f10, $f11, $f22 -+ unop ++ LD $f20, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f21, 0*SIZE($17) ++ SXADDQ $18, $17, $17 + -+ MUL $f20, $f24, $f21 -+ LD $f24, 6*SIZE($1) -+ ADD $f12, $f13, $f23 -+ ldi $1, 4*SIZE($1) ++ LD $f22, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f23, 0*SIZE($17) ++ SXADDQ $18, $17, $17 + -+ SUB $f16, $f17, $f26 -+ ADD $f18, $f21, $f27 -+ ST $f22,-4*SIZE($1) -+ ST $f23,-3*SIZE($1) ++ LD $f24, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f25, 0*SIZE($17) ++ SXADDQ $18, $17, $17 + -+ ST $f26,-2*SIZE($1) -+ ST $f27,-1*SIZE($1) -+ unop -+ bgt $2,$L23 -+ .align 4 ++ LD $f26, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f27, 0*SIZE($17) ++ SXADDQ $18, $17, $17 + -+$L19: -+ MUL $f19, $f14, $f10 -+ MUL $f20, $f15, $f11 -+ MUL $f19, $f15, $f12 -+ MUL $f20, $f14, $f13 ++ ST $f10, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f11, 0*SIZE($23) ++ SXADDQ $18, $23, $23 + -+ MUL $f19, $f24, $f16 -+ MUL $f20, $f25, $f17 -+ MUL $f19, $f25, $f18 -+ MUL $f20, $f24, $f21 ++ ST $f12, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f13, 0*SIZE($23) ++ SXADDQ $18, $23, $23 + -+ SUB $f10, $f11, $f22 -+ ADD $f12, $f13, $f23 -+ SUB $f16, $f17, $f26 -+ ADD $f18, $f21, $f27 -+ ldi $1, 4*SIZE($1) ++ ST $f14, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f15, 0*SIZE($23) ++ SXADDQ $18, $23, $23 + -+ ST $f22, -4*SIZE($1) -+ ST $f23, -3*SIZE($1) -+ ST $f26, -2*SIZE($1) -+ ST $f27, -1*SIZE($1) ++ ST $f16, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f17, 0*SIZE($23) ++ SXADDQ $18, $23, $23 + -+ blbs $16, $L18 -+ bgt $17, $Main -+ clr $0 -+ ret ++ ST $f20, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f21, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f22, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f23, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f24, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f25, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f26, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f27, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ subl $21, 1, $21 ++ bgt $21, $SubLoop + .align 4 + -+$L18: -+ LD $f14, 0*SIZE($1) -+ LD $f15, 1*SIZE($1) -+ MUL $f19, $f15, $f13 -+ MUL $f20, $f14, $f10 ++$SubRemain: ++ ble $22, $SubEnd ++ .align 4 + -+ MUL $f19, $f14, $f12 -+ MUL $f20, $f15, $f11 -+ ADD $f13, $f10, $f26 -+ SUB $f12, $f11, $f27 ++$SubRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f20, 0*SIZE($17) + -+ ST $f26, 1*SIZE($1) -+ ST $f27, 0*SIZE($1) -+ ldi $1, 2*SIZE($1) -+ bgt $17, $Main ++ subl $22, 1, $22 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f20, 0*SIZE($19) ++ ++ SXADDQ $18, $17, $17 ++ SXADDQ $20, $19, $19 ++ bgt $22, $SubRemainLoop + .align 4 + -+$End: ++$SubEnd: + clr $0 + ret -+ .ident VERSION -+ .end CNAME -diff --git a/kernel/sw_64/zgemm_kernel_2x2.S b/kernel/sw_64/zgemm_kernel_2x2.S ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S b/kernel/sw_64/trsm_kernel_4x4_LN.S new file mode 100644 -index 0000000..6cf954b +index 000000000..e9d9093f3 --- /dev/null -+++ b/kernel/sw_64/zgemm_kernel_2x2.S -@@ -0,0 +1,1949 @@ ++++ b/kernel/sw_64/trsm_kernel_4x4_LN.S +@@ -0,0 +1,4061 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -69103,47 +12837,33 @@ index 0000000..6cf954b + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++ ++#if !defined(SW8A) +#error "Architecture is not specified." +#endif + -+#ifdef SW6 ++#ifdef SW8A +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + -+#ifdef EV5 -+#define PREFETCHSIZE 48 -+#define UNOP -+#endif -+ -+#ifdef EV4 -+#define UNOP -+#endif -+ -+ .set noat -+ .set noreorder -+ .arch sw6a + -+.text -+ .align 5 -+ .globl CNAME -+ .ent CNAME + -+#define STACKSIZE 88 ++#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 -+#define A $21 -+#define B $22 -+#define C $20 ++#define A $20 ++#define B $21 ++#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 ++#define C3 $25 ++#define C4 $27 + +#define AO $at +#define BO $5 @@ -69170,8 +12890,7 @@ index 0000000..6cf954b +#define a6 $f30 +#define b5 $f29 + -+#define alpha_i $f29 -+#define alpha_r $f30 ++#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 @@ -69196,61 +12915,20 @@ index 0000000..6cf954b +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 -+#define BB $3 ++#define AORIG $3 +#define OFFSET $4 + -+#define tmp $9 -+ -+#define ALPHA_R 64($sp) -+#define ALPHA_I 72($sp) -+ -+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 SUB -+#define ADD4 ADD -+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 ADD -+#define ADD4 SUB -+#else -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 SUB -+#define ADD4 SUB -+#endif -+ -+CNAME: ++ PROLOGUE ++ PROFCODE + .frame $sp, STACKSIZE, $26, 0 + -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ ldi $at, _mcount -+ jsr $at, ($at), _mcount -+#endif -+ -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif -+ + ldi $sp, -STACKSIZE($sp) + -+ ldl B, 0 + STACKSIZE($sp) -+ ldl C, 8 + STACKSIZE($sp) -+ ldl LDC, 16 + STACKSIZE($sp) -+#ifdef TRMMKERNEL -+ ldl OFFSET, 24 + STACKSIZE($sp) -+#endif ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) + -+ sll LDC, ZBASE_SHIFT, LDC ++ SXADDQ LDC, 0, LDC + + fstd $f2, 0($sp) + fstd $f3, 8($sp) @@ -69260,9 +12938,7 @@ index 0000000..6cf954b + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) -+ fstd $f19, ALPHA_R -+ fstd $f20, ALPHA_I -+ stl tmp, 80($sp) ++ + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 @@ -69271,739 +12947,378 @@ index 0000000..6cf954b + or $0, $2, $0 + bne $0, $L999 + -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ subl $31, OFFSET, KK ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C +#endif + -+ sra N, 1, J -+ ble J, $L30 ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 + .align 4 + -+$L01: ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ + mov C, C1 + addl C, LDC, C2 -+ mov A, AO -+ s4addl K, 0, BB ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif + ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 + -+#if defined(TRMMKERNEL) && defined(LEFT) ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT + mov OFFSET, KK +#endif + -+ SXADDQ BB, B, BB -+ addl C2, LDC, C -+ unop ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif + -+ sra M, 1, I -+ fclr t1 -+ fclr t2 + fclr t3 + fclr t4 + -+ fclr c01 -+ fclr c05 -+ ++ and M, 1, I + ble I, $L20 -+ .align 4 -+ -+$L11: -+#ifndef EV4 -+ fillcs 0 * SIZE(BB) -+ fillcs 8 * SIZE(BB) -+ unop -+ ldi BB, 16 * SIZE(BB) -+#endif -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 2, TMP1 -+#endif -+#endif ++#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr c01 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr c05 + + LD b1, 0 * SIZE(B) -+ fclr c10 ++ ldi L, -2(KK) + LD b2, 1 * SIZE(B) -+ fclr c14 ++ ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) -+ fclr c03 ++ fclr c09 + LD b4, 3 * SIZE(B) -+ fclr c07 ++ fclr c13 + + ldi BO, 4 * SIZE(B) -+ fclr c11 -+ ldi AO, 4 * SIZE(AO) -+ fclr c15 ++ ble KK, $L38 + -+ fillcs 4 * SIZE(C1) -+ fclr c04 -+#ifndef TRMMKERNEL -+ ldi L, -2(K) ++ ble L, $L35 +#else -+ ldi L, -2(TMP1) ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG +#endif -+ fclr c08 + -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble L, $L15 -+#else -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ addl B, TMP1, BO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr c01 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr c05 + + LD b1, 0 * SIZE(BO) -+ fclr c10 ++ ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) -+ fclr c14 ++ ldi AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) -+ fclr c03 ++ fclr c09 + LD b4, 3 * SIZE(BO) -+ fclr c07 ++ fclr c13 + + ldi BO, 4 * SIZE(BO) -+ fclr c11 -+ ldi AO, 4 * SIZE(AO) -+ fclr c15 -+ -+ fillcs 4 * SIZE(C1) -+ fclr c04 -+ ldi L, -2(TMP1) -+ fclr c08 -+ -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble L, $L15 -+#endif -+ .align 5 ++ ble TMP1, $L38 + -+$L12: -+/* 1 */ -+ ADD1 c11, t1, a6 -+ fmov a6, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop ++ ble L, $L35 +#endif ++ .align 4 + -+ ADD3 c12, t2, a6 -+ fmov a6, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD2 c16, t3, a6 -+ fmov a6, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD4 c15, t4, a6 -+ fmov a6, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ -+/* 2 */ -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP -+ -+ ADD2 c06, t3, a6 -+ fmov a6, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD4 c05, t4, a6 -+ fmov a6, c05 -+ unop -+ MUL b4, a1, t4 -+ unop -+ -+/* 3 */ -+ ADD1 c03, t1, a6 -+ fmov a6, c03 -+ unop -+ MUL b3, a1, t1 -+ unop -+ -+ ADD3 c04, t2, a6 -+ fmov a6, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD2 c08, t3, a6 -+ fmov a6, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) + -+ ADD4 c13, t4, a6 -+ fmov a6, c13 -+ unop -+ MUL b2, a3, t4 ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + -+/* 4 */ -+ ADD1 c09, t1, a6 -+ fmov a6, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) -+ FIMOVD a6, tmp -+ -+ ADD3 c10, t2, a6 -+ fmov a6, c10 -+ unop -+ MUL b3, a4, t2 ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + -+ ADD2 c14, t3, a6 -+ fmov a6, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) -+ -+ ADD4 c07, t4, a6 -+ fmov a6, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) -+ -+/* 5 */ -+ ADD1 c11, t1, a6 -+ fmov a6, c11 -+ unop -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) + -+ ADD3 c12, t2, a6 -+ fmov a6, c12 -+ ldi L, -2(L) -+ MUL b5, a2, t2 ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) -+ -+ ADD2 c16, t3, a6 -+ fmov a6, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD4 c15, t4, a6 -+ fmov a6, c15 -+ unop -+ MUL b2, a5, t4 -+ unop -+ -+/* 6 */ -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ unop -+ IFMOVD tmp, a6 -+ MUL b5, a6, t1 -+ unop -+ -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ unop -+ MUL b5, a4, t2 -+ unop -+ -+ ADD2 c06, t3, a6 -+ fmov a6, c06 -+ unop -+ MUL b2, a4, t3 -+ unop -+ -+ ADD4 c05, t4, a6 -+ fmov a6, c05 -+ unop -+ MUL b4, a5, t4 -+ unop -+ -+/* 7 */ -+ ADD1 c03, t1, a6 -+ fmov a6, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop -+ -+ ADD3 c04, t2, a6 -+ fmov a6, c04 + ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop -+ -+ ADD2 c08, t3, a6 -+ fmov a6, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) + -+ ADD4 c13, t4, a6 -+ fmov a6, c13 -+ unop -+ IFMOVD tmp, a6 -+ MUL b2, a6, t4 ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + -+/* 8 */ -+ ADD1 c09, t1, a6 -+ fmov a6, c09 -+ unop -+ IFMOVD tmp, a6 -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD3 c10, t2, a6 -+ fmov a6, c10 -+ unop -+ MUL b3, a4, t2 ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + -+ ADD2 c14, t3, a6 -+ fmov a6, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD4 c07, t4, a6 -+ fmov a6, c07 -+ IFMOVD tmp, a6 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 + .align 4 + -+$L15: -+ ADD1 c11, t1, a6 -+ fmov a6, c11 -+ fldd alpha_r, ALPHA_R -+ FIMOVD alpha_r, tmp -+ MUL b1, a1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L18 ++$L35: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 +#else -+ blbs TMP1, $L18 ++ blbs TMP1, $L37 +#endif + .align 4 + -+ ADD3 c12, t2, a6 -+ fmov a6, c12 -+ MUL b1, a2, t2 -+ ADD2 c16, t3, a6 -+ fmov a6, c16 -+ MUL b2, a2, t3 -+ -+ ADD4 c15, t4, a6 -+ fmov a6, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ MUL b1, a3, t1 -+ -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ unop -+ MUL b1, a4, t2 ++ ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) -+ -+ ADD2 c06, t3, a6 -+ fmov a6, c06 -+ MUL b2, a4, t3 -+ ADD4 c05, t4, a6 -+ fmov a6, c05 -+ MUL b4, a1, t4 -+ -+ ADD1 c03, t1, a6 -+ fmov a6, c03 -+ unop -+ MUL b3, a1, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD3 c04, t2, a6 -+ fmov a6, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD2 c08, t3, a6 -+ fmov a6, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD4 c13, t4, a6 -+ fmov a6, c13 -+ unop -+ MUL b2, a3, t4 ++ MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + -+ ADD1 c09, t1, a6 -+ fmov a6, c09 -+ unop -+ MUL b3, a3, t1 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD3 c10, t2, a6 -+ fmov a6, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD2 c14, t3, a6 -+ fmov a6, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) + -+ ADD4 c07, t4, a6 -+ fmov a6, c07 -+ unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) + -+ ADD1 c11, t1, a6 -+ fmov a6, c11 ++ ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 ++ MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + -+$L18: -+ ADD3 c12, t2, a6 -+ fmov a6, c12 -+ unop -+ MUL b1, a2, t2 -+ fldd alpha_i, ALPHA_I ++$L37: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 + -+ ADD2 c16, t3, a6 -+ fmov a6, c16 -+ unop -+ MUL b2, a2, t3 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C1) -+#else -+ unop -+#endif ++ ADD c13, t4, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, t4 ++ ldi BO, 4 * SIZE(BO) + -+ ADD4 c15, t4, a6 -+ fmov a6, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ MUL b1, a3, t1 ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ADD c09, t3, c09 ++ ADD c13, t4, c13 + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ unop -+ MUL b1, a4, t2 -+#ifndef TRMMKERNEL -+ LD b1, 1 * SIZE(C1) ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 +#else -+ unop ++ subl KK, 4, TMP1 +#endif -+ -+ ADD2 c06, t3, a6 -+ fmov a6, c06 -+ MUL b2, a4, t3 -+ ADD4 c05, t4, a6 -+ fmov a6, c05 -+ MUL b4, a1, t4 -+ -+ ADD1 c03, t1, a6 -+ fmov a6, c03 -+ unop -+ MUL b3, a1, t1 -+#ifndef TRMMKERNEL -+ LD a1, 2 * SIZE(C1) ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO +#else -+ unop ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) +#endif + -+ ADD3 c04, t2, a6 -+ fmov a6, c04 -+ unop -+ MUL b3, a2, t2 -+ unop ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ ADD2 c08, t3, a6 -+ fmov a6, c08 -+ unop -+ MUL b4, a2, t3 -+#ifndef TRMMKERNEL -+ LD a2, 3 * SIZE(C1) ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 +#else -+ unop -+#endif ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ ADD4 c13, t4, a6 -+ fmov a6, c13 -+ unop -+ MUL b2, a3, t4 -+#ifndef TRMMKERNEL -+ LD b2, 0 * SIZE(C2) -+#else -+ unop ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 +#endif + -+ ADD1 c09, t1, a6 -+ fmov a6, c09 -+ ldi I, -1(I) -+ MUL b3, a3, t1 -+ unop -+ -+ ADD3 c10, t2, a6 -+ fmov a6, c10 -+ unop -+ MUL b3, a4, t2 -+#ifndef TRMMKERNEL -+ LD b3, 1 * SIZE(C2) -+#else -+ unop -+#endif ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) + -+ ADD2 c14, t3, a6 -+ fmov a6, c14 -+ unop -+ MUL b4, a4, t3 -+#ifndef TRMMKERNEL -+ LD a4, 2 * SIZE(C2) -+#else -+ unop ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 +#endif + -+ ADD4 c07, t4, a6 -+ fmov a6, c07 -+ unop -+ MUL b4, a3, t4 -+#ifndef TRMMKERNEL -+ LD a3, 3 * SIZE(C2) -+#else -+ unop -+#endif ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ ADD1 c11, t1, a6 -+ fmov a6, c11 -+ ADD3 c12, t2, a6 -+ fmov a6, c12 -+ ADD2 c16, t3, a6 -+ fmov a6, c16 -+ ADD4 c15, t4, a6 -+ fmov a6, c15 -+ -+ ADD c01, c06, a6 -+ fmov a6, c01 -+ ADD c02, c05, a6 -+ fmov a6, c02 -+ ADD c03, c08, a6 -+ fmov a6, c03 -+ ADD c04, c07, a6 -+ fmov a6, c04 -+ -+ ADD c09, c14, a6 -+ fmov a6, c09 -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c01, t1 -+ ADD c10, c13, a6 -+ fmov a6, c10 -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c02, t2 ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c01, t1 ++ SUB c09, t1, c09 ++ MUL a4, c01, t1 ++ SUB c13, t1, c13 + -+ ADD c11, c16, a6 -+ fmov a6, c11 -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c03, t3 -+ ADD c12, c15, a6 -+ fmov a6, c12 -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c04, t4 ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) + -+#ifndef TRMMKERNEL -+ ADD a5, t1, a6 -+ fmov a6, a5 -+ MUL alpha_i, c02, t1 -+ ADD b1, t2, a6 -+ fmov a6, b1 -+ MUL alpha_i, c01, t2 ++ MUL b1, c05, c05 ++ MUL b2, c05, t1 ++ SUB c09, t1, c09 ++ MUL b3, c05, t1 ++ SUB c13, t1, c13 + -+ ADD a1, t3, a6 -+ fmov a6, a1 -+ MUL alpha_i, c04, t3 -+ ADD a2, t4, a6 -+ fmov a6, a2 -+ MUL alpha_i, c03, t4 -+#else -+ ADD $f31, t1, a5 -+ MUL alpha_i, c02, t1 -+ ADD $f31, t2, b1 -+ MUL alpha_i, c01, t2 ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) + -+ ADD $f31, t3, a1 -+ MUL alpha_i, c04, t3 -+ ADD $f31, t4, a2 -+ MUL alpha_i, c03, t4 ++ MUL a1, c09, c09 ++ MUL a2, c09, t1 ++ SUB c13, t1, c13 ++ MUL a3, c13, c13 +#endif + -+ SUB a5, t1, a6 -+ fmov a6, a5 -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c09, t1 -+ ADD b1, t2, a6 -+ fmov a6, b1 -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c10, t2 -+ -+ SUB a1, t3, a6 -+ fmov a6, a1 -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c11, t3 -+ ADD a2, t4, a6 -+ fmov a6, a2 -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c12, t4 ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) + -+#ifndef TRMMKERNEL -+ ADD b2, t1, a6 -+ fmov a6, b2 -+ MUL alpha_i, c10, t1 -+ ADD b3, t2, a6 -+ fmov a6, b3 -+ MUL alpha_i, c09, t2 ++ MUL a1, c13, c13 ++ MUL a2, c13, t1 ++ SUB c09, t1, c09 ++ MUL a3, c13, t1 ++ SUB c05, t1, c05 ++ MUL a4, c13, t1 ++ SUB c01, t1, c01 + -+ ADD a4, t3, a6 -+ fmov a6, a4 -+ MUL alpha_i, c12, t3 -+ ADD a3, t4, a6 -+ fmov a6, a3 -+ MUL alpha_i, c11, t4 -+#else -+ ADD $f31, t1, b2 -+ MUL alpha_i, c10, t1 -+ ADD $f31, t2, b3 -+ MUL alpha_i, c09, t2 ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) + -+ ADD $f31, t3, a4 -+ MUL alpha_i, c12, t3 -+ ADD $f31, t4, a3 -+ MUL alpha_i, c11, t4 -+#endif ++ MUL b1, c09, c09 ++ MUL b2, c09, t1 ++ SUB c05, t1, c05 ++ MUL b3, c09, t1 ++ SUB c01, t1, c01 + -+ SUB b2, t1, a6 -+ fmov a6, b2 -+ ST a5, 0 * SIZE(C1) -+ fclr t1 -+ unop ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ ADD b3, t2, a6 -+ fmov a6, b3 -+ ST b1, 1 * SIZE(C1) -+ fclr t2 -+ unop ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+ SUB a4, t3, a6 -+ fmov a6, a4 -+ ST a1, 2 * SIZE(C1) -+ fclr t3 -+ unop ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif + -+ ADD a3, t4, a6 -+ fmov a6, a3 -+ ST a2, 3 * SIZE(C1) -+ fclr t4 -+ unop ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif + -+ ST b2, 0 * SIZE(C2) -+ fclr c01 -+ ST b3, 1 * SIZE(C2) -+ fclr c05 ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) + -+ ST a4, 2 * SIZE(C2) -+ ldi C1, 4 * SIZE(C1) -+ ST a3, 3 * SIZE(C2) -+ ldi C2, 4 * SIZE(C2) ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++#if defined(LT) || defined(RN) + subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 2, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO +#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO ++ ++#ifdef LT ++ addl KK, 1, KK +#endif + -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 2, KK ++#ifdef LN ++ subl KK, 1, KK +#endif -+ bgt I, $L11 + .align 4 + +$L20: -+ and M, 1, I -+ ble I, $L29 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ and M, 2, I ++ ble I, $L30 + -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 -+#else -+ addl KK, 2, TMP1 -+#endif -+#endif ++#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 @@ -70011,31 +13326,38 @@ index 0000000..6cf954b + fclr c13 + + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr c10 + LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr c14 + + LD b1, 0 * SIZE(B) -+ fclr c10 ++ ldi L, -2(KK) + LD b2, 1 * SIZE(B) -+ fclr c14 ++ ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) ++ fclr c01 + LD b4, 3 * SIZE(B) ++ fclr c05 ++ + ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 + -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif + ble L, $L25 ++ +#else -+ sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) @@ -70044,2156 +13366,2111 @@ index 0000000..6cf954b + fclr c13 + + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr c10 + LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr c14 + + LD b1, 0 * SIZE(BO) -+ fclr c10 ++ ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) -+ fclr c14 ++ ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) ++ fclr c01 + LD b4, 3 * SIZE(BO) ++ fclr c05 ++ + ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 + -+ ldi L, -2(TMP1) + ble L, $L25 +#endif -+ .align 5 ++ .align 4 + +$L22: -+ ADD1 c09, t1, a6 -+ fmov a6, c09 ++ ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + -+ ADD3 c10, t2, a6 -+ fmov a6, c10 ++ ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + -+ ADD4 c13, t3, a6 -+ fmov a6, c13 ++ ADD c13, t3, c13 + unop + MUL a1, b2, t3 + ldi BO, 8 * SIZE(BO) + -+ ADD2 c14, t4, a6 -+ fmov a6, c14 ++ ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 ++ ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 ++ ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + -+ ADD4 c05, t3, a6 -+ fmov a6, c05 ++ ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + -+ ADD2 c06, t4, a6 -+ fmov a6, c06 ++ ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + -+ ADD1 c09, t1, a6 -+ fmov a6, c09 ++ ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + -+ ADD3 c10, t2, a6 -+ fmov a6, c10 ++ ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + -+ ADD4 c13, t3, a6 -+ fmov a6, c13 ++ ADD c13, t3, c13 + unop + MUL a3, b2, t3 + ldi AO, 4 * SIZE(AO) + -+ ADD2 c14, t4, a6 -+ fmov a6, c14 ++ ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 ++ ADD c01, t1, c01 + ldi L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 ++ ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + -+ ADD4 c05, t3, a6 -+ fmov a6, c05 ++ ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + -+ ADD2 c06, t4, a6 -+ fmov a6, c06 ++ ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: -+ ADD1 c09, t1, a6 -+ fmov a6, c09 -+ fldd alpha_r, ALPHA_R -+ FIMOVD alpha_r, tmp ++ ADD c09, t1, c09 + MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L28 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 +#else -+ blbs TMP1, $L28 ++ blbs TMP1, $L27 +#endif -+ .align 4 + -+ ADD3 c10, t2, a6 -+ fmov a6, c10 ++ ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + -+ ADD4 c13, t3, a6 -+ fmov a6, c13 ++ ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + -+ ADD2 c14, t4, a6 -+ fmov a6, c14 ++ ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 ++ ADD c01, t1, c01 + unop + MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 ++ ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + -+ ADD4 c05, t3, a6 -+ fmov a6, c05 ++ ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + -+ ADD2 c06, t4, a6 -+ fmov a6, c06 ++ ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + -+ ADD1 c09, t1, a6 -+ fmov a6, c09 ++ ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + -+$L28: -+ ADD3 c10, t2, a6 -+ fmov a6, c10 -+ unop ++$L27: ++ ADD c10, t2, c10 + MUL a2, b1, t2 -+ fldd alpha_i, ALPHA_I -+ -+ ADD4 c13, t3, a6 -+ fmov a6, c13 -+ unop ++ ADD c13, t3, c13 + MUL a1, b2, t3 -+#ifndef TRMMKERNEL -+ LD c03, 0 * SIZE(C1) -+#else -+ unop -+#endif + -+ ADD2 c14, t4, a6 -+ fmov a6, c14 -+ unop ++ ADD c14, t4, c14 + MUL a2, b2, t4 -+#ifndef TRMMKERNEL -+ LD c04, 1 * SIZE(C1) ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ ADD c10, t2, c10 ++ ADD c13, t3, c13 ++ ADD c14, t4, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 +#else -+ unop ++ subl KK, 4, TMP1 +#endif -+ -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ unop -+ MUL a1, b3, t1 -+#ifndef TRMMKERNEL -+ LD c11, 0 * SIZE(C2) ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO +#else -+ unop ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) +#endif + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ unop -+ MUL a2, b3, t2 -+#ifndef TRMMKERNEL -+ LD c12, 1 * SIZE(C2) ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ +#else -+ unop ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c13, c13 ++ SUB b4, c14, c14 +#endif + -+ ADD4 c05, t3, a6 -+ fmov a6, c05 -+ MUL a1, b4, t3 -+ ADD2 c06, t4, a6 -+ fmov a6, c06 -+ MUL a2, b4, t4 ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ ADD1 c09, t1, a6 -+ fmov a6, c09 -+ ADD3 c10, t2, a6 -+ fmov a6, c10 -+ ADD4 c13, t3, a6 -+ fmov a6, c13 -+ ADD2 c14, t4, a6 -+ fmov a6, c14 -+ -+ ADD c01, c06, a6 -+ fmov a6, c01 -+ ADD c02, c05, a6 -+ fmov a6, c02 -+ ADD c09, c14, a6 -+ fmov a6, c09 -+ ADD c10, c13, a6 -+ fmov a6, c10 -+ -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c01, t1 -+ MUL alpha_r, c02, t2 -+ MUL alpha_r, c09, t3 -+ MUL alpha_r, c10, t4 ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++ MUL a3, c10, c10 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif + -+#ifndef TRMMKERNEL -+ ADD c03, t1, a6 -+ fmov a6, c03 -+ MUL alpha_i, c02, t1 -+ ADD c04, t2, a6 -+ fmov a6, c04 -+ MUL alpha_i, c01, t2 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) + -+ ADD c11, t3, a6 -+ fmov a6, c11 -+ MUL alpha_i, c10, t3 -+ ADD c12, t4, a6 -+ fmov a6, c12 -+ MUL alpha_i, c09, t4 -+#else -+ ADD $f31, t1, c03 -+ MUL alpha_i, c02, t1 -+ ADD $f31, t2, c04 -+ MUL alpha_i, c01, t2 ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) + -+ ADD $f31, t3, c11 -+ MUL alpha_i, c10, t3 -+ ADD $f31, t4, c12 -+ MUL alpha_i, c09, t4 ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) +#endif + -+ SUB c03, t1, a6 -+ fmov a6, c03 -+ ADD c04, t2, a6 -+ fmov a6, c04 -+ SUB c11, t3, a6 -+ fmov a6, c11 -+ ADD c12, t4, a6 -+ fmov a6, c12 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ST c03, 0 * SIZE(C1) -+ ST c04, 1 * SIZE(C1) -+ ST c11, 0 * SIZE(C2) -+ ST c12, 1 * SIZE(C2) ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++#if defined(LT) || defined(RN) + subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 1, TMP1 -+#else -+ subl TMP1, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ sll TMP1, BASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ sll TMP1, BASE_SHIFT + 2, TMP2 + addl BO, TMP2, BO +#endif + -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 1, KK ++#ifdef LT ++ addl KK, 2, KK +#endif -+ .align 4 + -+$L29: -+ mov BO, B -+ ldi J, -1(J) -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addl KK, 2, KK -+#else -+ unop ++#ifdef LN ++ subl KK, 2, KK +#endif -+ bgt J, $L01 + .align 4 + +$L30: -+ and N, 1, J -+ ble J, $L999 -+ -+ mov C, C1 -+ mov A, AO -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK -+#endif -+ -+ sra M, 1, I -+ ble I, $L50 ++ sra M, 2, I ++ ble I, $L39 + .align 4 + -+$L41: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++$L11: ++#if defined(LT) || defined(RN) + -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 1, TMP1 -+#endif -+#endif ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 + -+ LD b1, 0 * SIZE(B) ++ LD b1, 0 * SIZE(B) + fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) + fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 + -+ ldi BO, 2 * SIZE(B) ++ fillde 4 * SIZE(C1) + fclr c03 -+ ldi AO, 4 * SIZE(AO) -+ fclr c07 -+ -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif ++ ldi L, -2(KK) + fclr c04 ++ ++ fillde 7 * SIZE(C2) + fclr c08 -+ ble L, $L45 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 +#else -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO ++ ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ + subl K, KK, TMP1 + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 + -+ LD b1, 0 * SIZE(BO) ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) + fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) + fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 + -+ ldi BO, 2 * SIZE(BO) ++ fillde 4 * SIZE(C1) + fclr c03 -+ ldi AO, 4 * SIZE(AO) -+ fclr c07 -+ + ldi L, -2(TMP1) + fclr c04 ++ ++ fillde 7 * SIZE(C2) + fclr c08 -+ ble L, $L45 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 +#endif ++ ++ ble L, $L15 + .align 5 + -+$L42: -+ ADD4 c05, t1, a6 -+ fmov a6, c05 ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else + unop -+ MUL a1, b1, t1 ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else + unop ++#endif + -+ ADD2 c06, t2, a6 -+ fmov a6, c06 -+ ldi L, -2(L) -+ MUL a2, b1, t2 ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 + unop + -+ ADD4 c07, t3, a6 -+ fmov a6, c07 ++ ADD c16, t3, c16 + unop -+ MUL a3, b1, t3 ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, c15 + unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP + -+ ADD2 c08, t4, a6 -+ fmov a6, c08 ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 + unop -+ MUL a4, b1, t4 -+ LD b1, 2 * SIZE(BO) + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 + unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, t2 ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + -+ ADD1 c03, t3, a6 -+ fmov a6, c03 ++ ADD c13, t4, c13 + unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ ADD3 c04, t4, a6 -+ fmov a6, c04 ++/* 4 */ ++ ADD c09, t1, c09 + unop -+ MUL a4, b2, t4 -+ LD a5, 3 * SIZE(AO) ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) + -+ ADD4 c05, t1, a6 -+ fmov a6, c05 ++ ADD c10, t2, c10 + unop -+ MUL a1, b3, t1 -+ LD b2, -1 * SIZE(BO) ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ ADD2 c06, t2, a6 -+ fmov a6, c06 ++ ADD c14, t3, c14 + unop -+ MUL a2, b3, t2 ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, c07 + unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) + -+ ADD4 c07, t3, a6 -+ fmov a6, c07 ++/* 5 */ ++ ADD c11, t1, c11 + unop -+ MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) + -+ ADD2 c08, t4, a6 -+ fmov a6, c08 ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 + unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 + unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 ++ ADD c02, t2, c02 + unop -+ MUL a2, b4, t2 ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + -+ ADD1 c03, t3, a6 -+ fmov a6, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + -+ ADD3 c04, t4, a6 -+ fmov a6, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L42 ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 + .align 4 + -+$L45: -+ ADD4 c05, t1, a6 -+ fmov a6, c05 -+ fldd alpha_r, ALPHA_R -+ FIMOVD alpha_r, tmp ++$L15: ++ ADD c11, t1, c11 + MUL b1, a1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L48 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 +#else -+ blbs TMP1, $L48 ++ blbs TMP1, $L17 +#endif + .align 4 + -+ ADD2 c06, t2, a6 -+ fmov a6, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, a6 -+ fmov a6, c07 -+ MUL a3, b1, t3 ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 + -+ ADD2 c08, t4, a6 -+ fmov a6, c08 ++ ADD c02, t2, c02 + unop -+ MUL a4, b1, t4 ++ MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, c03 + unop -+ MUL a1, b2, t1 ++ MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 ++ ADD c04, t2, c04 + unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD1 c03, t3, a6 -+ fmov a6, c03 ++ MUL b3, a2, t2 + unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD3 c04, t4, a6 -+ fmov a6, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) + -+ ADD4 c05, t1, a6 -+ fmov a6, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L48: -+ ADD2 c06, t2, a6 -+ fmov a6, c06 ++ ADD c08, t3, c08 + unop -+ MUL a2, b1, t2 -+ fldd alpha_i, ALPHA_I ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) + -+ ADD4 c07, t3, a6 -+ fmov a6, c07 -+ ldi I, -1(I) -+ MUL a3, b1, t3 -+#ifndef TRMMKERNEL -+ LD c09, 0 * SIZE(C1) -+#else ++ ADD c13, t4, c13 + unop -+#endif ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ ADD2 c08, t4, a6 -+ fmov a6, c08 -+ unop -+ MUL a4, b1, t4 -+#ifndef TRMMKERNEL -+ LD c10, 1 * SIZE(C1) -+#else ++ ADD c09, t1, c09 + unop -+#endif ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ unop -+ MUL a1, b2, t1 -+#ifndef TRMMKERNEL -+ LD c11, 2 * SIZE(C1) -+#else ++ ADD c10, t2, c10 + unop -+#endif ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ unop -+ MUL a2, b2, t2 -+#ifndef TRMMKERNEL -+ LD c12, 3 * SIZE(C1) -+#else ++ ADD c14, t3, c14 + unop -+#endif ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) + -+ ADD1 c03, t3, a6 -+ fmov a6, c03 -+ MUL a3, b2, t3 -+ ADD3 c04, t4, a6 -+ fmov a6, c04 -+ MUL a4, b2, t4 ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) + -+ ADD4 c05, t1, a6 -+ fmov a6, c05 -+ ADD2 c06, t2, a6 -+ fmov a6, c06 -+ ADD4 c07, t3, a6 -+ fmov a6, c07 -+ ADD2 c08, t4, a6 -+ fmov a6, c08 -+ -+ ADD c01, c06, a6 -+ fmov a6, c01 -+ ADD c02, c05, a6 -+ fmov a6, c02 -+ ADD c03, c08, a6 -+ fmov a6, c03 -+ ADD c04, c07, a6 -+ fmov a6, c04 -+ -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c01, t1 -+ MUL alpha_r, c02, t2 -+ MUL alpha_r, c03, t3 -+ MUL alpha_r, c04, t4 ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 + -+#ifndef TRMMKERNEL -+ ADD c09, t1, a6 -+ fmov a6, c09 -+ MUL alpha_i, c02, t1 -+ ADD c10, t2, a6 -+ fmov a6, c10 -+ MUL alpha_i, c01, t2 ++$L17: ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 + -+ ADD c11, t3, a6 -+ fmov a6, c11 -+ MUL alpha_i, c04, t3 -+ ADD c12, t4, a6 -+ fmov a6, c12 -+ MUL alpha_i, c03, t4 -+#else -+ ADD $f31, t1, c09 -+ MUL alpha_i, c02, t1 -+ ADD $f31, t2, c10 -+ MUL alpha_i, c01, t2 ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 + -+ ADD $f31, t3, c11 -+ MUL alpha_i, c04, t3 -+ ADD $f31, t4, c12 -+ MUL alpha_i, c03, t4 -+#endif ++ ADD c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 + -+ SUB c09, t1, a6 -+ fmov a6, c09 -+ ADD c10, t2, a6 -+ fmov a6, c10 -+ SUB c11, t3, a6 -+ fmov a6, c11 -+ ADD c12, t4, a6 -+ fmov a6, c12 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD c03, t1, c03 ++ MUL b3, a1, t1 + -+ ST c09, 0 * SIZE(C1) -+ ST c10, 1 * SIZE(C1) -+ ST c11, 2 * SIZE(C1) -+ ST c12, 3 * SIZE(C1) ++ ADD c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD c08, t3, c08 ++ MUL b4, a2, t3 + -+ ldi C1, 4 * SIZE(C1) ++ ADD c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD c09, t1, c09 ++ MUL b3, a3, t1 + -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 1, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl BO, TMP2, BO -+#endif ++ ADD c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD c14, t3, c14 ++ MUL b4, a4, t3 + -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 2, KK -+#endif ++ ADD c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) + -+ bgt I, $L41 ++ ADD c11, t1, c11 ++ ADD c12, t2, c12 ++ ADD c16, t3, c16 ++ ADD c15, t4, c15 + .align 4 + -+$L50: -+ and M, 1, I -+ ble I, $L999 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 +#else -+ addl KK, 1, TMP1 ++ subl KK, 4, TMP1 +#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) +#endif + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) + -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 + -+ ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(B) ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 + -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ ble L, $L55 -+#else -+ sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ addl B, TMP1, BO -+ subl K, KK, TMP1 ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, c03 ++ SUB a2, c07, c07 ++ SUB a3, c11, c11 ++ SUB a4, c15, c15 + ++ SUB b1, c04, c04 ++ SUB b2, c08, c08 ++ SUB b3, c12, c12 ++ SUB b4, c16, c16 ++#else + LD a1, 0 * SIZE(AO) -+ fclr t1 + LD a2, 1 * SIZE(AO) -+ fclr t2 + LD a3, 2 * SIZE(AO) -+ fclr t3 + LD a4, 3 * SIZE(AO) -+ fclr t4 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ -+ ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(BO) -+ -+ ldi L, -2(TMP1) -+ ble L, $L55 -+#endif -+ .align 5 -+ -+$L52: -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ unop -+ MUL a1, b1, t1 -+ unop + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) + -+ ADD4 c05, t3, a6 -+ fmov a6, c05 -+ ldi L, -2(L) -+ MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 + -+ ADD2 c06, t4, a6 -+ fmov a6, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) + -+ ADD4 c05, t3, a6 -+ fmov a6, c05 -+ unop -+ MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) ++ SUB a1, c09, c09 ++ SUB a2, c10, c10 ++ SUB a3, c11, c11 ++ SUB a4, c12, c12 + -+ ADD2 c06, t4, a6 -+ fmov a6, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ unop ++ SUB b1, c13, c13 ++ SUB b2, c14, c14 ++ SUB b3, c15, c15 ++ SUB b4, c16, c16 ++#endif + -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L52 -+ .align 4 ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) + -+$L55: -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ fldd alpha_r, ALPHA_R -+ FIMOVD alpha_r, tmp -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L58 -+#else -+ blbs TMP1, $L58 -+#endif -+ .align 4 ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ MUL a1, c12, c12 ++ MUL a1, c16, c16 + -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ MUL a2, c12, t3 ++ MUL a2, c16, t4 + -+ ADD4 c05, t3, a6 -+ fmov a6, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 + -+ ADD2 c06, t4, a6 -+ fmov a6, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ MUL a3, c12, t3 ++ MUL a3, c16, t4 + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi AO, 2 * SIZE(AO) -+ .align 4 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+$L58: -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ unop -+ MUL a2, b1, t2 -+ fldd alpha_i, ALPHA_I ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ MUL a4, c12, t3 ++ MUL a4, c16, t4 + -+ ADD4 c05, t3, a6 -+ fmov a6, c05 -+ unop -+ MUL a1, b2, t3 -+#ifndef TRMMKERNEL -+ LD c03, 0 * SIZE(C1) -+#else -+ unop -+#endif ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+ ADD2 c06, t4, a6 -+ fmov a6, c06 -+ unop -+ MUL a2, b2, t4 -+#ifndef TRMMKERNEL -+ LD c04, 1 * SIZE(C1) -+#else -+ unop -+#endif ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) + -+ ADD1 c01, t1, a6 -+ fmov a6, c01 -+ ADD3 c02, t2, a6 -+ fmov a6, c02 -+ ADD4 c05, t3, a6 -+ fmov a6, c05 -+ ADD2 c06, t4, a6 -+ fmov a6, c06 ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ MUL b1, c11, c11 ++ MUL b1, c15, c15 + -+ ADD c01, c06, a6 -+ fmov a6, c01 -+ ADD c02, c05, a6 -+ fmov a6, c02 ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ MUL b2, c11, t3 ++ MUL b2, c15, t4 + -+ IFMOVD tmp, alpha_r -+ MUL alpha_r, c01, t1 -+ MUL alpha_r, c02, t2 -+ MUL alpha_i, c02, t3 -+ MUL alpha_i, c01, t4 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+#ifndef TRMMKERNEL -+ ADD c03, t1, a6 -+ fmov a6, c03 -+ ADD c04, t2, a6 -+ fmov a6, c04 -+#else -+ ADD $f31, t1, c03 -+ ADD $f31, t2, c04 -+#endif ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ MUL b3, c11, t3 ++ MUL b3, c15, t4 + -+ SUB c03, t3, a6 -+ fmov a6, c03 -+ ADD c04, t4, a6 -+ fmov a6, c04 ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+ ST c03, 0 * SIZE(C1) -+ ST c04, 1 * SIZE(C1) -+ .align 4 ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldl $9, 80($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ .ident VERSION -+ .end CNAME -diff --git a/kernel/sw_64/zgemm_kernel_2x2.S.bak b/kernel/sw_64/zgemm_kernel_2x2.S.bak -new file mode 100644 -index 0000000..2133673 ---- /dev/null -+++ b/kernel/sw_64/zgemm_kernel_2x2.S.bak -@@ -0,0 +1,1704 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 + -+#if !defined(SW2B) -+#error "Architecture is not specified." -+#endif ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+#ifdef SW2B -+#define PREFETCHSIZE 56 -+#define UNOP unop ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 +#endif + ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ .set noat -+ .set noreorder -+ .arch ev6 -+ -+.text -+ .align 5 -+ .globl CNAME -+ .ent CNAME ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 + -+#define STACKSIZE 80 ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 + -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $21 -+#define B $22 -+#define C $20 -+#define LDC $23 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+#define C1 $19 -+#define C2 $24 ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ MUL a3, c09, t3 ++ MUL a3, c13, t4 + -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 + -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ MUL a4, c09, t3 ++ MUL a4, c13, t4 + -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 + -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) + -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ MUL b1, c10, c10 ++ MUL b1, c14, c14 + -+#define alpha_i $f29 -+#define alpha_r $f30 ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ MUL b2, c10, t3 ++ MUL b2, c14, t4 + -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 + -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ MUL b3, c10, t3 ++ MUL b3, c14, t4 + -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 + -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) + -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define BB $3 -+#define OFFSET $4 ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ MUL a1, c11, c11 ++ MUL a1, c15, c15 + -+#define ALPHA_R 64($sp) -+#define ALPHA_I 72($sp) ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ MUL a2, c11, t3 ++ MUL a2, c15, t4 + -+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 SUB -+#define ADD4 ADD -+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 ADD -+#define ADD4 SUB -+#else -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 SUB -+#define ADD4 SUB ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++ MUL a3, c12, c12 ++ MUL a3, c16, c16 +#endif + -+CNAME: -+ .frame $sp, STACKSIZE, $26, 0 ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ ldi $at, _mcount -+ jsr $at, ($at), _mcount -+#endif ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 + -+ ldi $sp, -STACKSIZE($sp) ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ ldl B, 0 + STACKSIZE($sp) -+ ldl C, 8 + STACKSIZE($sp) -+ ldl LDC, 16 + STACKSIZE($sp) -+#ifdef TRMMKERNEL -+ ldl OFFSET, 24 + STACKSIZE($sp) -+#endif ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 + -+ sll LDC, ZBASE_SHIFT, LDC ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ fstd $f19, ALPHA_R -+ fstd $f20, ALPHA_I ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ MUL a4, c03, t3 ++ MUL a4, c04, t4 + -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 + -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) + -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ subl $31, OFFSET, KK -+#endif ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ MUL b1, c07, c07 ++ MUL b1, c08, c08 + -+ sra N, 1, J -+ ble J, $L30 -+ .align 4 -+ -+$L01: -+ mov C, C1 -+ addl C, LDC, C2 -+ mov A, AO -+ s4addl K, 0, BB ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ MUL b2, c07, t3 ++ MUL b2, c08, t4 + ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK -+#endif ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ MUL b3, c07, t3 ++ MUL b3, c08, t4 + -+ SXADDQ BB, B, BB -+ addl C2, LDC, C -+ unop ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 + -+ sra M, 1, I -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) + -+ fclr c01 -+ fclr c05 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 + -+ ble I, $L20 -+ .align 4 ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ MUL a2, c11, t3 ++ MUL a2, c12, t4 + -+$L11: -+#ifndef EV4 -+ fillcs 0 * SIZE(BB) -+ fillcs 8 * SIZE(BB) -+ unop -+ ldi BB, 16 * SIZE(BB) ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++ MUL a3, c15, c15 ++ MUL a3, c16, c16 +#endif + -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) + -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 2, TMP1 -+#endif -+#endif ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ MUL a1, c15, c15 ++ MUL a1, c16, c16 + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ MUL a2, c15, t3 ++ MUL a2, c16, t4 + -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + -+ LD b1, 0 * SIZE(B) -+ fclr c10 -+ LD b2, 1 * SIZE(B) -+ fclr c14 ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ MUL a3, c15, t3 ++ MUL a3, c16, t4 + -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c07 ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ ldi BO, 4 * SIZE(B) -+ fclr c11 -+ ldi AO, 4 * SIZE(AO) -+ fclr c15 ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ MUL a4, c15, t3 ++ MUL a4, c16, t4 + -+ fillcs 4 * SIZE(C1) -+ fclr c04 -+#ifndef TRMMKERNEL -+ ldi L, -2(K) -+#else -+ ldi L, -2(TMP1) -+#endif -+ fclr c08 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble L, $L15 -+#else -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ addl B, TMP1, BO -+ subl K, KK, TMP1 ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ MUL b1, c11, c11 ++ MUL b1, c12, c12 + -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ MUL b2, c11, t3 ++ MUL b2, c12, t4 + -+ LD b1, 0 * SIZE(BO) -+ fclr c10 -+ LD b2, 1 * SIZE(BO) -+ fclr c14 ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c07 ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ MUL b3, c11, t3 ++ MUL b3, c12, t4 + -+ ldi BO, 4 * SIZE(BO) -+ fclr c11 -+ ldi AO, 4 * SIZE(AO) -+ fclr c15 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+ fillcs 4 * SIZE(C1) -+ fclr c04 -+ ldi L, -2(TMP1) -+ fclr c08 ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble L, $L15 -+#endif -+ .align 5 ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 + -+$L12: -+/* 1 */ -+ ADD1 c11, t1, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 + -+ ADD3 c12, t2, c12 -+ unop -+ MUL b1, a2, t2 -+ unop ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+ ADD2 c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif + -+ ADD4 c15, t4, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) + -+/* 2 */ -+ ADD1 c01, t1, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) + -+ ADD3 c02, t2, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) + -+ ADD2 c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) + -+ ADD4 c05, t4, c05 -+ unop -+ MUL b4, a1, t4 -+ unop ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) + -+/* 3 */ -+ ADD1 c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+ unop ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) + -+ ADD3 c04, t2, c04 -+ unop -+ MUL b3, a2, t2 -+ unop ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif + -+ ADD2 c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif + -+ ADD4 c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + -+/* 4 */ -+ ADD1 c09, t1, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) + -+ ADD3 c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) + -+ ADD2 c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) + -+ ADD4 c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif + -+/* 5 */ -+ ADD1 c11, t1, c11 -+ unop -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ADD3 c12, t2, c12 -+ ldi L, -2(L) -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ ADD2 c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ unop ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif + -+ ADD4 c15, t4, c15 -+ unop -+ MUL b2, a5, t4 -+ unop ++#ifdef LT ++ addl KK, 4, KK ++#endif + -+/* 6 */ -+ ADD1 c01, t1, c01 -+ unop -+ MUL b5, a6, t1 -+ unop ++#ifdef LN ++ subl KK, 4, KK ++#endif + -+ ADD3 c02, t2, c02 -+ unop -+ MUL b5, a4, t2 -+ unop ++ ldi I, -1(I) + -+ ADD2 c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop ++ bgt I, $L11 ++ .align 4 + -+ ADD4 c05, t4, c05 -+ unop -+ MUL b4, a5, t4 -+ unop ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif + -+/* 7 */ -+ ADD1 c03, t1, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 -+ unop ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif + -+ ADD3 c04, t2, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 -+ unop ++#ifdef RN ++ addl KK, 4, KK ++#endif + -+ ADD2 c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 + -+ ADD4 c13, t4, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) ++$L40: ++ and N, 2, J ++ ble J, $L80 + -+/* 8 */ -+ ADD1 c09, t1, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B + -+ ADD3 c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif + -+ ADD2 c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 + -+ ADD4 c07, t4, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 ++#ifdef LN ++ addl M, OFFSET, KK ++#endif + -+$L15: -+ ADD1 c11, t1, c11 -+ fldd alpha_r, ALPHA_R -+ MUL b1, a1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L18 ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG +#else -+ blbs TMP1, $L18 ++ mov A, AO +#endif -+ .align 4 + -+ ADD3 c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD2 c16, t3, c16 -+ MUL b2, a2, t3 ++ fclr t3 ++ fclr t4 + -+ ADD4 c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, c01 -+ MUL b1, a3, t1 ++ and M, 1, I ++ ble I, $L60 + -+ ADD3 c02, t2, c02 -+ unop -+ MUL b1, a4, t2 -+ LD b1, 0 * SIZE(BO) ++#if defined(LT) || defined(RN) + -+ ADD2 c06, t3, c06 -+ MUL b2, a4, t3 -+ ADD4 c05, t4, c05 -+ MUL b4, a1, t4 + -+ ADD1 c03, t1, c03 -+ unop -+ MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ ADD3 c04, t2, c04 -+ unop -+ MUL b3, a2, t2 -+ unop ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 + -+ ADD2 c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) ++ ldi L, -2(KK) + -+ ADD4 c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) + -+ ADD1 c09, t1, c09 -+ unop -+ MUL b3, a3, t1 -+ ldi AO, 4 * SIZE(AO) ++ ble KK, $L78 + -+ ADD3 c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ADD2 c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO + -+ ADD4 c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) ++ subl K, KK, TMP1 + -+ ADD1 c11, t1, c11 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 ++ ldi BO, 2 * SIZE(BO) + -+$L18: -+ ADD3 c12, t2, c12 -+ unop -+ MUL b1, a2, t2 -+ fldd alpha_i, ALPHA_I ++ ble TMP1, $L78 + -+ ADD2 c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+#ifndef TRMMKERNEL -+ LD a5, 0 * SIZE(C1) -+#else -+ unop ++ ble L, $L75 +#endif ++ .align 4 + -+ ADD4 c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, c01 -+ MUL b1, a3, t1 ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) + -+ ADD3 c02, t2, c02 -+ unop -+ MUL b1, a4, t2 -+#ifndef TRMMKERNEL -+ LD b1, 1 * SIZE(C1) -+#else -+ unop -+#endif ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) + -+ ADD2 c06, t3, c06 -+ MUL b2, a4, t3 -+ ADD4 c05, t4, c05 -+ MUL b4, a1, t4 ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) + -+ ADD1 c03, t1, c03 -+ unop -+ MUL b3, a1, t1 -+#ifndef TRMMKERNEL -+ LD a1, 2 * SIZE(C1) -+#else -+ unop -+#endif ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) + -+ ADD3 c04, t2, c04 ++ ldi BO, 4 * SIZE(BO) + unop -+ MUL b3, a2, t2 + unop ++ bgt L, $L72 ++ .align 4 + -+ ADD2 c08, t3, c08 -+ unop -+ MUL b4, a2, t3 -+#ifndef TRMMKERNEL -+ LD a2, 3 * SIZE(C1) ++$L75: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 +#else -+ unop ++ blbs TMP1, $L77 +#endif ++ .align 4 + -+ ADD4 c13, t4, c13 -+ unop -+ MUL b2, a3, t4 -+#ifndef TRMMKERNEL -+ LD b2, 0 * SIZE(C2) -+#else -+ unop -+#endif ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) + -+ ADD1 c09, t1, c09 -+ ldi I, -1(I) -+ MUL b3, a3, t1 -+ unop ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 + -+ ADD3 c10, t2, c10 -+ unop -+ MUL b3, a4, t2 -+#ifndef TRMMKERNEL -+ LD b3, 1 * SIZE(C2) -+#else -+ unop -+#endif ++$L77: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 + -+ ADD2 c14, t3, c14 -+ unop -+ MUL b4, a4, t3 -+#ifndef TRMMKERNEL -+ LD a4, 2 * SIZE(C2) ++ ADD c01, c02, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ++ .align 4 ++ ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 +#else -+ unop ++ subl KK, 2, TMP1 +#endif -+ -+ ADD4 c07, t4, c07 -+ unop -+ MUL b4, a3, t4 -+#ifndef TRMMKERNEL -+ LD a3, 3 * SIZE(C2) ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO +#else -+ unop ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) +#endif + -+ ADD1 c11, t1, c11 -+ ADD3 c12, t2, c12 -+ ADD2 c16, t3, c16 -+ ADD4 c15, t4, c15 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) + -+ ADD c01, c06, c01 -+ ADD c02, c05, c02 -+ ADD c03, c08, c03 -+ ADD c04, c07, c04 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) + -+ ADD c09, c14, c09 -+ MUL alpha_r, c01, t1 -+ ADD c10, c13, c10 -+ MUL alpha_r, c02, t2 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#endif + -+ ADD c11, c16, c11 -+ MUL alpha_r, c03, t3 -+ ADD c12, c15, c12 -+ MUL alpha_r, c04, t4 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) + -+#ifndef TRMMKERNEL -+ ADD a5, t1, a5 -+ MUL alpha_i, c02, t1 -+ ADD b1, t2, b1 -+ MUL alpha_i, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++#endif + -+ ADD a1, t3, a1 -+ MUL alpha_i, c04, t3 -+ ADD a2, t4, a2 -+ MUL alpha_i, c03, t4 -+#else -+ ADD $f31, t1, a5 -+ MUL alpha_i, c02, t1 -+ ADD $f31, t2, b1 -+ MUL alpha_i, c01, t2 ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) + -+ ADD $f31, t3, a1 -+ MUL alpha_i, c04, t3 -+ ADD $f31, t4, a2 -+ MUL alpha_i, c03, t4 ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c05, c05 +#endif + -+ SUB a5, t1, a5 -+ MUL alpha_r, c09, t1 -+ ADD b1, t2, b1 -+ MUL alpha_r, c10, t2 -+ -+ SUB a1, t3, a1 -+ MUL alpha_r, c11, t3 -+ ADD a2, t4, a2 -+ MUL alpha_r, c12, t4 ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+#ifndef TRMMKERNEL -+ ADD b2, t1, b2 -+ MUL alpha_i, c10, t1 -+ ADD b3, t2, b3 -+ MUL alpha_i, c09, t2 ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+ ADD a4, t3, a4 -+ MUL alpha_i, c12, t3 -+ ADD a3, t4, a3 -+ MUL alpha_i, c11, t4 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) +#else -+ ADD $f31, t1, b2 -+ MUL alpha_i, c10, t1 -+ ADD $f31, t2, b3 -+ MUL alpha_i, c09, t2 ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif + -+ ADD $f31, t3, a4 -+ MUL alpha_i, c12, t3 -+ ADD $f31, t4, a3 -+ MUL alpha_i, c11, t4 ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) +#endif + -+ SUB b2, t1, b2 -+ ST a5, 0 * SIZE(C1) -+ fclr t1 -+ unop ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) + -+ ADD b3, t2, b3 -+ ST b1, 1 * SIZE(C1) ++ fclr t1 + fclr t2 -+ unop -+ -+ SUB a4, t3, a4 -+ ST a1, 2 * SIZE(C1) + fclr t3 -+ unop -+ -+ ADD a3, t4, a3 -+ ST a2, 3 * SIZE(C1) + fclr t4 -+ unop -+ -+ ST b2, 0 * SIZE(C2) -+ fclr c01 -+ ST b3, 1 * SIZE(C2) -+ fclr c05 -+ -+ ST a4, 2 * SIZE(C2) -+ ldi C1, 4 * SIZE(C1) -+ ST a3, 3 * SIZE(C2) -+ ldi C2, 4 * SIZE(C2) -+ -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif + -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 2, KK ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG +#endif -+ bgt I, $L11 -+ .align 4 -+ -+$L20: -+ and M, 1, I -+ ble I, $L29 + -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif + -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 -+#else -+ addl KK, 2, TMP1 ++#ifdef LT ++ addl KK, 1, KK +#endif ++ ++#ifdef LN ++ subl KK, 1, KK +#endif ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr c01 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ ++ fclr c05 + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) -+ fclr c10 ++ ldi L, -2(KK) + LD b2, 1 * SIZE(B) -+ fclr c14 ++ ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) -+ ldi BO, 4 * SIZE(B) ++ ldi BO, 2 * SIZE(B) + -+#ifndef TRMMKERNEL -+ ldi L, -2(K) ++ ble KK, $L68 ++ ++ ble L, $L65 +#else -+ ldi L, -2(TMP1) ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG +#endif -+ ble L, $L25 -+#else -+ sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr c01 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ ++ fclr c05 + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) -+ fclr c10 ++ ldi L, -2(TMP1) + LD b2, 1 * SIZE(BO) -+ fclr c14 ++ ldi AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) -+ ldi BO, 4 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) + -+ ldi L, -2(TMP1) -+ ble L, $L25 ++ ble TMP1, $L68 ++ ++ ble L, $L65 +#endif -+ .align 5 ++ .align 4 + -+$L22: -+ ADD1 c09, t1, c09 ++$L62: ++ ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + -+ ADD3 c10, t2, c10 -+ unop ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) ++ LD b1, 2 * SIZE(BO) + -+ ADD4 c13, t3, c13 -+ unop ++ ADD c05, t3, c05 ++ ldi L, -2(L) + MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) ++ LD a1, -2 * SIZE(AO) + -+ ADD2 c14, t4, c14 ++ ADD c06, t4, c06 + unop + MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD4 c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD2 c06, t4, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ -+ ADD1 c09, t1, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD3 c10, t2, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD4 c13, t3, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD2 c14, t4, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) ++ LD a2, -1 * SIZE(AO) + -+ ADD1 c01, t1, c01 -+ ldi L, -2(L) ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) + -+ ADD3 c02, t2, c02 ++ ADD c02, t2, c02 + unop + MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) ++ LD b3, 0 * SIZE(BO) + -+ ADD4 c05, t3, c05 ++ ADD c05, t3, c05 + unop -+ MUL a3, b5, t3 ++ MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + -+ ADD2 c06, t4, c06 -+ MUL a4, b5, t4 ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ + LD a4, 1 * SIZE(AO) -+ bgt L, $L22 ++ unop ++ unop ++ bgt L, $L62 + .align 4 + -+$L25: -+ ADD1 c09, t1, c09 -+ fldd alpha_r, ALPHA_R ++$L65: ++ ADD c01, t1, c01 + MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L28 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 +#else -+ blbs TMP1, $L28 ++ blbs TMP1, $L67 +#endif + .align 4 + -+ ADD3 c10, t2, c10 ++ ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + -+ ADD4 c13, t3, c13 -+ unop ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 -+ unop ++ LD a1, 0 * SIZE(AO) + -+ ADD2 c14, t4, c14 ++ ADD c06, t4, c06 + unop + MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD4 c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD2 c06, t4, c06 -+ unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) + -+ ADD1 c09, t1, c09 -+ LD b4, 3 * SIZE(BO) ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) + .align 4 + -+$L28: -+ ADD3 c10, t2, c10 -+ unop ++$L67: ++ ADD c02, t2, c02 + MUL a2, b1, t2 -+ fldd alpha_i, ALPHA_I -+ -+ ADD4 c13, t3, c13 -+ unop ++ ADD c05, t3, c05 + MUL a1, b2, t3 -+#ifndef TRMMKERNEL -+ LD c03, 0 * SIZE(C1) -+#else -+ unop -+#endif + -+ ADD2 c14, t4, c14 -+ unop ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 -+#ifndef TRMMKERNEL -+ LD c04, 1 * SIZE(C1) ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c05, t3, c05 ++ ADD c06, t4, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 +#else -+ unop ++ subl KK, 2, TMP1 +#endif -+ -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+#ifndef TRMMKERNEL -+ LD c11, 0 * SIZE(C2) ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO +#else -+ unop ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) +#endif + -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+#ifndef TRMMKERNEL -+ LD c12, 1 * SIZE(C2) ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 +#else -+ unop ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 +#endif + -+ ADD4 c05, t3, c05 -+ MUL a1, b4, t3 -+ ADD2 c06, t4, c06 -+ MUL a2, b4, t4 ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ ADD1 c09, t1, c09 -+ ADD3 c10, t2, c10 -+ ADD4 c13, t3, c13 -+ ADD2 c14, t4, c14 ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 + -+ ADD c01, c06, c01 -+ ADD c02, c05, c02 -+ ADD c09, c14, c09 -+ ADD c10, c13, c10 ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 + -+ MUL alpha_r, c01, t1 -+ MUL alpha_r, c02, t2 -+ MUL alpha_r, c09, t3 -+ MUL alpha_r, c10, t4 ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 + -+#ifndef TRMMKERNEL -+ ADD c03, t1, c03 -+ MUL alpha_i, c02, t1 -+ ADD c04, t2, c04 -+ MUL alpha_i, c01, t2 ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif + -+ ADD c11, t3, c11 -+ MUL alpha_i, c10, t3 -+ ADD c12, t4, c12 -+ MUL alpha_i, c09, t4 -+#else -+ ADD $f31, t1, c03 -+ MUL alpha_i, c02, t1 -+ ADD $f31, t2, c04 -+ MUL alpha_i, c01, t2 ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) + -+ ADD $f31, t3, c11 -+ MUL alpha_i, c10, t3 -+ ADD $f31, t4, c12 -+ MUL alpha_i, c09, t4 -+#endif ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 + -+ SUB c03, t1, c03 -+ ADD c04, t2, c04 -+ SUB c11, t3, c11 -+ ADD c12, t4, c12 ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 + -+ ST c03, 0 * SIZE(C1) -+ ST c04, 1 * SIZE(C1) -+ ST c11, 0 * SIZE(C2) -+ ST c12, 1 * SIZE(C2) ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 + -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 1, TMP1 -+#else -+ subl TMP1, 2, TMP1 ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 +#endif -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 +#endif + -+#if defined(TRMMKERNEL) && defined(LEFT) -+ addl KK, 1, KK ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 +#endif -+ .align 4 + -+$L29: -+ mov BO, B -+ ldi J, -1(J) -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addl KK, 2, KK ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) +#else -+ unop ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) +#endif -+ bgt J, $L01 -+ .align 4 + -+$L30: -+ and N, 1, J -+ ble J, $L999 ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif + -+ mov C, C1 -+ mov A, AO ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) + -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) +#endif + -+ sra M, 1, I -+ ble I, $L50 -+ .align 4 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+$L41: -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 2, TMP1 -+#else -+ addl KK, 1, TMP1 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK +#endif ++ ++#ifdef LN ++ subl KK, 2, KK +#endif ++ .align 4 ++ ++$L70: ++ sra M, 2, I ++ ble I, $L79 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) -+ fclr t1 ++ fclr c03 + LD a2, 1 * SIZE(AO) -+ fclr t2 ++ fclr c07 + LD a3, 2 * SIZE(AO) -+ fclr t3 ++ fclr c04 + LD a4, 3 * SIZE(AO) -+ fclr t4 ++ fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 @@ -72204,34 +15481,35 @@ index 0000000..2133673 + LD b4, 3 * SIZE(B) + fclr c06 + ++ ldi L, -2(KK) ++ + ldi BO, 2 * SIZE(B) -+ fclr c03 + ldi AO, 4 * SIZE(AO) -+ fclr c07 + -+#ifndef TRMMKERNEL -+ ldi L, -2(K) ++ ble KK, $L58 ++ ++ ble L, $L55 +#else -+ ldi L, -2(TMP1) ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG +#endif -+ fclr c04 -+ fclr c08 -+ ble L, $L45 -+#else -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl B, TMP1, BO ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) -+ fclr t1 ++ fclr c03 + LD a2, 1 * SIZE(AO) -+ fclr t2 ++ fclr c07 + LD a3, 2 * SIZE(AO) -+ fclr t3 ++ fclr c04 + LD a4, 3 * SIZE(AO) -+ fclr t4 ++ fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 @@ -72242,279 +15520,533 @@ index 0000000..2133673 + LD b4, 3 * SIZE(BO) + fclr c06 + ++ ldi L, -2(TMP1) + ldi BO, 2 * SIZE(BO) -+ fclr c03 + ldi AO, 4 * SIZE(AO) -+ fclr c07 + -+ ldi L, -2(TMP1) -+ fclr c04 -+ fclr c08 -+ ble L, $L45 ++ ble TMP1, $L58 ++ ++ ble L, $L55 +#endif -+ .align 5 ++ .align 4 + -+$L42: -+ ADD4 c05, t1, c05 ++$L52: ++ ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + -+ ADD2 c06, t2, c06 ++ ADD c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 + unop + -+ ADD4 c07, t3, c07 ++ ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + -+ ADD2 c08, t4, c08 ++ ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + -+ ADD1 c01, t1, c01 ++ ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + -+ ADD3 c02, t2, c02 ++ ADD c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + -+ ADD1 c03, t3, c03 ++ ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + -+ ADD3 c04, t4, c04 ++ ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + -+ ADD4 c05, t1, c05 ++ ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + -+ ADD2 c06, t2, c06 ++ ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + -+ ADD4 c07, t3, c07 ++ ADD c07, t3, c07 + unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + -+ ADD2 c08, t4, c08 ++ ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + -+ ADD1 c01, t1, c01 ++ ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + -+ ADD3 c02, t2, c02 ++ ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + -+ ADD1 c03, t3, c03 ++ ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + -+ ADD3 c04, t4, c04 ++ ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) -+ bgt L, $L42 ++ bgt L, $L52 + .align 4 + -+$L45: -+ ADD4 c05, t1, c05 -+ fldd alpha_r, ALPHA_R -+ MUL b1, a1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L48 ++$L55: ++ ADD c05, t1, c05 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 +#else -+ blbs TMP1, $L48 ++ blbs TMP1, $L57 +#endif + .align 4 + -+ ADD2 c06, t2, c06 ++ ADD c06, t2, c06 + MUL a2, b1, t2 -+ ADD4 c07, t3, c07 ++ ADD c07, t3, c07 + MUL a3, b1, t3 + -+ ADD2 c08, t4, c08 ++ ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + -+ ADD1 c01, t1, c01 ++ ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + -+ ADD3 c02, t2, c02 ++ ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + -+ ADD1 c03, t3, c03 ++ ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + -+ ADD3 c04, t4, c04 ++ ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + -+ ADD4 c05, t1, c05 ++ ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + ldi BO, 2 * SIZE(BO) + .align 4 + -+$L48: -+ ADD2 c06, t2, c06 -+ unop ++$L57: ++ ADD c06, t2, c06 + MUL a2, b1, t2 -+ fldd alpha_i, ALPHA_I -+ -+ ADD4 c07, t3, c07 -+ ldi I, -1(I) ++ ADD c07, t3, c07 + MUL a3, b1, t3 -+#ifndef TRMMKERNEL -+ LD c09, 0 * SIZE(C1) -+#else -+ unop -+#endif + -+ ADD2 c08, t4, c08 -+ unop ++ ADD c08, t4, c08 + MUL a4, b1, t4 -+#ifndef TRMMKERNEL -+ LD c10, 1 * SIZE(C1) ++ ADD c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, c05 ++ ADD c06, t2, c06 ++ ADD c07, t3, c07 ++ ADD c08, t4, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 +#else -+ unop ++ subl KK, 2, TMP1 +#endif -+ -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+#ifndef TRMMKERNEL -+ LD c11, 2 * SIZE(C1) ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO +#else -+ unop ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) +#endif + -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b2, t2 -+#ifndef TRMMKERNEL -+ LD c12, 3 * SIZE(C1) ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++ ++ SUB b1, c03, c03 ++ SUB b2, c07, c07 ++ SUB b3, c04, c04 ++ SUB b4, c08, c08 +#else -+ unop ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 +#endif + -+ ADD1 c03, t3, c03 -+ MUL a3, b2, t3 -+ ADD3 c04, t4, c04 -+ MUL a4, b2, t4 ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) + -+ ADD4 c05, t1, c05 -+ ADD2 c06, t2, c06 -+ ADD4 c07, t3, c07 -+ ADD2 c08, t4, c08 ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 + -+ ADD c01, c06, c01 -+ ADD c02, c05, c02 -+ ADD c03, c08, c03 -+ ADD c04, c07, c04 ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 + -+ MUL alpha_r, c01, t1 -+ MUL alpha_r, c02, t2 -+ MUL alpha_r, c03, t3 -+ MUL alpha_r, c04, t4 ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 + -+#ifndef TRMMKERNEL -+ ADD c09, t1, c09 -+ MUL alpha_i, c02, t1 -+ ADD c10, t2, c10 -+ MUL alpha_i, c01, t2 ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 + -+ ADD c11, t3, c11 -+ MUL alpha_i, c04, t3 -+ ADD c12, t4, c12 -+ MUL alpha_i, c03, t4 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++ MUL a3, c07, c07 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) +#else -+ ADD $f31, t1, c09 -+ MUL alpha_i, c02, t1 -+ ADD $f31, t2, c10 -+ MUL alpha_i, c01, t2 ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) + -+ ADD $f31, t3, c11 -+ MUL alpha_i, c04, t3 -+ ADD $f31, t4, c12 -+ MUL alpha_i, c03, t4 ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) +#endif + -+ SUB c09, t1, c09 -+ ADD c10, t2, c10 -+ SUB c11, t3, c11 -+ ADD c12, t4, c12 ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif + -+ ST c09, 0 * SIZE(C1) -+ ST c10, 1 * SIZE(C1) -+ ST c11, 2 * SIZE(C1) -+ ST c12, 3 * SIZE(C1) ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) + ++#ifndef LN + ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif + -+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TMP1 -+#ifdef LEFT -+ subl TMP1, 2, TMP1 -+#else -+ subl TMP1, 1, TMP1 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG +#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 + addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ sll TMP1, BASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + -+#if defined(TRMMKERNEL) && defined(LEFT) ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN + addl KK, 2, KK +#endif + -+ bgt I, $L41 -+ .align 4 -+ -+$L50: ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ + and M, 1, I -+ ble I, $L999 -+ -+#if !defined(TRMMKERNEL) || \ -+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ -+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ble I, $L100 + -+#ifdef TRMMKERNEL -+#ifdef LEFT -+ addl KK, 1, TMP1 -+#else -+ addl KK, 1, TMP1 -+#endif -+#endif ++#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 @@ -72528,26 +16060,27 @@ index 0000000..2133673 + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) -+ fclr c05 -+ ++ fclr c02 + LD b3, 2 * SIZE(B) -+ fclr c02 ++ fclr c03 + LD b4, 3 * SIZE(B) -+ fclr c06 -+ -+ ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(B) ++ fclr c04 + -+#ifndef TRMMKERNEL -+ ldi L, -2(K) ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 +#else -+ ldi L, -2(TMP1) ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG +#endif -+ ble L, $L55 -+#else -+ sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl AO, TMP1, AO -+ addl B, TMP1, BO ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) @@ -72562,6499 +16095,4840 @@ index 0000000..2133673 + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) -+ fclr c05 -+ ++ fclr c02 + LD b3, 2 * SIZE(BO) -+ fclr c02 ++ fclr c03 + LD b4, 3 * SIZE(BO) -+ fclr c06 -+ -+ ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(BO) ++ fclr c04 + -+ ldi L, -2(TMP1) -+ ble L, $L55 ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 +#endif -+ .align 5 ++ .align 4 + -+$L52: -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b1, t1 -+ unop ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+ ADD3 c02, t2, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) + -+ ADD4 c05, t3, c05 -+ ldi L, -2(L) -+ MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) + -+ ADD2 c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) + -+ ADD1 c01, t1, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) + ldi BO, 4 * SIZE(BO) -+ -+ ADD3 c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) -+ -+ ADD4 c05, t3, c05 -+ unop -+ MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD2 c06, t4, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ unop -+ -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L52 ++ bgt L, $L112 + .align 4 + -+$L55: -+ ADD1 c01, t1, c01 -+ fldd alpha_r, ALPHA_R -+ MUL a1, b1, t1 -+#ifndef TRMMKERNEL -+ blbs K, $L58 ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L +#else -+ blbs TMP1, $L58 ++ and TMP1, 3, L +#endif -+ .align 4 -+ -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD4 c05, t3, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) ++ ble L, $L118 ++ .align 4 + -+ ADD2 c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) + -+ ADD1 c01, t1, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi AO, 2 * SIZE(AO) ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 + .align 4 + -+$L58: -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b1, t2 -+ fldd alpha_i, ALPHA_I ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 + -+ ADD4 c05, t3, c05 -+ unop -+ MUL a1, b2, t3 -+#ifndef TRMMKERNEL -+ LD c03, 0 * SIZE(C1) -+#else -+ unop -+#endif ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 + -+ ADD2 c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+#ifndef TRMMKERNEL -+ LD c04, 1 * SIZE(C1) -+#else -+ unop ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO +#endif + -+ ADD1 c01, t1, c01 -+ ADD3 c02, t2, c02 -+ ADD4 c05, t3, c05 -+ ADD2 c06, t4, c06 -+ -+ ADD c01, c06, c01 -+ ADD c02, c05, c02 -+ -+ MUL alpha_r, c01, t1 -+ MUL alpha_r, c02, t2 -+ MUL alpha_i, c02, t3 -+ MUL alpha_i, c01, t4 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) + -+#ifndef TRMMKERNEL -+ ADD c03, t1, c03 -+ ADD c04, t2, c04 ++ SUB a1, c01, c01 +#else -+ ADD $f31, t1, c03 -+ ADD $f31, t2, c04 ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, c01 +#endif + -+ SUB c03, t3, c03 -+ ADD c04, t4, c04 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) + -+ ST c03, 0 * SIZE(C1) -+ ST c04, 1 * SIZE(C1) -+ .align 4 ++ MUL a1, c01, c01 ++#endif + -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ .ident VERSION -+ .end CNAME -diff --git a/kernel/sw_64/zgemm_kernel_simd_8x2.S b/kernel/sw_64/zgemm_kernel_simd_8x2.S -new file mode 100644 -index 0000000..f6a36fb ---- /dev/null -+++ b/kernel/sw_64/zgemm_kernel_simd_8x2.S -@@ -0,0 +1,3189 @@ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) + -+#if !defined(SW2B) -+#error "Architecture is not specified." ++ MUL a1, c01, c01 +#endif + -+#define STACKSIZE 128 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $21 -+#define B $22 -+#define C $20 -+#define LDC $23 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif + -+#define C1 $19 -+#define C2 $24 ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif + -+#define PREA $10 -+#define PREB $11 ++ ST c01, 0 * SIZE(C1) + -+#define AO $9 -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif + -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif + -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif + -+#define a5 $f16 -+#define a6 $f24 -+#define a7 $f25 -+#define a8 $f26 ++#ifdef LT ++ addl KK, 1, KK ++#endif + -+#define b5 $f27 -+#define b6 $f28 -+#define b7 $f29 -+#define b8 $f30 ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 + -+#define alpha_i $f29 -+#define alpha_r $f30 ++$L100: ++ and M, 2, I ++ ble I, $L110 + -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 ++#if defined(LT) || defined(RN) + -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+#define TMP1 $0 -+#define TEMP $1 -+#define KK $2 -+#define BB $3 -+#define OFFSET $4 ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO + -+#define ALPHA_R 64($sp) -+#define ALPHA_I 72($sp) ++ subl K, KK, TMP1 + -+/* -+ *=================== -+ * (a+bi)*(c+di) -+ * ADD1 ac '+' bd -+ * ADD2 ad '+' bc -+ * FMAD5 a*alpha_r + real part -+ * FMAD6 a*alpha_i + image part -+ * FMAD7 b*alpha_r + image part -+ * FMAD8 b*alpha_i + real part -+ -+ *=================== -+ */ -+ -+/* -+ *=================== -+ * (a+bi) * (c+di) -+ * (a+bi) * (alpha_r+alpha_i) -+ *=================== -+ */ -+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -+#define ADD1 SUB -+#define ADD2 ADD -+#define FMAD5 MAD -+#define FMAD6 MAD -+#define FMAD7 MAD -+#define FMAD8 NMAD -+#endif ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+/* -+ *=================== -+ * (a-bi) * (c+di) -+ * (a+bi) * (alpha_r+alpha_i) -+ *=================== -+ */ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 + -+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) -+#define ADD1 ADD -+#define ADD2 SUB -+#define FMAD5 MAD -+#define FMAD6 MAD -+#define FMAD7 MAD -+#define FMAD8 NMAD ++ sra TMP1, 2, L ++ ble L, $L105 +#endif ++ .align 5 + -+/* -+ *=================== -+ * (a+bi) * (c-di) -+ * (a-bi) * (alpha_r+alpha_i) -+ *=================== -+ */ ++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+#if defined(RN) || defined(RT) || defined(CN) || defined(CT) -+#define ADD1 ADD -+#define ADD2 SUB -+#define FMAD5 MAD -+#define FMAD6 MAD -+#define FMAD7 NMAD -+#define FMAD8 MAD -+#endif -+ -+/* -+ *=================== -+ * (a-bi) * (c-di) -+ * (a-bi) * (alpha_r+alpha_i) -+ *=================== -+ */ -+#if defined(RR) || defined(RC) || defined(CR) || defined(CC) -+#define ADD1 SUB -+#define ADD2 ADD -+#define FMAD5 MAD -+#define FMAD6 MAD -+#define FMAD7 NMAD -+#define FMAD8 MAD -+#endif ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) + ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) + ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) + -+ PROLOGUE -+ PROFCODE ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) + -+ .frame $30, STACKSIZE, $26, 0 -+ ldi $sp, -STACKSIZE($sp) ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) + ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 + -+ ldl B, 0 + STACKSIZE($sp) -+ ldl C, 8 + STACKSIZE($sp) -+ ldl LDC, 16 + STACKSIZE($sp) -+#ifdef TRMMKERNEL -+ ldl OFFSET, 24 + STACKSIZE($sp) ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L +#endif ++ ble L, $L108 ++ .align 4 + -+ sll LDC, ZBASE_SHIFT, LDC # LDC*sizebyte -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ ST $f19, ALPHA_R -+ ST $f20, ALPHA_I ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) + -+ stl $9, 80($sp) # Integer Saved Register -+ stl $10,88($sp) -+ stl $11,96($sp) -+ stl $12,104($sp) -+ stl $13,112($sp) -+ stl $14,120($sp) ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) + ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 + -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 ++$L108: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 + -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 + -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ subl $31, OFFSET, KK ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 +#endif -+ -+ sra N, 1, J # J=N/2 -+ ble J, $L50 -+ .align 4 -+ -+$L01: -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO +#endif + -+ sra M, 3, I # I=M/8 -+ sll K, ZBASE_SHIFT, PREB -+ -+ sll K, 2+ZBASE_SHIFT, PREA -+ mov C, C1 -+ -+ addl C, LDC, C2 -+ mov A, AO # Reset A -+ -+ addl PREB, B, PREB -+ addl C2, LDC, C # Change C to next panel ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) + -+ addl PREA, A, PREA -+ beq I, $L20 # GEMM_MR=8 -+ -+$L11: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B, BO # LL && RU reset B -+ nop ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 +#else -+ sll KK, 3 + ZBASE_SHIFT, L # KK*8mr -+ sll KK, 1 + ZBASE_SHIFT, TEMP # KK*2nr ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) + -+ addl AO, L, AO # mov AO point to the data part -+ addl B,TEMP,BO # mov BO point to the data part ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 +#endif + -+ vcpys $f31,$f31,c01 # Clear result regs -+ fillcs 0(C1) -+ fillcs 4*SIZE(C1) ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ vcpys $f31,$f31,c02 -+ fillcs 8*SIZE(C1) -+ fillcs 12*SIZE(C1) ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+ vcpys $f31,$f31,c03 -+ fillcs 0(C2) -+ fillcs 4*SIZE(C2) ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) + -+ vcpys $f31,$f31,c04 -+ fillcs 8*SIZE(C2) -+ fillcs 12*SIZE(C2) ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c02, c02 ++#endif + -+ vcpys $f31,$f31,c05 -+ vcpys $f31,$f31,c06 -+ vcpys $f31,$f31,c07 -+ vcpys $f31,$f31,c08 -+ -+ vcpys $f31,$f31,c09 -+ LDDE b1, 0 * SIZE(BO) # B1R -+ LDDE b2, 1 * SIZE(BO) # B1I ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) + -+ vcpys $f31,$f31,c10 -+ VLD a1, 0 * SIZE(AO) # A1, A2 -+ VLD a2, 4 * SIZE(AO) # A3, A4 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++#endif + -+ vcpys $f31,$f31,c11 -+ LDDE b3, 2 * SIZE(BO) # B2R -+ LDDE b4, 3 * SIZE(BO) # B2I ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif + -+ vcpys $f31,$f31,c12 -+ VLD a3, 8 * SIZE(AO) # A5, A6 -+ VLD a4,12 * SIZE(AO) # A7, A8 ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif + -+ vcpys $f31,$f31,c13 -+ vcpys $f31,$f31,c14 -+ vcpys $f31,$f31,c15 -+ vcpys $f31,$f31,c16 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) + ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif + ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+#if (defined(LEFT) && !defined(TRANSA)) \ -+ ||(!defined(LEFT) && defined(TRANSA)) -+ subl K, KK, TEMP # temp is the length of data part -+#elif defined(LEFT) -+ addl KK, 8, TEMP # mr=8, careful about complex -+#else -+ addl KK, 2, TEMP # nr=2 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO +#endif -+ sra TEMP, 1, L # L=TEMP/2 -+ ble L, $L15 + -+#else -+ vcpys $f31,$f31,c01 # Clear result regs -+ mov B, BO # Set B, (block A x panel Bj) -+ sra K, 1, L # Unroll K as 2 -+ -+ vcpys $f31,$f31,c02 -+ fillcs 0(C1) -+ fillcs 4*SIZE(C1) -+ -+ vcpys $f31,$f31,c03 -+ fillcs 8*SIZE(C1) -+ fillcs 12*SIZE(C1) -+ -+ vcpys $f31,$f31,c04 -+ fillcs 0(C2) -+ fillcs 4*SIZE(C2) -+ -+ vcpys $f31,$f31,c05 -+ fillcs 8*SIZE(C2) -+ fillcs 12*SIZE(C2) -+ -+ vcpys $f31,$f31,c06 -+ vcpys $f31,$f31,c07 -+ vcpys $f31,$f31,c08 -+ vcpys $f31,$f31,c09 -+ -+ vcpys $f31,$f31,c10 -+ LDDE b1, 0 * SIZE(BO) # B1R -+ LDDE b2, 1 * SIZE(BO) # B1I -+ -+ vcpys $f31,$f31,c11 -+ VLD a1, 0 * SIZE(AO) # A1, A2 -+ VLD a2, 4 * SIZE(AO) # A3, A4 -+ -+ vcpys $f31,$f31,c12 -+ LDDE b3, 2 * SIZE(BO) # B2R -+ LDDE b4, 3 * SIZE(BO) # B2I -+ -+ vcpys $f31,$f31,c13 -+ VLD a3, 8 * SIZE(AO) # A5, A6 -+ VLD a4,12 * SIZE(AO) # A7, A8 -+ -+ vcpys $f31,$f31,c14 -+ vcpys $f31,$f31,c15 -+ -+ vcpys $f31,$f31,c16 -+ ble L, $L15 ++#ifdef LT ++ addl KK, 2, KK +#endif + ++#ifdef LN ++ subl KK, 2, KK ++#endif + .align 4 -+$L12: -+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE -+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) -+ LDDE b5, 4 * SIZE(BO) # next B1R -+ -+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) -+ LDDE b6, 5 * SIZE(BO) # next B1I -+ -+ VMAD a2,b1,c05,c05 # C31, C41 -+ VLD a8,12 * SIZE(AO) # next A7, A8 + -+ VMAD a2,b2,c06,c06 # C31, C41 -+ VLD a7, 8 * SIZE(AO) # next A5, A6 -+ -+ VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc) -+ VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd) -+ VMAD a3,b1,c09,c09 # C51, C61 -+ VMAD a3,b2,c10,c10 # C51, C61 ++$L110: ++ sra M, 2, I ++ ble I, $L119 ++ .align 4 + ++$L91: ++#if defined(LT) || defined(RN) + -+ VMAD a2,b3,c07,c07 # C32, C42 -+ LDDE b7, 6 * SIZE(BO) # next B2R ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ VMAD a2,b4,c08,c08 # C32, C42 -+ LDDE b8, 7 * SIZE(BO) # next B2I ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+ VMAD a4,b1,c13,c13 # C71, C81 -+ VLD a5, 0 * SIZE(AO) # next A1, A2, a5==a0 ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 + -+ VMAD a4,b2,c14,c14 # C71, C81 -+ VLD a6, 4 * SIZE(AO) # next A3, A4 -+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO + -+ VMAD a3,b3,c11,c11 # C52, C62 -+ fillcs 0(PREB) ++ subl K, KK, TMP1 + -+ VMAD a3,b4,c12,c12 # C52, C62 -+ fillcs 0(PREA) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ VMAD a4,b3,c15,c15 # C72, C82 -+ fillcs 8*SIZE(PREA) ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 + -+ VMAD a4,b4,c16,c16 # C72, C82 -+ subl L, 1, L # -+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 + -+ VMAD a8,b5,c13,c13 -+ LDDE b1, 0 * SIZE(BO) ++$L92: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+ VMAD a8,b6,c14,c14 -+ LDDE b2, 1 * SIZE(BO) ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) + -+ VMAD a7,b5,c09,c09 -+ addl PREA, 16*SIZE, PREA -+ VLD a4,12 * SIZE(AO) ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) + -+ VMAD a7,b6,c10,c10 -+ VLD a3, 8 * SIZE(AO) ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+ VMAD a5,b5,c01,c01 -+ VMAD a5,b6,c02,c02 -+ VMAD a5,b7,c03,c03 -+ VMAD a5,b8,c04,c04 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) + -+ VMAD a8,b7,c15,c15 -+ LDDE b3, 2 * SIZE(BO) ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) + -+ VMAD a8,b8,c16,c16 -+ LDDE b4, 3 * SIZE(BO) ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) + -+ VMAD a6,b5,c05,c05 -+ VLD a1, 0 * SIZE(AO) ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) + -+ VMAD a6,b6,c06,c06 -+ VLD a2, 4 * SIZE(AO) ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) + ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) + -+ VMAD a7,b7,c11,c11 -+ fillcs 4*SIZE(PREB) ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) + -+ VMAD a7,b8,c12,c12 -+ fillcs 0(PREA) ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) + -+ VMAD a6,b7,c07,c07 -+ addl PREB, 8*SIZE, PREB -+ fillcs 8*SIZE(PREA) ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) + -+ VMAD a6,b8,c08,c08 -+ addl PREA, 16*SIZE, PREA -+ bne L, $L12 # continue K ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) + -+$L15: -+ LD alpha_r, ALPHA_R # $f30==b8 -+#ifndef TRMMKERNEL -+ blbc K, $L18 # if(K&1) -+#else -+ blbc TEMP, $L18 -+#endif ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) + -+$L16: -+ VMAD a1,b1,c01,c01 # C11R C21R -+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE -+ -+ VMAD a1,b2,c02,c02 # C11I C21I -+ addl BO, 4*SIZE, BO -+ -+ VMAD a1,b3,c03,c03 # C12R c22R -+ VMAD a1,b4,c04,c04 # C12I C22I -+ -+ VMAD a2,b1,c05,c05 # C31R C41R -+ VMAD a2,b2,c06,c06 # C31I C41I -+ VMAD a2,b3,c07,c07 # C32R C42R -+ VMAD a2,b4,c08,c08 # C32I C42I -+ -+ VMAD a3,b1,c09,c09 # C51R C61R -+ VMAD a3,b2,c10,c10 # C51I C61I -+ VMAD a3,b3,c11,c11 # C52R C62R -+ VMAD a3,b4,c12,c12 # C52I C62I -+ -+ VMAD a4,b1,c13,c13 # C71R C81R -+ VMAD a4,b2,c14,c14 # C71I C81I -+ VMAD a4,b3,c15,c15 # C72R C82R -+ VMAD a4,b4,c16,c16 # C72I C82I -+ -+$L18: # Write back -+ LD alpha_i, ALPHA_I # $f29==b7 -+#ifndef TRMMKERNEL -+ vextf c01, 0, a1 # a1=C11R_ac -+ vextf c01, 1, a2 # a2=C11I_bc -+ vextf c01, 2, a3 # a3=C21R_ac -+ vextf c01, 3, a4 # a4=C21I_bc -+ -+ vextf c02, 0, b1 # b1=C11I_ad -+ vextf c02, 1, b2 # b2=C11R_bd -+ vextf c02, 2, b3 # b3=C21I_ad -+ vextf c02, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 0 * SIZE(C1) -+ LD a2, 1 * SIZE(C1) -+ LD a3, 2 * SIZE(C1) -+ LD a4, 3 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 0 * SIZE(C1) -+ ST c01, 1 * SIZE(C1) -+ ST b6, 2 * SIZE(C1) -+ ST c02, 3 * SIZE(C1) -+ -+ vextf c05, 0, a1 # a1=C11R_ac -+ vextf c05, 1, a2 # a2=C11I_bc -+ vextf c05, 2, a3 # a3=C21R_ac -+ vextf c05, 3, a4 # a4=C21I_bc -+ -+ vextf c06, 0, b1 # b1=C11I_ad -+ vextf c06, 1, b2 # b2=C11R_bd -+ vextf c06, 2, b3 # b3=C21I_ad -+ vextf c06, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 4 * SIZE(C1) -+ LD a2, 5 * SIZE(C1) -+ LD a3, 6 * SIZE(C1) -+ LD a4, 7 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 4 * SIZE(C1) -+ ST c01, 5 * SIZE(C1) -+ ST b6, 6 * SIZE(C1) -+ ST c02, 7 * SIZE(C1) -+ -+ vextf c09, 0, a1 # a1=C11R_ac -+ vextf c09, 1, a2 # a2=C11I_bc -+ vextf c09, 2, a3 # a3=C21R_ac -+ vextf c09, 3, a4 # a4=C21I_bc -+ -+ vextf c10, 0, b1 # b1=C11I_ad -+ vextf c10, 1, b2 # b2=C11R_bd -+ vextf c10, 2, b3 # b3=C21I_ad -+ vextf c10, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 8 * SIZE(C1) -+ LD a2, 9 * SIZE(C1) -+ LD a3, 10 * SIZE(C1) -+ LD a4, 11 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 8 * SIZE(C1) -+ ST c01, 9 * SIZE(C1) -+ ST b6, 10 * SIZE(C1) -+ ST c02, 11 * SIZE(C1) -+ -+ vextf c13, 0, a1 # a1=C11R_ac -+ vextf c13, 1, a2 # a2=C11I_bc -+ vextf c13, 2, a3 # a3=C21R_ac -+ vextf c13, 3, a4 # a4=C21I_bc -+ -+ vextf c14, 0, b1 # b1=C11I_ad -+ vextf c14, 1, b2 # b2=C11R_bd -+ vextf c14, 2, b3 # b3=C21I_ad -+ vextf c14, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 12 * SIZE(C1) -+ LD a2, 13 * SIZE(C1) -+ LD a3, 14 * SIZE(C1) -+ LD a4, 15 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 12 * SIZE(C1) -+ ST c01, 13 * SIZE(C1) -+ ST b6, 14 * SIZE(C1) -+ ST c02, 15 * SIZE(C1) -+ -+ -+ vextf c03, 0, a1 # a1=C11R_ac -+ vextf c03, 1, a2 # a2=C11I_bc -+ vextf c03, 2, a3 # a3=C21R_ac -+ vextf c03, 3, a4 # a4=C21I_bc -+ -+ vextf c04, 0, b1 # b1=C11I_ad -+ vextf c04, 1, b2 # b2=C11R_bd -+ vextf c04, 2, b3 # b3=C21I_ad -+ vextf c04, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 0 * SIZE(C2) -+ LD a2, 1 * SIZE(C2) -+ LD a3, 2 * SIZE(C2) -+ LD a4, 3 * SIZE(C2) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 0 * SIZE(C2) -+ ST c02, 1 * SIZE(C2) -+ ST c05, 2 * SIZE(C2) -+ ST c06, 3 * SIZE(C2) -+ -+ vextf c07, 0, a1 # a1=C11R_ac -+ vextf c07, 1, a2 # a2=C11I_bc -+ vextf c07, 2, a3 # a3=C21R_ac -+ vextf c07, 3, a4 # a4=C21I_bc -+ -+ vextf c08, 0, b1 # b1=C11I_ad -+ vextf c08, 1, b2 # b2=C11R_bd -+ vextf c08, 2, b3 # b3=C21I_ad -+ vextf c08, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 4 * SIZE(C2) -+ LD a2, 5 * SIZE(C2) -+ LD a3, 6 * SIZE(C2) -+ LD a4, 7 * SIZE(C2) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 4 * SIZE(C2) -+ ST c02, 5 * SIZE(C2) -+ ST c05, 6 * SIZE(C2) -+ ST c06, 7 * SIZE(C2) -+ -+ vextf c11, 0, a1 # a1=C11R_ac -+ vextf c11, 1, a2 # a2=C11I_bc -+ vextf c11, 2, a3 # a3=C21R_ac -+ vextf c11, 3, a4 # a4=C21I_bc -+ -+ vextf c12, 0, b1 # b1=C11I_ad -+ vextf c12, 1, b2 # b2=C11R_bd -+ vextf c12, 2, b3 # b3=C21I_ad -+ vextf c12, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 8 * SIZE(C2) -+ LD a2, 9 * SIZE(C2) -+ LD a3, 10 * SIZE(C2) -+ LD a4, 11 * SIZE(C2) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 8 * SIZE(C2) -+ ST c02, 9 * SIZE(C2) -+ ST c05, 10 * SIZE(C2) -+ ST c06, 11 * SIZE(C2) -+ -+ vextf c15, 0, a1 # a1=C11R_ac -+ vextf c15, 1, a2 # a2=C11I_bc -+ vextf c15, 2, a3 # a3=C21R_ac -+ vextf c15, 3, a4 # a4=C21I_bc -+ -+ vextf c16, 0, b1 # b1=C11I_ad -+ vextf c16, 1, b2 # b2=C11R_bd -+ vextf c16, 2, b3 # b3=C21I_ad -+ vextf c16, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 12 * SIZE(C2) -+ LD a2, 13 * SIZE(C2) -+ LD a3, 14 * SIZE(C2) -+ LD a4, 15 * SIZE(C2) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 12 * SIZE(C2) -+ ST c02, 13 * SIZE(C2) -+ ST c05, 14 * SIZE(C2) -+ ST c06, 15 * SIZE(C2) ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 + ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L +#else -+ vextf c01, 0, a1 # a1=C11R_ac -+ vextf c01, 1, a2 # a2=C11I_bc -+ vextf c01, 2, a3 # a3=C21R_ac -+ vextf c01, 3, a4 # a4=C21I_bc -+ -+ vextf c02, 0, b1 # b1=C11I_ad -+ vextf c02, 1, b2 # b2=C11R_bd -+ vextf c02, 2, b3 # b3=C21I_ad -+ vextf c02, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 0 * SIZE(C1) -+ ST c01, 1 * SIZE(C1) -+ ST b6, 2 * SIZE(C1) -+ ST c02, 3 * SIZE(C1) -+ -+ vextf c05, 0, a1 # a1=C11R_ac -+ vextf c05, 1, a2 # a2=C11I_bc -+ vextf c05, 2, a3 # a3=C21R_ac -+ vextf c05, 3, a4 # a4=C21I_bc -+ -+ vextf c06, 0, b1 # b1=C11I_ad -+ vextf c06, 1, b2 # b2=C11R_bd -+ vextf c06, 2, b3 # b3=C21I_ad -+ vextf c06, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 4 * SIZE(C1) -+ ST c01, 5 * SIZE(C1) -+ ST b6, 6 * SIZE(C1) -+ ST c02, 7 * SIZE(C1) -+ -+ vextf c09, 0, a1 # a1=C11R_ac -+ vextf c09, 1, a2 # a2=C11I_bc -+ vextf c09, 2, a3 # a3=C21R_ac -+ vextf c09, 3, a4 # a4=C21I_bc -+ -+ vextf c10, 0, b1 # b1=C11I_ad -+ vextf c10, 1, b2 # b2=C11R_bd -+ vextf c10, 2, b3 # b3=C21I_ad -+ vextf c10, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 8 * SIZE(C1) -+ ST c01, 9 * SIZE(C1) -+ ST b6, 10 * SIZE(C1) -+ ST c02, 11 * SIZE(C1) -+ -+ vextf c13, 0, a1 # a1=C11R_ac -+ vextf c13, 1, a2 # a2=C11I_bc -+ vextf c13, 2, a3 # a3=C21R_ac -+ vextf c13, 3, a4 # a4=C21I_bc -+ -+ vextf c14, 0, b1 # b1=C11I_ad -+ vextf c14, 1, b2 # b2=C11R_bd -+ vextf c14, 2, b3 # b3=C21I_ad -+ vextf c14, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 12 * SIZE(C1) -+ ST c01, 13 * SIZE(C1) -+ ST b6, 14 * SIZE(C1) -+ ST c02, 15 * SIZE(C1) -+ -+ -+ vextf c03, 0, a1 # a1=C11R_ac -+ vextf c03, 1, a2 # a2=C11I_bc -+ vextf c03, 2, a3 # a3=C21R_ac -+ vextf c03, 3, a4 # a4=C21I_bc -+ -+ vextf c04, 0, b1 # b1=C11I_ad -+ vextf c04, 1, b2 # b2=C11R_bd -+ vextf c04, 2, b3 # b3=C21I_ad -+ vextf c04, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 0 * SIZE(C2) -+ ST c02, 1 * SIZE(C2) -+ ST c05, 2 * SIZE(C2) -+ ST c06, 3 * SIZE(C2) -+ -+ vextf c07, 0, a1 # a1=C11R_ac -+ vextf c07, 1, a2 # a2=C11I_bc -+ vextf c07, 2, a3 # a3=C21R_ac -+ vextf c07, 3, a4 # a4=C21I_bc -+ -+ vextf c08, 0, b1 # b1=C11I_ad -+ vextf c08, 1, b2 # b2=C11R_bd -+ vextf c08, 2, b3 # b3=C21I_ad -+ vextf c08, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 4 * SIZE(C2) -+ ST c02, 5 * SIZE(C2) -+ ST c05, 6 * SIZE(C2) -+ ST c06, 7 * SIZE(C2) -+ -+ vextf c11, 0, a1 # a1=C11R_ac -+ vextf c11, 1, a2 # a2=C11I_bc -+ vextf c11, 2, a3 # a3=C21R_ac -+ vextf c11, 3, a4 # a4=C21I_bc -+ -+ vextf c12, 0, b1 # b1=C11I_ad -+ vextf c12, 1, b2 # b2=C11R_bd -+ vextf c12, 2, b3 # b3=C21I_ad -+ vextf c12, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 8 * SIZE(C2) -+ ST c02, 9 * SIZE(C2) -+ ST c05, 10 * SIZE(C2) -+ ST c06, 11 * SIZE(C2) -+ -+ vextf c15, 0, a1 # a1=C11R_ac -+ vextf c15, 1, a2 # a2=C11I_bc -+ vextf c15, 2, a3 # a3=C21R_ac -+ vextf c15, 3, a4 # a4=C21I_bc -+ -+ vextf c16, 0, b1 # b1=C11I_ad -+ vextf c16, 1, b2 # b2=C11R_bd -+ vextf c16, 2, b3 # b3=C21I_ad -+ vextf c16, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 12 * SIZE(C2) -+ ST c02, 13 * SIZE(C2) -+ ST c05, 14 * SIZE(C2) -+ ST c06, 15 * SIZE(C2) -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 8, TEMP -+#else -+ subl TEMP, 2, TEMP ++ and TMP1, 3, L +#endif ++ unop ++ ble L, $L98 ++ .align 4 + -+ sll TEMP, 3 + ZBASE_SHIFT,L # mr=8 -+ sll TEMP, 1 + ZBASE_SHIFT,TEMP # nr=2 ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+ addl AO, L, AO -+ addl BO, TEMP, BO -+#endif ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) + -+#ifdef LEFT -+ addl KK,8,KK -+#endif -+#endif ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) + -+ jmp $L09 ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) + -+ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 + .align 4 + -+$L20: # N=2, M=4 -+ and M, 4, I # I=M&4 -+ ble I, $L30 ++$L98: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 + -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B, BO -+ nop ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 +#else -+ sll KK, 2 + ZBASE_SHIFT, L # mr=4 -+ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2 -+ -+ addl AO, L, AO -+ addl B, TEMP, BO -+#endif -+ fillcs 0(C1) -+ fillcs 4*SIZE(C1) -+ fillcs 8*SIZE(C1) -+ -+ vcpys $f31,$f31,c01 # Clear result regs -+ vcpys $f31,$f31,c02 -+ vcpys $f31,$f31,c03 -+ vcpys $f31,$f31,c04 -+ -+ fillcs 0(C2) -+ fillcs 4*SIZE(C2) -+ fillcs 8*SIZE(C2) -+ -+ vcpys $f31,$f31,c05 -+ vcpys $f31,$f31,c06 -+ vcpys $f31,$f31,c07 -+ vcpys $f31,$f31,c08 -+ -+ LDDE b1, 0 * SIZE(BO) # B1R -+ LDDE b2, 1 * SIZE(BO) # B1I -+ LDDE b3, 2 * SIZE(BO) # B2R -+ LDDE b4, 3 * SIZE(BO) # B2I -+ -+ VLD a1, 0 * SIZE(AO) # A1, A2 -+ VLD a2, 4 * SIZE(AO) # A3, A4 -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl K, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 4, TEMP # mr=4 -+#else -+ addl KK, 2,TEMP # nr=2 -+#endif -+ sra TEMP, 1, L -+ ble L, $L25 ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 +#else -+ mov B, BO # Set B, (block A x panel Bj) -+ sra K, 1, L # Unroll K as 2 -+ -+ fillcs 0(C1) -+ fillcs 4*SIZE(C1) -+ fillcs 8*SIZE(C1) -+ -+ vcpys $f31,$f31,c01 # Clear result regs -+ vcpys $f31,$f31,c02 -+ vcpys $f31,$f31,c03 -+ vcpys $f31,$f31,c04 -+ -+ fillcs 0(C2) -+ fillcs 4*SIZE(C2) -+ fillcs 8*SIZE(C2) -+ -+ vcpys $f31,$f31,c05 -+ vcpys $f31,$f31,c06 -+ vcpys $f31,$f31,c07 -+ vcpys $f31,$f31,c08 -+ -+ LDDE b1, 0 * SIZE(BO) # B1R -+ LDDE b2, 1 * SIZE(BO) # B1I -+ LDDE b3, 2 * SIZE(BO) # B2R -+ LDDE b4, 3 * SIZE(BO) # B2I -+ -+ VLD a1, 0 * SIZE(AO) # A1, A2 -+ VLD a2, 4 * SIZE(AO) # A3, A4 ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ ble L, $L25 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 +#endif + -+ .align 4 -+$L22: -+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) -+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) -+ VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc) -+ VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd) -+ -+ LDDE b5, 4 * SIZE(BO) # next B1R -+ LDDE b6, 5 * SIZE(BO) # next B1I -+ LDDE b7, 6 * SIZE(BO) # next B2R -+ LDDE b8, 7 * SIZE(BO) # next B2I -+ -+ fillcs 0(PREB) -+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE -+ VMAD a2,b1,c05,c05 # C31, C41 -+ VMAD a2,b2,c06,c06 # C31, C41 ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) + -+ fillcs 0(PREA) -+ VMAD a2,b3,c07,c07 # C32, C42 -+ VMAD a2,b4,c08,c08 # C32, C42 ++ MUL a1, c04, c04 ++ MUL a2, c04, t1 ++ SUB c03, t1, c03 ++ MUL a3, c04, t1 ++ SUB c02, t1, c02 ++ MUL a4, c04, t1 ++ SUB c01, t1, c01 + -+ VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0 -+ VLD a6, 12 * SIZE(AO) # next A3, A4 ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) + -+ subl L, 1, L # ++ MUL b1, c03, c03 ++ MUL b2, c03, t1 ++ SUB c02, t1, c02 ++ MUL b3, c03, t1 ++ SUB c01, t1, c01 + -+ addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE -+ VMAD a5,b5,c01,c01 -+ VMAD a5,b6,c02,c02 ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ addl PREA, 16*SIZE, PREA -+ VMAD a5,b7,c03,c03 -+ VMAD a5,b8,c04,c04 ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+ LDDE b1, 0 * SIZE(BO) -+ LDDE b2, 1 * SIZE(BO) -+ LDDE b3, 2 * SIZE(BO) -+ LDDE b4, 3 * SIZE(BO) ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ fillcs 4*SIZE(PREB) -+ VMAD a6,b5,c05,c05 -+ VMAD a6,b6,c06,c06 ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c01, t1 ++ SUB c03, t1, c03 ++ MUL a4, c01, t1 ++ SUB c04, t1, c04 + -+ fillcs 0(PREA) -+ VMAD a6,b7,c07,c07 -+ VMAD a6,b8,c08,c08 ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) + -+ VLD a1, 0 * SIZE(AO) -+ VLD a2, 4 * SIZE(AO) ++ MUL b1, c02, c02 ++ MUL b2, c02, t1 ++ SUB c03, t1, c03 ++ MUL b3, c02, t1 ++ SUB c04, t1, c04 + -+ addl PREB, 8*SIZE, PREB -+ addl PREA, 16*SIZE, PREA -+ bne L, $L22 # continue K ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) + -+$L25: -+ LD alpha_r, ALPHA_R # $f30==b8 -+#ifndef TRMMKERNEL -+ blbc K, $L28 # if(K&1) -+#else -+ blbc TEMP, $L28 ++ MUL a1, c03, c03 ++ MUL a2, c03, t1 ++ SUB c04, t1, c04 ++ MUL a3, c04, c04 +#endif + -+$L26: -+ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE -+ VMAD a1,b1,c01,c01 # C11R C21R -+ VMAD a1,b2,c02,c02 # C11I C21I -+ VMAD a1,b3,c03,c03 # C12R c22R -+ VMAD a1,b4,c04,c04 # C12I C22I -+ -+ addl BO, 4*SIZE, BO -+ VMAD a2,b1,c05,c05 # C31R C41R -+ VMAD a2,b2,c06,c06 # C31I C41I -+ VMAD a2,b3,c07,c07 # C32R C42R -+ VMAD a2,b4,c08,c08 # C32I C42I -+ -+$L28: # Write back -+ LD alpha_i, ALPHA_I # $f29==b7 -+#ifndef TRMMKERNEL -+ vextf c01, 0, a1 # a1=C11R_ac -+ vextf c01, 1, a2 # a2=C11I_bc -+ vextf c01, 2, a3 # a3=C21R_ac -+ vextf c01, 3, a4 # a4=C21I_bc -+ -+ vextf c02, 0, b1 # b1=C11I_ad -+ vextf c02, 1, b2 # b2=C11R_bd -+ vextf c02, 2, b3 # b3=C21I_ad -+ vextf c02, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 0 * SIZE(C1) -+ LD a2, 1 * SIZE(C1) -+ LD a3, 2 * SIZE(C1) -+ LD a4, 3 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 0 * SIZE(C1) -+ ST c01, 1 * SIZE(C1) -+ ST b6, 2 * SIZE(C1) -+ ST c02, 3 * SIZE(C1) -+ -+ vextf c05, 0, a1 # a1=C11R_ac -+ vextf c05, 1, a2 # a2=C11I_bc -+ vextf c05, 2, a3 # a3=C21R_ac -+ vextf c05, 3, a4 # a4=C21I_bc -+ -+ vextf c06, 0, b1 # b1=C11I_ad -+ vextf c06, 1, b2 # b2=C11R_bd -+ vextf c06, 2, b3 # b3=C21I_ad -+ vextf c06, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 4 * SIZE(C1) -+ LD a2, 5 * SIZE(C1) -+ LD a3, 6 * SIZE(C1) -+ LD a4, 7 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 4 * SIZE(C1) -+ ST c01, 5 * SIZE(C1) -+ ST b6, 6 * SIZE(C1) -+ ST c02, 7 * SIZE(C1) -+ -+ -+ vextf c03, 0, a1 # a1=C11R_ac -+ vextf c03, 1, a2 # a2=C11I_bc -+ vextf c03, 2, a3 # a3=C21R_ac -+ vextf c03, 3, a4 # a4=C21I_bc -+ -+ vextf c04, 0, b1 # b1=C11I_ad -+ vextf c04, 1, b2 # b2=C11R_bd -+ vextf c04, 2, b3 # b3=C21I_ad -+ vextf c04, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 0 * SIZE(C2) -+ LD a2, 1 * SIZE(C2) -+ LD a3, 2 * SIZE(C2) -+ LD a4, 3 * SIZE(C2) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 0 * SIZE(C2) -+ ST c02, 1 * SIZE(C2) -+ ST c05, 2 * SIZE(C2) -+ ST c06, 3 * SIZE(C2) -+ -+ vextf c07, 0, a1 # a1=C11R_ac -+ vextf c07, 1, a2 # a2=C11I_bc -+ vextf c07, 2, a3 # a3=C21R_ac -+ vextf c07, 3, a4 # a4=C21I_bc -+ -+ vextf c08, 0, b1 # b1=C11I_ad -+ vextf c08, 1, b2 # b2=C11R_bd -+ vextf c08, 2, b3 # b3=C21I_ad -+ vextf c08, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 4 * SIZE(C2) -+ LD a2, 5 * SIZE(C2) -+ LD a3, 6 * SIZE(C2) -+ LD a4, 7 * SIZE(C2) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 4 * SIZE(C2) -+ ST c02, 5 * SIZE(C2) -+ ST c05, 6 * SIZE(C2) -+ ST c06, 7 * SIZE(C2) -+ -+#else ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) + -+ vextf c01, 0, a1 # a1=C11R_ac -+ vextf c01, 1, a2 # a2=C11I_bc -+ vextf c01, 2, a3 # a3=C21R_ac -+ vextf c01, 3, a4 # a4=C21I_bc -+ -+ vextf c02, 0, b1 # b1=C11I_ad -+ vextf c02, 1, b2 # b2=C11R_bd -+ vextf c02, 2, b3 # b3=C21I_ad -+ vextf c02, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 0 * SIZE(C1) -+ ST c01, 1 * SIZE(C1) -+ ST b6, 2 * SIZE(C1) -+ ST c02, 3 * SIZE(C1) -+ -+ vextf c05, 0, a1 # a1=C11R_ac -+ vextf c05, 1, a2 # a2=C11I_bc -+ vextf c05, 2, a3 # a3=C21R_ac -+ vextf c05, 3, a4 # a4=C21I_bc -+ -+ vextf c06, 0, b1 # b1=C11I_ad -+ vextf c06, 1, b2 # b2=C11R_bd -+ vextf c06, 2, b3 # b3=C21I_ad -+ vextf c06, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 4 * SIZE(C1) -+ ST c01, 5 * SIZE(C1) -+ ST b6, 6 * SIZE(C1) -+ ST c02, 7 * SIZE(C1) -+ -+ -+ vextf c03, 0, a1 # a1=C11R_ac -+ vextf c03, 1, a2 # a2=C11I_bc -+ vextf c03, 2, a3 # a3=C21R_ac -+ vextf c03, 3, a4 # a4=C21I_bc -+ -+ vextf c04, 0, b1 # b1=C11I_ad -+ vextf c04, 1, b2 # b2=C11R_bd -+ vextf c04, 2, b3 # b3=C21I_ad -+ vextf c04, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 0 * SIZE(C2) -+ ST c02, 1 * SIZE(C2) -+ ST c05, 2 * SIZE(C2) -+ ST c06, 3 * SIZE(C2) -+ -+ vextf c07, 0, a1 # a1=C11R_ac -+ vextf c07, 1, a2 # a2=C11I_bc -+ vextf c07, 2, a3 # a3=C21R_ac -+ vextf c07, 3, a4 # a4=C21I_bc -+ -+ vextf c08, 0, b1 # b1=C11I_ad -+ vextf c08, 1, b2 # b2=C11R_bd -+ vextf c08, 2, b3 # b3=C21I_ad -+ vextf c08, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, c01 -+ FMAD8 a8, alpha_i, a3, c05 -+ FMAD6 b5, alpha_i, a2, c02 -+ FMAD6 a6, alpha_i, a4, c06 -+ -+ ST c01, 4 * SIZE(C2) -+ ST c02, 5 * SIZE(C2) -+ ST c05, 6 * SIZE(C2) -+ ST c06, 7 * SIZE(C2) -+ -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 4, TEMP -+#else -+ subl TEMP, 2, TEMP ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 +#endif + -+ sll TEMP, 2 + ZBASE_SHIFT, L -+ sll TEMP, 1 + ZBASE_SHIFT, TEMP -+ -+ addl AO, L, AO -+ addl BO, TEMP,BO ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) +#endif + -+#ifdef LEFT -+ addl KK, 4,KK -+#endif ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) +#endif + -+ addl C1, 8*SIZE, C1 -+ addl C2, 8*SIZE, C2 -+ -+ -+ .align 4 -+$L30: -+ and M, 2, I # I=M&2 -+ ble I, $L40 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B, BO -+ nop -+#else -+ sll KK, 1 + ZBASE_SHIFT, L # mr=2 -+ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + -+ addl AO, L, AO -+ addl B, TEMP, BO ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) +#endif + -+ fclr c01 -+ fclr c02 -+ fclr c03 -+ fclr c04 -+ fclr c05 -+ fclr c06 -+ fclr c07 -+ fclr c08 # CLEAR 8 register -+ fclr c09 -+ fclr c10 -+ fclr c11 -+ fclr c12 -+ fclr c13 -+ fclr c14 -+ fclr c15 -+ fclr c16 -+ -+ fillcs 0*SIZE(C1) -+ fillcs 4*SIZE(C1) -+ -+ LD b1, 0*SIZE(BO) # b1 real part -+ LD b2, 1*SIZE(BO) # b1 image part -+ LD b3, 2*SIZE(BO) # b2 real part -+ LD b4, 3*SIZE(BO) # b2 image part -+ -+ fillcs 0*SIZE(C2) -+ fillcs 4*SIZE(C2) -+ -+ LD a1, 0*SIZE(AO) # a1 real part -+ LD a2, 1*SIZE(AO) # a1 image part -+ LD a3, 2*SIZE(AO) # a2 real part -+ LD a4, 3*SIZE(AO) # a2 image part ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl K, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 2, TEMP # mr=2 -+#else -+ addl KK, 2, TEMP # nr=2 ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG +#endif -+ sra TEMP, 1, L -+ ble L, $L35 -+ -+#else + -+ mov B, BO # Set B, (block A x panel Bj) -+ sra K, 1, L # Unroll K as 2 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ fclr c01 -+ fclr c02 -+ fclr c03 -+ fclr c04 -+ fclr c05 -+ fclr c06 -+ fclr c07 -+ fclr c08 # CLEAR 8 register -+ fclr c09 -+ fclr c10 -+ fclr c11 -+ fclr c12 -+ fclr c13 -+ fclr c14 -+ fclr c15 -+ fclr c16 ++#ifdef LT ++ addl KK, 4, KK ++#endif + -+ fillcs 0*SIZE(C1) -+ fillcs 4*SIZE(C1) ++#ifdef LN ++ subl KK, 4, KK ++#endif + -+ LD b1, 0*SIZE(BO) # b1 real part -+ LD b2, 1*SIZE(BO) # b1 image part -+ LD b3, 2*SIZE(BO) # b2 real part -+ LD b4, 3*SIZE(BO) # b2 image part ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 + -+ fillcs 0*SIZE(C2) -+ fillcs 4*SIZE(C2) ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif + -+ LD a1, 0*SIZE(AO) # a1 real part -+ LD a2, 1*SIZE(AO) # a1 image part -+ LD a3, 2*SIZE(AO) # a2 real part -+ LD a4, 3*SIZE(AO) # a2 image part ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif + -+ ble L, $L35 ++#ifdef RN ++ addl KK, 1, KK +#endif + ++#ifdef RT ++ subl KK, 1, KK ++#endif + .align 4 -+$L32: -+ MAD a1,b1,c01,c01 # a1*c1 -+ MAD a1,b2,c02,c02 # a1*d1 -+ MAD a1,b3,c03,c03 # a1*c2 -+ MAD a1,b4,c04,c04 # a1*d2 -+ -+ LD b5, 4 * SIZE(BO) # next B1R -+ LD b6, 5 * SIZE(BO) # next B1I -+ LD b7, 6 * SIZE(BO) # next B2R -+ LD b8, 7 * SIZE(BO) # next B2I -+ -+ LD a5, 4 * SIZE(AO) # next A1-A4 real part -+ LD a6, 5 * SIZE(AO) # next A1-A4 image part -+ LD a7, 6 * SIZE(AO) -+ LD a8, 7 * SIZE(AO) -+ -+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE -+ MAD a2,b1,c05,c05 # b1*c1 -+ MAD a2,b2,c06,c06 # b1*d1 -+ MAD a2,b3,c07,c07 # b1*c2 -+ MAD a2,b4,c08,c08 # b1*d2 -+ -+ MAD a3,b1,c09,c09 # a2*c1 -+ MAD a3,b2,c10,c10 # a2*d1 -+ MAD a3,b3,c11,c11 # a2*c2 -+ MAD a3,b4,c12,c12 # a2*d2 -+ -+ MAD a4,b1,c13,c13 # b2*c1 -+ MAD a4,b2,c14,c14 # b2*d1 -+ MAD a4,b3,c15,c15 # b2*c2 -+ MAD a4,b4,c16,c16 # b2*d2 -+ -+ subl L, 1, L # -+ -+ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE -+ MAD a5,b5,c01,c01 -+ MAD a5,b6,c02,c02 -+ MAD a5,b7,c03,c03 -+ MAD a5,b8,c04,c04 -+ -+ LD b1, 0 * SIZE(BO) -+ LD b2, 1 * SIZE(BO) -+ LD b3, 2 * SIZE(BO) -+ LD b4, 3 * SIZE(BO) -+ -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MAD a6,b5,c05,c05 -+ MAD a6,b6,c06,c06 -+ MAD a6,b7,c07,c07 -+ MAD a6,b8,c08,c08 -+ -+ MAD a7,b5,c09,c09 -+ MAD a7,b6,c10,c10 -+ MAD a7,b7,c11,c11 -+ MAD a7,b8,c12,c12 -+ -+ MAD a8,b5,c13,c13 -+ MAD a8,b6,c14,c14 -+ MAD a8,b7,c15,c15 -+ MAD a8,b8,c16,c16 -+ -+ bne L, $L32 # continue K + -+$L35: -+ LD alpha_r, ALPHA_R # $f30==b8 -+#ifndef TRMMKERNEL -+ blbc K, $L38 # if(K&1) -+#else -+ blbc TEMP, $L38 -+#endif ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S b/kernel/sw_64/trsm_kernel_4x4_LT.S +new file mode 100644 +index 000000000..4ee360e6d +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_LT.S +@@ -0,0 +1,4059 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+$L36: -+ addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE -+ addl BO, 4*SIZE, BO ++#define ASSEMBLER ++#include "common.h" + -+ MAD a1,b1,c01,c01 # a1*c1 -+ MAD a1,b2,c02,c02 # a1*d1 -+ MAD a1,b3,c03,c03 # a1*c2 -+ MAD a1,b4,c04,c04 # a1*d2 + -+ MAD a2,b1,c05,c05 # b1*c1 -+ MAD a2,b2,c06,c06 # b1*d1 -+ MAD a2,b3,c07,c07 # b1*c2 -+ MAD a2,b4,c08,c08 # b1*d2 ++#if !defined(SW8A) ++#error "Architecture is not specified." ++#endif + -+ MAD a3,b1,c09,c09 # a2*c1 -+ MAD a3,b2,c10,c10 # a2*d1 -+ MAD a3,b3,c11,c11 # a2*c2 -+ MAD a3,b4,c12,c12 # a2*d2 ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif + -+ MAD a4,b1,c13,c13 # b2*c1 -+ MAD a4,b2,c14,c14 # b2*d1 -+ MAD a4,b3,c15,c15 # b2*c2 -+ MAD a4,b4,c16,c16 # b2*d2 + + -+ -+$L38: # Write back -+ LD alpha_i, ALPHA_I # $f29==b7 -+#ifndef TRMMKERNEL -+ ADD1 c01, c06, c01 # ac '+' bd -+ ADD1 c09, c14, c09 -+ ADD1 c03, c08, c03 # -+ ADD1 c11, c16, c11 -+ -+ ADD2 c05, c02, c02 # bc '+' ad -+ ADD2 c13, c10, c10 -+ ADD2 c07, c04, c04 -+ ADD2 c15, c12, c12 -+ -+ LD b1, 0 * SIZE(C1) -+ LD b2, 1 * SIZE(C1) -+ LD b3, 2 * SIZE(C1) -+ LD b4, 3 * SIZE(C1) -+ -+ LD a5, 0 * SIZE(C2) -+ LD a6, 1 * SIZE(C2) -+ LD a7, 2 * SIZE(C2) -+ LD a8, 3 * SIZE(C2) -+ -+ FMAD5 c01, alpha_r, b1, b1 -+ FMAD5 c09, alpha_r, b3, b3 -+ FMAD5 c03, alpha_r, a5, a5 -+ FMAD5 c11, alpha_r, a7, a7 -+ -+ FMAD7 c02, alpha_r, b2, b2 -+ FMAD7 c10, alpha_r, b4, b4 -+ FMAD7 c04, alpha_r, a6, a6 -+ FMAD7 c12, alpha_r, a8, a8 -+ -+ FMAD8 c02, alpha_i, b1, b1 -+ FMAD8 c10, alpha_i, b3, b3 -+ FMAD8 c04, alpha_i, a5, a5 -+ FMAD8 c12, alpha_i, a7, a7 -+ -+ FMAD6 c01, alpha_i, b2, b2 -+ FMAD6 c09, alpha_i, b4, b4 -+ FMAD6 c03, alpha_i, a6, a6 -+ FMAD6 c11, alpha_i, a8, a8 -+ -+ ST b1, 0 * SIZE(C1) -+ ST b2, 1 * SIZE(C1) -+ ST b3, 2 * SIZE(C1) -+ ST b4, 3 * SIZE(C1) -+ -+ ST a5, 0 * SIZE(C2) -+ ST a6, 1 * SIZE(C2) -+ ST a7, 2 * SIZE(C2) -+ ST a8, 3 * SIZE(C2) ++#define STACKSIZE 80 + -+#else ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 + -+ ADD1 c01, c06, c01 # ac '+' bd -+ ADD1 c09, c14, c09 -+ ADD1 c03, c08, c03 # -+ ADD1 c11, c16, c11 -+ -+ ADD2 c05, c02, c02 # bc '+' ad -+ ADD2 c13, c10, c10 -+ ADD2 c07, c04, c04 -+ ADD2 c15, c12, c12 -+ -+ FMAD5 c01, alpha_r, $f31, b1 -+ FMAD5 c09, alpha_r, $f31, b3 -+ FMAD5 c03, alpha_r, $f31, a5 -+ FMAD5 c11, alpha_r, $f31, a7 -+ -+ FMAD7 c02, alpha_r, $f31, b2 -+ FMAD7 c10, alpha_r, $f31, b4 -+ FMAD7 c04, alpha_r, $f31, a6 -+ FMAD7 c12, alpha_r, $f31, a8 -+ -+ FMAD8 c02, alpha_i, b1, b1 -+ FMAD8 c10, alpha_i, b3, b3 -+ FMAD8 c04, alpha_i, a5, a5 -+ FMAD8 c12, alpha_i, a7, a7 -+ -+ FMAD6 c01, alpha_i, b2, b2 -+ FMAD6 c09, alpha_i, b4, b4 -+ FMAD6 c03, alpha_i, a6, a6 -+ FMAD6 c11, alpha_i, a8, a8 -+ -+ ST b1, 0 * SIZE(C1) -+ ST b2, 1 * SIZE(C1) -+ ST b3, 2 * SIZE(C1) -+ ST b4, 3 * SIZE(C1) -+ -+ ST a5, 0 * SIZE(C2) -+ ST a6, 1 * SIZE(C2) -+ ST a7, 2 * SIZE(C2) -+ ST a8, 3 * SIZE(C2) -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 2, TEMP -+#else -+ subl TEMP, 2, TEMP -+#endif ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 + -+ sll TEMP, 1 + ZBASE_SHIFT, L -+ sll TEMP, 1 + ZBASE_SHIFT, TEMP ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 + -+ addl AO, L, AO -+ addl BO, TEMP, BO -+#endif ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 + -+#ifdef LEFT -+ addl KK, 2, KK -+#endif -+#endif ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 + -+ addl C1, 4*SIZE, C1 -+ addl C2, 4*SIZE, C2 ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 + ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 + -+ .align 4 -+$L40: -+ and M, 1, I # I=M&1 -+ ble I, $L09 ++#define alpha $f30 + -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B, BO -+ nop -+#else -+ sll KK, ZBASE_SHIFT, L # mr=1 -+ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2 ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 + -+ addl AO, L, AO -+ addl B, TEMP, BO -+#endif ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 + -+ fillcs 0*SIZE(C1) -+ fillcs 0*SIZE(C2) ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 + -+ fclr c01 -+ fclr c02 -+ fclr c03 -+ fclr c04 -+ fclr c05 -+ fclr c06 -+ fclr c07 -+ fclr c08 ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 + -+ LD b1, 0*SIZE(BO) # b1 real part -+ LD b2, 1*SIZE(BO) # b1 image part -+ LD b3, 2*SIZE(BO) # b2 real part -+ LD b4, 3*SIZE(BO) # b2 image part ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 + -+ LD a1, 0*SIZE(AO) # a1 real part -+ LD a2, 1*SIZE(AO) # a1 image part ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 + -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl K, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 1, TEMP # mr=1 -+#else -+ addl KK, 2, TEMP # nr=2 -+#endif -+ sra TEMP, 1, L ++ ldi $sp, -STACKSIZE($sp) + -+ ble L, $L45 ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) + -+#else -+ mov B, BO # Set B, (block A x panel Bj) -+ sra K, 1, L # Unroll K as 2 ++ SXADDQ LDC, 0, LDC + -+ fillcs 0*SIZE(C1) -+ fillcs 0*SIZE(C2) ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) + -+ fclr c01 -+ fclr c02 -+ fclr c03 -+ fclr c04 -+ fclr c05 -+ fclr c06 -+ fclr c07 -+ fclr c08 ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 + -+ LD b1, 0*SIZE(BO) # b1 real part -+ LD b2, 1*SIZE(BO) # b1 image part -+ LD b3, 2*SIZE(BO) # b2 real part -+ LD b4, 3*SIZE(BO) # b2 image part ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 + -+ LD a1, 0*SIZE(AO) # a1 real part -+ LD a2, 1*SIZE(AO) # a1 image part ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif + -+ ble L, $L45 ++#ifdef RN ++ negl OFFSET, KK +#endif + -+ .align 4 -+$L42: -+ MAD a1,b1,c01,c01 # C11 real part -+ MAD a1,b2,c02,c02 # C11 imag part -+ MAD a1,b3,c03,c03 # C21 real part -+ MAD a1,b4,c04,c04 # C21 imag part ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B + -+ LD b5, 4 * SIZE(BO) # next B1R -+ LD b6, 5 * SIZE(BO) # next B1I -+ LD b7, 6 * SIZE(BO) # next B2R -+ LD b8, 7 * SIZE(BO) # next B2I ++ mull N, LDC, TMP1 ++ addl TMP1, C, C + -+ LD a5, 2 * SIZE(AO) # next A1-A4 real part -+ LD a6, 3 * SIZE(AO) # next A1-A4 image part ++ subl N, OFFSET, KK ++#endif + -+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE -+ MAD a2,b1,c05,c05 # C11 image part -+ MAD a2,b2,c06,c06 # C11 real part -+ MAD a2,b3,c07,c07 # C21 image part -+ MAD a2,b4,c08,c08 # C21 real part ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 + -+ subl L, 1, L # ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B + -+ addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE -+ MAD a5,b5,c01,c01 -+ MAD a5,b6,c02,c02 -+ MAD a5,b7,c03,c03 -+ MAD a5,b8,c04,c04 ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif + -+ LD b1, 0 * SIZE(BO) -+ LD b2, 1 * SIZE(BO) -+ LD b3, 2 * SIZE(BO) -+ LD b4, 3 * SIZE(BO) ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif + -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 + -+ MAD a6,b5,c05,c05 -+ MAD a6,b6,c06,c06 -+ MAD a6,b7,c07,c07 -+ MAD a6,b8,c08,c08 ++#ifdef LN ++ addl M, OFFSET, KK ++#endif + -+ bne L, $L42 # continue K ++#ifdef LT ++ mov OFFSET, KK ++#endif + -+$L45: -+ LD alpha_r, ALPHA_R # $f30==b8 -+#ifndef TRMMKERNEL -+ blbc K, $L48 # if(K&1) ++#if defined(LN) || defined(RT) ++ mov A, AORIG +#else -+ blbc TEMP, $L48 ++ mov A, AO +#endif + -+$L46: -+ addl AO, 2*SIZE, AO # AO+=8mr*1kr*2px*SIZE -+ MAD a1,b1,c01,c01 # C11 real part -+ MAD a1,b2,c02,c02 # C11 imag part -+ MAD a1,b3,c03,c03 # C21 real part -+ MAD a1,b4,c04,c04 # C21 imag part -+ -+ addl BO, 4*SIZE, BO -+ MAD a2,b1,c05,c05 # C11 image part -+ MAD a2,b2,c06,c06 # C11 real part -+ MAD a2,b3,c07,c07 # C21 image part -+ MAD a2,b4,c08,c08 # C21 real part -+ -+ -+$L48: # Write back -+ LD alpha_i, ALPHA_I # $f29==b7 -+#ifndef TRMMKERNEL -+ ADD1 c01, c06, c01 -+ ADD1 c03, c08, c03 -+ ADD2 c05, c02, c02 -+ ADD2 c07, c04, c04 ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 + -+ LD b1, 0 * SIZE(C1) -+ LD b2, 1 * SIZE(C1) ++$L11: ++#if defined(LT) || defined(RN) + -+ LD a5, 0 * SIZE(C2) -+ LD a6, 1 * SIZE(C2) ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 + -+ FMAD5 c01, alpha_r, b1, b1 -+ FMAD5 c03, alpha_r, a5, a5 ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 + -+ FMAD7 c02, alpha_r, b2, b2 -+ FMAD7 c04, alpha_r, a6, a6 ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 + -+ FMAD8 c02, alpha_i, b1, b1 -+ FMAD8 c04, alpha_i, a5, a5 ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 + -+ FMAD6 c01, alpha_i, b2, b2 -+ FMAD6 c03, alpha_i, a6, a6 ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 + -+ ST b1, 0 * SIZE(C1) -+ ST b2, 1 * SIZE(C1) ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 + -+ ST a5, 0 * SIZE(C2) -+ ST a6, 1 * SIZE(C2) ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 + ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 +#else + -+ ADD1 c01, c06, c01 -+ ADD1 c03, c08, c03 -+ ADD2 c05, c02, c02 -+ ADD2 c07, c04, c04 ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ FMAD5 c01, alpha_r, $f31, b1 -+ FMAD5 c03, alpha_r, $f31, a5 ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO + -+ FMAD7 c02, alpha_r, $f31, b2 -+ FMAD7 c04, alpha_r, $f31, a6 ++ subl K, KK, TMP1 + -+ FMAD8 c02, alpha_i, b1, b1 -+ FMAD8 c04, alpha_i, a5, a5 ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 + -+ FMAD6 c01, alpha_i, b2, b2 -+ FMAD6 c03, alpha_i, a6, a6 ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 + -+ ST b1, 0 * SIZE(C1) -+ ST b2, 1 * SIZE(C1) ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 + -+ ST a5, 0 * SIZE(C2) -+ ST a6, 1 * SIZE(C2) ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 + -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 1, TEMP -+#else -+ subl TEMP, 2, TEMP -+#endif ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 + -+ sll TEMP, ZBASE_SHIFT, L -+ sll TEMP, 1 + ZBASE_SHIFT, TEMP ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 + -+ addl AO, L, AO -+ addl BO, TEMP,BO -+#endif ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 + -+#ifdef LEFT -+ addl KK, 1, KK -+#endif ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 +#endif + -+ addl C1, 2*SIZE, C1 -+ addl C2, 2*SIZE, C2 -+ -+ -+ .align 4 ++ ble L, $L15 ++ .align 5 + -+$L09: -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ addl KK, 2, KK # nr=2 -+ nop ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop +#endif -+ mov BO, B # Change B to next panel -+ subl J, 1, J # J-- -+ bgt J, $L01 -+ -+ -+ .align 4 -+$L50: -+ and N, 1, J -+ ble J, $L999 # Finish! -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ mov OFFSET, KK # reset KK ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop +#endif + -+ sra M, 3, I # I=M/8 -+ sll K, 1 + ZBASE_SHIFT, PREA ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop + -+ mov C, C1 -+ mov A, AO # Reset A ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) + -+ addl A, PREA, PREA -+ beq I, $L60 # GEMM_MR=8 ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) + ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP + -+$L51: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA))\ -+ || (!defined(LEFT) && !defined(TRANSA)) -+ mov B, BO -+#else -+ sll KK, 3 + ZBASE_SHIFT,L # mr=8 -+ sll KK, ZBASE_SHIFT,TEMP # nr=1 -+ -+ addl AO, L, AO -+ addl B, TEMP, BO -+#endif -+ -+ fillcs 0(C1) -+ fillcs 4*SIZE(C1) -+ fillcs 8*SIZE(C1) -+ fillcs 12*SIZE(C1) -+ fillcs 16*SIZE(C1) -+ -+ vcpys $f31,$f31,c01 # Clear result regs -+ vcpys $f31,$f31,c02 -+ -+ vcpys $f31,$f31,c05 -+ vcpys $f31,$f31,c06 -+ -+ vcpys $f31,$f31,c09 -+ vcpys $f31,$f31,c10 -+ -+ vcpys $f31,$f31,c13 -+ vcpys $f31,$f31,c14 -+ -+ LDDE b1, 0 * SIZE(BO) # B1R -+ LDDE b2, 1 * SIZE(BO) # B1I -+ -+ VLD a1, 0 * SIZE(AO) # A1, A2 -+ VLD a2, 4 * SIZE(AO) # A3, A4 -+ VLD a3, 8 * SIZE(AO) # A5, A6 -+ VLD a4,12 * SIZE(AO) # A7, A8 -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl K, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 8, TEMP # mr=8 -+#else -+ addl KK, 1, TEMP # nr=1 -+#endif -+ sra TEMP, 1, L -+ ble L, $L55 ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP + -+#else -+ mov B, BO # Set B, (block A x panel Bj) -+ sra K, 1, L # Unroll K as 2 -+ -+ fillcs 0(C1) -+ fillcs 4*SIZE(C1) -+ fillcs 8*SIZE(C1) -+ fillcs 12*SIZE(C1) -+ fillcs 16*SIZE(C1) -+ -+ vcpys $f31,$f31,c01 # Clear result regs -+ vcpys $f31,$f31,c02 -+ -+ vcpys $f31,$f31,c05 -+ vcpys $f31,$f31,c06 -+ -+ vcpys $f31,$f31,c09 -+ vcpys $f31,$f31,c10 -+ -+ vcpys $f31,$f31,c13 -+ vcpys $f31,$f31,c14 -+ -+ LDDE b1, 0 * SIZE(BO) # B1R -+ LDDE b2, 1 * SIZE(BO) # B1I -+ -+ VLD a1, 0 * SIZE(AO) # A1, A2 -+ VLD a2, 4 * SIZE(AO) # A3, A4 -+ VLD a3, 8 * SIZE(AO) # A5, A6 -+ VLD a4,12 * SIZE(AO) # A7, A8 ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop + -+ ble L, $L55 -+#endif ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop + -+ .align 4 -+$L52: -+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE -+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) -+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop + -+ LDDE b5, 2 * SIZE(BO) # next B1R -+ LDDE b6, 3 * SIZE(BO) # next B1I ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop + -+ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE -+ VMAD a2,b1,c05,c05 # C31, C41 -+ VMAD a2,b2,c06,c06 # C31, C41 ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) + -+ VLD a5, 0 * SIZE(AO) # next A1, A2, a5==a0 -+ VLD a6, 4 * SIZE(AO) # next A3, A4 -+ VLD a7, 8 * SIZE(AO) # next A5, A6 -+ VLD a8,12 * SIZE(AO) # next A7, A8 ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ VMAD a3,b1,c09,c09 # C51, C61 -+ VMAD a3,b2,c10,c10 # C51, C61 ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) + -+ fillcs 0(PREA) -+ VMAD a4,b1,c13,c13 # C71, C81 -+ VMAD a4,b2,c14,c14 # C71, C81 ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ subl L, 1, L # ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) + -+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE -+ VMAD a5,b5,c01,c01 -+ VMAD a5,b6,c02,c02 ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) + -+ addl PREA, 16*SIZE, PREA -+ LDDE b1, 0 * SIZE(BO) -+ LDDE b2, 1 * SIZE(BO) ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) + -+ VMAD a6,b5,c05,c05 -+ VMAD a6,b6,c06,c06 ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) + -+ VLD a1, 0 * SIZE(AO) -+ VLD a2, 4 * SIZE(AO) -+ VLD a3, 8 * SIZE(AO) -+ VLD a4,12 * SIZE(AO) ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop + -+ VMAD a7,b5,c09,c09 -+ VMAD a7,b6,c10,c10 ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop + -+ fillcs 0(PREA) -+ VMAD a8,b5,c13,c13 -+ VMAD a8,b6,c14,c14 ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop + -+ addl PREA, 16*SIZE, PREA -+ bne L, $L52 # continue K ++ ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop + -+$L55: -+ LD alpha_r, ALPHA_R # $f30==b8 -+#ifndef TRMMKERNEL -+ blbc K, $L58 # if(K&1) -+#else -+ blbc TEMP, $L58 -+#endif ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop + -+$L56: -+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE -+ VMAD a1,b1,c01,c01 # C11R C21R -+ VMAD a1,b2,c02,c02 # C11I C21I -+ -+ addl BO, 2*SIZE, BO -+ VMAD a2,b1,c05,c05 # C31R C41R -+ VMAD a2,b2,c06,c06 # C31I C41I -+ -+ VMAD a3,b1,c09,c09 # C51R C61R -+ VMAD a3,b2,c10,c10 # C51I C61I -+ -+ VMAD a4,b1,c13,c13 # C71R C81R -+ VMAD a4,b2,c14,c14 # C71I C81I -+ -+$L58: # Write back -+ LD alpha_i, ALPHA_I # $f29==b7 -+#ifndef TRMMKERNEL -+ vextf c01, 0, a1 # a1=C11R_ac -+ vextf c01, 1, a2 # a2=C11I_bc -+ vextf c01, 2, a3 # a3=C21R_ac -+ vextf c01, 3, a4 # a4=C21I_bc -+ -+ vextf c02, 0, b1 # b1=C11I_ad -+ vextf c02, 1, b2 # b2=C11R_bd -+ vextf c02, 2, b3 # b3=C21I_ad -+ vextf c02, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 0 * SIZE(C1) -+ LD a2, 1 * SIZE(C1) -+ LD a3, 2 * SIZE(C1) -+ LD a4, 3 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 0 * SIZE(C1) -+ ST c01, 1 * SIZE(C1) -+ ST b6, 2 * SIZE(C1) -+ ST c02, 3 * SIZE(C1) -+ -+ vextf c05, 0, a1 # a1=C11R_ac -+ vextf c05, 1, a2 # a2=C11I_bc -+ vextf c05, 2, a3 # a3=C21R_ac -+ vextf c05, 3, a4 # a4=C21I_bc -+ -+ vextf c06, 0, b1 # b1=C11I_ad -+ vextf c06, 1, b2 # b2=C11R_bd -+ vextf c06, 2, b3 # b3=C21I_ad -+ vextf c06, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 4 * SIZE(C1) -+ LD a2, 5 * SIZE(C1) -+ LD a3, 6 * SIZE(C1) -+ LD a4, 7 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 4 * SIZE(C1) -+ ST c01, 5 * SIZE(C1) -+ ST b6, 6 * SIZE(C1) -+ ST c02, 7 * SIZE(C1) -+ -+ vextf c09, 0, a1 # a1=C11R_ac -+ vextf c09, 1, a2 # a2=C11I_bc -+ vextf c09, 2, a3 # a3=C21R_ac -+ vextf c09, 3, a4 # a4=C21I_bc -+ -+ vextf c10, 0, b1 # b1=C11I_ad -+ vextf c10, 1, b2 # b2=C11R_bd -+ vextf c10, 2, b3 # b3=C21I_ad -+ vextf c10, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 8 * SIZE(C1) -+ LD a2, 9 * SIZE(C1) -+ LD a3, 10 * SIZE(C1) -+ LD a4, 11 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 8 * SIZE(C1) -+ ST c01, 9 * SIZE(C1) -+ ST b6, 10 * SIZE(C1) -+ ST c02, 11 * SIZE(C1) -+ -+ vextf c13, 0, a1 # a1=C11R_ac -+ vextf c13, 1, a2 # a2=C11I_bc -+ vextf c13, 2, a3 # a3=C21R_ac -+ vextf c13, 3, a4 # a4=C21I_bc -+ -+ vextf c14, 0, b1 # b1=C11I_ad -+ vextf c14, 1, b2 # b2=C11R_bd -+ vextf c14, 2, b3 # b3=C21I_ad -+ vextf c14, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 12 * SIZE(C1) -+ LD a2, 13 * SIZE(C1) -+ LD a3, 14 * SIZE(C1) -+ LD a4, 15 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 12 * SIZE(C1) -+ ST c01, 13 * SIZE(C1) -+ ST b6, 14 * SIZE(C1) -+ ST c02, 15 * SIZE(C1) ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop + -+#else ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop + -+ vextf c01, 0, a1 # a1=C11R_ac -+ vextf c01, 1, a2 # a2=C11I_bc -+ vextf c01, 2, a3 # a3=C21R_ac -+ vextf c01, 3, a4 # a4=C21I_bc -+ -+ vextf c02, 0, b1 # b1=C11I_ad -+ vextf c02, 1, b2 # b2=C11R_bd -+ vextf c02, 2, b3 # b3=C21I_ad -+ vextf c02, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 0 * SIZE(C1) -+ ST c01, 1 * SIZE(C1) -+ ST b6, 2 * SIZE(C1) -+ ST c02, 3 * SIZE(C1) -+ -+ vextf c05, 0, a1 # a1=C11R_ac -+ vextf c05, 1, a2 # a2=C11I_bc -+ vextf c05, 2, a3 # a3=C21R_ac -+ vextf c05, 3, a4 # a4=C21I_bc -+ -+ vextf c06, 0, b1 # b1=C11I_ad -+ vextf c06, 1, b2 # b2=C11R_bd -+ vextf c06, 2, b3 # b3=C21I_ad -+ vextf c06, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 4 * SIZE(C1) -+ ST c01, 5 * SIZE(C1) -+ ST b6, 6 * SIZE(C1) -+ ST c02, 7 * SIZE(C1) -+ -+ vextf c09, 0, a1 # a1=C11R_ac -+ vextf c09, 1, a2 # a2=C11I_bc -+ vextf c09, 2, a3 # a3=C21R_ac -+ vextf c09, 3, a4 # a4=C21I_bc -+ -+ vextf c10, 0, b1 # b1=C11I_ad -+ vextf c10, 1, b2 # b2=C11R_bd -+ vextf c10, 2, b3 # b3=C21I_ad -+ vextf c10, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 8 * SIZE(C1) -+ ST c01, 9 * SIZE(C1) -+ ST b6, 10 * SIZE(C1) -+ ST c02, 11 * SIZE(C1) -+ -+ vextf c13, 0, a1 # a1=C11R_ac -+ vextf c13, 1, a2 # a2=C11I_bc -+ vextf c13, 2, a3 # a3=C21R_ac -+ vextf c13, 3, a4 # a4=C21I_bc -+ -+ vextf c14, 0, b1 # b1=C11I_ad -+ vextf c14, 1, b2 # b2=C11R_bd -+ vextf c14, 2, b3 # b3=C21I_ad -+ vextf c14, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 12 * SIZE(C1) -+ ST c01, 13 * SIZE(C1) -+ ST b6, 14 * SIZE(C1) -+ ST c02, 15 * SIZE(C1) -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 8, TEMP -+#else -+ subl TEMP, 1, TEMP -+#endif ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop + -+ sll TEMP, 3 + ZBASE_SHIFT,L -+ sll TEMP, ZBASE_SHIFT,TEMP ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) + -+ addl AO, L, AO -+ addl BO, TEMP, BO -+#endif ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) + -+#ifdef LEFT -+ addl KK, 8, KK -+#endif -+#endif ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) + -+ jmp $L999 ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) + ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) + -+ .align 4 -+$L60: -+ and M, 4, I -+ ble I, $L70 ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 + -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA))\ -+ || (!defined(LEFT) && !defined(TRANSA)) -+ mov B, BO ++$L15: ++ ADD c11, t1, c11 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 +#else -+ sll KK, 2 + ZBASE_SHIFT,L # mr=4 -+ sll KK, ZBASE_SHIFT,TEMP # nr=1 -+ -+ addl AO, L, AO -+ addl B, TEMP, BO ++ blbs TMP1, $L17 +#endif ++ .align 4 + -+ fillcs 0(C1) -+ fillcs 4*SIZE(C1) -+ fillcs 8*SIZE(C1) ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 + -+ vcpys $f31,$f31,c01 # Clear result regs -+ vcpys $f31,$f31,c02 ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 + -+ vcpys $f31,$f31,c05 -+ vcpys $f31,$f31,c06 -+ -+ LDDE b1, 0 * SIZE(BO) # B1R -+ LDDE b2, 1 * SIZE(BO) # B1I ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) + -+ VLD a1, 0 * SIZE(AO) # A1, A2 -+ VLD a2, 4 * SIZE(AO) # A3, A4 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 + -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl K, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 4, TEMP # mr=4 -+#else -+ addl KK, 1, TEMP # nr=1 -+#endif -+ sra TEMP, 1, L -+ ble L, $L65 ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) + -+#else ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop + -+ mov B, BO # Set B, (block A x panel Bj) -+ sra K, 1, L # Unroll K as 2 ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) + -+ fillcs 0(C1) -+ fillcs 4*SIZE(C1) -+ fillcs 8*SIZE(C1) ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ vcpys $f31,$f31,c01 # Clear result regs -+ vcpys $f31,$f31,c02 ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) + -+ vcpys $f31,$f31,c05 -+ vcpys $f31,$f31,c06 -+ -+ LDDE b1, 0 * SIZE(BO) # B1R -+ LDDE b2, 1 * SIZE(BO) # B1I ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ VLD a1, 0 * SIZE(AO) # A1, A2 -+ VLD a2, 4 * SIZE(AO) # A3, A4 ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) + -+ ble L, $L65 -+#endif ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) + ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) + .align 4 -+$L62: -+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) -+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) + -+ LDDE b5, 2 * SIZE(BO) # next B1R -+ LDDE b6, 3 * SIZE(BO) # next B1I ++$L17: ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 + -+ addl BO, 4*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE -+ VMAD a2,b1,c05,c05 # C31, C41 -+ VMAD a2,b2,c06,c06 # C31, C41 ++ ADD c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 + -+ fillcs 0(PREA) -+ VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0 -+ VLD a6, 12 * SIZE(AO) # next A3, A4 -+ -+ subl L, 1, L # ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD c03, t1, c03 ++ MUL b3, a1, t1 + -+ addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE -+ VMAD a5,b5,c01,c01 -+ VMAD a5,b6,c02,c02 ++ ADD c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD c08, t3, c08 ++ MUL b4, a2, t3 + -+ addl PREA, 16*SIZE, PREA -+ LDDE b1, 0 * SIZE(BO) -+ LDDE b2, 1 * SIZE(BO) ++ ADD c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD c09, t1, c09 ++ MUL b3, a3, t1 + -+ fillcs 0(PREA) -+ VMAD a6,b5,c05,c05 -+ VMAD a6,b6,c06,c06 ++ ADD c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD c14, t3, c14 ++ MUL b4, a4, t3 + -+ VLD a1, 0 * SIZE(AO) -+ VLD a2, 4 * SIZE(AO) -+ -+ addl PREA, 16*SIZE, PREA -+ bne L, $L62 # continue K ++ ADD c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) + -+$L65: -+ LD alpha_r, ALPHA_R # $f30==b8 -+#ifndef TRMMKERNEL -+ blbc K, $L68 # if(K&1) -+#else -+ blbc TEMP, $L68 -+#endif -+ -+$L66: -+ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE -+ VMAD a1,b1,c01,c01 # C11R C21R -+ VMAD a1,b2,c02,c02 # C11I C21I -+ -+ addl BO, 2*SIZE, BO -+ VMAD a2,b1,c05,c05 # C31R C41R -+ VMAD a2,b2,c06,c06 # C31I C41I -+ -+$L68: # Write back -+ LD alpha_i, ALPHA_I # $f29==b7 -+#ifndef TRMMKERNEL -+ vextf c01, 0, a1 # a1=C11R_ac -+ vextf c01, 1, a2 # a2=C11I_bc -+ vextf c01, 2, a3 # a3=C21R_ac -+ vextf c01, 3, a4 # a4=C21I_bc -+ -+ vextf c02, 0, b1 # b1=C11I_ad -+ vextf c02, 1, b2 # b2=C11R_bd -+ vextf c02, 2, b3 # b3=C21I_ad -+ vextf c02, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 0 * SIZE(C1) -+ LD a2, 1 * SIZE(C1) -+ LD a3, 2 * SIZE(C1) -+ LD a4, 3 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 0 * SIZE(C1) -+ ST c01, 1 * SIZE(C1) -+ ST b6, 2 * SIZE(C1) -+ ST c02, 3 * SIZE(C1) -+ -+ vextf c05, 0, a1 # a1=C11R_ac -+ vextf c05, 1, a2 # a2=C11I_bc -+ vextf c05, 2, a3 # a3=C21R_ac -+ vextf c05, 3, a4 # a4=C21I_bc -+ -+ vextf c06, 0, b1 # b1=C11I_ad -+ vextf c06, 1, b2 # b2=C11R_bd -+ vextf c06, 2, b3 # b3=C21I_ad -+ vextf c06, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ LD a1, 4 * SIZE(C1) -+ LD a2, 5 * SIZE(C1) -+ LD a3, 6 * SIZE(C1) -+ LD a4, 7 * SIZE(C1) -+ -+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, a3, a3 -+ FMAD7 a7, alpha_r, a2, a2 -+ FMAD7 a8, alpha_r, a4, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 4 * SIZE(C1) -+ ST c01, 5 * SIZE(C1) -+ ST b6, 6 * SIZE(C1) -+ ST c02, 7 * SIZE(C1) ++ ADD c11, t1, c11 ++ ADD c12, t2, c12 ++ ADD c16, t3, c16 ++ ADD c15, t4, c15 ++ .align 4 + ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 +#else -+ -+ vextf c01, 0, a1 # a1=C11R_ac -+ vextf c01, 1, a2 # a2=C11I_bc -+ vextf c01, 2, a3 # a3=C21R_ac -+ vextf c01, 3, a4 # a4=C21I_bc -+ -+ vextf c02, 0, b1 # b1=C11I_ad -+ vextf c02, 1, b2 # b2=C11R_bd -+ vextf c02, 2, b3 # b3=C21I_ad -+ vextf c02, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 0 * SIZE(C1) -+ ST c01, 1 * SIZE(C1) -+ ST b6, 2 * SIZE(C1) -+ ST c02, 3 * SIZE(C1) -+ -+ vextf c05, 0, a1 # a1=C11R_ac -+ vextf c05, 1, a2 # a2=C11I_bc -+ vextf c05, 2, a3 # a3=C21R_ac -+ vextf c05, 3, a4 # a4=C21I_bc -+ -+ vextf c06, 0, b1 # b1=C11I_ad -+ vextf c06, 1, b2 # b2=C11R_bd -+ vextf c06, 2, b3 # b3=C21I_ad -+ vextf c06, 3, b4 # b4=C21R_bd -+ -+ ADD1 a1, b2, b5 # ac '+' bd -+ ADD1 a3, b4, a6 -+ ADD2 a2, b1, a7 # bc '+' ad -+ ADD2 a4, b3, a8 -+ -+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 -+ FMAD5 a6, alpha_r, $f31, a3 -+ FMAD7 a7, alpha_r, $f31, a2 -+ FMAD7 a8, alpha_r, $f31, a4 -+ -+ FMAD8 a7, alpha_i, a1, b4 -+ FMAD8 a8, alpha_i, a3, b6 -+ FMAD6 b5, alpha_i, a2, c01 -+ FMAD6 a6, alpha_i, a4, c02 -+ -+ ST b4, 4 * SIZE(C1) -+ ST c01, 5 * SIZE(C1) -+ ST b6, 6 * SIZE(C1) -+ ST c02, 7 * SIZE(C1) -+ -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl K, KK,TEMP -+#ifdef LEFT -+ subl TEMP, 4, TEMP # mr=4 ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO +#else -+ subl TEMP, 1, TEMP # nr=1 ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) +#endif + -+ sll TEMP, 2 + ZBASE_SHIFT, L -+ sll TEMP, ZBASE_SHIFT,TEMP ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 + -+ addl AO, L, AO -+ addl BO,TEMP, BO -+#endif ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 + -+#ifdef LEFT -+ addl KK,4,KK -+#endif -+#endif ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) + -+ addl C1, 8*SIZE, C1 ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) + -+ -+ .align 4 -+$L70: -+ and M, 2, I # I=M&2 -+ ble I, $L80 ++ SUB a1, c03, c03 ++ SUB a2, c07, c07 ++ SUB a3, c11, c11 ++ SUB a4, c15, c15 + -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B, BO -+ nop ++ SUB b1, c04, c04 ++ SUB b2, c08, c08 ++ SUB b3, c12, c12 ++ SUB b4, c16, c16 +#else -+ sll KK, 1 + ZBASE_SHIFT, L # mr=2 -+ sll KK, ZBASE_SHIFT,TEMP # nr=1 -+ -+ addl AO, L, AO -+ addl B, TEMP, BO -+#endif ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ fillcs 0*SIZE(C1) -+ fillcs 4*SIZE(C1) ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) + -+ fclr c01 -+ fclr c02 # CLEAR 8 register -+ fclr c03 -+ fclr c04 -+ fclr c05 -+ fclr c06 -+ fclr c07 -+ fclr c08 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 + -+ LD b1, 0*SIZE(BO) # b1 real part -+ LD b2, 1*SIZE(BO) # b1 image part ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 + -+ LD a1, 0*SIZE(AO) # a1 real part -+ LD a2, 1*SIZE(AO) # a1 image part -+ LD a3, 2*SIZE(AO) # a2 real part -+ LD a4, 3*SIZE(AO) # a2 image part ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) + -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl K, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 2, TEMP # mr=2 -+#else -+ addl KK, 1, TEMP # nr=1 -+#endif -+ sra TEMP, 1, L -+ ble L, $L75 ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) + -+#else -+ mov B, BO # Set B, (block A x panel Bj) -+ sra K, 1, L # Unroll K as 2 ++ SUB a1, c09, c09 ++ SUB a2, c10, c10 ++ SUB a3, c11, c11 ++ SUB a4, c12, c12 + -+ fillcs 0*SIZE(C1) -+ fillcs 4*SIZE(C1) ++ SUB b1, c13, c13 ++ SUB b2, c14, c14 ++ SUB b3, c15, c15 ++ SUB b4, c16, c16 ++#endif + -+ fclr c01 -+ fclr c02 # CLEAR 8 register -+ fclr c03 -+ fclr c04 -+ fclr c05 -+ fclr c06 -+ fclr c07 -+ fclr c08 ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) + -+ LD b1, 0*SIZE(BO) # b1 real part -+ LD b2, 1*SIZE(BO) # b1 image part ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ MUL a1, c12, c12 ++ MUL a1, c16, c16 + -+ LD a1, 0*SIZE(AO) # a1 real part -+ LD a2, 1*SIZE(AO) # a1 image part -+ LD a3, 2*SIZE(AO) # a2 real part -+ LD a4, 3*SIZE(AO) # a2 image part ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ MUL a2, c12, t3 ++ MUL a2, c16, t4 + -+ ble L, $L75 -+#endif ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 + -+ .align 4 -+$L72: -+ MAD a1,b1,c01,c01 # C11 real part -+ MAD a1,b2,c02,c02 # C11 imag part ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ MUL a3, c12, t3 ++ MUL a3, c16, t4 + -+ LD b5, 2 * SIZE(BO) # next B1R -+ LD b6, 3 * SIZE(BO) # next B1I ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+ LD a5, 4 * SIZE(AO) # next A1-A4 real part -+ LD a6, 5 * SIZE(AO) # next A1-A4 image part -+ LD a7, 6 * SIZE(AO) -+ LD a8, 7 * SIZE(AO) ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ MUL a4, c12, t3 ++ MUL a4, c16, t4 + -+ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE -+ MAD a2,b1,c03,c03 # C11 image part -+ MAD a2,b2,c04,c04 # C11 real part ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+ MAD a3,b1,c05,c05 # C12 real part -+ MAD a3,b2,c06,c06 # C12 imag part ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) + -+ MAD a4,b1,c07,c07 # C12 image part -+ MAD a4,b2,c08,c08 # C12 real part ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ MUL b1, c11, c11 ++ MUL b1, c15, c15 + -+ subl L, 1, L # ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ MUL b2, c11, t3 ++ MUL b2, c15, t4 + -+ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE -+ MAD a5,b5,c01,c01 -+ MAD a5,b6,c02,c02 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+ LD b1, 0 * SIZE(BO) -+ LD b2, 1 * SIZE(BO) ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ MUL b3, c11, t3 ++ MUL b3, c15, t4 + -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+ MAD a6,b5,c03,c03 -+ MAD a6,b6,c04,c04 ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ MAD a7,b5,c05,c05 -+ MAD a7,b6,c06,c06 ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 + -+ MAD a8,b5,c07,c07 -+ MAD a8,b6,c08,c08 ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 + -+ bne L, $L72 # continue K ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+$L75: -+ LD alpha_r, ALPHA_R # $f30==b8 -+#ifndef TRMMKERNEL -+ blbc K, $L78 # if(K&1) -+#else -+ blbc TEMP, $L78 ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 +#endif + -+$L76: -+ addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE -+ MAD a1,b1,c01,c01 # C11 real part -+ MAD a1,b2,c02,c02 # C11 imag part -+ -+ addl BO, 4*SIZE, BO -+ MAD a2,b1,c03,c03 # C11 image part -+ MAD a2,b2,c04,c04 # C11 real part ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ MAD a3,b1,c05,c05 # C12 real part -+ MAD a3,b2,c06,c06 # C12 imag part ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 + -+ MAD a4,b1,c07,c07 # C12 image part -+ MAD a4,b2,c08,c08 # C12 real part ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 + ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+ -+$L78: # Write back -+ LD alpha_i, ALPHA_I # $f29==b7 -+#ifndef TRMMKERNEL -+ ADD1 c01, c04, c01 -+ ADD1 c05, c08, c05 -+ ADD2 c03, c02, c02 -+ ADD2 c07, c06, c06 ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ MUL a3, c09, t3 ++ MUL a3, c13, t4 + -+ LD b1, 0 * SIZE(C1) -+ LD b2, 1 * SIZE(C1) -+ LD b3, 2 * SIZE(C1) -+ LD b4, 3 * SIZE(C1) ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 + -+ FMAD5 c01, alpha_r, b1, b1 -+ FMAD5 c05, alpha_r, b3, b3 -+ FMAD7 c02, alpha_r, b2, b2 -+ FMAD7 c06, alpha_r, b4, b4 ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ MUL a4, c09, t3 ++ MUL a4, c13, t4 + -+ FMAD8 c02, alpha_i, b1, b1 -+ FMAD8 c06, alpha_i, b3, b3 -+ FMAD6 c01, alpha_i, b2, b2 -+ FMAD6 c05, alpha_i, b4, b4 ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 + -+ ST b1, 0 * SIZE(C1) -+ ST b2, 1 * SIZE(C1) -+ ST b3, 2 * SIZE(C1) -+ ST b4, 3 * SIZE(C1) ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) + -+#else ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ MUL b1, c10, c10 ++ MUL b1, c14, c14 + -+ ADD1 c01, c04, c01 -+ ADD1 c05, c08, c05 -+ ADD2 c03, c02, c02 -+ ADD2 c07, c06, c06 -+ -+ FMAD5 c01, alpha_r, $f31, b1 -+ FMAD5 c05, alpha_r, $f31, b3 -+ FMAD7 c02, alpha_r, $f31, b2 -+ FMAD7 c06, alpha_r, $f31, b4 -+ -+ FMAD8 c02, alpha_i, b1, b1 -+ FMAD8 c06, alpha_i, b3, b3 -+ FMAD6 c01, alpha_i, b2, b2 -+ FMAD6 c05, alpha_i, b4, b4 -+ -+ ST b1, 0 * SIZE(C1) -+ ST b2, 1 * SIZE(C1) -+ ST b3, 2 * SIZE(C1) -+ ST b4, 3 * SIZE(C1) -+ -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 2, TEMP -+#else -+ subl TEMP, 1, TEMP -+#endif ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ MUL b2, c10, t3 ++ MUL b2, c14, t4 + -+ sll TEMP, 1 + ZBASE_SHIFT, L -+ sll TEMP, ZBASE_SHIFT, TEMP ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 + -+ addl AO, L, AO -+ addl BO, TEMP, BO -+#endif ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ MUL b3, c10, t3 ++ MUL b3, c14, t4 + -+#ifdef LEFT -+ addl KK, 2, KK -+#endif -+#endif ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 + -+ addl C1, 4*SIZE, C1 ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) + ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ MUL a1, c11, c11 ++ MUL a1, c15, c15 + -+ .align 4 -+$L80: -+ and M, 1, I # I=M&1 -+ ble I, $L999 ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ MUL a2, c11, t3 ++ MUL a2, c15, t4 + -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ mov B, BO -+ nop -+#else -+ sll KK, ZBASE_SHIFT, L # mr=1 -+ sll KK, ZBASE_SHIFT,TEMP # nr=1 ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 + -+ addl AO, L, AO -+ addl B, TEMP, BO ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++ MUL a3, c12, c12 ++ MUL a3, c16, c16 +#endif + -+ fillcs 0*SIZE(C1) -+ -+ fclr c01 # CLEAR 8 register -+ fclr c02 -+ fclr c03 -+ fclr c04 ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ LD b1, 0*SIZE(BO) # b1 real part -+ LD b2, 1*SIZE(BO) # b1 image part ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + -+ LD a1, 0*SIZE(AO) # a1 real part -+ LD a2, 1*SIZE(AO) # a1 image part ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 + -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ subl K, KK, TEMP -+#elif defined(LEFT) -+ addl KK, 1, TEMP # mr=1 -+#else -+ addl KK, 1, TEMP # nr=1 -+#endif -+ sra TEMP, 1, L -+ ble L, $L85 ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+#else -+ mov B, BO # Set B, (block A x panel Bj) -+ sra K, 1, L # Unroll K as 2 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 + -+ fillcs 0*SIZE(C1) ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + -+ fclr c01 # CLEAR 8 register -+ fclr c02 -+ fclr c03 -+ fclr c04 ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ MUL a4, c03, t3 ++ MUL a4, c04, t4 + -+ LD b1, 0*SIZE(BO) # b1 real part -+ LD b2, 1*SIZE(BO) # b1 image part ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 + -+ LD a1, 0*SIZE(AO) # a1 real part -+ LD a2, 1*SIZE(AO) # a1 image part ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) + -+ ble L, $L85 -+#endif ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ MUL b1, c07, c07 ++ MUL b1, c08, c08 + -+ .align 4 -+$L82: -+ MAD a1,b1,c01,c01 # C11 real part -+ MAD a1,b2,c02,c02 # C11 imag part ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ MUL b2, c07, t3 ++ MUL b2, c08, t4 + -+ LD b5, 2 * SIZE(BO) # next B1R -+ LD b6, 3 * SIZE(BO) # next B1I ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + -+ LD a5, 2 * SIZE(AO) # next A1-A4 real part -+ LD a6, 3 * SIZE(AO) # next A1-A4 image part ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ MUL b3, c07, t3 ++ MUL b3, c08, t4 + -+ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE -+ MAD a2,b1,c03,c03 # C11 image part -+ MAD a2,b2,c04,c04 # C11 real part ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 + -+ subl L, 1, L # ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) + -+ addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE -+ MAD a5,b5,c01,c01 -+ MAD a5,b6,c02,c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 + -+ LD b1, 0 * SIZE(BO) -+ LD b2, 1 * SIZE(BO) ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ MUL a2, c11, t3 ++ MUL a2, c12, t4 + -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 + -+ MAD a6,b5,c03,c03 -+ MAD a6,b6,c04,c04 ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++ MUL a3, c15, c15 ++ MUL a3, c16, c16 ++#endif + -+ bne L, $L82 # continue K ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) + -+$L85: -+ LD alpha_r, ALPHA_R # $f30==b8 -+#ifndef TRMMKERNEL -+ blbc K, $L88 # if(K&1) -+#else -+ blbc TEMP, $L88 -+#endif ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ MUL a1, c15, c15 ++ MUL a1, c16, c16 + -+$L86: -+ addl AO, 2*SIZE, AO # AO+=8mr*1kr*2px*SIZE -+ MAD a1,b1,c01,c01 # C11 real part -+ MAD a1,b2,c02,c02 # C11 imag part ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ MUL a2, c15, t3 ++ MUL a2, c16, t4 + -+ addl BO, 2*SIZE, BO -+ MAD a2,b1,c03,c03 # C11 image part -+ MAD a2,b2,c04,c04 # C11 real part ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + -+$L88: # Write back -+ LD alpha_i, ALPHA_I # $f29==b7 -+#ifndef TRMMKERNEL -+ ADD1 c01, c04, c01 -+ ADD2 c03, c02, c02 ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ MUL a3, c15, t3 ++ MUL a3, c16, t4 + -+ LD b1, 0 * SIZE(C1) -+ LD b2, 1 * SIZE(C1) ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ FMAD5 c01, alpha_r, b1, b1 -+ FMAD7 c02, alpha_r, b2, b2 -+ FMAD8 c02, alpha_i, b1, b1 -+ FMAD6 c01, alpha_i, b2, b2 ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ MUL a4, c15, t3 ++ MUL a4, c16, t4 + -+ ST b1, 0 * SIZE(C1) -+ ST b2, 1 * SIZE(C1) ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+#else ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) + -+ ADD1 c01, c04, c01 -+ ADD2 c03, c02, c02 ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ MUL b1, c11, c11 ++ MUL b1, c12, c12 + -+ FMAD5 c01, alpha_r, $f31, b1 -+ FMAD7 c02, alpha_r, $f31, b2 ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ MUL b2, c11, t3 ++ MUL b2, c12, t4 + -+ FMAD8 c02, alpha_i, b1, b1 -+ FMAD6 c01, alpha_i, b2, b2 ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ ST b1, 0 * SIZE(C1) -+ ST b2, 1 * SIZE(C1) ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ MUL b3, c11, t3 ++ MUL b3, c12, t4 + -+#if ( defined(LEFT) && defined(TRANSA)) || \ -+ (!defined(LEFT) && !defined(TRANSA)) -+ subl K, KK, TEMP -+#ifdef LEFT -+ subl TEMP, 1, TEMP -+#else -+ subl TEMP, 1, TEMP -+#endif ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+ sll TEMP, ZBASE_SHIFT, L -+ sll TEMP, ZBASE_SHIFT, TEMP ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ addl AO, L, AO -+ addl BO, TEMP,BO -+#endif ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 + -+#ifdef LEFT -+ addl KK, 1, KK -+#endif -+#endif ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 + -+ addl C1, 2*SIZE, C1 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif + -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldl $9, 80($sp) -+ ldl $10,88($sp) -+ ldl $11,96($sp) -+ ldl $12,104($sp) -+ ldl $13,112($sp) -+ ldl $14,120($sp) ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) + -+ clr $0 ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) + -+ ldi $sp, STACKSIZE($sp) -+ ret $31,($26),1 # -+ -+ EPILOGUE -diff --git a/kernel/sw_64/zgemv_n.S b/kernel/sw_64/zgemv_n.S -new file mode 100644 -index 0000000..03d71ee ---- /dev/null -+++ b/kernel/sw_64/zgemv_n.S -@@ -0,0 +1,1040 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) + -+#define STACKSIZE 64 -+#define PREFETCHSIZE 32 ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) + -+#define M $16 -+#define N $17 -+#define A $21 -+#define LDA $18 ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) + -+#define X $19 -+#define INCX $20 -+#define Y $22 -+#define INCY $23 ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif + -+#define BUFFER $24 ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif + -+#define I $25 -+#define J $27 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + -+#define Y1 $4 -+#define A1 $5 -+#define A2 $6 ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) + -+#define alpha_r $f19 -+#define alpha_i $f20 ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) + -+#define alpha1 $f0 -+#define alpha2 $f1 -+#define alpha3 $f10 -+#define alpha4 $f11 ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) + -+#define y0 $f12 -+#define y1 $f13 -+#define y2 $f14 -+#define y3 $f15 ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif + -+#define y4 $f16 -+#define y5 $f17 -+#define y6 $f18 -+#define y7 $f21 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+#define a0 $f22 -+#define a1 $f23 -+#define a2 $f24 -+#define a3 $f25 -+#define a4 $f26 -+#define a5 $f27 -+#define a6 $f28 -+#define a7 $f29 ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+#define t0 $f2 -+#define t1 $f3 -+#define t2 $f4 -+#define t3 $f5 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif + -+#if !defined(CONJ) && !defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 SUB -+#define ADD4 ADD -+#elif defined(CONJ) && !defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#elif !defined(CONJ) && defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 ADD -+#define ADD4 SUB -+#else -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 SUB -+#define ADD4 SUB ++#ifdef LT ++ addl KK, 4, KK +#endif + -+ PROLOGUE ++#ifdef LN ++ subl KK, 4, KK ++#endif + -+ ldi $sp, -STACKSIZE($sp) -+ ldl LDA, 0 + STACKSIZE($sp) -+ ldl X, 8 + STACKSIZE($sp) -+ ldl INCX, 16 + STACKSIZE($sp) -+ ldl Y, 24 + STACKSIZE($sp) -+ ldl INCY, 32 + STACKSIZE($sp) -+ ldl BUFFER, 40 + STACKSIZE($sp) ++ ldi I, -1(I) + -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) ++ bgt I, $L11 ++ .align 4 + -+ PROFCODE ++$L20: ++ and M, 2, I ++ ble I, $L30 + -+ cmple M, 0, $0 -+ sll INCX, ZBASE_SHIFT, INCX -+ cmple N, 0, $1 -+ sll INCY, ZBASE_SHIFT, INCY ++#if defined(LT) || defined(RN) + -+ or $0, $1, $0 -+ bne $0, $L999 ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 + -+ cmpeq INCY, 2 * SIZE, $0 -+ sll LDA, ZBASE_SHIFT,LDA -+ bne $0, $L10 ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 + -+ mov BUFFER, Y1 ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) + -+ mov Y, BUFFER -+ mov Y1, Y ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 + -+ sra M, 2, I -+ ble I, $L05 -+ .align 4 ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 + -+$L02: -+ ST $f31, 0 * SIZE(Y1) -+ ST $f31, 1 * SIZE(Y1) -+ ST $f31, 2 * SIZE(Y1) -+ ST $f31, 3 * SIZE(Y1) -+ ST $f31, 4 * SIZE(Y1) -+ ST $f31, 5 * SIZE(Y1) -+ ST $f31, 6 * SIZE(Y1) -+ ST $f31, 7 * SIZE(Y1) ++ ble L, $L25 + -+ ldi Y1, 8 * SIZE(Y1) -+ ldi I, -1(I) -+ bgt I, $L02 -+ .align 4 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+$L05: -+ and M, 3, I -+ ble I, $L10 -+ .align 4 ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO + -+$L06: -+ ST $f31, 0 * SIZE(Y1) -+ ST $f31, 1 * SIZE(Y1) -+ addl Y1, 2 * SIZE, Y1 ++ subl K, KK, TMP1 + -+ ldi I, -1(I) -+ bgt I, $L06 -+ .align 4 ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 + -+$L10: -+ sra N, 1, J -+ ble J, $L20 -+ .align 4 ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 + -+$L11: -+ LD alpha1, 0 * SIZE(X) -+ LD alpha2, 1 * SIZE(X) -+ addl X, INCX, X -+ LD alpha3, 0 * SIZE(X) -+ LD alpha4, 1 * SIZE(X) -+ addl X, INCX, X ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) + -+ MUL alpha_r, alpha1, y0 -+ MUL alpha_r, alpha2, y1 -+ MUL alpha_r, alpha3, y2 -+ MUL alpha_r, alpha4, y3 ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 + -+ MUL alpha_i, alpha2, t0 -+ mov A, A1 -+ MUL alpha_i, alpha1, t1 -+ addl A, LDA, A2 -+ MUL alpha_i, alpha4, t2 -+ addl A2, LDA, A -+ MUL alpha_i, alpha3, t3 -+ mov Y, Y1 ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 + -+#ifndef XCONJ -+ SUB y0, t0, alpha1 -+ ADD y1, t1, alpha2 -+ SUB y2, t2, alpha3 -+ ADD y3, t3, alpha4 -+#else -+ ADD y0, t0, alpha1 -+ SUB y1, t1, alpha2 -+ ADD y2, t2, alpha3 -+ SUB y3, t3, alpha4 ++ ble L, $L25 +#endif ++ .align 4 + -+ fillcs 4 * SIZE(X) -+ -+ sra M, 2, I -+ ble I, $L15 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop + -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) + -+ MUL alpha1, a0, t0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ LD y1, 1 * SIZE(Y1) ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) + -+ MUL alpha1, a2, t2 -+ LD y2, 2 * SIZE(Y1) -+ MUL alpha1, a3, t3 -+ LD y3, 3 * SIZE(Y1) ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) + -+ ADD1 y0, t0, $f6 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 + unop -+ MUL alpha3, a4, t0 -+ LD y4, 4 * SIZE(Y1) + -+ ADD2 y1, t1, $f7 ++ ADD c02, t2, c02 + unop -+ MUL alpha3, a5, t1 -+ LD y5, 5 * SIZE(Y1) ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) + -+ ADD1 y2, t2, $f8 ++ ADD c05, t3, c05 + unop -+ MUL alpha3, a6, t2 -+ LD y6, 6 * SIZE(Y1) ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) + -+ ADD2 y3, t3, $f9 ++ ADD c09, t1, c09 + unop -+ MUL alpha3, a7, t3 -+ LD y7, 7 * SIZE(Y1) ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) + -+ ADD1 $f6, t0, y0 ++ ADD c10, t2, c10 + unop -+ MUL alpha2, a1, t0 -+ LD a1, 5 * SIZE(A1) ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) + -+ ADD2 $f7, t1, y1 ++ ADD c13, t3, c13 + unop -+ MUL alpha2, a0, t1 -+ LD a0, 4 * SIZE(A1) ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) + -+ ADD1 $f8, t2, y2 ++ ADD c02, t2, c02 + unop -+ MUL alpha2, a3, t2 -+ LD a3, 7 * SIZE(A1) ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) + -+ ADD2 $f9, t3, y3 ++ ADD c05, t3, c05 + unop -+ MUL alpha2, a2, t3 -+ LD a2, 6 * SIZE(A1) ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif + -+ ADD3 y0, t0, $f6 ++ ADD c10, t2, c10 + unop -+ MUL alpha4, a5, t0 -+ LD a5, 5 * SIZE(A2) ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) + -+ ADD4 y1, t1, $f7 ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 + unop -+ MUL alpha4, a4, t1 -+ LD a4, 4 * SIZE(A2) + -+ ADD3 y2, t2, $f8 ++ ADD c14, t4, c14 + unop -+ MUL alpha4, a7, t2 -+ LD a7, 7 * SIZE(A2) ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) + -+ ADD4 y3, t3, $f9 ++ ADD c01, t1, c01 + unop -+ MUL alpha4, a6, t3 -+ LD a6, 6 * SIZE(A2) ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) + -+ ADD3 $f6, t0, y0 -+ MUL alpha1, a0, t0 -+ ADD4 $f7, t1, y1 -+ MUL alpha1, a1, t1 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) + -+ ADD3 $f8, t2, y2 ++ ADD c05, t3, c05 + unop -+ MUL alpha1, a2, t2 ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 + unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) + -+ ADD4 $f9, t3, y3 -+ ldi I, -1(I) -+ MUL alpha1, a3, t3 -+ ble I, $L13 ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) + .align 4 + -+$L12: -+ ADD1 y4, t0, $f6 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha3, a4, t0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++$L27: ++ ADD c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ ADD c10, t2, c10 ++ ADD c13, t3, c13 ++ ADD c14, t4, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c13, c13 ++ SUB b4, c14, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) + -+ ADD2 y5, t1, $f7 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha3, a5, t1 -+ ldi I, -1(I) ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 + -+ ADD1 y6, t2, $f8 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha3, a6, t2 -+ unop ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 + -+ ADD2 y7, t3, $f9 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha3, a7, t3 -+ unop ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+ ADD1 $f6, t0, y4 -+ unop -+ MUL alpha2, a1, t0 -+ LD a1, 9 * SIZE(A1) ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++ MUL a3, c10, c10 ++ MUL a3, c14, c14 ++#endif + -+ ADD2 $f7, t1, y5 -+ unop -+ MUL alpha2, a0, t1 -+ LD a0, 8 * SIZE(A1) ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ ADD1 $f8, t2, y6 -+ unop -+ MUL alpha2, a3, t2 -+ LD a3, 11 * SIZE(A1) ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 + -+ ADD2 $f9, t3, y7 -+ unop -+ MUL alpha2, a2, t3 -+ LD a2, 10 * SIZE(A1) ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 + -+ ADD3 y4, t0, $f6 -+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) -+ MUL alpha4, a5, t0 -+ LD a5, 9 * SIZE(A2) ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 + -+ ADD4 y5, t1, $f7 -+ unop -+ MUL alpha4, a4, t1 -+ LD a4, 8 * SIZE(A2) ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 + -+ ADD3 y6, t2, $f8 -+ unop -+ MUL alpha4, a7, t2 -+ LD a7, 11 * SIZE(A2) ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 + -+ ADD4 y7, t3, $f9 -+ unop -+ MUL alpha4, a6, t3 -+ LD a6, 10 * SIZE(A2) ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 + -+ ADD3 $f6, t0, y4 -+ unop -+ MUL alpha1, a0, t0 -+ LD y0, 8 * SIZE(Y1) ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 + -+ ADD4 $f7, t1, y5 -+ unop -+ MUL alpha1, a1, t1 -+ LD y1, 9 * SIZE(Y1) ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) + -+ ADD3 $f8, t2, y6 -+ unop -+ MUL alpha1, a2, t2 -+ LD y2, 10 * SIZE(Y1) ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 + -+ ADD4 $f9, t3, y7 -+ unop -+ MUL alpha1, a3, t3 -+ LD y3, 11 * SIZE(Y1) ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 + -+ ADD1 y0, t0, $f6 -+ ST y4, 4 * SIZE(Y1) -+ MUL alpha3, a4, t0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 + -+ ADD2 y1, t1, $f7 -+ ST y5, 5 * SIZE(Y1) -+ MUL alpha3, a5, t1 -+ unop ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 + -+ ADD1 y2, t2, $f8 -+ ST y6, 6 * SIZE(Y1) -+ MUL alpha3, a6, t2 -+ unop ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 + -+ ADD2 y3, t3, $f9 -+ ST y7, 7 * SIZE(Y1) -+ MUL alpha3, a7, t3 -+ ldi Y1, 8 * SIZE(Y1) ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) + -+ ADD1 $f6, t0, y0 -+ unop -+ MUL alpha2, a1, t0 -+ LD a1, 13 * SIZE(A1) ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 + -+ ADD2 $f7, t1, y1 -+ unop -+ MUL alpha2, a0, t1 -+ LD a0, 12 * SIZE(A1) ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 + -+ ADD1 $f8, t2, y2 -+ unop -+ MUL alpha2, a3, t2 -+ LD a3, 15 * SIZE(A1) ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 + -+ ADD2 $f9, t3, y3 -+ unop -+ MUL alpha2, a2, t3 -+ LD a2, 14 * SIZE(A1) ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++#endif + -+ ADD3 y0, t0, $f6 -+ unop -+ MUL alpha4, a5, t0 -+ LD a5, 13 * SIZE(A2) ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) + -+ ADD4 y1, t1, $f7 -+ unop -+ MUL alpha4, a4, t1 -+ LD a4, 12 * SIZE(A2) ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 + -+ ADD3 y2, t2, $f8 -+ unop -+ MUL alpha4, a7, t2 -+ LD a7, 15 * SIZE(A2) ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 + -+ ADD4 y3, t3, $f9 -+ unop -+ MUL alpha4, a6, t3 -+ LD a6, 14 * SIZE(A2) ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 + -+ ADD3 $f6, t0, y0 -+ unop -+ MUL alpha1, a0, t0 -+ LD y4, 4 * SIZE(Y1) ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 + -+ ADD4 $f7, t1, y1 -+ ldi A2, 8 * SIZE(A2) -+ MUL alpha1, a1, t1 -+ LD y5, 5 * SIZE(Y1) ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 + -+ ADD3 $f8, t2, y2 -+ ldi A1, 8 * SIZE(A1) -+ MUL alpha1, a2, t2 -+ LD y6, 6 * SIZE(Y1) ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 + -+ ADD4 $f9, t3, y3 -+ MUL alpha1, a3, t3 -+ LD y7, 7 * SIZE(Y1) -+ bgt I, $L12 -+ .align 4 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 + -+$L13: -+ ADD1 y4, t0, $f6 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha3, a4, t0 -+ unop ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) + -+ ADD2 y5, t1, $f7 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha3, a5, t1 -+ unop ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 + -+ ADD1 y6, t2, $f8 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha3, a6, t2 -+ unop ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 + -+ ADD2 y7, t3, $f9 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha3, a7, t3 -+ unop ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 + -+ ADD1 $f6, t0, y4 -+ MUL alpha2, a1, t0 -+ ADD2 $f7, t1, y5 -+ MUL alpha2, a0, t1 ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 + -+ ADD1 $f8, t2, y6 -+ MUL alpha2, a3, t2 -+ ADD2 $f9, t3, y7 -+ MUL alpha2, a2, t3 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 + -+ ADD3 y4, t0, $f6 -+ MUL alpha4, a5, t0 -+ ADD4 y5, t1, $f7 -+ MUL alpha4, a4, t1 ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ ADD3 y6, t2, $f8 -+ MUL alpha4, a7, t2 -+ ADD4 y7, t3, $f9 -+ MUL alpha4, a6, t3 ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 + -+ ADD3 $f6, t0, y4 -+ ADD4 $f7, t1, y5 -+ ADD3 $f8, t2, y6 -+ ADD4 $f9, t3, y7 ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 + -+ ST y4, 4 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y5, 5 * SIZE(Y1) -+ ldi A2, 8 * SIZE(A2) ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 + -+ ST y6, 6 * SIZE(Y1) -+ unop -+ ST y7, 7 * SIZE(Y1) -+ ldi Y1, 8 * SIZE(Y1) -+ .align 4 ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif + -+$L15: -+ and M, 2, I -+ ble I, $L17 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) + -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif + -+ MUL alpha1, a0, t0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ LD y1, 1 * SIZE(Y1) -+ MUL alpha1, a2, t2 -+ LD y2, 2 * SIZE(Y1) -+ MUL alpha1, a3, t3 -+ LD y3, 3 * SIZE(Y1) ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif + -+ ADD1 y0, t0, $f6 -+ MUL alpha3, a4, t0 -+ ADD2 y1, t1, $f7 -+ MUL alpha3, a5, t1 -+ ADD1 y2, t2, $f8 -+ MUL alpha3, a6, t2 -+ ADD2 y3, t3, $f9 -+ MUL alpha3, a7, t3 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) + -+ ADD1 $f6, t0, y0 -+ MUL alpha2, a1, t0 -+ ADD2 $f7, t1, y1 -+ MUL alpha2, a0, t1 ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) + -+ ADD1 $f8, t2, y2 -+ MUL alpha2, a3, t2 -+ ADD2 $f9, t3, y3 -+ MUL alpha2, a2, t3 ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif + -+ ADD3 y0, t0, $f6 -+ MUL alpha4, a5, t0 -+ ADD4 y1, t1, $f7 -+ MUL alpha4, a4, t1 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ADD3 y2, t2, $f8 -+ MUL alpha4, a7, t2 -+ ADD4 y3, t3, $f9 -+ MUL alpha4, a6, t3 ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ ADD3 $f6, t0, y0 -+ ADD4 $f7, t1, y1 -+ ADD3 $f8, t2, y2 -+ ADD4 $f9, t3, y3 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ST y0, 0 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y1, 1 * SIZE(Y1) -+ ldi A2, 4 * SIZE(A2) ++#ifdef LT ++ addl KK, 2, KK ++#endif + -+ ST y2, 2 * SIZE(Y1) -+ unop -+ ST y3, 3 * SIZE(Y1) -+ ldi Y1, 4 * SIZE(Y1) ++#ifdef LN ++ subl KK, 2, KK ++#endif + .align 4 + -+$L17: -+ blbc M, $L18 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) ++$L30: ++ and M, 1, I ++ ble I, $L39 + -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) ++#if defined(LT) || defined(RN) + -+ MUL alpha1, a0, t0 -+ MUL alpha1, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ ADD1 y0, t0, $f6 -+ MUL alpha3, a2, t0 -+ ADD2 y1, t1, $f7 -+ MUL alpha3, a3, t1 ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) + -+ ADD1 $f6, t0, y0 -+ MUL alpha2, a1, t0 -+ ADD2 $f7, t1, y1 -+ MUL alpha2, a0, t1 ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 + -+ ADD3 y0, t0, $f6 -+ MUL alpha4, a3, t0 -+ ADD4 y1, t1, $f7 -+ MUL alpha4, a2, t1 ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 + -+ ADD3 $f6, t0, y0 -+ ADD4 $f7, t1, y1 ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ST y0, 0 * SIZE(Y1) -+ ST y1, 1 * SIZE(Y1) -+ .align 4 ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO + -+$L18: -+ ldi J, -1(J) -+ bgt J, $L11 -+ .align 4 ++ subl K, KK, TMP1 + -+$L20: -+ blbc N, $L990 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ LD alpha1, 0 * SIZE(X) -+ LD alpha2, 1 * SIZE(X) ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) + -+ MUL alpha_r, alpha1, y0 -+ MUL alpha_r, alpha2, y1 ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 + -+ MUL alpha_i, alpha2, t0 -+ mov A, A1 -+ MUL alpha_i, alpha1, t1 -+ mov Y, Y1 ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 + -+#ifndef XCONJ -+ SUB y0, t0, alpha1 -+ ADD y1, t1, alpha2 -+#else -+ ADD y0, t0, alpha1 -+ SUB y1, t1, alpha2 ++ ble L, $L35 +#endif ++ .align 4 + -+ sra M, 2, I -+ ble I, $L25 -+ -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) + -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) + -+ MUL alpha1, a0, t0 -+ LD a4, 4 * SIZE(A1) -+ MUL alpha1, a1, t1 -+ LD a5, 5 * SIZE(A1) -+ MUL alpha1, a2, t2 -+ LD a6, 6 * SIZE(A1) -+ MUL alpha1, a3, t3 -+ LD a7, 7 * SIZE(A1) ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) + -+ ADD1 y0, t0, $f6 -+ unop -+ MUL alpha2, a1, t0 -+ LD a1, 9 * SIZE(A1) ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) + -+ ADD2 y1, t1, $f7 -+ unop -+ MUL alpha2, a0, t1 -+ LD a0, 8 * SIZE(A1) ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) + -+ ADD1 y2, t2, $f8 -+ unop -+ MUL alpha2, a3, t2 -+ LD a3, 11 * SIZE(A1) ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) + -+ ADD2 y3, t3, $f9 -+ unop -+ MUL alpha2, a2, t3 -+ LD a2, 10 * SIZE(A1) ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) + -+ ADD3 $f6, t0, y0 -+ unop -+ LD y4, 4 * SIZE(Y1) -+ MUL alpha1, a4, t0 ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 + -+ ADD4 $f7, t1, y1 -+ unop -+ LD y5, 5 * SIZE(Y1) -+ MUL alpha1, a5, t1 ++$L35: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 + -+ ADD3 $f8, t2, y2 -+ LD y6, 6 * SIZE(Y1) -+ MUL alpha1, a6, t2 -+ ldi I, -1(I) ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) + -+ ADD4 $f9, t3, y3 -+ LD y7, 7 * SIZE(Y1) -+ MUL alpha1, a7, t3 -+ ble I, $L23 -+ .align 4 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) + -+$L22: -+ ADD1 y4, t0, $f6 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a5, t0 -+ LD a5, 13 * SIZE(A1) ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) + -+ ADD2 y5, t1, $f7 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a4, t1 -+ LD a4, 12 * SIZE(A1) ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 + -+ ADD1 y6, t2, $f8 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a7, t2 -+ LD a7, 15 * SIZE(A1) ++$L37: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 + -+ ADD2 y7, t3, $f9 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a6, t3 -+ LD a6, 14 * SIZE(A1) ++ ADD c13, t4, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, t4 ++ ldi BO, 4 * SIZE(BO) + -+ ADD3 $f6, t0, y4 -+ LD y0, 8 * SIZE(Y1) -+ MUL alpha1, a0, t0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ADD c09, t3, c09 ++ ADD c13, t4, c13 + -+ ADD4 $f7, t1, y5 -+ LD y1, 9 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ ldi I, -1(I) ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif + -+ ADD3 $f8, t2, y6 -+ LD y2, 10 * SIZE(Y1) -+ MUL alpha1, a2, t2 -+ unop ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ ADD4 $f9, t3, y7 -+ LD y3, 11 * SIZE(Y1) -+ MUL alpha1, a3, t3 -+ unop ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ ADD1 y0, t0, $f6 -+ ST y4, 4 * SIZE(Y1) -+ MUL alpha2, a1, t0 -+ LD a1, 17 * SIZE(A1) ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#endif + -+ ADD2 y1, t1, $f7 -+ ST y5, 5 * SIZE(Y1) -+ MUL alpha2, a0, t1 -+ LD a0, 16 * SIZE(A1) ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) + -+ ADD1 y2, t2, $f8 -+ ST y6, 6 * SIZE(Y1) -+ MUL alpha2, a3, t2 -+ LD a3, 19 * SIZE(A1) ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++#endif + -+ ADD2 y3, t3, $f9 -+ ST y7, 7 * SIZE(Y1) -+ MUL alpha2, a2, t3 -+ LD a2, 18 * SIZE(A1) ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ ADD3 $f6, t0, y0 -+ LD y4, 12 * SIZE(Y1) -+ MUL alpha1, a4, t0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c01, t1 ++ SUB c09, t1, c09 ++ MUL a4, c01, t1 ++ SUB c13, t1, c13 + -+ ADD4 $f7, t1, y1 -+ LD y5, 13 * SIZE(Y1) -+ MUL alpha1, a5, t1 -+ ldi A1, 8 * SIZE(A1) ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) + -+ ADD3 $f8, t2, y2 -+ LD y6, 14 * SIZE(Y1) -+ MUL alpha1, a6, t2 -+ ldi Y1, 8 * SIZE(Y1) ++ MUL b1, c05, c05 ++ MUL b2, c05, t1 ++ SUB c09, t1, c09 ++ MUL b3, c05, t1 ++ SUB c13, t1, c13 + -+ ADD4 $f9, t3, y3 -+ LD y7, 7 * SIZE(Y1) -+ MUL alpha1, a7, t3 -+ bgt I, $L22 -+ .align 4 ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) + -+$L23: -+ ADD1 y4, t0, $f6 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a5, t0 -+ unop ++ MUL a1, c09, c09 ++ MUL a2, c09, t1 ++ SUB c13, t1, c13 ++ MUL a3, c13, c13 ++#endif + -+ ADD2 y5, t1, $f7 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a4, t1 -+ unop ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) + -+ ADD1 y6, t2, $f8 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a7, t2 -+ unop ++ MUL a1, c13, c13 ++ MUL a2, c13, t1 ++ SUB c09, t1, c09 ++ MUL a3, c13, t1 ++ SUB c05, t1, c05 ++ MUL a4, c13, t1 ++ SUB c01, t1, c01 + -+ ADD2 y7, t3, $f9 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a6, t3 -+ unop ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) + -+ ADD3 $f6, t0, y4 -+ ADD4 $f7, t1, y5 -+ ADD3 $f8, t2, y6 -+ ADD4 $f9, t3, y7 ++ MUL b1, c09, c09 ++ MUL b2, c09, t1 ++ SUB c05, t1, c05 ++ MUL b3, c09, t1 ++ SUB c01, t1, c01 + -+ ST y4, 4 * SIZE(Y1) -+ unop -+ ST y5, 5 * SIZE(Y1) -+ unop ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ ST y6, 6 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y7, 7 * SIZE(Y1) -+ ldi Y1, 8 * SIZE(Y1) -+ .align 4 ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+$L25: -+ and M, 2, I -+ ble I, $L27 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif + -+ MUL alpha1, a0, t0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ LD y1, 1 * SIZE(Y1) -+ MUL alpha1, a2, t2 -+ LD y2, 2 * SIZE(Y1) -+ MUL alpha1, a3, t3 -+ LD y3, 3 * SIZE(Y1) ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) + -+ ADD1 y0, t0, $f6 -+ MUL alpha2, a1, t0 -+ ADD2 y1, t1, $f7 -+ MUL alpha2, a0, t1 -+ ADD1 y2, t2, $f8 -+ MUL alpha2, a3, t2 -+ ADD2 y3, t3, $f9 -+ MUL alpha2, a2, t3 ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ ADD3 $f6, t0, y0 -+ ADD4 $f7, t1, y1 -+ ADD3 $f8, t2, y2 -+ ADD4 $f9, t3, y3 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ST y0, 0 * SIZE(Y1) -+ ST y1, 1 * SIZE(Y1) ++#ifdef LT ++ addl KK, 1, KK ++#endif + -+ ST y2, 2 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y3, 3 * SIZE(Y1) -+ ldi Y1, 4 * SIZE(Y1) ++#ifdef LN ++ subl KK, 1, KK ++#endif + .align 4 + -+$L27: -+ blbc M, $L990 ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif + -+ MUL alpha1, a0, t0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ LD y1, 1 * SIZE(Y1) ++#ifdef RN ++ addl KK, 4, KK ++#endif + -+ ADD1 y0, t0, $f6 -+ MUL alpha2, a1, t0 -+ ADD2 y1, t1, $f7 -+ MUL alpha2, a0, t1 ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 + -+ ADD3 $f6, t0, y0 -+ ADD4 $f7, t1, y1 ++$L40: ++ and N, 2, J ++ ble J, $L80 + -+ ST y0, 0 * SIZE(Y1) -+ ST y1, 1 * SIZE(Y1) -+ .align 4 ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B + -+$L990: -+ cmpeq INCY, 2 * SIZE, $0 -+ bne $0, $L999 ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif + -+ mov BUFFER, Y1 ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 + -+ sra M, 2, I -+ ble I, $L995 -+ .align 4 ++#ifdef LN ++ addl M, OFFSET, KK ++#endif + -+$L992: -+ LD a0, 0 * SIZE(BUFFER) -+ LD a1, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a2, 0 * SIZE(BUFFER) -+ LD a3, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER ++#ifdef LT ++ mov OFFSET, KK ++#endif + -+ LD y0, 0 * SIZE(Y) -+ LD y1, 1 * SIZE(Y) -+ LD y2, 2 * SIZE(Y) -+ LD y3, 3 * SIZE(Y) ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif + -+ LD a4, 0 * SIZE(BUFFER) -+ LD a5, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a6, 0 * SIZE(BUFFER) -+ LD a7, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 + -+ LD y4, 4 * SIZE(Y) -+ LD y5, 5 * SIZE(Y) -+ LD y6, 6 * SIZE(Y) -+ LD y7, 7 * SIZE(Y) ++$L51: ++#if defined(LT) || defined(RN) + -+ ADD a0, y0, $f6 -+ ADD a1, y1, $f7 -+ ADD a2, y2, $f8 -+ ADD a3, y3, $f9 ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 + -+ fmov $f6, a0 -+ fmov $f7, a1 -+ fmov $f8, a2 -+ fmov $f9, a3 -+ -+ ST a0, 0 * SIZE(Y1) -+ ADD a4, y4, $f6 -+ ST a1, 1 * SIZE(Y1) -+ ADD a5, y5, $f7 -+ addl Y1, INCY, Y1 ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 + -+ ST a2, 0 * SIZE(Y1) -+ ADD a6, y6, $f8 -+ ST a3, 1 * SIZE(Y1) -+ ADD a7, y7, $f9 -+ addl Y1, INCY, Y1 ++ ldi L, -2(KK) + -+ fmov $f6, a4 -+ fmov $f7, a5 -+ fmov $f8, a6 -+ fmov $f9, a7 ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) + -+ ST a4, 0 * SIZE(Y1) -+ ST a5, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a6, 0 * SIZE(Y1) -+ ST a7, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 ++ ble KK, $L58 + -+ ldi I, -1(I) -+ ldi Y, 8 * SIZE(Y) -+ bgt I, $L992 -+ .align 4 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+$L995: -+ and M, 3, I -+ ble I, $L999 -+ .align 4 ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO + -+$L996: -+ LD a0, 0 * SIZE(BUFFER) -+ LD a1, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER ++ subl K, KK, TMP1 + -+ LD y0, 0 * SIZE(Y) -+ LD y1, 1 * SIZE(Y) -+ ldi Y, 2 * SIZE(Y) ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 + -+ ADD a0, y0, $f6 -+ ADD a1, y1, $f7 ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 + -+ fmov $f6, a0 -+ fmov $f7, a1 ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) + -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 ++ ble TMP1, $L58 + -+ ldi I, -1(I) -+ bgt I, $L996 -+ .align 4 ++ ble L, $L55 ++#endif ++ .align 4 + -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) ++$L52: ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop + -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/zgemv_n.S.bak b/kernel/sw_64/zgemv_n.S.bak -new file mode 100644 -index 0000000..3dd482e ---- /dev/null -+++ b/kernel/sw_64/zgemv_n.S.bak -@@ -0,0 +1,1027 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop + -+#define STACKSIZE 64 -+#define PREFETCHSIZE 32 ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) + -+#define M $16 -+#define N $17 -+#define A $21 -+#define LDA $18 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) + -+#define X $19 -+#define INCX $20 -+#define Y $22 -+#define INCY $23 ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) + -+#define BUFFER $24 ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) + -+#define I $25 -+#define J $27 ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) + -+#define Y1 $4 -+#define A1 $5 -+#define A2 $6 ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) + -+#define alpha_r $f19 -+#define alpha_i $f20 ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop + -+#define alpha1 $f0 -+#define alpha2 $f1 -+#define alpha3 $f10 -+#define alpha4 $f11 ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) + -+#define y0 $f12 -+#define y1 $f13 -+#define y2 $f14 -+#define y3 $f15 ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) + -+#define y4 $f16 -+#define y5 $f17 -+#define y6 $f18 -+#define y7 $f21 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) + -+#define a0 $f22 -+#define a1 $f23 -+#define a2 $f24 -+#define a3 $f25 -+#define a4 $f26 -+#define a5 $f27 -+#define a6 $f28 -+#define a7 $f29 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) + -+#define t0 $f2 -+#define t1 $f3 -+#define t2 $f4 -+#define t3 $f5 ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) + -+#if !defined(CONJ) && !defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 SUB -+#define ADD4 ADD -+#elif defined(CONJ) && !defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#elif !defined(CONJ) && defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 ADD -+#define ADD4 SUB ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, c05 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 +#else -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 SUB -+#define ADD4 SUB ++ blbs TMP1, $L57 +#endif ++ .align 4 + -+ PROLOGUE -+ -+ ldi $sp, -STACKSIZE($sp) -+ ldl LDA, 0 + STACKSIZE($sp) -+ ldl X, 8 + STACKSIZE($sp) -+ ldl INCX, 16 + STACKSIZE($sp) -+ ldl Y, 24 + STACKSIZE($sp) -+ ldl INCY, 32 + STACKSIZE($sp) -+ ldl BUFFER, 40 + STACKSIZE($sp) -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ -+ PROFCODE ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 + -+ cmple M, 0, $0 -+ sll INCX, ZBASE_SHIFT, INCX -+ cmple N, 0, $1 -+ sll INCY, ZBASE_SHIFT, INCY ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) + -+ or $0, $1, $0 -+ bne $0, $L999 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) + -+ cmpeq INCY, 2 * SIZE, $0 -+ sll LDA, ZBASE_SHIFT,LDA -+ bne $0, $L10 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) + -+ mov BUFFER, Y1 ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) + -+ mov Y, BUFFER -+ mov Y1, Y ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) + -+ sra M, 2, I -+ ble I, $L05 ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) + .align 4 + -+$L02: -+ ST $f31, 0 * SIZE(Y1) -+ ST $f31, 1 * SIZE(Y1) -+ ST $f31, 2 * SIZE(Y1) -+ ST $f31, 3 * SIZE(Y1) -+ ST $f31, 4 * SIZE(Y1) -+ ST $f31, 5 * SIZE(Y1) -+ ST $f31, 6 * SIZE(Y1) -+ ST $f31, 7 * SIZE(Y1) ++$L57: ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 + -+ ldi Y1, 8 * SIZE(Y1) -+ ldi I, -1(I) -+ bgt I, $L02 -+ .align 4 ++ ADD c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD c01, t1, c01 ++ MUL a1, b2, t1 + -+$L05: -+ and M, 3, I -+ ble I, $L10 -+ .align 4 ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD c03, t3, c03 ++ MUL a3, b2, t3 + -+$L06: -+ ST $f31, 0 * SIZE(Y1) -+ ST $f31, 1 * SIZE(Y1) -+ addl Y1, 2 * SIZE, Y1 ++ ADD c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) + -+ ldi I, -1(I) -+ bgt I, $L06 ++ ADD c05, t1, c05 ++ ADD c06, t2, c06 ++ ADD c07, t3, c07 ++ ADD c08, t4, c08 + .align 4 + -+$L10: -+ sra N, 1, J -+ ble J, $L20 -+ .align 4 ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif + -+$L11: -+ LD alpha1, 0 * SIZE(X) -+ LD alpha2, 1 * SIZE(X) -+ addl X, INCX, X -+ LD alpha3, 0 * SIZE(X) -+ LD alpha4, 1 * SIZE(X) -+ addl X, INCX, X ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ MUL alpha_r, alpha1, y0 -+ MUL alpha_r, alpha2, y1 -+ MUL alpha_r, alpha3, y2 -+ MUL alpha_r, alpha4, y3 ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) + -+ MUL alpha_i, alpha2, t0 -+ mov A, A1 -+ MUL alpha_i, alpha1, t1 -+ addl A, LDA, A2 -+ MUL alpha_i, alpha4, t2 -+ addl A2, LDA, A -+ MUL alpha_i, alpha3, t3 -+ mov Y, Y1 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 + -+#ifndef XCONJ -+ SUB y0, t0, alpha1 -+ ADD y1, t1, alpha2 -+ SUB y2, t2, alpha3 -+ ADD y3, t3, alpha4 ++ SUB b1, c03, c03 ++ SUB b2, c07, c07 ++ SUB b3, c04, c04 ++ SUB b4, c08, c08 +#else -+ ADD y0, t0, alpha1 -+ SUB y1, t1, alpha2 -+ ADD y2, t2, alpha3 -+ SUB y3, t3, alpha4 -+#endif -+ -+ fillcs 4 * SIZE(X) -+ -+ sra M, 2, I -+ ble I, $L15 ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) + -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 + -+ MUL alpha1, a0, t0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ LD y1, 1 * SIZE(Y1) ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++#endif + -+ MUL alpha1, a2, t2 -+ LD y2, 2 * SIZE(Y1) -+ MUL alpha1, a3, t3 -+ LD y3, 3 * SIZE(Y1) ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) + -+ ADD1 y0, t0, y0 -+ unop -+ MUL alpha3, a4, t0 -+ LD y4, 4 * SIZE(Y1) ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 + -+ ADD2 y1, t1, y1 -+ unop -+ MUL alpha3, a5, t1 -+ LD y5, 5 * SIZE(Y1) ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 + -+ ADD1 y2, t2, y2 -+ unop -+ MUL alpha3, a6, t2 -+ LD y6, 6 * SIZE(Y1) ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 + -+ ADD2 y3, t3, y3 -+ unop -+ MUL alpha3, a7, t3 -+ LD y7, 7 * SIZE(Y1) ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 + -+ ADD1 y0, t0, y0 -+ unop -+ MUL alpha2, a1, t0 -+ LD a1, 5 * SIZE(A1) ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 + -+ ADD2 y1, t1, y1 -+ unop -+ MUL alpha2, a0, t1 -+ LD a0, 4 * SIZE(A1) ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 + -+ ADD1 y2, t2, y2 -+ unop -+ MUL alpha2, a3, t2 -+ LD a3, 7 * SIZE(A1) ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 + -+ ADD2 y3, t3, y3 -+ unop -+ MUL alpha2, a2, t3 -+ LD a2, 6 * SIZE(A1) ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) + -+ ADD3 y0, t0, y0 -+ unop -+ MUL alpha4, a5, t0 -+ LD a5, 5 * SIZE(A2) ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 + -+ ADD4 y1, t1, y1 -+ unop -+ MUL alpha4, a4, t1 -+ LD a4, 4 * SIZE(A2) ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 + -+ ADD3 y2, t2, y2 -+ unop -+ MUL alpha4, a7, t2 -+ LD a7, 7 * SIZE(A2) ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 + -+ ADD4 y3, t3, y3 -+ unop -+ MUL alpha4, a6, t3 -+ LD a6, 6 * SIZE(A2) ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 + -+ ADD3 y0, t0, y0 -+ MUL alpha1, a0, t0 -+ ADD4 y1, t1, y1 -+ MUL alpha1, a1, t1 ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 + -+ ADD3 y2, t2, y2 -+ unop -+ MUL alpha1, a2, t2 -+ unop ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ ADD4 y3, t3, y3 -+ ldi I, -1(I) -+ MUL alpha1, a3, t3 -+ ble I, $L13 -+ .align 4 ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 + -+$L12: -+ ADD1 y4, t0, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha3, a4, t0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 + -+ ADD2 y5, t1, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha3, a5, t1 -+ ldi I, -1(I) ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 + -+ ADD1 y6, t2, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha3, a6, t2 -+ unop ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif + -+ ADD2 y7, t3, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha3, a7, t3 -+ unop ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ ADD1 y4, t0, y4 -+ unop -+ MUL alpha2, a1, t0 -+ LD a1, 9 * SIZE(A1) ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 + -+ ADD2 y5, t1, y5 -+ unop -+ MUL alpha2, a0, t1 -+ LD a0, 8 * SIZE(A1) ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 + -+ ADD1 y6, t2, y6 -+ unop -+ MUL alpha2, a3, t2 -+ LD a3, 11 * SIZE(A1) ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 + -+ ADD2 y7, t3, y7 -+ unop -+ MUL alpha2, a2, t3 -+ LD a2, 10 * SIZE(A1) ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 + -+ ADD3 y4, t0, y4 -+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) -+ MUL alpha4, a5, t0 -+ LD a5, 9 * SIZE(A2) ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 + -+ ADD4 y5, t1, y5 -+ unop -+ MUL alpha4, a4, t1 -+ LD a4, 8 * SIZE(A2) ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 + -+ ADD3 y6, t2, y6 -+ unop -+ MUL alpha4, a7, t2 -+ LD a7, 11 * SIZE(A2) ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 + -+ ADD4 y7, t3, y7 -+ unop -+ MUL alpha4, a6, t3 -+ LD a6, 10 * SIZE(A2) ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) + -+ ADD3 y4, t0, y4 -+ unop -+ MUL alpha1, a0, t0 -+ LD y0, 8 * SIZE(Y1) ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 + -+ ADD4 y5, t1, y5 -+ unop -+ MUL alpha1, a1, t1 -+ LD y1, 9 * SIZE(Y1) ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 + -+ ADD3 y6, t2, y6 -+ unop -+ MUL alpha1, a2, t2 -+ LD y2, 10 * SIZE(Y1) ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 + -+ ADD4 y7, t3, y7 -+ unop -+ MUL alpha1, a3, t3 -+ LD y3, 11 * SIZE(Y1) ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 + -+ ADD1 y0, t0, y0 -+ ST y4, 4 * SIZE(Y1) -+ MUL alpha3, a4, t0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 + -+ ADD2 y1, t1, y1 -+ ST y5, 5 * SIZE(Y1) -+ MUL alpha3, a5, t1 -+ unop ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) + -+ ADD1 y2, t2, y2 -+ ST y6, 6 * SIZE(Y1) -+ MUL alpha3, a6, t2 -+ unop ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 + -+ ADD2 y3, t3, y3 -+ ST y7, 7 * SIZE(Y1) -+ MUL alpha3, a7, t3 -+ ldi Y1, 8 * SIZE(Y1) ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 + -+ ADD1 y0, t0, y0 -+ unop -+ MUL alpha2, a1, t0 -+ LD a1, 13 * SIZE(A1) ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 + -+ ADD2 y1, t1, y1 -+ unop -+ MUL alpha2, a0, t1 -+ LD a0, 12 * SIZE(A1) ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++#endif + -+ ADD1 y2, t2, y2 -+ unop -+ MUL alpha2, a3, t2 -+ LD a3, 15 * SIZE(A1) ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) + -+ ADD2 y3, t3, y3 -+ unop -+ MUL alpha2, a2, t3 -+ LD a2, 14 * SIZE(A1) ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + -+ ADD3 y0, t0, y0 -+ unop -+ MUL alpha4, a5, t0 -+ LD a5, 13 * SIZE(A2) ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 + -+ ADD4 y1, t1, y1 -+ unop -+ MUL alpha4, a4, t1 -+ LD a4, 12 * SIZE(A2) ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ ADD3 y2, t2, y2 -+ unop -+ MUL alpha4, a7, t2 -+ LD a7, 15 * SIZE(A2) ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++ MUL a3, c07, c07 ++ MUL a3, c08, c08 ++#endif + -+ ADD4 y3, t3, y3 -+ unop -+ MUL alpha4, a6, t3 -+ LD a6, 14 * SIZE(A2) ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ ADD3 y0, t0, y0 -+ unop -+ MUL alpha1, a0, t0 -+ LD y4, 4 * SIZE(Y1) ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 + -+ ADD4 y1, t1, y1 -+ ldi A2, 8 * SIZE(A2) -+ MUL alpha1, a1, t1 -+ LD y5, 5 * SIZE(Y1) ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 + -+ ADD3 y2, t2, y2 -+ ldi A1, 8 * SIZE(A1) -+ MUL alpha1, a2, t2 -+ LD y6, 6 * SIZE(Y1) ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+ ADD4 y3, t3, y3 -+ MUL alpha1, a3, t3 -+ LD y7, 7 * SIZE(Y1) -+ bgt I, $L12 -+ .align 4 ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif + -+$L13: -+ ADD1 y4, t0, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha3, a4, t0 -+ unop ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) + -+ ADD2 y5, t1, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha3, a5, t1 -+ unop ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) + -+ ADD1 y6, t2, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha3, a6, t2 -+ unop ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif + -+ ADD2 y7, t3, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha3, a7, t3 -+ unop ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif + -+ ADD1 y4, t0, y4 -+ MUL alpha2, a1, t0 -+ ADD2 y5, t1, y5 -+ MUL alpha2, a0, t1 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + -+ ADD1 y6, t2, y6 -+ MUL alpha2, a3, t2 -+ ADD2 y7, t3, y7 -+ MUL alpha2, a2, t3 ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) + -+ ADD3 y4, t0, y4 -+ MUL alpha4, a5, t0 -+ ADD4 y5, t1, y5 -+ MUL alpha4, a4, t1 ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif + -+ ADD3 y6, t2, y6 -+ MUL alpha4, a7, t2 -+ ADD4 y7, t3, y7 -+ MUL alpha4, a6, t3 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ADD3 y4, t0, y4 -+ ADD4 y5, t1, y5 -+ ADD3 y6, t2, y6 -+ ADD4 y7, t3, y7 ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ ST y4, 4 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y5, 5 * SIZE(Y1) -+ ldi A2, 8 * SIZE(A2) ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ST y6, 6 * SIZE(Y1) -+ unop -+ ST y7, 7 * SIZE(Y1) -+ ldi Y1, 8 * SIZE(Y1) -+ .align 4 ++#ifdef LT ++ addl KK, 4, KK ++#endif + -+$L15: -+ and M, 2, I -+ ble I, $L17 ++#ifdef LN ++ subl KK, 4, KK ++#endif + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) ++ ldi I, -1(I) + -+ LD a4, 0 * SIZE(A2) -+ LD a5, 1 * SIZE(A2) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) ++ bgt I, $L51 ++ .align 4 + -+ MUL alpha1, a0, t0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ LD y1, 1 * SIZE(Y1) -+ MUL alpha1, a2, t2 -+ LD y2, 2 * SIZE(Y1) -+ MUL alpha1, a3, t3 -+ LD y3, 3 * SIZE(Y1) ++$L60: ++ and M, 2, I ++ ble I, $L70 + -+ ADD1 y0, t0, y0 -+ MUL alpha3, a4, t0 -+ ADD2 y1, t1, y1 -+ MUL alpha3, a5, t1 -+ ADD1 y2, t2, y2 -+ MUL alpha3, a6, t2 -+ ADD2 y3, t3, y3 -+ MUL alpha3, a7, t3 ++#if defined(LT) || defined(RN) + -+ ADD1 y0, t0, y0 -+ MUL alpha2, a1, t0 -+ ADD2 y1, t1, y1 -+ MUL alpha2, a0, t1 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 + -+ ADD1 y2, t2, y2 -+ MUL alpha2, a3, t2 -+ ADD2 y3, t3, y3 -+ MUL alpha2, a2, t3 ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) + -+ ADD3 y0, t0, y0 -+ MUL alpha4, a5, t0 -+ ADD4 y1, t1, y1 -+ MUL alpha4, a4, t1 ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) + -+ ADD3 y2, t2, y2 -+ MUL alpha4, a7, t2 -+ ADD4 y3, t3, y3 -+ MUL alpha4, a6, t3 ++ ble KK, $L68 + -+ ADD3 y0, t0, y0 -+ ADD4 y1, t1, y1 -+ ADD3 y2, t2, y2 -+ ADD4 y3, t3, y3 ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ST y0, 0 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y1, 1 * SIZE(Y1) -+ ldi A2, 4 * SIZE(A2) ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO + -+ ST y2, 2 * SIZE(Y1) -+ unop -+ ST y3, 3 * SIZE(Y1) -+ ldi Y1, 4 * SIZE(Y1) -+ .align 4 ++ subl K, KK, TMP1 + -+$L17: -+ blbc M, $L18 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) + -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) + -+ MUL alpha1, a0, t0 -+ MUL alpha1, a1, t1 ++ ble TMP1, $L68 + -+ ADD1 y0, t0, y0 -+ MUL alpha3, a2, t0 -+ ADD2 y1, t1, y1 -+ MUL alpha3, a3, t1 ++ ble L, $L65 ++#endif ++ .align 4 + -+ ADD1 y0, t0, y0 -+ MUL alpha2, a1, t0 -+ ADD2 y1, t1, y1 -+ MUL alpha2, a0, t1 ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop + -+ ADD3 y0, t0, y0 -+ MUL alpha4, a3, t0 -+ ADD4 y1, t1, y1 -+ MUL alpha4, a2, t1 ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) + -+ ADD3 y0, t0, y0 -+ ADD4 y1, t1, y1 ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) + -+ ST y0, 0 * SIZE(Y1) -+ ST y1, 1 * SIZE(Y1) -+ .align 4 ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) + -+$L18: -+ ldi J, -1(J) -+ bgt J, $L11 -+ .align 4 ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) + -+$L20: -+ blbc N, $L990 ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) + -+ LD alpha1, 0 * SIZE(X) -+ LD alpha2, 1 * SIZE(X) ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) + -+ MUL alpha_r, alpha1, y0 -+ MUL alpha_r, alpha2, y1 ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop + -+ MUL alpha_i, alpha2, t0 -+ mov A, A1 -+ MUL alpha_i, alpha1, t1 -+ mov Y, Y1 ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 + -+#ifndef XCONJ -+ SUB y0, t0, alpha1 -+ ADD y1, t1, alpha2 ++$L65: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 +#else -+ ADD y0, t0, alpha1 -+ SUB y1, t1, alpha2 ++ blbs TMP1, $L67 +#endif ++ .align 4 + -+ sra M, 2, I -+ ble I, $L25 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) + -+ LD y0, 0 * SIZE(Y1) -+ LD y1, 1 * SIZE(Y1) -+ LD y2, 2 * SIZE(Y1) -+ LD y3, 3 * SIZE(Y1) ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) + -+ MUL alpha1, a0, t0 -+ LD a4, 4 * SIZE(A1) -+ MUL alpha1, a1, t1 -+ LD a5, 5 * SIZE(A1) -+ MUL alpha1, a2, t2 -+ LD a6, 6 * SIZE(A1) -+ MUL alpha1, a3, t3 -+ LD a7, 7 * SIZE(A1) ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 + -+ ADD1 y0, t0, y0 -+ unop -+ MUL alpha2, a1, t0 -+ LD a1, 9 * SIZE(A1) ++$L67: ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD c05, t3, c05 ++ MUL a1, b2, t3 + -+ ADD2 y1, t1, y1 -+ unop -+ MUL alpha2, a0, t1 -+ LD a0, 8 * SIZE(A1) ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) + -+ ADD1 y2, t2, y2 -+ unop -+ MUL alpha2, a3, t2 -+ LD a3, 11 * SIZE(A1) ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c05, t3, c05 ++ ADD c06, t4, c06 ++ .align 4 + -+ ADD2 y3, t3, y3 -+ unop -+ MUL alpha2, a2, t3 -+ LD a2, 10 * SIZE(A1) ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ ADD3 y0, t0, y0 -+ unop -+ LD y4, 4 * SIZE(Y1) -+ MUL alpha1, a4, t0 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++#endif + -+ ADD4 y1, t1, y1 -+ unop -+ LD y5, 5 * SIZE(Y1) -+ MUL alpha1, a5, t1 ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ ADD3 y2, t2, y2 -+ LD y6, 6 * SIZE(Y1) -+ MUL alpha1, a6, t2 -+ ldi I, -1(I) ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 + -+ ADD4 y3, t3, y3 -+ LD y7, 7 * SIZE(Y1) -+ MUL alpha1, a7, t3 -+ ble I, $L23 -+ .align 4 ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 + -+$L22: -+ ADD1 y4, t0, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a5, t0 -+ LD a5, 13 * SIZE(A1) ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 + -+ ADD2 y5, t1, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a4, t1 -+ LD a4, 12 * SIZE(A1) ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif + -+ ADD1 y6, t2, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a7, t2 -+ LD a7, 15 * SIZE(A1) ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) + -+ ADD2 y7, t3, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a6, t3 -+ LD a6, 14 * SIZE(A1) ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 + -+ ADD3 y4, t0, y4 -+ LD y0, 8 * SIZE(Y1) -+ MUL alpha1, a0, t0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 + -+ ADD4 y5, t1, y5 -+ LD y1, 9 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ ldi I, -1(I) ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 + -+ ADD3 y6, t2, y6 -+ LD y2, 10 * SIZE(Y1) -+ MUL alpha1, a2, t2 -+ unop ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++#endif + -+ ADD4 y7, t3, y7 -+ LD y3, 11 * SIZE(Y1) -+ MUL alpha1, a3, t3 -+ unop ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) + -+ ADD1 y0, t0, y0 -+ ST y4, 4 * SIZE(Y1) -+ MUL alpha2, a1, t0 -+ LD a1, 17 * SIZE(A1) ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 + -+ ADD2 y1, t1, y1 -+ ST y5, 5 * SIZE(Y1) -+ MUL alpha2, a0, t1 -+ LD a0, 16 * SIZE(A1) ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 + -+ ADD1 y2, t2, y2 -+ ST y6, 6 * SIZE(Y1) -+ MUL alpha2, a3, t2 -+ LD a3, 19 * SIZE(A1) ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 + -+ ADD2 y3, t3, y3 -+ ST y7, 7 * SIZE(Y1) -+ MUL alpha2, a2, t3 -+ LD a2, 18 * SIZE(A1) ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++#endif + -+ ADD3 y0, t0, y0 -+ LD y4, 12 * SIZE(Y1) -+ MUL alpha1, a4, t0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ ADD4 y1, t1, y1 -+ LD y5, 13 * SIZE(Y1) -+ MUL alpha1, a5, t1 -+ ldi A1, 8 * SIZE(A1) ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 + -+ ADD3 y2, t2, y2 -+ LD y6, 14 * SIZE(Y1) -+ MUL alpha1, a6, t2 -+ ldi Y1, 8 * SIZE(Y1) ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 + -+ ADD4 y3, t3, y3 -+ LD y7, 7 * SIZE(Y1) -+ MUL alpha1, a7, t3 -+ bgt I, $L22 -+ .align 4 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 + -+$L23: -+ ADD1 y4, t0, y4 -+ ST y0, 0 * SIZE(Y1) -+ MUL alpha2, a5, t0 -+ unop ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif + -+ ADD2 y5, t1, y5 -+ ST y1, 1 * SIZE(Y1) -+ MUL alpha2, a4, t1 -+ unop ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif + -+ ADD1 y6, t2, y6 -+ ST y2, 2 * SIZE(Y1) -+ MUL alpha2, a7, t2 -+ unop ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif + -+ ADD2 y7, t3, y7 -+ ST y3, 3 * SIZE(Y1) -+ MUL alpha2, a6, t3 -+ unop ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) + -+ ADD3 y4, t0, y4 -+ ADD4 y5, t1, y5 -+ ADD3 y6, t2, y6 -+ ADD4 y7, t3, y7 ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif + -+ ST y4, 4 * SIZE(Y1) -+ unop -+ ST y5, 5 * SIZE(Y1) -+ unop ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ST y6, 6 * SIZE(Y1) -+ ldi A1, 8 * SIZE(A1) -+ ST y7, 7 * SIZE(Y1) -+ ldi Y1, 8 * SIZE(Y1) -+ .align 4 ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+$L25: -+ and M, 2, I -+ ble I, $L27 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 2 * SIZE(A1) -+ LD a3, 3 * SIZE(A1) ++#ifdef LT ++ addl KK, 2, KK ++#endif + -+ MUL alpha1, a0, t0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ LD y1, 1 * SIZE(Y1) -+ MUL alpha1, a2, t2 -+ LD y2, 2 * SIZE(Y1) -+ MUL alpha1, a3, t3 -+ LD y3, 3 * SIZE(Y1) ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 + -+ ADD1 y0, t0, y0 -+ MUL alpha2, a1, t0 -+ ADD2 y1, t1, y1 -+ MUL alpha2, a0, t1 -+ ADD1 y2, t2, y2 -+ MUL alpha2, a3, t2 -+ ADD2 y3, t3, y3 -+ MUL alpha2, a2, t3 ++$L70: ++ and M, 1, I ++ ble I, $L79 + -+ ADD3 y0, t0, y0 -+ ADD4 y1, t1, y1 -+ ADD3 y2, t2, y2 -+ ADD4 y3, t3, y3 ++#if defined(LT) || defined(RN) + -+ ST y0, 0 * SIZE(Y1) -+ ST y1, 1 * SIZE(Y1) + -+ ST y2, 2 * SIZE(Y1) -+ ldi A1, 4 * SIZE(A1) -+ ST y3, 3 * SIZE(Y1) -+ ldi Y1, 4 * SIZE(Y1) -+ .align 4 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+$L27: -+ blbc M, $L990 ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) ++ ldi L, -2(KK) + -+ MUL alpha1, a0, t0 -+ LD y0, 0 * SIZE(Y1) -+ MUL alpha1, a1, t1 -+ LD y1, 1 * SIZE(Y1) ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) + -+ ADD1 y0, t0, y0 -+ MUL alpha2, a1, t0 -+ ADD2 y1, t1, y1 -+ MUL alpha2, a0, t1 ++ ble KK, $L78 + -+ ADD3 y0, t0, y0 -+ ADD4 y1, t1, y1 ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ST y0, 0 * SIZE(Y1) -+ ST y1, 1 * SIZE(Y1) -+ .align 4 ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO + -+$L990: -+ cmpeq INCY, 2 * SIZE, $0 -+ bne $0, $L999 ++ subl K, KK, TMP1 + -+ mov BUFFER, Y1 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ sra M, 2, I -+ ble I, $L995 -+ .align 4 ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 + -+$L992: -+ LD a0, 0 * SIZE(BUFFER) -+ LD a1, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a2, 0 * SIZE(BUFFER) -+ LD a3, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER ++ ldi L, -2(TMP1) + -+ LD y0, 0 * SIZE(Y) -+ LD y1, 1 * SIZE(Y) -+ LD y2, 2 * SIZE(Y) -+ LD y3, 3 * SIZE(Y) ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) + -+ LD a4, 0 * SIZE(BUFFER) -+ LD a5, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ LD a6, 0 * SIZE(BUFFER) -+ LD a7, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER ++ ble TMP1, $L78 + -+ LD y4, 4 * SIZE(Y) -+ LD y5, 5 * SIZE(Y) -+ LD y6, 6 * SIZE(Y) -+ LD y7, 7 * SIZE(Y) ++ ble L, $L75 ++#endif ++ .align 4 + -+ ADD a0, y0, a0 -+ ADD a1, y1, a1 -+ ADD a2, y2, a2 -+ ADD a3, y3, a3 ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) + -+ ST a0, 0 * SIZE(Y1) -+ ADD a4, y4, a4 -+ ST a1, 1 * SIZE(Y1) -+ ADD a5, y5, a5 -+ addl Y1, INCY, Y1 ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) + -+ ST a2, 0 * SIZE(Y1) -+ ADD a6, y6, a6 -+ ST a3, 1 * SIZE(Y1) -+ ADD a7, y7, a7 -+ addl Y1, INCY, Y1 ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) + -+ ST a4, 0 * SIZE(Y1) -+ ST a5, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ ST a6, 0 * SIZE(Y1) -+ ST a7, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) + -+ ldi I, -1(I) -+ ldi Y, 8 * SIZE(Y) -+ bgt I, $L992 ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 + .align 4 + -+$L995: -+ and M, 3, I -+ ble I, $L999 ++$L75: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif + .align 4 + -+$L996: -+ LD a0, 0 * SIZE(BUFFER) -+ LD a1, 1 * SIZE(BUFFER) -+ addl BUFFER, INCY, BUFFER -+ -+ LD y0, 0 * SIZE(Y) -+ LD y1, 1 * SIZE(Y) -+ ldi Y, 2 * SIZE(Y) -+ -+ ADD a0, y0, a0 -+ ADD a1, y1, a1 -+ -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) + -+ ldi I, -1(I) -+ bgt I, $L996 ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) + .align 4 + -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/zgemv_t.S b/kernel/sw_64/zgemv_t.S -new file mode 100644 -index 0000000..bf31cb4 ---- /dev/null -+++ b/kernel/sw_64/zgemv_t.S -@@ -0,0 +1,1047 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++$L77: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++ ADD c01, c02, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, c05 ++ ldi BO, 2 * SIZE(BO) + -+#define STACKSIZE 64 -+#define PREFETCHSIZE 32 ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 + -+#define M $16 -+#define N $17 -+#define A $21 -+#define LDA $18 ++ .align 4 + -+#define X $19 -+#define INCX $20 -+#define Y $22 -+#define INCY $23 ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif + -+#define BUFFER $24 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) + -+#define I $25 -+#define J $27 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) + -+#define X1 $3 -+#define Y1 $4 -+#define A1 $5 -+#define A2 $6 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#endif + -+#define alpha_r $f19 -+#define alpha_i $f20 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) + -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++#endif + -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) + -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f21 ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c05, c05 ++#endif + -+#define a0 $f22 -+#define a1 $f23 -+#define a2 $f24 -+#define a3 $f25 -+#define a4 $f26 -+#define a5 $f27 -+#define a6 $f28 -+#define a7 $f29 ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+#define a8 $f2 -+#define a9 $f3 -+#define a10 $f4 -+#define a11 $f5 -+#define a12 $f6 -+#define a13 $f7 -+#define a14 $f8 -+#define a15 $f9 ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+#if !defined(CONJ) && !defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 SUB -+#define ADD4 ADD -+#elif !defined(CONJ) && defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 ADD -+#define ADD4 SUB -+#elif defined(CONJ) && !defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) +#else -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 SUB -+#define ADD4 SUB ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) +#endif + -+ PROLOGUE -+ -+ ldi $sp, -STACKSIZE($sp) -+ ldl LDA, 0 + STACKSIZE($sp) -+ ldl X, 8 + STACKSIZE($sp) -+ ldl INCX, 16 + STACKSIZE($sp) -+ ldl Y, 24 + STACKSIZE($sp) -+ ldl INCY, 32 + STACKSIZE($sp) -+ ldl BUFFER, 40 + STACKSIZE($sp) ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif + -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) + -+ PROFCODE ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ cmple M, 0, $0 -+ sll INCX, ZBASE_SHIFT, INCX -+ cmple N, 0, $1 -+ sll INCY, ZBASE_SHIFT, INCY ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ or $0, $1, $0 -+ bne $0, $L999 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ cmpeq INCX, 2 * SIZE, $0 -+ mov X, X1 -+ sll LDA, ZBASE_SHIFT,LDA -+ bne $0, $L10 ++#ifdef LT ++ addl KK, 1, KK ++#endif + -+ sra M, 2, I -+ mov BUFFER, Y1 -+ mov BUFFER, X -+ ble I, $L05 ++#ifdef LN ++ subl KK, 1, KK ++#endif + .align 4 + -+$L02: -+ fillcs (PREFETCHSIZE + 0) * SIZE(X1) -+ ldi I, -1(I) ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif + -+ LD a0, 0 * SIZE(X1) -+ LD a1, 1 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a2, 0 * SIZE(X1) -+ LD a3, 1 * SIZE(X1) -+ addl X1, INCX, X1 ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif + -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) -+ ST a2, 2 * SIZE(Y1) -+ ST a3, 3 * SIZE(Y1) ++#ifdef RN ++ addl KK, 2, KK ++#endif + -+ LD a4, 0 * SIZE(X1) -+ LD a5, 1 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a6, 0 * SIZE(X1) -+ LD a7, 1 * SIZE(X1) -+ addl X1, INCX, X1 ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 + -+ ST a4, 4 * SIZE(Y1) -+ ST a5, 5 * SIZE(Y1) -+ ST a6, 6 * SIZE(Y1) -+ ST a7, 7 * SIZE(Y1) ++$L80: ++ and N, 1, J ++ ble J, $L999 + -+ ldi Y1, 8 * SIZE(Y1) -+ bgt I, $L02 -+ .align 4 ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B + -+$L05: -+ and M, 3, I -+ ble I, $L10 -+ .align 4 ++ subl C, LDC, C ++#endif + -+$L06: -+ LD a0, 0 * SIZE(X1) -+ LD a1, 1 * SIZE(X1) -+ addl X1, INCX, X1 ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif + -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) -+ ldi Y1, 2 * SIZE(Y1) ++#ifdef LN ++ addl M, OFFSET, KK ++#endif + -+ ldi I, -1(I) -+ bgt I, $L06 ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 + .align 4 + -+$L10: -+ mov Y, Y1 -+ fclr t0 -+ unop -+ fclr t1 ++$L91: ++#if defined(LT) || defined(RN) + -+ sra N, 1, J ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) + fclr t2 ++ LD a3, 2 * SIZE(AO) + fclr t3 -+ ble J, $L20 -+ .align 4 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+$L11: -+ mov A, A1 -+ fclr s0 -+ addl A, LDA, A2 -+ fclr s1 ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+ addl A2, LDA, A -+ unop -+ mov X, X1 -+ fillcs 3 * SIZE(Y) ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 + -+ sra M, 2, I -+ fclr s2 -+ fclr s3 -+ ble I, $L15 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) -+ LD a4, 2 * SIZE(A1) -+ LD a5, 3 * SIZE(A1) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO + -+ LD a8, 4 * SIZE(A1) -+ LD a9, 5 * SIZE(A1) -+ LD a10, 4 * SIZE(A2) -+ LD a11, 5 * SIZE(A2) -+ LD a12, 6 * SIZE(A1) -+ LD a13, 7 * SIZE(A1) -+ LD a14, 6 * SIZE(A2) -+ LD a15, 7 * SIZE(A2) ++ subl K, KK, TMP1 + -+ LD x0, 0 * SIZE(X1) -+ LD x1, 1 * SIZE(X1) -+ LD x2, 2 * SIZE(X1) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 + -+$L12: -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 ++ sra TMP1, 2, L + unop -+ MUL x0, a0, t0 -+ LD x3, 3 * SIZE(X1) ++ ble L, $L95 ++#endif ++ .align 5 + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) -+ MUL x0, a1, t1 ++$L92: ++ ADD c01, t1, c01 + unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x0, a2, t2 ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 + unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 ++ ADD c01, t1, c01 + unop -+ MUL x0, a3, t3 -+ LD x0, 4 * SIZE(X1) ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 ++ ADD c02, t2, c02 + unop -+ MUL x1, a1, t0 -+ LD a1, 9 * SIZE(A1) ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) + -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 ++ ADD c03, t3, c03 + unop -+ MUL x1, a0, t1 -+ LD a0, 8 * SIZE(A1) ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 ++ ADD c01, t1, c01 + unop -+ MUL x1, a3, t2 -+ LD a3, 9 * SIZE(A2) ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) + -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 ++ ADD c02, t2, c02 + unop -+ MUL x1, a2, t3 -+ LD a2, 8 * SIZE(A2) ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 ++ ADD c03, t3, c03 + unop -+ MUL x2, a4, t0 -+ LD x1, 5 * SIZE(X1) ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x2, a5, t1 -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ MUL x2, a6, t2 ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) + -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x2, a7, t3 -+ LD x2, 6 * SIZE(X1) ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x3, a5, t0 -+ LD a5, 11 * SIZE(A1) ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) + -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ unop -+ MUL x3, a4, t1 -+ LD a4, 10 * SIZE(A1) ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x3, a7, t2 -+ LD a7, 11 * SIZE(A2) ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 + -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif + unop -+ MUL x3, a6, t3 -+ LD a6, 10 * SIZE(A2) ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 ++ ADD c03, t3, c03 + unop -+ MUL x0, a8, t0 -+ LD x3, 7 * SIZE(X1) ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a2, c04, t1 ++ SUB c03, t1, c03 ++ MUL a3, c04, t1 ++ SUB c02, t1, c02 ++ MUL a4, c04, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A2) -+ MUL x0, a9, t1 -+ unop ++ MUL b1, c03, c03 ++ MUL b2, c03, t1 ++ SUB c02, t1, c02 ++ MUL b3, c03, t1 ++ SUB c01, t1, c01 + -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ ldi I, -1(I) -+ MUL x0, a10, t2 -+ unop ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x0, a11, t3 -+ LD x0, 8 * SIZE(X1) ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x1, a9, t0 -+ LD a9, 13 * SIZE(A1) ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ unop -+ MUL x1, a8, t1 -+ LD a8, 12 * SIZE(A1) ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c01, t1 ++ SUB c03, t1, c03 ++ MUL a4, c01, t1 ++ SUB c04, t1, c04 + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 -+ ldi A1, 8 * SIZE(A1) -+ MUL x1, a11, t2 -+ LD a11, 13 * SIZE(A2) ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) + -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x1, a10, t3 -+ LD a10, 12 * SIZE(A2) ++ MUL b1, c02, c02 ++ MUL b2, c02, t1 ++ SUB c03, t1, c03 ++ MUL b3, c02, t1 ++ SUB c04, t1, c04 + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x2, a12, t0 -+ LD x1, 9 * SIZE(X1) ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(X1) -+ MUL x2, a13, t1 -+ ldi A2, 8 * SIZE(A2) ++ MUL a1, c03, c03 ++ MUL a2, c03, t1 ++ SUB c04, t1, c04 ++ MUL a3, c04, c04 ++#endif + -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x2, a14, t2 -+ unop ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) + -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x2, a15, t3 -+ LD x2, 10 * SIZE(X1) ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++#endif + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x3, a13, t0 -+ LD a13, 7 * SIZE(A1) ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif + -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ ldi X1, 8 * SIZE(X1) -+ MUL x3, a12, t1 -+ LD a12, 6 * SIZE(A1) ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x3, a15, t2 -+ LD a15, 7 * SIZE(A2) ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 -+ MUL x3, a14, t3 -+ LD a14, 6 * SIZE(A2) -+ bgt I, $L12 -+ .align 4 ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif + -+$L13: -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x0, a0, t0 -+ LD x3, 3 * SIZE(X1) ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x0, a1, t1 -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ MUL x0, a2, t2 ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x0, a3, t3 -+ LD x0, 4 * SIZE(X1) ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x1, a1, t0 -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x1, a0, t1 ++#ifdef LT ++ addl KK, 4, KK ++#endif + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x1, a3, t2 -+ unop ++#ifdef LN ++ subl KK, 4, KK ++#endif + -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 -+ ldi A1, 8 * SIZE(A1) -+ MUL x1, a2, t3 -+ LD x1, 5 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x2, a4, t0 -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x2, a5, t1 ++$L100: ++ and M, 2, I ++ ble I, $L110 + -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x2, a6, t2 -+ unop ++#if defined(LT) || defined(RN) + -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 -+ ldi A2, 8 * SIZE(A2) -+ MUL x2, a7, t3 -+ LD x2, 6 * SIZE(X1) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x3, a5, t0 -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x3, a4, t1 ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x3, a7, t2 -+ ldi X1, 8 * SIZE(X1) ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x3, a6, t3 -+ LD x3, -1 * SIZE(X1) ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x0, a8, t0 -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x0, a9, t1 ++ subl K, KK, TMP1 + -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ MUL x0, a10, t2 -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 -+ MUL x0, a11, t3 ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x1, a9, t0 -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x1, a8, t1 ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 -+ MUL x1, a11, t2 -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 -+ MUL x1, a10, t3 ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x2, a12, t0 -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x2, a13, t1 ++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ MUL x2, a14, t2 -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 -+ MUL x2, a15, t3 ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x3, a13, t0 -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x3, a12, t1 ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 -+ MUL x3, a15, t2 -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 -+ MUL x3, a14, t3 -+ .align 4 ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) + -+$L15: -+ and M, 3, I -+ ble I, $L18 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) + -+ LD x0, 0 * SIZE(X1) ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) + -+ ldi I, -1(I) -+ ble I, $L17 ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 + .align 4 + -+$L16: -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ ldi I, -1(I) -+ MUL x0, a0, t0 -+ LD x1, 1 * SIZE(X1) ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x0, a1, t1 -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ MUL x0, a2, t2 ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) + -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x0, a3, t3 -+ LD x0, 2 * SIZE(X1) ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ ldi A2, 2 * SIZE(A2) -+ MUL x1, a1, t0 -+ LD a1, 3 * SIZE(A1) ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 + -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ ldi X1, 2 * SIZE(X1) -+ MUL x1, a0, t1 -+ LD a0, 2 * SIZE(A1) ++$L108: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 -+ ldi A1, 2 * SIZE(A1) -+ MUL x1, a3, t2 -+ LD a3, 1 * SIZE(A2) ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 + -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 -+ MUL x1, a2, t3 -+ LD a2, 0 * SIZE(A2) -+ bgt I, $L16 -+ .align 4 ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif + -+$L17: -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x0, a0, t0 -+ LD x1, 1 * SIZE(X1) ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ unop -+ MUL x0, a1, t1 -+ unop ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) + -+ ADD3 s2, t2, $f30 -+ fmov $f30, s2 -+ MUL x0, a2, t2 -+ ADD4 s3, t3, $f30 -+ fmov $f30, s3 -+ MUL x0, a3, t3 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x1, a1, t0 -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x1, a0, t1 ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ ADD1 s2, t2, $f30 -+ fmov $f30, s2 -+ MUL x1, a3, t2 -+ ADD2 s3, t3, $f30 -+ fmov $f30, s3 -+ MUL x1, a2, t3 -+ .align 4 ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+$L18: -+ LD a0, 0 * SIZE(Y) -+ unop -+ LD a1, 1 * SIZE(Y) -+ addl Y, INCY, Y ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) + -+ LD a2, 0 * SIZE(Y) -+ unop -+ LD a3, 1 * SIZE(Y) -+ addl Y, INCY, Y ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c02, c02 ++#endif + -+ ADD3 s0, t0, a8 -+ ADD4 s1, t1, a9 -+ ADD3 s2, t2, a10 -+ ADD4 s3, t3, a11 ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) + -+ fmov a8, s0 -+ fmov a9, s1 -+ fmov a10, s2 -+ fmov a11, s3 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++#endif + -+ MUL alpha_r, s0, t0 -+ MUL alpha_r, s1, t1 -+ MUL alpha_r, s2, t2 -+ MUL alpha_r, s3, t3 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif + -+ ADD a0, t0, a8 -+ MUL alpha_i, s1, t0 -+ ADD a1, t1, a9 -+ MUL alpha_i, s0, t1 -+ ADD a2, t2, a10 -+ MUL alpha_i, s3, t2 -+ ADD a3, t3, a11 -+ MUL alpha_i, s2, t3 ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif + -+ SUB a8, t0, a0 -+ ADD a9, t1, a1 -+ SUB a10, t2, a2 -+ ADD a11, t3, a3 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) + -+ ST a0, 0 * SIZE(Y1) -+ fclr t0 -+ ST a1, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif + -+ ST a2, 0 * SIZE(Y1) + fclr t1 -+ ST a3, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 -+ + fclr t2 -+ ldi J, -1(J) + fclr t3 -+ bgt J, $L11 -+ .align 4 -+ -+$L20: -+ blbc N, $L999 -+ -+ mov A, A1 -+ fclr s0 -+ fclr s1 -+ mov X, X1 ++ fclr t4 + -+ sra M, 2, I -+ fclr s2 -+ fclr s3 -+ ble I, $L25 ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a4, 2 * SIZE(A1) -+ LD a5, 3 * SIZE(A1) -+ LD a8, 4 * SIZE(A1) -+ LD a9, 5 * SIZE(A1) -+ LD a12, 6 * SIZE(A1) -+ LD a13, 7 * SIZE(A1) ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ LD x0, 0 * SIZE(X1) -+ LD x1, 1 * SIZE(X1) -+ LD x2, 2 * SIZE(X1) ++#ifdef LT ++ addl KK, 2, KK ++#endif + -+ ldi I, -1(I) -+ ble I, $L23 ++#ifdef LN ++ subl KK, 2, KK ++#endif + .align 4 + -+$L22: -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) -+ MUL x0, a0, t0 -+ LD x3, 3 * SIZE(X1) -+ -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ unop -+ MUL x0, a1, t1 -+ LD x0, 4 * SIZE(X1) -+ -+ ADD1 s2, t0, $f30 -+ fmov $f30, s2 -+ ldi I, -1(I) -+ MUL x1, a1, t0 -+ LD a1, 9 * SIZE(A1) -+ -+ ADD2 s3, t1, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x1, a0, t1 -+ LD a0, 8 * SIZE(A1) ++$L110: ++ and M, 1, I ++ ble I, $L119 + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x2, a4, t0 -+ LD x1, 5 * SIZE(X1) ++#if defined(LT) || defined(RN) + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ unop -+ MUL x2, a5, t1 -+ LD x2, 6 * SIZE(X1) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD1 s2, t0, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x3, a5, t0 -+ LD a5, 11 * SIZE(A1) ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+ ADD2 s3, t1, $f30 -+ fmov $f30, s3 ++ sra KK, 2, L ++ mov B, BO + unop -+ MUL x3, a4, t1 -+ LD a4, 10 * SIZE(A1) ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x0, a8, t0 -+ LD x3, 7 * SIZE(X1) ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ unop -+ MUL x0, a9, t1 -+ LD x0, 8 * SIZE(X1) ++ subl K, KK, TMP1 + -+ ADD1 s2, t0, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x1, a9, t0 -+ LD a9, 13 * SIZE(A1) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD2 s3, t1, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x1, a8, t1 -+ LD a8, 12 * SIZE(A1) ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 ++ sra TMP1, 2, L + unop -+ MUL x2, a12, t0 -+ LD x1, 9 * SIZE(X1) ++ ble L, $L115 ++#endif ++ .align 4 + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ ldi A1, 8 * SIZE(A1) -+ MUL x2, a13, t1 -+ LD x2, 10 * SIZE(X1) ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+ ADD1 s2, t0, $f30 -+ fmov $f30, s2 -+ ldi X1, 8 * SIZE(X1) -+ MUL x3, a13, t0 -+ LD a13, 7 * SIZE(A1) ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) + -+ ADD2 s3, t1, $f30 -+ fmov $f30, s3 -+ MUL x3, a12, t1 -+ LD a12, 6 * SIZE(A1) -+ bgt I, $L22 -+ .align 4 ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) + -+$L23: -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x0, a0, t0 -+ LD x3, 3 * SIZE(X1) ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ unop -+ MUL x0, a1, t1 -+ LD x0, 4 * SIZE(X1) ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 + -+ ADD1 s2, t0, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x1, a1, t0 -+ ldi A1, 8 * SIZE(A1) ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 + -+ ADD2 s3, t1, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x1, a0, t1 -+ LD x1, 5 * SIZE(X1) ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x2, a4, t0 -+ unop ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ unop -+ MUL x2, a5, t1 -+ LD x2, 6 * SIZE(X1) ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 + -+ ADD1 s2, t0, $f30 -+ fmov $f30, s2 -+ unop -+ MUL x3, a5, t0 -+ ldi X1, 8 * SIZE(X1) ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 + -+ ADD2 s3, t1, $f30 -+ fmov $f30, s3 -+ unop -+ MUL x3, a4, t1 -+ LD x3, -1 * SIZE(X1) ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x0, a8, t0 -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x0, a9, t1 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) + -+ ADD1 s2, t0, $f30 -+ fmov $f30, s2 -+ MUL x1, a9, t0 -+ ADD2 s3, t1, $f30 -+ fmov $f30, s3 -+ MUL x1, a8, t1 ++ SUB a1, c01, c01 ++#else ++ LD a1, 0 * SIZE(AO) + -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x2, a12, t0 -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x2, a13, t1 ++ SUB a1, c01, c01 ++#endif + -+ ADD1 s2, t0, $f30 -+ fmov $f30, s2 -+ MUL x3, a13, t0 -+ ADD2 s3, t1, $f30 -+ fmov $f30, s3 -+ MUL x3, a12, t1 -+ .align 4 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) + -+$L25: -+ and M, 3, I -+ ble I, $L28 ++ MUL a1, c01, c01 ++#endif + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) + -+ LD x0, 0 * SIZE(X1) ++ MUL a1, c01, c01 ++#endif + -+ ldi I, -1(I) -+ ble I, $L27 -+ .align 4 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif + -+$L26: -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ ldi A1, 2 * SIZE(A1) -+ MUL x0, a0, t0 -+ LD x1, 1 * SIZE(X1) ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ ldi I, -1(I) -+ MUL x0, a1, t1 -+ LD x0, 2 * SIZE(X1) ++ ST c01, 0 * SIZE(C1) + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ ldi X1, 2 * SIZE(X1) -+ MUL x1, a1, t0 -+ LD a1, 1 * SIZE(A1) ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif + -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x1, a0, t1 -+ LD a0, 0 * SIZE(A1) -+ bgt I, $L26 -+ .align 4 ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif + -+$L27: -+ ADD3 s0, t0, $f30 -+ fmov $f30, s0 -+ unop -+ MUL x0, a0, t0 -+ LD x1, 1 * SIZE(X1) ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif + -+ ADD4 s1, t1, $f30 -+ fmov $f30, s1 -+ unop -+ MUL x0, a1, t1 -+ unop ++#ifdef LT ++ addl KK, 1, KK ++#endif + -+ ADD1 s0, t0, $f30 -+ fmov $f30, s0 -+ MUL x1, a1, t0 -+ ADD2 s1, t1, $f30 -+ fmov $f30, s1 -+ MUL x1, a0, t1 ++#ifdef LN ++ subl KK, 1, KK ++#endif + .align 4 + -+$L28: -+ LD a0, 0 * SIZE(Y) -+ LD a1, 1 * SIZE(Y) -+ -+ ADD3 s0, t0, a8 -+ ADD4 s1, t1, a9 -+ ADD3 s2, t2, a10 -+ ADD4 s3, t3, a11 -+ -+ ADD a8, a10, s0 -+ ADD a9, a11, s1 -+ -+ MUL alpha_r, s0, t0 -+ MUL alpha_r, s1, t1 ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif + -+ ADD a0, t0, a8 -+ MUL alpha_i, s1, t0 -+ ADD a1, t1, a9 -+ MUL alpha_i, s0, t1 ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif + -+ SUB a8, t0, a0 -+ ADD a9, t1, a1 ++#ifdef RN ++ addl KK, 1, KK ++#endif + -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) ++#ifdef RT ++ subl KK, 1, KK ++#endif + .align 4 + +$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ -+ ldi $sp, STACKSIZE($sp) ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) + ret + EPILOGUE -diff --git a/kernel/sw_64/zgemv_t.S.bak b/kernel/sw_64/zgemv_t.S.bak +diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S b/kernel/sw_64/trsm_kernel_4x4_RT.S new file mode 100644 -index 0000000..f857fb7 +index 000000000..88d1a23a5 --- /dev/null -+++ b/kernel/sw_64/zgemv_t.S.bak -@@ -0,0 +1,922 @@ ++++ b/kernel/sw_64/trsm_kernel_4x4_RT.S +@@ -0,0 +1,4059 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -79095,4408 +20969,4031 @@ index 0000000..f857fb7 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#define STACKSIZE 64 -+#define PREFETCHSIZE 32 ++ ++#if !defined(SW8A) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW8A ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ ++#define STACKSIZE 80 + +#define M $16 +#define N $17 -+#define A $21 -+#define LDA $18 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 + -+#define X $19 -+#define INCX $20 -+#define Y $22 -+#define INCY $23 ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 + -+#define BUFFER $24 ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 + -+#define I $25 -+#define J $27 ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 + -+#define X1 $3 -+#define Y1 $4 -+#define A1 $5 -+#define A2 $6 ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 + -+#define alpha_r $f19 -+#define alpha_i $f20 ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 + -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 + -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 ++#define alpha $f30 + -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f21 ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 + -+#define a0 $f22 -+#define a1 $f23 -+#define a2 $f24 -+#define a3 $f25 -+#define a4 $f26 -+#define a5 $f27 -+#define a6 $f28 -+#define a7 $f29 ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 + -+#define a8 $f2 -+#define a9 $f3 -+#define a10 $f4 -+#define a11 $f5 -+#define a12 $f6 -+#define a13 $f7 -+#define a14 $f8 -+#define a15 $f9 ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 + -+#if !defined(CONJ) && !defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 SUB -+#define ADD4 ADD -+#elif !defined(CONJ) && defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 ADD -+#define ADD4 SUB -+#elif defined(CONJ) && !defined(XCONJ) -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#else -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 SUB -+#define ADD4 SUB -+#endif ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 + + PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 + -+ ldi $sp, -STACKSIZE($sp) -+ ldl LDA, 0 + STACKSIZE($sp) -+ ldl X, 8 + STACKSIZE($sp) -+ ldl INCX, 16 + STACKSIZE($sp) -+ ldl Y, 24 + STACKSIZE($sp) -+ ldl INCY, 32 + STACKSIZE($sp) -+ ldl BUFFER, 40 + STACKSIZE($sp) ++ ldi $sp, -STACKSIZE($sp) + -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) + -+ PROFCODE ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) + + cmple M, 0, $0 -+ sll INCX, ZBASE_SHIFT, INCX + cmple N, 0, $1 -+ sll INCY, ZBASE_SHIFT, INCY ++ cmple K, 0, $2 + + or $0, $1, $0 -+ bne $0, $L999 ++ or $0, $2, $0 ++ bne $0, $L999 + -+ cmpeq INCX, 2 * SIZE, $0 -+ mov X, X1 -+ sll LDA, ZBASE_SHIFT,LDA -+ bne $0, $L10 ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif + -+ sra M, 2, I -+ mov BUFFER, Y1 -+ mov BUFFER, X -+ ble I, $L05 -+ .align 4 ++#ifdef RN ++ negl OFFSET, KK ++#endif + -+$L02: -+ fillcs (PREFETCHSIZE + 0) * SIZE(X1) -+ ldi I, -1(I) ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B + -+ LD a0, 0 * SIZE(X1) -+ LD a1, 1 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a2, 0 * SIZE(X1) -+ LD a3, 1 * SIZE(X1) -+ addl X1, INCX, X1 ++ mull N, LDC, TMP1 ++ addl TMP1, C, C + -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) -+ ST a2, 2 * SIZE(Y1) -+ ST a3, 3 * SIZE(Y1) ++ subl N, OFFSET, KK ++#endif + -+ LD a4, 0 * SIZE(X1) -+ LD a5, 1 * SIZE(X1) -+ addl X1, INCX, X1 -+ LD a6, 0 * SIZE(X1) -+ LD a7, 1 * SIZE(X1) -+ addl X1, INCX, X1 ++ and N, 1, J ++ ble J, $L40 + -+ ST a4, 4 * SIZE(Y1) -+ ST a5, 5 * SIZE(Y1) -+ ST a6, 6 * SIZE(Y1) -+ ST a7, 7 * SIZE(Y1) ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B + -+ ldi Y1, 8 * SIZE(Y1) -+ bgt I, $L02 -+ .align 4 ++ subl C, LDC, C ++#endif + -+$L05: -+ and M, 3, I -+ ble I, $L10 -+ .align 4 ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif + -+$L06: -+ LD a0, 0 * SIZE(X1) -+ LD a1, 1 * SIZE(X1) -+ addl X1, INCX, X1 ++#ifdef LN ++ addl M, OFFSET, KK ++#endif + -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) -+ ldi Y1, 2 * SIZE(Y1) ++#ifdef LT ++ mov OFFSET, KK ++#endif + -+ ldi I, -1(I) -+ bgt I, $L06 ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 + .align 4 + -+$L10: -+ mov Y, Y1 -+ fclr t0 -+ unop -+ fclr t1 ++$L91: ++#if defined(LT) || defined(RN) + -+ sra N, 1, J ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) + fclr t2 ++ LD a3, 2 * SIZE(AO) + fclr t3 -+ ble J, $L20 -+ .align 4 -+ -+$L11: -+ mov A, A1 -+ fclr s0 -+ addl A, LDA, A2 -+ fclr s1 -+ -+ addl A2, LDA, A -+ unop -+ mov X, X1 -+ fillcs 3 * SIZE(Y) ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ sra M, 2, I -+ fclr s2 -+ fclr s3 -+ ble I, $L15 ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) -+ LD a4, 2 * SIZE(A1) -+ LD a5, 3 * SIZE(A1) -+ LD a6, 2 * SIZE(A2) -+ LD a7, 3 * SIZE(A2) ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 + -+ LD a8, 4 * SIZE(A1) -+ LD a9, 5 * SIZE(A1) -+ LD a10, 4 * SIZE(A2) -+ LD a11, 5 * SIZE(A2) -+ LD a12, 6 * SIZE(A1) -+ LD a13, 7 * SIZE(A1) -+ LD a14, 6 * SIZE(A2) -+ LD a15, 7 * SIZE(A2) ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ LD x0, 0 * SIZE(X1) -+ LD x1, 1 * SIZE(X1) -+ LD x2, 2 * SIZE(X1) ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO + -+ ldi I, -1(I) -+ ble I, $L13 -+ .align 4 ++ subl K, KK, TMP1 + -+$L12: -+ ADD3 s0, t0, s0 -+ unop -+ MUL x0, a0, t0 -+ LD x3, 3 * SIZE(X1) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD4 s1, t1, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) -+ MUL x0, a1, t1 -+ unop ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 + -+ ADD3 s2, t2, s2 -+ unop -+ MUL x0, a2, t2 ++ sra TMP1, 2, L + unop ++ ble L, $L95 ++#endif ++ .align 5 + -+ ADD4 s3, t3, s3 ++$L92: ++ ADD c01, t1, c01 + unop -+ MUL x0, a3, t3 -+ LD x0, 4 * SIZE(X1) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+ ADD1 s0, t0, s0 -+ unop -+ MUL x1, a1, t0 -+ LD a1, 9 * SIZE(A1) ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) + -+ ADD2 s1, t1, s1 ++ ADD c03, t3, c03 + unop -+ MUL x1, a0, t1 -+ LD a0, 8 * SIZE(A1) ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) + -+ ADD1 s2, t2, s2 -+ unop -+ MUL x1, a3, t2 -+ LD a3, 9 * SIZE(A2) ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+ ADD2 s3, t3, s3 ++ ADD c01, t1, c01 + unop -+ MUL x1, a2, t3 -+ LD a2, 8 * SIZE(A2) ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) + -+ ADD3 s0, t0, s0 ++ ADD c02, t2, c02 + unop -+ MUL x2, a4, t0 -+ LD x1, 5 * SIZE(X1) -+ -+ ADD4 s1, t1, s1 -+ MUL x2, a5, t1 -+ ADD3 s2, t2, s2 -+ MUL x2, a6, t2 ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) + -+ ADD4 s3, t3, s3 ++ ADD c03, t3, c03 + unop -+ MUL x2, a7, t3 -+ LD x2, 6 * SIZE(X1) ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) + -+ ADD1 s0, t0, s0 -+ unop -+ MUL x3, a5, t0 -+ LD a5, 11 * SIZE(A1) ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) + -+ ADD2 s1, t1, s1 ++ ADD c01, t1, c01 + unop -+ MUL x3, a4, t1 -+ LD a4, 10 * SIZE(A1) ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) + -+ ADD1 s2, t2, s2 ++ ADD c02, t2, c02 + unop -+ MUL x3, a7, t2 -+ LD a7, 11 * SIZE(A2) ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) + -+ ADD2 s3, t3, s3 ++ ADD c03, t3, c03 + unop -+ MUL x3, a6, t3 -+ LD a6, 10 * SIZE(A2) ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) + -+ ADD3 s0, t0, s0 -+ unop -+ MUL x0, a8, t0 -+ LD x3, 7 * SIZE(X1) ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) + -+ ADD4 s1, t1, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A2) -+ MUL x0, a9, t1 -+ unop ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) + -+ ADD3 s2, t2, s2 -+ ldi I, -1(I) -+ MUL x0, a10, t2 -+ unop ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) + -+ ADD4 s3, t3, s3 -+ unop -+ MUL x0, a11, t3 -+ LD x0, 8 * SIZE(X1) ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) + -+ ADD1 s0, t0, s0 -+ unop -+ MUL x1, a9, t0 -+ LD a9, 13 * SIZE(A1) ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 + -+ ADD2 s1, t1, s1 ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif + unop -+ MUL x1, a8, t1 -+ LD a8, 12 * SIZE(A1) ++ ble L, $L98 ++ .align 4 + -+ ADD1 s2, t2, s2 -+ ldi A1, 8 * SIZE(A1) -+ MUL x1, a11, t2 -+ LD a11, 13 * SIZE(A2) ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+ ADD2 s3, t3, s3 -+ unop -+ MUL x1, a10, t3 -+ LD a10, 12 * SIZE(A2) ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) + -+ ADD3 s0, t0, s0 ++ ADD c03, t3, c03 + unop -+ MUL x2, a12, t0 -+ LD x1, 9 * SIZE(X1) ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) + -+ ADD4 s1, t1, s1 -+ fillcs (PREFETCHSIZE + 0) * SIZE(X1) -+ MUL x2, a13, t1 -+ ldi A2, 8 * SIZE(A2) ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) + -+ ADD3 s2, t2, s2 -+ unop -+ MUL x2, a14, t2 -+ unop ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 + -+ ADD4 s3, t3, s3 -+ unop -+ MUL x2, a15, t3 -+ LD x2, 10 * SIZE(X1) ++$L98: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 + -+ ADD1 s0, t0, s0 -+ unop -+ MUL x3, a13, t0 -+ LD a13, 7 * SIZE(A1) ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif + -+ ADD2 s1, t1, s1 -+ ldi X1, 8 * SIZE(X1) -+ MUL x3, a12, t1 -+ LD a12, 6 * SIZE(A1) ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ ADD1 s2, t2, s2 -+ unop -+ MUL x3, a15, t2 -+ LD a15, 7 * SIZE(A2) ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ ADD2 s3, t3, s3 -+ MUL x3, a14, t3 -+ LD a14, 6 * SIZE(A2) -+ bgt I, $L12 -+ .align 4 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif + -+$L13: -+ ADD3 s0, t0, s0 -+ unop -+ MUL x0, a0, t0 -+ LD x3, 3 * SIZE(X1) ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) + -+ ADD4 s1, t1, s1 -+ MUL x0, a1, t1 -+ ADD3 s2, t2, s2 -+ MUL x0, a2, t2 ++ MUL a1, c04, c04 ++ MUL a2, c04, t1 ++ SUB c03, t1, c03 ++ MUL a3, c04, t1 ++ SUB c02, t1, c02 ++ MUL a4, c04, t1 ++ SUB c01, t1, c01 + -+ ADD4 s3, t3, s3 -+ unop -+ MUL x0, a3, t3 -+ LD x0, 4 * SIZE(X1) ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) + -+ ADD1 s0, t0, s0 -+ MUL x1, a1, t0 -+ ADD2 s1, t1, s1 -+ MUL x1, a0, t1 ++ MUL b1, c03, c03 ++ MUL b2, c03, t1 ++ SUB c02, t1, c02 ++ MUL b3, c03, t1 ++ SUB c01, t1, c01 + -+ ADD1 s2, t2, s2 -+ unop -+ MUL x1, a3, t2 -+ unop ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ ADD2 s3, t3, s3 -+ ldi A1, 8 * SIZE(A1) -+ MUL x1, a2, t3 -+ LD x1, 5 * SIZE(X1) ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+ ADD3 s0, t0, s0 -+ MUL x2, a4, t0 -+ ADD4 s1, t1, s1 -+ MUL x2, a5, t1 ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ ADD3 s2, t2, s2 -+ unop -+ MUL x2, a6, t2 -+ unop ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c01, t1 ++ SUB c03, t1, c03 ++ MUL a4, c01, t1 ++ SUB c04, t1, c04 + -+ ADD4 s3, t3, s3 -+ ldi A2, 8 * SIZE(A2) -+ MUL x2, a7, t3 -+ LD x2, 6 * SIZE(X1) ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) + -+ ADD1 s0, t0, s0 -+ MUL x3, a5, t0 -+ ADD2 s1, t1, s1 -+ MUL x3, a4, t1 ++ MUL b1, c02, c02 ++ MUL b2, c02, t1 ++ SUB c03, t1, c03 ++ MUL b3, c02, t1 ++ SUB c04, t1, c04 + -+ ADD1 s2, t2, s2 -+ unop -+ MUL x3, a7, t2 -+ ldi X1, 8 * SIZE(X1) ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) + -+ ADD2 s3, t3, s3 -+ unop -+ MUL x3, a6, t3 -+ LD x3, -1 * SIZE(X1) ++ MUL a1, c03, c03 ++ MUL a2, c03, t1 ++ SUB c04, t1, c04 ++ MUL a3, c04, c04 ++#endif + -+ ADD3 s0, t0, s0 -+ MUL x0, a8, t0 -+ ADD4 s1, t1, s1 -+ MUL x0, a9, t1 ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) + -+ ADD3 s2, t2, s2 -+ MUL x0, a10, t2 -+ ADD4 s3, t3, s3 -+ MUL x0, a11, t3 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++#endif + -+ ADD1 s0, t0, s0 -+ MUL x1, a9, t0 -+ ADD2 s1, t1, s1 -+ MUL x1, a8, t1 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif + -+ ADD1 s2, t2, s2 -+ MUL x1, a11, t2 -+ ADD2 s3, t3, s3 -+ MUL x1, a10, t3 ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif + -+ ADD3 s0, t0, s0 -+ MUL x2, a12, t0 -+ ADD4 s1, t1, s1 -+ MUL x2, a13, t1 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + -+ ADD3 s2, t2, s2 -+ MUL x2, a14, t2 -+ ADD4 s3, t3, s3 -+ MUL x2, a15, t3 ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif + -+ ADD1 s0, t0, s0 -+ MUL x3, a13, t0 -+ ADD2 s1, t1, s1 -+ MUL x3, a12, t1 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ADD1 s2, t2, s2 -+ MUL x3, a15, t2 -+ ADD2 s3, t3, s3 -+ MUL x3, a14, t3 -+ .align 4 ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+$L15: -+ and M, 3, I -+ ble I, $L18 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a2, 0 * SIZE(A2) -+ LD a3, 1 * SIZE(A2) ++#ifdef LT ++ addl KK, 4, KK ++#endif + -+ LD x0, 0 * SIZE(X1) ++#ifdef LN ++ subl KK, 4, KK ++#endif + -+ ldi I, -1(I) -+ ble I, $L17 ++ ldi I, -1(I) ++ bgt I, $L91 + .align 4 + -+$L16: -+ ADD3 s0, t0, s0 -+ ldi I, -1(I) -+ MUL x0, a0, t0 -+ LD x1, 1 * SIZE(X1) ++$L100: ++ and M, 2, I ++ ble I, $L110 + -+ ADD4 s1, t1, s1 -+ MUL x0, a1, t1 -+ ADD3 s2, t2, s2 -+ MUL x0, a2, t2 ++#if defined(LT) || defined(RN) + -+ ADD4 s3, t3, s3 -+ unop -+ MUL x0, a3, t3 -+ LD x0, 2 * SIZE(X1) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD1 s0, t0, s0 -+ ldi A2, 2 * SIZE(A2) -+ MUL x1, a1, t0 -+ LD a1, 3 * SIZE(A1) ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+ ADD2 s1, t1, s1 -+ ldi X1, 2 * SIZE(X1) -+ MUL x1, a0, t1 -+ LD a0, 2 * SIZE(A1) ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ADD1 s2, t2, s2 -+ ldi A1, 2 * SIZE(A1) -+ MUL x1, a3, t2 -+ LD a3, 1 * SIZE(A2) ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO + -+ ADD2 s3, t3, s3 -+ MUL x1, a2, t3 -+ LD a2, 0 * SIZE(A2) -+ bgt I, $L16 -+ .align 4 ++ subl K, KK, TMP1 + -+$L17: -+ ADD3 s0, t0, s0 -+ unop -+ MUL x0, a0, t0 -+ LD x1, 1 * SIZE(X1) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD4 s1, t1, s1 -+ unop -+ MUL x0, a1, t1 -+ unop ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 + -+ ADD3 s2, t2, s2 -+ MUL x0, a2, t2 -+ ADD4 s3, t3, s3 -+ MUL x0, a3, t3 ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 + -+ ADD1 s0, t0, s0 -+ MUL x1, a1, t0 -+ ADD2 s1, t1, s1 -+ MUL x1, a0, t1 ++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) + -+ ADD1 s2, t2, s2 -+ MUL x1, a3, t2 -+ ADD2 s3, t3, s3 -+ MUL x1, a2, t3 -+ .align 4 ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+$L18: -+ LD a0, 0 * SIZE(Y) -+ unop -+ LD a1, 1 * SIZE(Y) -+ addl Y, INCY, Y ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) + -+ LD a2, 0 * SIZE(Y) -+ unop -+ LD a3, 1 * SIZE(Y) -+ addl Y, INCY, Y ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) + -+ ADD3 s0, t0, s0 -+ ADD4 s1, t1, s1 -+ ADD3 s2, t2, s2 -+ ADD4 s3, t3, s3 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) + -+ MUL alpha_r, s0, t0 -+ MUL alpha_r, s1, t1 -+ MUL alpha_r, s2, t2 -+ MUL alpha_r, s3, t3 ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) + -+ ADD a0, t0, a0 -+ MUL alpha_i, s1, t0 -+ ADD a1, t1, a1 -+ MUL alpha_i, s0, t1 -+ ADD a2, t2, a2 -+ MUL alpha_i, s3, t2 -+ ADD a3, t3, a3 -+ MUL alpha_i, s2, t3 ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) + -+ SUB a0, t0, a0 -+ ADD a1, t1, a1 -+ SUB a2, t2, a2 -+ ADD a3, t3, a3 ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 + -+ ST a0, 0 * SIZE(Y1) -+ fclr t0 -+ ST a1, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 + -+ ST a2, 0 * SIZE(Y1) -+ fclr t1 -+ ST a3, 1 * SIZE(Y1) -+ addl Y1, INCY, Y1 ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) + -+ fclr t2 -+ ldi J, -1(J) -+ fclr t3 -+ bgt J, $L11 -+ .align 4 ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) + -+$L20: -+ blbc N, $L999 ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 + -+ mov A, A1 -+ fclr s0 -+ fclr s1 -+ mov X, X1 ++$L108: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 + -+ sra M, 2, I -+ fclr s2 -+ fclr s3 -+ ble I, $L25 ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) -+ LD a4, 2 * SIZE(A1) -+ LD a5, 3 * SIZE(A1) -+ LD a8, 4 * SIZE(A1) -+ LD a9, 5 * SIZE(A1) -+ LD a12, 6 * SIZE(A1) -+ LD a13, 7 * SIZE(A1) ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif + -+ LD x0, 0 * SIZE(X1) -+ LD x1, 1 * SIZE(X1) -+ LD x2, 2 * SIZE(X1) ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) + -+ ldi I, -1(I) -+ ble I, $L23 -+ .align 4 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) + -+$L22: -+ ADD3 s0, t0, s0 -+ fillcs (PREFETCHSIZE + 0) * SIZE(A1) -+ MUL x0, a0, t0 -+ LD x3, 3 * SIZE(X1) ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif + -+ ADD4 s1, t1, s1 -+ unop -+ MUL x0, a1, t1 -+ LD x0, 4 * SIZE(X1) ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ ADD1 s2, t0, s2 -+ ldi I, -1(I) -+ MUL x1, a1, t0 -+ LD a1, 9 * SIZE(A1) ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+ ADD2 s3, t1, s3 -+ unop -+ MUL x1, a0, t1 -+ LD a0, 8 * SIZE(A1) ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) + -+ ADD3 s0, t0, s0 -+ unop -+ MUL x2, a4, t0 -+ LD x1, 5 * SIZE(X1) ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c02, c02 ++#endif + -+ ADD4 s1, t1, s1 -+ unop -+ MUL x2, a5, t1 -+ LD x2, 6 * SIZE(X1) ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) + -+ ADD1 s2, t0, s2 -+ unop -+ MUL x3, a5, t0 -+ LD a5, 11 * SIZE(A1) ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++#endif + -+ ADD2 s3, t1, s3 -+ unop -+ MUL x3, a4, t1 -+ LD a4, 10 * SIZE(A1) ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif + -+ ADD3 s0, t0, s0 -+ unop -+ MUL x0, a8, t0 -+ LD x3, 7 * SIZE(X1) ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif + -+ ADD4 s1, t1, s1 -+ unop -+ MUL x0, a9, t1 -+ LD x0, 8 * SIZE(X1) ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) + -+ ADD1 s2, t0, s2 -+ unop -+ MUL x1, a9, t0 -+ LD a9, 13 * SIZE(A1) ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif + -+ ADD2 s3, t1, s3 -+ unop -+ MUL x1, a8, t1 -+ LD a8, 12 * SIZE(A1) ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ADD3 s0, t0, s0 -+ unop -+ MUL x2, a12, t0 -+ LD x1, 9 * SIZE(X1) ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ ADD4 s1, t1, s1 -+ ldi A1, 8 * SIZE(A1) -+ MUL x2, a13, t1 -+ LD x2, 10 * SIZE(X1) ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ADD1 s2, t0, s2 -+ ldi X1, 8 * SIZE(X1) -+ MUL x3, a13, t0 -+ LD a13, 7 * SIZE(A1) ++#ifdef LT ++ addl KK, 2, KK ++#endif + -+ ADD2 s3, t1, s3 -+ MUL x3, a12, t1 -+ LD a12, 6 * SIZE(A1) -+ bgt I, $L22 ++#ifdef LN ++ subl KK, 2, KK ++#endif + .align 4 + -+$L23: -+ ADD3 s0, t0, s0 -+ unop -+ MUL x0, a0, t0 -+ LD x3, 3 * SIZE(X1) -+ -+ ADD4 s1, t1, s1 -+ unop -+ MUL x0, a1, t1 -+ LD x0, 4 * SIZE(X1) ++$L110: ++ and M, 1, I ++ ble I, $L119 + -+ ADD1 s2, t0, s2 -+ unop -+ MUL x1, a1, t0 -+ ldi A1, 8 * SIZE(A1) ++#if defined(LT) || defined(RN) + -+ ADD2 s3, t1, s3 -+ unop -+ MUL x1, a0, t1 -+ LD x1, 5 * SIZE(X1) ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD3 s0, t0, s0 -+ unop -+ MUL x2, a4, t0 -+ unop ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 + -+ ADD4 s1, t1, s1 ++ sra KK, 2, L ++ mov B, BO + unop -+ MUL x2, a5, t1 -+ LD x2, 6 * SIZE(X1) ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ADD1 s2, t0, s2 -+ unop -+ MUL x3, a5, t0 -+ ldi X1, 8 * SIZE(X1) ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO + -+ ADD2 s3, t1, s3 -+ unop -+ MUL x3, a4, t1 -+ LD x3, -1 * SIZE(X1) ++ subl K, KK, TMP1 + -+ ADD3 s0, t0, s0 -+ MUL x0, a8, t0 -+ ADD4 s1, t1, s1 -+ MUL x0, a9, t1 ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 + -+ ADD1 s2, t0, s2 -+ MUL x1, a9, t0 -+ ADD2 s3, t1, s3 -+ MUL x1, a8, t1 ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 + -+ ADD3 s0, t0, s0 -+ MUL x2, a12, t0 -+ ADD4 s1, t1, s1 -+ MUL x2, a13, t1 ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 + -+ ADD1 s2, t0, s2 -+ MUL x3, a13, t0 -+ ADD2 s3, t1, s3 -+ MUL x3, a12, t1 -+ .align 4 ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) + -+$L25: -+ and M, 3, I -+ ble I, $L28 ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) + -+ LD a0, 0 * SIZE(A1) -+ LD a1, 1 * SIZE(A1) ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) + -+ LD x0, 0 * SIZE(X1) ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) + -+ ldi I, -1(I) -+ ble I, $L27 ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 + .align 4 + -+$L26: -+ ADD3 s0, t0, s0 -+ ldi A1, 2 * SIZE(A1) -+ MUL x0, a0, t0 -+ LD x1, 1 * SIZE(X1) -+ -+ ADD4 s1, t1, s1 -+ ldi I, -1(I) -+ MUL x0, a1, t1 -+ LD x0, 2 * SIZE(X1) ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 + -+ ADD1 s0, t0, s0 -+ ldi X1, 2 * SIZE(X1) -+ MUL x1, a1, t0 -+ LD a1, 1 * SIZE(A1) ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) + -+ ADD2 s1, t1, s1 -+ MUL x1, a0, t1 -+ LD a0, 0 * SIZE(A1) -+ bgt I, $L26 ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 + .align 4 + -+$L27: -+ ADD3 s0, t0, s0 -+ unop -+ MUL x0, a0, t0 -+ LD x1, 1 * SIZE(X1) -+ -+ ADD4 s1, t1, s1 -+ unop -+ MUL x0, a1, t1 -+ unop ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 + -+ ADD1 s0, t0, s0 -+ MUL x1, a1, t0 -+ ADD2 s1, t1, s1 -+ MUL x1, a0, t1 -+ .align 4 ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 + -+$L28: -+ LD a0, 0 * SIZE(Y) -+ LD a1, 1 * SIZE(Y) ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif + -+ ADD3 s0, t0, s0 -+ ADD4 s1, t1, s1 -+ ADD3 s2, t2, s2 -+ ADD4 s3, t3, s3 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) + -+ ADD s0, s2, s0 -+ ADD s1, s3, s1 ++ SUB a1, c01, c01 ++#else ++ LD a1, 0 * SIZE(AO) + -+ MUL alpha_r, s0, t0 -+ MUL alpha_r, s1, t1 ++ SUB a1, c01, c01 ++#endif + -+ ADD a0, t0, a0 -+ MUL alpha_i, s1, t0 -+ ADD a1, t1, a1 -+ MUL alpha_i, s0, t1 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) + -+ SUB a0, t0, a0 -+ ADD a1, t1, a1 ++ MUL a1, c01, c01 ++#endif + -+ ST a0, 0 * SIZE(Y1) -+ ST a1, 1 * SIZE(Y1) -+ .align 4 ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) + -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) ++ MUL a1, c01, c01 ++#endif + -+ ldi $sp, STACKSIZE($sp) -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/znrm2.S b/kernel/sw_64/znrm2.S -new file mode 100644 -index 0000000..c1b7375 ---- /dev/null -+++ b/kernel/sw_64/znrm2.S -@@ -0,0 +1,441 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif + -+#define ASSEMBLER ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif + -+#include "common.h" -+#include "version.h" ++ ST c01, 0 * SIZE(C1) + -+#define PREFETCH_SIZE 80 ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif + -+#define I $0 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif + -+#define a0 $f0 -+#define a1 $f1 -+#define a2 $f10 -+#define a3 $f11 -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 ++#ifdef LT ++ addl KK, 1, KK ++#endif + -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f19 -+#define x4 $f20 -+#define x5 $f21 -+#define x6 $f22 -+#define x7 $f23 ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 + -+ PROLOGUE ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif + -+#if defined(EV4) || defined(EV5) -+ .frame $30,16,$26,0 -+ .mask 0x4000000,-16 -+ ldih $29, 0($27) !gpdisp!1 -+ ldi $29, 0($29) !gpdisp!1 ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif + -+ ldi $sp, -16($sp) -+ ldl $27, sqrt($29) !literal!2 -+ stl $26, 0($sp) ++#ifdef RN ++ addl KK, 1, KK ++#endif + -+ PROFCODE -+ .prologue 1 -+#else -+ PROFCODE ++#ifdef RT ++ subl KK, 1, KK +#endif ++ .align 4 + -+ fclr a0 -+ sll INCX, ZBASE_SHIFT, INCX -+ fclr a1 -+ ble N, $L999 ++$L40: ++ and N, 2, J ++ ble J, $L80 + -+ fclr a2 -+ cmpeq INCX, 2 * SIZE, $0 -+ fclr a3 -+ beq $0, $L20 ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B + -+ fclr t0 -+ sra N, 3, I -+ fclr t1 -+ ble I, $L15 ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif + ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif + fclr t2 -+ LD x0, 0 * SIZE(X) -+ fclr t3 -+ LD x1, 1 * SIZE(X) -+ -+ LD x2, 2 * SIZE(X) -+ LD x3, 3 * SIZE(X) -+ LD x4, 4 * SIZE(X) -+ LD x5, 5 * SIZE(X) -+ LD x6, 6 * SIZE(X) -+ LD x7, 7 * SIZE(X) + -+ ldi I, -1(I) -+ ble I, $L12 -+ .align 4 ++#ifdef LN ++ addl M, OFFSET, KK ++#endif + -+$L11: -+ faddd a0, t0, $f25 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++#ifdef LT ++ mov OFFSET, KK ++#endif + -+ faddd a1, t1, $f26 -+ mov X, XX -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif + -+ faddd a2, t2, $f27 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 + -+ faddd a3, t3, $f28 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) ++$L51: ++#if defined(LT) || defined(RN) + -+ faddd $f25, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(X) ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 + -+ faddd $f26, t1, a1 -+ unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(X) ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 + -+ faddd $f27, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(X) ++ ldi L, -2(KK) + -+ faddd $f28, t3, a3 -+ unop -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(X) ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) + -+ faddd a0, t0, $f25 -+ unop -+ fmuld x0, x0, t0 -+ LD x0, 16 * SIZE(X) ++ ble KK, $L58 + -+ faddd a1, t1, $f26 -+ ldi X, 16 * SIZE(X) -+ fmuld x1, x1, t1 -+ LD x1, 17 * SIZE(XX) ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ faddd a2, t2, $f27 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 18 * SIZE(XX) ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO + -+ faddd a3, t3, $f28 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 19 * SIZE(XX) ++ subl K, KK, TMP1 + -+ faddd $f25, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 20 * SIZE(XX) ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 + -+ faddd $f26, t1, a1 -+ ldi I, -1(I) -+ fmuld x5, x5, t1 -+ LD x5, 21 * SIZE(XX) ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 + -+ faddd $f27, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 22 * SIZE(XX) ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) + -+ faddd $f28, t3, a3 -+ fmuld x7, x7, t3 -+ LD x7, 23 * SIZE(XX) -+ bgt I, $L11 -+ .align 4 ++ ble TMP1, $L58 + -+$L12: -+ faddd a0, t0, $f25 -+ mov X, XX -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++ ble L, $L55 ++#endif ++ .align 4 + -+ faddd a1, t1, $f26 ++$L52: ++ ADD c05, t1, c05 + unop -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) -+ -+ faddd a2, t2, $f27 ++ MUL a1, b1, t1 + unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) + -+ faddd a3, t3, $f28 ++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 + unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) + -+ faddd $f25, t0, a0 ++ ADD c07, t3, c07 + unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(XX) -+ -+ faddd $f26, t1, a1 ++ MUL a3, b1, t3 + unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(XX) + -+ faddd $f27, t2, a2 ++ ADD c08, t4, c08 + unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(XX) -+ -+ faddd $f28, t3, a3 -+ ldi X, 16 * SIZE(X) -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(XX) -+ -+ faddd a0, t0, $f25 -+ fmuld x0, x0, t0 -+ faddd a1, t1, $f26 -+ fmuld x1, x1, t1 -+ -+ faddd a2, t2, $f27 -+ fmuld x2, x2, t2 -+ faddd a3, t3, $f28 -+ fmuld x3, x3, t3 -+ -+ faddd $f25, t0, a0 -+ fmuld x4, x4, t0 -+ faddd $f26, t1, a1 -+ fmuld x5, x5, t1 -+ -+ faddd $f27, t2, a2 -+ fmuld x6, x6, t2 -+ faddd $f28, t3, a3 -+ fmuld x7, x7, t3 -+ -+ faddd a2, t2, $f27 -+ fmov $f27, a2 -+ faddd a3, t3, $f28 -+ fmov $f28, a3 -+ .align 4 -+ -+$L15: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 -+ -+$L16: -+ LD x0, 0 * SIZE(X) -+ LD x1, 1 * SIZE(X) -+ -+ ldi X, 2 * SIZE(X) -+ -+ faddd a0, t0, $f25 -+ fmov $f25, a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, $f26 -+ fmov $f26, a1 -+ fmuld x1, x1, t1 ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) + -+ ldi I, -1(I) -+ bgt I, $L16 -+ bsr $31, $L998 -+ .align 4 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) + -+$L20: -+ fclr t0 -+ sra N, 2, I -+ fclr t1 -+ ble I, $L25 ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) + -+ LD x0, 0 * SIZE(X) -+ fclr t2 -+ LD x1, 1 * SIZE(X) -+ addl X, INCX, X -+ LD x2, 0 * SIZE(X) -+ fclr t3 -+ LD x3, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) + -+ LD x4, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x5, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) + -+ LD x6, 0 * SIZE(X) -+ ble I, $L22 -+ .align 4 ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) + -+$L21: -+ faddd a0, t0, $f25 -+ LD x7, 1 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop + -+ faddd a1, t1, $f26 -+ LD x0, 0 * SIZE(X) -+ fmuld x1, x1, t1 ++ ADD c07, t3, c07 + unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) + -+ faddd a2, t2, $f27 -+ LD x1, 1 * SIZE(X) -+ fmuld x2, x2, t2 -+ addl X, INCX, X ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) + -+ faddd a3, t3, $f28 -+ LD x2, 0 * SIZE(X) -+ fmuld x3, x3, t3 ++ ADD c01, t1, c01 + unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) + -+ faddd $f25, t0, a0 -+ LD x3, 1 * SIZE(X) -+ fmuld x4, x4, t0 -+ addl X, INCX, X ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) + -+ faddd $f26, t1, a1 -+ LD x4, 0 * SIZE(X) -+ fmuld x5, x5, t1 -+ ldi I, -1(I) ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) + -+ faddd $f27, t2, a2 -+ LD x5, 1 * SIZE(X) -+ fmuld x6, x6, t2 -+ addl X, INCX, X ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 + -+ faddd $f28, t3, a3 -+ LD x6, 0 * SIZE(X) -+ fmuld x7, x7, t3 -+ bgt I, $L21 ++$L55: ++ ADD c05, t1, c05 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif + .align 4 + -+$L22: -+ faddd a0, t0, $f25 -+ LD x7, 1 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 + -+ faddd a1, t1, $f26 -+ fmuld x1, x1, t1 -+ faddd a2, t2, $f27 -+ fmuld x2, x2, t2 ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) + -+ faddd a3, t3, $f28 -+ fmuld x3, x3, t3 -+ faddd $f25, t0, a0 -+ fmuld x4, x4, t0 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) + -+ faddd $f26, t1, a1 -+ fmuld x5, x5, t1 -+ faddd $f27, t2, a2 -+ fmuld x6, x6, t2 ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) + -+ faddd $f28, t3, a3 -+ fmuld x7, x7, t3 ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) + -+ faddd a2, t2, $f27 -+ fmov $f27, a2 -+ faddd a3, t3, $f28 -+ fmov $f28, a3 -+ .align 4 ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) + -+$L25: -+ and N, 3, I -+ ble I, $L998 ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) + .align 4 + -+$L26: -+ LD x0, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x1, 1 * SIZE(X) -+ addl X, INCX, X ++$L57: ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 + -+ faddd a0, t0, $f25 -+ fmov $f25, a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, $f26 -+ fmov $f26, a1 -+ fmuld x1, x1, t1 ++ ADD c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD c01, t1, c01 ++ MUL a1, b2, t1 + -+ bgt I, $L26 ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, c05 ++ ADD c06, t2, c06 ++ ADD c07, t3, c07 ++ ADD c08, t4, c08 + .align 4 + ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif + -+$L998: -+ faddd a0, t0, $f25 -+ faddd a1, t1, $f26 -+ fmov $f25, a0 -+ fmov $f26, a1 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ faddd a0, a1, $f25 -+ fmov $f25, a0 -+ faddd a2, a3, $f26 -+ fmov $f26, a2 ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) + -+#if defined(EV4) || defined(EV5) -+ faddd a0, a2, $f16 -+ jsr $26, ($27), sqrt !lituse_jsr!2 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 + -+ ldih $29, 0($26) !gpdisp!3 -+ ldi $29, 0($29) !gpdisp!3 ++ SUB b1, c03, c03 ++ SUB b2, c07, c07 ++ SUB b3, c04, c04 ++ SUB b4, c08, c08 +#else -+ faddd a0, a2, $f25 -+ fmov $f25, a0 -+ fsqrtd a0, $f25 -+ fmov $f25, a0 -+#endif -+ .align 4 ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+$L999: -+#if defined(EV4) || defined(EV5) -+ ldl $26, 0($sp) -+ ldi $sp, 16($sp) ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 +#endif -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/znrm2.S.bak b/kernel/sw_64/znrm2.S.bak -new file mode 100644 -index 0000000..b2e80e0 ---- /dev/null -+++ b/kernel/sw_64/znrm2.S.bak -@@ -0,0 +1,426 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ + -+#define ASSEMBLER ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) + -+#include "common.h" -+#include "version.h" ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 + -+#define PREFETCH_SIZE 80 ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 + -+#define I $0 ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 + -+#define a0 $f0 -+#define a1 $f1 -+#define a2 $f10 -+#define a3 $f11 -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 + -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f19 -+#define x4 $f20 -+#define x5 $f21 -+#define x6 $f22 -+#define x7 $f23 ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 + -+ PROLOGUE ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 + -+#if defined(EV4) || defined(EV5) -+ .frame $30,16,$26,0 -+ .mask 0x4000000,-16 -+ ldih $29, 0($27) !gpdisp!1 -+ ldi $29, 0($29) !gpdisp!1 -+ -+ ldi $sp, -16($sp) -+ ldl $27, sqrt($29) !literal!2 -+ stq $26, 0($sp) ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) + -+ PROFCODE -+ .prologue 1 -+#else -+ PROFCODE -+#endif -+ -+ fclr a0 -+ sll INCX, ZBASE_SHIFT, INCX -+ fclr a1 -+ ble N, $L999 ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 + -+ fclr a2 -+ cmpeq INCX, 2 * SIZE, $0 -+ fclr a3 -+ beq $0, $L20 ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 + -+ fclr t0 -+ sra N, 3, I -+ fclr t1 -+ ble I, $L15 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 + -+ fclr t2 -+ LD x0, 0 * SIZE(X) -+ fclr t3 -+ LD x1, 1 * SIZE(X) ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 + -+ LD x2, 2 * SIZE(X) -+ LD x3, 3 * SIZE(X) -+ LD x4, 4 * SIZE(X) -+ LD x5, 5 * SIZE(X) -+ LD x6, 6 * SIZE(X) -+ LD x7, 7 * SIZE(X) ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 + -+ ldi I, -1(I) -+ ble I, $L12 -+ .align 4 ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+$L11: -+ faddd a0, t0, a0 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 + -+ faddd a1, t1, a1 -+ mov X, XX -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 + -+ faddd a2, t2, a2 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 + -+ faddd a3, t3, a3 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif + -+ faddd a0, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(X) ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ faddd a1, t1, a1 -+ unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(X) ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 + -+ faddd a2, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(X) ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 + -+ faddd a3, t3, a3 -+ unop -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(X) ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 + -+ faddd a0, t0, a0 -+ unop -+ fmuld x0, x0, t0 -+ LD x0, 16 * SIZE(X) ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 + -+ faddd a1, t1, a1 -+ ldi X, 16 * SIZE(X) -+ fmuld x1, x1, t1 -+ LD x1, 17 * SIZE(XX) ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 + -+ faddd a2, t2, a2 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 18 * SIZE(XX) ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 + -+ faddd a3, t3, a3 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 19 * SIZE(XX) ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 + -+ faddd a0, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 20 * SIZE(XX) ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) + -+ faddd a1, t1, a1 -+ ldi I, -1(I) -+ fmuld x5, x5, t1 -+ LD x5, 21 * SIZE(XX) ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 + -+ faddd a2, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 22 * SIZE(XX) ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 + -+ faddd a3, t3, a3 -+ fmuld x7, x7, t3 -+ LD x7, 23 * SIZE(XX) -+ bgt I, $L11 -+ .align 4 ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 + -+$L12: -+ faddd a0, t0, a0 -+ mov X, XX -+ fmuld x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 + -+ faddd a1, t1, a1 -+ unop -+ fmuld x1, x1, t1 -+ LD x1, 9 * SIZE(X) ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 + -+ faddd a2, t2, a2 -+ unop -+ fmuld x2, x2, t2 -+ LD x2, 10 * SIZE(X) ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) + -+ faddd a3, t3, a3 -+ unop -+ fmuld x3, x3, t3 -+ LD x3, 11 * SIZE(X) ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 + -+ faddd a0, t0, a0 -+ unop -+ fmuld x4, x4, t0 -+ LD x4, 12 * SIZE(XX) ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ faddd a1, t1, a1 -+ unop -+ fmuld x5, x5, t1 -+ LD x5, 13 * SIZE(XX) ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++ MUL a3, c07, c07 ++ MUL a3, c08, c08 ++#endif + -+ faddd a2, t2, a2 -+ unop -+ fmuld x6, x6, t2 -+ LD x6, 14 * SIZE(XX) ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ faddd a3, t3, a3 -+ ldi X, 16 * SIZE(X) -+ fmuld x7, x7, t3 -+ LD x7, 15 * SIZE(XX) ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 + -+ faddd a0, t0, a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, a1 -+ fmuld x1, x1, t1 ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 + -+ faddd a2, t2, a2 -+ fmuld x2, x2, t2 -+ faddd a3, t3, a3 -+ fmuld x3, x3, t3 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+ faddd a0, t0, a0 -+ fmuld x4, x4, t0 -+ faddd a1, t1, a1 -+ fmuld x5, x5, t1 ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif + -+ faddd a2, t2, a2 -+ fmuld x6, x6, t2 -+ faddd a3, t3, a3 -+ fmuld x7, x7, t3 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) + -+ faddd a2, t2, a2 -+ faddd a3, t3, a3 -+ .align 4 ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) + -+$L15: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif + -+$L16: -+ LD x0, 0 * SIZE(X) -+ LD x1, 1 * SIZE(X) ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif + -+ ldi X, 2 * SIZE(X) ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + -+ faddd a0, t0, a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, a1 -+ fmuld x1, x1, t1 ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) + -+ ldi I, -1(I) -+ bgt I, $L16 -+ bsr $31, $L998 -+ .align 4 ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif + -+$L20: -+ fclr t0 -+ sra N, 2, I + fclr t1 -+ ble I, $L25 -+ -+ LD x0, 0 * SIZE(X) + fclr t2 -+ LD x1, 1 * SIZE(X) -+ addl X, INCX, X -+ LD x2, 0 * SIZE(X) + fclr t3 -+ LD x3, 1 * SIZE(X) -+ addl X, INCX, X ++ fclr t4 + -+ LD x4, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x5, 1 * SIZE(X) -+ addl X, INCX, X ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ LD x6, 0 * SIZE(X) -+ ble I, $L22 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 + .align 4 + -+$L21: -+ faddd a0, t0, a0 -+ LD x7, 1 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X ++$L60: ++ and M, 2, I ++ ble I, $L70 + -+ faddd a1, t1, a1 -+ LD x0, 0 * SIZE(X) -+ fmuld x1, x1, t1 -+ unop ++#if defined(LT) || defined(RN) + -+ faddd a2, t2, a2 -+ LD x1, 1 * SIZE(X) -+ fmuld x2, x2, t2 -+ addl X, INCX, X ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 + -+ faddd a3, t3, a3 -+ LD x2, 0 * SIZE(X) -+ fmuld x3, x3, t3 -+ unop ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) + -+ faddd a0, t0, a0 -+ LD x3, 1 * SIZE(X) -+ fmuld x4, x4, t0 -+ addl X, INCX, X ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) + -+ faddd a1, t1, a1 -+ LD x4, 0 * SIZE(X) -+ fmuld x5, x5, t1 -+ ldi I, -1(I) ++ ble KK, $L68 + -+ faddd a2, t2, a2 -+ LD x5, 1 * SIZE(X) -+ fmuld x6, x6, t2 -+ addl X, INCX, X ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ faddd a3, t3, a3 -+ LD x6, 0 * SIZE(X) -+ fmuld x7, x7, t3 -+ bgt I, $L21 -+ .align 4 ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO + -+$L22: -+ faddd a0, t0, a0 -+ LD x7, 1 * SIZE(X) -+ fmuld x0, x0, t0 -+ addl X, INCX, X ++ subl K, KK, TMP1 + -+ faddd a1, t1, a1 -+ fmuld x1, x1, t1 -+ faddd a2, t2, a2 -+ fmuld x2, x2, t2 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 + -+ faddd a3, t3, a3 -+ fmuld x3, x3, t3 -+ faddd a0, t0, a0 -+ fmuld x4, x4, t0 ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) + -+ faddd a1, t1, a1 -+ fmuld x5, x5, t1 -+ faddd a2, t2, a2 -+ fmuld x6, x6, t2 ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) + -+ faddd a3, t3, a3 -+ fmuld x7, x7, t3 -+ faddd a2, t2, a2 -+ faddd a3, t3, a3 -+ .align 4 ++ ble TMP1, $L68 + -+$L25: -+ and N, 3, I -+ ble I, $L998 -+ .align 4 ++ ble L, $L65 ++#endif ++ .align 4 + -+$L26: -+ LD x0, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x1, 1 * SIZE(X) -+ addl X, INCX, X ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop + -+ faddd a0, t0, a0 -+ fmuld x0, x0, t0 -+ faddd a1, t1, a1 -+ fmuld x1, x1, t1 ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) + -+ bgt I, $L26 -+ .align 4 ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) + ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) + -+$L998: -+ faddd a0, t0, a0 -+ faddd a1, t1, a1 ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) + -+ faddd a0, a1, a0 -+ faddd a2, a3, a2 ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) + -+#if defined(EV4) || defined(EV5) -+ faddd a0, a2, $f16 -+ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop + -+ ldih $29, 0($26) !gpdisp!3 -+ ldi $29, 0($29) !gpdisp!3 ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 +#else -+ faddd a0, a2, a0 -+ fsqrtd a0, a0 ++ blbs TMP1, $L67 +#endif + .align 4 + -+$L999: -+#if defined(EV4) || defined(EV5) -+ ldl $26, 0($sp) -+ ldi $sp, 16($sp) -+#endif -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/znrm2_simd.S b/kernel/sw_64/znrm2_simd.S -new file mode 100644 -index 0000000..5a509d4 ---- /dev/null -+++ b/kernel/sw_64/znrm2_simd.S -@@ -0,0 +1,492 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) + -+#define ASSEMBLER ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) + -+#include "common.h" -+#include "version.h" ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) + -+#define PREFETCHSIZE 80 ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define XX $19 ++$L67: ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD c05, t3, c05 ++ MUL a1, b2, t3 + -+#define I $0 ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) + -+#define a0 $f0 -+#define a1 $f1 -+#define a2 $f10 -+#define a3 $f11 -+#define t0 $f12 -+#define t1 $f13 -+#define t2 $f14 -+#define t3 $f15 ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c05, t3, c05 ++ ADD c06, t4, c06 ++ .align 4 + -+#define x0 $f16 -+#define x1 $f17 -+#define x2 $f18 -+#define x3 $f19 -+#define x4 $f20 -+#define x5 $f21 -+#define x6 $f22 -+#define x7 $f23 ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif + -+ PROLOGUE ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ PROFCODE -+ -+ fclr a0 -+ sll INCX, ZBASE_SHIFT, INCX -+ fclr a1 -+ ble N, $L999 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ fclr a2 -+ cmpeq INCX, 2 * SIZE, $0 -+ fclr a3 -+ beq $0, $L20 #stride access ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++#endif + ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+/* test the address of X */ -+ and X, (VEC_LEN*SIZE-1), $3 -+ fclr t0 -+ fclr t1 -+ bne $3, $UnAlign_ACCESS -+/*Align access. Use simd instructions. Unloop 8 complex*/ -+ sra N, 3, I -+ ble I, $Remain -+ -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t0 #clear s0 vector -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t1 -+ -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t2 -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ vcpys $f31, $f31, t3 ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 + -+ addl X, 16 * SIZE, X -+ subl I, 1, I -+ nop -+ ble I, $MainLoopEnd -+$MainLoop: -+ fillcs PREFETCHSIZE * SIZE(X) -+ VMAD a0, a0, t0, t0 -+ subl I, 1, I -+ VMAD a1, a1, t1, t1 -+ -+ addl X, 16 * SIZE, X -+ VMAD a2, a2, t2, t2 -+ nop -+ VMAD a3, a3, t3, t3 -+ -+ VLD a0, -4*VEC_LEN*SIZE(X) -+ VLD a1, -3*VEC_LEN*SIZE(X) -+ VLD a2, -2*VEC_LEN*SIZE(X) -+ VLD a3, -1*VEC_LEN*SIZE(X) -+ -+ bgt I, $MainLoop -+ .align 4 -+$MainLoopEnd: -+ VMAD a0, a0, t0, t0 -+ VMAD a1, a1, t1, t1 -+ VMAD a2, a2, t2, t2 -+ VMAD a3, a3, t3, t3 ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 + -+ VADD t0, t1, a0 -+ VADD t2, t3, a1 -+ nop -+ VADD a0, a1, t0 -+ -+ vextf t0, 1, t1 -+ vextf t0, 2, t2 -+ vextf t0, 3, t3 -+ nop ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 + -+ ADD t0, t1, a2 -+ ADD t2, t3, a3 -+ fclr t1 -+ ADD a2, a3, t0 -+ -+ .align 4 -+$Remain: -+ and N, 7, I -+ ble I, $End -+ .align 4 -+$RemainLoop: -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ -+ addl X, 2*SIZE, X -+ MAD a0, a0, t0, t0 -+ subl I, 1, I -+ MAD a1, a1, t1, t1 -+ -+ bgt I, $RemainLoop -+ .align 4 ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 + -+ ADD t0, t1, t0 -+$End: -+ SQRT t0, a0 -+ ret -+ .align 4 ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 + -+$UnAlign_ACCESS: ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 + -+ fclr t0 -+ sra N, 3, I -+ fclr t1 -+ ble I, $L15 ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++#endif + -+ fclr t2 -+ LD x0, 0 * SIZE(X) -+ fclr t3 -+ LD x1, 1 * SIZE(X) ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) + -+ LD x2, 2 * SIZE(X) -+ LD x3, 3 * SIZE(X) -+ LD x4, 4 * SIZE(X) -+ LD x5, 5 * SIZE(X) -+ LD x6, 6 * SIZE(X) -+ LD x7, 7 * SIZE(X) ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 + -+ ldi I, -1(I) -+ ble I, $L12 -+ .align 4 ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 + -+$L11: -+ ADD a0, t0, a0 -+ fillcs (PREFETCHSIZE) * SIZE(X) -+ MUL x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 + -+ ADD a1, t1, a1 -+ mov X, XX -+ MUL x1, x1, t1 -+ LD x1, 9 * SIZE(X) ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++#endif + -+ ADD a2, t2, a2 -+ unop -+ MUL x2, x2, t2 -+ LD x2, 10 * SIZE(X) ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ ADD a3, t3, a3 -+ unop -+ MUL x3, x3, t3 -+ LD x3, 11 * SIZE(X) ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 + -+ ADD a0, t0, a0 -+ unop -+ MUL x4, x4, t0 -+ LD x4, 12 * SIZE(X) ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 + -+ ADD a1, t1, a1 -+ unop -+ MUL x5, x5, t1 -+ LD x5, 13 * SIZE(X) ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 + -+ ADD a2, t2, a2 -+ unop -+ MUL x6, x6, t2 -+ LD x6, 14 * SIZE(X) ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif + -+ ADD a3, t3, a3 -+ unop -+ MUL x7, x7, t3 -+ LD x7, 15 * SIZE(X) ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif + -+ ADD a0, t0, a0 -+ unop -+ MUL x0, x0, t0 -+ LD x0, 16 * SIZE(X) ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif + -+ ADD a1, t1, a1 -+ ldi X, 16 * SIZE(X) -+ MUL x1, x1, t1 -+ LD x1, 17 * SIZE(XX) ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) + -+ ADD a2, t2, a2 -+ unop -+ MUL x2, x2, t2 -+ LD x2, 18 * SIZE(XX) ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif + -+ ADD a3, t3, a3 -+ unop -+ MUL x3, x3, t3 -+ LD x3, 19 * SIZE(XX) ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ADD a0, t0, a0 -+ unop -+ MUL x4, x4, t0 -+ LD x4, 20 * SIZE(XX) ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ ADD a1, t1, a1 -+ ldi I, -1(I) -+ MUL x5, x5, t1 -+ LD x5, 21 * SIZE(XX) ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ADD a2, t2, a2 -+ unop -+ MUL x6, x6, t2 -+ LD x6, 22 * SIZE(XX) ++#ifdef LT ++ addl KK, 2, KK ++#endif + -+ ADD a3, t3, a3 -+ MUL x7, x7, t3 -+ LD x7, 23 * SIZE(XX) -+ bgt I, $L11 ++#ifdef LN ++ subl KK, 2, KK ++#endif + .align 4 + -+$L12: -+ ADD a0, t0, a0 -+ mov X, XX -+ MUL x0, x0, t0 -+ LD x0, 8 * SIZE(X) ++$L70: ++ and M, 1, I ++ ble I, $L79 + -+ ADD a1, t1, a1 -+ unop -+ MUL x1, x1, t1 -+ LD x1, 9 * SIZE(X) ++#if defined(LT) || defined(RN) + -+ ADD a2, t2, a2 -+ unop -+ MUL x2, x2, t2 -+ LD x2, 10 * SIZE(X) + -+ ADD a3, t3, a3 -+ unop -+ MUL x3, x3, t3 -+ LD x3, 11 * SIZE(X) ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ ADD a0, t0, a0 -+ unop -+ MUL x4, x4, t0 -+ LD x4, 12 * SIZE(XX) ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 + -+ ADD a1, t1, a1 -+ unop -+ MUL x5, x5, t1 -+ LD x5, 13 * SIZE(XX) ++ ldi L, -2(KK) + -+ ADD a2, t2, a2 -+ unop -+ MUL x6, x6, t2 -+ LD x6, 14 * SIZE(XX) ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) + -+ ADD a3, t3, a3 -+ ldi X, 16 * SIZE(X) -+ MUL x7, x7, t3 -+ LD x7, 15 * SIZE(XX) ++ ble KK, $L78 + -+ ADD a0, t0, a0 -+ MUL x0, x0, t0 -+ ADD a1, t1, a1 -+ MUL x1, x1, t1 ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ADD a2, t2, a2 -+ MUL x2, x2, t2 -+ ADD a3, t3, a3 -+ MUL x3, x3, t3 ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO + -+ ADD a0, t0, a0 -+ MUL x4, x4, t0 -+ ADD a1, t1, a1 -+ MUL x5, x5, t1 ++ subl K, KK, TMP1 + -+ ADD a2, t2, a2 -+ MUL x6, x6, t2 -+ ADD a3, t3, a3 -+ MUL x7, x7, t3 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ ADD a2, t2, a2 -+ ADD a3, t3, a3 -+ .align 4 ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 + -+$L15: -+ and N, 7, I -+ ble I, $L998 -+ .align 4 ++ ldi L, -2(TMP1) + -+$L16: -+ LD x0, 0 * SIZE(X) -+ LD x1, 1 * SIZE(X) ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) + -+ ldi X, 2 * SIZE(X) ++ ble TMP1, $L78 + -+ ADD a0, t0, a0 -+ MUL x0, x0, t0 -+ ADD a1, t1, a1 -+ MUL x1, x1, t1 ++ ble L, $L75 ++#endif ++ .align 4 + -+ ldi I, -1(I) -+ bgt I, $L16 -+ bsr $31, $L998 -+ .align 4 ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) + -+$L20: -+ fclr t0 -+ sra N, 2, I -+ fclr t1 -+ ble I, $L25 ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) + -+ LD x0, 0 * SIZE(X) -+ fclr t2 -+ LD x1, 1 * SIZE(X) -+ addl X, INCX, X -+ LD x2, 0 * SIZE(X) -+ fclr t3 -+ LD x3, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) + -+ LD x4, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x5, 1 * SIZE(X) -+ addl X, INCX, X ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) + -+ LD x6, 0 * SIZE(X) -+ ble I, $L22 ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 + .align 4 + -+$L21: -+ ADD a0, t0, a0 -+ LD x7, 1 * SIZE(X) -+ MUL x0, x0, t0 -+ addl X, INCX, X -+ -+ ADD a1, t1, a1 -+ LD x0, 0 * SIZE(X) -+ MUL x1, x1, t1 -+ unop ++$L75: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif ++ .align 4 + -+ ADD a2, t2, a2 -+ LD x1, 1 * SIZE(X) -+ MUL x2, x2, t2 -+ addl X, INCX, X ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) + -+ ADD a3, t3, a3 -+ LD x2, 0 * SIZE(X) -+ MUL x3, x3, t3 -+ unop ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 + -+ ADD a0, t0, a0 -+ LD x3, 1 * SIZE(X) -+ MUL x4, x4, t0 -+ addl X, INCX, X ++$L77: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 + -+ ADD a1, t1, a1 -+ LD x4, 0 * SIZE(X) -+ MUL x5, x5, t1 -+ ldi I, -1(I) ++ ADD c01, c02, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, c05 ++ ldi BO, 2 * SIZE(BO) + -+ ADD a2, t2, a2 -+ LD x5, 1 * SIZE(X) -+ MUL x6, x6, t2 -+ addl X, INCX, X ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 + -+ ADD a3, t3, a3 -+ LD x6, 0 * SIZE(X) -+ MUL x7, x7, t3 -+ bgt I, $L21 + .align 4 + -+$L22: -+ ADD a0, t0, a0 -+ LD x7, 1 * SIZE(X) -+ MUL x0, x0, t0 -+ addl X, INCX, X -+ -+ ADD a1, t1, a1 -+ MUL x1, x1, t1 -+ ADD a2, t2, a2 -+ MUL x2, x2, t2 ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif + -+ ADD a3, t3, a3 -+ MUL x3, x3, t3 -+ ADD a0, t0, a0 -+ MUL x4, x4, t0 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) + -+ ADD a1, t1, a1 -+ MUL x5, x5, t1 -+ ADD a2, t2, a2 -+ MUL x6, x6, t2 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) + -+ ADD a3, t3, a3 -+ MUL x7, x7, t3 -+ ADD a2, t2, a2 -+ ADD a3, t3, a3 -+ .align 4 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#endif + -+$L25: -+ and N, 3, I -+ ble I, $L998 -+ .align 4 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) + -+$L26: -+ LD x0, 0 * SIZE(X) -+ ldi I, -1(I) -+ LD x1, 1 * SIZE(X) -+ addl X, INCX, X ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++#endif + -+ ADD a0, t0, a0 -+ MUL x0, x0, t0 -+ ADD a1, t1, a1 -+ MUL x1, x1, t1 ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) + -+ bgt I, $L26 -+ .align 4 ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c05, c05 ++#endif + ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+$L998: -+ ADD a0, t0, a0 -+ ADD a1, t1, a1 ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+ ADD a0, a1, a0 -+ ADD a2, a3, a2 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif + ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif + ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) + -+ ADD a0, a2, a0 -+ SQRT a0, a0 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ .align 4 ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+$L999: ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/zrot.S b/kernel/sw_64/zrot.S -new file mode 100644 -index 0000000..9016a00 ---- /dev/null -+++ b/kernel/sw_64/zrot.S -@@ -0,0 +1,689 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++#ifdef LT ++ addl KK, 1, KK ++#endif + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 -+#define I $21 -+#define XX $23 -+#define YY $24 ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif + -+#define b9 $f29 ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif + -+#define C $f10 -+#define S $f11 ++#ifdef RN ++ addl KK, 2, KK ++#endif + -+#define PREFETCH_SIZE 80 ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 + -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 ++$L80: ++ sra N, 2, J ++ ble J, $L999 ++ .align 4 + -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C +#endif + -+ fmov $f21, C -+ LD S, 0($sp) ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif + -+ addl INCX, INCX, INCX -+ addl INCY, INCY, INCY ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 + -+ cmpeq INCX, 2, $23 -+ cmpeq INCY, 2, $24 -+ ble N, $L998 ++#ifdef LN ++ addl M, OFFSET, KK ++#endif + -+ and $23, $24, $23 -+ beq $23, $L50 ++#ifdef LT ++ mov OFFSET, KK ++#endif + -+ sra N, 2, I -+ ble I, $L15 ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 + -+ LD $f16, 2*SIZE(X) -+ LD $f17, 2*SIZE(Y) -+ LD $f18, 3*SIZE(X) -+ LD $f19, 3*SIZE(Y) ++$L11: ++#if defined(LT) || defined(RN) + -+ MUL C, $f12, $f21 -+ unop -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 + -+ LD $f13, 4*SIZE(Y) -+ MUL S, $f12, $f24 -+ LD $f12, 4*SIZE(X) -+ MUL C, $f14, $f25 ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 + -+ ldi I, -1(I) -+ MUL S, $f15, $f26 -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 -+ MUL C, $f15, $f27 ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 + -+ LD $f15, 5*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 -+ ble I, $L13 -+ .align 4 ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 + -+$L12: -+ MUL C, $f16, $f21 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ unop -+ LD $f14, 5*SIZE(X) ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 + -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 + -+ MUL C, $f17, $f23 -+ fillcs (PREFETCH_SIZE) * SIZE(Y) -+ unop -+ LD $f17, 6*SIZE(Y) ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 + -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else + -+ MUL C, $f18, $f25 -+ LD $f16, 6*SIZE(X) -+ unop -+ unop ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO + -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) ++ subl K, KK, TMP1 + -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 + -+ MUL C, $f12, $f21 -+ LD $f18, 7*SIZE(X) -+ unop -+ unop ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 + -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 + -+ MUL C, $f13, $f23 -+ LD $f13, 8*SIZE(Y) -+ unop -+ unop ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 + -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ fillde 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 + -+ MUL C, $f14, $f25 -+ LD $f12, 8*SIZE(X) -+ unop -+ unop ++ fillde 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 + -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 ++ fillde 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 + -+ MUL C, $f15, $f27 -+ LD $f15, 9*SIZE(Y) -+ unop -+ unop ++ fillde 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif + -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 ++ ble L, $L15 ++ .align 5 + -+ MUL C, $f16, $f21 -+ LD $f14, 9*SIZE(X) -+ unop ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else + unop -+ -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else + unop -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 ++#endif + -+ MUL C, $f17, $f23 -+ LD $f17, 10*SIZE(Y) -+ unop ++ ADD c12, t2, c12 + unop -+ -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 ++ MUL b1, a2, t2 + unop -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 + -+ MUL C, $f18, $f25 -+ LD $f16, 10*SIZE(X) -+ unop ++ ADD c16, t3, c16 + unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) + -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 ++ ADD c15, t4, c15 + unop -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) + -+ MUL C, $f19, $f27 -+ LD $f19, 11*SIZE(Y) -+ unop -+ unop ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP + -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ ldi I, -1(I) -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP + -+ MUL C, $f12, $f21 -+ LD $f18, 11*SIZE(X) -+ unop ++ ADD c06, t3, c06 + unop -+ -+ ST $f22, 6*SIZE(X) -+ MUL S, $f13, $f22 ++ MUL b2, a4, t3 + unop -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 + -+ MUL C, $f13, $f23 -+ LD $f13, 12*SIZE(Y) -+ ldi X, 8*SIZE(X) ++ ADD c05, t4, c05 + unop -+ -+ ST $f24, 6*SIZE(Y) -+ MUL S, $f12, $f24 ++ MUL b4, a1, t4 + unop -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 + -+ MUL C, $f14, $f25 -+ LD $f12, 4*SIZE(X) -+ ldi Y, 8*SIZE(Y) ++/* 3 */ ++ ADD c03, t1, c03 + unop -+ -+ ST $f26, -1*SIZE(X) -+ MUL S, $f15, $f26 ++ MUL b3, a1, t1 + unop -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 + -+ MUL C, $f15, $f27 -+ LD $f15, 5*SIZE(Y) ++ ADD c04, t2, c04 + unop ++ MUL b3, a2, t2 + unop + -+ ST $f28, -1*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 -+ bgt I, $L12 -+ .align 4 -+ -+$L13: -+ MUL C, $f16, $f21 -+ LD $f14, 5*SIZE(X) -+ unop ++ ADD c08, t3, c08 + unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) + -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 ++ ADD c13, t4, c13 + unop -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ MUL C, $f17, $f23 -+ unop ++/* 4 */ ++ ADD c09, t1, c09 + unop -+ LD $f17, 6*SIZE(Y) -+ -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ LD $f16, 6*SIZE(X) -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) + -+ MUL C, $f18, $f25 -+ unop -+ unop ++ ADD c10, t2, c10 + unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 ++ ADD c14, t3, c14 + unop -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) + -+ MUL C, $f19, $f27 -+ unop ++ ADD c07, t4, c07 + unop -+ LD $f19, 7*SIZE(Y) -+ -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ LD $f18, 7*SIZE(X) -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) + -+ MUL C, $f12, $f21 -+ unop -+ unop ++/* 5 */ ++ ADD c11, t1, c11 + unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) + -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) + -+ MUL C, $f13, $f23 -+ unop -+ unop ++ ADD c16, t3, c16 + unop -+ -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 ++ MUL b2, a2, t3 + unop -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 + -+ MUL C, $f14, $f25 -+ unop ++ ADD c15, t4, c15 + unop ++ MUL b2, a5, t4 + unop + -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 ++/* 6 */ ++ ADD c01, t1, c01 + unop -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 -+ -+ MUL C, $f15, $f27 ++ MUL b5, a6, t1 + unop ++ ++ ADD c02, t2, c02 + unop ++ MUL b5, a4, t2 + unop + -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 ++ ADD c06, t3, c06 + unop -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 -+ -+ MUL C, $f16, $f21 ++ MUL b2, a4, t3 + unop ++ ++ ADD c05, t4, c05 + unop ++ MUL b4, a5, t4 + unop + -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 + unop -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 + -+ MUL C, $f17, $f23 -+ unop -+ unop ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 + unop + -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 ++ ADD c08, t3, c08 + unop -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) + -+ MUL C, $f18, $f25 -+ unop -+ unop ++ ADD c13, t4, c13 + unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) + -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 ++/* 8 */ ++ ADD c09, t1, c09 + unop -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) + -+ MUL C, $f19, $f27 -+ unop -+ unop ++ ADD c10, t2, c10 + unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) + -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 ++ ADD c14, t3, c14 + unop -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 -+ -+ ST $f22, 6*SIZE(X) -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 -+ ST $f24, 6*SIZE(Y) -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) + -+ ST $f26, 7*SIZE(X) -+ ldi X, 8*SIZE(X) -+ ST $f28, 7*SIZE(Y) -+ ldi Y, 8*SIZE(Y) ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 + .align 4 + -+ +$L15: -+ and N, 3, I -+ ble I, $L998 -+ .align 4 -+ -+$L16: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) -+ -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 -+ -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 -+ -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 -+ -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 -+ -+ ST $f22, 0*SIZE(X) -+ ST $f24, 0*SIZE(Y) -+ ldi I, -1(I) -+ -+ ST $f26, 1*SIZE(X) -+ ldi X, 2 * SIZE(X) -+ ST $f28, 1*SIZE(Y) -+ ldi Y, 2 * SIZE(Y) -+ -+ bgt I, $L16 ++ ADD c11, t1, c11 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif + .align 4 + -+$L998: -+ clr $0 -+ ret -+ .align 4 ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 + -+$L50: -+ mov X, XX -+ mov Y, YY ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 + -+ sra N, 2, I -+ ble I, $L55 -+ .align 4 ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) + -+$L51: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) + -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) + -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) + ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) + -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++$L17: ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 + -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++ ADD c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 + ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD c03, t1, c03 ++ MUL b3, a1, t1 + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++ ADD c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD c08, t3, c08 ++ MUL b4, a2, t3 + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ ADD c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD c09, t1, c09 ++ MUL b3, a3, t1 + -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 ++ ADD c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD c14, t3, c14 ++ MUL b4, a4, t3 + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ ADD c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) + -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ ADD c11, t1, c11 ++ ADD c12, t2, c12 ++ ADD c16, t3, c16 ++ ADD c15, t4, c15 ++ .align 4 + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif + ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 + -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) + -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++ SUB a1, c03, c03 ++ SUB a2, c07, c07 ++ SUB a3, c11, c11 ++ SUB a4, c15, c15 + -+ ldi I, -1(I) -+ bgt I, $L51 -+ .align 4 ++ SUB b1, c04, c04 ++ SUB b2, c08, c08 ++ SUB b3, c12, c12 ++ SUB b4, c16, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+$L55: -+ and N, 3, I -+ ble I, $L999 -+ .align 4 ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) + -+$L56: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 + -+ ADD $f21, $f22, b9 -+ fmov b9, $f22 -+ SUB $f23, $f24, b9 -+ fmov b9, $f24 ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) + -+ ADD $f25, $f26, b9 -+ fmov b9, $f26 -+ SUB $f27, $f28, b9 -+ fmov b9, $f28 ++ SUB a1, c09, c09 ++ SUB a2, c10, c10 ++ SUB a3, c11, c11 ++ SUB a4, c12, c12 + -+ ST $f22, 0*SIZE(X) -+ ST $f24, 0*SIZE(Y) -+ ldi I, -1(I) ++ SUB b1, c13, c13 ++ SUB b2, c14, c14 ++ SUB b3, c15, c15 ++ SUB b4, c16, c16 ++#endif + -+ ST $f26, 1*SIZE(X) -+ ST $f28, 1*SIZE(Y) -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) + -+ bgt I, $L56 -+ .align 4 ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ MUL a1, c12, c12 ++ MUL a1, c16, c16 + -+$L999: -+ clr $0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/zrot.S.bak b/kernel/sw_64/zrot.S.bak -new file mode 100644 -index 0000000..83dd2b1 ---- /dev/null -+++ b/kernel/sw_64/zrot.S.bak -@@ -0,0 +1,631 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ MUL a2, c12, t3 ++ MUL a2, c16, t4 + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 -+#define I $21 -+#define XX $23 -+#define YY $24 ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ MUL a3, c12, t3 ++ MUL a3, c16, t4 + -+#define C $f10 -+#define S $f11 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+#define PREFETCH_SIZE 80 ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ MUL a4, c12, t3 ++ MUL a4, c16, t4 + -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) + -+ fmov $f21, C -+ LD S, 0($sp) ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ MUL b1, c11, c11 ++ MUL b1, c15, c15 + -+ addl INCX, INCX, INCX -+ addl INCY, INCY, INCY ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ MUL b2, c11, t3 ++ MUL b2, c15, t4 + -+ cmpeq INCX, 2, $23 -+ cmpeq INCY, 2, $24 -+ ble N, $L998 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+ and $23, $24, $23 -+ beq $23, $L50 ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ MUL b3, c11, t3 ++ MUL b3, c15, t4 + -+ sra N, 2, I -+ ble I, $L15 ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ LD $f16, 2*SIZE(X) -+ LD $f17, 2*SIZE(Y) -+ LD $f18, 3*SIZE(X) -+ LD $f19, 3*SIZE(Y) ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 + -+ MUL C, $f12, $f21 -+ unop -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 + -+ LD $f13, 4*SIZE(Y) -+ MUL S, $f12, $f24 -+ LD $f12, 4*SIZE(X) -+ MUL C, $f14, $f25 ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+ ldi I, -1(I) -+ MUL S, $f15, $f26 -+ ADD $f21, $f22, $f22 -+ MUL C, $f15, $f27 ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif + -+ LD $f15, 5*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f24 -+ ble I, $L13 -+ .align 4 ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+$L12: -+ MUL C, $f16, $f21 -+ fillcs (PREFETCH_SIZE) * SIZE(X) -+ unop -+ LD $f14, 5*SIZE(X) ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 + -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 + -+ MUL C, $f17, $f23 -+ fillcs (PREFETCH_SIZE) * SIZE(Y) -+ unop -+ LD $f17, 6*SIZE(Y) ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ MUL a3, c09, t3 ++ MUL a3, c13, t4 + -+ MUL C, $f18, $f25 -+ LD $f16, 6*SIZE(X) -+ unop -+ unop ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 + -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ MUL a4, c09, t3 ++ MUL a4, c13, t4 + -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 + -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f24 ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) + -+ MUL C, $f12, $f21 -+ LD $f18, 7*SIZE(X) -+ unop -+ unop ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ MUL b1, c10, c10 ++ MUL b1, c14, c14 + -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f26 ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ MUL b2, c10, t3 ++ MUL b2, c14, t4 + -+ MUL C, $f13, $f23 -+ LD $f13, 8*SIZE(Y) -+ unop -+ unop ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 + -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ MUL b3, c10, t3 ++ MUL b3, c14, t4 + -+ MUL C, $f14, $f25 -+ LD $f12, 8*SIZE(X) -+ unop -+ unop ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 + -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) + -+ MUL C, $f15, $f27 -+ LD $f15, 9*SIZE(Y) -+ unop -+ unop ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ MUL a1, c11, c11 ++ MUL a1, c15, c15 + -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f24 ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ MUL a2, c11, t3 ++ MUL a2, c15, t4 + -+ MUL C, $f16, $f21 -+ LD $f14, 9*SIZE(X) -+ unop -+ unop ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 + -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++ MUL a3, c12, c12 ++ MUL a3, c16, c16 ++#endif + -+ MUL C, $f17, $f23 -+ LD $f17, 10*SIZE(Y) -+ unop -+ unop ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + -+ MUL C, $f18, $f25 -+ LD $f16, 10*SIZE(X) -+ unop -+ unop ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 + -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ MUL C, $f19, $f27 -+ LD $f19, 11*SIZE(Y) -+ unop -+ unop ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 + -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ ldi I, -1(I) -+ SUB $f23, $f24, $f24 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + -+ MUL C, $f12, $f21 -+ LD $f18, 11*SIZE(X) -+ unop -+ unop ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ MUL a4, c03, t3 ++ MUL a4, c04, t4 + -+ ST $f22, 6*SIZE(X) -+ MUL S, $f13, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 + -+ MUL C, $f13, $f23 -+ LD $f13, 12*SIZE(Y) -+ ldi X, 8*SIZE(X) -+ unop ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) + -+ ST $f24, 6*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ MUL b1, c07, c07 ++ MUL b1, c08, c08 + -+ MUL C, $f14, $f25 -+ LD $f12, 4*SIZE(X) -+ ldi Y, 8*SIZE(Y) -+ unop ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ MUL b2, c07, t3 ++ MUL b2, c08, t4 + -+ ST $f26, -1*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + -+ MUL C, $f15, $f27 -+ LD $f15, 5*SIZE(Y) -+ unop -+ unop ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ MUL b3, c07, t3 ++ MUL b3, c08, t4 + -+ ST $f28, -1*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f24 -+ bgt I, $L12 -+ .align 4 ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 + -+$L13: -+ MUL C, $f16, $f21 -+ LD $f14, 5*SIZE(X) -+ unop -+ unop ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) + -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 + -+ MUL C, $f17, $f23 -+ unop -+ unop -+ LD $f17, 6*SIZE(Y) ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ MUL a2, c11, t3 ++ MUL a2, c12, t4 + -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ LD $f16, 6*SIZE(X) -+ SUB $f27, $f28, $f28 ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 + -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++ MUL a3, c15, c15 ++ MUL a3, c16, c16 ++#endif + -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) + -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ MUL a1, c15, c15 ++ MUL a1, c16, c16 + -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ LD $f18, 7*SIZE(X) -+ SUB $f23, $f24, $f24 ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ MUL a2, c15, t3 ++ MUL a2, c16, t4 + -+ MUL C, $f12, $f21 -+ unop -+ unop -+ unop ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f26 ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ MUL a3, c15, t3 ++ MUL a3, c16, t4 + -+ MUL C, $f13, $f23 -+ unop -+ unop -+ unop ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ MUL a4, c15, t3 ++ MUL a4, c16, t4 + -+ MUL C, $f14, $f25 -+ unop -+ unop -+ unop ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) + -+ MUL C, $f15, $f27 -+ unop -+ unop -+ unop ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ MUL b1, c11, c11 ++ MUL b1, c12, c12 + -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f24 ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ MUL b2, c11, t3 ++ MUL b2, c12, t4 + -+ MUL C, $f16, $f21 -+ unop -+ unop -+ unop ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 + -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ MUL b3, c11, t3 ++ MUL b3, c12, t4 + -+ MUL C, $f17, $f23 -+ unop -+ unop -+ unop ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 + -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 + -+ MUL C, $f19, $f27 -+ unop -+ unop -+ unop ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f24 ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif + -+ ST $f22, 6*SIZE(X) -+ ADD $f25, $f26, $f26 -+ ST $f24, 6*SIZE(Y) -+ SUB $f27, $f28, $f28 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) + -+ ST $f26, 7*SIZE(X) -+ ldi X, 8*SIZE(X) -+ ST $f28, 7*SIZE(Y) -+ ldi Y, 8*SIZE(Y) -+ .align 4 ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) + ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) + -+$L15: -+ and N, 3, I -+ ble I, $L998 -+ .align 4 ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) + -+$L16: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + -+ ST $f22, 0*SIZE(X) -+ ST $f24, 0*SIZE(Y) -+ ldi I, -1(I) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) + -+ ST $f26, 1*SIZE(X) -+ ldi X, 2 * SIZE(X) -+ ST $f28, 1*SIZE(Y) -+ ldi Y, 2 * SIZE(Y) ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) + -+ bgt I, $L16 -+ .align 4 ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) + -+$L998: -+ clr $0 -+ ret -+ .align 4 ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif + -+$L50: -+ mov X, XX -+ mov Y, YY ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ sra N, 2, I -+ ble I, $L55 -+ .align 4 ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+$L51: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++#ifdef LT ++ addl KK, 4, KK ++#endif + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++#ifdef LN ++ subl KK, 4, KK ++#endif + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ ldi I, -1(I) + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ bgt I, $L11 ++ .align 4 + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++$L20: ++ and M, 2, I ++ ble I, $L30 + ++#if defined(LT) || defined(RN) + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++ ble L, $L25 + ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ subl K, KK, TMP1 + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 + ++ ble L, $L25 ++#endif ++ .align 4 + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) + -+ ldi I, -1(I) -+ bgt I, $L51 -+ .align 4 ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) + -+$L55: -+ and N, 3, I -+ ble I, $L999 -+ .align 4 ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) + -+$L56: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) + -+ ST $f22, 0*SIZE(X) -+ ST $f24, 0*SIZE(Y) -+ ldi I, -1(I) ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) + -+ ST $f26, 1*SIZE(X) -+ ST $f28, 1*SIZE(Y) -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) + -+ bgt I, $L56 ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 + .align 4 + -+$L999: -+ clr $0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/zrot_simd.S b/kernel/sw_64/zrot_simd.S -new file mode 100644 -index 0000000..9e00ebf ---- /dev/null -+++ b/kernel/sw_64/zrot_simd.S -@@ -0,0 +1,799 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++$L25: ++ ADD c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) + -+#define N $16 -+#define X $17 -+#define INCX $18 -+#define Y $19 -+#define INCY $20 -+#define I $21 -+#define XX $23 -+#define YY $24 ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop + -+#define C $f10 -+#define S $f11 -+ -+#define x0 $f12 -+#define x1 $f14 -+#define x2 $f16 -+#define x3 $f18 -+ -+#define y0 $f13 -+#define y1 $f15 -+#define y2 $f17 -+#define y3 $f19 -+ -+#define t0 $f20 -+#define t1 $f21 -+#define t2 $f22 -+#define t3 $f23 -+#define t4 $f24 -+#define t5 $f25 -+#define t6 $f26 -+#define t7 $f27 -+ -+#define PREFETCHSIZE 80 ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) + -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) + -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) + -+ fmov $f21, C -+ LD S, 0($sp) ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) + -+ addl INCX, INCX, INCX -+ addl INCY, INCY, INCY ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) + -+ cmpeq INCX, 2, $23 -+ cmpeq INCY, 2, $24 -+ ble N, $L998 ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 + -+ and $23, $24, $23 -+ beq $23, $L50 -+ -+/* test the address of X */ -+ and X, (VEC_LEN*SIZE-1), $3 -+ and Y, (VEC_LEN*SIZE-1), $4 -+ or $3, $4, $4 -+ bne $4, $UnAlign_ACCESS -+ -+/*Align Accessing*/ -+ sra N, 3, I -+ ble I, $Remain ++$L27: ++ ADD c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD c13, t3, c13 ++ MUL a1, b2, t3 + -+ vcpyf C, C -+ vcpyf S, S -+ -+ VLD x0, 0*VEC_LEN*SIZE(X) -+ VLD x1, 1*VEC_LEN*SIZE(X) -+ VLD x2, 2*VEC_LEN*SIZE(X) -+ VLD x3, 3*VEC_LEN*SIZE(X) ++ ADD c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 + -+ VLD y0, 0*VEC_LEN*SIZE(Y) -+ VLD y1, 1*VEC_LEN*SIZE(Y) -+ VLD y2, 2*VEC_LEN*SIZE(Y) -+ VLD y3, 3*VEC_LEN*SIZE(Y) ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD c05, t3, c05 ++ MUL a1, b4, t3 + -+ addl X, 16 * SIZE, X -+ addl Y, 16 * SIZE, Y -+ subl I, 1, I -+ ble I, $MainLoopEnd ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ ADD c10, t2, c10 ++ ADD c13, t3, c13 ++ ADD c14, t4, c14 + .align 4 -+ -+$MainLoop: -+ VMUL C, x0, t0 -+ fillcs (PREFETCHSIZE) * SIZE(X) -+ VMUL C, x1, t1 -+ fillcs (PREFETCHSIZE) * SIZE(Y) -+ -+ VMUL C, x2, t2 -+ subl I, 1, I -+ VMUL C, x3, t3 -+ nop + -+ VMUL S, x0, t4 -+ VLD x0, 0*VEC_LEN*SIZE(X) -+ VMUL S, x1, t5 -+ VLD x1, 1*VEC_LEN*SIZE(X) -+ -+ VMUL S, x2, t6 -+ VLD x2, 2*VEC_LEN*SIZE(X) -+ VMUL S, x3, t7 -+ VLD x3, 3*VEC_LEN*SIZE(X) -+ -+ VMAD S, y0, t0, t0 -+ VMAD S, y1, t1, t1 -+ VMAD S, y2, t2, t2 -+ VMAD S, y3, t3, t3 -+ -+ VMSUB C, y0, t4, t4 -+ VLD y0, 0*VEC_LEN*SIZE(Y) -+ VMSUB C, y1, t5, t5 -+ VLD y1, 1*VEC_LEN*SIZE(Y) -+ -+ VMSUB C, y2, t6, t6 -+ VLD y2, 2*VEC_LEN*SIZE(Y) -+ VMSUB C, y3, t7, t7 -+ VLD y3, 3*VEC_LEN*SIZE(Y) -+ -+ VST t0, -4*VEC_LEN*SIZE(X) -+ VST t1, -3*VEC_LEN*SIZE(X) -+ VST t2, -2*VEC_LEN*SIZE(X) -+ VST t3, -1*VEC_LEN*SIZE(X) -+ -+ VST t4, -4*VEC_LEN*SIZE(Y) -+ VST t5, -3*VEC_LEN*SIZE(Y) -+ VST t6, -2*VEC_LEN*SIZE(Y) -+ VST t7, -1*VEC_LEN*SIZE(Y) ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif + -+ addl X, 16 * SIZE, X -+ addl Y, 16 * SIZE, Y -+ nop -+ bgt I, $MainLoop -+ .align 4 -+$MainLoopEnd: -+ VMUL C, x0, t0 -+ VMUL C, x1, t1 -+ VMUL C, x2, t2 -+ VMUL C, x3, t3 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ VMUL S, x0, t4 -+ VMUL S, x1, t5 -+ VMUL S, x2, t6 -+ VMUL S, x3, t7 ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) + -+ VMAD S, y0, t0, t0 -+ VMAD S, y1, t1, t1 -+ VMAD S, y2, t2, t2 -+ VMAD S, y3, t3, t3 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 + -+ VMSUB C, y0, t4, t4 -+ VMSUB C, y1, t5, t5 -+ VMSUB C, y2, t6, t6 -+ VMSUB C, y3, t7, t7 ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 + -+ VST t0, -4*VEC_LEN*SIZE(X) -+ VST t1, -3*VEC_LEN*SIZE(X) -+ VST t2, -2*VEC_LEN*SIZE(X) -+ VST t3, -1*VEC_LEN*SIZE(X) ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ VST t4, -4*VEC_LEN*SIZE(Y) -+ VST t5, -3*VEC_LEN*SIZE(Y) -+ VST t6, -2*VEC_LEN*SIZE(Y) -+ VST t7, -1*VEC_LEN*SIZE(Y) ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) + -+ .align 4 -+$Remain: -+ and N, 7, I -+ ble I, $End -+$RemainLoop: -+ LD x0, 0*SIZE(X) -+ LD y0, 0*SIZE(Y) -+ LD x1, 1*SIZE(X) -+ LD y1, 1*SIZE(Y) -+ -+ MUL C, x0, t0 -+ MUL S, x0, t4 -+ MAD S, y0, t0, t0 -+ MSUB C, y0, t4, t4 -+ -+ MUL C, x1, t1 -+ ldi I, -1(I) -+ MUL S, x1, t5 -+ ldi X, 2 * SIZE(X) -+ -+ MAD S, y1, t1, t1 -+ ldi Y, 2 * SIZE(Y) -+ MSUB C, y1, t5, t5 -+ nop ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 + -+ ST t0, -2*SIZE(X) -+ ST t1, -1*SIZE(X) -+ ST t4, -2*SIZE(Y) -+ ST t5, -1*SIZE(Y) -+ -+ bgt I, $RemainLoop -+ .align 4 -+$End: -+ clr $0 -+ ret -+ .align 4 ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c13, c13 ++ SUB b4, c14, c14 ++#endif + -+$UnAlign_ACCESS: -+ sra N, 2, I -+ ble I, $L15 ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 + -+ LD $f16, 2*SIZE(X) -+ LD $f17, 2*SIZE(Y) -+ LD $f18, 3*SIZE(X) -+ LD $f19, 3*SIZE(Y) ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 + -+ MUL C, $f12, $f21 -+ unop -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 + -+ LD $f13, 4*SIZE(Y) -+ MUL S, $f12, $f24 -+ LD $f12, 4*SIZE(X) -+ MUL C, $f14, $f25 ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 + -+ ldi I, -1(I) -+ MUL S, $f15, $f26 -+ ADD $f21, $f22, $f22 -+ MUL C, $f15, $f27 ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 + -+ LD $f15, 5*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f24 -+ ble I, $L13 -+ .align 4 ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 + -+$L12: -+ MUL C, $f16, $f21 -+ fillcs (PREFETCHSIZE) * SIZE(X) -+ unop -+ LD $f14, 5*SIZE(X) ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++ MUL a3, c10, c10 ++ MUL a3, c14, c14 ++#endif + -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ MUL C, $f17, $f23 -+ fillcs (PREFETCHSIZE) * SIZE(Y) -+ unop -+ LD $f17, 6*SIZE(Y) ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 + -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 + -+ MUL C, $f18, $f25 -+ LD $f16, 6*SIZE(X) -+ unop -+ unop ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 + -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 + -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 + -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f24 ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 + -+ MUL C, $f12, $f21 -+ LD $f18, 7*SIZE(X) -+ unop -+ unop ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 + -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f26 ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) + -+ MUL C, $f13, $f23 -+ LD $f13, 8*SIZE(Y) -+ unop -+ unop ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 + -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 + -+ MUL C, $f14, $f25 -+ LD $f12, 8*SIZE(X) -+ unop -+ unop ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 + -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 + -+ MUL C, $f15, $f27 -+ LD $f15, 9*SIZE(Y) -+ unop -+ unop ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 + -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f24 ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) + -+ MUL C, $f16, $f21 -+ LD $f14, 9*SIZE(X) -+ unop -+ unop ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 + -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 + -+ MUL C, $f17, $f23 -+ LD $f17, 10*SIZE(Y) -+ unop -+ unop ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 + -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++#endif + -+ MUL C, $f18, $f25 -+ LD $f16, 10*SIZE(X) -+ unop -+ unop ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) + -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 + -+ MUL C, $f19, $f27 -+ LD $f19, 11*SIZE(Y) -+ unop -+ unop ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 + -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ ldi I, -1(I) -+ SUB $f23, $f24, $f24 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 + -+ MUL C, $f12, $f21 -+ LD $f18, 11*SIZE(X) -+ unop -+ unop ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 + -+ ST $f22, 6*SIZE(X) -+ MUL S, $f13, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 + -+ MUL C, $f13, $f23 -+ LD $f13, 12*SIZE(Y) -+ ldi X, 8*SIZE(X) -+ unop ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 + -+ ST $f24, 6*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 + -+ MUL C, $f14, $f25 -+ LD $f12, 4*SIZE(X) -+ ldi Y, 8*SIZE(Y) -+ unop ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) + -+ ST $f26, -1*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 + -+ MUL C, $f15, $f27 -+ LD $f15, 5*SIZE(Y) -+ unop -+ unop ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 + -+ ST $f28, -1*SIZE(Y) -+ MUL S, $f14, $f28 -+ SUB $f23, $f24, $f24 -+ bgt I, $L12 -+ .align 4 ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 + -+$L13: -+ MUL C, $f16, $f21 -+ LD $f14, 5*SIZE(X) -+ unop -+ unop ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 + -+ ST $f22, 0*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 + -+ MUL C, $f17, $f23 -+ unop -+ unop -+ LD $f17, 6*SIZE(Y) ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ ST $f24, 0*SIZE(Y) -+ MUL S, $f16, $f24 -+ LD $f16, 6*SIZE(X) -+ SUB $f27, $f28, $f28 ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 + -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 + -+ ST $f26, 1*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 + -+ MUL C, $f19, $f27 -+ unop -+ unop -+ LD $f19, 7*SIZE(Y) ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif + -+ ST $f28, 1*SIZE(Y) -+ MUL S, $f18, $f28 -+ LD $f18, 7*SIZE(X) -+ SUB $f23, $f24, $f24 ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) + -+ MUL C, $f12, $f21 -+ unop -+ unop -+ unop ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) + -+ ST $f22, 2*SIZE(X) -+ unop -+ MUL S, $f13, $f22 -+ ADD $f25, $f26, $f26 ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif + -+ MUL C, $f13, $f23 -+ unop -+ unop -+ unop ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif + -+ ST $f24, 2*SIZE(Y) -+ MUL S, $f12, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) + -+ MUL C, $f14, $f25 -+ unop -+ unop -+ unop ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) + -+ ST $f26, 3*SIZE(X) -+ MUL S, $f15, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif + -+ MUL C, $f15, $f27 -+ unop -+ unop -+ unop ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 + -+ ST $f28, 3*SIZE(Y) -+ MUL S, $f14, $f28 -+ unop -+ SUB $f23, $f24, $f24 ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ MUL C, $f16, $f21 -+ unop -+ unop -+ unop ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ST $f22, 4*SIZE(X) -+ MUL S, $f17, $f22 -+ unop -+ ADD $f25, $f26, $f26 ++#ifdef LT ++ addl KK, 2, KK ++#endif + -+ MUL C, $f17, $f23 -+ unop -+ unop -+ unop ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 + -+ ST $f24, 4*SIZE(Y) -+ MUL S, $f16, $f24 -+ unop -+ SUB $f27, $f28, $f28 ++$L30: ++ and M, 1, I ++ ble I, $L39 + -+ MUL C, $f18, $f25 -+ unop -+ unop -+ unop ++#if defined(LT) || defined(RN) + -+ ST $f26, 5*SIZE(X) -+ MUL S, $f19, $f26 -+ unop -+ ADD $f21, $f22, $f22 ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ MUL C, $f19, $f27 -+ unop -+ unop -+ unop ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) + -+ ST $f28, 5*SIZE(Y) -+ MUL S, $f18, $f28 -+ unop -+ SUB $f23, $f24, $f24 ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 + -+ ST $f22, 6*SIZE(X) -+ ADD $f25, $f26, $f26 -+ ST $f24, 6*SIZE(Y) -+ SUB $f27, $f28, $f28 ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 + -+ ST $f26, 7*SIZE(X) -+ ldi X, 8*SIZE(X) -+ ST $f28, 7*SIZE(Y) -+ ldi Y, 8*SIZE(Y) -+ .align 4 ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif + ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO + -+$L15: -+ and N, 3, I -+ ble I, $L998 -+ .align 4 ++ subl K, KK, TMP1 + -+$L16: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ ble L, $L35 ++#endif ++ .align 4 + -+ ST $f22, 0*SIZE(X) -+ ST $f24, 0*SIZE(Y) -+ ldi I, -1(I) ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) + -+ ST $f26, 1*SIZE(X) -+ ldi X, 2 * SIZE(X) -+ ST $f28, 1*SIZE(Y) -+ ldi Y, 2 * SIZE(Y) ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) + -+ bgt I, $L16 -+ .align 4 ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) + -+$L998: -+ clr $0 -+ ret -+ .align 4 ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) + -+$L50: -+ mov X, XX -+ mov Y, YY ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) + -+ sra N, 2, I -+ ble I, $L55 ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 + .align 4 + -+$L51: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++$L35: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++$L37: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 + ++ ADD c13, t4, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, t4 ++ ldi BO, 4 * SIZE(BO) + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ADD c09, t3, c09 ++ ADD c13, t4, c13 + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#endif + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) + ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++#endif + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c01, t1 ++ SUB c09, t1, c09 ++ MUL a4, c01, t1 ++ SUB c13, t1, c13 + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ MUL b1, c05, c05 ++ MUL b2, c05, t1 ++ SUB c09, t1, c09 ++ MUL b3, c05, t1 ++ SUB c13, t1, c13 + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++ MUL a1, c09, c09 ++ MUL a2, c09, t1 ++ SUB c13, t1, c13 ++ MUL a3, c13, c13 ++#endif + ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) + -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ SXADDQ INCX, X, X -+ LD $f15, 1*SIZE(Y) -+ SXADDQ INCY, Y, Y ++ MUL a1, c13, c13 ++ MUL a2, c13, t1 ++ SUB c09, t1, c09 ++ MUL a3, c13, t1 ++ SUB c05, t1, c05 ++ MUL a4, c13, t1 ++ SUB c01, t1, c01 + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++ MUL b1, c09, c09 ++ MUL b2, c09, t1 ++ SUB c05, t1, c05 ++ MUL b3, c09, t1 ++ SUB c01, t1, c01 + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif + -+ ST $f22, 0*SIZE(XX) -+ ST $f24, 0*SIZE(YY) -+ ST $f26, 1*SIZE(XX) -+ SXADDQ INCX, XX, XX -+ ST $f28, 1*SIZE(YY) -+ SXADDQ INCY, YY, YY ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif + -+ ldi I, -1(I) -+ bgt I, $L51 -+ .align 4 ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif + -+$L55: -+ and N, 3, I -+ ble I, $L999 -+ .align 4 ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) + -+$L56: -+ LD $f12, 0*SIZE(X) -+ LD $f13, 0*SIZE(Y) -+ LD $f14, 1*SIZE(X) -+ LD $f15, 1*SIZE(Y) ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif + -+ MUL C, $f12, $f21 -+ MUL S, $f13, $f22 -+ MUL C, $f13, $f23 -+ MUL S, $f12, $f24 ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif + -+ ADD $f21, $f22, $f22 -+ SUB $f23, $f24, $f24 ++#ifdef LT ++ addl KK, 1, KK ++#endif + -+ MUL C, $f14, $f25 -+ MUL S, $f15, $f26 -+ MUL C, $f15, $f27 -+ MUL S, $f14, $f28 ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 + -+ ADD $f25, $f26, $f26 -+ SUB $f27, $f28, $f28 ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif + -+ ST $f22, 0*SIZE(X) -+ ST $f24, 0*SIZE(Y) -+ ldi I, -1(I) ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif + -+ ST $f26, 1*SIZE(X) -+ ST $f28, 1*SIZE(Y) -+ SXADDQ INCX, X, X -+ SXADDQ INCY, Y, Y ++#ifdef RN ++ addl KK, 4, KK ++#endif + -+ bgt I, $L56 ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 + .align 4 + +$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) + clr $0 ++ ldi $sp, STACKSIZE($sp) + ret + EPILOGUE -diff --git a/kernel/sw_64/zscal.S b/kernel/sw_64/zscal.S +diff --git a/kernel/sw_64/zamax.S b/kernel/sw_64/zamax.S new file mode 100644 -index 0000000..9589624 +index 000000000..46674dabd --- /dev/null -+++ b/kernel/sw_64/zscal.S -@@ -0,0 +1,255 @@ ++++ b/kernel/sw_64/zamax.S +@@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -83537,227 +25034,273 @@ index 0000000..9589624 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#define PREFETCHSIZE 88 + +#define N $16 -+#define X $21 -+#define INCX $17 ++#define X $17 ++#define INCX $18 + -+#define XX $18 -+#define I $19 ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif + -+#define ALPHA_R $f19 -+#define ALPHA_I $f20 ++#define STACKSIZE 8 * 8 + -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 + -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f21 ++ ldi $sp, -STACKSIZE($sp) + -+#define t0 $f22 -+#define t1 $f23 -+#define t2 $f24 -+#define t3 $f25 ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 + -+#define t4 $f26 -+#define t5 $f27 -+#define t6 $f28 -+#define t7 $f29 ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop + -+ PROLOGUE -+ PROFCODE ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop + -+ ldl INCX, 0($sp) -+ mov X, XX -+ ble N, $L999 ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $0 ++ unop ++ ++ fstd $f6, 32($sp) ++ unop ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ fclr $f0 ++ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 + ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ sra N, 2, $1 + addl INCX, INCX, INCX + -+ sra N, 2, I -+ ble I, $L15 ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ faddd $f20, $f21, $f0 ++ ble $1, $L15 ++ .align 4 + -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a2, 0 * SIZE(X) -+ LD a3, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a4, 0 * SIZE(X) -+ LD a5, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a6, 0 * SIZE(X) -+ LD a7, 1 * SIZE(X) -+ SXADDQ INCX, X, X ++ ldi $1, -1($1) ++ unop ++ addl X, INCX, X ++ unop + -+ MUL a0, ALPHA_R, t0 -+ MUL a1, ALPHA_I, t1 -+ MUL a0, ALPHA_I, t2 -+ MUL a1, ALPHA_R, t3 ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f1 ++ LD $f23, 1 * SIZE(X) ++ addl X, INCX, X + -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 ++ LD $f24, 0 * SIZE(X) ++ fmov $f0, $f2 ++ LD $f25, 1 * SIZE(X) ++ addl X, INCX, X + -+ ldi I, -1(I) -+ ble I, $L13 ++ LD $f26, 0 * SIZE(X) ++ fmov $f0, $f3 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f8 ++ fabs $f21, $f9 ++ fabs $f22, $f10 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ fabs $f25, $f13 ++ fabs $f26, $f14 ++ fabs $f27, $f15 ++ ++ ble $1, $L14 ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ ldi $1, -1($1) ++ addl X, INCX, X ++ ++ LD $f22, 0 * SIZE(X) ++ LD $f23, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ LD $f25, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ble $1, $L13 + .align 4 + +$L12: -+ ST t4, 0 * SIZE(XX) -+ MUL a2, ALPHA_R, t0 -+ ST t5, 1 * SIZE(XX) -+ MUL a3, ALPHA_I, t1 ++ faddd $f8, $f9, $f16 ++ unop ++ fabs $f20, $f8 ++ s_fillcs 64 * SIZE(X) ++ ++ faddd $f10, $f11, $f17 ++ unop ++ fabs $f21, $f9 ++ LD $f20, 0 * SIZE(X) ++ ++ faddd $f12, $f13, $f18 ++ LD $f21, 1 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ faddd $f14, $f15, $f19 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ unop + -+ MUL a2, ALPHA_I, t2 -+ LD a0, 0 * SIZE(X) -+ MUL a3, ALPHA_R, t3 -+ LD a1, 1 * SIZE(X) ++ CMPLT($f0, $f16), $f4 ++ LD $f23, 1 * SIZE(X) ++ fabs $f24, $f12 ++ addl X, INCX, X + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 -+ SXADDQ INCX, X, X ++ CMPLT($f1, $f17), $f5 ++ LD $f24, 0 * SIZE(X) ++ fabs $f25, $f13 ++ unop + -+ MUL a4, ALPHA_R, t0 -+ ST t6, 0 * SIZE(XX) -+ MUL a5, ALPHA_I, t1 -+ ST t7, 1 * SIZE(XX) ++ CMPLT($f2, $f18), $f6 ++ LD $f25, 1 * SIZE(X) ++ fabs $f26, $f14 ++ addl X, INCX, X + -+ MUL a4, ALPHA_I, t2 -+ LD a2, 0 * SIZE(X) -+ MUL a5, ALPHA_R, t3 -+ LD a3, 1 * SIZE(X) ++ CMPLT($f3, $f19), $f7 ++ LD $f26, 0 * SIZE(X) ++ fabs $f27, $f15 ++ unop + -+ SUB t0, t1, t4 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t5 -+ SXADDQ INCX, X, X ++ fselne $f4, $f16, $f0, $f0 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ldi $1, -1($1) # i -- + -+ MUL a6, ALPHA_R, t0 -+ ST t4, 0 * SIZE(XX) -+ MUL a7, ALPHA_I, t1 -+ ST t5, 1 * SIZE(XX) ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ bgt $1,$L12 ++ .align 4 + -+ MUL a6, ALPHA_I, t2 -+ LD a4, 0 * SIZE(X) -+ MUL a7, ALPHA_R, t3 -+ LD a5, 1 * SIZE(X) ++$L13: ++ faddd $f8, $f9, $f16 ++ fabs $f20, $f8 + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 -+ SXADDQ INCX, X, X ++ faddd $f10, $f11, $f17 ++ fabs $f21, $f9 + -+ MUL a0, ALPHA_R, t0 -+ ST t6, 0 * SIZE(XX) -+ MUL a1, ALPHA_I, t1 -+ ST t7, 1 * SIZE(XX) ++ faddd $f12, $f13, $f18 ++ fabs $f22, $f10 + -+ MUL a0, ALPHA_I, t2 -+ LD a6, 0 * SIZE(X) -+ MUL a1, ALPHA_R, t3 -+ LD a7, 1 * SIZE(X) ++ faddd $f14, $f15, $f19 ++ fabs $f23, $f11 + -+ SUB t0, t1, t4 -+ ldi I, -1(I) -+ ADD t2, t3, t5 -+ SXADDQ INCX, XX, XX ++ CMPLT($f0, $f16), $f4 ++ fabs $f24, $f12 + -+ fillcs PREFETCHSIZE * SIZE(X) -+ unop -+ SXADDQ INCX, X, X -+ bne I, $L12 -+ .align 4 ++ CMPLT($f1, $f17), $f5 ++ fabs $f25, $f13 + -+$L13: -+ MUL a2, ALPHA_R, t0 -+ MUL a3, ALPHA_I, t1 -+ ST t4, 0 * SIZE(XX) -+ MUL a2, ALPHA_I, t2 -+ ST t5, 1 * SIZE(XX) -+ MUL a3, ALPHA_R, t3 ++ CMPLT($f2, $f18), $f6 ++ fabs $f26, $f14 ++ CMPLT($f3, $f19), $f7 ++ fabs $f27, $f15 + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 -+ unop ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ .align 4 + -+ ST t6, 0 * SIZE(XX) -+ MUL a4, ALPHA_R, t0 -+ ST t7, 1 * SIZE(XX) -+ MUL a5, ALPHA_I, t1 -+ MUL a4, ALPHA_I, t2 -+ MUL a5, ALPHA_R, t3 ++$L14: ++ faddd $f8, $f9, $f16 ++ faddd $f10, $f11, $f17 ++ faddd $f12, $f13, $f18 ++ faddd $f14, $f15, $f19 + -+ SUB t0, t1, t4 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t5 -+ unop ++ CMPLT($f0, $f16), $f4 ++ CMPLT($f1, $f17), $f5 ++ CMPLT($f2, $f18), $f6 ++ CMPLT($f3, $f19), $f7 + -+ MUL a6, ALPHA_R, t0 -+ ST t4, 0 * SIZE(XX) -+ MUL a7, ALPHA_I, t1 -+ ST t5, 1 * SIZE(XX) ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 + -+ MUL a6, ALPHA_I, t2 -+ MUL a7, ALPHA_R, t3 ++ CMPLT($f0, $f1), $f16 ++ CMPLT($f2, $f3), $f17 + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 + -+ ST t6, 0 * SIZE(XX) -+ ST t7, 1 * SIZE(XX) -+ SXADDQ INCX, XX, XX ++ CMPLT($f0, $f2), $f16 ++ fselne $f16, $f2, $f0, $f0 + .align 4 + +$L15: -+ and N, 3, I ++ and N, 3, $1 + unop + unop -+ ble I, $L999 ++ ble $1, $End + .align 4 + -+$L17: -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ MUL a0, ALPHA_R, t0 -+ MUL a1, ALPHA_I, t1 -+ MUL a0, ALPHA_I, t2 -+ MUL a1, ALPHA_R, t3 ++$L16: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ unop ++ addl X, INCX, X + -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ faddd $f29, $f30, $f29 + -+ ST t4, 0 * SIZE(XX) -+ ST t5, 1 * SIZE(XX) -+ SXADDQ INCX, XX, XX ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 + -+ ldi I, -1(I) -+ bne I, $L17 ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 + .align 4 + -+$L999: ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldi $sp, STACKSIZE($sp) + ret ++ + EPILOGUE -diff --git a/kernel/sw_64/zscal.S.bak b/kernel/sw_64/zscal.S.bak +diff --git a/kernel/sw_64/zasum.S b/kernel/sw_64/zasum.S new file mode 100644 -index 0000000..4525b56 +index 000000000..9a3260544 --- /dev/null -+++ b/kernel/sw_64/zscal.S.bak -@@ -0,0 +1,443 @@ ++++ b/kernel/sw_64/zasum.S +@@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -83798,20 +25341,15 @@ index 0000000..4525b56 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" ++ + +#define PREFETCHSIZE 88 + +#define N $16 -+#define X $21 -+#define INCX $17 -+ -+#define XX $18 ++#define X $17 ++#define INCX $18 +#define I $19 + -+#define ALPHA_R $f19 -+#define ALPHA_I $f20 -+ +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 @@ -83824,389 +25362,159 @@ index 0000000..4525b56 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 -+#define a7 $f21 -+ -+#define t0 $f22 -+#define t1 $f23 -+#define t2 $f24 -+#define t3 $f25 ++#define a7 $f19 + -+#define t4 $f26 -+#define t5 $f27 -+#define t6 $f28 -+#define t7 $f29 ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 + + PROLOGUE + PROFCODE -+ .frame $sp, 0, $26, 0 -+ -+ ldl INCX, 0($sp) -+ mov X, XX -+ cmpeq INCX, 1, $0 -+ ble N, $L999 -+ -+ beq $0, $Sub -+ nop -+ -+/* -+ unloop 4 (4*2=8) -+*/ -+ sra N, 2, I -+ ble I, $Remain -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) -+ -+ LD a4, 4 * SIZE(X) -+ LD a5, 5 * SIZE(X) -+ -+ LD a6, 6 * SIZE(X) -+ LD a7, 7 * SIZE(X) -+ -+ -+ MUL a0, ALPHA_R, t0 -+ MUL a0, ALPHA_I, t2 -+ -+ NMAD a1, ALPHA_I, t0, t4 -+ MAD a1, ALPHA_R, t2, t5 -+/* -+ MUL a1, ALPHA_I, t1 -+ MUL a1, ALPHA_R, t3 -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 -+*/ -+ ldi I, -1(I) -+ addl X, 8*SIZE, X -+ -+ ble I, $MainLoopEnd -+ .align 4 -+ -+$MainLoop: -+ MUL a2, ALPHA_R, t0 -+ ST t4, -8 * SIZE(X) -+ MUL a2, ALPHA_I, t2 -+ ST t5, -7 * SIZE(X) -+ -+ -+ NMAD a3, ALPHA_I, t0, t6 -+ LD a0, 0 * SIZE(X) -+ MAD a3, ALPHA_R, t2, t7 -+ LD a1, 1 * SIZE(X) -+ -+ ST t6, -6 * SIZE(X) -+ MUL a4, ALPHA_R, t0 -+ ST t7, -5 * SIZE(X) -+ MUL a4, ALPHA_I, t2 -+ -+ -+ NMAD a5, ALPHA_I, t0, t4 -+ LD a2, 2 * SIZE(X) -+ MAD a5, ALPHA_R, t2, t5 -+ LD a3, 3 * SIZE(X) -+/* -+ MUL a5, ALPHA_I, t1 -+ MUL a5, ALPHA_R, t3 -+ -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 -+*/ -+ -+ MUL a6, ALPHA_R, t0 -+ ST t4, -4 * SIZE(X) -+ MUL a6, ALPHA_I, t2 -+ ST t5, -3 * SIZE(X) -+ -+ NMAD a7, ALPHA_I, t0, t6 -+ LD a4, 4 * SIZE(X) -+ MAD a7, ALPHA_R, t2, t7 -+ LD a5, 5 * SIZE(X) -+/* -+ -+ MUL a7, ALPHA_I, t1 -+ MUL a7, ALPHA_R, t3 -+ -+ SUB t0, t1, t6 -+ ADD t2, t3, t7 -+*/ -+ MUL a0, ALPHA_R, t0 -+ ST t6, -2 * SIZE(X) -+ MUL a0, ALPHA_I, t2 -+ ST t7, -1 * SIZE(X) -+ -+ NMAD a1, ALPHA_I, t0, t4 -+ LD a6, 6 * SIZE(X) -+ MAD a1, ALPHA_R, t2, t5 -+ LD a7, 7 * SIZE(X) -+ + -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ subl I, 1, I -+ addl X, 8*SIZE, X -+ bgt I, $MainLoop -+ .align 4 -+ -+$MainLoopEnd: -+ MUL a2, ALPHA_R, t0 -+ ST t4, -8 * SIZE(X) -+ MUL a2, ALPHA_I, t2 -+ ST t5, -7 * SIZE(X) -+ -+ -+ NMAD a3, ALPHA_I, t0, t6 -+ MAD a3, ALPHA_R, t2, t7 -+ -+ -+ ST t6, -6 * SIZE(X) -+ MUL a4, ALPHA_R, t0 -+ ST t7, -5 * SIZE(X) -+ MUL a4, ALPHA_I, t2 -+ -+ -+ NMAD a5, ALPHA_I, t0, t4 -+ MAD a5, ALPHA_R, t2, t5 -+/* -+ MUL a5, ALPHA_I, t1 -+ MUL a5, ALPHA_R, t3 -+ -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 -+*/ -+ -+ MUL a6, ALPHA_R, t0 -+ ST t4, -4 * SIZE(X) -+ MUL a6, ALPHA_I, t2 -+ ST t5, -3 * SIZE(X) -+ -+ NMAD a7, ALPHA_I, t0, t6 -+ MAD a7, ALPHA_R, t2, t7 -+/* -+ -+ MUL a7, ALPHA_I, t1 -+ MUL a7, ALPHA_R, t3 -+ -+ SUB t0, t1, t6 -+ ADD t2, t3, t7 -+*/ -+ ST t6, -2 * SIZE(X) -+ ST t7, -1 * SIZE(X) -+ -+ .align 4 -+$Remain: -+ and N, 3, I -+ unop ++ fclr s0 + unop -+ ble I, $L999 -+ .align 4 -+ -+$RemainLoop: -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ -+ -+ MUL a0, ALPHA_R, t0 -+ MUL a0, ALPHA_I, t2 -+ -+ NMAD a1, ALPHA_I, t0, t4 -+ MAD a1, ALPHA_R, t2, t5 -+ -+/* -+ MUL a1, ALPHA_I, t1 -+ MUL a1, ALPHA_R, t3 -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 -+*/ -+ ST t4, 0 * SIZE(X) -+ ST t5, 1 * SIZE(X) -+ -+ addl X, 2*SIZE, X -+ ldi I, -1(I) -+ bne I, $RemainLoop -+ nop -+ -+ ret -+ .align 4 -+ -+$Sub: ++ fclr t0 + addl INCX, INCX, INCX + ++ fclr s1 ++ unop ++ fclr t1 ++ ble N, $L999 ++ ++ fclr s2 + sra N, 2, I ++ fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) ++ fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X ++ + LD a2, 0 * SIZE(X) ++ fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X ++ + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X -+ LD a6, 0 * SIZE(X) -+ LD a7, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ MUL a0, ALPHA_R, t0 -+ MUL a1, ALPHA_I, t1 -+ MUL a0, ALPHA_I, t2 -+ MUL a1, ALPHA_R, t3 -+ -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 -+ + ldi I, -1(I) ++ + ble I, $L13 + .align 4 + +$L12: -+ ST t4, 0 * SIZE(XX) -+ MUL a2, ALPHA_R, t0 -+ ST t5, 1 * SIZE(XX) -+ MUL a3, ALPHA_I, t1 ++ ADD s0, t0, s0 ++ s_fillcs PREFETCHSIZE * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) + -+ MUL a2, ALPHA_I, t2 -+ LD a0, 0 * SIZE(X) -+ MUL a3, ALPHA_R, t3 -+ LD a1, 1 * SIZE(X) ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fabs a1, t1 ++ unop + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 ++ ADD s2, t2, s2 ++ LD a7, 1 * SIZE(X) ++ fabs a2, t2 + SXADDQ INCX, X, X + -+ MUL a4, ALPHA_R, t0 -+ ST t6, 0 * SIZE(XX) -+ MUL a5, ALPHA_I, t1 -+ ST t7, 1 * SIZE(XX) -+ -+ MUL a4, ALPHA_I, t2 -+ LD a2, 0 * SIZE(X) -+ MUL a5, ALPHA_R, t3 -+ LD a3, 1 * SIZE(X) ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fabs a3, t3 ++ unop + -+ SUB t0, t1, t4 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t5 ++ ADD s0, t0, s0 ++ LD a1, 1 * SIZE(X) ++ fabs a4, t0 + SXADDQ INCX, X, X + -+ MUL a6, ALPHA_R, t0 -+ ST t4, 0 * SIZE(XX) -+ MUL a7, ALPHA_I, t1 -+ ST t5, 1 * SIZE(XX) -+ -+ MUL a6, ALPHA_I, t2 -+ LD a4, 0 * SIZE(X) -+ MUL a7, ALPHA_R, t3 -+ LD a5, 1 * SIZE(X) ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fabs a5, t1 ++ unop + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 ++ ADD s2, t2, s2 ++ LD a3, 1 * SIZE(X) ++ fabs a6, t2 + SXADDQ INCX, X, X + -+ MUL a0, ALPHA_R, t0 -+ ST t6, 0 * SIZE(XX) -+ MUL a1, ALPHA_I, t1 -+ ST t7, 1 * SIZE(XX) -+ -+ MUL a0, ALPHA_I, t2 -+ LD a6, 0 * SIZE(X) -+ MUL a1, ALPHA_R, t3 -+ LD a7, 1 * SIZE(X) -+ -+ SUB t0, t1, t4 -+ ldi I, -1(I) -+ ADD t2, t3, t5 -+ SXADDQ INCX, XX, XX ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fabs a7, t3 ++ unop + -+ fillcs PREFETCHSIZE * SIZE(X) ++ LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: -+ MUL a2, ALPHA_R, t0 -+ MUL a3, ALPHA_I, t1 -+ ST t4, 0 * SIZE(XX) -+ MUL a2, ALPHA_I, t2 -+ ST t5, 1 * SIZE(XX) -+ MUL a3, ALPHA_R, t3 -+ -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 -+ unop -+ -+ ST t6, 0 * SIZE(XX) -+ MUL a4, ALPHA_R, t0 -+ ST t7, 1 * SIZE(XX) -+ MUL a5, ALPHA_I, t1 -+ MUL a4, ALPHA_I, t2 -+ MUL a5, ALPHA_R, t3 ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fabs a0, t0 + -+ SUB t0, t1, t4 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t5 -+ unop ++ ADD s1, t1, s1 ++ LD a7, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X + -+ MUL a6, ALPHA_R, t0 -+ ST t4, 0 * SIZE(XX) -+ MUL a7, ALPHA_I, t1 -+ ST t5, 1 * SIZE(XX) ++ ADD s2, t2, s2 ++ fabs a2, t2 ++ ADD s3, t3, s3 ++ fabs a3, t3 + -+ MUL a6, ALPHA_I, t2 -+ MUL a7, ALPHA_R, t3 ++ ADD s0, t0, s0 ++ fabs a4, t0 ++ ADD s1, t1, s1 ++ fabs a5, t1 ++ ADD s2, t2, s2 ++ fabs a6, t2 ++ ADD s3, t3, s3 ++ fabs a7, t3 + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 + -+ ST t6, 0 * SIZE(XX) -+ ST t7, 1 * SIZE(XX) -+ SXADDQ INCX, XX, XX + .align 4 + +$L15: ++ ADD s0, s2, s0 + and N, 3, I -+ unop -+ unop ++ ADD s1, s3, s1 + ble I, $L999 + .align 4 + +$L17: ++ ADD s0, t0, s0 + LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ MUL a0, ALPHA_R, t0 -+ MUL a1, ALPHA_I, t1 -+ MUL a0, ALPHA_I, t2 -+ MUL a1, ALPHA_R, t3 -+ -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 ++ fabs a0, t0 ++ ldi I, -1(I) + -+ ST t4, 0 * SIZE(XX) -+ ST t5, 1 * SIZE(XX) -+ SXADDQ INCX, XX, XX ++ ADD s1, t1, s1 ++ LD a1, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X + -+ ldi I, -1(I) + bne I, $L17 + .align 4 + +$L999: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ++ ADD s0, s1, s0 + ret + EPILOGUE -diff --git a/kernel/sw_64/zscal_simd.S b/kernel/sw_64/zscal_simd.S +diff --git a/kernel/sw_64/zaxpy.S b/kernel/sw_64/zaxpy.S new file mode 100644 -index 0000000..09d2f38 +index 000000000..bbcb825cc --- /dev/null -+++ b/kernel/sw_64/zscal_simd.S -@@ -0,0 +1,579 @@ ++++ b/kernel/sw_64/zaxpy.S +@@ -0,0 +1,611 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -84247,551 +25555,583 @@ index 0000000..09d2f38 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#define PREFETCHSIZE 96 + -+#define N $16 -+#define X $21 -+#define INCX $17 ++#define PREFETCHSIZE 40 + -+#define XX $18 -+#define I $19 ++#ifndef CONJ ++#define ADD1 SUB ++#define ADD2 ADD ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#endif + -+#define ALPHA_R $f19 -+#define ALPHA_I $f20 -+ + -+#define s0 $f0 -+#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 + -+ -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f21 ++ ldl $19, 0($sp) ++ fmov $f19, $f29 ++ ldl $20, 8($sp) ++ fmov $f20, $f30 + -+#define t0 $f22 -+#define t1 $f23 -+#define t2 $f24 -+#define t3 $f25 ++ mov $21, $18 ++ ldl $21, 16($sp) ++ ldi $sp, -64($sp) ++ nop + -+#define t4 $f26 -+#define t5 $f27 -+#define t6 $f28 -+#define t7 $f29 ++ fstd $f2, 0($sp) ++ cmpeq $19, 1, $1 ++ fstd $f3, 8($sp) ++ cmpeq $21, 1, $2 + -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 -+ -+ ldl INCX, 0($sp) -+ mov X, XX -+ cmpeq INCX, 1, $0 -+ ble N, $L999 -+ -+ beq $0, $Sub -+ .align 5 ++ fstd $f4, 16($sp) ++ and $16, 3, $5 ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ and $1, $2, $1 ++ ble $16, $End ++ sra $16, 2, $4 ++ beq $1, $Sub + -+ and X, (VEC_LEN*SIZE-1), $6 -+ bgt $6, $UnAlign_X_ACCESS ++ ble $4, $Remain ++ subl $4, 1, $4 + -+/* -+ Unloop 8 (8*2=16) -+*/ -+ sra N, 3, I -+ vcpyf ALPHA_R, ALPHA_R -+ vcpyf ALPHA_I, ALPHA_I -+ ble I, $Remain ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ LD $f2, 2*SIZE($18) ++ LD $f3, 3*SIZE($18) ++ LD $f4, 4*SIZE($18) ++ LD $f5, 5*SIZE($18) ++ LD $f6, 6*SIZE($18) ++ LD $f7, 7*SIZE($18) + -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ VLD a3, 3*VEC_LEN*SIZE(X) ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ LD $f10, 2*SIZE($20) ++ LD $f11, 3*SIZE($20) ++ LD $f12, 4*SIZE($20) ++ LD $f13, 5*SIZE($20) ++ LD $f14, 6*SIZE($20) ++ LD $f15, 7*SIZE($20) + -+ subl I, 1, I -+ addl X, 16*SIZE, X -+ ble I, $MainLoopEnd ++ addl $18, 8*SIZE, $18 ++ ble $4, $MainLoopEnd + .align 4 + -+ +$MainLoop: ++ fillde_e PREFETCHSIZE * SIZE($20) ++ s_fillcs PREFETCHSIZE * SIZE($18) ++ ++ MUL $f29, $f0, $f20 ++ s_fillcs 9*SIZE($18) ++ MUL $f30, $f1, $f21 ++ unop ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ MUL $f29, $f2, $f24 ++ unop ++ MUL $f30, $f3, $f25 ++ nop ++ ++ MUL $f30, $f2, $f26 ++ LD $f2, 2*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 3*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 4*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 ++ addl $20, 8*SIZE, $20 ++ MUL $f29, $f5, $f23 ++ LD $f5, 5*SIZE($18) ++ ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($20) ++ MUL $f29, $f6, $f24 ++ unop ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($20) ++ MUL $f30, $f7, $f25 ++ unop ++ ++ ADD $f18, $f10, $f18 ++ LD $f10, 2*SIZE($20) ++ MUL $f30, $f6, $f26 ++ LD $f6, 6*SIZE($18) ++ ++ ADD $f19, $f11, $f19 ++ LD $f11, 3*SIZE($20) ++ MUL $f29, $f7, $f27 ++ LD $f7, 7*SIZE($18) ++ ++ ST $f16,-8*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17,-7*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18,-6*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19,-5*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ ADD $f16, $f12, $f16 ++ LD $f12, 4*SIZE($20) ++ ADD $f17, $f13, $f17 ++ LD $f13, 5*SIZE($20) ++ ADD $f18, $f14, $f18 ++ LD $f14, 6*SIZE($20) ++ ADD $f19, $f15, $f19 ++ LD $f15, 7*SIZE($20) ++ ++ ST $f16,-4*SIZE($20) ++ addl $18, 8*SIZE, $18 ++ ST $f17,-3*SIZE($20) ++ subl $4, 1, $4 + -+ vextf a0, 1, a4 -+ vextf a0, 3, a5 -+ vextf a1, 0, a6 -+ vextf a1, 2, a7 -+ -+ vextf a2, 1, t0 -+ vextf a2, 3, t1 -+ vextf a3, 0, t2 -+ vextf a3, 2, t3 -+ -+ vinsf a4, a1, 0, a1 -+ vinsf a5, a1, 2, a1 -+ vinsf a6, a0, 1, a0 -+ vinsf a7, a0, 3, a0 -+ -+ vinsf t0, a3, 0, a3 -+ vinsf t1, a3, 2, a3 -+ vinsf t2, a2, 1, a2 -+ vinsf t3, a2, 3, a2 -+ -+ VMUL ALPHA_R, a0, t4 -+ VMUL ALPHA_I, a0, t5 -+ VMUL ALPHA_R, a2, t6 -+ VMUL ALPHA_I, a2, t7 -+ -+ VNMAD ALPHA_I, a1, t4, t0 -+ VLD a0, 0*VEC_LEN*SIZE(X) -+ VMAD ALPHA_R, a1, t5, t1 -+ VLD a1, 1*VEC_LEN*SIZE(X) -+ -+ VNMAD ALPHA_I, a3, t6, t2 -+ VLD a2, 2*VEC_LEN*SIZE(X) -+ VMAD ALPHA_R, a3, t7, t3 -+ VLD a3, 3*VEC_LEN*SIZE(X) -+ -+/*combine the real(t0,t2) & image(t1,t3) vector to complex vector*/ -+ vextf t0, 1, a4 -+ vextf t0, 3, a5 -+ vextf t1, 0, a6 -+ vextf t1, 2, a7 -+ -+ vextf t2, 1, s0 -+ vextf t2, 3, s1 -+ vextf t3, 0, s2 -+ vextf t3, 2, s3 -+ -+ vinsf a4, t1, 0, t1 -+ vinsf a5, t1, 2, t1 -+ vinsf a6, t0, 1, t0 -+ vinsf a7, t0, 3, t0 -+ -+ vinsf s0, t3, 0, t3 -+ vinsf s1, t3, 2, t3 -+ vinsf s2, t2, 1, t2 -+ vinsf s3, t2, 3, t2 -+ -+ VST t0, -4*VEC_LEN*SIZE(X) -+ VST t1, -3*VEC_LEN*SIZE(X) -+ VST t2, -2*VEC_LEN*SIZE(X) -+ VST t3, -1*VEC_LEN*SIZE(X) -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ subl I, 1, I -+ addl X, 16*SIZE, X -+ bgt I, $MainLoop ++ ST $f18,-2*SIZE($20) ++ nop ++ ST $f19,-1*SIZE($20) ++ bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: -+/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ -+ vextf a0, 1, a4 -+ vextf a0, 3, a5 -+ vextf a1, 0, a6 -+ vextf a1, 2, a7 -+ -+ vextf a2, 1, t0 -+ vextf a2, 3, t1 -+ vextf a3, 0, t2 -+ vextf a3, 2, t3 -+ -+ vinsf a4, a1, 0, a1 -+ vinsf a5, a1, 2, a1 -+ vinsf a6, a0, 1, a0 -+ vinsf a7, a0, 3, a0 -+ -+ vinsf t0, a3, 0, a3 -+ vinsf t1, a3, 2, a3 -+ vinsf t2, a2, 1, a2 -+ vinsf t3, a2, 3, a2 -+ -+ VMUL ALPHA_R, a0, t4 -+ VMUL ALPHA_I, a0, t5 -+ VMUL ALPHA_R, a2, t6 -+ VMUL ALPHA_I, a2, t7 -+ -+ VNMAD ALPHA_I, a1, t4, t0 -+ VMAD ALPHA_R, a1, t5, t1 -+ VNMAD ALPHA_I, a3, t6, t2 -+ VMAD ALPHA_R, a3, t7, t3 -+ -+/*combine the real(t0,t2) & image(t1,t3) vector to complex vector*/ -+ vextf t0, 1, a4 -+ vextf t0, 3, a5 -+ vextf t1, 0, a6 -+ vextf t1, 2, a7 -+ -+ vextf t2, 1, s0 -+ vextf t2, 3, s1 -+ vextf t3, 0, s2 -+ vextf t3, 2, s3 -+ -+ vinsf a4, t1, 0, t1 -+ vinsf a5, t1, 2, t1 -+ vinsf a6, t0, 1, t0 -+ vinsf a7, t0, 3, t0 -+ -+ vinsf s0, t3, 0, t3 -+ vinsf s1, t3, 2, t3 -+ vinsf s2, t2, 1, t2 -+ vinsf s3, t2, 3, t2 -+ -+ VST t0, -4*VEC_LEN*SIZE(X) -+ VST t1, -3*VEC_LEN*SIZE(X) -+ VST t2, -2*VEC_LEN*SIZE(X) -+ VST t3, -1*VEC_LEN*SIZE(X) ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 ++ ++ ADD $f16, $f8, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, $f17 ++ MUL $f30, $f7, $f25 ++ ++ ADD $f18, $f10, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, $f19 ++ MUL $f29, $f7, $f27 ++ ++ ST $f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18, 2*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19, 3*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ ADD $f16, $f12, $f16 ++ ADD $f17, $f13, $f17 ++ ADD $f18, $f14, $f18 ++ ADD $f19, $f15, $f19 ++ ++ ST $f16, 4*SIZE($20) ++ ST $f17, 5*SIZE($20) ++ ST $f18, 6*SIZE($20) ++ ST $f19, 7*SIZE($20) + -+$Remain: -+ and N, 7, I + unop ++ addl $20, 8*SIZE, $20 + unop -+ ble I, $L999 -+ .align 5 -+ -+$Remain_loop: -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) ++ ble $5, $End ++ .align 4 + -+ MUL a0, ALPHA_R, t0 -+ MUL a1, ALPHA_I, t1 -+ MUL a0, ALPHA_I, t2 -+ MUL a1, ALPHA_R, t3 ++$Remain: ++ subl $5, 1, $6 ++ ble $5, $End ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) + -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 -+ ST t4, 0 * SIZE(X) -+ ST t5, 1 * SIZE(X) -+ -+ addl X, 2*SIZE, X -+ ldi I, -1(I) -+ bne I, $Remain_loop -+ ret -+ .align 5 -+ -+$UnAlign_X_ACCESS: -+/* -+ unloop 4 (4*2=8) -+*/ -+ sra N, 2, I -+ ble I, $Unalign_Remain -+ -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ble $6, $RemainLoopEnd ++ .align 4 + -+ LD a2, 2 * SIZE(X) -+ LD a3, 3 * SIZE(X) ++$RemainLoop: ++ MUL $f29, $f0, $f20 ++ subl $6, 1, $6 ++ MUL $f30, $f1, $f21 ++ addl $20, 2*SIZE, $20 + -+ LD a4, 4 * SIZE(X) -+ MUL a0, ALPHA_R, t0 -+ LD a5, 5 * SIZE(X) -+ MUL a0, ALPHA_I, t2 -+ -+ LD a6, 6 * SIZE(X) -+ NMAD a1, ALPHA_I, t0, t4 -+ LD a7, 7 * SIZE(X) -+ MAD a1, ALPHA_R, t2, t5 ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) + ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($20) ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($20) + -+ ldi I, -1(I) -+ addl X, 8*SIZE, X -+ ble I, $Unalign_MainLoopEnd ++ ST $f16,-2*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ST $f17,-1*SIZE($20) ++ bgt $6, $RemainLoop + .align 4 + -+$Unalign_MainLoop: -+ MUL a2, ALPHA_R, t0 -+ ST t4, -8 * SIZE(X) -+ MUL a2, ALPHA_I, t2 -+ ST t5, -7 * SIZE(X) ++$RemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 + -+ -+ NMAD a3, ALPHA_I, t0, t6 -+ LD a0, 0 * SIZE(X) -+ MAD a3, ALPHA_R, t2, t7 -+ LD a1, 1 * SIZE(X) ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ ADD $f17, $f28, $f17 + -+ ST t6, -6 * SIZE(X) -+ MUL a4, ALPHA_R, t0 -+ ST t7, -5 * SIZE(X) -+ MUL a4, ALPHA_I, t2 ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop ++ .align 4 + ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ ldi $sp, 64($sp) ++ ret ++ .align 4 + -+ NMAD a5, ALPHA_I, t0, t4 -+ LD a2, 2 * SIZE(X) -+ MAD a5, ALPHA_R, t2, t5 -+ LD a3, 3 * SIZE(X) ++$Sub: ++ SXSUBL $16, SIZE, $22 ++ addl $22, $22, $22 # Complex ++ .align 4 + -+ MUL a6, ALPHA_R, t0 -+ ST t4, -4 * SIZE(X) -+ MUL a6, ALPHA_I, t2 -+ ST t5, -3 * SIZE(X) ++ addl $19, $19, $19 # Complex ++ addl $21, $21, $21 # Complex + -+ NMAD a7, ALPHA_I, t0, t6 -+ LD a4, 4 * SIZE(X) -+ MAD a7, ALPHA_R, t2, t7 -+ LD a5, 5 * SIZE(X) ++ ble $4, $SubRemain ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ SXADDQ $19, $18, $18 + -+ MUL a0, ALPHA_R, t0 -+ ST t6, -2 * SIZE(X) -+ MUL a0, ALPHA_I, t2 -+ ST t7, -1 * SIZE(X) ++ LD $f2, 0*SIZE($18) ++ LD $f3, 1*SIZE($18) ++ SXADDQ $19, $18, $18 + -+ NMAD a1, ALPHA_I, t0, t4 -+ LD a6, 6 * SIZE(X) -+ MAD a1, ALPHA_R, t2, t5 -+ LD a7, 7 * SIZE(X) ++ LD $f4, 0*SIZE($18) ++ LD $f5, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f6, 0*SIZE($18) ++ LD $f7, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $21, $20, $24 ++ ++ LD $f10, 0*SIZE($24) ++ LD $f11, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ LD $f12, 0*SIZE($24) ++ LD $f13, 1*SIZE($24) ++ SXADDQ $21, $24, $24 + ++ LD $f14, 0*SIZE($24) ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 + -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ subl I, 1, I -+ addl X, 8*SIZE, X -+ bgt I, $Unalign_MainLoop ++ subl $4, 1, $4 ++ ble $4, $SubMainLoopEnd + .align 4 -+ -+$Unalign_MainLoopEnd: -+ MUL a2, ALPHA_R, t0 -+ ST t4, -8 * SIZE(X) -+ MUL a2, ALPHA_I, t2 -+ ST t5, -7 * SIZE(X) + -+ -+ NMAD a3, ALPHA_I, t0, t6 -+ MAD a3, ALPHA_R, t2, t7 -+ ++$SubMainLoop: ++ MUL $f29, $f0, $f20 ++ unop ++ MUL $f30, $f1, $f21 ++ unop + -+ ST t6, -6 * SIZE(X) -+ MUL a4, ALPHA_R, t0 -+ ST t7, -5 * SIZE(X) -+ MUL a4, ALPHA_I, t2 ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) + ++ MUL $f29, $f2, $f24 ++ SXADDQ $19, $18, $18 ++ MUL $f30, $f3, $f25 ++ unop + -+ NMAD a5, ALPHA_I, t0, t4 -+ MAD a5, ALPHA_R, t2, t5 ++ MUL $f30, $f2, $f26 ++ LD $f2, 0*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 1*SIZE($18) + -+ MUL a6, ALPHA_R, t0 -+ ST t4, -4 * SIZE(X) -+ MUL a6, ALPHA_I, t2 -+ ST t5, -3 * SIZE(X) ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 ++ MUL $f29, $f4, $f20 ++ unop + -+ NMAD a7, ALPHA_I, t0, t6 -+ MAD a7, ALPHA_R, t2, t7 -+ ST t6, -2 * SIZE(X) -+ ST t7, -1 * SIZE(X) ++ ADD2 $f22, $f23, $f17 ++ unop ++ MUL $f30, $f5, $f21 ++ unop + -+ .align 4 -+$Unalign_Remain: -+ and N, 3, I ++ ADD1 $f24, $f25, $f18 + unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 0*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 + unop -+ ble I, $L999 -+ .align 4 ++ MUL $f29, $f5, $f23 ++ LD $f5, 1*SIZE($18) + -+$Unalign_RemainLoop: -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($24) ++ MUL $f29, $f6, $f24 ++ SXADDQ $19, $18, $18 + -+ -+ MUL a0, ALPHA_R, t0 -+ MUL a0, ALPHA_I, t2 ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($24) ++ MUL $f30, $f7, $f25 ++ SXADDQ $21, $24, $24 + -+ NMAD a1, ALPHA_I, t0, t4 -+ MAD a1, ALPHA_R, t2, t5 ++ ADD $f18, $f10, $f18 ++ LD $f10, 0*SIZE($24) ++ MUL $f30, $f6, $f26 ++ LD $f6, 0*SIZE($18) + -+ ST t4, 0 * SIZE(X) -+ ST t5, 1 * SIZE(X) ++ ADD $f19, $f11, $f19 ++ LD $f11, 1*SIZE($24) ++ MUL $f29, $f7, $f27 ++ LD $f7, 1*SIZE($18) + -+ addl X, 2*SIZE, X -+ ldi I, -1(I) -+ bne I, $Unalign_RemainLoop -+ nop -+ -+ ret -+ .align 4 ++ ST $f16, 0*SIZE($20) ++ SXADDQ $19, $18, $18 ++ ADD1 $f20, $f21, $f16 ++ unop + -+$Sub: -+ addl INCX, INCX, INCX ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ADD2 $f22, $f23, $f17 ++ unop + -+ sra N, 2, I -+ ble I, $L15 ++ ST $f18, 0*SIZE($20) ++ SXADDQ $21, $24, $24 ++ ADD1 $f24, $f25, $f18 ++ unop + -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a2, 0 * SIZE(X) -+ LD a3, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a4, 0 * SIZE(X) -+ LD a5, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ LD a6, 0 * SIZE(X) -+ LD a7, 1 * SIZE(X) -+ SXADDQ INCX, X, X ++ ST $f19, 1*SIZE($20) ++ unop ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 + -+ MUL a0, ALPHA_R, t0 -+ MUL a1, ALPHA_I, t1 -+ MUL a0, ALPHA_I, t2 -+ MUL a1, ALPHA_R, t3 ++ ADD $f16, $f12, $f16 ++ unop ++ LD $f12, 0*SIZE($24) ++ unop + -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 ++ ADD $f17, $f13, $f17 ++ unop ++ LD $f13, 1*SIZE($24) ++ SXADDQ $21, $24, $24 + -+ ldi I, -1(I) -+ ble I, $L13 ++ ADD $f18, $f14, $f18 ++ subl $4, 1, $4 ++ LD $f14, 0*SIZE($24) ++ unop ++ ++ ADD $f19, $f15, $f19 ++ unop ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ unop ++ ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $4, $SubMainLoop + .align 4 + -+$L12: -+ ST t4, 0 * SIZE(XX) -+ MUL a2, ALPHA_R, t0 -+ ST t5, 1 * SIZE(XX) -+ MUL a3, ALPHA_I, t1 -+ -+ MUL a2, ALPHA_I, t2 -+ LD a0, 0 * SIZE(X) -+ MUL a3, ALPHA_R, t3 -+ LD a1, 1 * SIZE(X) ++$SubMainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 -+ SXADDQ INCX, X, X ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 + -+ MUL a4, ALPHA_R, t0 -+ ST t6, 0 * SIZE(XX) -+ MUL a5, ALPHA_I, t1 -+ ST t7, 1 * SIZE(XX) ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 + -+ MUL a4, ALPHA_I, t2 -+ LD a2, 0 * SIZE(X) -+ MUL a5, ALPHA_R, t3 -+ LD a3, 1 * SIZE(X) ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 + -+ SUB t0, t1, t4 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t5 -+ SXADDQ INCX, X, X ++ ADD $f16, $f8, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, $f17 ++ MUL $f30, $f7, $f25 + -+ MUL a6, ALPHA_R, t0 -+ ST t4, 0 * SIZE(XX) -+ MUL a7, ALPHA_I, t1 -+ ST t5, 1 * SIZE(XX) ++ ADD $f18, $f10, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, $f19 ++ MUL $f29, $f7, $f27 + -+ MUL a6, ALPHA_I, t2 -+ LD a4, 0 * SIZE(X) -+ MUL a7, ALPHA_R, t3 -+ LD a5, 1 * SIZE(X) ++ ST $f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 -+ SXADDQ INCX, X, X ++ SXADDQ $21, $20, $20 ++ nop ++ ST $f18, 0*SIZE($20) ++ ADD1 $f24, $f25, $f18 + -+ MUL a0, ALPHA_R, t0 -+ ST t6, 0 * SIZE(XX) -+ MUL a1, ALPHA_I, t1 -+ ST t7, 1 * SIZE(XX) ++ ST $f19, 1*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 ++ ADD $f16, $f12, $f16 + -+ MUL a0, ALPHA_I, t2 -+ LD a6, 0 * SIZE(X) -+ MUL a1, ALPHA_R, t3 -+ LD a7, 1 * SIZE(X) ++ ADD $f17, $f13, $f17 ++ ADD $f18, $f14, $f18 ++ ADD $f19, $f15, $f19 + -+ SUB t0, t1, t4 -+ ldi I, -1(I) -+ ADD t2, t3, t5 -+ SXADDQ INCX, XX, XX ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 + -+ fillcs PREFETCHSIZE * SIZE(X) -+ unop -+ SXADDQ INCX, X, X -+ bne I, $L12 ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ble $5, $SubEnd + .align 4 + -+$L13: -+ MUL a2, ALPHA_R, t0 -+ MUL a3, ALPHA_I, t1 -+ ST t4, 0 * SIZE(XX) -+ MUL a2, ALPHA_I, t2 -+ ST t5, 1 * SIZE(XX) -+ MUL a3, ALPHA_R, t3 -+ -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 -+ unop -+ -+ ST t6, 0 * SIZE(XX) -+ MUL a4, ALPHA_R, t0 -+ ST t7, 1 * SIZE(XX) -+ MUL a5, ALPHA_I, t1 -+ MUL a4, ALPHA_I, t2 -+ MUL a5, ALPHA_R, t3 ++$SubRemain: ++ subl $5, 1, $6 ++ ble $5, $SubEnd ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) + -+ SUB t0, t1, t4 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t5 -+ unop ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $19, $18, $18 ++ SXADDQ $21, $20, $24 ++ ble $6, $SubRemainLoopEnd ++ .align 4 + -+ MUL a6, ALPHA_R, t0 -+ ST t4, 0 * SIZE(XX) -+ MUL a7, ALPHA_I, t1 -+ ST t5, 1 * SIZE(XX) ++$SubRemainLoop: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) + -+ MUL a6, ALPHA_I, t2 -+ MUL a7, ALPHA_R, t3 ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 + -+ SUB t0, t1, t6 -+ SXADDQ INCX, XX, XX -+ ADD t2, t3, t7 ++ ADD2 $f22, $f23, $f17 ++ nop ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($24) + -+ ST t6, 0 * SIZE(XX) -+ ST t7, 1 * SIZE(XX) -+ SXADDQ INCX, XX, XX -+ .align 4 ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ subl $6, 1, $6 + -+$L15: -+ and N, 3, I -+ unop -+ unop -+ ble I, $L999 ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $6, $SubRemainLoop + .align 4 + -+$L17: -+ LD a0, 0 * SIZE(X) -+ LD a1, 1 * SIZE(X) -+ SXADDQ INCX, X, X -+ -+ MUL a0, ALPHA_R, t0 -+ MUL a1, ALPHA_I, t1 -+ MUL a0, ALPHA_I, t2 -+ MUL a1, ALPHA_R, t3 -+ -+ SUB t0, t1, t4 -+ ADD t2, t3, t5 ++$SubRemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 + -+ ST t4, 0 * SIZE(XX) -+ ST t5, 1 * SIZE(XX) -+ SXADDQ INCX, XX, XX ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ ADD $f17, $f28, $f17 + -+ ldi I, -1(I) -+ bne I, $L17 ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop + .align 4 + -+$L999: ++$SubEnd: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ ldi $sp, 64($sp) + ret + EPILOGUE -diff --git a/kernel/sw_64/zsum.S b/kernel/sw_64/zsum.S +diff --git a/kernel/sw_64/zdot.S b/kernel/sw_64/zdot.S new file mode 100644 -index 0000000..7b8570c +index 000000000..f037aef4d --- /dev/null -+++ b/kernel/sw_64/zsum.S -@@ -0,0 +1,234 @@ ++++ b/kernel/sw_64/zdot.S +@@ -0,0 +1,500 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -84832,534 +26172,472 @@ index 0000000..7b8570c + +#define ASSEMBLER +#include "common.h" -+#include "version.h" ++ + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 -+#define I $19 ++#define Y $19 ++#define INCY $20 ++#define XX $21 ++#define YY $23 ++ ++#define I $5 + +#define s0 $f0 +#define s1 $f1 -+#define s2 $f10 -+#define s3 $f11 ++#define s2 $f2 ++#define s3 $f30 + -+#define a0 $f12 -+#define a1 $f13 -+#define a2 $f14 -+#define a3 $f15 -+#define a4 $f16 -+#define a5 $f17 -+#define a6 $f18 -+#define a7 $f19 ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 + -+#define t0 $f20 -+#define t1 $f21 -+#define t2 $f22 -+#define t3 $f23 ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 ++ ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 + + PROLOGUE + PROFCODE ++ .frame $sp, 16, $26, 0 + ++ ldi $sp, -16($sp) + fclr s0 -+ unop -+ fclr t0 -+ addw INCX, INCX, $20 -+ mov $20,INCX -+ ++ fstd $f2, 0($sp) + fclr s1 -+ unop -+ fclr t1 -+ ble N, $L999 + + fclr s2 -+ sra N, 2, I ++ addl INCX, INCX, INCX + fclr s3 -+ ble I, $L15 ++ ble N, $L999 + -+ LD a0, 0 * SIZE(X) ++ addl INCY, INCY, INCY ++ fclr t0 ++ fclr t1 + fclr t2 -+ LD a1, 1 * SIZE(X) ++ fclr t3 ++ ++ srl N, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ + SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ LD b2, 0 * SIZE(Y) ++ LD b3, 1 * SIZE(Y) + -+ LD a2, 0 * SIZE(X) -+ fclr t3 -+ LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ LD b4, 0 * SIZE(Y) ++ LD b5, 1 * SIZE(Y) + -+ LD a4, 0 * SIZE(X) -+ LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X -+ ldi I, -1(I) ++ SXADDQ INCY, Y, Y + -+ ble I, $L13 -+ .align 4 ++ LD a6, 0 * SIZE(X) ++ LD b6, 0 * SIZE(Y) + -+$L12: -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ ldl $31, PREFETCHSIZE * SIZE(X) -+ fmov a0, t0 -+ ldi I, -1(I) ++ subl I, 1, I ++ ble I, $L23 ++ .align 4 + -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ LD a6, 0 * SIZE(X) -+ fmov a1, t1 -+ unop ++$L22: ++ ADD s0, t0, s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) + -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ LD a7, 1 * SIZE(X) -+ fmov a2, t2 ++ ADD s1, t1, s1 ++ s_fillcs PREFETCHSIZE * SIZE(X) ++ MUL a0, b1, t1 + SXADDQ INCX, X, X + -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ LD a0, 0 * SIZE(X) -+ fmov a3, t3 -+ unop ++ ADD s2, t2, s2 ++ s_fillcs PREFETCHSIZE * SIZE(Y) ++ MUL a1, b0, t2 ++ SXADDQ INCY, Y, Y + -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ LD a1, 1 * SIZE(X) -+ fmov a4, t0 -+ SXADDQ INCX, X, X ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) + -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ LD a2, 0 * SIZE(X) -+ fmov a5, t1 -+ unop ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) + -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ LD a3, 1 * SIZE(X) -+ fmov a6, t2 ++ ADD s1, t1, s1 + SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y + -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ LD a4, 0 * SIZE(X) -+ fmov a7, t3 ++ ADD s2, t2, s2 + unop -+ -+ LD a5, 1 * SIZE(X) ++ MUL a3, b2, t2 + unop -+ SXADDQ INCX, X, X -+ bne I, $L12 -+ .align 4 + -+$L13: -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ LD a6, 0 * SIZE(X) -+ fmov a0, t0 ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) + -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ LD a7, 1 * SIZE(X) -+ fmov a1, t1 ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 + SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y + -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ fmov a2, t2 -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ fmov a3, t3 ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ unop + -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ fmov a4, t0 -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ fmov a5, t1 -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ fmov a6, t2 -+ ADD s3, t3, $f24 -+ fmov $f24,s3 -+ fmov a7, t3 ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) + -+ ADD s2, t2, $f24 -+ fmov $f24,s2 -+ ADD s3, t3, $f24 -+ fmov $f24,s3 ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) + -+ .align 4 ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y + -+$L15: -+ ADD s0, s2, $f24 -+ fmov $f24,s0 -+ and N, 3, I -+ ADD s1, s3, $f24 -+ fmov $f24,s1 -+ ble I, $L999 -+ .align 4 ++ ADD s2, t2, s2 ++ unop ++ MUL a7, b6, t2 ++ unop + -+$L17: -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ LD a0, 0 * SIZE(X) -+ fmov a0, t0 -+ ldi I, -1(I) ++ ADD s3, t3, s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) + -+ ADD s1, t1, $f24 -+ fmov $f24,s1 -+ LD a1, 1 * SIZE(X) -+ fmov a1, t1 ++ ADD s0, t0, s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 + SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y + -+ bne I, $L17 -+ .align 4 ++ ADD s2, t2, s2 ++ unop ++ MUL a1, b0, t2 ++ unop + -+$L999: -+ ADD s0, t0, $f24 -+ fmov $f24,s0 -+ ADD s1, t1, $f24 -+ fmov $f24,s1 ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) + -+ ADD s0, s1, $f24 -+ fmov $f24,s0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/zswap.S.bak b/kernel/sw_64/zswap.S.bak -new file mode 100644 -index 0000000..f0b19dd ---- /dev/null -+++ b/kernel/sw_64/zswap.S.bak -@@ -0,0 +1,244 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y + -+ PROLOGUE -+ PROFCODE -+ .frame $sp, 0, $26, 0 ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop + -+ mov $21, $17 -+ ldl $18, 0($sp) -+ ldl $19, 8($sp) -+ ldl $20, 16($sp) -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) + -+ ble $16, $SubEnd # if n <= 0 goto $End ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) + -+ cmpeq $18, 1, $1 -+ addl $18, $18, $18 -+ cmpeq $20, 1, $2 -+ addl $20, $20, $20 ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y + -+ sra $16, 2, $21 -+ and $1, $2, $1 -+ and $16, 3, $22 -+ beq $1, $Sub ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ subl I, 1, I + -+ ble $21, $MainRemain -+ .align 4 ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) + -+$MainLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f11, 1*SIZE($19) -+ LD $f12, 2*SIZE($19) -+ LD $f13, 3*SIZE($19) -+ LD $f14, 4*SIZE($19) -+ LD $f15, 5*SIZE($19) -+ LD $f16, 6*SIZE($19) -+ LD $f17, 7*SIZE($19) ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) + -+ LD $f20, 0*SIZE($17) -+ LD $f21, 1*SIZE($17) -+ LD $f22, 2*SIZE($17) -+ LD $f23, 3*SIZE($17) -+ LD $f24, 4*SIZE($17) -+ LD $f25, 5*SIZE($17) -+ LD $f26, 6*SIZE($17) -+ LD $f27, 7*SIZE($17) ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y + -+ fillcs 16*SIZE($17) ++ ADD s2, t2, s2 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b6, t2 + unop -+ fillcs 16*SIZE($19) -+ subl $21, 1, $21 + -+ ST $f10, 0*SIZE($17) -+ ST $f11, 1*SIZE($17) -+ ST $f12, 2*SIZE($17) -+ ST $f13, 3*SIZE($17) -+ ST $f14, 4*SIZE($17) -+ ST $f15, 5*SIZE($17) -+ ST $f16, 6*SIZE($17) -+ ST $f17, 7*SIZE($17) ++ ADD s3, t3, s3 ++ LD b6, 0 * SIZE(Y) ++ MUL a7, b7, t3 ++ bgt I, $L22 ++ .align 4 + -+ ST $f20, 0*SIZE($19) -+ ST $f21, 1*SIZE($19) -+ ST $f22, 2*SIZE($19) -+ ST $f23, 3*SIZE($19) -+ ST $f24, 4*SIZE($19) -+ ST $f25, 5*SIZE($19) -+ ST $f26, 6*SIZE($19) -+ ST $f27, 7*SIZE($19) ++$L23: ++ ADD s0, t0, s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) + -+ ldi $17, 8*SIZE($17) -+ ldi $19, 8*SIZE($19) -+ bgt $21, $MainLoop -+ .align 4 ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y + -+$MainRemain: -+ ble $22, $MainEnd -+ .align 4 ++ ADD s2, t2, s2 ++ unop ++ MUL a1, b0, t2 ++ unop + -+$MainRemainLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f11, 1*SIZE($19) -+ LD $f20, 0*SIZE($17) -+ LD $f21, 1*SIZE($17) ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) + -+ ldi $17, 2*SIZE($17) -+ ldi $19, 2*SIZE($19) -+ subl $22, 1, $22 -+ ST $f10, -2*SIZE($17) -+ ST $f11, -1*SIZE($17) -+ ST $f20, -2*SIZE($19) -+ ST $f21, -1*SIZE($19) -+ bgt $22, $MainRemainLoop -+ .align 4 ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) + -+$MainEnd: -+ clr $0 -+ ret -+ .align 4 ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y + -+$Sub: -+ mov $17, $23 -+ mov $19, $24 -+ ble $21, $SubRemain -+ .align 4 ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop + -+$SubLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f11, 1*SIZE($19) -+ SXADDQ $20, $19, $19 ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) + -+ LD $f12, 0*SIZE($19) -+ LD $f13, 1*SIZE($19) -+ SXADDQ $20, $19, $19 ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) + -+ LD $f14, 0*SIZE($19) -+ LD $f15, 1*SIZE($19) -+ SXADDQ $20, $19, $19 ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y + -+ LD $f16, 0*SIZE($19) -+ LD $f17, 1*SIZE($19) -+ SXADDQ $20, $19, $19 ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ unop + -+ LD $f20, 0*SIZE($17) -+ LD $f21, 1*SIZE($17) -+ SXADDQ $18, $17, $17 ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) + -+ LD $f22, 0*SIZE($17) -+ LD $f23, 1*SIZE($17) -+ SXADDQ $18, $17, $17 ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) + -+ LD $f24, 0*SIZE($17) -+ LD $f25, 1*SIZE($17) -+ SXADDQ $18, $17, $17 ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y + -+ LD $f26, 0*SIZE($17) -+ LD $f27, 1*SIZE($17) -+ SXADDQ $18, $17, $17 ++ ADD s2, t2, s2 ++ unop ++ MUL a7, b6, t2 ++ unop + -+ ST $f10, 0*SIZE($23) -+ ST $f11, 1*SIZE($23) -+ SXADDQ $18, $23, $23 ++ ADD s3, t3, s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) + -+ ST $f12, 0*SIZE($23) -+ ST $f13, 1*SIZE($23) -+ SXADDQ $18, $23, $23 ++ ADD s0, t0, s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) + -+ ST $f14, 0*SIZE($23) -+ ST $f15, 1*SIZE($23) -+ SXADDQ $18, $23, $23 ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y + -+ ST $f16, 0*SIZE($23) -+ ST $f17, 1*SIZE($23) -+ SXADDQ $18, $23, $23 ++ ADD s2, t2, s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s3 ++ MUL a1, b1, t3 + -+ ST $f20, 0*SIZE($24) -+ ST $f21, 1*SIZE($24) -+ SXADDQ $20, $24, $24 ++ ADD s0, t0, s0 ++ MUL a2, b2, t0 ++ ADD s1, t1, s1 ++ MUL a2, b3, t1 + -+ ST $f22, 0*SIZE($24) -+ ST $f23, 1*SIZE($24) -+ SXADDQ $20, $24, $24 ++ ADD s2, t2, s2 ++ MUL a3, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 + -+ ST $f24, 0*SIZE($24) -+ ST $f25, 1*SIZE($24) -+ SXADDQ $20, $24, $24 ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a4, b5, t1 + -+ ST $f26, 0*SIZE($24) -+ ST $f27, 1*SIZE($24) -+ SXADDQ $20, $24, $24 ++ ADD s2, t2, s2 ++ MUL a5, b4, t2 ++ ADD s3, t3, s3 ++ MUL a5, b5, t3 + -+ subl $21, 1, $21 -+ bgt $21, $SubLoop -+ .align 4 ++ ADD s0, t0, s0 ++ MUL a6, b6, t0 ++ ADD s1, t1, s1 ++ MUL a6, b7, t1 + -+$SubRemain: -+ ble $22, $SubEnd ++ ADD s2, t2, s2 ++ MUL a7, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 + .align 4 + -+$SubRemainLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f11, 1*SIZE($19) -+ LD $f20, 0*SIZE($17) -+ LD $f21, 1*SIZE($17) -+ -+ subl $22, 1, $22 ++$L25: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L998 + -+ ST $f10, 0*SIZE($17) -+ ST $f11, 1*SIZE($17) -+ ST $f20, 0*SIZE($19) -+ ST $f21, 1*SIZE($19) ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) + -+ SXADDQ $18, $17, $17 -+ SXADDQ $20, $19, $19 -+ bgt $22, $SubRemainLoop ++ SXADDQ INCX, X, X ++ subl I, 1, I ++ SXADDQ INCY, Y, Y ++ ble I, $L28 + .align 4 + -+$SubEnd: -+ clr $0 -+ ret -+ EPILOGUE -diff --git a/kernel/sw_64/zswap.c b/kernel/sw_64/zswap.c -new file mode 100644 -index 0000000..ae4760a ---- /dev/null -+++ b/kernel/sw_64/zswap.c -@@ -0,0 +1,72 @@ -+/*************************************************************************** -+Copyright (c) 2013, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+/************************************************************************************** -+* 2013/09/14 Saar -+* BLASTEST float : OK -+* BLASTEST double : OK -+* CTEST : OK -+* TEST : OK -+* -+**************************************************************************************/ -+ -+#include "common.h" -+#include ++$L26: ++ ADD s0, t0, s0 ++ mov X, XX ++ MUL a0, b0, t0 ++ mov Y, YY + -+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -+{ -+ BLASLONG i=0; -+ BLASLONG ix=0,iy=0; -+ FLOAT temp[2]; -+ BLASLONG inc_x2; -+ BLASLONG inc_y2; ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y + -+ if ( n < 0 ) return(0); ++ ADD s2, t2, s2 ++ LD a0, 0 * SIZE(XX) ++ MUL a1, b0, t2 ++ LD b0, 0 * SIZE(YY) + -+ inc_x2 = 2 * inc_x; -+ inc_y2 = 2 * inc_y; ++ ADD s3, t3, s3 ++ subl I, 1, I ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(XX) + -+ while(i < n) -+ { ++ LD b1, 1 * SIZE(YY) ++ bgt I, $L26 ++ .align 4 + -+ temp[0] = x[ix] ; -+ temp[1] = x[ix+1] ; -+ x[ix] = y[iy] ; -+ x[ix+1] = y[iy+1] ; -+ y[iy] = temp[0] ; -+ y[iy+1] = temp[1] ; ++$L28: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a0, b1, t1 + -+ ix += inc_x2 ; -+ iy += inc_y2 ; -+ i++ ; ++ ADD s2, t2, s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s3 ++ MUL a1, b1, t3 ++ .align 4 + -+ } -+ return(0); ++$L998: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 + -+} ++#ifndef CONJ ++ SUB s0, s3, s0 ++ ADD s1, s2, s1 ++#else ++ ADD s0, s3, s0 ++ SUB s1, s2, s1 ++#endif ++ .align 4 + ++$L999: ++ fldd $f2, 0($sp) ++ ldi $sp, 16($sp) ++ ret + -diff --git a/kernel/sw_64/zswap_simd.S b/kernel/sw_64/zswap_simd.S ++ EPILOGUE +diff --git a/kernel/sw_64/zgemm_beta.S b/kernel/sw_64/zgemm_beta.S new file mode 100644 -index 0000000..e49c95b +index 000000000..ffaa17ba2 --- /dev/null -+++ b/kernel/sw_64/zswap_simd.S -@@ -0,0 +1,306 @@ ++++ b/kernel/sw_64/zgemm_beta.S +@@ -0,0 +1,192 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -85400,278 +26678,164 @@ index 0000000..e49c95b + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#define PREFETCHSIZE 64 -+#define X $17 -+#define Y $19 + -+ PROLOGUE -+ PROFCODE ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++CNAME: + .frame $sp, 0, $26, 0 + -+ mov $21, $17 -+ ldl $18, 0($sp) -+ ldl $19, 8($sp) -+ ldl $20, 16($sp) -+#ifndef PROFILE -+ .prologue 0 -+#else ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount + .prologue 1 ++#else ++ .prologue 0 +#endif + -+ ble $16, $SubEnd # if n <= 0 goto $End -+ -+ cmpeq $18, 1, $1 -+ addl $18, $18, $18 -+ cmpeq $20, 1, $2 -+ addl $20, $20, $20 -+ -+/* -+ Unloop 8 complex, 16 real -+*/ -+ -+ sra $16, 3, $21 -+ and $1, $2, $1 -+ and $16, 7, $22 -+ beq $1, $Sub -+ -+/* -+ test the address of Y & X -+*/ -+ and Y, (VEC_LEN*SIZE-1), $4 -+ and X, (VEC_LEN*SIZE-1), $3 -+ or $3, $4, $4 -+ bne $4, $UnAlign_ACCESS -+ -+/* align access*/ -+ -+ ble $21, $MainRemain -+ .align 4 -+ -+$MainLoop: -+ VLD $f10, 0*VEC_LEN*SIZE(Y) -+ VLD $f11, 1*VEC_LEN*SIZE(Y) -+ VLD $f12, 2*VEC_LEN*SIZE(Y) -+ VLD $f13, 3*VEC_LEN*SIZE(Y) -+ -+ VLD $f20, 0*VEC_LEN*SIZE(X) -+ VLD $f21, 1*VEC_LEN*SIZE(X) -+ VLD $f22, 2*VEC_LEN*SIZE(X) -+ VLD $f23, 3*VEC_LEN*SIZE(X) -+ -+ fillcs PREFETCHSIZE * SIZE(X) -+ unop -+ fillcs PREFETCHSIZE * SIZE(Y) -+ subl $21, 1, $21 -+ -+ VST $f10, 0*VEC_LEN*SIZE(X) -+ VST $f11, 1*VEC_LEN*SIZE(X) -+ VST $f12, 2*VEC_LEN*SIZE(X) -+ VST $f13, 3*VEC_LEN*SIZE(X) -+ -+ VST $f20, 0*VEC_LEN*SIZE(Y) -+ VST $f21, 1*VEC_LEN*SIZE(Y) -+ VST $f22, 2*VEC_LEN*SIZE(Y) -+ VST $f23, 3*VEC_LEN*SIZE(Y) -+ -+ ldi $17, 16*SIZE(X) -+ ldi $19, 16*SIZE(Y) -+ bgt $21, $MainLoop -+ .align 4 -+ -+ jmp $MainRemain -+ .align 4 -+ -+$UnAlign_ACCESS: -+ sra $16, 2, $21 -+ and $16, 3, $22 -+ nop -+ ble $21, $MainRemain -+ .align 4 -+$UnAlign_ACCESS_MainLoop: -+ -+ LD $f10, 0*SIZE(Y) -+ LD $f11, 1*SIZE(Y) -+ LD $f12, 2*SIZE(Y) -+ LD $f13, 3*SIZE(Y) -+ LD $f14, 4*SIZE(Y) -+ LD $f15, 5*SIZE(Y) -+ LD $f16, 6*SIZE(Y) -+ LD $f17, 7*SIZE(Y) -+ -+ LD $f20, 0*SIZE(X) -+ LD $f21, 1*SIZE(X) -+ LD $f22, 2*SIZE(X) -+ LD $f23, 3*SIZE(X) -+ LD $f24, 4*SIZE(X) -+ LD $f25, 5*SIZE(X) -+ LD $f26, 6*SIZE(X) -+ LD $f27, 7*SIZE(X) -+ -+ fillcs 16*SIZE(X) -+ unop -+ fillcs 16*SIZE(Y) -+ subl $21, 1, $21 -+ -+ ST $f10, 0*SIZE(X) -+ ST $f11, 1*SIZE(X) -+ ST $f12, 2*SIZE(X) -+ ST $f13, 3*SIZE(X) -+ ST $f14, 4*SIZE(X) -+ ST $f15, 5*SIZE(X) -+ ST $f16, 6*SIZE(X) -+ ST $f17, 7*SIZE(X) -+ -+ ST $f20, 0*SIZE(Y) -+ ST $f21, 1*SIZE(Y) -+ ST $f22, 2*SIZE(Y) -+ ST $f23, 3*SIZE(Y) -+ ST $f24, 4*SIZE(Y) -+ ST $f25, 5*SIZE(Y) -+ ST $f26, 6*SIZE(Y) -+ ST $f27, 7*SIZE(Y) ++ ldl $18, 24($sp) ++ ble $16, $End ++ ldl $19, 32($sp) ++ ble $17, $End + -+ ldi X, 8*SIZE(X) -+ ldi Y, 8*SIZE(Y) -+ bgt $21, $UnAlign_ACCESS_MainLoop ++ addl $19, $19, $19 ++ fbne $f19,$Main ++ fbne $f20,$Main + .align 4 + -+$MainRemain: -+ ble $22, $MainEnd ++$L13: ++ mov $18, $1 ++ ldi $17, -1($17) ++ SXADDQ $19, $18, $18 ++ mov $16, $2 + .align 4 + -+$MainRemainLoop: -+ LD $f10, 0*SIZE(Y) -+ LD $f11, 1*SIZE(Y) -+ LD $f20, 0*SIZE(X) -+ LD $f21, 1*SIZE(X) -+ -+ ldi X, 2*SIZE(X) -+ ldi Y, 2*SIZE(Y) -+ subl $22, 1, $22 -+ ST $f10, -2*SIZE(X) -+ ST $f11, -1*SIZE(X) -+ ST $f20, -2*SIZE(Y) -+ ST $f21, -1*SIZE(Y) -+ bgt $22, $MainRemainLoop -+ .align 4 -+ -+$MainEnd: ++$L12: ++ ST $f31, 0*SIZE($1) ++ ST $f31, 1*SIZE($1) ++ ldi $2, -1($2) ++ ldi $1, 2*SIZE($1) ++ bgt $2, $L12 ++ bgt $17,$L13 + clr $0 + ret + .align 4 + -+$Sub: -+ sra $16, 2, $21 -+ and $16, 3, $22 ++/* Main Routine */ ++$Main: ++ sra $16, 1, $2 # $2 = (m >> 1) ++ mov $18, $1 # c_offset = c ++ ldi $17, -1($17) # n -- ++ SXADDQ $19, $18, $18 # c += ldc ++ beq $2, $L18 + -+ mov $17, $23 -+ mov $19, $24 -+ ble $21, $SubRemain ++ LD $f14, 0*SIZE($1) ++ LD $f15, 1*SIZE($1) ++ LD $f24, 2*SIZE($1) ++ LD $f25, 3*SIZE($1) ++ ldi $2, -1($2) # $2 -- ++ ble $2, $L19 + .align 4 + -+$SubLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f11, 1*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f12, 0*SIZE($19) -+ LD $f13, 1*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f14, 0*SIZE($19) -+ LD $f15, 1*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f16, 0*SIZE($19) -+ LD $f17, 1*SIZE($19) -+ SXADDQ $20, $19, $19 -+ -+ LD $f20, 0*SIZE($17) -+ LD $f21, 1*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ LD $f22, 0*SIZE($17) -+ LD $f23, 1*SIZE($17) -+ SXADDQ $18, $17, $17 -+ -+ LD $f24, 0*SIZE($17) -+ LD $f25, 1*SIZE($17) -+ SXADDQ $18, $17, $17 + -+ LD $f26, 0*SIZE($17) -+ LD $f27, 1*SIZE($17) -+ SXADDQ $18, $17, $17 ++$L23: ++ MUL $f19, $f14, $f10 ++ fillde 9*SIZE($1) ++ MUL $f20, $f15, $f11 ++ ldi $2, -1($2) + -+ ST $f10, 0*SIZE($23) -+ ST $f11, 1*SIZE($23) -+ SXADDQ $18, $23, $23 ++ MUL $f19, $f15, $f12 ++ LD $f15, 5*SIZE($1) ++ MUL $f20, $f14, $f13 ++ LD $f14, 4*SIZE($1) + -+ ST $f12, 0*SIZE($23) -+ ST $f13, 1*SIZE($23) -+ SXADDQ $18, $23, $23 ++ MUL $f19, $f24, $f16 ++ unop ++ MUL $f20, $f25, $f17 ++ unop + -+ ST $f14, 0*SIZE($23) -+ ST $f15, 1*SIZE($23) -+ SXADDQ $18, $23, $23 ++ MUL $f19, $f25, $f18 ++ LD $f25, 7*SIZE($1) ++ SUB $f10, $f11, $f22 ++ unop + -+ ST $f16, 0*SIZE($23) -+ ST $f17, 1*SIZE($23) -+ SXADDQ $18, $23, $23 ++ MUL $f20, $f24, $f21 ++ LD $f24, 6*SIZE($1) ++ ADD $f12, $f13, $f23 ++ ldi $1, 4*SIZE($1) + -+ ST $f20, 0*SIZE($24) -+ ST $f21, 1*SIZE($24) -+ SXADDQ $20, $24, $24 ++ SUB $f16, $f17, $f26 ++ ADD $f18, $f21, $f27 ++ ST $f22,-4*SIZE($1) ++ ST $f23,-3*SIZE($1) + -+ ST $f22, 0*SIZE($24) -+ ST $f23, 1*SIZE($24) -+ SXADDQ $20, $24, $24 ++ ST $f26,-2*SIZE($1) ++ ST $f27,-1*SIZE($1) ++ unop ++ bgt $2,$L23 ++ .align 4 + -+ ST $f24, 0*SIZE($24) -+ ST $f25, 1*SIZE($24) -+ SXADDQ $20, $24, $24 ++$L19: ++ MUL $f19, $f14, $f10 ++ MUL $f20, $f15, $f11 ++ MUL $f19, $f15, $f12 ++ MUL $f20, $f14, $f13 + -+ ST $f26, 0*SIZE($24) -+ ST $f27, 1*SIZE($24) -+ SXADDQ $20, $24, $24 ++ MUL $f19, $f24, $f16 ++ MUL $f20, $f25, $f17 ++ MUL $f19, $f25, $f18 ++ MUL $f20, $f24, $f21 + -+ subl $21, 1, $21 -+ bgt $21, $SubLoop -+ .align 4 ++ SUB $f10, $f11, $f22 ++ ADD $f12, $f13, $f23 ++ SUB $f16, $f17, $f26 ++ ADD $f18, $f21, $f27 ++ ldi $1, 4*SIZE($1) + -+$SubRemain: -+ ble $22, $SubEnd -+ .align 4 ++ ST $f22, -4*SIZE($1) ++ ST $f23, -3*SIZE($1) ++ ST $f26, -2*SIZE($1) ++ ST $f27, -1*SIZE($1) + -+$SubRemainLoop: -+ LD $f10, 0*SIZE($19) -+ LD $f11, 1*SIZE($19) -+ LD $f20, 0*SIZE($17) -+ LD $f21, 1*SIZE($17) ++ blbs $16, $L18 ++ bgt $17, $Main ++ clr $0 ++ ret ++ .align 4 + -+ subl $22, 1, $22 ++$L18: ++ LD $f14, 0*SIZE($1) ++ LD $f15, 1*SIZE($1) ++ MUL $f19, $f15, $f13 ++ MUL $f20, $f14, $f10 + -+ ST $f10, 0*SIZE($17) -+ ST $f11, 1*SIZE($17) -+ ST $f20, 0*SIZE($19) -+ ST $f21, 1*SIZE($19) ++ MUL $f19, $f14, $f12 ++ MUL $f20, $f15, $f11 ++ ADD $f13, $f10, $f26 ++ SUB $f12, $f11, $f27 + -+ SXADDQ $18, $17, $17 -+ SXADDQ $20, $19, $19 -+ bgt $22, $SubRemainLoop ++ ST $f26, 1*SIZE($1) ++ ST $f27, 0*SIZE($1) ++ ldi $1, 2*SIZE($1) ++ bgt $17, $Main + .align 4 + -+$SubEnd: ++$End: + clr $0 + ret -+ EPILOGUE -diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S b/kernel/sw_64/ztrsm_kernel_2x2_LN.S ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/zgemm_kernel_2x2.S b/kernel/sw_64/zgemm_kernel_2x2.S new file mode 100644 -index 0000000..3a14e58 +index 000000000..1bd180f87 --- /dev/null -+++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S -@@ -0,0 +1,2593 @@ ++++ b/kernel/sw_64/zgemm_kernel_2x2.S +@@ -0,0 +1,1705 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -85712,36 +26876,29 @@ index 0000000..3a14e58 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++ ++#if !defined(SW8A) +#error "Architecture is not specified." +#endif + -+#ifdef SW6 ++#ifdef SW8A +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + -+#ifdef EV5 -+#define PREFETCHSIZE 48 -+#define UNOP -+#endif + -+#ifdef EV4 -+#define UNOP -+#endif + + .set noat + .set noreorder -+ .arch sw6a ++ .arch sw8a + +.text + .align 5 + .globl CNAME + .ent CNAME + -+#define STACKSIZE 88 ++#define STACKSIZE 80 + +#define M $16 +#define N $17 @@ -85760,683 +26917,147 @@ index 0000000..3a14e58 +#define J $7 +#define L $8 + -+#define tmp $9 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define alpha_i $f29 -+#define alpha_r $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 -+ -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 -+ -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 -+ -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 -+ -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define AORIG $3 -+#define OFFSET $4 -+ -+#if defined(LN) || defined(LT) -+#ifndef CONJ -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#define ADD5 SUB -+#define ADD6 ADD -+#else -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 SUB -+#define ADD4 ADD -+#define ADD5 ADD -+#define ADD6 SUB -+#endif -+#else -+#ifndef CONJ -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#define ADD5 SUB -+#define ADD6 ADD -+#else -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 ADD -+#define ADD4 SUB -+#define ADD5 ADD -+#define ADD6 SUB -+#endif -+#endif -+ -+ -+CNAME: -+ .frame $sp, STACKSIZE, $26, 0 -+ -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ ldi $at, _mcount -+ jsr $at, ($at), _mcount -+#endif -+ -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif -+ -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl B, 0 + STACKSIZE($sp) -+ ldl C, 8 + STACKSIZE($sp) -+ ldl LDC, 16 + STACKSIZE($sp) -+ ldl OFFSET, 24 + STACKSIZE($sp) -+ -+ sll LDC, ZBASE_SHIFT, LDC -+ -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ stl tmp, 72($sp) -+ -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 -+ -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 -+ -+#ifdef LN -+ addl M, M, TMP2 -+ mull TMP2, K, TMP1 -+ SXADDQ TMP1, A, A -+ SXADDQ TMP2, C, C -+#endif -+ -+#ifdef RN -+ negl OFFSET, KK -+#endif -+ -+#ifdef RT -+ mull N, K, TMP1 -+ addl TMP1, TMP1, TMP1 -+ SXADDQ TMP1, B, B -+ -+ mull N, LDC, TMP1 -+ addl TMP1, C, C -+ -+ subl N, OFFSET, KK -+#endif -+ -+ sra N, 1, J -+ ble J, $L30 -+ .align 4 -+ -+$L01: -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ subl B, TMP1, B -+ -+ subl C, LDC, C2 -+ subl C2, LDC, C1 -+ subl C2, LDC, C -+#else -+ mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C -+#endif -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK -+#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif -+ -+ and M, 1, I -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+ fclr c01 -+ fclr c05 -+ ble I, $L20 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c10 -+ LD b2, 1 * SIZE(B) -+ fclr c14 -+ -+ LD b3, 2 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 4 * SIZE(B) -+ -+ ldi L, -2(KK) -+ -+ ble KK, $L28 -+ ble L, $L25 -+#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c10 -+ LD b2, 1 * SIZE(BO) -+ fclr c14 -+ -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 4 * SIZE(BO) -+ -+ ldi L, -2(TMP1) -+ -+ ble TMP1, $L28 -+ ble L, $L25 -+#endif -+ .align 5 -+ -+$L22: -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+// unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+// unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+// unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+// unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+// unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+// unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+// unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ FIMOVD b5, tmp -+ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+// unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+// unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+// unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+// unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+// unop -+ IFMOVD tmp, b5 -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ IFMOVD tmp, b5 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 -+ -+$L25: -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 -+#else -+ blbs TMP1, $L27 -+#endif -+ .align 4 -+ -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+// unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+// unop -+ MUL a1, b2, t3 -+ unop -+ -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+// unop -+ MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+// unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+// unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+// unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+// unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L27: -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ MUL a2, b1, t2 -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+ MUL a1, b2, t3 -+ -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+ MUL a2, b2, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, t1 -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, t2 -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b4, t3 -+ -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+ -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 -+ ADD c09, c14, b5 -+ fmov b5, c09 -+ ADD c10, c13, b5 -+ fmov b5, c10 -+ .align 4 -+ -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c10, b5 -+ fmov b5, c10 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c10, b5 -+ fmov b5, c10 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c09, t3, b5 -+ fmov b5, c09 -+ ADD6 c10, t4, b5 -+ fmov b5, c10 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 + -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 + -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ ADD6 c09, t1, b5 -+ fmov b5, c09 -+ ADD5 c10, t2, b5 -+ fmov b5, c10 ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 + -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 ++#define alpha_i $f29 ++#define alpha_r $f30 + -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 -+#endif ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 + -+#ifdef RT -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ LD a3, 4 * SIZE(BO) -+ LD a4, 5 * SIZE(BO) ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 + -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 + -+ MUL a3, c09, t1 -+ MUL a3, c10, t2 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define BB $3 ++#define OFFSET $4 + -+ MUL a4, c10, t1 -+ MUL a4, c09, t2 -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 ++#define ALPHA_R 64($sp) ++#define ALPHA_I 72($sp) + -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++#if defined(NN) || defined(NT) || defined(TN) || defined(TT) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount +#endif + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c10, 3 * SIZE(BO) ++#ifndef PROFILE ++ .prologue 0 +#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c10, 3 * SIZE(AO) ++ .prologue 1 +#endif + -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++#ifdef TRMMKERNEL ++ ldl OFFSET, 24 + STACKSIZE($sp) +#endif + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c09, 0 * SIZE(C2) -+ ST c10, 1 * SIZE(C2) ++ sll LDC, ZBASE_SHIFT, LDC + -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+#endif ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ fstd $f19, ALPHA_R ++ fstd $f20, ALPHA_I + -+#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 + -+#ifdef LT -+ addl KK, 1, KK ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK +#endif + -+#ifdef LN -+ subl KK, 1, KK -+#endif ++ sra N, 1, J ++ ble J, $L30 + .align 4 + -+$L20: ++$L01: ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ s4addl K, 0, BB ++ ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ SXADDQ BB, B, BB ++ addl C2, LDC, C ++ unop ++ + sra M, 1, I + fclr t1 + fclr t2 @@ -86446,11 +27067,28 @@ index 0000000..3a14e58 + fclr c01 + fclr c05 + -+ ble I, $L29 ++ ble I, $L20 + .align 4 + +$L11: -+#if defined(LT) || defined(RN) ++#ifndef EV4 ++ s_fillcs 0 * SIZE(BB) ++ s_fillcs 8 * SIZE(BB) ++ unop ++ ldi BB, 16 * SIZE(BB) ++#endif ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif + + LD a1, 0 * SIZE(AO) + fclr c09 @@ -86477,26 +27115,23 @@ index 0000000..3a14e58 + ldi AO, 4 * SIZE(AO) + fclr c15 + -+ fillcs 4 * SIZE(C1) ++ fillde 4 * SIZE(C1) + fclr c04 -+ ldi L, -2(KK) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif + fclr c08 + -+ fillcs 4 * SIZE(C2) ++ fillde 4 * SIZE(C2) + fclr c12 + fclr c16 -+ ble KK, $L18 + ble L, $L15 +#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ + sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ addl B, TMP1, BO -+ ++ addl AO, TMP1, AO ++ addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) @@ -86524,1227 +27159,851 @@ index 0000000..3a14e58 + ldi AO, 4 * SIZE(AO) + fclr c15 + -+ fillcs 4 * SIZE(C1) ++ fillde 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + -+ fillcs 4 * SIZE(C2) ++ fillde 4 * SIZE(C2) + fclr c12 + fclr c16 -+ ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ -+ ADD1 c11, t1, b5 -+ fmov b5, c11 ++ ADD1 c11, t1, c11 +#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) ++ s_fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) ++ s_fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+// unop ++ ADD3 c12, t2, c12 ++ unop + MUL b1, a2, t2 + unop + -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+// unop ++ ADD2 c16, t3, c16 ++ unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+// unop ++ ADD4 c15, t4, c15 ++ unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) -+ FIMOVD b5, tmp + +/* 2 */ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++ ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+// unop ++ ADD2 c06, t3, c06 ++ unop + MUL b2, a4, t3 + unop + -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+// unop ++ ADD4 c05, t4, c05 ++ unop + MUL b4, a1, t4 + unop + +/* 3 */ -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+// unop ++ ADD1 c03, t1, c03 ++ unop + MUL b3, a1, t1 + unop + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+// unop ++ ADD3 c04, t2, c04 ++ unop + MUL b3, a2, t2 + unop + -+ ADD2 c08, t3, b5 -+ fmov b5, c08 -+// unop ++ ADD2 c08, t3, c08 ++ unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 -+// unop ++ ADD4 c13, t4, c13 ++ unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+// unop ++ ADD1 c09, t1, c09 ++ unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+// unop ++ ADD3 c10, t2, c10 ++ unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+// unop ++ ADD2 c14, t3, c14 ++ unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+// unop ++ ADD4 c07, t4, c07 ++ unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+// unop -+ IFMOVD tmp, b5 ++ ADD1 c11, t1, c11 ++ unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + -+ ADD3 c12, t2, b5 -+ fmov b5, c12 ++ ADD3 c12, t2, c12 + ldi L, -2(L) -+ IFMOVD tmp, b5 + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+// unop ++ ADD2 c16, t3, c16 ++ unop + MUL b2, a2, t3 + unop + -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+// unop ++ ADD4 c15, t4, c15 ++ unop + MUL b2, a5, t4 + unop + +/* 6 */ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+// unop -+ IFMOVD tmp, b5 ++ ADD1 c01, t1, c01 ++ unop + MUL b5, a6, t1 + unop + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+// unop -+ IFMOVD tmp, b5 ++ ADD3 c02, t2, c02 ++ unop + MUL b5, a4, t2 + unop + -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+// unop ++ ADD2 c06, t3, c06 ++ unop + MUL b2, a4, t3 + unop + -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+// unop ++ ADD4 c05, t4, c05 ++ unop + MUL b4, a5, t4 + unop + +/* 7 */ -+ ADD1 c03, t1, b5 -+ fmov b5, c03 ++ ADD1 c03, t1, c03 + ldi AO, 8 * SIZE(AO) + MUL b3, a5, t1 -+// unop ++ unop + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 ++ ADD3 c04, t2, c04 + ldi BO, 8 * SIZE(BO) + MUL b3, a2, t2 -+// unop ++ unop + -+ ADD2 c08, t3, b5 -+ fmov b5, c08 -+// unop ++ ADD2 c08, t3, c08 ++ unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 -+// unop ++ ADD4 c13, t4, c13 ++ unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+// unop ++ ADD1 c09, t1, c09 ++ unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+// unop ++ ADD3 c10, t2, c10 ++ unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+// unop ++ ADD2 c14, t3, c14 ++ unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + -+ ADD4 c07, t4, b5 -+ fmov b5, c07 ++ ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+// unop ++ ADD1 c11, t1, c11 ++ fldd alpha_r, ALPHA_R + MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L17 ++#ifndef TRMMKERNEL ++ blbs K, $L18 +#else -+ blbs TMP1, $L17 ++ blbs TMP1, $L18 +#endif + .align 4 + -+ ADD3 c12, t2, b5 -+ fmov b5, c12 ++ ADD3 c12, t2, c12 + MUL b1, a2, t2 -+ ADD2 c16, t3, b5 -+ fmov b5, c16 ++ ADD2 c16, t3, c16 + MUL b2, a2, t3 + -+ ADD4 c15, t4, b5 -+ fmov b5, c15 ++ ADD4 c15, t4, c15 + MUL b2, a1, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++ ADD1 c01, t1, c01 + MUL b1, a3, t1 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+// unop ++ ADD3 c02, t2, c02 ++ unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + -+ ADD2 c06, t3, b5 -+ fmov b5, c06 ++ ADD2 c06, t3, c06 + MUL b2, a4, t3 -+ ADD4 c05, t4, b5 -+ fmov b5, c05 ++ ADD4 c05, t4, c05 + MUL b4, a1, t4 + -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+// unop ++ ADD1 c03, t1, c03 ++ unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+// unop ++ ADD3 c04, t2, c04 ++ unop + MUL b3, a2, t2 + unop + -+ ADD2 c08, t3, b5 -+ fmov b5, c08 -+// unop ++ ADD2 c08, t3, c08 ++ unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 -+// unop ++ ADD4 c13, t4, c13 ++ unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+// unop ++ ADD1 c09, t1, c09 ++ unop + MUL b3, a3, t1 + ldi AO, 4 * SIZE(AO) + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+// unop ++ ADD3 c10, t2, c10 ++ unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+// unop ++ ADD2 c14, t3, c14 ++ unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+// unop ++ ADD4 c07, t4, c07 ++ unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + -+ ADD1 c11, t1, b5 -+ fmov b5, c11 ++ ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + ldi BO, 4 * SIZE(BO) + .align 4 + -+$L17: -+ ADD3 c12, t2, b5 -+ fmov b5, c12 ++$L18: ++ ADD3 c12, t2, c12 ++ unop + MUL b1, a2, t2 -+ ADD2 c16, t3, b5 -+ fmov b5, c16 ++ fldd alpha_i, ALPHA_I ++ ++ ADD2 c16, t3, c16 ++ unop + MUL b2, a2, t3 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif + -+ ADD4 c15, t4, b5 -+ fmov b5, c15 ++ ADD4 c15, t4, c15 + MUL b2, a1, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++ ADD1 c01, t1, c01 + MUL b1, a3, t1 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ADD3 c02, t2, c02 ++ unop + MUL b1, a4, t2 -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, t3 -+ -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, t4 -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+ MUL b3, a1, t1 -+ -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+ MUL b3, a2, t2 -+ ADD2 c08, t3, b5 -+ fmov b5, c08 -+ MUL b4, a2, t3 -+ -+ ADD4 c13, t4, b5 -+ fmov b5, c13 -+ MUL b2, a3, t4 -+ ADD1 c09, t1,b5 -+ fmov b5, c09 -+ MUL b3, a3, t1 -+ -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ MUL b3, a4, t2 -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+ MUL b4, a4, t3 -+ -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+ ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 -+ ADD c03, c08, b5 -+ fmov b5, c03 -+ ADD c04, c07, b5 -+ fmov b5, c04 -+ -+ ADD c09, c14, b5 -+ fmov b5, c09 -+ ADD c10, c13, b5 -+ fmov b5, c10 -+ ADD c11, c16, b5 -+ fmov b5, c11 -+ ADD c12, c15, b5 -+ fmov b5, c12 -+ .align 4 -+ -+$L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c10, b5 -+ fmov b5, c10 -+ -+ SUB b1, c03, b5 -+ fmov b5, c03 -+ SUB b2, c04, b5 -+ fmov b5, c04 -+ SUB b3, c11, b5 -+ fmov b5, c11 -+ SUB b4, c12, b5 -+ fmov b5, c12 ++#ifndef TRMMKERNEL ++ LD b1, 1 * SIZE(C1) +#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+ -+ SUB b1, c09, b5 -+ fmov b5, c09 -+ SUB b2, c10, b5 -+ fmov b5, c10 -+ SUB b3, c11, b5 -+ fmov b5, c11 -+ SUB b4, c12, b5 -+ fmov b5, c12 -+#endif -+ -+#ifdef LN -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ LD a3, 4 * SIZE(AO) -+ LD a4, 5 * SIZE(AO) -+ -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a3, c03, t1 -+ MUL a3, c04, t2 -+ MUL a3, c11, t3 -+ MUL a3, c12, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c10, t4, b5 -+ fmov b5, c10 -+ -+ MUL a4, c04, t1 -+ MUL a4, c03, t2 -+ MUL a4, c12, t3 -+ MUL a4, c11, t4 -+ -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 -+ ADD6 c09, t3, b5 -+ fmov b5, c09 -+ ADD5 c10, t4, b5 -+ fmov b5, c10 -+ -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c09, t3, b5 -+ fmov b5, c09 -+ ADD6 c10, t4, b5 -+ fmov b5, c10 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c09, t3, b5 -+ fmov b5, c09 -+ ADD6 c10, t4, b5 -+ fmov b5, c10 -+ -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c09, t3 -+ MUL a3, c10, t4 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c04, t2, b5 -+ fmov b5, c04 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ MUL a4, c10, t3 -+ MUL a4, c09, t4 -+ -+ ADD6 c03, t1, b5 -+ fmov b5, c03 -+ ADD5 c04, t2, b5 -+ fmov b5, c04 -+ ADD6 c11, t3, b5 -+ fmov b5, c11 -+ ADD5 c12, t4, b5 -+ fmov b5, c12 -+ -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 -+ -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 ++ unop +#endif + -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c03, t3, b5 -+ fmov b5, c03 -+ ADD6 c04, t4, b5 -+ fmov b5, c04 -+ -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c03, t3 -+ MUL a3, c04, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ MUL a4, c04, t3 -+ MUL a4, c03, t4 -+ -+ ADD6 c09, t1, b5 -+ fmov b5, c09 -+ ADD5 c10, t2, b5 -+ fmov b5, c10 -+ ADD6 c11, t3, b5 -+ fmov b5, c11 -+ ADD5 c12, t4, b5 -+ fmov b5, c12 -+ -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 -+ -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 + -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++#ifndef TRMMKERNEL ++ LD a1, 2 * SIZE(C1) ++#else ++ unop +#endif + -+#ifdef RT -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ LD a3, 4 * SIZE(BO) -+ LD a4, 5 * SIZE(BO) -+ -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop + -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++#ifndef TRMMKERNEL ++ LD a2, 3 * SIZE(C1) ++#else ++ unop ++#endif + -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++#ifndef TRMMKERNEL ++ LD b2, 0 * SIZE(C2) ++#else ++ unop ++#endif + -+ MUL a3, c09, t1 -+ MUL a3, c10, t2 -+ MUL a3, c11, t3 -+ MUL a3, c12, t4 ++ ADD1 c09, t1, c09 ++ ldi I, -1(I) ++ MUL b3, a3, t1 ++ unop + -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++#ifndef TRMMKERNEL ++ LD b3, 1 * SIZE(C2) ++#else ++ unop ++#endif + -+ MUL a4, c10, t1 -+ MUL a4, c09, t2 -+ MUL a4, c12, t3 -+ MUL a4, c11, t4 ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++#ifndef TRMMKERNEL ++ LD a4, 2 * SIZE(C2) ++#else ++ unop ++#endif + -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 -+ ADD6 c03, t3, b5 -+ fmov b5, c03 -+ ADD5 c04, t4, b5 -+ fmov b5, c04 ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++#ifndef TRMMKERNEL ++ LD a3, 3 * SIZE(C2) ++#else ++ unop ++#endif + -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 + -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 ++ ADD c09, c14, c09 ++ MUL alpha_r, c01, t1 ++ ADD c10, c13, c10 ++ MUL alpha_r, c02, t2 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c03, t3, b5 -+ fmov b5, c03 -+ ADD6 c04, t4, b5 -+ fmov b5, c04 -+#endif ++ ADD c11, c16, c11 ++ MUL alpha_r, c03, t3 ++ ADD c12, c15, c12 ++ MUL alpha_r, c04, t4 + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c10, 3 * SIZE(BO) ++#ifndef TRMMKERNEL ++ ADD a5, t1, a5 ++ MUL alpha_i, c02, t1 ++ ADD b1, t2, b1 ++ MUL alpha_i, c01, t2 + -+ ST c03, 4 * SIZE(BO) -+ ST c04, 5 * SIZE(BO) -+ ST c11, 6 * SIZE(BO) -+ ST c12, 7 * SIZE(BO) ++ ADD a1, t3, a1 ++ MUL alpha_i, c04, t3 ++ ADD a2, t4, a2 ++ MUL alpha_i, c03, t4 +#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) ++ ADD $f31, t1, a5 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, b1 ++ MUL alpha_i, c01, t2 + -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c11, 6 * SIZE(AO) -+ ST c12, 7 * SIZE(AO) ++ ADD $f31, t3, a1 ++ MUL alpha_i, c04, t3 ++ ADD $f31, t4, a2 ++ MUL alpha_i, c03, t4 +#endif + -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+#endif ++ SUB a5, t1, a5 ++ MUL alpha_r, c09, t1 ++ ADD b1, t2, b1 ++ MUL alpha_r, c10, t2 + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) ++ SUB a1, t3, a1 ++ MUL alpha_r, c11, t3 ++ ADD a2, t4, a2 ++ MUL alpha_r, c12, t4 + -+ ST c09, 0 * SIZE(C2) -+ ST c10, 1 * SIZE(C2) -+ ST c11, 2 * SIZE(C2) -+ ST c12, 3 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD b2, t1, b2 ++ MUL alpha_i, c10, t1 ++ ADD b3, t2, b3 ++ MUL alpha_i, c09, t2 + -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) ++ ADD a4, t3, a4 ++ MUL alpha_i, c12, t3 ++ ADD a3, t4, a3 ++ MUL alpha_i, c11, t4 ++#else ++ ADD $f31, t1, b2 ++ MUL alpha_i, c10, t1 ++ ADD $f31, t2, b3 ++ MUL alpha_i, c09, t2 ++ ++ ADD $f31, t3, a4 ++ MUL alpha_i, c12, t3 ++ ADD $f31, t4, a3 ++ MUL alpha_i, c11, t4 +#endif + ++ SUB b2, t1, b2 ++ ST a5, 0 * SIZE(C1) + fclr t1 ++ unop ++ ++ ADD b3, t2, b3 ++ ST b1, 1 * SIZE(C1) + fclr t2 ++ unop ++ ++ SUB a4, t3, a4 ++ ST a1, 2 * SIZE(C1) + fclr t3 ++ unop ++ ++ ADD a3, t4, a3 ++ ST a2, 3 * SIZE(C1) + fclr t4 ++ unop + -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ST b2, 0 * SIZE(C2) ++ fclr c01 ++ ST b3, 1 * SIZE(C2) ++ fclr c05 + -+#if defined(LT) || defined(RN) ++ ST a4, 2 * SIZE(C2) ++ ldi C1, 4 * SIZE(C1) ++ ST a3, 3 * SIZE(C2) ++ ldi C2, 4 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addl AO, TMP1, AO + addl BO, TMP1, BO +#endif + -+#ifdef LT ++#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 2, KK +#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ fclr c01 -+ fclr c05 -+ -+ ldi I, -1(I) + bgt I, $L11 + .align 4 + -+$L29: -+#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 2, KK -+#endif -+ -+#ifdef RT -+ subl KK, 2, KK -+#endif -+ -+ ldi J, -1(J) -+ bgt J, $L01 -+ .align 4 -+ -+$L30: -+ and N, 1, J -+ ble J, $L999 ++$L20: ++ and M, 1, I ++ ble I, $L29 + -+#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 -+ subl B, TMP1, B ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+ subl C, LDC, C1 -+ subl C, LDC, C ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 +#else -+ mov C, C1 -+ addl C, LDC, C -+#endif -+ -+#ifdef LN -+ addl M, OFFSET, KK -+#endif -+ -+#ifdef LT -+ mov OFFSET, KK ++ addl KK, 2, TMP1 +#endif -+ -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO +#endif + -+ and M, 1, I -+ ble I, $L50 -+ -+#if defined(LT) || defined(RN) -+ + LD a1, 0 * SIZE(AO) -+ fclr t1 ++ fclr c09 + LD a2, 1 * SIZE(AO) -+ fclr t2 ++ fclr c13 ++ + LD a3, 2 * SIZE(AO) -+ fclr t3 ++ fclr c02 + LD a4, 3 * SIZE(AO) -+ fclr t4 ++ fclr c06 + + LD b1, 0 * SIZE(B) -+ fclr c01 ++ fclr c10 + LD b2, 1 * SIZE(B) -+ fclr c05 ++ fclr c14 + + LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 -+ + ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(B) -+ -+ ldi L, -2(KK) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) + -+ ble KK, $L58 -+ ble L, $L55 ++#ifndef TRMMKERNEL ++ ldi L, -2(K) +#else -+#ifdef LN -+ sll K, ZBASE_SHIFT, TMP1 -+ subl AORIG, TMP1, AORIG ++ ldi L, -2(TMP1) +#endif -+ -+ sll KK, ZBASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT, TMP1 -+ addl B, TMP1, BO -+ ++ ble L, $L25 ++#else ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) -+ fclr t1 ++ fclr c09 + LD a2, 1 * SIZE(AO) -+ fclr t2 ++ fclr c13 ++ + LD a3, 2 * SIZE(AO) -+ fclr t3 ++ fclr c02 + LD a4, 3 * SIZE(AO) -+ fclr t4 ++ fclr c06 + + LD b1, 0 * SIZE(BO) -+ fclr c01 ++ fclr c10 + LD b2, 1 * SIZE(BO) -+ fclr c05 ++ fclr c14 + + LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ + ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) -+ -+ ble TMP1, $L58 -+ ble L, $L55 ++ ble L, $L25 +#endif + .align 5 + -+$L52: -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++$L22: ++ ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ ldi AO, 4 * SIZE(AO) ++ ADD3 c10, t2, c10 ++ unop + MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) ++ LD b1, 0 * SIZE(BO) + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ ldi L, -2(L) ++ ADD4 c13, t3, c13 ++ unop + MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) ++ ldi BO, 8 * SIZE(BO) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+// unop ++ ADD2 c14, t4, c14 ++ unop + MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) ++ LD b2, -7 * SIZE(BO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 3 * SIZE(BO) ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) + MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) ++ LD b4, -1 * SIZE(BO) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+// unop ++ ADD3 c02, t2, c02 ++ unop + MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) ++ LD b3, -2 * SIZE(BO) + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+// unop -+ MUL a3, b4, t3 ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) -+// unop -+ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L52 ++ bgt L, $L22 + .align 4 + -+$L55: -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++$L25: ++ ADD1 c09, t1, c09 ++ fldd alpha_r, ALPHA_R + MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L57 ++#ifndef TRMMKERNEL ++ blbs K, $L28 +#else -+ blbs TMP1, $L57 ++ blbs TMP1, $L28 +#endif + .align 4 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+// unop ++ ADD3 c10, t2, c10 ++ unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) ++ ADD4 c13, t3, c13 ++ unop + MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) ++ unop + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+// unop ++ ADD2 c14, t4, c14 ++ unop + MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) -+ .align 4 + -+$L57: -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, t2 -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b2, t3 ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, t4 -+ ldi BO, 2 * SIZE(BO) ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ ADD2 c06, t4, b5 -+ fmov b5, c06 ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) + -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 + -+$L58: -+#if defined(LN) || defined(RT) -+ subl KK, 1, TMP1 ++$L28: ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I + -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl B, TMP2, BO ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c03, 0 * SIZE(C1) +#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) ++ unop +#endif + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c04, 1 * SIZE(C1) ++#else ++ unop ++#endif + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++#ifndef TRMMKERNEL ++ LD c11, 0 * SIZE(C2) +#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 ++ unop +#endif + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++#ifndef TRMMKERNEL ++ LD c12, 1 * SIZE(C2) ++#else ++ unop ++#endif + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+#endif ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 + -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_r, c09, t3 ++ MUL alpha_r, c10, t4 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+#endif ++#ifndef TRMMKERNEL ++ ADD c03, t1, c03 ++ MUL alpha_i, c02, t1 ++ ADD c04, t2, c04 ++ MUL alpha_i, c01, t2 + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) ++ ADD c11, t3, c11 ++ MUL alpha_i, c10, t3 ++ ADD c12, t4, c12 ++ MUL alpha_i, c09, t4 +#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+#endif ++ ADD $f31, t1, c03 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, c04 ++ MUL alpha_i, c01, t2 + -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) ++ ADD $f31, t3, c11 ++ MUL alpha_i, c10, t3 ++ ADD $f31, t4, c12 ++ MUL alpha_i, c09, t4 +#endif + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+#endif ++ SUB c03, t1, c03 ++ ADD c04, t2, c04 ++ SUB c11, t3, c11 ++ ADD c12, t4, c12 + -+#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ST c03, 0 * SIZE(C1) ++ ST c04, 1 * SIZE(C1) ++ ST c11, 0 * SIZE(C2) ++ ST c12, 1 * SIZE(C2) + -+#if defined(LT) || defined(RN) ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT, TMP2 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + -+#ifdef LT ++#if defined(TRMMKERNEL) && defined(LEFT) + addl KK, 1, KK +#endif ++ .align 4 + -+#ifdef LN -+ subl KK, 1, KK ++$L29: ++ mov BO, B ++ ldi J, -1(J) ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 2, KK ++#else ++ unop +#endif ++ bgt J, $L01 + .align 4 + -+$L50: ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++ mov C, C1 ++ mov A, AO ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ + sra M, 1, I -+ ble I, $L59 ++ ble I, $L50 + .align 4 + +$L41: -+#if defined(LT) || defined(RN) ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif + + LD a1, 0 * SIZE(AO) + fclr t1 @@ -87769,23 +28028,19 @@ index 0000000..3a14e58 + ldi AO, 4 * SIZE(AO) + fclr c07 + -+ ldi L, -2(KK) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif + fclr c04 + fclr c08 -+ -+ ble KK, $L48 + ble L, $L45 +#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ + sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT, TMP1 -+ addl B, TMP1, BO -+ ++ addl AO, TMP1, AO ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) @@ -87814,5329 +28069,4332 @@ index 0000000..3a14e58 + ldi L, -2(TMP1) + fclr c04 + fclr c08 -+ -+ ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+// unop ++ ADD4 c05, t1, c05 ++ unop + MUL a1, b1, t1 + unop + -+ ADD2 c06, t2, b5 -+ fmov b5, c06 ++ ADD2 c06, t2, c06 + ldi L, -2(L) + MUL a2, b1, t2 -+// unop ++ unop + -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+// unop ++ ADD4 c07, t3, c07 ++ unop + MUL a3, b1, t3 + unop + -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+// unop ++ ADD2 c08, t4, c08 ++ unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+// unop ++ ADD1 c01, t1, c01 ++ unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ADD3 c02, t2, c02 + ldi BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + -+ ADD1 c03, t3, b5 -+ fmov b5, c03 -+// unop ++ ADD1 c03, t3, c03 ++ unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+// unop ++ ADD3 c04, t4, c04 ++ unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+// unop ++ ADD4 c05, t1, c05 ++ unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+// unop ++ ADD2 c06, t2, c06 ++ unop + MUL a2, b3, t2 + unop + -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+// unop ++ ADD4 c07, t3, c07 ++ unop + MUL a3, b3, t3 + ldi AO, 8 * SIZE(AO) + -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+// unop ++ ADD2 c08, t4, c08 ++ unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+// unop ++ ADD1 c01, t1, c01 ++ unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+// unop ++ ADD3 c02, t2, c02 ++ unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + -+ ADD1 c03, t3, b5 -+ fmov b5, c03 ++ ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + -+ ADD3 c04, t4, b5 -+ fmov b5, c04 ++ ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: -+ ADD4 c05, t1, b5 -+ fmov b5, c05 ++ ADD4 c05, t1, c05 ++ fldd alpha_r, ALPHA_R + MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L47 ++#ifndef TRMMKERNEL ++ blbs K, $L48 +#else -+ blbs TMP1, $L47 ++ blbs TMP1, $L48 +#endif + .align 4 + -+ ADD2 c06, t2, b5 -+ fmov b5, c06 ++ ADD2 c06, t2, c06 + MUL a2, b1, t2 -+ ADD4 c07, t3, b5 -+ fmov b5, c07 ++ ADD4 c07, t3, c07 + MUL a3, b1, t3 + -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+// unop ++ ADD2 c08, t4, c08 ++ unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+// unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+// unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD1 c03, t3, b5 -+ fmov b5, c03 -+// unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 -+ -+$L47: -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, t3 -+ -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+ MUL a4, b1, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b2, t1 -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, t2 -+ ADD1 c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b2, t3 -+ -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+ -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 -+ ADD c03, c08, b5 -+ fmov b5, c03 -+ ADD c04, c07, b5 -+ fmov b5, c04 -+ -+$L48: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#endif -+ -+#ifdef LN -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ LD a3, 4 * SIZE(AO) -+ LD a4, 5 * SIZE(AO) -+ -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+ MUL a3, c03, t1 -+ MUL a3, c04, t2 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ MUL a4, c04, t1 -+ MUL a4, c03, t2 -+ -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 -+ -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+#endif -+ -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c04, t2, b5 -+ fmov b5, c04 -+ -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ ADD6 c03, t1, b5 -+ fmov b5, c03 -+ ADD5 c04, t2, b5 -+ fmov b5, c04 -+ -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c03, t3, b5 -+ fmov b5, c03 -+ ADD6 c04, t4, b5 -+ fmov b5, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+#endif -+ -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl BO, TMP2, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ -+ ldi I, -1(I) -+ bgt I, $L41 -+ .align 4 -+ -+$L59: -+#ifdef LN -+ sll K, ZBASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif -+ -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif -+ -+#ifdef RN -+ addl KK, 1, KK -+#endif -+ -+#ifdef RT -+ subl KK, 1, KK -+#endif -+ .align 4 -+ -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldl tmp, 72($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ .ident VERSION -+ .end CNAME -diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak -new file mode 100644 -index 0000000..71202d8 ---- /dev/null -+++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak -@@ -0,0 +1,2230 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ -+ -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" -+ -+ -+#if !defined(SW2B) -+#error "Architecture is not specified." -+#endif -+ -+#ifdef SW2B -+#define PREFETCHSIZE 56 -+#define UNOP unop -+#endif -+ -+ -+ .set noat -+ .set noreorder -+ .arch ev6 -+ -+.text -+ .align 5 -+ .globl CNAME -+ .ent CNAME -+ -+#define STACKSIZE 80 -+ -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $21 -+#define B $22 -+#define C $20 -+#define LDC $23 -+ -+#define C1 $19 -+#define C2 $24 -+ -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 -+ -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 -+ -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 -+ -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 -+ -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 -+ -+#define alpha_i $f29 -+#define alpha_r $f30 -+ -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) + -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) + -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) + -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) + -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define AORIG $3 -+#define OFFSET $4 ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 + -+#if defined(LN) || defined(LT) -+#ifndef CONJ -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#define ADD5 SUB -+#define ADD6 ADD ++$L48: ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c07, t3, c07 ++ ldi I, -1(I) ++ MUL a3, b1, t3 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) +#else -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 SUB -+#define ADD4 ADD -+#define ADD5 ADD -+#define ADD6 SUB ++ unop +#endif ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) +#else -+#ifndef CONJ -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#define ADD5 SUB -+#define ADD6 ADD -+#else -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 ADD -+#define ADD4 SUB -+#define ADD5 ADD -+#define ADD6 SUB -+#endif ++ unop +#endif + -+ -+CNAME: -+ .frame $sp, STACKSIZE, $26, 0 -+ -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ ldi $at, _mcount -+ jsr $at, ($at), _mcount ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++#ifndef TRMMKERNEL ++ LD c11, 2 * SIZE(C1) ++#else ++ unop +#endif + -+#ifndef PROFILE -+ .prologue 0 ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++#ifndef TRMMKERNEL ++ LD c12, 3 * SIZE(C1) +#else -+ .prologue 1 ++ unop +#endif + -+ ldi $sp, -STACKSIZE($sp) -+ -+ ldl B, 0 + STACKSIZE($sp) -+ ldl C, 8 + STACKSIZE($sp) -+ ldl LDC, 16 + STACKSIZE($sp) -+ ldl OFFSET, 24 + STACKSIZE($sp) ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 + -+ sll LDC, ZBASE_SHIFT, LDC ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 + -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 + -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_r, c03, t3 ++ MUL alpha_r, c04, t4 + -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 ++#ifndef TRMMKERNEL ++ ADD c09, t1, c09 ++ MUL alpha_i, c02, t1 ++ ADD c10, t2, c10 ++ MUL alpha_i, c01, t2 + -+#ifdef LN -+ addl M, M, TMP2 -+ mull TMP2, K, TMP1 -+ SXADDQ TMP1, A, A -+ SXADDQ TMP2, C, C -+#endif ++ ADD c11, t3, c11 ++ MUL alpha_i, c04, t3 ++ ADD c12, t4, c12 ++ MUL alpha_i, c03, t4 ++#else ++ ADD $f31, t1, c09 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, c10 ++ MUL alpha_i, c01, t2 + -+#ifdef RN -+ negl OFFSET, KK ++ ADD $f31, t3, c11 ++ MUL alpha_i, c04, t3 ++ ADD $f31, t4, c12 ++ MUL alpha_i, c03, t4 +#endif + -+#ifdef RT -+ mull N, K, TMP1 -+ addl TMP1, TMP1, TMP1 -+ SXADDQ TMP1, B, B -+ -+ mull N, LDC, TMP1 -+ addl TMP1, C, C ++ SUB c09, t1, c09 ++ ADD c10, t2, c10 ++ SUB c11, t3, c11 ++ ADD c12, t4, c12 + -+ subl N, OFFSET, KK -+#endif ++ ST c09, 0 * SIZE(C1) ++ ST c10, 1 * SIZE(C1) ++ ST c11, 2 * SIZE(C1) ++ ST c12, 3 * SIZE(C1) + -+ sra N, 1, J -+ ble J, $L30 -+ .align 4 -+ -+$L01: -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ subl B, TMP1, B ++ ldi C1, 4 * SIZE(C1) + -+ subl C, LDC, C2 -+ subl C2, LDC, C1 -+ subl C2, LDC, C ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 +#else -+ mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C ++ subl TMP1, 1, TMP1 +#endif -+ -+#ifdef LN -+ addl M, OFFSET, KK ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO +#endif + -+#ifdef LT -+ mov OFFSET, KK ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK +#endif + -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif ++ bgt I, $L41 ++ .align 4 + ++$L50: + and M, 1, I -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 ++ ble I, $L999 + -+ fclr c01 -+ fclr c05 -+ ble I, $L20 ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + -+#if defined(LT) || defined(RN) ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr t1 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ ++ fclr t2 + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr t3 + LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr t4 + + LD b1, 0 * SIZE(B) -+ fclr c10 ++ fclr c01 + LD b2, 1 * SIZE(B) -+ fclr c14 ++ fclr c05 + + LD b3, 2 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) ++ fclr c02 + LD b4, 3 * SIZE(B) -+ ldi BO, 4 * SIZE(B) ++ fclr c06 + -+ ldi L, -2(KK) ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) + -+ ble KK, $L28 -+ ble L, $L25 ++#ifndef TRMMKERNEL ++ ldi L, -2(K) +#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG ++ ldi L, -2(TMP1) +#endif -+ ++ ble L, $L55 ++#else + sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ ++ addl AO, TMP1, AO ++ addl B, TMP1, BO + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr t1 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ ++ fclr t2 + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr t3 + LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr t4 + + LD b1, 0 * SIZE(BO) -+ fclr c10 ++ fclr c01 + LD b2, 1 * SIZE(BO) -+ fclr c14 ++ fclr c05 + + LD b3, 2 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) ++ fclr c02 + LD b4, 3 * SIZE(BO) -+ ldi BO, 4 * SIZE(BO) ++ fclr c06 + -+ ldi L, -2(TMP1) ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) + -+ ble TMP1, $L28 -+ ble L, $L25 ++ ldi L, -2(TMP1) ++ ble L, $L55 +#endif + .align 5 + -+$L22: -+ ADD1 c09, t1, c09 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD3 c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD4 c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD2 c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ ++$L52: + ADD1 c01, t1, c01 + unop -+ MUL a1, b3, t1 ++ MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ -+ ADD1 c09, t1, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD3 c10, t2, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD4 c13, t3, c13 + unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD2 c14, t4, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 -+ ldi L, -2(L) ++ LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) ++ LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop -+ MUL a3, b5, t3 ++ MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 -+ MUL a4, b5, t4 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ + LD a4, 1 * SIZE(AO) -+ bgt L, $L22 ++ unop ++ unop ++ bgt L, $L52 + .align 4 + -+$L25: -+ ADD1 c09, t1, c09 ++$L55: ++ ADD1 c01, t1, c01 ++ fldd alpha_r, ALPHA_R + MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 ++#ifndef TRMMKERNEL ++ blbs K, $L58 +#else -+ blbs TMP1, $L27 ++ blbs TMP1, $L58 +#endif + .align 4 + -+ ADD3 c10, t2, c10 ++ ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + -+ ADD4 c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ unop -+ -+ ADD2 c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ + ADD4 c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD2 c06, t4, c06 -+ unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD1 c09, t1, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L27: -+ ADD3 c10, t2, c10 -+ MUL a2, b1, t2 -+ ADD4 c13, t3, c13 ++ ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 -+ -+ ADD2 c14, t4, c14 -+ MUL a2, b2, t4 -+ ADD1 c01, t1, c01 -+ MUL a1, b3, t1 -+ -+ ADD3 c02, t2, c02 -+ MUL a2, b3, t2 -+ ADD4 c05, t3, c05 -+ MUL a1, b4, t3 -+ -+ ADD2 c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD1 c09, t1, c09 -+ ADD3 c10, t2, c10 -+ ADD4 c13, t3, c13 -+ ADD2 c14, t4, c14 -+ -+ ADD c01, c06, c01 -+ ADD c02, c05, c02 -+ ADD c09, c14, c09 -+ ADD c10, c13, c10 -+ .align 4 -+ -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c09, c09 -+ SUB a4, c10, c10 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c09, c09 -+ SUB a4, c10, c10 -+#endif -+ -+#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) + -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ ADD5 c09, t3, c09 -+ ADD6 c10, t4, c10 ++$L58: ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c03, 0 * SIZE(C1) ++#else ++ unop +#endif + -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c04, 1 * SIZE(C1) ++#else ++ unop ++#endif + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 + -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_i, c02, t3 ++ MUL alpha_i, c01, t4 + -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ ADD6 c09, t1, c09 -+ ADD5 c10, t2, c10 ++#ifndef TRMMKERNEL ++ ADD c03, t1, c03 ++ ADD c04, t2, c04 ++#else ++ ADD $f31, t1, c03 ++ ADD $f31, t2, c04 ++#endif + -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) ++ SUB c03, t3, c03 ++ ADD c04, t4, c04 + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 ++ ST c03, 0 * SIZE(C1) ++ ST c04, 1 * SIZE(C1) ++ .align 4 + -+ ADD5 c09, t1, c09 -+ ADD6 c10, t2, c10 -+#endif ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/zgemv_n.S b/kernel/sw_64/zgemv_n.S +new file mode 100644 +index 000000000..f28ad3094 +--- /dev/null ++++ b/kernel/sw_64/zgemv_n.S +@@ -0,0 +1,1027 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+#ifdef RT -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ LD a3, 4 * SIZE(BO) -+ LD a4, 5 * SIZE(BO) ++#define ASSEMBLER ++#include "common.h" + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 + -+ ADD5 c09, t1, c09 -+ ADD6 c10, t2, c10 ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 + -+ MUL a3, c09, t1 -+ MUL a3, c10, t2 -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 ++#define M $16 ++#define N $17 ++#define A $21 ++#define LDA $18 + -+ MUL a4, c10, t1 -+ MUL a4, c09, t2 -+ ADD6 c01, t1, c01 -+ ADD5 c02, t2, c02 -+ -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++#define X $19 ++#define INCX $20 ++#define Y $22 ++#define INCY $23 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 ++#define BUFFER $24 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+#endif ++#define I $25 ++#define J $27 + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c10, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c10, 3 * SIZE(AO) -+#endif ++#define Y1 $4 ++#define A1 $5 ++#define A2 $6 + -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+#endif ++#define alpha_r $f19 ++#define alpha_i $f20 + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c09, 0 * SIZE(C2) -+ ST c10, 1 * SIZE(C2) ++#define alpha1 $f0 ++#define alpha2 $f1 ++#define alpha3 $f10 ++#define alpha4 $f11 + -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+#endif ++#define y0 $f12 ++#define y1 $f13 ++#define y2 $f14 ++#define y3 $f15 + -+#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++#define y4 $f16 ++#define y5 $f17 ++#define y6 $f18 ++#define y7 $f21 + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 + -+#ifdef LT -+ addl KK, 1, KK -+#endif ++#define t0 $f2 ++#define t1 $f3 ++#define t2 $f4 ++#define t3 $f5 + -+#ifdef LN -+ subl KK, 1, KK ++#if !defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#elif !defined(CONJ) && defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB +#endif -+ .align 4 -+ -+$L20: -+ sra M, 1, I -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 + -+ fclr c01 -+ fclr c05 ++ PROLOGUE + -+ ble I, $L29 -+ .align 4 ++ ldi $sp, -STACKSIZE($sp) ++ ldl LDA, 0 + STACKSIZE($sp) ++ ldl X, 8 + STACKSIZE($sp) ++ ldl INCX, 16 + STACKSIZE($sp) ++ ldl Y, 24 + STACKSIZE($sp) ++ ldl INCY, 32 + STACKSIZE($sp) ++ ldl BUFFER, 40 + STACKSIZE($sp) + -+$L11: -+#if defined(LT) || defined(RN) ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++ PROFCODE + -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++ cmple M, 0, $0 ++ sll INCX, ZBASE_SHIFT, INCX ++ cmple N, 0, $1 ++ sll INCY, ZBASE_SHIFT, INCY + -+ LD b1, 0 * SIZE(B) -+ fclr c10 -+ LD b2, 1 * SIZE(B) -+ fclr c14 ++ or $0, $1, $0 ++ bne $0, $L999 + -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c07 ++ cmpeq INCY, 2 * SIZE, $0 ++ sll LDA, ZBASE_SHIFT,LDA ++ bne $0, $L10 + -+ ldi BO, 4 * SIZE(B) -+ fclr c11 -+ ldi AO, 4 * SIZE(AO) -+ fclr c15 ++ mov BUFFER, Y1 + -+ fillcs 4 * SIZE(C1) -+ fclr c04 -+ ldi L, -2(KK) -+ fclr c08 ++ mov Y, BUFFER ++ mov Y1, Y + -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble KK, $L18 -+ ble L, $L15 -+#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif ++ sra M, 2, I ++ ble I, $L05 ++ .align 4 + -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ addl B, TMP1, BO ++$L02: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ ST $f31, 2 * SIZE(Y1) ++ ST $f31, 3 * SIZE(Y1) ++ ST $f31, 4 * SIZE(Y1) ++ ST $f31, 5 * SIZE(Y1) ++ ST $f31, 6 * SIZE(Y1) ++ ST $f31, 7 * SIZE(Y1) + -+ subl K, KK, TMP1 ++ ldi Y1, 8 * SIZE(Y1) ++ ldi I, -1(I) ++ bgt I, $L02 ++ .align 4 + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++$L05: ++ and M, 3, I ++ ble I, $L10 ++ .align 4 + -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++$L06: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ addl Y1, 2 * SIZE, Y1 + -+ LD b1, 0 * SIZE(BO) -+ fclr c10 -+ LD b2, 1 * SIZE(BO) -+ fclr c14 ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 + -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c07 ++$L10: ++ sra N, 1, J ++ ble J, $L20 ++ .align 4 + -+ ldi BO, 4 * SIZE(BO) -+ fclr c11 -+ ldi AO, 4 * SIZE(AO) -+ fclr c15 ++$L11: ++ LD alpha1, 0 * SIZE(X) ++ LD alpha2, 1 * SIZE(X) ++ addl X, INCX, X ++ LD alpha3, 0 * SIZE(X) ++ LD alpha4, 1 * SIZE(X) ++ addl X, INCX, X + -+ fillcs 4 * SIZE(C1) -+ fclr c04 -+ ldi L, -2(TMP1) -+ fclr c08 ++ MUL alpha_r, alpha1, y0 ++ MUL alpha_r, alpha2, y1 ++ MUL alpha_r, alpha3, y2 ++ MUL alpha_r, alpha4, y3 + -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble TMP1, $L18 -+ ble L, $L15 -+#endif -+ .align 5 ++ MUL alpha_i, alpha2, t0 ++ mov A, A1 ++ MUL alpha_i, alpha1, t1 ++ addl A, LDA, A2 ++ MUL alpha_i, alpha4, t2 ++ addl A2, LDA, A ++ MUL alpha_i, alpha3, t3 ++ mov Y, Y1 + -+$L12: -+/* 1 */ -+ ADD1 c11, t1, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) ++#ifndef XCONJ ++ SUB y0, t0, alpha1 ++ ADD y1, t1, alpha2 ++ SUB y2, t2, alpha3 ++ ADD y3, t3, alpha4 +#else -+ unop ++ ADD y0, t0, alpha1 ++ SUB y1, t1, alpha2 ++ ADD y2, t2, alpha3 ++ SUB y3, t3, alpha4 +#endif + -+ ADD3 c12, t2, c12 -+ unop -+ MUL b1, a2, t2 -+ unop ++ s_fillcs 4 * SIZE(X) + -+ ADD2 c16, t3, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) ++ sra M, 2, I ++ ble I, $L15 + -+ ADD4 c15, t4, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) + -+/* 2 */ -+ ADD1 c01, t1, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) + -+ ADD3 c02, t2, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) + -+ ADD2 c06, t3, c06 -+ unop -+ MUL b2, a4, t3 -+ unop ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) + -+ ADD4 c05, t4, c05 ++ ADD1 y0, t0, y0 + unop -+ MUL b4, a1, t4 ++ MUL alpha3, a4, t0 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD2 y1, t1, y1 + unop ++ MUL alpha3, a5, t1 ++ LD y5, 5 * SIZE(Y1) + -+/* 3 */ -+ ADD1 c03, t1, c03 ++ ADD1 y2, t2, y2 + unop -+ MUL b3, a1, t1 ++ MUL alpha3, a6, t2 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD2 y3, t3, y3 + unop ++ MUL alpha3, a7, t3 ++ LD y7, 7 * SIZE(Y1) + -+ ADD3 c04, t2, c04 -+ unop -+ MUL b3, a2, t2 ++ ADD1 y0, t0, y0 + unop ++ MUL alpha2, a1, t0 ++ LD a1, 5 * SIZE(A1) + -+ ADD2 c08, t3, c08 ++ ADD2 y1, t1, y1 + unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) ++ MUL alpha2, a0, t1 ++ LD a0, 4 * SIZE(A1) + -+ ADD4 c13, t4, c13 ++ ADD1 y2, t2, y2 + unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) ++ MUL alpha2, a3, t2 ++ LD a3, 7 * SIZE(A1) + -+/* 4 */ -+ ADD1 c09, t1, c09 ++ ADD2 y3, t3, y3 + unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) ++ MUL alpha2, a2, t3 ++ LD a2, 6 * SIZE(A1) + -+ ADD3 c10, t2, c10 ++ ADD3 y0, t0, y0 + unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) ++ MUL alpha4, a5, t0 ++ LD a5, 5 * SIZE(A2) + -+ ADD2 c14, t3, c14 ++ ADD4 y1, t1, y1 + unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) ++ MUL alpha4, a4, t1 ++ LD a4, 4 * SIZE(A2) + -+ ADD4 c07, t4, c07 ++ ADD3 y2, t2, y2 + unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) ++ MUL alpha4, a7, t2 ++ LD a7, 7 * SIZE(A2) + -+/* 5 */ -+ ADD1 c11, t1, c11 ++ ADD4 y3, t3, y3 + unop -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) ++ MUL alpha4, a6, t3 ++ LD a6, 6 * SIZE(A2) + -+ ADD3 c12, t2, c12 -+ ldi L, -2(L) -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) ++ ADD3 y0, t0, y0 ++ MUL alpha1, a0, t0 ++ ADD4 y1, t1, y1 ++ MUL alpha1, a1, t1 + -+ ADD2 c16, t3, c16 ++ ADD3 y2, t2, y2 + unop -+ MUL b2, a2, t3 ++ MUL alpha1, a2, t2 + unop + -+ ADD4 c15, t4, c15 -+ unop -+ MUL b2, a5, t4 -+ unop ++ ADD4 y3, t3, y3 ++ ldi I, -1(I) ++ MUL alpha1, a3, t3 ++ ble I, $L13 ++ .align 4 + -+/* 6 */ -+ ADD1 c01, t1, c01 -+ unop -+ MUL b5, a6, t1 -+ unop ++$L12: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) + -+ ADD3 c02, t2, c02 -+ unop -+ MUL b5, a4, t2 -+ unop ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ ldi I, -1(I) + -+ ADD2 c06, t3, c06 ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha3, a6, t2 + unop -+ MUL b2, a4, t3 ++ ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha3, a7, t3 + unop + -+ ADD4 c05, t4, c05 ++ ADD1 y4, t0, y4 + unop -+ MUL b4, a5, t4 ++ MUL alpha2, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 y5, t1, y5 + unop ++ MUL alpha2, a0, t1 ++ LD a0, 8 * SIZE(A1) + -+/* 7 */ -+ ADD1 c03, t1, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 ++ ADD1 y6, t2, y6 + unop ++ MUL alpha2, a3, t2 ++ LD a3, 11 * SIZE(A1) + -+ ADD3 c04, t2, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 ++ ADD2 y7, t3, y7 + unop ++ MUL alpha2, a2, t3 ++ LD a2, 10 * SIZE(A1) + -+ ADD2 c08, t3, c08 ++ ADD3 y4, t0, y4 ++ fillde (PREFETCHSIZE + 0) * SIZE(Y1) ++ MUL alpha4, a5, t0 ++ LD a5, 9 * SIZE(A2) ++ ++ ADD4 y5, t1, y5 + unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) ++ MUL alpha4, a4, t1 ++ LD a4, 8 * SIZE(A2) + -+ ADD4 c13, t4, c13 ++ ADD3 y6, t2, y6 + unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) ++ MUL alpha4, a7, t2 ++ LD a7, 11 * SIZE(A2) + -+/* 8 */ -+ ADD1 c09, t1, c09 ++ ADD4 y7, t3, y7 + unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) ++ MUL alpha4, a6, t3 ++ LD a6, 10 * SIZE(A2) + -+ ADD3 c10, t2, c10 ++ ADD3 y4, t0, y4 + unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) ++ MUL alpha1, a0, t0 ++ LD y0, 8 * SIZE(Y1) + -+ ADD2 c14, t3, c14 ++ ADD4 y5, t1, y5 + unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) ++ MUL alpha1, a1, t1 ++ LD y1, 9 * SIZE(Y1) + -+ ADD4 c07, t4, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 ++ ADD3 y6, t2, y6 ++ unop ++ MUL alpha1, a2, t2 ++ LD y2, 10 * SIZE(Y1) + -+$L15: -+ ADD1 c11, t1, c11 ++ ADD4 y7, t3, y7 + unop -+ MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L17 -+#else -+ blbs TMP1, $L17 -+#endif -+ .align 4 ++ MUL alpha1, a3, t3 ++ LD y3, 11 * SIZE(Y1) + -+ ADD3 c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD2 c16, t3, c16 -+ MUL b2, a2, t3 ++ ADD1 y0, t0, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) + -+ ADD4 c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, c01 -+ MUL b1, a3, t1 ++ ADD2 y1, t1, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ unop + -+ ADD3 c02, t2, c02 ++ ADD1 y2, t2, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha3, a6, t2 + unop -+ MUL b1, a4, t2 -+ LD b1, 0 * SIZE(BO) + -+ ADD2 c06, t3, c06 -+ MUL b2, a4, t3 -+ ADD4 c05, t4, c05 -+ MUL b4, a1, t4 ++ ADD2 y3, t3, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ ldi Y1, 8 * SIZE(Y1) + -+ ADD1 c03, t1, c03 ++ ADD1 y0, t0, y0 + unop -+ MUL b3, a1, t1 -+ LD a1, 0 * SIZE(AO) ++ MUL alpha2, a1, t0 ++ LD a1, 13 * SIZE(A1) + -+ ADD3 c04, t2, c04 -+ unop -+ MUL b3, a2, t2 ++ ADD2 y1, t1, y1 + unop ++ MUL alpha2, a0, t1 ++ LD a0, 12 * SIZE(A1) + -+ ADD2 c08, t3, c08 ++ ADD1 y2, t2, y2 + unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) ++ MUL alpha2, a3, t2 ++ LD a3, 15 * SIZE(A1) + -+ ADD4 c13, t4, c13 ++ ADD2 y3, t3, y3 + unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) ++ MUL alpha2, a2, t3 ++ LD a2, 14 * SIZE(A1) + -+ ADD1 c09, t1, c09 ++ ADD3 y0, t0, y0 + unop -+ MUL b3, a3, t1 -+ ldi AO, 4 * SIZE(AO) ++ MUL alpha4, a5, t0 ++ LD a5, 13 * SIZE(A2) + -+ ADD3 c10, t2, c10 ++ ADD4 y1, t1, y1 + unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) ++ MUL alpha4, a4, t1 ++ LD a4, 12 * SIZE(A2) + -+ ADD2 c14, t3, c14 ++ ADD3 y2, t2, y2 + unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) ++ MUL alpha4, a7, t2 ++ LD a7, 15 * SIZE(A2) + -+ ADD4 c07, t4, c07 ++ ADD4 y3, t3, y3 + unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD1 c11, t1, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L17: -+ ADD3 c12, t2, c12 -+ MUL b1, a2, t2 -+ ADD2 c16, t3, c16 -+ MUL b2, a2, t3 -+ -+ ADD4 c15, t4, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, c01 -+ MUL b1, a3, t1 -+ -+ ADD3 c02, t2, c02 -+ MUL b1, a4, t2 -+ ADD2 c06, t3, c06 -+ MUL b2, a4, t3 -+ -+ ADD4 c05, t4, c05 -+ MUL b4, a1, t4 -+ ADD1 c03, t1, c03 -+ MUL b3, a1, t1 -+ -+ ADD3 c04, t2, c04 -+ MUL b3, a2, t2 -+ ADD2 c08, t3, c08 -+ MUL b4, a2, t3 -+ -+ ADD4 c13, t4, c13 -+ MUL b2, a3, t4 -+ ADD1 c09, t1, c09 -+ MUL b3, a3, t1 -+ -+ ADD3 c10, t2, c10 -+ MUL b3, a4, t2 -+ ADD2 c14, t3, c14 -+ MUL b4, a4, t3 ++ MUL alpha4, a6, t3 ++ LD a6, 14 * SIZE(A2) + -+ ADD4 c07, t4, c07 -+ ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, t4 -+ ldi BO, 4 * SIZE(BO) ++ ADD3 y0, t0, y0 ++ unop ++ MUL alpha1, a0, t0 ++ LD y4, 4 * SIZE(Y1) + -+ ADD1 c11, t1, c11 -+ ADD3 c12, t2, c12 -+ ADD2 c16, t3, c16 -+ ADD4 c15, t4, c15 ++ ADD4 y1, t1, y1 ++ ldi A2, 8 * SIZE(A2) ++ MUL alpha1, a1, t1 ++ LD y5, 5 * SIZE(Y1) + -+ ADD c01, c06, c01 -+ ADD c02, c05, c02 -+ ADD c03, c08, c03 -+ ADD c04, c07, c04 ++ ADD3 y2, t2, y2 ++ ldi A1, 8 * SIZE(A1) ++ MUL alpha1, a2, t2 ++ LD y6, 6 * SIZE(Y1) + -+ ADD c09, c14, c09 -+ ADD c10, c13, c10 -+ ADD c11, c16, c11 -+ ADD c12, c15, c12 ++ ADD4 y3, t3, y3 ++ MUL alpha1, a3, t3 ++ LD y7, 7 * SIZE(Y1) ++ bgt I, $L12 + .align 4 + -+$L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c09, c09 -+ SUB a4, c10, c10 -+ -+ SUB b1, c03, c03 -+ SUB b2, c04, c04 -+ SUB b3, c11, c11 -+ SUB b4, c12, c12 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 ++$L13: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ unop + -+ SUB b1, c09, c09 -+ SUB b2, c10, c10 -+ SUB b3, c11, c11 -+ SUB b4, c12, c12 -+#endif ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ unop + -+#ifdef LN -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ LD a3, 4 * SIZE(AO) -+ LD a4, 5 * SIZE(AO) ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop + -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ unop + -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 ++ ADD1 y4, t0, y4 ++ MUL alpha2, a1, t0 ++ ADD2 y5, t1, y5 ++ MUL alpha2, a0, t1 + -+ ADD5 c03, t1, c03 -+ ADD6 c04, t2, c04 -+ ADD5 c11, t3, c11 -+ ADD6 c12, t4, c12 ++ ADD1 y6, t2, y6 ++ MUL alpha2, a3, t2 ++ ADD2 y7, t3, y7 ++ MUL alpha2, a2, t3 + -+ MUL a3, c03, t1 -+ MUL a3, c04, t2 -+ MUL a3, c11, t3 -+ MUL a3, c12, t4 ++ ADD3 y4, t0, y4 ++ MUL alpha4, a5, t0 ++ ADD4 y5, t1, y5 ++ MUL alpha4, a4, t1 + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c09, t3, c09 -+ SUB c10, t4, c10 ++ ADD3 y6, t2, y6 ++ MUL alpha4, a7, t2 ++ ADD4 y7, t3, y7 ++ MUL alpha4, a6, t3 + -+ MUL a4, c04, t1 -+ MUL a4, c03, t2 -+ MUL a4, c12, t3 -+ MUL a4, c11, t4 ++ ADD3 y4, t0, y4 ++ ADD4 y5, t1, y5 ++ ADD3 y6, t2, y6 ++ ADD4 y7, t3, y7 + -+ ADD6 c01, t1, c01 -+ ADD5 c02, t2, c02 -+ ADD6 c09, t3, c09 -+ ADD5 c10, t4, c10 -+ -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++ ST y4, 4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, 5 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 ++ ST y6, 6 * SIZE(Y1) ++ unop ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 + -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 ++$L15: ++ and M, 2, I ++ ble I, $L17 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ ADD5 c09, t3, c09 -+ ADD6 c10, t4, c10 -+#endif ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) + -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) + -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 ++ ADD1 y0, t0, y0 ++ MUL alpha3, a4, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha3, a5, t1 ++ ADD1 y2, t2, y2 ++ MUL alpha3, a6, t2 ++ ADD2 y3, t3, y3 ++ MUL alpha3, a7, t3 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ ADD5 c09, t3, c09 -+ ADD6 c10, t4, c10 ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 + -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c09, t3 -+ MUL a3, c10, t4 ++ ADD1 y2, t2, y2 ++ MUL alpha2, a3, t2 ++ ADD2 y3, t3, y3 ++ MUL alpha2, a2, t3 + -+ SUB c03, t1, c03 -+ SUB c04, t2, c04 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 ++ ADD3 y0, t0, y0 ++ MUL alpha4, a5, t0 ++ ADD4 y1, t1, y1 ++ MUL alpha4, a4, t1 + -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ MUL a4, c10, t3 -+ MUL a4, c09, t4 ++ ADD3 y2, t2, y2 ++ MUL alpha4, a7, t2 ++ ADD4 y3, t3, y3 ++ MUL alpha4, a6, t3 + -+ ADD6 c03, t1, c03 -+ ADD5 c04, t2, c04 -+ ADD6 c11, t3, c11 -+ ADD5 c12, t4, c12 ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ADD3 y2, t2, y2 ++ ADD4 y3, t3, y3 + -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) + -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++ ST y2, 2 * SIZE(Y1) ++ unop ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 + -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 ++$L17: ++ blbc M, $L18 + -+ ADD5 c03, t1, c03 -+ ADD6 c04, t2, c04 -+ ADD5 c11, t3, c11 -+ ADD6 c12, t4, c12 -+#endif ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) + -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 ++ MUL alpha1, a0, t0 ++ MUL alpha1, a1, t1 + -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 ++ ADD1 y0, t0, y0 ++ MUL alpha3, a2, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha3, a3, t1 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ ADD5 c03, t3, c03 -+ ADD6 c04, t4, c04 ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 + -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c03, t3 -+ MUL a3, c04, t4 ++ ADD3 y0, t0, y0 ++ MUL alpha4, a3, t0 ++ ADD4 y1, t1, y1 ++ MUL alpha4, a2, t1 + -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 + -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ MUL a4, c04, t3 -+ MUL a4, c03, t4 ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ .align 4 + -+ ADD6 c09, t1, c09 -+ ADD5 c10, t2, c10 -+ ADD6 c11, t3, c11 -+ ADD5 c12, t4, c12 ++$L18: ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 + -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) ++$L20: ++ blbc N, $L990 + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++ LD alpha1, 0 * SIZE(X) ++ LD alpha2, 1 * SIZE(X) + -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 ++ MUL alpha_r, alpha1, y0 ++ MUL alpha_r, alpha2, y1 + -+ ADD5 c09, t1, c09 -+ ADD6 c10, t2, c10 -+ ADD5 c11, t3, c11 -+ ADD6 c12, t4, c12 -+#endif ++ MUL alpha_i, alpha2, t0 ++ mov A, A1 ++ MUL alpha_i, alpha1, t1 ++ mov Y, Y1 + -+#ifdef RT -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ LD a3, 4 * SIZE(BO) -+ LD a4, 5 * SIZE(BO) ++#ifndef XCONJ ++ SUB y0, t0, alpha1 ++ ADD y1, t1, alpha2 ++#else ++ ADD y0, t0, alpha1 ++ SUB y1, t1, alpha2 ++#endif + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++ sra M, 2, I ++ ble I, $L25 + -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) + -+ ADD5 c09, t1, c09 -+ ADD6 c10, t2, c10 -+ ADD5 c11, t3, c11 -+ ADD6 c12, t4, c12 ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) + -+ MUL a3, c09, t1 -+ MUL a3, c10, t2 -+ MUL a3, c11, t3 -+ MUL a3, c12, t4 ++ MUL alpha1, a0, t0 ++ LD a4, 4 * SIZE(A1) ++ MUL alpha1, a1, t1 ++ LD a5, 5 * SIZE(A1) ++ MUL alpha1, a2, t2 ++ LD a6, 6 * SIZE(A1) ++ MUL alpha1, a3, t3 ++ LD a7, 7 * SIZE(A1) + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 ++ ADD1 y0, t0, y0 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 9 * SIZE(A1) + -+ MUL a4, c10, t1 -+ MUL a4, c09, t2 -+ MUL a4, c12, t3 -+ MUL a4, c11, t4 ++ ADD2 y1, t1, y1 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 8 * SIZE(A1) + -+ ADD6 c01, t1, c01 -+ ADD5 c02, t2, c02 -+ ADD6 c03, t3, c03 -+ ADD5 c04, t4, c04 -+ -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++ ADD1 y2, t2, y2 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 11 * SIZE(A1) + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 ++ ADD2 y3, t3, y3 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 10 * SIZE(A1) + -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 ++ ADD3 y0, t0, y0 ++ unop ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, t0 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ ADD5 c03, t3, c03 -+ ADD6 c04, t4, c04 -+#endif ++ ADD4 y1, t1, y1 ++ unop ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, t1 + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c10, 3 * SIZE(BO) ++ ADD3 y2, t2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, t2 ++ ldi I, -1(I) + -+ ST c03, 4 * SIZE(BO) -+ ST c04, 5 * SIZE(BO) -+ ST c11, 6 * SIZE(BO) -+ ST c12, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) ++ ADD4 y3, t3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, t3 ++ ble I, $L23 ++ .align 4 + -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c11, 6 * SIZE(AO) -+ ST c12, 7 * SIZE(AO) -+#endif ++$L22: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a5, t0 ++ LD a5, 13 * SIZE(A1) + -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+#endif ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a4, t1 ++ LD a4, 12 * SIZE(A1) + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a7, t2 ++ LD a7, 15 * SIZE(A1) + -+ ST c09, 0 * SIZE(C2) -+ ST c10, 1 * SIZE(C2) -+ ST c11, 2 * SIZE(C2) -+ ST c12, 3 * SIZE(C2) ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a6, t3 ++ LD a6, 14 * SIZE(A1) + -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+#endif ++ ADD3 y4, t0, y4 ++ LD y0, 8 * SIZE(Y1) ++ MUL alpha1, a0, t0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) + -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 ++ ADD4 y5, t1, y5 ++ LD y1, 9 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ ldi I, -1(I) + -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ADD3 y6, t2, y6 ++ LD y2, 10 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ unop + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif ++ ADD4 y7, t3, y7 ++ LD y3, 11 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ unop + -+#ifdef LT -+ addl KK, 2, KK -+#endif ++ ADD1 y0, t0, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha2, a1, t0 ++ LD a1, 17 * SIZE(A1) + -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ fclr c01 -+ fclr c05 ++ ADD2 y1, t1, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha2, a0, t1 ++ LD a0, 16 * SIZE(A1) + -+ ldi I, -1(I) -+ bgt I, $L11 -+ .align 4 ++ ADD1 y2, t2, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha2, a3, t2 ++ LD a3, 19 * SIZE(A1) + -+$L29: -+#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl B, TMP1, B -+#endif ++ ADD2 y3, t3, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha2, a2, t3 ++ LD a2, 18 * SIZE(A1) + -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif ++ ADD3 y0, t0, y0 ++ LD y4, 12 * SIZE(Y1) ++ MUL alpha1, a4, t0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(Y1) + -+#ifdef RN -+ addl KK, 2, KK -+#endif ++ ADD4 y1, t1, y1 ++ LD y5, 13 * SIZE(Y1) ++ MUL alpha1, a5, t1 ++ ldi A1, 8 * SIZE(A1) + -+#ifdef RT -+ subl KK, 2, KK -+#endif ++ ADD3 y2, t2, y2 ++ LD y6, 14 * SIZE(Y1) ++ MUL alpha1, a6, t2 ++ ldi Y1, 8 * SIZE(Y1) + -+ ldi J, -1(J) -+ bgt J, $L01 ++ ADD4 y3, t3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, t3 ++ bgt I, $L22 + .align 4 + -+$L30: -+ and N, 1, J -+ ble J, $L999 ++$L23: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a5, t0 ++ unop + -+#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 -+ subl B, TMP1, B ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a4, t1 ++ unop + -+ subl C, LDC, C1 -+ subl C, LDC, C -+#else -+ mov C, C1 -+ addl C, LDC, C -+#endif ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a7, t2 ++ unop + -+#ifdef LN -+ addl M, OFFSET, KK -+#endif ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a6, t3 ++ unop + -+#ifdef LT -+ mov OFFSET, KK -+#endif ++ ADD3 y4, t0, y4 ++ ADD4 y5, t1, y5 ++ ADD3 y6, t2, y6 ++ ADD4 y7, t3, y7 + -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif ++ ST y4, 4 * SIZE(Y1) ++ unop ++ ST y5, 5 * SIZE(Y1) ++ unop + -+ and M, 1, I -+ ble I, $L50 ++ ST y6, 6 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 + -+#if defined(LT) || defined(RN) ++$L25: ++ and M, 2, I ++ ble I, $L27 + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) + -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) + -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 ++ ADD1 y2, t2, y2 ++ MUL alpha2, a3, t2 ++ ADD2 y3, t3, y3 ++ MUL alpha2, a2, t3 + -+ ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(B) ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ADD3 y2, t2, y2 ++ ADD4 y3, t3, y3 + -+ ldi L, -2(KK) ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) + -+ ble KK, $L58 -+ ble L, $L55 -+#else -+#ifdef LN -+ sll K, ZBASE_SHIFT, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif ++ ST y2, 2 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 + -+ sll KK, ZBASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT, TMP1 -+ addl B, TMP1, BO ++$L27: ++ blbc M, $L990 + -+ subl K, KK, TMP1 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) + -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 + -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 + -+ ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(BO) ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ .align 4 + -+ ldi L, -2(TMP1) ++$L990: ++ cmpeq INCY, 2 * SIZE, $0 ++ bne $0, $L999 + -+ ble TMP1, $L58 -+ ble L, $L55 -+#endif -+ .align 5 ++ mov BUFFER, Y1 + -+$L52: -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b1, t1 -+ unop ++ sra M, 2, I ++ ble I, $L995 ++ .align 4 + -+ ADD3 c02, t2, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) ++$L992: ++ LD a0, 0 * SIZE(BUFFER) ++ LD a1, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a2, 0 * SIZE(BUFFER) ++ LD a3, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER + -+ ADD4 c05, t3, c05 -+ ldi L, -2(L) -+ MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ LD y2, 2 * SIZE(Y) ++ LD y3, 3 * SIZE(Y) + -+ ADD2 c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) ++ LD a4, 0 * SIZE(BUFFER) ++ LD a5, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a6, 0 * SIZE(BUFFER) ++ LD a7, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER + -+ ADD1 c01, t1, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) ++ LD y4, 4 * SIZE(Y) ++ LD y5, 5 * SIZE(Y) ++ LD y6, 6 * SIZE(Y) ++ LD y7, 7 * SIZE(Y) + -+ ADD3 c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) ++ ADD a0, y0, a0 ++ ADD a1, y1, a1 ++ ADD a2, y2, a2 ++ ADD a3, y3, a3 + -+ ADD4 c05, t3, c05 -+ unop -+ MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) ++ ST a0, 0 * SIZE(Y1) ++ ADD a4, y4, a4 ++ ST a1, 1 * SIZE(Y1) ++ ADD a5, y5, a5 ++ addl Y1, INCY, Y1 + -+ ADD2 c06, t4, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ unop ++ ST a2, 0 * SIZE(Y1) ++ ADD a6, y6, a6 ++ ST a3, 1 * SIZE(Y1) ++ ADD a7, y7, a7 ++ addl Y1, INCY, Y1 + -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L52 ++ ST a4, 0 * SIZE(Y1) ++ ST a5, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a6, 0 * SIZE(Y1) ++ ST a7, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ ldi Y, 8 * SIZE(Y) ++ bgt I, $L992 + .align 4 + -+$L55: -+ ADD1 c01, t1, c01 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L57 -+#else -+ blbs TMP1, $L57 -+#endif ++$L995: ++ and M, 3, I ++ ble I, $L999 + .align 4 + -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) ++$L996: ++ LD a0, 0 * SIZE(BUFFER) ++ LD a1, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER + -+ ADD4 c05, t3, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ ldi Y, 2 * SIZE(Y) + -+ ADD2 c06, t4, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) ++ ADD a0, y0, a0 ++ ADD a1, y1, a1 + -+ ADD1 c01, t1, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi AO, 2 * SIZE(AO) ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L996 + .align 4 + -+$L57: -+ ADD3 c02, t2, c02 -+ MUL a2, b1, t2 -+ ADD4 c05, t3, c05 -+ MUL a1, b2, t3 ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) + -+ ADD2 c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, t4 -+ ldi BO, 2 * SIZE(BO) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zgemv_t.S b/kernel/sw_64/zgemv_t.S +new file mode 100644 +index 000000000..4ee035c85 +--- /dev/null ++++ b/kernel/sw_64/zgemv_t.S +@@ -0,0 +1,922 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+ ADD1 c01, t1, c01 -+ ADD3 c02, t2, c02 -+ ADD4 c05, t3, c05 -+ ADD2 c06, t4, c06 ++#define ASSEMBLER ++#include "common.h" + -+ ADD c01, c06, c01 -+ ADD c02, c05, c02 + -+$L58: -+#if defined(LN) || defined(RT) -+ subl KK, 1, TMP1 ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 + -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif ++#define M $16 ++#define N $17 ++#define A $21 ++#define LDA $18 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++#define X $19 ++#define INCX $20 ++#define Y $22 ++#define INCY $23 + -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+#endif ++#define BUFFER $24 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++#define I $25 ++#define J $27 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 ++#define X1 $3 ++#define Y1 $4 ++#define A1 $5 ++#define A2 $6 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+#endif ++#define alpha_r $f19 ++#define alpha_i $f20 + -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+#endif ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f21 + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++#if !defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif !defined(CONJ) && defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#elif defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD +#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB +#endif + -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+#endif ++ PROLOGUE + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) ++ ldi $sp, -STACKSIZE($sp) ++ ldl LDA, 0 + STACKSIZE($sp) ++ ldl X, 8 + STACKSIZE($sp) ++ ldl INCX, 16 + STACKSIZE($sp) ++ ldl Y, 24 + STACKSIZE($sp) ++ ldl INCY, 32 + STACKSIZE($sp) ++ ldl BUFFER, 40 + STACKSIZE($sp) + -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+#endif ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) + -+#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ PROFCODE + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl BO, TMP2, BO -+#endif ++ cmple M, 0, $0 ++ sll INCX, ZBASE_SHIFT, INCX ++ cmple N, 0, $1 ++ sll INCY, ZBASE_SHIFT, INCY + -+#ifdef LT -+ addl KK, 1, KK -+#endif ++ or $0, $1, $0 ++ bne $0, $L999 + -+#ifdef LN -+ subl KK, 1, KK -+#endif -+ .align 4 ++ cmpeq INCX, 2 * SIZE, $0 ++ mov X, X1 ++ sll LDA, ZBASE_SHIFT,LDA ++ bne $0, $L10 + -+$L50: -+ sra M, 1, I -+ ble I, $L59 ++ sra M, 2, I ++ mov BUFFER, Y1 ++ mov BUFFER, X ++ ble I, $L05 + .align 4 + -+$L41: -+#if defined(LT) || defined(RN) ++$L02: ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ ldi I, -1(I) + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++ LD a0, 0 * SIZE(X1) ++ LD a1, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a2, 0 * SIZE(X1) ++ LD a3, 1 * SIZE(X1) ++ addl X1, INCX, X1 + -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ST a2, 2 * SIZE(Y1) ++ ST a3, 3 * SIZE(Y1) + -+ ldi BO, 2 * SIZE(B) -+ fclr c03 -+ ldi AO, 4 * SIZE(AO) -+ fclr c07 ++ LD a4, 0 * SIZE(X1) ++ LD a5, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a6, 0 * SIZE(X1) ++ LD a7, 1 * SIZE(X1) ++ addl X1, INCX, X1 + -+ ldi L, -2(KK) -+ fclr c04 -+ fclr c08 ++ ST a4, 4 * SIZE(Y1) ++ ST a5, 5 * SIZE(Y1) ++ ST a6, 6 * SIZE(Y1) ++ ST a7, 7 * SIZE(Y1) + -+ ble KK, $L48 -+ ble L, $L45 -+#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L02 ++ .align 4 + -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT, TMP1 -+ addl B, TMP1, BO ++$L05: ++ and M, 3, I ++ ble I, $L10 ++ .align 4 + -+ subl K, KK, TMP1 ++$L06: ++ LD a0, 0 * SIZE(X1) ++ LD a1, 1 * SIZE(X1) ++ addl X1, INCX, X1 + -+ LD a1, 0 * SIZE(AO) ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ mov Y, Y1 ++ fclr t0 ++ unop + fclr t1 -+ LD a2, 1 * SIZE(AO) ++ ++ sra N, 1, J + fclr t2 -+ LD a3, 2 * SIZE(AO) + fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++ ble J, $L20 ++ .align 4 + -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 ++$L11: ++ mov A, A1 ++ fclr s0 ++ addl A, LDA, A2 ++ fclr s1 + -+ ldi BO, 2 * SIZE(BO) -+ fclr c03 -+ ldi AO, 4 * SIZE(AO) -+ fclr c07 ++ addl A2, LDA, A ++ unop ++ mov X, X1 ++ fillde 3 * SIZE(Y) + -+ ldi L, -2(TMP1) -+ fclr c04 -+ fclr c08 ++ sra M, 2, I ++ fclr s2 ++ fclr s3 ++ ble I, $L15 + -+ ble TMP1, $L48 -+ ble L, $L45 -+#endif -+ .align 5 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 3 * SIZE(A1) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) + -+$L42: -+ ADD4 c05, t1, c05 -+ unop -+ MUL a1, b1, t1 ++ LD a8, 4 * SIZE(A1) ++ LD a9, 5 * SIZE(A1) ++ LD a10, 4 * SIZE(A2) ++ LD a11, 5 * SIZE(A2) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 7 * SIZE(A1) ++ LD a14, 6 * SIZE(A2) ++ LD a15, 7 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD3 s0, t0, s0 + unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) + -+ ADD2 c06, t2, c06 -+ ldi L, -2(L) -+ MUL a2, b1, t2 ++ ADD4 s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 + unop + -+ ADD4 c07, t3, c07 ++ ADD3 s2, t2, s2 + unop -+ MUL a3, b1, t3 ++ MUL x0, a2, t2 + unop + -+ ADD2 c08, t4, c08 ++ ADD4 s3, t3, s3 + unop -+ MUL a4, b1, t4 -+ LD b1, 2 * SIZE(BO) ++ MUL x0, a3, t3 ++ LD x0, 4 * SIZE(X1) + -+ ADD1 c01, t1, c01 ++ ADD1 s0, t0, s0 + unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) -+ -+ ADD3 c02, t2, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) ++ MUL x1, a1, t0 ++ LD a1, 9 * SIZE(A1) + -+ ADD1 c03, t3, c03 ++ ADD2 s1, t1, s1 + unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) ++ MUL x1, a0, t1 ++ LD a0, 8 * SIZE(A1) + -+ ADD3 c04, t4, c04 ++ ADD1 s2, t2, s2 + unop -+ MUL a4, b2, t4 -+ LD a5, 3 * SIZE(AO) ++ MUL x1, a3, t2 ++ LD a3, 9 * SIZE(A2) + -+ ADD4 c05, t1, c05 ++ ADD2 s3, t3, s3 + unop -+ MUL a1, b3, t1 -+ LD b2, -1 * SIZE(BO) ++ MUL x1, a2, t3 ++ LD a2, 8 * SIZE(A2) + -+ ADD2 c06, t2, c06 ++ ADD3 s0, t0, s0 + unop -+ MUL a2, b3, t2 ++ MUL x2, a4, t0 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ MUL x2, a5, t1 ++ ADD3 s2, t2, s2 ++ MUL x2, a6, t2 ++ ++ ADD4 s3, t3, s3 + unop ++ MUL x2, a7, t3 ++ LD x2, 6 * SIZE(X1) + -+ ADD4 c07, t3, c07 ++ ADD1 s0, t0, s0 + unop -+ MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) ++ MUL x3, a5, t0 ++ LD a5, 11 * SIZE(A1) + -+ ADD2 c08, t4, c08 ++ ADD2 s1, t1, s1 + unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) ++ MUL x3, a4, t1 ++ LD a4, 10 * SIZE(A1) + -+ ADD1 c01, t1, c01 ++ ADD1 s2, t2, s2 + unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) ++ MUL x3, a7, t2 ++ LD a7, 11 * SIZE(A2) + -+ ADD3 c02, t2, c02 ++ ADD2 s3, t3, s3 + unop -+ MUL a2, b4, t2 -+ LD a2, -3 * SIZE(AO) ++ MUL x3, a6, t3 ++ LD a6, 10 * SIZE(A2) + -+ ADD1 c03, t3, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, -2 * SIZE(AO) ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a8, t0 ++ LD x3, 7 * SIZE(X1) + -+ ADD3 c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L42 -+ .align 4 ++ ADD4 s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x0, a9, t1 ++ unop + -+$L45: -+ ADD4 c05, t1, c05 -+ MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L47 -+#else -+ blbs TMP1, $L47 -+#endif -+ .align 4 ++ ADD3 s2, t2, s2 ++ ldi I, -1(I) ++ MUL x0, a10, t2 ++ unop + -+ ADD2 c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, c07 -+ MUL a3, b1, t3 ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a11, t3 ++ LD x0, 8 * SIZE(X1) + -+ ADD2 c08, t4, c08 ++ ADD1 s0, t0, s0 + unop -+ MUL a4, b1, t4 -+ LD b1, 0 * SIZE(BO) ++ MUL x1, a9, t0 ++ LD a9, 13 * SIZE(A1) + -+ ADD1 c01, t1, c01 ++ ADD2 s1, t1, s1 + unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) ++ MUL x1, a8, t1 ++ LD a8, 12 * SIZE(A1) + -+ ADD3 c02, t2, c02 ++ ADD1 s2, t2, s2 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a11, t2 ++ LD a11, 13 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 + unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) ++ MUL x1, a10, t3 ++ LD a10, 12 * SIZE(A2) + -+ ADD1 c03, t3, c03 ++ ADD3 s0, t0, s0 + unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) ++ MUL x2, a12, t0 ++ LD x1, 9 * SIZE(X1) + -+ ADD3 c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) ++ ADD4 s1, t1, s1 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ MUL x2, a13, t1 ++ ldi A2, 8 * SIZE(A2) + -+ ADD4 c05, t1, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 ++ ADD3 s2, t2, s2 ++ unop ++ MUL x2, a14, t2 ++ unop + -+$L47: -+ ADD2 c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, c07 -+ MUL a3, b1, t3 ++ ADD4 s3, t3, s3 ++ unop ++ MUL x2, a15, t3 ++ LD x2, 10 * SIZE(X1) + -+ ADD2 c08, t4, c08 -+ MUL a4, b1, t4 -+ ADD1 c01, t1, c01 -+ MUL a1, b2, t1 ++ ADD1 s0, t0, s0 ++ unop ++ MUL x3, a13, t0 ++ LD a13, 7 * SIZE(A1) + -+ ADD3 c02, t2, c02 -+ MUL a2, b2, t2 -+ ADD1 c03, t3, c03 -+ MUL a3, b2, t3 ++ ADD2 s1, t1, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a12, t1 ++ LD a12, 6 * SIZE(A1) + -+ ADD3 c04, t4, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, t4 -+ ldi BO, 2 * SIZE(BO) ++ ADD1 s2, t2, s2 ++ unop ++ MUL x3, a15, t2 ++ LD a15, 7 * SIZE(A2) + -+ ADD4 c05, t1, c05 -+ ADD2 c06, t2, c06 -+ ADD4 c07, t3, c07 -+ ADD2 c08, t4, c08 ++ ADD2 s3, t3, s3 ++ MUL x3, a14, t3 ++ LD a14, 6 * SIZE(A2) ++ bgt I, $L12 ++ .align 4 + -+ ADD c01, c06, c01 -+ ADD c02, c05, c02 -+ ADD c03, c08, c03 -+ ADD c04, c07, c04 ++$L13: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) + -+$L48: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif ++ ADD4 s1, t1, s1 ++ MUL x0, a1, t1 ++ ADD3 s2, t2, s2 ++ MUL x0, a2, t2 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+#endif ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 4 * SIZE(X1) + -+#ifdef LN -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ LD a3, 4 * SIZE(AO) -+ LD a4, 5 * SIZE(AO) ++ ADD1 s0, t0, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 + -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 ++ ADD1 s2, t2, s2 ++ unop ++ MUL x1, a3, t2 ++ unop + -+ ADD5 c03, t1, c03 -+ ADD6 c04, t2, c04 -+ MUL a3, c03, t1 -+ MUL a3, c04, t2 ++ ADD2 s3, t3, s3 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a2, t3 ++ LD x1, 5 * SIZE(X1) + -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ MUL a4, c04, t1 -+ MUL a4, c03, t2 ++ ADD3 s0, t0, s0 ++ MUL x2, a4, t0 ++ ADD4 s1, t1, s1 ++ MUL x2, a5, t1 + -+ ADD6 c01, t1, c01 -+ ADD5 c02, t2, c02 -+ -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++ ADD3 s2, t2, s2 ++ unop ++ MUL x2, a6, t2 ++ unop + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 ++ ADD4 s3, t3, s3 ++ ldi A2, 8 * SIZE(A2) ++ MUL x2, a7, t3 ++ LD x2, 6 * SIZE(X1) + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+#endif ++ ADD1 s0, t0, s0 ++ MUL x3, a5, t0 ++ ADD2 s1, t1, s1 ++ MUL x3, a4, t1 + -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) ++ ADD1 s2, t2, s2 ++ unop ++ MUL x3, a7, t2 ++ ldi X1, 8 * SIZE(X1) + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 ++ ADD2 s3, t3, s3 ++ unop ++ MUL x3, a6, t3 ++ LD x3, -1 * SIZE(X1) + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 ++ ADD3 s0, t0, s0 ++ MUL x0, a8, t0 ++ ADD4 s1, t1, s1 ++ MUL x0, a9, t1 + -+ SUB c03, t1, c03 -+ SUB c04, t2, c04 ++ ADD3 s2, t2, s2 ++ MUL x0, a10, t2 ++ ADD4 s3, t3, s3 ++ MUL x0, a11, t3 + -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ ADD6 c03, t1, c03 -+ ADD5 c04, t2, c04 ++ ADD1 s0, t0, s0 ++ MUL x1, a9, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a8, t1 + -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) ++ ADD1 s2, t2, s2 ++ MUL x1, a11, t2 ++ ADD2 s3, t3, s3 ++ MUL x1, a10, t3 + -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 ++ ADD3 s0, t0, s0 ++ MUL x2, a12, t0 ++ ADD4 s1, t1, s1 ++ MUL x2, a13, t1 + -+ ADD5 c03, t1, c03 -+ ADD6 c04, t2, c04 -+#endif ++ ADD3 s2, t2, s2 ++ MUL x2, a14, t2 ++ ADD4 s3, t3, s3 ++ MUL x2, a15, t3 + -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++ ADD1 s0, t0, s0 ++ MUL x3, a13, t0 ++ ADD2 s1, t1, s1 ++ MUL x3, a12, t1 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 ++ ADD1 s2, t2, s2 ++ MUL x3, a15, t2 ++ ADD2 s3, t3, s3 ++ MUL x3, a14, t3 ++ .align 4 + -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 ++$L15: ++ and M, 3, I ++ ble I, $L18 + -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ ADD5 c03, t3, c03 -+ ADD6 c04, t4, c04 -+#endif ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+#endif ++ LD x0, 0 * SIZE(X1) + -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+#endif ++ ldi I, -1(I) ++ ble I, $L17 ++ .align 4 + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) ++$L16: ++ ADD3 s0, t0, s0 ++ ldi I, -1(I) ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) + -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+#endif ++ ADD4 s1, t1, s1 ++ MUL x0, a1, t1 ++ ADD3 s2, t2, s2 ++ MUL x0, a2, t2 + -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 2 * SIZE(X1) + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl BO, TMP2, BO -+#endif ++ ADD1 s0, t0, s0 ++ ldi A2, 2 * SIZE(A2) ++ MUL x1, a1, t0 ++ LD a1, 3 * SIZE(A1) + -+#ifdef LT -+ addl KK, 2, KK -+#endif ++ ADD2 s1, t1, s1 ++ ldi X1, 2 * SIZE(X1) ++ MUL x1, a0, t1 ++ LD a0, 2 * SIZE(A1) + -+#ifdef LN -+ subl KK, 2, KK -+#endif ++ ADD1 s2, t2, s2 ++ ldi A1, 2 * SIZE(A1) ++ MUL x1, a3, t2 ++ LD a3, 1 * SIZE(A2) + -+ ldi I, -1(I) -+ bgt I, $L41 ++ ADD2 s3, t3, s3 ++ MUL x1, a2, t3 ++ LD a2, 0 * SIZE(A2) ++ bgt I, $L16 + .align 4 + -+$L59: -+#ifdef LN -+ sll K, ZBASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif ++$L17: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) + -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ unop + -+#ifdef RN -+ addl KK, 1, KK -+#endif ++ ADD3 s2, t2, s2 ++ MUL x0, a2, t2 ++ ADD4 s3, t3, s3 ++ MUL x0, a3, t3 + -+#ifdef RT -+ subl KK, 1, KK -+#endif -+ .align 4 ++ ADD1 s0, t0, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 + -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ clr $0 -+ ldi $sp, STACKSIZE($sp) -+ ret -+ .ident VERSION -+ .end CNAME -diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S b/kernel/sw_64/ztrsm_kernel_2x2_LT.S -new file mode 100644 -index 0000000..bb38b56 ---- /dev/null -+++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S -@@ -0,0 +1,2624 @@ -+/*********************************************************************/ -+/* Copyright 2009, 2010 The University of Texas at Austin. */ -+/* All rights reserved. */ -+/* */ -+/* Redistribution and use in source and binary forms, with or */ -+/* without modification, are permitted provided that the following */ -+/* conditions are met: */ -+/* */ -+/* 1. Redistributions of source code must retain the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer. */ -+/* */ -+/* 2. Redistributions in binary form must reproduce the above */ -+/* copyright notice, this list of conditions and the following */ -+/* disclaimer in the documentation and/or other materials */ -+/* provided with the distribution. */ -+/* */ -+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -+/* POSSIBILITY OF SUCH DAMAGE. */ -+/* */ -+/* The views and conclusions contained in the software and */ -+/* documentation are those of the authors and should not be */ -+/* interpreted as representing official policies, either expressed */ -+/* or implied, of The University of Texas at Austin. */ -+/*********************************************************************/ ++ ADD1 s2, t2, s2 ++ MUL x1, a3, t2 ++ ADD2 s3, t3, s3 ++ MUL x1, a2, t3 ++ .align 4 + -+#define ASSEMBLER -+#include "common.h" -+#include "version.h" ++$L18: ++ LD a0, 0 * SIZE(Y) ++ unop ++ LD a1, 1 * SIZE(Y) ++ addl Y, INCY, Y + -+#if !defined(EV4) && !defined(EV5) && !defined(SW6) -+#error "Architecture is not specified." -+#endif ++ LD a2, 0 * SIZE(Y) ++ unop ++ LD a3, 1 * SIZE(Y) ++ addl Y, INCY, Y + -+#ifdef SW6 -+#define PREFETCHSIZE 56 -+#define UNOP unop -+#endif ++ ADD3 s0, t0, s0 ++ ADD4 s1, t1, s1 ++ ADD3 s2, t2, s2 ++ ADD4 s3, t3, s3 + -+#ifdef EV5 -+#define PREFETCHSIZE 48 -+#define UNOP -+#endif ++ MUL alpha_r, s0, t0 ++ MUL alpha_r, s1, t1 ++ MUL alpha_r, s2, t2 ++ MUL alpha_r, s3, t3 + -+#ifdef EV4 -+#define UNOP -+#endif ++ ADD a0, t0, a0 ++ MUL alpha_i, s1, t0 ++ ADD a1, t1, a1 ++ MUL alpha_i, s0, t1 ++ ADD a2, t2, a2 ++ MUL alpha_i, s3, t2 ++ ADD a3, t3, a3 ++ MUL alpha_i, s2, t3 + -+ .set noat -+ .set noreorder -+ .arch sw6a ++ SUB a0, t0, a0 ++ ADD a1, t1, a1 ++ SUB a2, t2, a2 ++ ADD a3, t3, a3 + -+.text -+ .align 5 -+ .globl CNAME -+ .ent CNAME ++ ST a0, 0 * SIZE(Y1) ++ fclr t0 ++ ST a1, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 + -+#define STACKSIZE 88 ++ ST a2, 0 * SIZE(Y1) ++ fclr t1 ++ ST a3, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 + -+#define M $16 -+#define N $17 -+#define K $18 -+#define A $21 -+#define B $22 -+#define C $20 -+#define LDC $23 ++ fclr t2 ++ ldi J, -1(J) ++ fclr t3 ++ bgt J, $L11 ++ .align 4 + -+#define C1 $19 -+#define C2 $24 ++$L20: ++ blbc N, $L999 + -+#define AO $at -+#define BO $5 -+#define I $6 -+#define J $7 -+#define L $8 ++ mov A, A1 ++ fclr s0 ++ fclr s1 ++ mov X, X1 + -+#define tmp $9 ++ sra M, 2, I ++ fclr s2 ++ fclr s3 ++ ble I, $L25 + -+#define a1 $f16 -+#define a2 $f17 -+#define a3 $f18 -+#define a4 $f19 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 3 * SIZE(A1) ++ LD a8, 4 * SIZE(A1) ++ LD a9, 5 * SIZE(A1) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 7 * SIZE(A1) + -+#define b1 $f20 -+#define b2 $f21 -+#define b3 $f22 -+#define b4 $f23 ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) + -+#define t1 $f24 -+#define t2 $f25 -+#define t3 $f26 -+#define t4 $f27 ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 + -+#define a5 $f28 -+#define a6 $f30 -+#define b5 $f29 ++$L22: ++ ADD3 s0, t0, s0 ++ s_fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) + -+#define alpha_i $f29 -+#define alpha_r $f30 ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ LD x0, 4 * SIZE(X1) + -+#define c01 $f0 -+#define c02 $f1 -+#define c03 $f2 -+#define c04 $f3 ++ ADD1 s2, t0, s2 ++ ldi I, -1(I) ++ MUL x1, a1, t0 ++ LD a1, 9 * SIZE(A1) + -+#define c05 $f4 -+#define c06 $f5 -+#define c07 $f6 -+#define c08 $f7 ++ ADD2 s3, t1, s3 ++ unop ++ MUL x1, a0, t1 ++ LD a0, 8 * SIZE(A1) + -+#define c09 $f8 -+#define c10 $f9 -+#define c11 $f10 -+#define c12 $f11 ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a4, t0 ++ LD x1, 5 * SIZE(X1) + -+#define c13 $f12 -+#define c14 $f13 -+#define c15 $f14 -+#define c16 $f15 ++ ADD4 s1, t1, s1 ++ unop ++ MUL x2, a5, t1 ++ LD x2, 6 * SIZE(X1) + -+#define TMP1 $0 -+#define TMP2 $1 -+#define KK $2 -+#define AORIG $3 -+#define OFFSET $4 ++ ADD1 s2, t0, s2 ++ unop ++ MUL x3, a5, t0 ++ LD a5, 11 * SIZE(A1) + -+#if defined(LN) || defined(LT) -+#ifndef CONJ -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#define ADD5 SUB -+#define ADD6 ADD -+#else -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 SUB -+#define ADD4 ADD -+#define ADD5 ADD -+#define ADD6 SUB -+#endif -+#else -+#ifndef CONJ -+#define ADD1 ADD -+#define ADD2 SUB -+#define ADD3 ADD -+#define ADD4 ADD -+#define ADD5 SUB -+#define ADD6 ADD -+#else -+#define ADD1 ADD -+#define ADD2 ADD -+#define ADD3 ADD -+#define ADD4 SUB -+#define ADD5 ADD -+#define ADD6 SUB -+#endif -+#endif ++ ADD2 s3, t1, s3 ++ unop ++ MUL x3, a4, t1 ++ LD a4, 10 * SIZE(A1) + ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a8, t0 ++ LD x3, 7 * SIZE(X1) + -+CNAME: -+ .frame $sp, STACKSIZE, $26, 0 ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a9, t1 ++ LD x0, 8 * SIZE(X1) + -+#ifdef PROFILE -+ ldgp $gp, 0($27) -+ ldi $at, _mcount -+ jsr $at, ($at), _mcount -+#endif ++ ADD1 s2, t0, s2 ++ unop ++ MUL x1, a9, t0 ++ LD a9, 13 * SIZE(A1) + -+#ifndef PROFILE -+ .prologue 0 -+#else -+ .prologue 1 -+#endif ++ ADD2 s3, t1, s3 ++ unop ++ MUL x1, a8, t1 ++ LD a8, 12 * SIZE(A1) + -+ ldi $sp, -STACKSIZE($sp) ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a12, t0 ++ LD x1, 9 * SIZE(X1) + -+ ldl B, 0 + STACKSIZE($sp) -+ ldl C, 8 + STACKSIZE($sp) -+ ldl LDC, 16 + STACKSIZE($sp) -+ ldl OFFSET, 24 + STACKSIZE($sp) ++ ADD4 s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x2, a13, t1 ++ LD x2, 10 * SIZE(X1) + -+ sll LDC, ZBASE_SHIFT, LDC ++ ADD1 s2, t0, s2 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a13, t0 ++ LD a13, 7 * SIZE(A1) + -+ fstd $f2, 0($sp) -+ fstd $f3, 8($sp) -+ fstd $f4, 16($sp) -+ fstd $f5, 24($sp) -+ fstd $f6, 32($sp) -+ fstd $f7, 40($sp) -+ fstd $f8, 48($sp) -+ fstd $f9, 56($sp) -+ stl tmp, 72($sp) ++ ADD2 s3, t1, s3 ++ MUL x3, a12, t1 ++ LD a12, 6 * SIZE(A1) ++ bgt I, $L22 ++ .align 4 + -+ cmple M, 0, $0 -+ cmple N, 0, $1 -+ cmple K, 0, $2 ++$L23: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) + -+ or $0, $1, $0 -+ or $0, $2, $0 -+ bne $0, $L999 ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ LD x0, 4 * SIZE(X1) + -+#ifdef LN -+ addl M, M, TMP2 -+ mull TMP2, K, TMP1 -+ SXADDQ TMP1, A, A -+ SXADDQ TMP2, C, C -+#endif ++ ADD1 s2, t0, s2 ++ unop ++ MUL x1, a1, t0 ++ ldi A1, 8 * SIZE(A1) + -+#ifdef RN -+ negl OFFSET, KK -+#endif ++ ADD2 s3, t1, s3 ++ unop ++ MUL x1, a0, t1 ++ LD x1, 5 * SIZE(X1) + -+#ifdef RT -+ mull N, K, TMP1 -+ addl TMP1, TMP1, TMP1 -+ SXADDQ TMP1, B, B ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a4, t0 ++ unop + -+ mull N, LDC, TMP1 -+ addl TMP1, C, C ++ ADD4 s1, t1, s1 ++ unop ++ MUL x2, a5, t1 ++ LD x2, 6 * SIZE(X1) + -+ subl N, OFFSET, KK -+#endif ++ ADD1 s2, t0, s2 ++ unop ++ MUL x3, a5, t0 ++ ldi X1, 8 * SIZE(X1) + -+ sra N, 1, J -+ ble J, $L30 -+ .align 4 ++ ADD2 s3, t1, s3 ++ unop ++ MUL x3, a4, t1 ++ LD x3, -1 * SIZE(X1) + -+$L01: -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ subl B, TMP1, B ++ ADD3 s0, t0, s0 ++ MUL x0, a8, t0 ++ ADD4 s1, t1, s1 ++ MUL x0, a9, t1 + -+ subl C, LDC, C2 -+ subl C2, LDC, C1 -+ subl C2, LDC, C -+#else -+ mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C -+#endif ++ ADD1 s2, t0, s2 ++ MUL x1, a9, t0 ++ ADD2 s3, t1, s3 ++ MUL x1, a8, t1 + -+#ifdef LN -+ addl M, OFFSET, KK -+#endif ++ ADD3 s0, t0, s0 ++ MUL x2, a12, t0 ++ ADD4 s1, t1, s1 ++ MUL x2, a13, t1 + -+#ifdef LT -+ mov OFFSET, KK -+#endif ++ ADD1 s2, t0, s2 ++ MUL x3, a13, t0 ++ ADD2 s3, t1, s3 ++ MUL x3, a12, t1 ++ .align 4 + -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif ++$L25: ++ and M, 3, I ++ ble I, $L28 + -+ sra M, 1, I -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) + -+ fclr c01 -+ fclr c05 ++ LD x0, 0 * SIZE(X1) + -+ ble I, $L20 ++ ldi I, -1(I) ++ ble I, $L27 + .align 4 + -+$L11: -+#if defined(LT) || defined(RN) ++$L26: ++ ADD3 s0, t0, s0 ++ ldi A1, 2 * SIZE(A1) ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++ ADD4 s1, t1, s1 ++ ldi I, -1(I) ++ MUL x0, a1, t1 ++ LD x0, 2 * SIZE(X1) + -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++ ADD1 s0, t0, s0 ++ ldi X1, 2 * SIZE(X1) ++ MUL x1, a1, t0 ++ LD a1, 1 * SIZE(A1) + -+ LD b1, 0 * SIZE(B) -+ fclr c10 -+ LD b2, 1 * SIZE(B) -+ fclr c14 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ LD a0, 0 * SIZE(A1) ++ bgt I, $L26 ++ .align 4 + -+ LD b3, 2 * SIZE(B) -+ fclr c03 -+ LD b4, 3 * SIZE(B) -+ fclr c07 ++$L27: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) + -+ ldi BO, 4 * SIZE(B) -+ fclr c11 -+ ldi AO, 4 * SIZE(AO) -+ fclr c15 ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ unop + -+ fillcs 4 * SIZE(C1) -+ fclr c04 -+ ldi L, -2(KK) -+ fclr c08 ++ ADD1 s0, t0, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ .align 4 + -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble KK, $L18 -+ ble L, $L15 -+#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif ++$L28: ++ LD a0, 0 * SIZE(Y) ++ LD a1, 1 * SIZE(Y) + -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ addl B, TMP1, BO ++ ADD3 s0, t0, s0 ++ ADD4 s1, t1, s1 ++ ADD3 s2, t2, s2 ++ ADD4 s3, t3, s3 + -+ subl K, KK, TMP1 ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++ MUL alpha_r, s0, t0 ++ MUL alpha_r, s1, t1 + -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++ ADD a0, t0, a0 ++ MUL alpha_i, s1, t0 ++ ADD a1, t1, a1 ++ MUL alpha_i, s0, t1 + -+ LD b1, 0 * SIZE(BO) -+ fclr c10 -+ LD b2, 1 * SIZE(BO) -+ fclr c14 ++ SUB a0, t0, a0 ++ ADD a1, t1, a1 + -+ LD b3, 2 * SIZE(BO) -+ fclr c03 -+ LD b4, 3 * SIZE(BO) -+ fclr c07 ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ .align 4 + -+ ldi BO, 4 * SIZE(BO) -+ fclr c11 -+ ldi AO, 4 * SIZE(AO) -+ fclr c15 ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) + -+ fillcs 4 * SIZE(C1) -+ fclr c04 -+ ldi L, -2(TMP1) -+ fclr c08 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/znrm2.S b/kernel/sw_64/znrm2.S +new file mode 100644 +index 000000000..1892c5f2b +--- /dev/null ++++ b/kernel/sw_64/znrm2.S +@@ -0,0 +1,428 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble TMP1, $L18 -+ ble L, $L15 -+#endif -+ .align 5 ++#define ASSEMBLER + -+$L12: -+/* 1 */ -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif ++#include "common.h" + -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ unop -+ MUL b1, a2, t2 -+ unop + -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) ++#define PREFETCH_SIZE 80 + -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ FIMOVD b5, tmp -+/* 2 */ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP ++#define I $0 + -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 -+ unop ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 + -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a1, t4 -+ unop ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 + -+/* 3 */ -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, t1 -+ unop ++ PROLOGUE + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 -+ unop ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 + -+ ADD2 c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif + -+/* 4 */ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) ++ fclr a0 ++ sll INCX, ZBASE_SHIFT, INCX ++ fclr a1 ++ ble N, $L999 + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) ++ beq INCX, $L999 + -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) ++ fclr a2 ++ cmpeq INCX, 2 * SIZE, $0 ++ fclr a3 ++ beq $0, $L20 + -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L15 + -+/* 5 */ -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) + -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ ldi L, -2(L) -+ IFMOVD tmp, b5 -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) + -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ unop ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 + -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a5, t4 -+ unop ++$L11: ++ faddd a0, t0, a0 ++ s_fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) + -+/* 6 */ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a6, t1 -+ unop ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a4, t2 ++ faddd a2, t2, a2 + unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) + -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 ++ faddd a3, t3, a3 + unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) + -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a5, t4 -+ unop -+ -+/* 7 */ -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 ++ faddd a0, t0, a0 + unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 ++ faddd a1, t1, a1 + unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) + -+ ADD2 c08, t3, b5 -+ fmov b5, c08 ++ faddd a2, t2, a2 + unop -+ MUL b4, a2, t3 -+ LD a2, -3 * SIZE(AO) ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 ++ faddd a3, t3, a3 + unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) + -+/* 8 */ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 ++ faddd a0, t0, a0 + unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) + -+ ADD2 c14, t3, b5 -+ fmov b5, c14 ++ faddd a2, t2, a2 + unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 -+ .align 4 ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) + -+$L15: -+ ADD1 c11, t1, b5 -+ fmov b5, c11 ++ faddd a3, t3, a3 + unop -+ MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L17 -+#else -+ blbs TMP1, $L17 -+#endif -+ .align 4 ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) + -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, t2 -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, t3 ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) + -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, t1 ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ faddd a2, t2, a2 + unop -+ MUL b1, a4, t2 -+ LD b1, 0 * SIZE(BO) ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) + -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, t3 -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, t4 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 + -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, t1 -+ LD a1, 0 * SIZE(AO) ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 ++ faddd a1, t1, a1 + unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) + -+ ADD2 c08, t3, b5 -+ fmov b5, c08 ++ faddd a2, t2, a2 + unop -+ MUL b4, a2, t3 -+ LD a2, 1 * SIZE(AO) ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 ++ faddd a3, t3, a3 + unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) + -+ ADD1 c09, t1, b5 -+ fmov b5, c09 ++ faddd a0, t0, a0 + unop -+ MUL b3, a3, t1 -+ ldi AO, 4 * SIZE(AO) ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 ++ faddd a1, t1, a1 + unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) + -+ ADD2 c14, t3, b5 -+ fmov b5, c14 ++ faddd a2, t2, a2 + unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) + -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) + -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 + -+$L17: -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, t2 -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, t3 ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 + -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, t1 ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL b1, a4, t2 -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, t3 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 + -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, t4 -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+ MUL b3, a1, t1 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+ MUL b3, a2, t2 -+ ADD2 c08, t3, b5 -+ fmov b5, c08 -+ MUL b4, a2, t3 ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 -+ MUL b2, a3, t4 -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ MUL b3, a3, t1 ++$L16: ++ LD x0, 0 * SIZE(X) ++ LD x1, 1 * SIZE(X) + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ MUL b3, a4, t2 -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+ MUL b4, a4, t3 ++ ldi X, 2 * SIZE(X) + -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+ ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, t4 -+ ldi BO, 4 * SIZE(BO) ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 + -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 -+ ADD c03, c08, b5 -+ fmov b5, c03 -+ ADD c04, c07, b5 -+ fmov b5, c04 -+ -+ ADD c09, c14, b5 -+ fmov b5, c09 -+ ADD c10, c13, b5 -+ fmov b5, c10 -+ ADD c11, c16, b5 -+ fmov b5, c11 -+ ADD c12, c15, b5 -+ fmov b5, c12 ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 + .align 4 + -+$L18: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif ++$L20: ++ fclr t0 ++ sra N, 2, I ++ fclr t1 ++ ble I, $L25 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) ++ LD x0, 0 * SIZE(X) ++ fclr t2 ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ fclr t3 ++ LD x3, 1 * SIZE(X) ++ addl X, INCX, X + -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) ++ LD x4, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x5, 1 * SIZE(X) ++ addl X, INCX, X + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c10, b5 -+ fmov b5, c10 -+ -+ SUB b1, c03, b5 -+ fmov b5, c03 -+ SUB b2, c04, b5 -+ fmov b5, c04 -+ SUB b3, c11, b5 -+ fmov b5, c11 -+ SUB b4, c12, b5 -+ fmov b5, c12 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) ++ LD x6, 0 * SIZE(X) ++ ble I, $L22 ++ .align 4 + -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+ -+ SUB b1, c09, b5 -+ fmov b5, c09 -+ SUB b2, c10, b5 -+ fmov b5, c10 -+ SUB b3, c11, b5 -+ fmov b5, c11 -+ SUB b4, c12, b5 -+ fmov b5, c12 -+#endif ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ unop + -+#ifdef LN -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ LD a3, 4 * SIZE(AO) -+ LD a4, 5 * SIZE(AO) ++ faddd a2, t2, a2 ++ LD x1, 1 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X + -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ unop + -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 ++ faddd a0, t0, a0 ++ LD x3, 1 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X + -+ MUL a3, c03, t1 -+ MUL a3, c04, t2 -+ MUL a3, c11, t3 -+ MUL a3, c12, t4 ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ ldi I, -1(I) + -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c10, t4, b5 -+ fmov b5, c10 ++ faddd a2, t2, a2 ++ LD x5, 1 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X + -+ MUL a4, c04, t1 -+ MUL a4, c03, t2 -+ MUL a4, c12, t3 -+ MUL a4, c11, t4 ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ bgt I, $L21 ++ .align 4 + -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 -+ ADD6 c09, t3, b5 -+ fmov b5, c09 -+ ADD5 c10, t4, b5 -+ fmov b5, c10 ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X + -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c09, t3, b5 -+ fmov b5, c09 -+ ADD6 c10, t4, b5 -+ fmov b5, c10 -+#endif ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 + -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 ++$L25: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c09, t3, b5 -+ fmov b5, c09 -+ ADD6 c10, t4, b5 -+ fmov b5, c10 ++$L26: ++ LD x0, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X + -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c09, t3 -+ MUL a3, c10, t4 ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 + -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c04, t2, b5 -+ fmov b5, c04 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 ++ bgt I, $L26 ++ .align 4 + -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ MUL a4, c10, t3 -+ MUL a4, c09, t4 + -+ ADD6 c03, t1, b5 -+ fmov b5, c03 -+ ADD5 c04, t2, b5 -+ fmov b5, c04 -+ ADD6 c11, t3, b5 -+ fmov b5, c11 -+ ADD5 c12, t4, b5 -+ fmov b5, c12 ++$L998: ++ faddd a0, t0, a0 ++ faddd a1, t1, a1 + -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 + -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 + -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 +#endif ++ .align 4 + -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zrot.S b/kernel/sw_64/zrot.S +new file mode 100644 +index 000000000..3d05a2de1 +--- /dev/null ++++ b/kernel/sw_64/zrot.S +@@ -0,0 +1,631 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 ++#define ASSEMBLER ++#include "common.h" + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c03, t3, b5 -+ fmov b5, c03 -+ ADD6 c04, t4, b5 -+ fmov b5, c04 + -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c03, t3 -+ MUL a3, c04, t4 ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 + -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 ++#define C $f10 ++#define S $f11 + -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ MUL a4, c04, t3 -+ MUL a4, c03, t4 ++#define PREFETCH_SIZE 80 + -+ ADD6 c09, t1, b5 -+ fmov b5, c09 -+ ADD5 c10, t2, b5 -+ fmov b5, c10 -+ ADD6 c11, t3, b5 -+ fmov b5, c11 -+ ADD5 c12, t4, b5 -+ fmov b5, c12 ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 + -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++ fmov $f21, C ++ LD S, 0($sp) + -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 -+#endif ++ addl INCX, INCX, INCX ++ addl INCY, INCY, INCY + -+#ifdef RT -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ LD a3, 4 * SIZE(BO) -+ LD a4, 5 * SIZE(BO) ++ cmpeq INCX, 2, $23 ++ cmpeq INCY, 2, $24 ++ ble N, $L998 + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++ and $23, $24, $23 ++ beq $23, $L50 + -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 ++ sra N, 2, I ++ ble I, $L15 + -+ MUL a3, c09, t1 -+ MUL a3, c10, t2 -+ MUL a3, c11, t3 -+ MUL a3, c12, t4 ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) + -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) + -+ MUL a4, c10, t1 -+ MUL a4, c09, t2 -+ MUL a4, c12, t3 -+ MUL a4, c11, t4 ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 + -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 -+ ADD6 c03, t3, b5 -+ fmov b5, c03 -+ ADD5 c04, t4, b5 -+ fmov b5, c04 ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 + -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, $f22 ++ MUL C, $f15, $f27 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ ble I, $L13 ++ .align 4 + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c03, t3, b5 -+ fmov b5, c03 -+ ADD6 c04, t4, b5 -+ fmov b5, c04 -+#endif ++$L12: ++ MUL C, $f16, $f21 ++ fillde (PREFETCH_SIZE) * SIZE(X) ++ unop ++ LD $f14, 5*SIZE(X) + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c10, 3 * SIZE(BO) ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 + -+ ST c03, 4 * SIZE(BO) -+ ST c04, 5 * SIZE(BO) -+ ST c11, 6 * SIZE(BO) -+ ST c12, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) ++ MUL C, $f17, $f23 ++ fillde (PREFETCH_SIZE) * SIZE(Y) ++ unop ++ LD $f17, 6*SIZE(Y) + -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c11, 6 * SIZE(AO) -+ ST c12, 7 * SIZE(AO) -+#endif ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 + -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+#endif ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop ++ unop + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 + -+ ST c09, 0 * SIZE(C2) -+ ST c10, 1 * SIZE(C2) -+ ST c11, 2 * SIZE(C2) -+ ST c12, 3 * SIZE(C2) ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) + -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+#endif ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 + -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop + -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop + -+#ifdef LT -+ addl KK, 2, KK -+#endif ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 + -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ fclr c01 -+ fclr c05 ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop + -+ ldi I, -1(I) -+ bgt I, $L11 -+ .align 4 ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 + -+$L20: -+ and M, 1, I -+ ble I, $L29 ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop + -+#if defined(LT) || defined(RN) ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop + -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 + -+ LD b1, 0 * SIZE(B) -+ fclr c10 -+ LD b2, 1 * SIZE(B) -+ fclr c14 ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop + -+ LD b3, 2 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 4 * SIZE(B) ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 + -+ ldi L, -2(KK) ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop + -+ ble KK, $L28 -+ ble L, $L25 -+#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 + -+ sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop + -+ subl K, KK, TMP1 ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, $f24 + -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop + -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, $f26 + -+ LD b1, 0 * SIZE(BO) -+ fclr c10 -+ LD b2, 1 * SIZE(BO) -+ fclr c14 ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop + -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 4 * SIZE(BO) ++ ST $f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 + -+ ldi L, -2(TMP1) ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop + -+ ble TMP1, $L28 -+ ble L, $L25 -+#endif -+ .align 5 ++ ST $f26, -1*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 + -+$L22: -+ ADD1 c09, t1, b5 -+ fmov b5, c09 ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) + unop -+ MUL a1, b1, t1 + unop + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ MUL C, $f16, $f21 ++ LD $f14, 5*SIZE(X) ++ unop + unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) + -+ ADD4 c13, t3, b5 -+ fmov b5, c13 ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 + unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) ++ ADD $f25, $f26, $f26 + -+ ADD2 c14, t4, b5 -+ fmov b5, c14 ++ MUL C, $f17, $f23 + unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) ++ unop ++ LD $f17, 6*SIZE(Y) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop + unop -+ MUL a1, b3, t1 + unop + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 + unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) ++ ADD $f21, $f22, $f22 + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 ++ MUL C, $f19, $f27 + unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) ++ unop ++ LD $f19, 7*SIZE(Y) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ FIMOVD b5, tmp ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ LD $f18, 7*SIZE(X) ++ SUB $f23, $f24, $f24 + -+ ADD1 c09, t1, b5 -+ fmov b5, c09 ++ MUL C, $f12, $f21 ++ unop ++ unop + unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 ++ ST $f22, 2*SIZE(X) + unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 + -+ ADD4 c13, t3, b5 -+ fmov b5, c13 ++ MUL C, $f13, $f23 ++ unop ++ unop + unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) + -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 + unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) ++ ADD $f21, $f22, $f22 + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 ++ MUL C, $f15, $f27 ++ unop ++ unop + unop -+ IFMOVD tmp, b5 -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ IFMOVD tmp, b5 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 + -+$L25: -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 -+#else -+ blbs TMP1, $L27 -+#endif -+ .align 4 ++ MUL C, $f16, $f21 ++ unop ++ unop ++ unop + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 + unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) ++ ADD $f25, $f26, $f26 + -+ ADD4 c13, t3, b5 -+ fmov b5, c13 ++ MUL C, $f17, $f23 ++ unop + unop -+ MUL a1, b2, t3 + unop + -+ ADD2 c14, t4, b5 -+ fmov b5, c14 ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 + unop -+ MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) ++ SUB $f27, $f28, $f28 + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++ MUL C, $f18, $f25 ++ unop ++ unop + unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 + unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) ++ ADD $f21, $f22, $f22 + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 ++ MUL C, $f19, $f27 ++ unop ++ unop + unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 + unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) ++ SUB $f23, $f24, $f24 + -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, $f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) + .align 4 + -+$L27: -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ MUL a2, b1, t2 -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+ MUL a1, b2, t3 + -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+ MUL a2, b2, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, t1 ++$L15: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, t2 -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b4, t3 ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, t4 -+ ldi BO, 4 * SIZE(BO) ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ldi X, 2 * SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ ldi Y, 2 * SIZE(Y) + -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+ -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 -+ ADD c09, c14, b5 -+ fmov b5, c09 -+ ADD c10, c13, b5 -+ fmov b5, c10 ++ bgt I, $L16 + .align 4 + -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif ++$L998: ++ clr $0 ++ ret ++ .align 4 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) ++$L50: ++ mov X, XX ++ mov Y, YY + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c10, b5 -+ fmov b5, c10 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) ++ sra N, 2, I ++ ble I, $L55 ++ .align 4 + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c10, b5 -+ fmov b5, c10 -+#endif ++$L51: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c09, t3, b5 -+ fmov b5, c09 -+ ADD6 c10, t4, b5 -+ fmov b5, c10 -+#endif ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 + -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 + -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y + -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ ADD6 c09, t1, b5 -+ fmov b5, c09 -+ ADD5 c10, t2, b5 -+ fmov b5, c10 ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 + -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 + -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 -+#endif ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 ++ ++$L55: ++ and N, 3, I ++ ble I, $L999 ++ .align 4 ++ ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 ++ .align 4 ++ ++$L999: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zscal.S b/kernel/sw_64/zscal.S +new file mode 100644 +index 000000000..a0cb1671a +--- /dev/null ++++ b/kernel/sw_64/zscal.S +@@ -0,0 +1,341 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" + -+#ifdef RT -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ LD a3, 4 * SIZE(BO) -+ LD a4, 5 * SIZE(BO) + -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 ++#define PREFETCHSIZE 88 + -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 ++#define N $16 ++#define X $21 ++#define INCX $17 + -+ MUL a3, c09, t1 -+ MUL a3, c10, t2 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 ++#define XX $18 ++#define I $19 + -+ MUL a4, c10, t1 -+ MUL a4, c09, t2 -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 ++#define ALPHA_R $f19 ++#define ALPHA_I $f20 + -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+#endif ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c10, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c10, 3 * SIZE(AO) -+#endif ++#define t4 $f26 ++#define t5 $f27 ++#define t6 $f28 ++#define t7 $f29 + -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) -+#endif ++ PROLOGUE ++ PROFCODE + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c09, 0 * SIZE(C2) -+ ST c10, 1 * SIZE(C2) ++ ldl INCX, 0($sp) ++ mov X, XX ++ ble N, $L999 + -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) -+#endif ++ fbne ALPHA_R, $NORMAL ++ fbne ALPHA_I, $NORMAL + -+#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ addl INCX, INCX, INCX + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO -+#endif ++ sra N, 2, I ++ ble I, $LL15 + -+#ifdef LT -+ addl KK, 1, KK -+#endif ++ SXADDQ INCX, X, X ++ SXADDQ INCX, X, X ++ SXADDQ INCX, X, X ++ SXADDQ INCX, X, X + -+#ifdef LN -+ subl KK, 1, KK -+#endif ++ ldi I, -1(I) ++ ble I, $LL13 + .align 4 + -+$L29: -+#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl B, TMP1, B -+#endif ++$LL12: ++ ST $f31, 0 * SIZE(XX) ++ ST $f31, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ SXADDQ INCX, X, X + -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif ++ ST $f31, 0 * SIZE(XX) ++ ST $f31, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ SXADDQ INCX, X, X + -+#ifdef RN -+ addl KK, 2, KK -+#endif ++ ST $f31, 0 * SIZE(XX) ++ ST $f31, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ SXADDQ INCX, X, X + -+#ifdef RT -+ subl KK, 2, KK -+#endif ++ ST $f31, 0 * SIZE(XX) ++ ST $f31, 1 * SIZE(XX) ++ ldi I, -1(I) ++ SXADDQ INCX, XX, XX + -+ ldi J, -1(J) -+ bgt J, $L01 ++ fillde PREFETCHSIZE * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $LL12 + .align 4 + -+$L30: -+ and N, 1, J -+ ble J, $L999 -+ -+#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 -+ subl B, TMP1, B ++$LL13: ++ ST $f31, 0 * SIZE(XX) ++ ST $f31, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ unop + -+ subl C, LDC, C1 -+ subl C, LDC, C -+#else -+ mov C, C1 -+ addl C, LDC, C -+#endif ++ ST $f31, 0 * SIZE(XX) ++ ST $f31, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ unop + -+#ifdef LN -+ addl M, OFFSET, KK -+#endif ++ ST $f31, 0 * SIZE(XX) ++ ST $f31, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX + -+#ifdef LT -+ mov OFFSET, KK -+#endif ++ ST $f31, 0 * SIZE(XX) ++ ST $f31, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX + -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO -+#endif ++ .align 4 + -+ sra M, 1, I -+ ble I, $L50 ++$LL15: ++ and N, 3, I ++ unop ++ unop ++ ble I, $L999 + .align 4 + -+$L41: -+#if defined(LT) || defined(RN) ++$LL17: ++ SXADDQ INCX, X, X ++ ST $f31, 0 * SIZE(XX) ++ ST $f31, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++ ldi I, -1(I) ++ bne I, $LL17 + -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 ++ ret ++ .align 4 + -+ ldi BO, 2 * SIZE(B) -+ fclr c03 -+ ldi AO, 4 * SIZE(AO) -+ fclr c07 + -+ ldi L, -2(KK) -+ fclr c04 -+ fclr c08 ++$NORMAL: ++ addl INCX, INCX, INCX + -+ ble KK, $L48 -+ ble L, $L45 -+#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif ++ sra N, 2, I ++ ble I, $L15 + -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT, TMP1 -+ addl B, TMP1, BO ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a6, 0 * SIZE(X) ++ LD a7, 1 * SIZE(X) ++ SXADDQ INCX, X, X + -+ subl K, KK, TMP1 ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 + -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 + -+ ldi BO, 2 * SIZE(BO) -+ fclr c03 -+ ldi AO, 4 * SIZE(AO) -+ fclr c07 ++$L12: ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_R, t0 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_I, t1 + -+ ldi L, -2(TMP1) -+ fclr c04 -+ fclr c08 ++ MUL a2, ALPHA_I, t2 ++ LD a0, 0 * SIZE(X) ++ MUL a3, ALPHA_R, t3 ++ LD a1, 1 * SIZE(X) + -+ ble TMP1, $L48 -+ ble L, $L45 -+#endif -+ .align 5 ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X + -+$L42: -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b1, t1 -+ unop ++ MUL a4, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) + -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ ldi L, -2(L) -+ MUL a2, b1, t2 -+ unop ++ MUL a4, ALPHA_I, t2 ++ LD a2, 0 * SIZE(X) ++ MUL a5, ALPHA_R, t3 ++ LD a3, 1 * SIZE(X) + -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b1, t3 -+ unop ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 ++ SXADDQ INCX, X, X + -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 2 * SIZE(BO) ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) ++ MUL a6, ALPHA_I, t2 ++ LD a4, 0 * SIZE(X) ++ MUL a7, ALPHA_R, t3 ++ LD a5, 1 * SIZE(X) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X + -+ ADD1 c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) ++ MUL a0, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a1, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) + -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+ unop -+ MUL a4, b2, t4 -+ LD a5, 3 * SIZE(AO) ++ MUL a0, ALPHA_I, t2 ++ LD a6, 0 * SIZE(X) ++ MUL a1, ALPHA_R, t3 ++ LD a7, 1 * SIZE(X) + -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b3, t1 -+ LD b2, -1 * SIZE(BO) ++ SUB t0, t1, t4 ++ ldi I, -1(I) ++ ADD t2, t3, t5 ++ SXADDQ INCX, XX, XX + -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b3, t2 ++ fillde PREFETCHSIZE * SIZE(X) + unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 + -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ unop -+ MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) ++$L13: ++ MUL a2, ALPHA_R, t0 ++ MUL a3, ALPHA_I, t1 ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_I, t2 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_R, t3 + -+ ADD2 c08, t4, b5 -+ fmov b5, c08 ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 + unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) ++ ST t6, 0 * SIZE(XX) ++ MUL a4, ALPHA_R, t0 ++ ST t7, 1 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ MUL a4, ALPHA_I, t2 ++ MUL a5, ALPHA_R, t3 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 + unop -+ MUL a2, b4, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD1 c03, t3, b5 -+ fmov b5, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L42 -+ .align 4 + -+$L45: -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+ MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L47 -+#else -+ blbs TMP1, $L47 -+#endif -+ .align 4 ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) + -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, t3 ++ MUL a6, ALPHA_I, t2 ++ MUL a7, ALPHA_R, t3 + -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 0 * SIZE(BO) ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) ++ ST t6, 0 * SIZE(XX) ++ ST t7, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ .align 4 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++$L15: ++ and N, 3, I + unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD1 c03, t3, b5 -+ fmov b5, c03 + unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) ++ ble I, $L999 + .align 4 + -+$L47: -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, t3 -+ -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+ MUL a4, b1, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b2, t1 -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, t2 -+ ADD1 c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b2, t3 -+ -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, t4 -+ ldi BO, 2 * SIZE(BO) -+ -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+ -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 -+ ADD c03, c08, b5 -+ fmov b5, c03 -+ ADD c04, c07, b5 -+ fmov b5, c04 ++$L17: ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X + -+$L48: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else -+ subl KK, 1, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) ++ ST t4, 0 * SIZE(XX) ++ ST t5, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+#endif ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 + -+#ifdef LN -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ LD a3, 4 * SIZE(AO) -+ LD a4, 5 * SIZE(AO) ++$L999: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zsum.S b/kernel/sw_64/zsum.S +new file mode 100644 +index 000000000..e42bba8d1 +--- /dev/null ++++ b/kernel/sw_64/zsum.S +@@ -0,0 +1,210 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+ MUL a3, c03, t1 -+ MUL a3, c04, t2 ++#define ASSEMBLER ++#include "common.h" + -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ MUL a4, c04, t1 -+ MUL a4, c03, t2 + -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 ++#define PREFETCHSIZE 88 + -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+#endif ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 + -+#ifdef LT -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 ++ PROLOGUE ++ PROFCODE + -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c04, t2, b5 -+ fmov b5, c04 ++ fclr s0 ++ unop ++ fclr t0 ++ addl INCX, INCX, INCX + -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ ADD6 c03, t1, b5 -+ fmov b5, c03 -+ ADD5 c04, t2, b5 -+ fmov b5, c04 ++ fclr s1 ++ unop ++ fclr t1 ++ ble N, $L999 + -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) ++ beq INCX, $L999 + -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 ++ fclr s2 ++ sra N, 2, I ++ fclr s3 ++ ble I, $L15 + -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+#endif ++ LD a0, 0 * SIZE(X) ++ fclr t2 ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X + -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++ LD a2, 0 * SIZE(X) ++ fclr t3 ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c03, t3, b5 -+ fmov b5, c03 -+ ADD6 c04, t4, b5 -+ fmov b5, c04 -+#endif ++ ble I, $L13 ++ .align 4 + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+#endif ++$L12: ++ ADD s0, t0, s0 ++ s_fillcs PREFETCHSIZE * SIZE(X) ++ fmov a0, t0 ++ ldi I, -1(I) + -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+#endif ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fmov a1, t1 ++ unop + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) ++ ADD s2, t2, s2 ++ LD a7, 1 * SIZE(X) ++ fmov a2, t2 ++ SXADDQ INCX, X, X + -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+#endif ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fmov a3, t3 ++ unop + -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ADD s0, t0, s0 ++ LD a1, 1 * SIZE(X) ++ fmov a4, t0 ++ SXADDQ INCX, X, X + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl BO, TMP2, BO -+#endif ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fmov a5, t1 ++ unop + -+#ifdef LT -+ addl KK, 2, KK -+#endif ++ ADD s2, t2, s2 ++ LD a3, 1 * SIZE(X) ++ fmov a6, t2 ++ SXADDQ INCX, X, X + -+#ifdef LN -+ subl KK, 2, KK -+#endif ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fmov a7, t3 ++ unop + -+ ldi I, -1(I) -+ bgt I, $L41 ++ LD a5, 1 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 + .align 4 + -+$L50: -+ and M, 1, I -+ ble I, $L59 ++$L13: ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fmov a0, t0 + -+#if defined(LT) || defined(RN) ++ ADD s1, t1, s1 ++ LD a7, 1 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++ ADD s2, t2, s2 ++ fmov a2, t2 ++ ADD s3, t3, s3 ++ fmov a3, t3 + -+ LD b1, 0 * SIZE(B) -+ fclr c01 -+ LD b2, 1 * SIZE(B) -+ fclr c05 ++ ADD s0, t0, s0 ++ fmov a4, t0 ++ ADD s1, t1, s1 ++ fmov a5, t1 ++ ADD s2, t2, s2 ++ fmov a6, t2 ++ ADD s3, t3, s3 ++ fmov a7, t3 + -+ LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 + -+ ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(B) ++ .align 4 + -+ ldi L, -2(KK) ++$L15: ++ ADD s0, s2, s0 ++ and N, 3, I ++ ADD s1, s3, s1 ++ ble I, $L999 ++ .align 4 + -+ ble KK, $L58 -+ ble L, $L55 -+#else -+#ifdef LN -+ sll K, ZBASE_SHIFT, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif ++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ fmov a0, t0 ++ ldi I, -1(I) + -+ sll KK, ZBASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT, TMP1 -+ addl B, TMP1, BO ++ ADD s1, t1, s1 ++ LD a1, 1 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X + -+ subl K, KK, TMP1 ++ bne I, $L17 ++ .align 4 + -+ LD a1, 0 * SIZE(AO) -+ fclr t1 -+ LD a2, 1 * SIZE(AO) -+ fclr t2 -+ LD a3, 2 * SIZE(AO) -+ fclr t3 -+ LD a4, 3 * SIZE(AO) -+ fclr t4 ++$L999: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 + -+ LD b1, 0 * SIZE(BO) -+ fclr c01 -+ LD b2, 1 * SIZE(BO) -+ fclr c05 ++ ADD s0, s1, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zswap.S b/kernel/sw_64/zswap.S +new file mode 100644 +index 000000000..6b4619ce9 +--- /dev/null ++++ b/kernel/sw_64/zswap.S +@@ -0,0 +1,247 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ + -+ LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 ++#define ASSEMBLER ++#include "common.h" + -+ ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(BO) + -+ ldi L, -2(TMP1) ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 + -+ ble TMP1, $L58 -+ ble L, $L55 ++ mov $21, $17 ++ ldl $18, 0($sp) ++ ldl $19, 8($sp) ++ ldl $20, 16($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 +#endif -+ .align 5 -+ -+$L52: -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ ldi AO, 4 * SIZE(AO) -+ MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) -+ -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ ldi L, -2(L) -+ MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) ++ beq $18, $SubEnd ++ beq $20, $SubEnd + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) ++ ble $16, $SubEnd # if n <= 0 goto $End + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) ++ cmpeq $18, 1, $1 ++ addl $18, $18, $18 ++ cmpeq $20, 1, $2 ++ addl $20, $20, $20 + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ unop ++ sra $16, 2, $21 ++ and $1, $2, $1 ++ and $16, 3, $22 ++ beq $1, $Sub + -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L52 ++ ble $21, $MainRemain + .align 4 + -+$L55: -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L57 -+#else -+ blbs TMP1, $L57 -+#endif -+ .align 4 ++$MainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f12, 2*SIZE($19) ++ LD $f13, 3*SIZE($19) ++ LD $f14, 4*SIZE($19) ++ LD $f15, 5*SIZE($19) ++ LD $f16, 6*SIZE($19) ++ LD $f17, 7*SIZE($19) ++ ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ LD $f22, 2*SIZE($17) ++ LD $f23, 3*SIZE($17) ++ LD $f24, 4*SIZE($17) ++ LD $f25, 5*SIZE($17) ++ LD $f26, 6*SIZE($17) ++ LD $f27, 7*SIZE($17) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ fillde 16*SIZE($17) + unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) ++ fillde 16*SIZE($19) ++ subl $21, 1, $21 + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f12, 2*SIZE($17) ++ ST $f13, 3*SIZE($17) ++ ST $f14, 4*SIZE($17) ++ ST $f15, 5*SIZE($17) ++ ST $f16, 6*SIZE($17) ++ ST $f17, 7*SIZE($17) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) ++ ST $f22, 2*SIZE($19) ++ ST $f23, 3*SIZE($19) ++ ST $f24, 4*SIZE($19) ++ ST $f25, 5*SIZE($19) ++ ST $f26, 6*SIZE($19) ++ ST $f27, 7*SIZE($19) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi AO, 2 * SIZE(AO) ++ ldi $17, 8*SIZE($17) ++ ldi $19, 8*SIZE($19) ++ bgt $21, $MainLoop + .align 4 + -+$L57: -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b1, t2 -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b2, t3 ++$MainRemain: ++ ble $22, $MainEnd ++ .align 4 + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, t4 -+ ldi BO, 2 * SIZE(BO) ++$MainRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ ADD2 c06, t4, b5 -+ fmov b5, c06 ++ ldi $17, 2*SIZE($17) ++ ldi $19, 2*SIZE($19) ++ subl $22, 1, $22 ++ ST $f10, -2*SIZE($17) ++ ST $f11, -1*SIZE($17) ++ ST $f20, -2*SIZE($19) ++ ST $f21, -1*SIZE($19) ++ bgt $22, $MainRemainLoop ++ .align 4 + -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 ++$MainEnd: ++ clr $0 ++ ret ++ .align 4 + -+$L58: -+#if defined(LN) || defined(RT) -+ subl KK, 1, TMP1 ++$Sub: ++ mov $17, $23 ++ mov $19, $24 ++ ble $21, $SubRemain ++ .align 4 + -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) -+#endif ++$SubLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ SXADDQ $20, $19, $19 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++ LD $f12, 0*SIZE($19) ++ LD $f13, 1*SIZE($19) ++ SXADDQ $20, $19, $19 + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++ LD $f14, 0*SIZE($19) ++ LD $f15, 1*SIZE($19) ++ SXADDQ $20, $19, $19 + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+#endif ++ LD $f16, 0*SIZE($19) ++ LD $f17, 1*SIZE($19) ++ SXADDQ $20, $19, $19 + -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ SXADDQ $18, $17, $17 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++ LD $f22, 0*SIZE($17) ++ LD $f23, 1*SIZE($17) ++ SXADDQ $18, $17, $17 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+#endif ++ LD $f24, 0*SIZE($17) ++ LD $f25, 1*SIZE($17) ++ SXADDQ $18, $17, $17 + -+#if defined(RN) || defined(RT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) ++ LD $f26, 0*SIZE($17) ++ LD $f27, 1*SIZE($17) ++ SXADDQ $18, $17, $17 + -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++ ST $f10, 0*SIZE($23) ++ ST $f11, 1*SIZE($23) ++ SXADDQ $18, $23, $23 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+#endif ++ ST $f12, 0*SIZE($23) ++ ST $f13, 1*SIZE($23) ++ SXADDQ $18, $23, $23 + -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+#endif ++ ST $f14, 0*SIZE($23) ++ ST $f15, 1*SIZE($23) ++ SXADDQ $18, $23, $23 + -+#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+#endif ++ ST $f16, 0*SIZE($23) ++ ST $f17, 1*SIZE($23) ++ SXADDQ $18, $23, $23 + -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) ++ ST $f20, 0*SIZE($24) ++ ST $f21, 1*SIZE($24) ++ SXADDQ $20, $24, $24 + -+#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+#endif ++ ST $f22, 0*SIZE($24) ++ ST $f23, 1*SIZE($24) ++ SXADDQ $20, $24, $24 + -+#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif ++ ST $f24, 0*SIZE($24) ++ ST $f25, 1*SIZE($24) ++ SXADDQ $20, $24, $24 + -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl BO, TMP2, BO -+#endif ++ ST $f26, 0*SIZE($24) ++ ST $f27, 1*SIZE($24) ++ SXADDQ $20, $24, $24 + -+#ifdef LT -+ addl KK, 1, KK -+#endif ++ subl $21, 1, $21 ++ bgt $21, $SubLoop ++ .align 4 + -+#ifdef LN -+ subl KK, 1, KK -+#endif ++$SubRemain: ++ ble $22, $SubEnd + .align 4 + -+$L59: -+#ifdef LN -+ sll K, ZBASE_SHIFT, TMP1 -+ addl B, TMP1, B -+#endif ++$SubRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) + -+#if defined(LT) || defined(RN) -+ mov BO, B -+#endif ++ subl $22, 1, $22 + -+#ifdef RN -+ addl KK, 1, KK -+#endif ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) + -+#ifdef RT -+ subl KK, 1, KK -+#endif ++ SXADDQ $18, $17, $17 ++ SXADDQ $20, $19, $19 ++ bgt $22, $SubRemainLoop + .align 4 + -+$L999: -+ fldd $f2, 0($sp) -+ fldd $f3, 8($sp) -+ fldd $f4, 16($sp) -+ fldd $f5, 24($sp) -+ fldd $f6, 32($sp) -+ fldd $f7, 40($sp) -+ fldd $f8, 48($sp) -+ fldd $f9, 56($sp) -+ ldl tmp, 72($sp) -+ ++$SubEnd: + clr $0 -+ ldi $sp, STACKSIZE($sp) + ret -+ .ident VERSION -+ .end CNAME -diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak ++ EPILOGUE +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S b/kernel/sw_64/ztrsm_kernel_2x2_LN.S new file mode 100644 -index 0000000..f4a2c13 +index 000000000..23eb83196 --- /dev/null -+++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak -@@ -0,0 +1,2222 @@ ++++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S +@@ -0,0 +1,2230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -93177,21 +32435,22 @@ index 0000000..f4a2c13 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#if !defined(SW2B) ++ ++#if !defined(SW8A) +#error "Architecture is not specified." +#endif + -+#ifdef SW2B ++#ifdef SW8A +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + + ++ + .set noat + .set noreorder -+ .arch ev6 ++ .arch sw8a + +.text + .align 5 @@ -93366,7 +32625,7 @@ index 0000000..f4a2c13 + sra N, 1, J + ble J, $L30 + .align 4 -+ ++ +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 @@ -93381,20 +32640,439 @@ index 0000000..f4a2c13 + addl C2, LDC, C +#endif + -+#ifdef LN -+ addl M, OFFSET, KK ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ble I, $L20 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD4 c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO +#endif + +#ifdef LT -+ mov OFFSET, KK ++ addl KK, 1, KK +#endif + -+#if defined(LN) || defined(RT) -+ mov A, AORIG -+#else -+ mov A, AO ++#ifdef LN ++ subl KK, 1, KK +#endif ++ .align 4 + ++$L20: + sra M, 1, I + fclr t1 + fclr t2 @@ -93404,7 +33082,7 @@ index 0000000..f4a2c13 + fclr c01 + fclr c05 + -+ ble I, $L20 ++ ble I, $L29 + .align 4 + +$L11: @@ -93435,12 +33113,12 @@ index 0000000..f4a2c13 + ldi AO, 4 * SIZE(AO) + fclr c15 + -+ fillcs 4 * SIZE(C1) ++ fillde 4 * SIZE(C1) + fclr c04 + ldi L, -2(KK) + fclr c08 + -+ fillcs 4 * SIZE(C2) ++ fillde 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 @@ -93482,12 +33160,12 @@ index 0000000..f4a2c13 + ldi AO, 4 * SIZE(AO) + fclr c15 + -+ fillcs 4 * SIZE(C1) ++ fillde 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + -+ fillcs 4 * SIZE(C2) ++ fillde 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 @@ -93499,13 +33177,13 @@ index 0000000..f4a2c13 +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) ++ s_fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) ++ s_fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif @@ -93828,7 +33506,7 @@ index 0000000..f4a2c13 + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) -+ ++ + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) @@ -93848,7 +33526,7 @@ index 0000000..f4a2c13 + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) -+ ++ + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) @@ -93905,7 +33583,7 @@ index 0000000..f4a2c13 + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 -+ ++ + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + @@ -93993,516 +33671,56 @@ index 0000000..f4a2c13 + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ ADD5 c03, t3, c03 -+ ADD6 c04, t4, c04 -+ -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c03, t3 -+ MUL a3, c04, t4 -+ -+ SUB c09, t1, c09 -+ SUB c10, t2, c10 -+ SUB c11, t3, c11 -+ SUB c12, t4, c12 -+ -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ MUL a4, c04, t3 -+ MUL a4, c03, t4 -+ -+ ADD6 c09, t1, c09 -+ ADD5 c10, t2, c10 -+ ADD6 c11, t3, c11 -+ ADD5 c12, t4, c12 -+ -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 -+ -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 -+ -+ ADD5 c09, t1, c09 -+ ADD6 c10, t2, c10 -+ ADD5 c11, t3, c11 -+ ADD6 c12, t4, c12 -+#endif -+ -+#ifdef RT -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ LD a3, 4 * SIZE(BO) -+ LD a4, 5 * SIZE(BO) -+ -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 -+ -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ MUL a1, c11, c11 -+ MUL a1, c12, c12 -+ -+ ADD5 c09, t1, c09 -+ ADD6 c10, t2, c10 -+ ADD5 c11, t3, c11 -+ ADD6 c12, t4, c12 -+ -+ MUL a3, c09, t1 -+ MUL a3, c10, t2 -+ MUL a3, c11, t3 -+ MUL a3, c12, t4 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ SUB c03, t3, c03 -+ SUB c04, t4, c04 -+ -+ MUL a4, c10, t1 -+ MUL a4, c09, t2 -+ MUL a4, c12, t3 -+ MUL a4, c11, t4 -+ -+ ADD6 c01, t1, c01 -+ ADD5 c02, t2, c02 -+ ADD6 c03, t3, c03 -+ ADD5 c04, t4, c04 -+ -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ ADD5 c03, t3, c03 -+ ADD6 c04, t4, c04 -+#endif -+ -+#if defined(LN) || defined(LT) -+ ST c01, 0 * SIZE(BO) -+ ST c02, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c10, 3 * SIZE(BO) -+ -+ ST c03, 4 * SIZE(BO) -+ ST c04, 5 * SIZE(BO) -+ ST c11, 6 * SIZE(BO) -+ ST c12, 7 * SIZE(BO) -+#else -+ ST c01, 0 * SIZE(AO) -+ ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) -+ -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c11, 6 * SIZE(AO) -+ ST c12, 7 * SIZE(AO) -+#endif -+ -+#ifdef LN -+ ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) -+#endif -+ -+ ST c01, 0 * SIZE(C1) -+ ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) -+ -+ ST c09, 0 * SIZE(C2) -+ ST c10, 1 * SIZE(C2) -+ ST c11, 2 * SIZE(C2) -+ ST c12, 3 * SIZE(C2) -+ -+#ifndef LN -+ ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) -+#endif -+ -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 -+ addl AORIG, TMP1, AORIG -+#endif -+ -+#if defined(LT) || defined(RN) -+ subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO -+#endif -+ -+#ifdef LT -+ addl KK, 2, KK -+#endif -+ -+#ifdef LN -+ subl KK, 2, KK -+#endif -+ fclr c01 -+ fclr c05 -+ -+ ldi I, -1(I) -+ bgt I, $L11 -+ .align 4 -+ -+$L20: -+ and M, 1, I -+ ble I, $L29 -+ -+#if defined(LT) || defined(RN) -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(B) -+ fclr c10 -+ LD b2, 1 * SIZE(B) -+ fclr c14 -+ -+ LD b3, 2 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) -+ LD b4, 3 * SIZE(B) -+ ldi BO, 4 * SIZE(B) -+ -+ ldi L, -2(KK) -+ -+ ble KK, $L28 -+ ble L, $L25 -+#else -+#ifdef LN -+ sll K, ZBASE_SHIFT + 0, TMP1 -+ subl AORIG, TMP1, AORIG -+#endif -+ -+ sll KK, ZBASE_SHIFT + 0, TMP1 -+ addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT + 1, TMP1 -+ addl B, TMP1, BO -+ -+ subl K, KK, TMP1 -+ -+ LD a1, 0 * SIZE(AO) -+ fclr c09 -+ LD a2, 1 * SIZE(AO) -+ fclr c13 -+ -+ LD a3, 2 * SIZE(AO) -+ fclr c02 -+ LD a4, 3 * SIZE(AO) -+ fclr c06 -+ -+ LD b1, 0 * SIZE(BO) -+ fclr c10 -+ LD b2, 1 * SIZE(BO) -+ fclr c14 -+ -+ LD b3, 2 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) -+ LD b4, 3 * SIZE(BO) -+ ldi BO, 4 * SIZE(BO) -+ -+ ldi L, -2(TMP1) -+ -+ ble TMP1, $L28 -+ ble L, $L25 -+#endif -+ .align 5 -+ -+$L22: -+ ADD1 c09, t1, c09 -+ unop -+ MUL a1, b1, t1 -+ unop -+ -+ ADD3 c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD4 c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) -+ -+ ADD2 c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD4 c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD2 c06, t4, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ -+ ADD1 c09, t1, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD3 c10, t2, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD4 c13, t3, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD2 c14, t4, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) -+ -+ ADD1 c01, t1, c01 -+ ldi L, -2(L) -+ MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) -+ -+ ADD3 c02, t2, c02 -+ unop -+ MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD4 c05, t3, c05 -+ unop -+ MUL a3, b5, t3 -+ LD a3, 0 * SIZE(AO) -+ -+ ADD2 c06, t4, c06 -+ MUL a4, b5, t4 -+ LD a4, 1 * SIZE(AO) -+ bgt L, $L22 -+ .align 4 -+ -+$L25: -+ ADD1 c09, t1, c09 -+ MUL a1, b1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L27 -+#else -+ blbs TMP1, $L27 -+#endif -+ .align 4 -+ -+ ADD3 c10, t2, c10 -+ unop -+ MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) -+ -+ ADD4 c13, t3, c13 -+ unop -+ MUL a1, b2, t3 -+ unop -+ -+ ADD2 c14, t4, c14 -+ unop -+ MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD3 c02, t2, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD4 c05, t3, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD2 c06, t4, c06 -+ unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) -+ -+ ADD1 c09, t1, c09 -+ LD b4, 3 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) -+ .align 4 -+ -+$L27: -+ ADD3 c10, t2, c10 -+ MUL a2, b1, t2 -+ ADD4 c13, t3, c13 -+ MUL a1, b2, t3 -+ -+ ADD2 c14, t4, c14 -+ MUL a2, b2, t4 -+ ADD1 c01, t1, c01 -+ MUL a1, b3, t1 -+ -+ ADD3 c02, t2, c02 -+ MUL a2, b3, t2 -+ ADD4 c05, t3, c05 -+ MUL a1, b4, t3 -+ -+ ADD2 c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, t4 -+ ldi BO, 4 * SIZE(BO) -+ -+ ADD1 c09, t1, c09 -+ ADD3 c10, t2, c10 -+ ADD4 c13, t3, c13 -+ ADD2 c14, t4, c14 -+ -+ ADD c01, c06, c01 -+ ADD c02, c05, c02 -+ ADD c09, c14, c09 -+ ADD c10, c13, c10 -+ .align 4 -+ -+$L28: -+#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl B, TMP2, BO -+#else -+ ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c09, c09 -+ SUB a4, c10, c10 -+#else -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ -+ SUB a1, c01, c01 -+ SUB a2, c02, c02 -+ SUB a3, c09, c09 -+ SUB a4, c10, c10 -+#endif -+ -+#if defined(LN) || defined(LT) -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 -+ -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ MUL a1, c09, c09 -+ MUL a1, c10, c10 -+ -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 -+ ADD5 c09, t3, c09 -+ ADD6 c10, t4, c10 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ + MUL a1, c01, c01 + MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ + SUB c09, t1, c09 + SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ + MUL a1, c09, c09 + MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 +#endif + +#ifdef RT @@ -94513,32 +33731,56 @@ index 0000000..f4a2c13 + + MUL a2, c10, t1 + MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ + MUL a1, c09, c09 + MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ + SUB c01, t1, c01 + SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 -+ ++ ADD6 c03, t3, c03 ++ ADD5 c04, t4, c04 ++ + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ + MUL a1, c01, c01 + MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) @@ -94546,48 +33788,72 @@ index 0000000..f4a2c13 + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c10, 3 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN -+ ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) + +#ifndef LN -+ ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) +#endif + ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ +#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 ++ sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl BO, TMP2, BO ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO +#endif + +#ifdef LT -+ addl KK, 1, KK ++ addl KK, 2, KK +#endif + +#ifdef LN -+ subl KK, 1, KK ++ subl KK, 2, KK +#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 + .align 4 + +$L29: @@ -94641,11 +33907,9 @@ index 0000000..f4a2c13 + mov A, AO +#endif + -+ sra M, 1, I ++ and M, 1, I + ble I, $L50 -+ .align 4 + -+$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) @@ -94661,29 +33925,26 @@ index 0000000..f4a2c13 + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 ++ + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + ++ ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) -+ fclr c03 -+ ldi AO, 4 * SIZE(AO) -+ fclr c07 + + ldi L, -2(KK) -+ fclr c04 -+ fclr c08 + -+ ble KK, $L48 -+ ble L, $L45 ++ ble KK, $L58 ++ ble L, $L55 +#else +#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 ++ sll K, ZBASE_SHIFT, TMP1 + subl AORIG, TMP1, AORIG +#endif + -+ sll KK, ZBASE_SHIFT + 1, TMP1 ++ sll KK, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO @@ -94703,263 +33964,149 @@ index 0000000..f4a2c13 + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 ++ + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + ++ ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) -+ fclr c03 -+ ldi AO, 4 * SIZE(AO) -+ fclr c07 + + ldi L, -2(TMP1) -+ fclr c04 -+ fclr c08 + -+ ble TMP1, $L48 -+ ble L, $L45 ++ ble TMP1, $L58 ++ ble L, $L55 +#endif + .align 5 + -+$L42: -+ ADD4 c05, t1, c05 ++$L52: ++ ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + -+ ADD2 c06, t2, c06 -+ ldi L, -2(L) ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 -+ unop ++ LD b1, 2 * SIZE(BO) + -+ ADD4 c07, t3, c07 -+ unop -+ MUL a3, b1, t3 -+ unop ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) + -+ ADD2 c08, t4, c08 ++ ADD2 c06, t4, c06 + unop -+ MUL a4, b1, t4 -+ LD b1, 2 * SIZE(BO) ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) -+ -+ ADD1 c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD3 c04, t4, c04 -+ unop -+ MUL a4, b2, t4 -+ LD a5, 3 * SIZE(AO) -+ -+ ADD4 c05, t1, c05 -+ unop -+ MUL a1, b3, t1 -+ LD b2, -1 * SIZE(BO) -+ -+ ADD2 c06, t2, c06 -+ unop -+ MUL a2, b3, t2 + unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) + -+ ADD4 c07, t3, c07 ++ ADD4 c05, t3, c05 + unop -+ MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) + -+ ADD2 c08, t4, c08 ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) + unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) + -+ ADD1 c01, t1, c01 ++ LD a4, 1 * SIZE(AO) + unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) -+ -+ ADD3 c02, t2, c02 + unop -+ MUL a2, b4, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD1 c03, t3, c03 -+ LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD3 c04, t4, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L42 ++ bgt L, $L52 + .align 4 + -+$L45: -+ ADD4 c05, t1, c05 -+ MUL b1, a1, t1 ++$L55: ++ ADD1 c01, t1, c01 ++ MUL a1, b1, t1 +#if defined(LT) || defined(RN) -+ blbs KK, $L47 ++ blbs KK, $L57 +#else -+ blbs TMP1, $L47 ++ blbs TMP1, $L57 +#endif + .align 4 + -+ ADD2 c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, c07 -+ MUL a3, b1, t3 -+ -+ ADD2 c08, t4, c08 ++ ADD3 c02, t2, c02 + unop -+ MUL a4, b1, t4 ++ MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + -+ ADD1 c01, t1, c01 -+ unop -+ MUL a1, b2, t1 ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + -+ ADD3 c02, t2, c02 ++ ADD2 c06, t4, c06 + unop -+ MUL a2, b2, t2 ++ MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + -+ ADD1 c03, t3, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) -+ -+ ADD3 c04, t4, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD4 c05, t1, c05 -+ LD b2, 1 * SIZE(BO) ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) + .align 4 + -+$L47: -+ ADD2 c06, t2, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, c07 -+ MUL a3, b1, t3 -+ -+ ADD2 c08, t4, c08 -+ MUL a4, b1, t4 -+ ADD1 c01, t1, c01 -+ MUL a1, b2, t1 -+ ++$L57: + ADD3 c02, t2, c02 -+ MUL a2, b2, t2 -+ ADD1 c03, t3, c03 -+ MUL a3, b2, t3 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b2, t3 + -+ ADD3 c04, t4, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, t4 ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 + ldi BO, 2 * SIZE(BO) + -+ ADD4 c05, t1, c05 -+ ADD2 c06, t2, c06 -+ ADD4 c07, t3, c07 -+ ADD2 c08, t4, c08 ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 -+ ADD c03, c08, c03 -+ ADD c04, c07, c04 + -+$L48: ++$L58: +#if defined(LN) || defined(RT) -+#ifdef LN -+ subl KK, 2, TMP1 -+#else + subl KK, 1, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else -+ ldi AO, -4 * SIZE(AO) ++ ldi AO, -2 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ ++ + SUB a1, c01, c01 + SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) -+ ++ + SUB a1, c01, c01 + SUB a2, c02, c02 -+ SUB a3, c03, c03 -+ SUB a4, c04, c04 -+#endif -+ -+#ifdef LN -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ LD a3, 4 * SIZE(AO) -+ LD a4, 5 * SIZE(AO) -+ -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ ADD5 c03, t1, c03 -+ ADD6 c04, t2, c04 -+ MUL a3, c03, t1 -+ MUL a3, c04, t2 -+ -+ SUB c01, t1, c01 -+ SUB c02, t2, c02 -+ MUL a4, c04, t1 -+ MUL a4, c03, t2 -+ -+ ADD6 c01, t1, c01 -+ ADD5 c02, t2, c02 -+ -+ LD a1, 0 * SIZE(AO) -+ LD a2, 1 * SIZE(AO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, c01 -+ MUL a1, c02, c02 -+ -+ ADD5 c01, t1, c01 -+ ADD6 c02, t2, c02 +#endif + -+#ifdef LT ++#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 @@ -94968,27 +34115,6 @@ index 0000000..f4a2c13 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ -+ SUB c03, t1, c03 -+ SUB c04, t2, c04 -+ -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ ADD6 c03, t1, c03 -+ ADD5 c04, t2, c04 -+ -+ LD a1, 6 * SIZE(AO) -+ LD a2, 7 * SIZE(AO) -+ -+ MUL a2, c04, t1 -+ MUL a2, c03, t2 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 -+ -+ ADD5 c03, t1, c03 -+ ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) @@ -94997,74 +34123,60 @@ index 0000000..f4a2c13 + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 -+ + MUL a1, c01, c01 + MUL a1, c02, c02 -+ MUL a1, c03, c03 -+ MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 -+ ADD5 c03, t3, c03 -+ ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) -+ ST c03, 2 * SIZE(AO) -+ ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN -+ ldi C1, -4 * SIZE(C1) ++ ldi C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) -+ ST c03, 2 * SIZE(C1) -+ ST c04, 3 * SIZE(C1) + +#ifndef LN -+ ldi C1, 4 * SIZE(C1) ++ ldi C1, 2 * SIZE(C1) +#endif + +#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 ++ sll K, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ sll TMP1, ZBASE_SHIFT, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT -+ addl KK, 2, KK ++ addl KK, 1, KK +#endif + +#ifdef LN -+ subl KK, 2, KK ++ subl KK, 1, KK +#endif -+ -+ ldi I, -1(I) -+ bgt I, $L41 + .align 4 + +$L50: -+ and M, 1, I ++ sra M, 1, I + ble I, $L59 ++ .align 4 + ++$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) @@ -95080,26 +34192,29 @@ index 0000000..f4a2c13 + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 -+ + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + -+ ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 + + ldi L, -2(KK) ++ fclr c04 ++ fclr c08 + -+ ble KK, $L58 -+ ble L, $L55 ++ ble KK, $L48 ++ ble L, $L45 +#else +#ifdef LN -+ sll K, ZBASE_SHIFT, TMP1 ++ sll K, ZBASE_SHIFT + 1, TMP1 + subl AORIG, TMP1, AORIG +#endif + -+ sll KK, ZBASE_SHIFT, TMP1 ++ sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO @@ -95119,149 +34234,263 @@ index 0000000..f4a2c13 + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 -+ + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + -+ ldi AO, 2 * SIZE(AO) + ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 + + ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 + -+ ble TMP1, $L58 -+ ble L, $L55 ++ ble TMP1, $L48 ++ ble L, $L45 +#endif + .align 5 + -+$L52: -+ ADD1 c01, t1, c01 ++$L42: ++ ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + -+ ADD3 c02, t2, c02 -+ ldi AO, 4 * SIZE(AO) ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) + MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) ++ unop + -+ ADD4 c05, t3, c05 -+ ldi L, -2(L) -+ MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop + -+ ADD2 c06, t4, c06 ++ ADD2 c08, t4, c08 + unop -+ MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 + unop -+ MUL a4, b3, t2 ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + -+ ADD4 c05, t3, c05 ++ ADD1 c01, t1, c01 + unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) ++ LD a3, -2 * SIZE(AO) + -+ ADD2 c06, t4, c06 -+ MUL a4, b4, t4 ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) -+ unop -+ -+ LD a4, 1 * SIZE(AO) -+ unop -+ unop -+ bgt L, $L52 ++ bgt L, $L42 + .align 4 + -+$L55: -+ ADD1 c01, t1, c01 -+ MUL a1, b1, t1 ++$L45: ++ ADD4 c05, t1, c05 ++ MUL b1, a1, t1 +#if defined(LT) || defined(RN) -+ blbs KK, $L57 ++ blbs KK, $L47 +#else -+ blbs TMP1, $L57 ++ blbs TMP1, $L47 +#endif + .align 4 + -+ ADD3 c02, t2, c02 -+ unop ++ ADD2 c06, t2, c06 + MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + -+ ADD4 c05, t3, c05 -+ ldi BO, 2 * SIZE(BO) -+ MUL a1, b2, t3 ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + -+ ADD2 c06, t4, c06 ++ ADD3 c02, t2, c02 + unop -+ MUL a2, b2, t4 ++ MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + -+ ADD1 c01, t1, c01 -+ LD b2, -1 * SIZE(BO) ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 -+ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) + .align 4 + -+$L57: -+ ADD3 c02, t2, c02 ++$L47: ++ ADD2 c06, t2, c06 + MUL a2, b1, t2 -+ ADD4 c05, t3, c05 -+ MUL a1, b2, t3 -+ -+ ADD2 c06, t4, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b2, t4 -+ ldi BO, 2 * SIZE(BO) ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 + ++ ADD2 c08, t4, c08 ++ MUL a4, b1, t4 + ADD1 c01, t1, c01 ++ MUL a1, b2, t1 ++ + ADD3 c02, t2, c02 -+ ADD4 c05, t3, c05 -+ ADD2 c06, t4, c06 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 + -+$L58: ++$L48: +#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else + subl KK, 1, TMP1 -+ -+ sll TMP1, ZBASE_SHIFT, TMP2 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else -+ ldi AO, -2 * SIZE(AO) ++ ldi AO, -4 * SIZE(AO) + ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) -+ ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ + SUB a1, c01, c01 + SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 +#endif + -+#if defined(LN) || defined(LT) ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 @@ -95270,6 +34499,27 @@ index 0000000..f4a2c13 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) @@ -95278,52 +34528,68 @@ index 0000000..f4a2c13 + + MUL a2, c02, t1 + MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ + MUL a1, c01, c01 + MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN -+ ldi C1, -2 * SIZE(C1) ++ ldi C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) + +#ifndef LN -+ ldi C1, 2 * SIZE(C1) ++ ldi C1, 4 * SIZE(C1) +#endif + +#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 ++ sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT, TMP2 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + +#ifdef LT -+ addl KK, 1, KK ++ addl KK, 2, KK +#endif + +#ifdef LN -+ subl KK, 1, KK ++ subl KK, 2, KK +#endif ++ ++ ldi I, -1(I) ++ bgt I, $L41 + .align 4 + +$L59: @@ -95359,12 +34625,12 @@ index 0000000..f4a2c13 + ret + .ident VERSION + .end CNAME -diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S b/kernel/sw_64/ztrsm_kernel_2x2_RT.S +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S b/kernel/sw_64/ztrsm_kernel_2x2_LT.S new file mode 100644 -index 0000000..97dbc16 +index 000000000..1e8f2c926 --- /dev/null -+++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S -@@ -0,0 +1,2623 @@ ++++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S +@@ -0,0 +1,2223 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ @@ -95405,36 +34671,29 @@ index 0000000..97dbc16 + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + -+#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++ ++#if !defined(SW8A) +#error "Architecture is not specified." +#endif + -+#ifdef SW6 ++#ifdef SW8A +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + -+#ifdef EV5 -+#define PREFETCHSIZE 48 -+#define UNOP -+#endif + -+#ifdef EV4 -+#define UNOP -+#endif + + .set noat + .set noreorder -+ .arch sw6a ++ .arch sw8a + +.text + .align 5 + .globl CNAME + .ent CNAME + -+#define STACKSIZE 88 ++#define STACKSIZE 80 + +#define M $16 +#define N $17 @@ -95453,8 +34712,6 @@ index 0000000..97dbc16 +#define J $7 +#define L $8 + -+#define tmp $9 -+ +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 @@ -95570,7 +34827,6 @@ index 0000000..97dbc16 + fstd $f7, 40($sp) + fstd $f8, 48($sp) + fstd $f9, 56($sp) -+ stl tmp, 72($sp) + + cmple M, 0, $0 + cmple N, 0, $1 @@ -95602,18 +34858,22 @@ index 0000000..97dbc16 + subl N, OFFSET, KK +#endif + -+ and N, 1, J ++ sra N, 1, J + ble J, $L30 ++ .align 4 + ++$L01: +#ifdef RT -+ sll K, ZBASE_SHIFT, TMP1 ++ sll K, ZBASE_SHIFT + 1, TMP1 + subl B, TMP1, B + -+ subl C, LDC, C1 -+ subl C, LDC, C ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C +#else + mov C, C1 -+ addl C, LDC, C ++ addl C, LDC, C2 ++ addl C2, LDC, C +#endif + +#ifdef LN @@ -95631,41 +34891,55 @@ index 0000000..97dbc16 +#endif + + sra M, 1, I -+ ble I, $L50 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 + .align 4 + -+$L41: ++$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) -+ fclr t1 ++ fclr c09 + LD a2, 1 * SIZE(AO) -+ fclr t2 ++ fclr c13 ++ + LD a3, 2 * SIZE(AO) -+ fclr t3 ++ fclr c02 + LD a4, 3 * SIZE(AO) -+ fclr t4 ++ fclr c06 + + LD b1, 0 * SIZE(B) -+ fclr c01 ++ fclr c10 + LD b2, 1 * SIZE(B) -+ fclr c05 ++ fclr c14 ++ + LD b3, 2 * SIZE(B) -+ fclr c02 ++ fclr c03 + LD b4, 3 * SIZE(B) -+ fclr c06 ++ fclr c07 + -+ ldi BO, 2 * SIZE(B) -+ fclr c03 ++ ldi BO, 4 * SIZE(B) ++ fclr c11 + ldi AO, 4 * SIZE(AO) -+ fclr c07 ++ fclr c15 + -+ ldi L, -2(KK) ++ fillde 4 * SIZE(C1) + fclr c04 ++ ldi L, -2(KK) + fclr c08 + -+ ble KK, $L48 -+ ble L, $L45 ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 @@ -95674,256 +34948,374 @@ index 0000000..97dbc16 + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) -+ fclr t1 ++ fclr c09 + LD a2, 1 * SIZE(AO) -+ fclr t2 ++ fclr c13 ++ + LD a3, 2 * SIZE(AO) -+ fclr t3 ++ fclr c02 + LD a4, 3 * SIZE(AO) -+ fclr t4 ++ fclr c06 + + LD b1, 0 * SIZE(BO) -+ fclr c01 ++ fclr c10 + LD b2, 1 * SIZE(BO) -+ fclr c05 ++ fclr c14 ++ + LD b3, 2 * SIZE(BO) -+ fclr c02 ++ fclr c03 + LD b4, 3 * SIZE(BO) -+ fclr c06 ++ fclr c07 + -+ ldi BO, 2 * SIZE(BO) -+ fclr c03 ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 + ldi AO, 4 * SIZE(AO) -+ fclr c07 ++ fclr c15 + -+ ldi L, -2(TMP1) ++ fillde 4 * SIZE(C1) + fclr c04 ++ ldi L, -2(TMP1) + fclr c08 + -+ ble TMP1, $L48 -+ ble L, $L45 ++ fillde 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 +#endif + .align 5 + -+$L42: -+ ADD4 c05, t1, b5 -+ fmov b5, c05 ++$L12: ++/* 1 */ ++ ADD1 c11, t1, c11 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(AO) ++#else + unop -+ MUL a1, b1, t1 ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ s_fillcs PREFETCHSIZE * SIZE(BO) ++#else + unop ++#endif + -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ ldi L, -2(L) -+ MUL a2, b1, t2 ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 + unop + -+ ADD4 c07, t3, b5 -+ fmov b5, c07 ++ ADD2 c16, t3, c16 + unop -+ MUL a3, b1, t3 ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 + unop + -+ ADD2 c08, t4, b5 -+ fmov b5, c08 ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a1, t4 + unop -+ MUL a4, b1, t4 -+ LD b1, 2 * SIZE(BO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++/* 3 */ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 + unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ ldi BO, 4 * SIZE(BO) -+ MUL a2, b2, t2 ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + -+ ADD1 c03, t3, b5 -+ fmov b5, c03 ++ ADD4 c13, t4, c13 + unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ ADD3 c04, t4, b5 -+ fmov b5, c04 ++/* 4 */ ++ ADD1 c09, t1, c09 + unop -+ MUL a4, b2, t4 -+ LD a5, 3 * SIZE(AO) ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) + -+ ADD4 c05, t1, b5 -+ fmov b5, c05 ++ ADD3 c10, t2, c10 + unop -+ MUL a1, b3, t1 -+ LD b2, -1 * SIZE(BO) ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ ADD2 c06, t2, b5 -+ fmov b5, c06 ++ ADD2 c14, t3, c14 + unop -+ MUL a2, b3, t2 ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, c11 ++ unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 + unop + -+ ADD4 c07, t3, b5 -+ fmov b5, c07 ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 + unop -+ MUL a3, b3, t3 -+ ldi AO, 8 * SIZE(AO) ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) + -+ ADD2 c08, t4, b5 -+ fmov b5, c08 ++ ADD1 c09, t1, c09 + unop -+ MUL a5, b3, t4 -+ LD b3, 0 * SIZE(BO) ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++ ADD3 c10, t2, c10 + unop -+ MUL a1, b4, t1 -+ LD a1, -4 * SIZE(AO) ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ADD2 c14, t3, c14 + unop -+ MUL a2, b4, t2 -+ LD a2, -3 * SIZE(AO) -+ -+ ADD1 c03, t3, b5 -+ fmov b5, c03 ++ MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) -+ MUL a3, b4, t3 -+ LD a3, -2 * SIZE(AO) + -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+ MUL a5, b4, t4 -+ LD b4, 1 * SIZE(BO) -+ bgt L, $L42 -+ .align 4 ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) + -+$L45: -+ ADD4 c05, t1, b5 -+ fmov b5, c05 ++ ADD1 c11, t1, c11 ++ LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 -+#if defined(LT) || defined(RN) -+ blbs KK, $L47 -+#else -+ blbs TMP1, $L47 -+#endif ++ ldi BO, 4 * SIZE(BO) + .align 4 + -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, t3 -+ -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+ unop -+ MUL a4, b1, t4 -+ LD b1, 0 * SIZE(BO) ++$L17: ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b2, t1 -+ LD a1, 0 * SIZE(AO) ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b2, t2 -+ LD a2, 1 * SIZE(AO) ++ ADD3 c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 + -+ ADD1 c03, t3, b5 -+ fmov b5, c03 -+ unop -+ MUL a3, b2, t3 -+ LD a3, 2 * SIZE(AO) ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, c03 ++ MUL b3, a1, t1 + -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+ MUL a4, b2, t4 -+ LD a4, 3 * SIZE(AO) -+ ldi AO, 4 * SIZE(AO) ++ ADD3 c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD2 c08, t3, c08 ++ MUL b4, a2, t3 + -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+ LD b2, 1 * SIZE(BO) -+ MUL a1, b1, t1 -+ ldi BO, 2 * SIZE(BO) -+ .align 4 ++ ADD4 c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1, c09 ++ MUL b3, a3, t1 + -+$L47: -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ MUL a2, b1, t2 -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ MUL a3, b1, t3 ++ ADD3 c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, c14 ++ MUL b4, a4, t3 + -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+ MUL a4, b1, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b2, t1 ++ ADD4 c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b2, t2 -+ ADD1 c03, t3, b5 -+ fmov b5, c03 -+ MUL a3, b2, t3 ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 + -+ ADD3 c04, t4, b5 -+ fmov b5, c04 -+ ldi AO, 4 * SIZE(AO) -+ MUL a4, b2, t4 -+ ldi BO, 2 * SIZE(BO) ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 + -+ ADD4 c05, t1, b5 -+ fmov b5, c05 -+ ADD2 c06, t2, b5 -+ fmov b5, c06 -+ ADD4 c07, t3, b5 -+ fmov b5, c07 -+ ADD2 c08, t4, b5 -+ fmov b5, c08 -+ -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 -+ ADD c03, c08, b5 -+ fmov b5, c03 -+ ADD c04, c07, b5 -+ fmov b5, c04 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ ADD c11, c16, c11 ++ ADD c12, c15, c12 ++ .align 4 + -+$L48: ++$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else -+ subl KK, 1, TMP1 ++ subl KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) ++ ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) @@ -95932,28 +35324,40 @@ index 0000000..97dbc16 + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++ ++ SUB b1, c03, c03 ++ SUB b2, c04, c04 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 +#endif + +#ifdef LN @@ -95964,44 +35368,56 @@ index 0000000..97dbc16 + + MUL a2, c04, t1 + MUL a2, c03, t2 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ + MUL a3, c03, t1 + MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c09, t3, c09 ++ SUB c10, t4, c10 + -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 + -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c09, t3, c09 ++ ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 +#endif + +#ifdef LT @@ -96012,47 +35428,159 @@ index 0000000..97dbc16 + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++ + MUL a3, c01, t1 + MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 + -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c04, t2, b5 -+ fmov b5, c04 ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 -+ ADD6 c03, t1, b5 -+ fmov b5, c03 -+ ADD5 c04, t2, b5 -+ fmov b5, c04 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 + -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 +#endif + -+#if defined(RN) || defined(RT) ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c03, t3, c03 ++ ADD5 c04, t4, c04 ++ + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + @@ -96061,39 +35589,42 @@ index 0000000..97dbc16 + MUL a2, c04, t3 + MUL a2, c03, t4 + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c03, t3, b5 -+ fmov b5, c03 -+ ADD6 c04, t4, b5 -+ fmov b5, c04 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) -+ ST c03, 2 * SIZE(BO) -+ ST c04, 3 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) @@ -96101,10 +35632,21 @@ index 0000000..97dbc16 + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ +#ifndef LN + ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) +#endif + ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG @@ -96112,10 +35654,9 @@ index 0000000..97dbc16 + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 -+ addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 -+ addl BO, TMP2, BO ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO +#endif + +#ifdef LT @@ -96125,233 +35666,282 @@ index 0000000..97dbc16 +#ifdef LN + subl KK, 2, KK +#endif ++ fclr c01 ++ fclr c05 + + ldi I, -1(I) -+ bgt I, $L41 ++ bgt I, $L11 + .align 4 + -+$L50: ++$L20: + and M, 1, I -+ ble I, $L59 ++ ble I, $L29 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) -+ fclr t1 ++ fclr c09 + LD a2, 1 * SIZE(AO) -+ fclr t2 ++ fclr c13 ++ + LD a3, 2 * SIZE(AO) -+ fclr t3 ++ fclr c02 + LD a4, 3 * SIZE(AO) -+ fclr t4 ++ fclr c06 + + LD b1, 0 * SIZE(B) -+ fclr c01 ++ fclr c10 + LD b2, 1 * SIZE(B) -+ fclr c05 ++ fclr c14 + + LD b3, 2 * SIZE(B) -+ fclr c02 -+ LD b4, 3 * SIZE(B) -+ fclr c06 -+ + ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) + + ldi L, -2(KK) + -+ ble KK, $L58 -+ ble L, $L55 ++ ble KK, $L28 ++ ble L, $L25 +#else +#ifdef LN -+ sll K, ZBASE_SHIFT, TMP1 ++ sll K, ZBASE_SHIFT + 0, TMP1 + subl AORIG, TMP1, AORIG +#endif + -+ sll KK, ZBASE_SHIFT, TMP1 ++ sll KK, ZBASE_SHIFT + 0, TMP1 + addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT, TMP1 ++ sll KK, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) -+ fclr t1 ++ fclr c09 + LD a2, 1 * SIZE(AO) -+ fclr t2 ++ fclr c13 ++ + LD a3, 2 * SIZE(AO) -+ fclr t3 ++ fclr c02 + LD a4, 3 * SIZE(AO) -+ fclr t4 ++ fclr c06 + + LD b1, 0 * SIZE(BO) -+ fclr c01 ++ fclr c10 + LD b2, 1 * SIZE(BO) -+ fclr c05 ++ fclr c14 + + LD b3, 2 * SIZE(BO) -+ fclr c02 -+ LD b4, 3 * SIZE(BO) -+ fclr c06 -+ + ldi AO, 2 * SIZE(AO) -+ ldi BO, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) + + ldi L, -2(TMP1) + -+ ble TMP1, $L58 -+ ble L, $L55 ++ ble TMP1, $L28 ++ ble L, $L25 +#endif + .align 5 + -+$L52: -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++$L22: ++ ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ ldi AO, 4 * SIZE(AO) ++ ADD3 c10, t2, c10 ++ unop + MUL a2, b1, t2 -+ LD b1, 2 * SIZE(BO) ++ LD b1, 0 * SIZE(BO) + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ ldi L, -2(L) ++ ADD4 c13, t3, c13 ++ unop + MUL a1, b2, t3 -+ LD a1, -2 * SIZE(AO) ++ ldi BO, 8 * SIZE(BO) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 ++ ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 -+ LD a2, -1 * SIZE(AO) ++ LD b2, -7 * SIZE(BO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ LD b2, 3 * SIZE(BO) -+ MUL a3, b3, t1 -+ ldi BO, 4 * SIZE(BO) ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ADD3 c02, t2, c02 + unop -+ MUL a4, b3, t2 -+ LD b3, 0 * SIZE(BO) ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 ++ ADD4 c05, t3, c05 + unop -+ MUL a3, b4, t3 -+ LD a3, 0 * SIZE(AO) ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ MUL a4, b4, t4 -+ LD b4, 1 * SIZE(BO) ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 + unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) + -+ LD a4, 1 * SIZE(AO) ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 + unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 + unop -+ bgt L, $L52 ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 + .align 4 + -+$L55: -+ ADD1 c01, t1, b5 -+ fmov b5, c01 ++$L25: ++ ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) -+ blbs KK, $L57 ++ blbs KK, $L27 +#else -+ blbs TMP1, $L57 ++ blbs TMP1, $L27 +#endif + .align 4 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ ldi BO, 2 * SIZE(BO) ++ ADD4 c13, t3, c13 ++ unop + MUL a1, b2, t3 -+ LD a1, 0 * SIZE(AO) ++ unop + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 ++ ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 -+ LD a2, 1 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ LD b2, -1 * SIZE(BO) -+ MUL a1, b1, t1 ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 + ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) + .align 4 + -+$L57: -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++$L27: ++ ADD3 c10, t2, c10 + MUL a2, b1, t2 -+ ADD4 c05, t3, b5 -+ fmov b5, c05 ++ ADD4 c13, t3, c13 + MUL a1, b2, t3 + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) ++ ADD2 c14, t4, c14 + MUL a2, b2, t4 -+ ldi BO, 2 * SIZE(BO) ++ ADD1 c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ ADD2 c06, t4, b5 -+ fmov b5, c06 ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) + -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 + -+$L58: ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ .align 4 ++ ++$L28: +#if defined(LN) || defined(RT) ++#ifdef LN + subl KK, 1, TMP1 -+ -+ sll TMP1, ZBASE_SHIFT, TMP2 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) -+ ldi BO, -2 * SIZE(BO) ++ ldi BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) @@ -96360,51 +35950,117 @@ index 0000000..97dbc16 + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 +#endif + -+#if defined(RN) || defined(RT) ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) +#endif + +#ifdef RT @@ -96414,9 +36070,9 @@ index 0000000..97dbc16 + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT, TMP2 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 + addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT, TMP2 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl BO, TMP2, BO +#endif + @@ -96429,9 +36085,9 @@ index 0000000..97dbc16 +#endif + .align 4 + -+$L59: ++$L29: +#ifdef LN -+ sll K, ZBASE_SHIFT, TMP1 ++ sll K, ZBASE_SHIFT + 1, TMP1 + addl B, TMP1, B +#endif + @@ -96440,31 +36096,30 @@ index 0000000..97dbc16 +#endif + +#ifdef RN -+ addl KK, 1, KK ++ addl KK, 2, KK +#endif + +#ifdef RT -+ subl KK, 1, KK ++ subl KK, 2, KK +#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 + .align 4 + +$L30: -+ sra N, 1, J ++ and N, 1, J + ble J, $L999 -+ .align 4 + -+$L01: +#ifdef RT -+ sll K, ZBASE_SHIFT + 1, TMP1 ++ sll K, ZBASE_SHIFT, TMP1 + subl B, TMP1, B + -+ subl C, LDC, C2 -+ subl C2, LDC, C1 -+ subl C2, LDC, C ++ subl C, LDC, C1 ++ subl C, LDC, C +#else + mov C, C1 -+ addl C, LDC, C2 -+ addl C2, LDC, C ++ addl C, LDC, C +#endif + +#ifdef LN @@ -96482,55 +36137,41 @@ index 0000000..97dbc16 +#endif + + sra M, 1, I -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ -+ fclr c01 -+ fclr c05 -+ -+ ble I, $L20 ++ ble I, $L50 + .align 4 + -+$L11: ++$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr t1 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ ++ fclr t2 + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr t3 + LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr t4 + + LD b1, 0 * SIZE(B) -+ fclr c10 ++ fclr c01 + LD b2, 1 * SIZE(B) -+ fclr c14 -+ ++ fclr c05 + LD b3, 2 * SIZE(B) -+ fclr c03 ++ fclr c02 + LD b4, 3 * SIZE(B) -+ fclr c07 ++ fclr c06 + -+ ldi BO, 4 * SIZE(B) -+ fclr c11 ++ ldi BO, 2 * SIZE(B) ++ fclr c03 + ldi AO, 4 * SIZE(AO) -+ fclr c15 ++ fclr c07 + -+ fillcs 4 * SIZE(C1) -+ fclr c04 + ldi L, -2(KK) ++ fclr c04 + fclr c08 + -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble KK, $L18 -+ ble L, $L15 ++ ble KK, $L48 ++ ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 @@ -96539,454 +36180,216 @@ index 0000000..97dbc16 + + sll KK, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr t1 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ ++ fclr t2 + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr t3 + LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr t4 + + LD b1, 0 * SIZE(BO) -+ fclr c10 ++ fclr c01 + LD b2, 1 * SIZE(BO) -+ fclr c14 -+ ++ fclr c05 + LD b3, 2 * SIZE(BO) -+ fclr c03 ++ fclr c02 + LD b4, 3 * SIZE(BO) -+ fclr c07 ++ fclr c06 + -+ ldi BO, 4 * SIZE(BO) -+ fclr c11 ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 + ldi AO, 4 * SIZE(AO) -+ fclr c15 ++ fclr c07 + -+ fillcs 4 * SIZE(C1) -+ fclr c04 + ldi L, -2(TMP1) ++ fclr c04 + fclr c08 + -+ fillcs 4 * SIZE(C2) -+ fclr c12 -+ fclr c16 -+ ble TMP1, $L18 -+ ble L, $L15 ++ ble TMP1, $L48 ++ ble L, $L45 +#endif + .align 5 + -+$L12: -+/* 1 */ -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) -+#else -+ unop -+#endif -+ MUL b1, a1, t1 -+#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) -+#else -+ unop -+#endif -+ -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ unop -+ MUL b1, a2, t2 -+ unop -+ -+ ADD2 c16, t3, b5 -+ fmov b5, c16 ++$L42: ++ ADD4 c05, t1, c05 + unop -+ MUL b2, a2, t3 -+ LD a5, 0 * SIZE(AO) -+ -+ ADD4 c15, t4, b5 -+ fmov b5, c15 ++ MUL a1, b1, t1 + unop -+ MUL b2, a1, t4 -+ LD b5, 0 * SIZE(BO) -+ FIMOVD b5, tmp -+/* 2 */ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ UNOP -+ MUL b1, a3, t1 -+ UNOP -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ UNOP -+ MUL b1, a4, t2 -+ UNOP + -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+ unop -+ MUL b2, a4, t3 ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 + unop + -+ ADD4 c05, t4, b5 -+ fmov b5, c05 ++ ADD4 c07, t3, c07 + unop -+ MUL b4, a1, t4 ++ MUL a3, b1, t3 + unop + -+/* 3 */ -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+ unop -+ MUL b3, a1, t1 ++ ADD2 c08, t4, c08 + unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 ++ ADD1 c01, t1, c01 + unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) + -+ ADD2 c08, t3, b5 -+ fmov b5, c08 -+ unop -+ MUL b4, a2, t3 ++ ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) -+ -+/* 4 */ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, t1 -+ LD a6, 2 * SIZE(AO) -+ -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, 3 * SIZE(AO) -+ -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+ LD b4, 3 * SIZE(BO) -+ -+/* 5 */ -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a5, t1 -+ LD a1, 4 * SIZE(AO) -+ -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ ldi L, -2(L) -+ IFMOVD tmp, b5 -+ MUL b5, a2, t2 -+ LD b1, 4 * SIZE(BO) -+ -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ unop -+ MUL b2, a2, t3 -+ unop -+ -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ unop -+ MUL b2, a5, t4 ++ ADD1 c03, t3, c03 + unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) + -+/* 6 */ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a6, t1 ++ ADD3 c04, t4, c04 + unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ unop -+ IFMOVD tmp, b5 -+ MUL b5, a4, t2 ++ ADD4 c05, t1, c05 + unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) + -+ ADD2 c06, t3, b5 -+ fmov b5, c06 ++ ADD2 c06, t2, c06 + unop -+ MUL b2, a4, t3 ++ MUL a2, b3, t2 + unop + -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+ unop -+ MUL b4, a5, t4 ++ ADD4 c07, t3, c07 + unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) + -+/* 7 */ -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+ ldi AO, 8 * SIZE(AO) -+ MUL b3, a5, t1 ++ ADD2 c08, t4, c08 + unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+ ldi BO, 8 * SIZE(BO) -+ MUL b3, a2, t2 ++ ADD1 c01, t1, c01 + unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) + -+ ADD2 c08, t3, b5 -+ fmov b5, c08 ++ ADD3 c02, t2, c02 + unop -+ MUL b4, a2, t3 ++ MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 -+ unop -+ MUL b2, a6, t4 -+ LD b2, -3 * SIZE(BO) -+ -+/* 8 */ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a6, t1 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, -2 * SIZE(BO) -+ -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 ++ ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) + -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+ MUL b4, a6, t4 -+ LD b4, -1 * SIZE(BO) -+ bgt L, $L12 ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 + .align 4 + -+$L15: -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+ unop ++$L45: ++ ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) -+ blbs KK, $L17 ++ blbs KK, $L47 +#else -+ blbs TMP1, $L17 ++ blbs TMP1, $L47 +#endif + .align 4 + -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, t2 -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, t3 -+ -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, t1 ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ADD2 c08, t4, c08 + unop -+ MUL b1, a4, t2 ++ MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, t3 -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, t4 -+ -+ ADD1 c03, t1, b5 -+ fmov b5, c03 ++ ADD1 c01, t1, c01 + unop -+ MUL b3, a1, t1 ++ MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+ unop -+ MUL b3, a2, t2 -+ unop -+ -+ ADD2 c08, t3, b5 -+ fmov b5, c08 ++ ADD3 c02, t2, c02 + unop -+ MUL b4, a2, t3 ++ MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + -+ ADD4 c13, t4, b5 -+ fmov b5, c13 ++ ADD1 c03, t3, c03 + unop -+ MUL b2, a3, t4 -+ LD b2, 1 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) + -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL b3, a3, t1 ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) + ldi AO, 4 * SIZE(AO) + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL b3, a4, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+ unop -+ MUL b4, a4, t3 -+ LD a4, -1 * SIZE(AO) -+ -+ ADD4 c07, t4, b5 -+ fmov b5, c07 -+ unop -+ MUL b4, a3, t4 -+ LD a3, -2 * SIZE(AO) -+ -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+ LD b4, 3 * SIZE(BO) -+ MUL b1, a1, t1 -+ ldi BO, 4 * SIZE(BO) ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) + .align 4 + -+$L17: -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ MUL b1, a2, t2 -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ MUL b2, a2, t3 -+ -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ MUL b2, a1, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL b1, a3, t1 -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL b1, a4, t2 -+ ADD2 c06, t3, b5 -+ fmov b5, c06 -+ MUL b2, a4, t3 -+ -+ ADD4 c05, t4, b5 -+ fmov b5, c05 -+ MUL b4, a1, t4 -+ ADD1 c03, t1, b5 -+ fmov b5, c03 -+ MUL b3, a1, t1 -+ -+ ADD3 c04, t2, b5 -+ fmov b5, c04 -+ MUL b3, a2, t2 -+ ADD2 c08, t3, b5 -+ fmov b5, c08 -+ MUL b4, a2, t3 -+ -+ ADD4 c13, t4, b5 -+ fmov b5, c13 -+ MUL b2, a3, t4 -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ MUL b3, a3, t1 ++$L47: ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ MUL b3, a4, t2 -+ ADD2 c14, t3, b5 -+ fmov b5, c14 -+ MUL b4, a4, t3 ++ ADD2 c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 + -+ ADD4 c07, t4, b5 -+ fmov b5, c07 ++ ADD3 c04, t4, c04 + ldi AO, 4 * SIZE(AO) -+ MUL b4, a3, t4 -+ ldi BO, 4 * SIZE(BO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) + -+ ADD1 c11, t1, b5 -+ fmov b5, c11 -+ ADD3 c12, t2, b5 -+ fmov b5, c12 -+ ADD2 c16, t3, b5 -+ fmov b5, c16 -+ ADD4 c15, t4, b5 -+ fmov b5, c15 -+ -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 -+ ADD c03, c08, b5 -+ fmov b5, c03 -+ ADD c04, c07, b5 -+ fmov b5, c04 -+ -+ ADD c09, c14, b5 -+ fmov b5, c09 -+ ADD c10, c13, b5 -+ fmov b5, c10 -+ ADD c11, c16, b5 -+ fmov b5, c11 -+ ADD c12, c15, b5 -+ fmov b5, c12 -+ .align 4 ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 + -+$L18: ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subl KK, 2, TMP1 +#else -+ subl KK, 2, TMP1 ++ subl KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -4 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) ++ ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) @@ -96995,56 +36398,20 @@ index 0000000..97dbc16 + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + -+ LD b1, 4 * SIZE(BO) -+ LD b2, 5 * SIZE(BO) -+ LD b3, 6 * SIZE(BO) -+ LD b4, 7 * SIZE(BO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c10, b5 -+ fmov b5, c10 -+ -+ SUB b1, c03, b5 -+ fmov b5, c03 -+ SUB b2, c04, b5 -+ fmov b5, c04 -+ SUB b3, c11, b5 -+ fmov b5, c11 -+ SUB b4, c12, b5 -+ fmov b5, c12 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + -+ LD b1, 4 * SIZE(AO) -+ LD b2, 5 * SIZE(AO) -+ LD b3, 6 * SIZE(AO) -+ LD b4, 7 * SIZE(AO) -+ -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c03, b5 -+ fmov b5, c03 -+ SUB a4, c04, b5 -+ fmov b5, c04 -+ -+ SUB b1, c09, b5 -+ fmov b5, c09 -+ SUB b2, c10, b5 -+ fmov b5, c10 -+ SUB b3, c11, b5 -+ fmov b5, c11 -+ SUB b4, c12, b5 -+ fmov b5, c12 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 +#endif + +#ifdef LN @@ -97055,80 +36422,32 @@ index 0000000..97dbc16 + + MUL a2, c04, t1 + MUL a2, c03, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 -+ -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 -+ MUL a3, c11, t3 -+ MUL a3, c12, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c09, t3, b5 -+ fmov b5, c09 -+ SUB c10, t4, b5 -+ fmov b5, c10 + ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 -+ MUL a4, c12, t3 -+ MUL a4, c11, t4 + -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 -+ ADD6 c09, t3, b5 -+ fmov b5, c09 -+ ADD5 c10, t4, b5 -+ fmov b5, c10 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c09, t3, b5 -+ fmov b5, c09 -+ ADD6 c10, t4, b5 -+ fmov b5, c10 ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 +#endif + +#ifdef LT @@ -97139,275 +36458,68 @@ index 0000000..97dbc16 + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c09, t3, b5 -+ fmov b5, c09 -+ ADD6 c10, t4, b5 -+ fmov b5, c10 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 + ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 -+ MUL a3, c09, t3 -+ MUL a3, c10, t4 + -+ SUB c03, t1, b5 -+ fmov b5, c03 -+ SUB c04, t2, b5 -+ fmov b5, c04 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 -+ MUL a4, c10, t3 -+ MUL a4, c09, t4 -+ -+ ADD6 c03, t1, b5 -+ fmov b5, c03 -+ ADD5 c04, t2, b5 -+ fmov b5, c04 -+ ADD6 c11, t3, b5 -+ fmov b5, c11 -+ ADD5 c12, t4, b5 -+ fmov b5, c12 ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ ADD5 c03, t1, b5 -+ fmov b5, c03 -+ ADD6 c04, t2, b5 -+ fmov b5, c04 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 +#endif + -+#ifdef RN ++#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c03, t3, b5 -+ fmov b5, c03 -+ ADD6 c04, t4, b5 -+ fmov b5, c04 -+ -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ MUL a3, c03, t3 -+ MUL a3, c04, t4 -+ -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ SUB c11, t3, b5 -+ fmov b5, c11 -+ SUB c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ MUL a4, c04, t3 -+ MUL a4, c03, t4 -+ -+ ADD6 c09, t1, b5 -+ fmov b5, c09 -+ ADD5 c10, t2, b5 -+ fmov b5, c10 -+ ADD6 c11, t3, b5 -+ fmov b5, c11 -+ ADD5 c12, t4, b5 -+ fmov b5, c12 -+ -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 -+#endif -+ -+#ifdef RT -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ LD a3, 4 * SIZE(BO) -+ LD a4, 5 * SIZE(BO) -+ -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a2, c12, t3 -+ MUL a2, c11, t4 -+ -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ MUL a1, c11, b5 -+ fmov b5, c11 -+ MUL a1, c12, b5 -+ fmov b5, c12 -+ -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 -+ ADD5 c11, t3, b5 -+ fmov b5, c11 -+ ADD6 c12, t4, b5 -+ fmov b5, c12 -+ -+ MUL a3, c09, t1 -+ MUL a3, c10, t2 -+ MUL a3, c11, t3 -+ MUL a3, c12, t4 -+ -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ SUB c03, t3, b5 -+ fmov b5, c03 -+ SUB c04, t4, b5 -+ fmov b5, c04 -+ -+ MUL a4, c10, t1 -+ MUL a4, c09, t2 -+ MUL a4, c12, t3 -+ MUL a4, c11, t4 -+ -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 -+ ADD6 c03, t3, b5 -+ fmov b5, c03 -+ ADD5 c04, t4, b5 -+ fmov b5, c04 -+ -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a2, c04, t3 -+ MUL a2, c03, t4 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 + -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c03, b5 -+ fmov b5, c03 -+ MUL a1, c04, b5 -+ fmov b5, c04 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c03, t3, b5 -+ fmov b5, c03 -+ ADD6 c04, t4, b5 -+ fmov b5, c04 ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c10, 3 * SIZE(BO) -+ -+ ST c03, 4 * SIZE(BO) -+ ST c04, 5 * SIZE(BO) -+ ST c11, 6 * SIZE(BO) -+ ST c12, 7 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) -+ -+ ST c09, 4 * SIZE(AO) -+ ST c10, 5 * SIZE(AO) -+ ST c11, 6 * SIZE(AO) -+ ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -4 * SIZE(C1) -+ ldi C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) @@ -97415,21 +36527,10 @@ index 0000000..97dbc16 + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + -+ ST c09, 0 * SIZE(C2) -+ ST c10, 1 * SIZE(C2) -+ ST c11, 2 * SIZE(C2) -+ ST c12, 3 * SIZE(C2) -+ +#ifndef LN + ldi C1, 4 * SIZE(C1) -+ ldi C2, 4 * SIZE(C2) +#endif + -+ fclr t1 -+ fclr t2 -+ fclr t3 -+ fclr t4 -+ +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addl AORIG, TMP1, AORIG @@ -97437,9 +36538,10 @@ index 0000000..97dbc16 + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 1, TMP1 -+ addl AO, TMP1, AO -+ addl BO, TMP1, BO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO +#endif + +#ifdef LT @@ -97449,333 +36551,207 @@ index 0000000..97dbc16 +#ifdef LN + subl KK, 2, KK +#endif -+ fclr c01 -+ fclr c05 + + ldi I, -1(I) -+ bgt I, $L11 ++ bgt I, $L41 + .align 4 + -+$L20: ++$L50: + and M, 1, I -+ ble I, $L29 ++ ble I, $L59 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr t1 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ ++ fclr t2 + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr t3 + LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr t4 + + LD b1, 0 * SIZE(B) -+ fclr c10 ++ fclr c01 + LD b2, 1 * SIZE(B) -+ fclr c14 ++ fclr c05 + + LD b3, 2 * SIZE(B) -+ ldi AO, 2 * SIZE(AO) ++ fclr c02 + LD b4, 3 * SIZE(B) -+ ldi BO, 4 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) + + ldi L, -2(KK) + -+ ble KK, $L28 -+ ble L, $L25 ++ ble KK, $L58 ++ ble L, $L55 +#else +#ifdef LN -+ sll K, ZBASE_SHIFT + 0, TMP1 ++ sll K, ZBASE_SHIFT, TMP1 + subl AORIG, TMP1, AORIG +#endif + -+ sll KK, ZBASE_SHIFT + 0, TMP1 ++ sll KK, ZBASE_SHIFT, TMP1 + addl AORIG, TMP1, AO -+ sll KK, ZBASE_SHIFT + 1, TMP1 ++ sll KK, ZBASE_SHIFT, TMP1 + addl B, TMP1, BO + + subl K, KK, TMP1 + + LD a1, 0 * SIZE(AO) -+ fclr c09 ++ fclr t1 + LD a2, 1 * SIZE(AO) -+ fclr c13 -+ ++ fclr t2 + LD a3, 2 * SIZE(AO) -+ fclr c02 ++ fclr t3 + LD a4, 3 * SIZE(AO) -+ fclr c06 ++ fclr t4 + + LD b1, 0 * SIZE(BO) -+ fclr c10 ++ fclr c01 + LD b2, 1 * SIZE(BO) -+ fclr c14 ++ fclr c05 + + LD b3, 2 * SIZE(BO) -+ ldi AO, 2 * SIZE(AO) ++ fclr c02 + LD b4, 3 * SIZE(BO) -+ ldi BO, 4 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) + + ldi L, -2(TMP1) + -+ ble TMP1, $L28 -+ ble L, $L25 ++ ble TMP1, $L58 ++ ble L, $L55 +#endif + .align 5 + -+$L22: -+ ADD1 c09, t1, b5 -+ fmov b5, c09 ++$L52: ++ ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ unop ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) + MUL a2, b1, t2 -+ LD b1, 0 * SIZE(BO) ++ LD b1, 2 * SIZE(BO) + -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+ unop ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) + MUL a1, b2, t3 -+ ldi BO, 8 * SIZE(BO) ++ LD a1, -2 * SIZE(AO) + -+ ADD2 c14, t4, b5 -+ fmov b5, c14 ++ ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 -+ LD b2, -7 * SIZE(BO) -+ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, t1 -+ unop -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, -6 * SIZE(BO) -+ -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, 2 * SIZE(AO) -+ -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ MUL a2, b4, t4 -+ LD b5, -5 * SIZE(BO) -+ FIMOVD b5, tmp -+ -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ unop -+ MUL a3, b1, t1 -+ LD a2, 3 * SIZE(AO) -+ -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ unop -+ MUL a4, b1, t2 -+ LD b1, -4 * SIZE(BO) -+ -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+ unop -+ MUL a3, b2, t3 -+ ldi AO, 4 * SIZE(AO) -+ -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+ MUL a4, b2, t4 -+ LD b2, -3 * SIZE(BO) ++ LD a2, -1 * SIZE(AO) + -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ ldi L, -2(L) ++ ADD1 c01, t1, c01 ++ LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 -+ LD b4, -1 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) + -+ ADD3 c02, t2, b5 -+ fmov b5, c02 ++ ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 -+ LD b3, -2 * SIZE(BO) ++ LD b3, 0 * SIZE(BO) + -+ ADD4 c05, t3, b5 -+ fmov b5, c05 ++ ADD4 c05, t3, c05 + unop -+ IFMOVD tmp, b5 -+ MUL a3, b5, t3 ++ MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ IFMOVD tmp, b5 -+ MUL a4, b5, t4 ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ + LD a4, 1 * SIZE(AO) -+ bgt L, $L22 ++ unop ++ unop ++ bgt L, $L52 + .align 4 + -+$L25: -+ ADD1 c09, t1, b5 -+ fmov b5, c09 ++$L55: ++ ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) -+ blbs KK, $L27 ++ blbs KK, $L57 +#else -+ blbs TMP1, $L27 ++ blbs TMP1, $L57 +#endif + .align 4 + -+ ADD3 c10, t2, b5 -+ fmov b5, c10 ++ ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+ unop ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) + MUL a1, b2, t3 -+ unop ++ LD a1, 0 * SIZE(AO) + -+ ADD2 c14, t4, b5 -+ fmov b5, c14 ++ ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 -+ LD b2, 1 * SIZE(BO) -+ -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ unop -+ MUL a1, b3, t1 -+ ldi AO, 2 * SIZE(AO) -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ unop -+ MUL a2, b3, t2 -+ LD b3, 2 * SIZE(BO) -+ -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ unop -+ MUL a1, b4, t3 -+ LD a1, -2 * SIZE(AO) -+ -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ unop -+ MUL a2, b4, t4 -+ LD a2, -1 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) + -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ LD b4, 3 * SIZE(BO) ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 -+ ldi BO, 4 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) + .align 4 + -+$L27: -+ ADD3 c10, t2, b5 -+ fmov b5, c10 ++$L57: ++ ADD3 c02, t2, c02 + MUL a2, b1, t2 -+ ADD4 c13, t3, b5 -+ fmov b5, c13 ++ ADD4 c05, t3, c05 + MUL a1, b2, t3 + -+ ADD2 c14, t4, b5 -+ fmov b5, c14 ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) + MUL a2, b2, t4 -+ ADD1 c01, t1, b5 -+ fmov b5, c01 -+ MUL a1, b3, t1 -+ -+ ADD3 c02, t2, b5 -+ fmov b5, c02 -+ MUL a2, b3, t2 -+ ADD4 c05, t3, b5 -+ fmov b5, c05 -+ MUL a1, b4, t3 ++ ldi BO, 2 * SIZE(BO) + -+ ADD2 c06, t4, b5 -+ fmov b5, c06 -+ ldi AO, 2 * SIZE(AO) -+ MUL a2, b4, t4 -+ ldi BO, 4 * SIZE(BO) ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 + -+ ADD1 c09, t1, b5 -+ fmov b5, c09 -+ ADD3 c10, t2, b5 -+ fmov b5, c10 -+ ADD4 c13, t3, b5 -+ fmov b5, c13 -+ ADD2 c14, t4, b5 -+ fmov b5, c14 -+ -+ ADD c01, c06, b5 -+ fmov b5, c01 -+ ADD c02, c05, b5 -+ fmov b5, c02 -+ ADD c09, c14, b5 -+ fmov b5, c09 -+ ADD c10, c13, b5 -+ fmov b5, c10 -+ .align 4 ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 + -+$L28: ++$L58: +#if defined(LN) || defined(RT) -+#ifdef LN + subl KK, 1, TMP1 -+#else -+ subl KK, 2, TMP1 -+#endif -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 + addl AORIG, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ sll TMP1, ZBASE_SHIFT, TMP2 + addl B, TMP2, BO +#else + ldi AO, -2 * SIZE(AO) -+ ldi BO, -4 * SIZE(BO) ++ ldi BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c10, b5 -+ fmov b5, c10 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) -+ LD a3, 2 * SIZE(AO) -+ LD a4, 3 * SIZE(AO) + -+ SUB a1, c01, b5 -+ fmov b5, c01 -+ SUB a2, c02, b5 -+ fmov b5, c02 -+ SUB a3, c09, b5 -+ fmov b5, c09 -+ SUB a4, c10, b5 -+ fmov b5, c10 ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) @@ -97784,149 +36760,43 @@ index 0000000..97dbc16 + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a2, c10, t3 -+ MUL a2, c09, t4 -+ -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ ADD5 c09, t3, b5 -+ fmov b5, c09 -+ ADD6 c10, t4, b5 -+ fmov b5, c10 -+#endif -+ -+#ifdef RN -+ LD a1, 0 * SIZE(BO) -+ LD a2, 1 * SIZE(BO) -+ LD a3, 2 * SIZE(BO) -+ LD a4, 3 * SIZE(BO) -+ -+ MUL a2, c02, t1 -+ MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 -+ -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 -+ -+ MUL a3, c01, t1 -+ MUL a3, c02, t2 -+ SUB c09, t1, b5 -+ fmov b5, c09 -+ SUB c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL a4, c02, t1 -+ MUL a4, c01, t2 -+ ADD6 c09, t1, b5 -+ fmov b5, c09 -+ ADD5 c10, t2, b5 -+ fmov b5, c10 -+ -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 + -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 +#endif + -+#ifdef RT -+ LD a1, 6 * SIZE(BO) -+ LD a2, 7 * SIZE(BO) -+ LD a3, 4 * SIZE(BO) -+ LD a4, 5 * SIZE(BO) -+ -+ MUL a2, c10, t1 -+ MUL a2, c09, t2 -+ MUL a1, c09, b5 -+ fmov b5, c09 -+ MUL a1, c10, b5 -+ fmov b5, c10 -+ -+ ADD5 c09, t1, b5 -+ fmov b5, c09 -+ ADD6 c10, t2, b5 -+ fmov b5, c10 -+ -+ MUL a3, c09, t1 -+ MUL a3, c10, t2 -+ SUB c01, t1, b5 -+ fmov b5, c01 -+ SUB c02, t2, b5 -+ fmov b5, c02 -+ -+ MUL a4, c10, t1 -+ MUL a4, c09, t2 -+ ADD6 c01, t1, b5 -+ fmov b5, c01 -+ ADD5 c02, t2, b5 -+ fmov b5, c02 -+ ++#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 -+ MUL a1, c01, b5 -+ fmov b5, c01 -+ MUL a1, c02, b5 -+ fmov b5, c02 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 + -+ ADD5 c01, t1, b5 -+ fmov b5, c01 -+ ADD6 c02, t2, b5 -+ fmov b5, c02 ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) -+ ST c09, 2 * SIZE(BO) -+ ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) -+ ST c09, 2 * SIZE(AO) -+ ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + ldi C1, -2 * SIZE(C1) -+ ldi C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) -+ ST c09, 0 * SIZE(C2) -+ ST c10, 1 * SIZE(C2) + +#ifndef LN + ldi C1, 2 * SIZE(C1) -+ ldi C2, 2 * SIZE(C2) +#endif + +#ifdef RT @@ -97936,9 +36806,9 @@ index 0000000..97dbc16 + +#if defined(LT) || defined(RN) + subl K, KK, TMP1 -+ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ sll TMP1, ZBASE_SHIFT, TMP2 + addl AO, TMP2, AO -+ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ sll TMP1, ZBASE_SHIFT, TMP2 + addl BO, TMP2, BO +#endif + @@ -97951,9 +36821,9 @@ index 0000000..97dbc16 +#endif + .align 4 + -+$L29: ++$L59: +#ifdef LN -+ sll K, ZBASE_SHIFT + 1, TMP1 ++ sll K, ZBASE_SHIFT, TMP1 + addl B, TMP1, B +#endif + @@ -97962,15 +36832,12 @@ index 0000000..97dbc16 +#endif + +#ifdef RN -+ addl KK, 2, KK ++ addl KK, 1, KK +#endif + +#ifdef RT -+ subl KK, 2, KK ++ subl KK, 1, KK +#endif -+ -+ ldi J, -1(J) -+ bgt J, $L01 + .align 4 + +$L999: @@ -97982,17 +36849,16 @@ index 0000000..97dbc16 + fldd $f7, 40($sp) + fldd $f8, 48($sp) + fldd $f9, 56($sp) -+ ldl tmp, 72($sp) + clr $0 + ldi $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME -diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S b/kernel/sw_64/ztrsm_kernel_2x2_RT.S new file mode 100644 -index 0000000..4d4f59d +index 000000000..460b2b86b --- /dev/null -+++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak ++++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S @@ -0,0 +1,2223 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ @@ -98034,22 +36900,22 @@ index 0000000..4d4f59d + +#define ASSEMBLER +#include "common.h" -+#include "version.h" + + -+#if !defined(SW2B) ++#if !defined(SW8A) +#error "Architecture is not specified." +#endif + -+#ifdef SW2B ++#ifdef SW8A +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + + ++ + .set noat + .set noreorder -+ .arch ev6 ++ .arch sw8a + +.text + .align 5 @@ -98510,7 +37376,7 @@ index 0000000..4d4f59d + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) -+ ++ + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 @@ -98520,7 +37386,7 @@ index 0000000..4d4f59d + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) -+ ++ + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 @@ -98550,7 +37416,7 @@ index 0000000..4d4f59d + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 -+ ++ + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + @@ -98862,7 +37728,7 @@ index 0000000..4d4f59d +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) -+ ++ + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif @@ -98957,7 +37823,7 @@ index 0000000..4d4f59d + sra N, 1, J + ble J, $L999 + .align 4 -+ ++ +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 @@ -99026,12 +37892,12 @@ index 0000000..4d4f59d + ldi AO, 4 * SIZE(AO) + fclr c15 + -+ fillcs 4 * SIZE(C1) ++ fillde 4 * SIZE(C1) + fclr c04 + ldi L, -2(KK) + fclr c08 + -+ fillcs 4 * SIZE(C2) ++ fillde 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 @@ -99073,12 +37939,12 @@ index 0000000..4d4f59d + ldi AO, 4 * SIZE(AO) + fclr c15 + -+ fillcs 4 * SIZE(C1) ++ fillde 4 * SIZE(C1) + fclr c04 + ldi L, -2(TMP1) + fclr c08 + -+ fillcs 4 * SIZE(C2) ++ fillde 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 @@ -99090,13 +37956,13 @@ index 0000000..4d4f59d +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(AO) ++ s_fillcs PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 -+ fillcs PREFETCHSIZE * SIZE(BO) ++ s_fillcs PREFETCHSIZE * SIZE(BO) +#else + unop +#endif @@ -99419,7 +38285,7 @@ index 0000000..4d4f59d + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) -+ ++ + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) @@ -99439,7 +38305,7 @@ index 0000000..4d4f59d + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) -+ ++ + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) @@ -99496,7 +38362,7 @@ index 0000000..4d4f59d + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 -+ ++ + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + @@ -99676,7 +38542,7 @@ index 0000000..4d4f59d + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 -+ ++ + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + @@ -100023,7 +38889,7 @@ index 0000000..4d4f59d + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) -+ ++ + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 @@ -100033,7 +38899,7 @@ index 0000000..4d4f59d + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) -+ ++ + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 @@ -100119,7 +38985,7 @@ index 0000000..4d4f59d + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 -+ ++ + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + @@ -100219,7 +39085,7 @@ index 0000000..4d4f59d + .end CNAME diff --git a/lapack/laswp/sw_64/Makefile b/lapack/laswp/sw_64/Makefile new file mode 100644 -index 0000000..af1f019 +index 000000000..af1f0199c --- /dev/null +++ b/lapack/laswp/sw_64/Makefile @@ -0,0 +1,8 @@ @@ -100232,37 +39098,47 @@ index 0000000..af1f019 +include ../generic/Makefile + diff --git a/param.h b/param.h -index ee4640f..1a5f361 100644 +index ee4640f57..3d912ae5d 100644 --- a/param.h +++ b/param.h -@@ -2128,7 +2128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +@@ -2201,6 +2201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif --#if defined(EV4) || defined(EV5) || defined(EV6) -+#if defined(EV4) || defined(EV5) || defined(SW6) - - #ifdef EV4 - #define SNUMOPT 1 -@@ -2140,7 +2140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - #define GEMM_DEFAULT_OFFSET_A 512 - #define GEMM_DEFAULT_OFFSET_B 512 --#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL ++#if defined(SW8A) ++ ++#define SNUMOPT 2 ++#define DNUMOPT 2 ++ ++#define GEMM_DEFAULT_OFFSET_A 512 ++#define GEMM_DEFAULT_OFFSET_B 512 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL -+//#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL - - #define SGEMM_DEFAULT_UNROLL_M 4 - #define SGEMM_DEFAULT_UNROLL_N 4 -@@ -2185,7 +2186,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #define ZGEMM_DEFAULT_Q 64 - #endif - --#ifdef EV6 -+#ifdef SW6 - #define SGEMM_DEFAULT_P 256 - #define SGEMM_DEFAULT_Q 512 ++ ++#define SGEMM_DEFAULT_UNROLL_M 4 ++#define SGEMM_DEFAULT_UNROLL_N 4 ++#define DGEMM_DEFAULT_UNROLL_M 4 ++#define DGEMM_DEFAULT_UNROLL_N 4 ++#define CGEMM_DEFAULT_UNROLL_M 2 ++#define CGEMM_DEFAULT_UNROLL_N 2 ++#define ZGEMM_DEFAULT_UNROLL_M 2 ++#define ZGEMM_DEFAULT_UNROLL_N 2 ++ ++#define SYMV_P 8 ++ ++#define SGEMM_DEFAULT_P 256 ++#define SGEMM_DEFAULT_Q 512 ++#define DGEMM_DEFAULT_P 256 ++#define DGEMM_DEFAULT_Q 256 ++#define CGEMM_DEFAULT_P 256 ++#define CGEMM_DEFAULT_Q 256 ++#define ZGEMM_DEFAULT_P 128 ++#define ZGEMM_DEFAULT_Q 256 ++ ++#endif ++ + #ifdef CELL + #define SNUMOPT 2 -- -2.31.1 +2.39.5 (Apple Git-154) diff --git a/openblas.spec b/openblas.spec index 4c3e77c..07f0235 100644 --- a/openblas.spec +++ b/openblas.spec @@ -2,7 +2,7 @@ Name: openblas Version: 0.3.25 -Release: 6 +Release: 7 Summary: An optimized BLAS library based on GotoBLAS2 1.13 BSD version License: BSD-3-Clause URL: https://github.com/xianyi/OpenBLAS/ @@ -10,7 +10,7 @@ Source0: https://github.com/xianyi/OpenBLAS/archive/v%{version}/openblas- Patch0000: openblas-0.2.15-system_lapack.patch Patch0001: openblas-0.2.5-libname.patch Patch0002: openblas-0.3.11-tests.patch -Patch0003: OpenBLAS-0.3.25-sw.patch +Patch0003: openblas-0.3.25-sw_64.patch BuildRequires: gcc gcc-gfortran perl-devel gcc-c++ @@ -238,7 +238,7 @@ suffix="_riscv64_generic" suffix="_power8" %endif %ifarch sw_64 -suffix="_sw6" +suffix="_sw8a" %endif slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a @@ -380,6 +380,9 @@ sed -i -e 's|%{buildroot}||' "%{buildroot}%{_libdir}/pkgconfig/%{name}.pc" %{_libdir}/lib%{name}*64_.so %changelog +* Thu Aug 28 2025 swcompiler - 0.3.25-7 +- fix some bugs for sw_64 + * Tue Feb 25 2025 zhangshaoning - 0.3.25-6 - Add sw_64 support -- Gitee