diff --git a/0005-openblas-0.3.28-sw64.patch b/0005-openblas-0.3.28-sw64.patch new file mode 100644 index 0000000000000000000000000000000000000000..6555090e455dd033db143a14a0c18b3635edb606 --- /dev/null +++ b/0005-openblas-0.3.28-sw64.patch @@ -0,0 +1,100268 @@ +diff --git a/Makefile b/Makefile +index fc021a9..c33edd9 100644 +--- a/Makefile ++++ b/Makefile +@@ -158,18 +158,18 @@ tests : shared + ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + touch $(LIBNAME) + ifndef NO_FBLAS +- $(MAKE) -C test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all ++ $(MAKE) -C test all + endif + endif + ifneq ($(ONLY_CBLAS), 1) +- $(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all ++ #$(MAKE) -C utest all + endif + ifneq ($(NO_CBLAS), 1) + ifneq ($(ONLY_CBLAS), 1) +- $(MAKE) -C ctest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all ++ $(MAKE) -C ctest all + endif + ifeq ($(CPP_THREAD_SAFETY_TEST), 1) +- $(MAKE) -C cpp_thread_test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all ++ $(MAKE) -C cpp_thread_test all + endif + endif + +diff --git a/Makefile.sw_64 b/Makefile.sw_64 +new file mode 100644 +index 0000000..b4542ce +--- /dev/null ++++ b/Makefile.sw_64 +@@ -0,0 +1,35 @@ ++CPP = $(CC) -E ++RANLIB = ranlib ++ ++ifeq ($(LIBSUBARCH), SW6) ++LIBNAME = $(LIBPREFIX)_sw6.a ++LIBNAME_P = $(LIBPREFIX)_sw6_p.a ++endif ++ ++ifneq ($(COMPILER), NATIVE) ++# GCC User ++ifeq ($(LIBSUBARCH), SW6) ++OPTION += -DSW6 -mcpu=sw6 ++endif ++else ++# Compaq Compiler User ++ifeq ($(LIBSUBARCH), SW6) ++OPTION += -DSW6 -tune sw6 -arch sw6 ++endif ++endif ++ ++ifeq ($(F_COMPILER), GFORTRAN) ++FCOMMON_OPT += -mieee ++endif ++ ++ifeq ($(F_COMPILER), G77) ++FCOMMON_OPT += -mieee ++endif ++ ++ifndef SMP ++LIBCXML = -lcxml -lots -lm ++LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm ++else ++LIBCXML = -lcxmlp -lots -lm ++LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm ++endif +diff --git a/Makefile.system b/Makefile.system +index 3be47c6..ae90af3 100644 +--- a/Makefile.system ++++ b/Makefile.system +@@ -42,6 +42,8 @@ else ifeq ($(ARCH), mips64el) + override ARCH=mips64 + else ifeq ($(ARCH), zarch) + override ARCH=zarch ++else ifeq ($(ARCH), sw_64) ++override ARCH=sw_64 + endif + + NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib +@@ -809,6 +811,11 @@ NO_BINARY_MODE = 1 + BINARY_DEFINED = 1 + endif + ++ifeq ($(ARCH), sw_64) ++NO_BINARY_MODE = 1 ++BINARY_DEFINED = 1 ++endif ++ + ifeq ($(ARCH), arm) + NO_BINARY_MODE = 1 + BINARY_DEFINED = 1 +diff --git a/Makefile.system.libname b/Makefile.system.libname +deleted file mode 100644 +index 1b84195..0000000 +--- a/Makefile.system.libname ++++ /dev/null +@@ -1,1860 +0,0 @@ +-# +-# Include user definition +-# +- +-# TO suppress recursive includes +-INCLUDED = 1 +- +-ifndef TOPDIR +-TOPDIR = . 
+-endif +- +-ifndef RELAPACK_REPLACE +-RELAPACK_REPLACE=0 +-endif +- +-# we need to use the host system's architecture for getarch compile options even especially when cross-compiling +-HOSTARCH := $(shell uname -m) +-ifeq ($(HOSTARCH), amd64) +-HOSTARCH=x86_64 +-endif +- +-# Catch conflicting usage of ARCH in some BSD environments +-ifeq ($(ARCH), amd64) +-override ARCH=x86_64 +-else ifeq ($(ARCH), powerpc64) +-override ARCH=power +-else ifeq ($(ARCH), powerpc64le) +-override ARCH=power +-else ifeq ($(ARCH), powerpc) +-override ARCH=power +-else ifeq ($(ARCH), i386) +-override ARCH=x86 +-else ifeq ($(ARCH), armv6) +-override ARCH=arm +-else ifeq ($(ARCH), armv7) +-override ARCH=arm +-else ifeq ($(ARCH), aarch64) +-override ARCH=arm64 +-else ifeq ($(ARCH), mipsel) +-override ARCH=mips +-else ifeq ($(ARCH), mips64el) +-override ARCH=mips64 +-else ifeq ($(ARCH), zarch) +-override ARCH=zarch +-endif +- +-NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib +- +-# Default C compiler +-# - Only set if not specified on the command line or inherited from the environment. +-# - CC is an implicit variable so neither '?=' or 'ifndef' can be used. +-# http://stackoverflow.com/questions/4029274/mingw-and-make-variables +-# - Default value is 'cc' which is not always a valid command (e.g. MinGW). +-ifeq ($(origin CC),default) +- +-# Check if $(CC) refers to a valid command and set the value to gcc if not +-ifneq ($(findstring cmd.exe,$(SHELL)),) +-ifeq ($(shell where $(CC) 2>NUL),) +-CC = gcc +-endif +-else # POSIX-ish +-ifeq ($(shell command -v $(CC) 2>/dev/null),) +-ifeq ($(shell uname -s),Darwin) +-CC = clang +-# EXTRALIB += -Wl,-no_compact_unwind +-else +-CC = gcc +-endif # Darwin +-endif # CC exists +-endif # Shell is sane +- +-endif # CC is set to default +- +-# Default Fortran compiler (FC) is selected by f_check. 
+- +-ifndef MAKEFILE_RULE +-include $(TOPDIR)/Makefile.rule +-else +-include $(TOPDIR)/$(MAKEFILE_RULE) +-endif +- +-# +-# Beginning of system configuration +-# +-ifneq ($(BUILD_SINGLE),1) +-ifneq ($(BUILD_DOUBLE),1) +-ifneq ($(BUILD_COMPLEX),1) +-ifneq ($(BUILD_COMPLEX16),1) +-override BUILD_SINGLE=1 +-override BUILD_DOUBLE=1 +-override BUILD_COMPLEX=1 +-override BUILD_COMPLEX16=1 +-endif +-endif +-endif +-endif +- +-ifndef HOSTCC +-HOSTCC = $(CC) +-endif +- +-ifdef TARGET +-GETARCH_FLAGS := -DFORCE_$(TARGET) +-GETARCH_FLAGS += -DUSER_TARGET +-ifeq ($(TARGET), GENERIC) +-ifeq ($(DYNAMIC_ARCH), 1) +-override NO_EXPRECISION=1 +-export NO_EXPRECISION +-endif +-endif +-endif +- +-# Force fallbacks for 32bit +- +-ifeq ($(BINARY), 32) +-ifeq ($(TARGET), HASWELL) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET), SKYLAKEX) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET), COOPERLAKE) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET), SAPPHIRERAPIDS) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET), SANDYBRIDGE) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET), BULLDOZER) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-ifeq ($(TARGET), PILEDRIVER) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-ifeq ($(TARGET), STEAMROLLER) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-ifeq ($(TARGET), EXCAVATOR) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-ifeq ($(TARGET), ZEN) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-ifeq ($(TARGET), ARMV8) +-GETARCH_FLAGS := -DFORCE_ARMV7 +-endif +-ifeq ($(TARGET), POWER8) +-GETARCH_FLAGS := -DFORCE_POWER6 +-endif +-ifeq ($(TARGET), POWER9) +-GETARCH_FLAGS := -DFORCE_POWER6 +-endif +-ifeq ($(TARGET), POWER10) +-GETARCH_FLAGS := -DFORCE_POWER6 +-endif +-endif +- +-#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. +-# +-ifdef TARGET_CORE +-GETARCH_FLAGS := -DFORCE_$(TARGET_CORE) +-endif +- +-# Force fallbacks for 32bit +- +-ifeq ($(BINARY), 32) +-ifeq ($(TARGET_CORE), HASWELL) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET_CORE), SKYLAKEX) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET_CORE), COOPERLAKE) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET_CORE), SANDYBRIDGE) +-GETARCH_FLAGS := -DFORCE_NEHALEM +-endif +-ifeq ($(TARGET_CORE), BULLDOZER) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-ifeq ($(TARGET_CORE), PILEDRIVER) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-ifeq ($(TARGET_CORE), STEAMROLLER) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-ifeq ($(TARGET_CORE), EXCAVATOR) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-ifeq ($(TARGET_CORE), ZEN) +-GETARCH_FLAGS := -DFORCE_BARCELONA +-endif +-endif +- +- +-# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. 
+-ifeq ($(HOSTARCH), x86_64) +-ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),) +-GETARCH_FLAGS += -march=native +-endif +-endif +- +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-GETARCH_FLAGS += -DUSE64BITINT +-endif +-endif +- +-ifndef GEMM_MULTITHREAD_THRESHOLD +-GEMM_MULTITHREAD_THRESHOLD=4 +-endif +-GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) +- +-ifeq ($(NO_AVX), 1) +-GETARCH_FLAGS += -DNO_AVX +-endif +- +-ifeq ($(BINARY), 32) +-GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 +-NO_AVX512 = 1 +-endif +- +-ifeq ($(NO_AVX2), 1) +-GETARCH_FLAGS += -DNO_AVX2 +-endif +- +-ifeq ($(NO_AVX512), 1) +-GETARCH_FLAGS += -DNO_AVX512 +-endif +- +-ifeq ($(DEBUG), 1) +-GETARCH_FLAGS += -g +-endif +- +-ifeq ($(QUIET_MAKE), 1) +-MAKE += -s +-endif +- +-ifndef NO_PARALLEL_MAKE +-NO_PARALLEL_MAKE=0 +-endif +-GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE) +- +-ifdef MAKE_NB_JOBS +-GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS) +-endif +- +-ifeq ($(HOSTCC), loongcc) +-GETARCH_FLAGS += -static +-endif +- +-#if don't use Fortran, it will only compile CBLAS. +-ifeq ($(ONLY_CBLAS), 1) +-NO_LAPACK = 1 +-else +-ONLY_CBLAS = 0 +-endif +- +-#For small matrix optimization +-ifeq ($(ARCH), x86_64) +-SMALL_MATRIX_OPT = 1 +-else ifeq ($(ARCH), power) +-SMALL_MATRIX_OPT = 1 +-BUILD_BFLOAT16 = 1 +-endif +-ifeq ($(SMALL_MATRIX_OPT), 1) +-CCOMMON_OPT += -DSMALL_MATRIX_OPT +-endif +- +-# This operation is expensive, so execution should be once. +-ifndef GOTOBLAS_MAKEFILE +-export GOTOBLAS_MAKEFILE = 1 +- +-# Generating Makefile.conf and config.h +-DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) +- +-endif +- +-ifndef TARGET_CORE +--include $(TOPDIR)/Makefile.conf +-else +-HAVE_NEON= +-HAVE_VFP= +-HAVE_VFPV3= +-HAVE_VFPV4= +-HAVE_MMX= +-HAVE_SSE= +-HAVE_SSE2= +-HAVE_SSE3= +-HAVE_SSSE3= +-HAVE_SSE4_1= +-HAVE_SSE4_2= +-HAVE_SSE4A= +-HAVE_SSE5= +-HAVE_AVX= +-HAVE_AVX2= +-HAVE_FMA3= +-include $(TOPDIR)/Makefile_kernel.conf +-endif +- +- +-ifndef NUM_PARALLEL +-NUM_PARALLEL = 1 +-endif +- +-ifndef NUM_THREADS +-NUM_THREADS = $(NUM_CORES) +-endif +- +-ifeq ($(NUM_THREADS), 1) +-override USE_THREAD = 0 +-override USE_OPENMP = 0 +-endif +- +-ifdef USE_THREAD +-ifeq ($(USE_THREAD), 0) +-SMP = +-else +-SMP = 1 +-endif +-else +-ifeq ($(NUM_THREADS), 1) +-SMP = +-else +-SMP = 1 +-endif +-endif +- +-ifeq ($(SMP), 1) +-USE_LOCKING = +-endif +- +-ifndef NEED_PIC +-NEED_PIC = 1 +-endif +- +-ARFLAGS = +-CPP = $(COMPILER) -E +-AR ?= $(CROSS_SUFFIX)ar +-AS ?= $(CROSS_SUFFIX)as +-LD ?= $(CROSS_SUFFIX)ld +-RANLIB ?= $(CROSS_SUFFIX)ranlib +-NM = $(CROSS_SUFFIX)nm +-DLLWRAP = $(CROSS_SUFFIX)dllwrap +-OBJCOPY = $(CROSS_SUFFIX)objcopy +-OBJCONV = $(CROSS_SUFFIX)objconv +- +- +-# When fortran support was either not detected or actively deselected, only build BLAS. 
+-ifeq ($(NOFORTRAN), 1) +-C_LAPACK = 1 +-override FEXTRALIB = +-endif +- +-ifeq ($(C_COMPILER), GCC) +-GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +-GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +-GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) +-GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +-GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) +-GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +-GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) +-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) +-# Note that the behavior of -dumpversion is compile-time-configurable for +-# gcc-7.x and newer. Use -dumpfullversion there +-ifeq ($(GCCVERSIONGTEQ7),1) +- GCCDUMPVERSION_PARAM := -dumpfullversion +-else +- GCCDUMPVERSION_PARAM := -dumpversion +-endif +-GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) +-GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +-GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) +-GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) +-endif +- +-ifeq ($(C_COMPILER), CLANG) +-CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +-CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) +-endif +- +-# +-# OS dependent settings +-# +- +-ifeq ($(OSNAME), Darwin) +-ifndef MACOSX_DEPLOYMENT_TARGET +-ifeq ($(ARCH), arm64) +-export MACOSX_DEPLOYMENT_TARGET=11.0 +-ifeq ($(C_COMPILER), GCC) +-export NO_SVE = 1 +-endif +-else +-export MACOSX_DEPLOYMENT_TARGET=10.8 +-endif +-endif +-MD5SUM = md5 -r +-XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) +-ifeq (x$(XCVER)x,xx) +-XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) +-endif +-ifeq (x$(XCVER), x 15) +-CCOMMON_OPT += -Wl,-ld_classic +-endif +-endif +- +-ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) +-MD5SUM = md5 -r +-endif +- +-ifeq ($(OSNAME), NetBSD) +-MD5SUM = md5 -n +-endif +- +-ifeq ($(OSNAME), Linux) +-EXTRALIB += -lm +-NO_EXPRECISION = 1 +-endif +- +-ifeq ($(OSNAME), Android) +-EXTRALIB += -lm +-endif +- +-ifeq ($(OSNAME), AIX) +-EXTRALIB += -lm +-endif +- +-ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) +-ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) +-EXTRALIB += -lm +-endif +-endif +- +-ifeq ($(OSNAME), WINNT) +-NEED_PIC = 0 +-NO_EXPRECISION = 1 +- +-EXTRALIB += -defaultlib:advapi32 +- +-SUFFIX = obj +-PSUFFIX = pobj +-LIBSUFFIX = a +- +-ifeq ($(C_COMPILER), CLANG) +-CCOMMON_OPT += -DMS_ABI +-endif +- +-#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) +-ifeq ($(GCCVERSIONGT4), 1) +-# GCC Major version > 4 +-# It is compatible with MSVC ABI. +-CCOMMON_OPT += -DMS_ABI +-endif +- +-ifeq ($(GCCVERSIONGTEQ4), 1) +-ifeq ($(GCCMINORVERSIONGTEQ7), 1) +-# GCC Version >=4.7 +-# It is compatible with MSVC ABI. 
+-CCOMMON_OPT += -DMS_ABI +-endif +-endif +- +-# Ensure the correct stack alignment on Win32 +-# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 +-ifeq ($(ARCH), x86) +-CCOMMON_OPT += -mincoming-stack-boundary=2 +-FCOMMON_OPT += -mincoming-stack-boundary=2 +-endif +- +-endif +- +-ifeq ($(OSNAME), Interix) +-NEED_PIC = 0 +-NO_EXPRECISION = 1 +- +-INTERIX_TOOL_DIR = /opt/gcc.3.3/i586-pc-interix3/bin +-endif +- +-ifeq ($(OSNAME), CYGWIN_NT) +-NEED_PIC = 0 +-NO_EXPRECISION = 1 +-OS_CYGWIN_NT = 1 +-endif +- +-ifneq ($(OSNAME), WINNT) +-ifneq ($(OSNAME), CYGWIN_NT) +-ifneq ($(OSNAME), Interix) +-ifneq ($(OSNAME), Android) +-ifdef SMP +-EXTRALIB += -lpthread +-endif +-endif +-endif +-endif +-endif +- +-# ifeq logical or +-ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix)) +-OS_WINDOWS=1 +-endif +- +-ifdef QUAD_PRECISION +-CCOMMON_OPT += -DQUAD_PRECISION +-NO_EXPRECISION = 1 +-endif +- +-ifneq ($(ARCH), x86) +-ifneq ($(ARCH), x86_64) +-NO_EXPRECISION = 1 +-endif +-endif +- +-ifdef UTEST_CHECK +-CCOMMON_OPT += -DUTEST_CHECK +-SANITY_CHECK = 1 +-endif +- +-ifdef SANITY_CHECK +-CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) +-endif +- +-MAX_STACK_ALLOC ?= 2048 +-ifneq ($(MAX_STACK_ALLOC), 0) +-CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) +-endif +- +-ifdef USE_LOCKING +-ifneq ($(USE_LOCKING), 0) +-CCOMMON_OPT += -DUSE_LOCKING +-endif +-endif +- +-# +-# Architecture dependent settings +-# +- +-ifeq ($(ARCH), x86) +-ifndef BINARY +-NO_BINARY_MODE = 1 +-endif +- +-ifeq ($(CORE), generic) +-NO_EXPRECISION = 1 +-endif +- +-ifndef NO_EXPRECISION +-ifeq ($(F_COMPILER), GFORTRAN) +-# ifeq logical or. GCC or LSB +-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) +-EXPRECISION = 1 +-CCOMMON_OPT += -DEXPRECISION -m128bit-long-double +-FCOMMON_OPT += -m128bit-long-double +-endif +-ifeq ($(C_COMPILER), CLANG) +-EXPRECISION = 1 +-CCOMMON_OPT += -DEXPRECISION +-FCOMMON_OPT += -m128bit-long-double +-endif +-endif +-endif +-endif +- +-ifeq ($(ARCH), x86_64) +- +-ifeq ($(CORE), generic) +-NO_EXPRECISION = 1 +-endif +- +-ifndef NO_EXPRECISION +-ifeq ($(F_COMPILER), GFORTRAN) +-# ifeq logical or. GCC or LSB +-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) +-EXPRECISION = 1 +-CCOMMON_OPT += -DEXPRECISION -m128bit-long-double +-FCOMMON_OPT += -m128bit-long-double +-endif +-ifeq ($(C_COMPILER), CLANG) +-EXPRECISION = 1 +-CCOMMON_OPT += -DEXPRECISION +-FCOMMON_OPT += -m128bit-long-double +-endif +-endif +-endif +-endif +- +-ifeq ($(C_COMPILER), INTEL) +-CCOMMON_OPT += -wd981 +-endif +- +- +-ifeq ($(USE_OPENMP), 1) +- +-#check +-ifeq ($(USE_THREAD), 0) +-$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.) +-endif +- +-# ifeq logical or. 
GCC or LSB +-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) +-CCOMMON_OPT += -fopenmp +-endif +- +-ifeq ($(C_COMPILER), CLANG) +-CCOMMON_OPT += -fopenmp +-ifeq ($(F_COMPILER), GFORTRAN) +-FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB)) +-endif +-endif +- +-ifeq ($(C_COMPILER), INTEL) +-CCOMMON_OPT += -fopenmp +-endif +- +-ifeq ($(C_COMPILER), PGI) +-CCOMMON_OPT += -mp +-endif +- +-ifeq ($(C_COMPILER), OPEN64) +-CCOMMON_OPT += -mp +-CEXTRALIB += -lstdc++ +-endif +- +-ifeq ($(C_COMPILER), PATHSCALE) +-CCOMMON_OPT += -mp +-endif +-endif +- +- +-ifeq ($(DYNAMIC_ARCH), 1) +-ifeq ($(ARCH), x86) +-DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ +- CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO +-endif +- +-ifeq ($(ARCH), x86_64) +-DYNAMIC_CORE = PRESCOTT CORE2 +-ifeq ($(DYNAMIC_OLDER), 1) +-DYNAMIC_CORE += PENRYN DUNNINGTON +-endif +-DYNAMIC_CORE += NEHALEM +-ifeq ($(DYNAMIC_OLDER), 1) +-DYNAMIC_CORE += OPTERON OPTERON_SSE3 +-endif +-DYNAMIC_CORE += BARCELONA +-ifeq ($(DYNAMIC_OLDER), 1) +-DYNAMIC_CORE += BOBCAT ATOM NANO +-endif +-ifneq ($(NO_AVX), 1) +-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR +-endif +-ifneq ($(NO_AVX2), 1) +-DYNAMIC_CORE += HASWELL ZEN +-endif +-ifneq ($(NO_AVX512), 1) +-ifneq ($(NO_AVX2), 1) +-DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS +-endif +-endif +-endif +- +-ifdef DYNAMIC_LIST +-override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST) +-XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT +-XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +-CCOMMON_OPT += $(XCCOMMON_OPT) +-#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' +-endif +- +-ifeq ($(ARCH), arm64) +-DYNAMIC_CORE = ARMV8 +-DYNAMIC_CORE += CORTEXA53 +-DYNAMIC_CORE += CORTEXA57 +-DYNAMIC_CORE += CORTEXA72 +-DYNAMIC_CORE += CORTEXA73 +-DYNAMIC_CORE += NEOVERSEN1 +-ifneq ($(NO_SVE), 1) +-DYNAMIC_CORE += NEOVERSEV1 +-DYNAMIC_CORE += NEOVERSEN2 +-DYNAMIC_CORE += ARMV8SVE +-endif +-DYNAMIC_CORE += CORTEXA55 +-DYNAMIC_CORE += FALKOR +-DYNAMIC_CORE += THUNDERX +-DYNAMIC_CORE += THUNDERX2T99 +-DYNAMIC_CORE += TSV110 +-DYNAMIC_CORE += EMAG8180 +-DYNAMIC_CORE += THUNDERX3T110 +-ifdef DYNAMIC_LIST +-override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) +-XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 +-XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +-endif +-endif +- +-ifeq ($(ARCH), mips64) +-DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 MIPS64_GENERIC +-ifdef DYNAMIC_LIST +-override DYNAMIC_CORE = MIPS64_GENERIC $(DYNAMIC_LIST) +-XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_MIPS64_GENERIC +-XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +-endif +-endif +- +-ifeq ($(ARCH), loongarch64) +-DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC +-endif +- +-ifeq ($(ARCH), zarch) +-DYNAMIC_CORE = ZARCH_GENERIC +- +-# if the compiler accepts -march=arch11 or -march=z13 and can compile a file +-# with z13-specific inline assembly, then we can include support for Z13. +-# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases +-# only support one or the other. +-# note: LLVM version 6.x supported -march=z13 yet could not handle vector +-# registers in inline assembly, so the check for supporting the -march flag is +-# not enough. 
+-ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null +-ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) +-ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) +- +-ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) +-DYNAMIC_CORE += Z13 +-CCOMMON_OPT += -DDYN_Z13 +-else +-$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) +-endif +- +-# as above for z13, check for -march=arch12 and z14 support in the compiler. +-ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) +-ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) +-ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) +-DYNAMIC_CORE += Z14 +-CCOMMON_OPT += -DDYN_Z14 +-else +-$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) +-endif +- +-endif # ARCH zarch +- +-ifeq ($(ARCH), power) +-ifneq ($(C_COMPILER), PGI) +-DYNAMIC_CORE = POWER6 +-DYNAMIC_CORE += POWER8 +-ifneq ($(C_COMPILER), GCC) +-DYNAMIC_CORE += POWER9 +-DYNAMIC_CORE += POWER10 +-CCOMMON_OPT += -DHAVE_P10_SUPPORT +-endif +-ifeq ($(C_COMPILER), GCC) +-ifeq ($(GCCVERSIONGT5), 1) +-DYNAMIC_CORE += POWER9 +-else +-$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) +-endif +-ifeq ($(OSNAME), AIX) +-LDVERSIONGTEQ35 := 1 +-else +-LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) +-endif +-ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) +-DYNAMIC_CORE += POWER10 +-CCOMMON_OPT += -DHAVE_P10_SUPPORT +-else ifeq ($(GCCVERSIONGTEQ10), 1) +-ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11) +-DYNAMIC_CORE += POWER10 +-CCOMMON_OPT += -DHAVE_P10_SUPPORT +-endif +-else +-$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) +-endif +-endif +-else +-DYNAMIC_CORE = POWER8 +-DYNAMIC_CORE += POWER9 +-endif +-endif +- +-# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty +-ifndef DYNAMIC_CORE +-override DYNAMIC_ARCH= +-endif +-endif +- +-ifeq ($(ARCH), ia64) +-NO_BINARY_MODE = 1 +-BINARY_DEFINED = 1 +- +-ifeq ($(F_COMPILER), GFORTRAN) +-ifeq ($(C_COMPILER), GCC) +-# EXPRECISION = 1 +-# CCOMMON_OPT += -DEXPRECISION +-endif +-endif +-endif +- +-ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) +-NO_BINARY_MODE = 1 +-endif +- +-ifeq ($(ARCH), alpha) +-NO_BINARY_MODE = 1 +-BINARY_DEFINED = 1 +-endif +- +-ifeq ($(ARCH), arm) +-NO_BINARY_MODE = 1 +-BINARY_DEFINED = 1 +- +-CCOMMON_OPT += -marm +-FCOMMON_OPT += -marm +- +-# If softfp abi is mentioned on the command line, force it. 
+-ifeq ($(ARM_SOFTFP_ABI), 1) +-CCOMMON_OPT += -mfloat-abi=softfp +-FCOMMON_OPT += -mfloat-abi=softfp +-endif +- +-ifeq ($(OSNAME), Android) +-ifeq ($(ARM_SOFTFP_ABI), 1) +-EXTRALIB += -lm +-else +-EXTRALIB += -Wl,-lm_hard +-endif +-endif +-endif +- +-ifeq ($(ARCH), arm64) +-NO_BINARY_MODE = 1 +-BINARY_DEFINED = 1 +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-ifeq ($(F_COMPILER), GFORTRAN) +-FCOMMON_OPT += -fdefault-integer-8 +-endif +-ifeq ($(F_COMPILER), FLANG) +-FCOMMON_OPT += -i8 +-endif +-endif +-endif +-endif +- +-ifeq ($(ARCH), riscv64) +-NO_BINARY_MODE = 1 +-BINARY_DEFINED = 1 +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-ifeq ($(F_COMPILER), GFORTRAN) +-FCOMMON_OPT += -fdefault-integer-8 +-endif +-ifeq ($(F_COMPILER), FLANG) +-FCOMMON_OPT += -i8 +-endif +-endif +-endif +-endif +- +-ifeq ($(ARCH), loongarch64) +-NO_BINARY_MODE = 1 +-BINARY_DEFINED = 1 +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-ifeq ($(F_COMPILER), GFORTRAN) +-FCOMMON_OPT += -fdefault-integer-8 +-endif +-ifeq ($(F_COMPILER), FLANG) +-FCOMMON_OPT += -i8 +-endif +-endif +-endif +-endif +- +-# +-# C Compiler dependent settings +-# +- +- +-# ifeq logical or. GCC or CLANG or LSB +-# http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or +-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB)) +-CCOMMON_OPT += -Wall +-COMMON_PROF += -fno-inline +-NO_UNINITIALIZED_WARN = -Wno-uninitialized +- +-ifeq ($(QUIET_MAKE), 1) +-CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused +-endif +- +-ifdef NO_BINARY_MODE +- +-ifeq ($(ARCH), $(filter $(ARCH),mips64)) +-ifdef BINARY64 +-CCOMMON_OPT += -mabi=64 +-else +-CCOMMON_OPT += -mabi=n32 +-endif +-BINARY_DEFINED = 1 +-else ifeq ($(ARCH), $(filter $(ARCH),mips)) +-CCOMMON_OPT += -mabi=32 +-BINARY_DEFINED = 1 +-endif +- +-ifneq (, $(filter $(CORE), MIPS64_GENERIC)) +-CCOMMON_OPT += -DNO_MSA +-FCOMMON_OPT += -DNO_MSA +-endif +- +-ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) +-CCOMMON_OPT += -march=loongson3a +-FCOMMON_OPT += -march=loongson3a +-endif +- +-ifeq ($(CORE), MIPS24K) +-CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) +-FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) +-endif +- +-ifeq ($(CORE), MIPS1004K) +-CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) +-FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) +-endif +- +-ifeq ($(CORE), P5600) +-CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +-FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +-endif +- +-ifeq ($(CORE), I6400) +-CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) +-FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) +-endif +- +-ifeq ($(CORE), P6600) +-CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) +-FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) +-endif +- +-ifeq ($(CORE), I6500) +-CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) +-FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) +-endif +- +-ifeq ($(OSNAME), AIX) +-BINARY_DEFINED = 1 +-endif +- +-ifeq ($(ARCH), loongarch64) +-LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) +-ifneq ($(LA64_ABI), lp64d) +-LA64_ABI=lp64 +-endif +-CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) +-FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) +-endif +- +-endif +- +-ifndef BINARY_DEFINED +-ifneq ($(OSNAME), AIX) +-ifdef BINARY64 +-ifneq ($(ARCH), riscv64) +-CCOMMON_OPT += -m64 +-endif +-else +-CCOMMON_OPT += -m32 +-endif +-endif +-endif +- +-endif +- +-ifeq ($(C_COMPILER), PGI) 
+-PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) +-PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20) +-PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) +-PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) +-ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) +-NEWPGI := 1 +-PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) +-PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) +-PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) +-ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) +-NEWPGI2 := 1 +-endif +-endif +-ifdef BINARY64 +-ifeq ($(ARCH), x86_64) +-ifeq (,$(findstring tp,$(CFLAGS))) +-ifneq ($(NEWPGI2),1) +-CCOMMON_OPT += -tp p7-64 +-else +-CCOMMON_OPT += -tp px +-endif +-endif +-ifneq ($(NEWPGI),1) +-CCOMMON_OPT += -D__MMX__ -Mnollvm +-endif +-else +-ifeq ($(ARCH), power) +-ifeq (,$(findstring tp,$(CFLAGS))) +-ifeq ($(CORE), POWER8) +-CCOMMON_OPT += -tp pwr8 +-endif +-ifeq ($(CORE), POWER9) +-CCOMMON_OPT += -tp pwr9 +-endif +-endif +-endif +-endif +-else +-ifneq ($(NEWPGI2),1) +-ifeq (,$(findstring tp,$(CFLAGS))) +-CCOMMON_OPT += -tp p7 +-else +-CCOMMON_OPT += -tp px +-endif +-endif +-endif +-endif +- +-ifeq ($(C_COMPILER), PATHSCALE) +-ifdef BINARY64 +-CCOMMON_OPT += -m64 +-else +-CCOMMON_OPT += -m32 +-endif +-endif +- +-# +-# Fortran Compiler dependent settings +-# +- +-ifeq ($(F_COMPILER), NAG) +-FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-FCOMMON_OPT += -i8 +-endif +-endif +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -openmp +-endif +-endif +- +-ifeq ($(F_COMPILER), FLANG) +-CCOMMON_OPT += -DF_INTERFACE_FLANG +-FCOMMON_OPT += -Mrecursive -Kieee +-ifeq ($(OSNAME), Linux) +-ifeq ($(ARCH), x86_64) +-FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") +-ifeq ($(FLANG_VENDOR), AMD) +-FCOMMON_OPT += -fno-unroll-loops +-endif +-endif +-endif +-ifdef BINARY64 +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-FCOMMON_OPT += -i8 +-endif +-endif +-FCOMMON_OPT += -Wall +-else +-FCOMMON_OPT += -Wall +-endif +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -fopenmp +-endif +-endif +- +-ifeq ($(F_COMPILER), G77) +-CCOMMON_OPT += -DF_INTERFACE_G77 +-FCOMMON_OPT += -Wall +-ifndef NO_BINARY_MODE +-ifneq ($(OSNAME), AIX) +-ifdef BINARY64 +-FCOMMON_OPT += -m64 +-else +-FCOMMON_OPT += -m32 +-endif +-endif +-endif +-endif +- +-ifeq ($(F_COMPILER), G95) +-CCOMMON_OPT += -DF_INTERFACE_G95 +-FCOMMON_OPT += -Wall +-ifneq ($(OSNAME), AIX) +-ifndef NO_BINARY_MODE +-ifdef BINARY64 +-FCOMMON_OPT += -m64 +-else +-FCOMMON_OPT += -m32 +-endif +-endif +-ifneq ($(NO_LAPACKE), 1) +-FCOMMON_OPT += -fno-second-underscore +-endif +-endif +-endif +- +-ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) +-CCOMMON_OPT += -DF_INTERFACE_GFORT +-ifeq ($(F_COMPILER), GFORTRAN) +-FCOMMON_OPT += -Wall +-# make single-threaded LAPACK calls thread-safe #1847 +-FCOMMON_OPT += -frecursive +-# work around ABI problem with passing single-character arguments +-FCOMMON_OPT += -fno-optimize-sibling-calls +-#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc +-ifneq ($(NOFORTRAN), 1) +-ifneq ($(NOFORTRAN), 2) +-ifneq ($(NO_LAPACK), 1) +-EXTRALIB += 
-lgfortran +-endif +-endif +-endif +-endif +-ifdef NO_BINARY_MODE +-ifeq ($(ARCH), $(filter $(ARCH),mips64)) +-ifdef BINARY64 +-FCOMMON_OPT += -mabi=64 +-else +-FCOMMON_OPT += -mabi=n32 +-endif +-else ifeq ($(ARCH), $(filter $(ARCH),mips)) +-FCOMMON_OPT += -mabi=32 +-endif +-else +-ifdef BINARY64 +-ifneq ($(OSNAME), AIX) +-ifneq ($(ARCH), riscv64) +-FCOMMON_OPT += -m64 +-endif +-endif +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-FCOMMON_OPT += -fdefault-integer-8 +-endif +-endif +-else +-ifneq ($(OSNAME), AIX) +-FCOMMON_OPT += -m32 +-endif +-endif +-endif +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -fopenmp +-endif +-endif +- +-ifeq ($(F_COMPILER), INTEL) +-CCOMMON_OPT += -DF_INTERFACE_INTEL +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-FCOMMON_OPT += -i8 +-endif +-endif +-FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -fopenmp +-endif +-endif +- +-ifeq ($(F_COMPILER), FUJITSU) +-CCOMMON_OPT += -DF_INTERFACE_FUJITSU +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -openmp +-endif +-endif +- +-ifeq ($(F_COMPILER), IBM) +-CCOMMON_OPT += -DF_INTERFACE_IBM +-FEXTRALIB += -lxlf90 +-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG)) +-FCOMMON_OPT += -qextname +-endif +-# FCOMMON_OPT += -qarch=440 +-ifdef BINARY64 +-FCOMMON_OPT += -q64 +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-FCOMMON_OPT += -qintsize=8 +-endif +-endif +-else +-FCOMMON_OPT += -q32 +-endif +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -openmp +-endif +-endif +- +-ifeq ($(F_COMPILER), PGI) +-CCOMMON_OPT += -DF_INTERFACE_PGI +-COMMON_PROF += -DPGICOMPILER +-ifdef BINARY64 +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-FCOMMON_OPT += -i8 +-endif +-endif +-ifeq ($(ARCH), x86_64) +-ifneq ($(NEWPGI2),1) +-FCOMMON_OPT += -tp p7-64 +-else +-FCOMMON_OPT += -tp px +-endif +-else +-ifeq ($(ARCH), power) +-ifeq ($(CORE), POWER6) +-$(warning NVIDIA HPC compilers do not support POWER6.) +-endif +-ifeq ($(CORE), POWER8) +-FCOMMON_OPT += -tp pwr8 +-endif +-ifeq ($(CORE), POWER9) +-FCOMMON_OPT += -tp pwr9 +-endif +-ifeq ($(CORE), POWER10) +-$(warning NVIDIA HPC compilers do not support POWER10.) 
+-endif +-endif +-endif +-else +-FCOMMON_OPT += -tp p7 +-endif +-FCOMMON_OPT += -Mrecursive -Kieee +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -mp +-endif +-endif +- +-ifeq ($(F_COMPILER), PATHSCALE) +-CCOMMON_OPT += -DF_INTERFACE_PATHSCALE +-ifdef BINARY64 +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-FCOMMON_OPT += -i8 +-endif +-endif +-endif +- +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -mp +-endif +-endif +- +-ifeq ($(F_COMPILER), OPEN64) +-CCOMMON_OPT += -DF_INTERFACE_OPEN64 +-ifdef BINARY64 +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-FCOMMON_OPT += -i8 +-endif +-endif +-endif +-ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) +-ifndef BINARY64 +-FCOMMON_OPT += -n32 +-else +-FCOMMON_OPT += -n64 +-endif +-ifeq ($(CORE), LOONGSON3R3) +-FCOMMON_OPT += -loongson3 -static +-endif +-ifeq ($(CORE), LOONGSON3R4) +-FCOMMON_OPT += -loongson3 -static +-endif +-else +-ifndef BINARY64 +-FCOMMON_OPT += -m32 +-else +-FCOMMON_OPT += -m64 +-endif +-endif +-ifeq ($(USE_OPENMP), 1) +-FEXTRALIB += -lstdc++ +-FCOMMON_OPT += -mp +-endif +-endif +- +-ifeq ($(C_COMPILER), OPEN64) +-ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) +-ifndef BINARY64 +-CCOMMON_OPT += -n32 +-else +-CCOMMON_OPT += -n64 +-endif +-ifeq ($(CORE), LOONGSON3R3) +-CCOMMON_OPT += -loongson3 -static +-endif +-ifeq ($(CORE), LOONGSON3R4) +-CCOMMON_OPT += -loongson3 -static +-endif +-else +-ifndef BINARY64 +-CCOMMON_OPT += -m32 +-else +-CCOMMON_OPT += -m64 +-endif +-endif +-endif +- +-ifeq ($(C_COMPILER), SUN) +-CCOMMON_OPT += -w +-ifeq ($(ARCH), x86) +-CCOMMON_OPT += -m32 +-else +-ifdef BINARY64 +-CCOMMON_OPT += -m64 +-else +-CCOMMON_OPT += -m32 +-endif +-endif +-endif +- +-ifeq ($(F_COMPILER), SUN) +-CCOMMON_OPT += -DF_INTERFACE_SUN +-FCOMMON_OPT += -ftrap=%none -xrecursive +-ifeq ($(ARCH), x86) +-FCOMMON_OPT += -m32 +-else +-ifdef BINARY64 +-FCOMMON_OPT += -m64 +-else +-FCOMMON_OPT += -m32 +-endif +-endif +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -xopenmp=parallel +-endif +-endif +- +-ifeq ($(F_COMPILER), COMPAQ) +-CCOMMON_OPT += -DF_INTERFACE_COMPAQ +-ifeq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -openmp +-endif +-endif +- +-ifeq ($(F_COMPILER), CRAY) +-CCOMMON_OPT += -DF_INTERFACE_INTEL +-FCOMMON_OPT += -hnopattern +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-FCOMMON_OPT += -s integer64 +-endif +-endif +-ifneq ($(USE_OPENMP), 1) +-FCOMMON_OPT += -O noomp +-endif +-endif +- +-ifdef BINARY64 +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-CCOMMON_OPT += +-#-DUSE64BITINT +-endif +-endif +-endif +- +-ifeq ($(NEED_PIC), 1) +-ifeq ($(C_COMPILER), IBM) +-CCOMMON_OPT += -qpic=large +-else +-CCOMMON_OPT += -fPIC +-endif +-ifeq ($(F_COMPILER), SUN) +-FCOMMON_OPT += -pic +-else ifeq ($(F_COMPILER), NAG) +-FCOMMON_OPT += -PIC +-else ifeq ($(F_COMPILER), IBM) +-FCOMMON_OPT += -qpic=large +-else +-FCOMMON_OPT += -fPIC +-endif +-endif +- +-ifeq ($(DYNAMIC_ARCH), 1) +-CCOMMON_OPT += -DDYNAMIC_ARCH +-endif +- +-ifeq ($(DYNAMIC_OLDER), 1) +-CCOMMON_OPT += -DDYNAMIC_OLDER +-endif +- +-ifeq ($(C_LAPACK), 1) +-CCOMMON_OPT += -DC_LAPACK +-endif +- +-ifeq ($(NO_LAPACK), 1) +-CCOMMON_OPT += -DNO_LAPACK +-#Disable LAPACK C interface +-NO_LAPACKE = 1 +-endif +- +-ifeq ($(NO_LAPACKE), 1) +-CCOMMON_OPT += -DNO_LAPACKE +-endif +- +-ifeq ($(NO_AVX), 1) +-CCOMMON_OPT += -DNO_AVX +-endif +- +-ifeq ($(ARCH), x86) +-CCOMMON_OPT += -DNO_AVX +-endif +- +-ifeq ($(NO_AVX2), 1) +-CCOMMON_OPT += -DNO_AVX2 +-endif +- +-ifeq ($(NO_AVX512), 1) +-CCOMMON_OPT += -DNO_AVX512 +-endif +- +-ifeq ($(NO_SVE), 1) +-CCOMMON_OPT += -DNO_SVE +-endif +- +-ifdef SMP +-CCOMMON_OPT += 
-DSMP_SERVER +- +-ifeq ($(ARCH), mips64) +-USE_SIMPLE_THREADED_LEVEL3 = 1 +-endif +- +-ifeq ($(USE_OPENMP), 1) +-# USE_SIMPLE_THREADED_LEVEL3 = 1 +-# NO_AFFINITY = 1 +-CCOMMON_OPT += -DUSE_OPENMP +-endif +- +-ifeq ($(BIGNUMA), 1) +-CCOMMON_OPT += -DBIGNUMA +-endif +- +-endif +- +-ifeq ($(NO_WARMUP), 1) +-CCOMMON_OPT += -DNO_WARMUP +-endif +- +-ifeq ($(CONSISTENT_FPCSR), 1) +-CCOMMON_OPT += -DCONSISTENT_FPCSR +-endif +- +-# Only for development +-# CCOMMON_OPT += -DPARAMTEST +-# CCOMMON_OPT += -DPREFETCHTEST +-# CCOMMON_OPT += -DNO_SWITCHING +-# USE_PAPI = 1 +- +-ifdef USE_PAPI +-CCOMMON_OPT += -DUSE_PAPI +-EXTRALIB += -lpapi -lperfctr +-endif +- +-ifdef BUFFERSIZE +-CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) +-endif +- +-ifdef DYNAMIC_THREADS +-CCOMMON_OPT += -DDYNAMIC_THREADS +-endif +- +-CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) +- +-CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) +- +-ifdef USE_SIMPLE_THREADED_LEVEL3 +-CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 +-endif +- +-ifeq ($(USE_TLS), 1) +-CCOMMON_OPT += -DUSE_TLS +-endif +- +-ifeq ($(BUILD_BFLOAT16), 1) +-CCOMMON_OPT += -DBUILD_BFLOAT16 +-endif +-ifeq ($(BUILD_SINGLE), 1) +-CCOMMON_OPT += -DBUILD_SINGLE=1 +-endif +-ifeq ($(BUILD_DOUBLE), 1) +-CCOMMON_OPT += -DBUILD_DOUBLE=1 +-endif +-ifeq ($(BUILD_COMPLEX), 1) +-CCOMMON_OPT += -DBUILD_COMPLEX=1 +-endif +-ifeq ($(BUILD_COMPLEX16), 1) +-CCOMMON_OPT += -DBUILD_COMPLEX16=1 +-endif +- +-CCOMMON_OPT += -DVERSION=\"$(VERSION)\" +- +-ifndef SYMBOLPREFIX +-SYMBOLPREFIX = +-endif +- +-ifndef SYMBOLSUFFIX +-SYMBOLSUFFIX = +-endif +- +-ifndef LIBSONAMEBASE +-LIBSONAMEBASE = openblas +-endif +- +-ifndef LIBNAMESUFFIX +-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) +-else +-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) +-endif +- +-ifeq ($(OSNAME), CYGWIN_NT) +-LIBPREFIX = cyg$(LIBNAMEBASE) +-else +-LIBPREFIX = lib$(LIBNAMEBASE) +-endif +- +-KERNELDIR = $(TOPDIR)/kernel/$(ARCH) +- +-include $(TOPDIR)/Makefile.$(ARCH) +- +-ifneq ($(C_COMPILER), PGI) +-ifneq ($(C_COMPILER), SUN) +-CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME +-endif +-endif +-CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" +- +-ifeq ($(CORE), PPC440) +-CCOMMON_OPT += -DALLOC_QALLOC +-endif +- +-ifeq ($(CORE), PPC440FP2) +-STATIC_ALLOCATION = 1 +-endif +- +-ifneq ($(OSNAME), Linux) +-NO_AFFINITY = 1 +-endif +- +-ifneq ($(ARCH), x86_64) +-ifneq ($(ARCH), x86) +-NO_AFFINITY = 1 +-endif +-endif +- +-ifdef NO_AFFINITY +-ifeq ($(NO_AFFINITY), 0) +-override undefine NO_AFFINITY +-else +-CCOMMON_OPT += -DNO_AFFINITY +-endif +-endif +- +-ifdef FUNCTION_PROFILE +-CCOMMON_OPT += -DFUNCTION_PROFILE +-endif +- +-ifdef HUGETLB_ALLOCATION +-CCOMMON_OPT += -DALLOC_HUGETLB +-endif +- +-ifdef HUGETLBFILE_ALLOCATION +-CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION) +-endif +- +-ifdef STATIC_ALLOCATION +-CCOMMON_OPT += -DALLOC_STATIC +-endif +- +-ifdef DEVICEDRIVER_ALLOCATION +-CCOMMON_OPT += -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\" +-endif +- +-ifdef MIXED_MEMORY_ALLOCATION +-CCOMMON_OPT += -DMIXED_MEMORY_ALLOCATION +-endif +- +-ifeq ($(OSNAME), SunOS) +-TAR = gtar +-PATCH = gpatch +-GREP = ggrep +-AWK = nawk +-else +-TAR = tar +-PATCH = patch +-GREP = grep +-AWK = awk +-endif +- +-ifndef MD5SUM +-MD5SUM = md5sum +-endif +- +- +-REVISION = -r$(VERSION) +-MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) +- +-ifeq 
($(DEBUG), 1) +-COMMON_OPT += -g +-endif +- +-ifeq ($(DEBUG), 1) +-FCOMMON_OPT += -g +-endif +- +-ifndef COMMON_OPT +-COMMON_OPT = -O2 +-endif +- +-ifndef FCOMMON_OPT +-FCOMMON_OPT = -O2 -frecursive +-endif +- +-override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) +-override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) +-override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) +-override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) +-#MAKEOVERRIDES = +- +-ifeq ($(NEED_PIC), 1) +-ifeq (,$(findstring PIC,$(FFLAGS))) +-ifneq ($(F_COMPILER),IBM) +-override FFLAGS += -fPIC +-endif +-endif +-endif +- +-#For LAPACK Fortran codes. +-#Disable -fopenmp for LAPACK Fortran codes on Windows. +-ifdef OS_WINDOWS +-LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS)) +-LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS)) +-else +-LAPACK_FFLAGS := $(FFLAGS) +-LAPACK_FPFLAGS := $(FPFLAGS) +-endif +- +-ifeq ($(F_COMPILER),NAG) +-LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +-override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +-endif +-ifeq ($(F_COMPILER),CRAY) +-LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +-override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +-endif +- +-LAPACK_CFLAGS = $(CFLAGS) +-LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +-LAPACK_CFLAGS += -DLAPACK_ILP64 +-endif +-endif +- +-ifdef OS_WINDOWS +-LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS +-LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE +-endif +-ifeq ($(C_COMPILER), LSB) +-LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE +-endif +- +-ifndef SUFFIX +-SUFFIX = o +-endif +- +-ifndef PSUFFIX +-PSUFFIX = po +-endif +- +-ifndef LIBSUFFIX +-LIBSUFFIX = a +-endif +- +-ifneq ($(DYNAMIC_ARCH), 1) +-ifndef SMP +-LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) +-LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) +-else +-LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX) +-LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX) +-endif +-else +-ifndef SMP +-LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) +-LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) +-else +-LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX) +-LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) +-endif +-endif +- +- +-LIBDLLNAME = $(LIBPREFIX).dll +-IMPLIBNAME = lib$(LIBNAMEBASE).dll.a +-ifneq ($(OSNAME), AIX) +-LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) +-else +-LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) +-endif +-LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) +-LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) +-LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) +-LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) +- +-LIBS = $(TOPDIR)/$(LIBNAME) +-LIBS_P = $(TOPDIR)/$(LIBNAME_P) +- +- +-LIB_COMPONENTS = BLAS +-ifneq ($(NO_CBLAS), 1) +-LIB_COMPONENTS += CBLAS +-endif +- +-ifneq ($(NO_LAPACK), 1) +-LIB_COMPONENTS += LAPACK +-ifneq ($(NO_LAPACKE), 1) +-LIB_COMPONENTS += LAPACKE +-endif +-ifeq ($(BUILD_RELAPACK), 1) +-LIB_COMPONENTS += ReLAPACK +-endif +-endif +- +-ifeq ($(ONLY_CBLAS), 1) +-LIB_COMPONENTS = CBLAS +-endif +- +-export OSNAME +-export ARCH +-export CORE +-export LIBCORE +-export __BYTE_ORDER__ +-export ELF_VERSION +-export PGCPATH +-export CONFIG +-export CC +-export FC +-export BU +-export FU +-export NEED2UNDERSCORES +-export USE_THREAD +-export NUM_THREADS 
+-export NUM_CORES +-export SMP +-export MAKEFILE_RULE +-export NEED_PIC +-export BINARY +-export BINARY32 +-export BINARY64 +-export F_COMPILER +-export C_COMPILER +-export USE_OPENMP +-export CROSS +-export CROSS_SUFFIX +-export NOFORTRAN +-export C_LAPACK +-export NO_FBLAS +-export EXTRALIB +-export CEXTRALIB +-export FEXTRALIB +-export HAVE_SSE +-export HAVE_SSE2 +-export HAVE_SSE3 +-export HAVE_SSSE3 +-export HAVE_SSE4_1 +-export HAVE_SSE4_2 +-export HAVE_SSE4A +-export HAVE_SSE5 +-export HAVE_AVX +-export HAVE_AVX2 +-export HAVE_FMA3 +-export HAVE_VFP +-export HAVE_VFPV3 +-export HAVE_VFPV4 +-export HAVE_NEON +-ifndef NO_MSA +- export HAVE_MSA +- export MSA_FLAGS +-endif +-export KERNELDIR +-export FUNCTION_PROFILE +-export TARGET_CORE +-export NO_AVX512 +-export NO_AVX2 +-export BUILD_BFLOAT16 +-export NO_LSX +-export NO_LASX +- +-export SBGEMM_UNROLL_M +-export SBGEMM_UNROLL_N +-export SGEMM_UNROLL_M +-export SGEMM_UNROLL_N +-export DGEMM_UNROLL_M +-export DGEMM_UNROLL_N +-export QGEMM_UNROLL_M +-export QGEMM_UNROLL_N +-export CGEMM_UNROLL_M +-export CGEMM_UNROLL_N +-export ZGEMM_UNROLL_M +-export ZGEMM_UNROLL_N +-export XGEMM_UNROLL_M +-export XGEMM_UNROLL_N +-export CGEMM3M_UNROLL_M +-export CGEMM3M_UNROLL_N +-export ZGEMM3M_UNROLL_M +-export ZGEMM3M_UNROLL_N +-export XGEMM3M_UNROLL_M +-export XGEMM3M_UNROLL_N +- +- +-ifdef USE_CUDA +-export CUDADIR +-export CUCC +-export CUFLAGS +-export CULIB +-endif +- +-.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f +- +-.f.$(SUFFIX): +- $(FC) $(FFLAGS) -c $< -o $(@F) +- +-.f.$(PSUFFIX): +- $(FC) $(FPFLAGS) -pg -c $< -o $(@F) +- +- +-ifdef BINARY64 +-PATHSCALEPATH = /opt/pathscale/lib/3.1 +-PGIPATH = /opt/pgi/linux86-64/7.1-5/lib +-else +-PATHSCALEPATH = /opt/pathscale/lib/3.1/32 +-PGIPATH = /opt/pgi/linux86/7.1-5/lib +-endif +- +-ACMLPATH = /opt/acml/4.3.0 +-ifneq ($(OSNAME), Darwin) +-MKLPATH = /opt/intel/mkl/10.2.2.025/lib +-else +-MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib +-endif +-ATLASPATH = /opt/atlas/3.9.17/opteron +-FLAMEPATH = $(HOME)/flame/lib +-ifneq ($(OSNAME), SunOS) +-SUNPATH = /opt/sunstudio12.1 +-else +-SUNPATH = /opt/SUNWspro +-endif +diff --git a/Makefile.tail b/Makefile.tail +index 54ba649..f73a86d 100644 +--- a/Makefile.tail ++++ b/Makefile.tail +@@ -583,7 +583,7 @@ gen_insn_flash.c : + echo 'int i;' >> gen_insn_flash.c + echo '#ifdef __alpha' >> gen_insn_flash.c + echo 'printf(".set noat;.set noreorder;\n");' >> gen_insn_flash.c +- echo 'printf(".arch ev6;.text;.align 5\n");' >> gen_insn_flash.c ++ echo 'printf(".arch sw6;.text;.align 5\n");' >> gen_insn_flash.c + echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c + echo 'printf(".ent insn_flash\n");' >> gen_insn_flash.c + echo 'printf("insn_flash:\n");' >> gen_insn_flash.c +diff --git a/Makefile.tests b/Makefile.tests +deleted file mode 100644 +index b344abc..0000000 +--- a/Makefile.tests ++++ /dev/null +@@ -1,435 +0,0 @@ +-TOPDIR = . 
+-include ./Makefile.system +- +-BLASDIRS = interface driver/level2 driver/level3 driver/others +- +-ifneq ($(DYNAMIC_ARCH), 1) +-BLASDIRS += kernel +-endif +- +-ifdef SANITY_CHECK +-BLASDIRS += reference +-endif +- +-SUBDIRS = $(BLASDIRS) +-ifneq ($(NO_LAPACK), 1) +-SUBDIRS += lapack +-endif +- +-RELA = +-ifeq ($(BUILD_RELAPACK), 1) +-RELA = re_lapack +-endif +- +-ifeq ($(NO_FORTRAN), 1) +-define NOFORTRAN +-1 +-endef +-ifneq ($(NO_LAPACK), 1) +-define C_LAPACK +-1 +-endef +-endif +-export NOFORTRAN +-export NO_LAPACK +-export C_LAPACK +-endif +- +-ifeq ($(F_COMPILER),CRAY) +-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS)) +-else +-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) +-endif +- +-SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test +- +-.PHONY : all libs netlib $(RELA) test ctest shared install +-.NOTPARALLEL : shared +- +-all :: tests +- @echo +- @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" +- @echo +- @echo " OS ... $(OSNAME) " +- @echo " Architecture ... $(ARCH) " +-ifndef BINARY64 +- @echo " BINARY ... 32bit " +-else +- @echo " BINARY ... 64bit " +-endif +- +-ifdef INTERFACE64 +-ifneq ($(INTERFACE64), 0) +- @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " +-endif +-endif +- @$(CC) --version > /dev/null 2>&1;\ +- if [ $$? -eq 0 ]; then \ +- cverinfo=`$(CC) --version | sed -n '1p'`; \ +- if [ -z "$${cverinfo}" ]; then \ +- cverinfo=`$(CC) --version | sed -n '2p'`; \ +- fi; \ +- echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ +- else \ +- echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ +- fi +-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) +- @$(FC) --version > /dev/null 2>&1;\ +- if [ $$? -eq 0 ]; then \ +- fverinfo=`$(FC) --version | sed -n '1p'`; \ +- if [ -z "$${fverinfo}" ]; then \ +- fverinfo=`$(FC) --version | sed -n '2p'`; \ +- fi; \ +- echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ +- else \ +- echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ +- fi +-endif +-ifneq ($(OSNAME), AIX) +- @echo -n " Library Name ... $(LIBNAME)" +-else +- @echo " Library Name ... $(LIBNAME)" +-endif +- +-ifndef SMP +- @echo " (Single-threading) " +-else +- @echo " (Multi-threading; Max num-threads is $(NUM_THREADS))" +-endif +- +-ifeq ($(DYNAMIC_ARCH), 1) +- @echo " Supporting multiple $(ARCH) cpu models with minimum requirement for the common code being $(CORE)" +-endif +- +-ifeq ($(USE_OPENMP), 1) +- @echo +- @echo " Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, " +- @echo " you should use OMP_NUM_THREADS environment variable to control the number of threads." +- @echo +-endif +- +-ifeq ($(OSNAME), Darwin) +- @echo "WARNING: If you plan to use the dynamic library $(LIBDYNNAME), you must run:" +- @echo +- @echo "\"make PREFIX=/your_installation_path/ install\"." +- @echo +- @echo "(or set PREFIX in Makefile.rule and run make install." +- @echo +- @echo "Note that any flags passed to make during build should also be passed to make install" +- @echo "to circumvent any install errors." +- @echo +- @echo "If you want to move the .dylib to a new location later, make sure you change" +- @echo "the internal name of the dylib with:" +- @echo +- @echo "install_name_tool -id /new/absolute/path/to/$(LIBDYNNAME) $(LIBDYNNAME)" +-endif +- @echo +- @echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"." 
+- @echo +- @echo "Note that any flags passed to make during build should also be passed to make install" +- @echo "to circumvent any install errors." +- @echo +- +-shared : libs netlib $(RELA) +-ifneq ($(NO_SHARED), 1) +-ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) +- @$(MAKE) -C exports so +- @ln -fs $(LIBSONAME) $(LIBPREFIX).so +- @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) +-endif +-ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) +- @$(MAKE) -C exports so +- @ln -fs $(LIBSONAME) $(LIBPREFIX).so +-endif +-ifeq ($(OSNAME), Darwin) +- @$(MAKE) -C exports dyn +- @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib +- @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib +-endif +-ifeq ($(OSNAME), WINNT) +- @$(MAKE) -C exports dll +-endif +-ifeq ($(OSNAME), CYGWIN_NT) +- @$(MAKE) -C exports dll +-endif +-endif +- +-tests : shared +-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) +- touch $(LIBNAME) +-ifndef NO_FBLAS +- $(MAKE) -C test all +-endif +-endif +-ifneq ($(ONLY_CBLAS), 1) +- $(MAKE) -C utest all +-endif +-ifneq ($(NO_CBLAS), 1) +-ifneq ($(ONLY_CBLAS), 1) +- $(MAKE) -C ctest all +-endif +-ifeq ($(CPP_THREAD_SAFETY_TEST), 1) +- $(MAKE) -C cpp_thread_test all +-endif +-endif +- +-libs : +-ifeq ($(CORE), UNKNOWN) +- $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) +-endif +-ifeq ($(NOFORTRAN), 1) +- $(info OpenBLAS: Detecting fortran compiler failed. Can only compile BLAS and f2c-converted LAPACK.) +-endif +-ifeq ($(NO_STATIC), 1) +-ifeq ($(NO_SHARED), 1) +- $(error OpenBLAS: neither static nor shared are enabled.) +-endif +-endif +- @for d in $(SUBDIRS) ; \ +- do if test -d $$d; then \ +- $(MAKE) -C $$d $(@F) || exit 1 ; \ +- fi; \ +- done +-#Save the config files for installation +- @cp Makefile.conf Makefile.conf_last +- @cp config.h config_last.h +-ifdef QUAD_PRECISION +- @echo "#define QUAD_PRECISION">> config_last.h +-endif +-ifeq ($(EXPRECISION), 1) +- @echo "#define EXPRECISION">> config_last.h +-endif +-## +-ifeq ($(DYNAMIC_ARCH), 1) +- @$(MAKE) -C kernel commonlibs || exit 1 +- @for d in $(DYNAMIC_CORE) ; \ +- do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ +- done +- @echo DYNAMIC_ARCH=1 >> Makefile.conf_last +-ifeq ($(DYNAMIC_OLDER), 1) +- @echo DYNAMIC_OLDER=1 >> Makefile.conf_last +-endif +-endif +- @echo TARGET=$(CORE) >> Makefile.conf_last +-ifdef USE_THREAD +- @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last +-endif +-ifdef SMP +-ifdef NUM_THREADS +- @echo NUM_THREADS=$(NUM_THREADS) >> Makefile.conf_last +-else +- @echo NUM_THREADS=$(NUM_CORES) >> Makefile.conf_last +-endif +-endif +-ifeq ($(USE_OPENMP),1) +- @echo USE_OPENMP=1 >> Makefile.conf_last +-endif +-ifeq ($(INTERFACE64),1) +- @echo INTERFACE64=1 >> Makefile.conf_last +-endif +- @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last +- @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last +- @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) +- @touch lib.grd +- +-prof : prof_blas prof_lapack +- +-prof_blas : +- ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) +- for d in $(SUBDIRS) ; \ +- do if test -d $$d; then \ +- $(MAKE) -C $$d prof || exit 1 ; \ +- fi; \ +- done +-ifeq ($(DYNAMIC_ARCH), 1) +- $(MAKE) -C kernel commonprof || exit 1 +-endif +- +-blas : +- ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) +- for d in $(BLASDIRS) ; \ +- do if test -d $$d; then \ +- $(MAKE) -C $$d libs || exit 1 ; \ +- fi; \ +- done +- +-hpl : +- ln -fs $(LIBNAME) 
$(LIBPREFIX).$(LIBSUFFIX) +- for d in $(BLASDIRS) ../laswp exports ; \ +- do if test -d $$d; then \ +- $(MAKE) -C $$d $(@F) || exit 1 ; \ +- fi; \ +- done +-ifeq ($(DYNAMIC_ARCH), 1) +- $(MAKE) -C kernel commonlibs || exit 1 +- for d in $(DYNAMIC_CORE) ; \ +- do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ +- done +-endif +- +-hpl_p : +- ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) +- for d in $(SUBDIRS) ../laswp exports ; \ +- do if test -d $$d; then \ +- $(MAKE) -C $$d $(@F) || exit 1 ; \ +- fi; \ +- done +- +-netlib : lapack_prebuild +-ifneq ($(NO_LAPACK), 1) +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib +-endif +-ifneq ($(NO_LAPACKE), 1) +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib +-endif +- +-ifeq ($(NO_LAPACK), 1) +-re_lapack : +- +-else +-re_lapack : +- @$(MAKE) -C relapack +-endif +- +-prof_lapack : lapack_prebuild +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof +- +-lapack_prebuild : +-ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK))) +- -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc +-ifeq ($(F_COMPILER), GFORTRAN) +- -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc +-else +- -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +- -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc +-ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) +- -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc +-else +- -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +- -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +-ifeq ($(F_COMPILER), GFORTRAN) +- -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc +-ifdef SMP +-ifeq ($(OSNAME), WINNT) +- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +-else ifeq ($(OSNAME), Haiku) +- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +-else +- -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +-else +- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +-else +- -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +-ifeq ($(BUILD_LAPACK_DEPRECATED), 1) +- -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +-ifeq ($(BUILD_SINGLE), 1) +- -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +-ifeq 
($(BUILD_DOUBLE), 1) +- -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +-ifeq ($(BUILD_COMPLEX), 1) +- -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +-ifeq ($(BUILD_COMPLEX16), 1) +- -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +- -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +- +-large.tgz : +-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) +- if [ ! -a $< ]; then +- -wget http://www.netlib.org/lapack/timing/large.tgz; +- fi +-endif +- +-timing.tgz : +-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) +- if [ ! -a $< ]; then +- -wget http://www.netlib.org/lapack/timing/timing.tgz; +- fi +-endif +- +-lapack-timing : large.tgz timing.tgz +-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) +- (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) +- (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) +- $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING +-endif +- +- +-lapack-test : +- (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) +- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz +- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc +-ifneq ($(CROSS), 1) +- ( cd $(NETLIB_LAPACK_DIR)/INSTALL; $(MAKE) all; ./testlsame; ./testslamch; ./testdlamch; \ +- ./testsecond; ./testdsecnd; ./testieee; ./testversion ) +- (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) +-endif +- +-lapack-runtest: lapack-test +- ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ +- ./testsecond; ./testdsecnd; ./testieee; ./testversion ) +- (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING ) +- +- +-blas-test: +- (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) +- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing +- (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) +- +- +-dummy : +- +-install : +- $(MAKE) -f Makefile.install install +- +-clean :: +- @for d in $(SUBDIRS_ALL) ; \ +- do if test -d $$d; then \ +- $(MAKE) -C $$d $(@F) || exit 1 ; \ +- fi; \ +- done +-#ifdef DYNAMIC_ARCH +- @$(MAKE) -C kernel clean +-#endif +- @$(MAKE) -C reference clean +- @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 +-ifeq ($(OSNAME), Darwin) +- @rm -rf getarch.dSYM getarch_2nd.dSYM +-endif +- @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib +- @rm -f cblas.tmp cblas.tmp2 +- @touch $(NETLIB_LAPACK_DIR)/make.inc +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean +- @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h +- @$(MAKE) -C relapack clean +- @rm -f *.grd Makefile.conf_last config_last.h +- @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) +- @echo Done. 
+diff --git a/c_check b/c_check +index b018c10..13a7086 100755 +--- a/c_check ++++ b/c_check +@@ -84,6 +84,7 @@ case "$data" in + *ARCH_MIPS64*) architecture=mips64 ;; + *ARCH_MIPS*) architecture=mips ;; + *ARCH_ALPHA*) architecture=alpha ;; ++ *ARCH_SW_64*) architecture=sw_64 ;; + *ARCH_SPARC*) architecture=sparc ;; + *ARCH_IA64*) architecture=ia64 ;; + *ARCH_ARM64*) architecture=arm64 ;; +@@ -124,7 +125,7 @@ case "$architecture" in + defined=1 + ;; + arm|arm64) defined=1 ;; +- zarch|e2k|alpha|ia64|riscv64|loonarch64) ++ zarch|e2k|alpha|ia64|riscv64|loonarch64|sw_64) + defined=1 + BINARY=64 + ;; +@@ -232,6 +233,7 @@ case "$data" in + *ARCH_MIPS64*) architecture=mips64 ;; + *ARCH_MIPS*) architecture=mips ;; + *ARCH_ALPHA*) architecture=alpha ;; ++ *ARCH_SW_64*) architecture=sw_64 ;; + *ARCH_SPARC*) architecture=sparc ;; + *ARCH_IA64*) architecture=ia64 ;; + *ARCH_ARM64*) architecture=arm64 ;; +diff --git a/common.h b/common.h +index 4074df0..309c3f9 100644 +--- a/common.h ++++ b/common.h +@@ -420,6 +420,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 + #include "common_alpha.h" + #endif + ++#ifdef ARCH_SW_64 ++#include "common_sw_64.h" ++#endif ++ + #if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) + #if __has_include() + #include +diff --git a/common_sw_64.h b/common_sw_64.h +new file mode 100644 +index 0000000..e14268e +--- /dev/null ++++ b/common_sw_64.h +@@ -0,0 +1,200 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#ifndef COMMON_SW_64 ++#define COMMON_SW_64 ++ ++#ifndef ASSEMBLER ++ ++#define MB asm("memb") ++#define WMB asm("memb") ++#define RMB asm("memb") ++ ++static void __inline blas_lock(unsigned long *address){ ++#ifndef __DECC ++ unsigned long tmp1, tmp2,tmp3; ++ asm volatile( ++ "1: ldl %1, %0\n" ++ " bne %1, 2f\n" ++ " ldi %3, %0 \n" ++ " lldl %1, 0(%3)\n" ++ " ldi %2, 1 \n" ++ " wr_f %2 \n" ++ " or %1, 1, %2\n" ++ " memb\n " ++ " lstl %2, 0(%3)\n" ++ " rd_f %2\n" ++ " bne %1, 2f\n" ++ " beq %2, 2f\n" ++ " memb\n " ++ " br $31, 3f\n" ++ "2: br $31, 1b\n" ++ "3:\n" : "=m"(*address), "=&r"(tmp1), "=&r"(tmp2),"=&r"(tmp3) : : "memory"); ++#else ++ asm ( ++ "10:" ++ " ldl %t0, 0(%a0); " ++ " bne %t0, 20f; " ++ " ldi %t2, %a0" ++ " lldl %t0, 0(%t2); " ++ " ldi %t1, 1" ++ " wr_f %t1" ++ " or %t0, 1, %t1;" ++ " memb; " ++ " lstl %t1, 0(%t2); " ++ " rd_f %t1" ++ " bne %t0, 20f; " ++ " beq %t1, 20f; " ++ " memb; " ++ " br %r31,30f; " ++ "20: " ++ " br %r31,10b; " ++ "30:", address); ++#endif ++} ++#define BLAS_LOCK_DEFINED ++ ++static __inline unsigned int rpcc(void){ ++ ++ unsigned int r0; ++ ++#ifndef __DECC ++ asm __volatile__("rtc %0" : "=r"(r0) : : "memory"); ++#else ++ r0 = asm("rtc %v0"); ++#endif ++ ++ return r0; ++} ++#define RPCC_DEFINED ++ ++ ++#define HALT ldl $0, 0($0) ++ ++#ifndef __DECC ++#define GET_IMAGE(res) asm __volatile__("fmov $f1, %0" : "=f"(res) : : "memory") ++#else ++#define GET_IMAGE(res) res = dasm("fmov $f1, %f0") ++#endif ++ ++#ifdef SMP ++#ifdef USE64BITINT ++static __inline long blas_quickdivide(long x, long y){ ++ return x/y; ++} ++#else ++extern unsigned int blas_quick_divide_table[]; ++ ++static __inline int blas_quickdivide(unsigned int x, unsigned int y){ ++ if (y <= 1) return x; ++ return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32); ++} ++#endif ++#endif ++ ++#define BASE_ADDRESS ((0x1b0UL << 33) | (0x1c0UL << 23) | (0x000UL << 13)) ++ ++#ifndef PAGESIZE ++#define PAGESIZE ( 8UL << 10) ++#define HUGE_PAGESIZE ( 4 << 20) ++#endif ++#define BUFFER_SIZE (32UL << 20) ++ ++#else ++ ++#ifndef F_INTERFACE ++#define REALNAME ASMNAME ++#else ++#define REALNAME ASMFNAME ++#endif ++ ++#define PROLOGUE \ ++ .arch sw6; \ ++ .set noat; \ ++ .set noreorder; \ ++.text; \ ++ .align 5; \ ++ .globl REALNAME; \ ++ .ent REALNAME; \ ++REALNAME: ++ ++#ifdef PROFILE ++#define PROFCODE \ ++ ldgp $gp, 0($27); \ ++ ldi $28, _mcount; \ ++ jsr $28, ($28), _mcount; \ ++ .prologue 1 ++#else ++#define PROFCODE .prologue 0 ++#endif ++ ++#if defined(__linux__) && defined(__ELF__) ++#define GNUSTACK .section .note.GNU-stack,"",@progbits ++#else ++#define GNUSTACK ++#endif ++ ++#define EPILOGUE \ ++ .end REALNAME; \ ++ .ident VERSION; \ ++ GNUSTACK ++ ++#endif ++ ++#ifdef DOUBLE ++#define SXADDQ s8addl ++#define SXSUBL s8subl ++#define LD fldd ++#define ST fstd ++#define STQ stq ++#define ADD faddd ++#define SUB fsubd ++#define MUL fmuld ++#define DIV fdivd ++#else ++#define SXADDQ s4addl ++#define SXSUBL s4subl ++#define LD flds ++#define ST fsts ++#define STQ stl ++#define ADD fadds ++#define SUB fsubs ++#define MUL fmuls ++#define DIV fdivs ++#endif ++#endif +diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile +index be8313e..1ab9bb8 100644 +--- a/cpp_thread_test/Makefile ++++ b/cpp_thread_test/Makefile +@@ -1,14 +1,13 @@ +-TOPDIR = .. 
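Note on the blas_quickdivide helper added in common_sw_64.h above: it avoids a hardware integer divide by multiplying with a precomputed 32-bit reciprocal and shifting. A minimal stand-alone C sketch of the same idea, assuming a hypothetical local table that holds ceil(2^32 / y) (the real blas_quick_divide_table is generated elsewhere in OpenBLAS):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical reciprocal table for this sketch: entry y holds ceil(2^32 / y). */
static uint32_t recip[17];

static uint32_t quick_divide(uint32_t x, uint32_t y) {
    if (y <= 1) return x;                               /* same guard as the header */
    return (uint32_t)(((uint64_t)x * recip[y]) >> 32);  /* x * (2^32/y) >> 32 == x/y for the small x used here */
}

int main(void) {
    for (uint32_t y = 2; y <= 16; y++)
        recip[y] = (uint32_t)(((1ULL << 32) + y - 1) / y);   /* ceil(2^32 / y) */
    for (uint32_t x = 0; x < 1000; x++)
        for (uint32_t y = 2; y <= 16; y++)
            if (quick_divide(x, y) != x / y)
                printf("mismatch at %u/%u\n", x, y);
    puts("done");
    return 0;
}

The point of the table is that, per CPU count y, the divide in the threading code becomes one multiply and one shift.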
+-include $(TOPDIR)/Makefile.system ++include ../Makefile.rule + + all :: dgemv_tester dgemm_tester + + dgemv_tester : +- $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester ++ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester + ./dgemv_tester + + dgemm_tester : dgemv_tester +- $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester ++ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester + ./dgemm_tester + + clean :: +diff --git a/cpuid_sw_64.c b/cpuid_sw_64.c +new file mode 100644 +index 0000000..61ed28a +--- /dev/null ++++ b/cpuid_sw_64.c +@@ -0,0 +1,105 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#if defined(__sw_64__) && defined(__DECC) ++#include ++#endif ++ ++int implver(void){ ++ int arch; ++ ++#ifndef __DECC ++ asm __volatile__("implver %0" : "=r"(arch) : : "memory"); ++#else ++ arch = asm("implver %v0"); ++#endif ++ return arch; ++} ++ ++void get_architecture(void){ ++ printf("SW_64"); ++} ++ ++void get_subarchitecture(void){ ++ printf("sw%d", implver() + 4); ++} ++ ++void get_subdirname(void){ ++ printf("sw_64"); ++} ++ ++char *get_corename(void){ ++ return "sw_64"; ++} ++ ++void get_cpuconfig(void){ ++ printf("#define SW%d\n", implver() + 4); ++ ++ switch (implver()){ ++ case 0: ++ printf("#define L1_DATA_SIZE 16384\n"); ++ printf("#define L1_DATA_LINESIZE 32\n"); ++ printf("#define L2_SIZE 2097152\n"); ++ printf("#define L2_LINESIZE 32\n"); ++ printf("#define DTB_DEFAULT_ENTRIES 32\n"); ++ printf("#define DTB_SIZE 8192\n"); ++ break; ++ ++ case 1: ++ printf("#define L1_DATA_SIZE 16384\n"); ++ printf("#define L1_DATA_LINESIZE 32\n"); ++ printf("#define L2_SIZE 2097152\n"); ++ printf("#define L2_LINESIZE 64\n"); ++ printf("#define DTB_DEFAULT_ENTRIES 64\n"); ++ printf("#define DTB_SIZE 8192\n"); ++ break; ++ ++ case 2: ++ printf("#define L1_DATA_SIZE 32768\n"); ++ printf("#define L1_DATA_LINESIZE 64\n"); ++ printf("#define L2_SIZE 4194304\n"); ++ printf("#define L2_LINESIZE 64\n"); ++ printf("#define DTB_DEFAULT_ENTRIES 64\n"); ++ printf("#define DTB_SIZE 8192\n"); ++ break; ++ } ++} ++ ++void get_libname(void){ ++ printf("sw%d\n", implver() + 4); ++} +diff --git a/ctest.c b/ctest.c +index 2ccae8d..6b21d3a 100644 +--- a/ctest.c ++++ b/ctest.c +@@ -137,6 +137,10 @@ ARCH_MIPS + ARCH_ALPHA + #endif + ++#ifdef __sw_64__ ++ARCH_SW_64 ++#endif ++ + #if defined(__sparc) || defined(__sparc__) + ARCH_SPARC + #endif +diff --git a/getarch.c b/getarch.c +index 87384c0..306c389 100644 +--- a/getarch.c ++++ b/getarch.c +@@ -1766,6 +1766,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #define OPENBLAS_SUPPORTED + #endif + ++#ifdef __sw_64__ ++#include "cpuid_sw_64.c" ++#define OPENBLAS_SUPPORTED ++#endif + + #ifndef OPENBLAS_SUPPORTED + #error "This arch/CPU is not supported by OpenBLAS." 
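The c_check and ctest.c hunks above rely on a preprocessor probe: ctest.c is run through the compiler with -E, and whichever bare ARCH_* token survives tells the c_check script which architecture macro the compiler predefines. A reduced sketch of the probe for the sw_64 case (illustrative only; the real probe is ctest.c, parsed by c_check's case statement):

/* probe.c -- run with: cc -E probe.c
 * When the compiler predefines __sw_64__, the bare token ARCH_SW_64 survives
 * preprocessing, and c_check's new pattern  *ARCH_SW_64*)  matches it and
 * sets architecture=sw_64 with BINARY=64. */
#ifdef __sw_64__
ARCH_SW_64
#endif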
+@@ -1831,7 +1835,7 @@ int main(int argc, char *argv[]){ + #ifdef FORCE + printf("CORE=%s\n", CORENAME); + #else +-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) ++#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__sw_64__) + printf("CORE=%s\n", get_corename()); + #endif + #endif +@@ -1979,7 +1983,7 @@ printf("ELF_VERSION=2\n"); + #ifdef FORCE + printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); + #else +-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) ++#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__sw_64__) + printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); + #endif + #endif +diff --git a/interface/gbmv.c b/interface/gbmv.c +index 1d58ba8..18aa50e 100644 +--- a/interface/gbmv.c ++++ b/interface/gbmv.c +@@ -236,7 +236,12 @@ void CNAME(enum CBLAS_ORDER order, + + #ifdef SMP + } else { +- ++//ZYX20220118 ++#ifndef TRANSA ++ memset(buffer, 0, nthreads*m*sizeof(FLOAT)); ++#else ++ memset(buffer, 0, nthreads*n*sizeof(FLOAT)); ++#endif + (gbmv_thread[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer, nthreads); + + } +diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 +index 0933736..111924b 100644 +--- a/kernel/Makefile.L1 ++++ b/kernel/Makefile.L1 +@@ -398,12 +398,16 @@ ifndef DSWAPKERNEL + DSWAPKERNEL = swap.S + endif + ++#ZYX20220301 + ifndef CSWAPKERNEL +-CSWAPKERNEL = zswap.S ++CSWAPKERNEL = zswap.c ++#CSWAPKERNEL = zswap.S + endif + ++#ZYX20220301 + ifndef ZSWAPKERNEL +-ZSWAPKERNEL = zswap.S ++ZSWAPKERNEL = zswap.c ++#ZSWAPKERNEL = zswap.S + endif + + ifndef QSWAPKERNEL +diff --git a/kernel/sw_64/KERNEL b/kernel/sw_64/KERNEL +new file mode 100644 +index 0000000..d10504b +--- /dev/null ++++ b/kernel/sw_64/KERNEL +@@ -0,0 +1,176 @@ ++ifndef SAMINKERNEL ++SAMINKERNEL = amax.S ++endif ++ ++ifndef DAMINKERNEL ++DAMINKERNEL = amax.S ++endif ++ ++ifndef CAMINKERNEL ++CAMINKERNEL = zamax.S ++endif ++ ++ifndef ZAMINKERNEL ++ZAMINKERNEL = zamax.S ++endif ++ ++ifndef SMINKERNEL ++SMINKERNEL = max.S ++endif ++ ++ifndef DMINKERNEL ++DMINKERNEL = max.S ++endif ++ ++ifndef ISAMINKERNEL ++ISAMINKERNEL = iamax.S ++endif ++ ++ifndef IDAMINKERNEL ++IDAMINKERNEL = iamax.S ++endif ++ ++ifndef ICAMINKERNEL ++ICAMINKERNEL = izamax.S ++endif ++ ++ifndef IZAMINKERNEL ++IZAMINKERNEL = izamax.S ++endif ++ ++#ZYX20220301 ++ifndef LSAME_KERNEL ++LSAME_KERNEL = ../generic/lsame.c ++endif ++ ++#ZYX20220120 ++ifndef ISMINKERNEL ++ISMINKERNEL = amax.S ++#ISMINKERNEL = imin.c ++endif ++ ++#ZYX20220120 ++#ifndef ISMAXKERNEL ++#ISMAXKERNEL = imax.c ++#endif ++ ++ifndef IDMINKERNEL ++IDMINKERNEL = amax.S ++endif ++ ++ifndef CCOPYKERNEL ++CCOPYKERNEL = copy.S ++endif ++ ++ifndef ZCOPYKERNEL ++ZCOPYKERNEL = copy.S ++endif ++ ++ifndef SNRM2KERNEL ++SNRM2KERNEL = snrm2.S ++endif ++ ++ifndef DNRM2KERNEL ++DNRM2KERNEL = dnrm2.S ++endif ++ ++ifndef CNRM2KERNEL ++CNRM2KERNEL = cnrm2.S ++endif ++ ++ifndef ZNRM2KERNEL ++ZNRM2KERNEL = znrm2.S ++endif ++ ++ifndef SGEMMKERNEL ++SGEMMKERNEL = 
gemm_kernel_4x4.S ++SGEMM_BETA = gemm_beta.S ++SGEMMONCOPY = ../generic/gemm_ncopy_4.c ++SGEMMOTCOPY = ../generic/gemm_tcopy_4.c ++SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) ++SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) ++endif ++ ++ifndef DGEMMKERNEL ++DGEMMKERNEL = gemm_kernel_4x4.S ++DGEMM_BETA = gemm_beta.S ++DGEMMONCOPY = ../generic/gemm_ncopy_4.c ++DGEMMOTCOPY = ../generic/gemm_tcopy_4.c ++DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) ++DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) ++endif ++ ++ifndef CGEMMKERNEL ++CGEMMKERNEL = zgemm_kernel_2x2.S ++CGEMM_BETA = zgemm_beta.S ++CGEMMONCOPY = ../generic/zgemm_ncopy_2.c ++CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ++CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) ++CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) ++endif ++ ++ifndef ZGEMMKERNEL ++ZGEMMKERNEL = zgemm_kernel_2x2.S ++ZGEMM_BETA = zgemm_beta.S ++ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ++ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ++ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) ++ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) ++endif ++ ++SGEMM_BETA = gemm_beta.S ++DGEMM_BETA = gemm_beta.S ++CGEMM_BETA = zgemm_beta.S ++ZGEMM_BETA = zgemm_beta.S ++ ++ifndef STRSMKERNEL_LN ++STRSMKERNEL_LN = trsm_kernel_4x4_LN.S ++endif ++ifndef STRSMKERNEL_LT ++STRSMKERNEL_LT = trsm_kernel_4x4_LT.S ++endif ++ifndef STRSMKERNEL_RN ++STRSMKERNEL_RN = trsm_kernel_4x4_LT.S ++endif ++ifndef STRSMKERNEL_RT ++STRSMKERNEL_RT = trsm_kernel_4x4_RT.S ++endif ++ ++ifndef DTRSMKERNEL_LN ++DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S ++endif ++ifndef DTRSMKERNEL_LT ++DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S ++endif ++ifndef DTRSMKERNEL_RN ++DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S ++endif ++ifndef DTRSMKERNEL_RT ++DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S ++endif ++ ++ifndef CTRSMKERNEL_LN ++CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S ++endif ++ifndef CTRSMKERNEL_LT ++CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S ++endif ++ifndef CTRSMKERNEL_RN ++CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S ++endif ++ifndef CTRSMKERNEL_RT ++CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S ++endif ++ ++ifndef ZTRSMKERNEL_LN ++ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S ++endif ++ifndef ZTRSMKERNEL_LT ++ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S ++endif ++ifndef ZTRSMKERNEL_RN ++ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S ++endif ++ifndef ZTRSMKERNEL_RT ++ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S ++endif +diff --git a/kernel/sw_64/Makefile b/kernel/sw_64/Makefile +new file mode 100644 +index 0000000..efae70d +--- /dev/null ++++ b/kernel/sw_64/Makefile +@@ -0,0 +1,2 @@ ++clean :: ++ +diff --git a/kernel/sw_64/amax.S b/kernel/sw_64/amax.S +new file mode 100644 +index 0000000..300a2f7 +--- /dev/null ++++ b/kernel/sw_64/amax.S +@@ -0,0 +1,283 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
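On the interface/gbmv.c hunk above (tagged ZYX20220118): it zero-fills the per-thread workspace before gbmv_thread scatters partial results into it, so slices a thread never touches do not fold garbage into y during the reduction. A hedged C sketch of that pattern with hypothetical names (nthreads, len, partial); it is not the actual OpenBLAS driver:

#include <stdlib.h>
#include <string.h>

/* The final reduction adds every slice, touched or not, so each slice must
 * start at zero. */
static void reduce_partials(double *y, const double *partial,
                            int nthreads, int len) {
    for (int t = 0; t < nthreads; t++)
        for (int i = 0; i < len; i++)
            y[i] += partial[(size_t)t * len + i];
}

int main(void) {
    int nthreads = 4, len = 8;
    double y[8] = {0};
    double *partial = malloc((size_t)nthreads * len * sizeof(double));
    if (!partial) return 1;

    memset(partial, 0, (size_t)nthreads * len * sizeof(double));  /* the fix */
    /* ... worker threads would add their contributions into partial here ... */
    reduce_partials(y, partial, nthreads, len);

    free(partial);
    return 0;
}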
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 6 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ nop ++ .align 4 ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $0 ++ unop ++ ++ fstd $f6, 32($sp) ++ fclr $f0 ++ sra N, 3, $1 ++ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ unop ++ fabs $f20, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ fabs $f20, $f1 ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f21, 0 * SIZE(X) ++ fabs $f20, $f2 ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fabs $f20, $f3 ++ addl X, INCX, X ++ unop ++ ++ LD $f23, 0 * SIZE(X) ++ fabs $f20, $f4 ++ addl X, INCX, X ++ unop ++ ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ fabs $f20, $f5 ++ unop ++ ++ LD $f25, 0 * SIZE(X) ++ fabs $f20, $f6 ++ addl X, INCX, X ++ unop ++ ++ LD $f26, 0 * SIZE(X) ++ fabs $f20, $f28 ++ addl X, INCX, X ++ ldi $1, -1($1) ++ ++ LD $f27, 0 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ fselne $f16, $f12, $f4, $f4 ++ unop ++ fabs $f20, $f29 ++ fillcs 56 * SIZE(X) ++ ++ fselne $f17, $f13, $f5, $f5 ++ LD $f20, 0 * SIZE(X) ++ fabs $f21, $f30 ++ addl X, INCX, X ++ ++ fselne $f18, $f14, $f6, $f6 ++ LD $f21, 0 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ fselne $f19, $f15, $f28, $f28 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ addl X, INCX, X ++ ++ fabs $f24, $f12 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f0, $f29), $f16 ++ addl X, INCX, X ++ ++ fabs $f25, $f13 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f1, $f30), $f17 ++ addl X, INCX, X ++ ++ fabs $f26, $f14 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f2, $f10), $f18 ++ addl X, INCX, X ++ ++ fabs $f27, $f15 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f3, $f11), $f19 ++ addl X, INCX, X ++ ++ fselne $f16, $f29, $f0, $f0 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f4, $f12), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f30, $f1, $f1 ++ unop ++ CMPLT($f5, $f13), 
$f17 ++ ldi $1, -1($1) # i -- ++ ++ fselne $f18, $f10, $f2, $f2 ++ unop ++ CMPLT($f6, $f14), $f18 ++ unop ++ ++ fselne $f19, $f11, $f3, $f3 ++ unop ++ CMPLT($f28, $f15), $f19 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ fselne $f16, $f12, $f4, $f4 ++ fabs $f20, $f29 ++ fselne $f17, $f13, $f5, $f5 ++ fabs $f21, $f30 ++ ++ fselne $f18, $f14, $f6, $f6 ++ fabs $f22, $f10 ++ fselne $f19, $f15, $f28, $f28 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ CMPLT($f0, $f29), $f16 ++ fabs $f25, $f13 ++ CMPLT($f1, $f30), $f17 ++ ++ fabs $f26, $f14 ++ CMPLT($f2, $f10), $f18 ++ fabs $f27, $f15 ++ CMPLT($f3, $f11), $f19 ++ ++ fselne $f16, $f29, $f0, $f0 ++ CMPLT($f4, $f12), $f16 ++ fselne $f17, $f30, $f1, $f1 ++ CMPLT($f5, $f13), $f17 ++ ++ fselne $f18, $f10, $f2, $f2 ++ CMPLT($f6, $f14), $f18 ++ fselne $f19, $f11, $f3, $f3 ++ CMPLT($f28, $f15), $f19 ++ ++ fselne $f16, $f12, $f4, $f4 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f13, $f5, $f5 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f18, $f14, $f6, $f6 ++ CMPLT($f4, $f5), $f18 ++ fselne $f19, $f15, $f28, $f28 ++ CMPLT($f6, $f28), $f19 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ fselne $f18, $f5, $f4, $f4 ++ fselne $f19, $f28, $f6, $f6 ++ ++ CMPLT($f0, $f2), $f16 ++ CMPLT($f4, $f6), $f17 ++ ++ fselne $f16, $f2, $f0, $f0 ++ fselne $f17, $f6, $f4, $f0 ++ ++ CMPLT($f0, $f4), $f16 ++ fselne $f16, $f4, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $End ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/asum.S b/kernel/sw_64/asum.S +new file mode 100644 +index 0000000..54e7fcb +--- /dev/null ++++ b/kernel/sw_64/asum.S +@@ -0,0 +1,230 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
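The amax.S kernel above keeps eight running candidates, updates them with CMPLT/fselne pairs, folds them pairwise, and finishes the N mod 8 elements in a scalar tail loop; the min kernels reuse the same file with USE_MIN flipping the CMPLT operands. A plain C sketch of that reduction shape (not the kernel itself):

#include <math.h>
#include <stddef.h>

/* Eight running candidates as in the unrolled loop, a fold, then the N%8 tail.
 * The USE_MIN build of the kernel simply reverses the comparison. */
static double amax_sketch(size_t n, const double *x, size_t incx) {
    if (n == 0 || incx == 0) return 0.0;
    double m[8];
    for (int k = 0; k < 8; k++) m[k] = fabs(x[0]);   /* seeded like $f0..$f6, $f28 */
    size_t i = 0;
    for (; i + 8 <= n; i += 8)
        for (int k = 0; k < 8; k++) {
            double v = fabs(x[(i + k) * incx]);
            if (m[k] < v) m[k] = v;
        }
    for (int k = 1; k < 8; k++)                      /* pairwise fold in the kernel */
        if (m[0] < m[k]) m[0] = m[k];
    for (; i < n; i++) {                             /* $L16 scalar tail */
        double v = fabs(x[i * incx]);
        if (m[0] < v) m[0] = v;
    }
    return m[0];
}

int main(void) {
    double v[11] = {1, -9, 3, 4, -5, 6, 7, -8, 2, 0.5, -9.5};
    return amax_sketch(11, v, 1) == 9.5 ? 0 : 1;
}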
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ ble N, $L999 ++ ++ sra N, 3, I ++ fclr s1 ++ fclr s2 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t1 ++ SXADDQ INCX, X, X ++ fclr t2 ++ ++ LD a1, 0 * SIZE(X) ++ fclr t3 ++ SXADDQ INCX, X, X ++ fclr s3 ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ ldw $31, PREFETCHSIZE * 2 * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ LD a6, 0 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2,$f24 ++ fmov $f24,s2 ++ LD a7, 0 * SIZE(X) ++ fabs a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3,$f24 ++ fmov $f24,s3 ++ LD a0, 0 * SIZE(X) ++ fabs a3, t3 ++ SXADDQ INCX, X, X ++ ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ LD a1, 0 * SIZE(X) ++ fabs a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ LD a2, 0 * SIZE(X) ++ fabs a5, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2,$f24 ++ fmov $f24,s2 ++ LD a3, 0 * SIZE(X) ++ fabs a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ LD a4, 0 * SIZE(X) ++ fabs a7, t3 ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0,$f24 ++ fmov $f24,s0 ++ LD a6, 0 * SIZE(X) ++ fabs a0, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1,$f24 ++ fmov $f24,s1 ++ LD a7, 0 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ fabs a2, t2 ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ fabs a3, t3 ++ ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ fabs a4, t0 ++ ADD s1, t1,$f24 ++ fmov $f24,s1 ++ fabs a5, t1 ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ fabs a6, t2 ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ fabs a7, t3 ++ ++ ADD s1, t1,$f24 ++ fmov $f24,s1 ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ ++ ADD s0, s1, $f24 ++ fmov $f24,s0 ++ ADD s2, s3, $f24 ++ fmov $f24,s2 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ADD s0, s2,$f24 ++ fmov $f24,s0 ++ unop ++ ble 
I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, a0 ++ fmov a0,s0 ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ fabs a0, t0 ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0,$f24 ++ fmov $f24,s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/asum.S.bak b/kernel/sw_64/asum.S.bak +new file mode 100644 +index 0000000..faf7827 +--- /dev/null ++++ b/kernel/sw_64/asum.S.bak +@@ -0,0 +1,206 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
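The asum.S kernel above accumulates |x[i]| into four partial sums (s0..s3) with an 8-way unrolled, software-pipelined loop, folds them, and finishes the N mod 8 elements one at a time; the staging through $f24 plus fmov is specific to this assembly. A plain C sketch of the accumulation shape only:

#include <math.h>
#include <stddef.h>

static double asum_sketch(size_t n, const double *x, size_t incx) {
    double s[4] = {0, 0, 0, 0};          /* s0..s3 in the kernel */
    size_t i = 0;
    for (; i + 8 <= n; i += 8)           /* 8 loads feed 4 partial sums */
        for (int k = 0; k < 8; k++)
            s[k % 4] += fabs(x[(i + k) * incx]);
    for (; i < n; i++)                   /* N % 8 tail ($L17) */
        s[0] += fabs(x[i * incx]);
    return (s[0] + s[1]) + (s[2] + s[3]);
}

int main(void) {
    double v[9] = {1, -2, 3, -4, 5, -6, 7, -8, 9};
    return asum_sketch(9, v, 1) == 45.0 ? 0 : 1;
}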
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ ble N, $L999 ++ ++ sra N, 3, I ++ fclr s1 ++ fclr s2 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t1 ++ SXADDQ INCX, X, X ++ fclr t2 ++ ++ LD a1, 0 * SIZE(X) ++ fclr t3 ++ SXADDQ INCX, X, X ++ fclr s3 ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ fillcs PREFETCHSIZE * 2 * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ LD a7, 0 * SIZE(X) ++ fabs a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fabs a3, t3 ++ SXADDQ INCX, X, X ++ ++ ADD s0, t0, s0 ++ LD a1, 0 * SIZE(X) ++ fabs a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fabs a5, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ LD a3, 0 * SIZE(X) ++ fabs a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fabs a7, t3 ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fabs a0, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a7, 0 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fabs a2, t2 ++ ADD s3, t3, s3 ++ fabs a3, t3 ++ ++ ADD s0, t0, s0 ++ fabs a4, t0 ++ ADD s1, t1, s1 ++ fabs a5, t1 ++ ADD s2, t2, s2 ++ fabs a6, t2 ++ ADD s3, t3, s3 ++ fabs a7, t3 ++ ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ ADD s0, s1, s0 ++ ADD s2, s3, s2 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ADD s0, s2, s0 ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ fabs a0, t0 ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/asum_simd.S b/kernel/sw_64/asum_simd.S +new file mode 100644 +index 0000000..f9152ec +--- /dev/null ++++ b/kernel/sw_64/asum_simd.S +@@ -0,0 +1,342 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ ble N, $L999 ++ ++ cmpeq INCX, 1, $3 ++ beq $3, $Sub ++ .align 4 ++ ++/* ++ Unloop 16 ++*/ ++ ++/** ++ test the address of X ++**/ ++ and X, (VEC_LEN*SIZE-1), $4 ++ nop ++ nop ++ beq $4, $Align ++ ++/** ++ process the unalign address of X ++**/ ++ ++/*if N is too small(less then unroll size), don't need process unalign X. 
Just jump to remain section.*/ ++ sra N, 4, I ++ fclr s1 ++ fclr s2 ++ ble I, $Remain ++ ++ sra $4, BASE_SHIFT, $4 ++ ldi $3, VEC_LEN ++ subl $3, $4, $4 ++ nop ++ ++$UnAlign_X_Loop: ++ LD a0, 0 * SIZE(X) ++ addl X, SIZE, X ++ fabs a0, t0 ++ subl $4, 1, $4 ++ ++ ADD s0, t0, s0 ++ subl N, 1, N ++ nop ++ bgt $4, $UnAlign_X_Loop ++ ++$Align: ++ sra N, 4, I ++ fclr s1 ++ fclr s2 ++ ble I, $Remain ++ ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t0 ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t1 ++ ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t2 ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t3 ++ ++ subl I, 1, I ++ addl X, 16*SIZE, X ++ unop ++ ble I, $MainLoopEnd ++ ++$MainLoop: ++ ++ vcpys $f31, a0, a4 ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ vcpys $f31, a1, a5 ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ ++ vcpys $f31, a2, a6 ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ vcpys $f31, a3, a7 ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ ++ VADD t0, a4, t0 ++ subl I, 1, I ++ VADD t1, a5, t1 ++ fillcs PREFETCHSIZE * SIZE(X) ++ ++ VADD t2, a6, t2 ++ addl X, 16*SIZE, X ++ VADD t3, a7, t3 ++ bgt I, $MainLoop ++ ++$MainLoopEnd: ++ /*fabs*/ ++ ++ vcpys $f31, a0, a4 ++ vcpys $f31, a1, a5 ++ vcpys $f31, a2, a6 ++ vcpys $f31, a3, a7 ++ ++ VADD t0, a4, t0 ++ VADD t1, a5, t1 ++ VADD t2, a6, t2 ++ VADD t3, a7, t3 ++ ++ VADD t0, t1, t0 ++ VADD t2, t3, t2 ++ VADD t0, t2, t0 ++ nop ++ ++ vextf t0, 1, s1 ++ vextf t0, 2, s2 ++ vextf t0, 3, s3 ++ nop ++ ++ /*sum*/ ++ ADD t0, s1, t0 ++ ADD s2, s3, s2 ++ ADD s0, t0, s0 ++ nop ++$Remain: ++ and N, 15, I ++ ADD s0, s2, s0 ++ unop ++ ble I, $End ++ .align 4 ++ ++$RemainLoop: ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s0, t0, s0 ++ bne I, $RemainLoop ++ .align 4 ++ ++$End: ++ ret ++ ++ ++$Sub: ++ sra N, 3, I ++ fclr s1 ++ fclr s2 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t1 ++ SXADDQ INCX, X, X ++ fclr t2 ++ ++ LD a1, 0 * SIZE(X) ++ fclr t3 ++ SXADDQ INCX, X, X ++ fclr s3 ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ fillcs PREFETCHSIZE * 2 * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ LD a7, 0 * SIZE(X) ++ fabs a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fabs a3, t3 ++ SXADDQ INCX, X, X ++ ++ ADD s0, t0, s0 ++ LD a1, 0 * SIZE(X) ++ fabs a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fabs a5, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ LD a3, 0 * SIZE(X) ++ fabs a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fabs a7, t3 ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fabs a0, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a7, 0 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fabs a2, t2 ++ ADD s3, t3, s3 ++ fabs a3, t3 ++ ++ ADD s0, t0, s0 ++ fabs a4, t0 ++ ADD s1, t1, s1 ++ fabs a5, t1 ++ ADD s2, t2, s2 ++ fabs a6, t2 ++ ADD s3, t3, s3 ++ fabs a7, t3 ++ ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ ADD s0, s1, s0 ++ ADD s2, s3, s2 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ADD s0, s2, s0 ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ fabs a0, t0 ++ ++ ldi 
I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/axpy.S b/kernel/sw_64/axpy.S +new file mode 100644 +index 0000000..70e97d6 +--- /dev/null ++++ b/kernel/sw_64/axpy.S +@@ -0,0 +1,428 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
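The asum_simd.S kernel above peels scalar elements until X reaches a VEC_LEN boundary, runs a vectorized body (vcpys clears the sign bit, i.e. fabs, and VADD accumulates the lanes), extracts the lanes with vextf for the final sum, and finishes with a scalar remainder; strided input (INCX != 1) falls back to the scalar $Sub path. A hedged C sketch of that peel/vector-body/tail structure, simplified to one 4-wide accumulator per step, with plain loops standing in for the SIMD instructions:

#include <math.h>
#include <stddef.h>
#include <stdint.h>

#define VEC_LEN 4                          /* lanes per vector register */

static double asum_aligned_sketch(size_t n, const double *x) {
    double acc[VEC_LEN] = {0};
    size_t i = 0;

    /* peel until x+i sits on a vector boundary (the $UnAlign_X_Loop part) */
    while (i < n && ((uintptr_t)(x + i) % (VEC_LEN * sizeof(double))) != 0)
        acc[0] += fabs(x[i++]);

    /* vector body: VEC_LEN lanes per step (VLD / vcpys / VADD in the kernel) */
    for (; i + VEC_LEN <= n; i += VEC_LEN)
        for (int k = 0; k < VEC_LEN; k++)
            acc[k] += fabs(x[i + k]);

    /* lane extraction (vextf) and scalar remainder ($Remain / $RemainLoop) */
    double s = acc[0] + acc[1] + acc[2] + acc[3];
    for (; i < n; i++)
        s += fabs(x[i]);
    return s;
}

int main(void) {
    double v[13];
    for (int k = 0; k < 13; k++) v[k] = (k % 2 ? -1.0 : 1.0) * k;
    return asum_aligned_sketch(13, v) == 78.0 ? 0 : 1;
}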
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 40 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldl $24, 0($sp) ++ fmov $f19, $f30 ++ ldl $23, 8($sp) ++ ldi $sp, -16($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ nop ++ sra $16, 3, $1 ++ fstd $f2, 0($sp) ++ cmpeq $21, 1, $3 ++ ++ fstd $f3, 8($sp) ++ cmpeq $23, 1, $4 ++ and $16, 7, $2 ++ ble $16, $End ++ ++ and $3, $4, $3 ++ fbeq $f30, $End ++ ++ beq $3, $Sub ++ ble $1, $Remain ++ .align 4 ++ ++ LD $f10, 0*SIZE($20) ++ LD $f11, 1*SIZE($20) ++ LD $f12, 2*SIZE($20) ++ LD $f13, 3*SIZE($20) ++ ++ LD $f18, 0*SIZE($24) ++ LD $f19, 1*SIZE($24) ++ LD $f20, 2*SIZE($24) ++ LD $f21, 3*SIZE($24) ++ ++ LD $f14, 4*SIZE($20) ++ LD $f15, 5*SIZE($20) ++ LD $f16, 6*SIZE($20) ++ LD $f17, 7*SIZE($20) ++ ++ LD $f22, 4*SIZE($24) ++ LD $f23, 5*SIZE($24) ++ LD $f24, 6*SIZE($24) ++ LD $f25, 7*SIZE($24) ++ ++ subl $1, 1, $1 ++ addl $20, 8*SIZE, $20 ++ unop ++ ble $1, $LoopEnd ++ .align 4 ++ ++$Loop: ++ fillcs PREFETCHSIZE * SIZE($24) ++ fillcs PREFETCHSIZE * SIZE($20) ++ ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ LD $f10, 0*SIZE($20) ++ MUL $f30, $f11, $f27 ++ LD $f11, 1*SIZE($20) ++ ++ MUL $f30, $f12, $f28 ++ LD $f12, 2*SIZE($20) ++ MUL $f30, $f13, $f29 ++ LD $f13, 3*SIZE($20) ++ ++ ADD $f18, $f26, $f0 ++ LD $f18, 8*SIZE($24) ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ LD $f14, 4*SIZE($20) ++ ++ ADD $f19, $f27, $f1 ++ LD $f19, 9*SIZE($24) ++ MUL $f30, $f15, $f27 ++ LD $f15, 5*SIZE($20) ++ ++ ADD $f20, $f28, $f2 ++ LD $f20, 10*SIZE($24) ++ MUL $f30, $f16, $f28 ++ LD $f16, 6*SIZE($20) ++ ++ ADD $f21, $f29, $f3 ++ LD $f21, 11*SIZE($24) ++ MUL $f30, $f17, $f29 ++ LD $f17, 7*SIZE($20) ++ ++ ST $f0, 0*SIZE($24) ++ ADD $f22, $f26, $f0 ++ ST $f1, 1*SIZE($24) ++ ADD $f23, $f27, $f1 ++ ++ ST $f2, 2*SIZE($24) ++ ADD $f24, $f28, $f2 ++ ST $f3, 3*SIZE($24) ++ ADD $f25, $f29, $f3 ++ ++ LD $f22, 12*SIZE($24) ++ LD $f23, 13*SIZE($24) ++ LD $f24, 14*SIZE($24) ++ LD $f25, 15*SIZE($24) ++ ++ ST $f0, 4*SIZE($24) ++ ST $f1, 5*SIZE($24) ++ ST $f2, 6*SIZE($24) ++ ST $f3, 7*SIZE($24) ++ ++ subl $1, 1, $1 ++ addl $24, 8*SIZE, $24 ++ addl $20, 8*SIZE, $20 ++ bgt $1, $Loop ++ .align 4 ++ ++$LoopEnd: ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ MUL $f30, $f11, $f27 ++ MUL $f30, $f12, $f28 ++ MUL $f30, $f13, $f29 ++ ++ ADD $f18, $f26, $f0 ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ ADD $f19, $f27, $f1 ++ MUL $f30, $f15, $f27 ++ ++ ADD $f20, $f28, $f2 ++ MUL $f30, $f16, $f28 ++ ADD $f21, $f29, $f3 ++ MUL $f30, $f17, $f29 ++ ++ ST $f0, 0*SIZE($24) ++ ADD $f22, $f26, $f0 ++ ST $f1, 1*SIZE($24) ++ ADD $f23, $f27, $f1 ++ ++ ST $f2, 2*SIZE($24) ++ ADD $f24, $f28, $f2 ++ ST $f3, 3*SIZE($24) ++ ADD $f25, $f29, $f3 ++ ++ ST $f0, 4*SIZE($24) ++ ST $f1, 5*SIZE($24) ++ ST $f2, 6*SIZE($24) ++ ST $f3, 7*SIZE($24) ++ addl $24, 8*SIZE, $24 ++ .align 4 ++ ++$Remain: ++ ble $2, $End ++ .align 4 ++ ++$RemainLoop: ++ LD $f10, 0*SIZE($20) ++ LD $f11, 0*SIZE($24) ++ addl $20, SIZE, $20 ++ addl $24, SIZE, $24 ++ ++ MUL $f30, $f10, $f12 ++ subl $2, 1, $2 ++ ADD $f11, $f12, $f13 ++ ST $f13, -1*SIZE($24) ++ bgt $2, $RemainLoop ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ ldi $sp, 16($sp) ++ ret ++ .align 4 ++ ++$Sub: ++ SXSUBL $16, SIZE, $22 ++ subl $1, 1, $4 ++ ble $1, $SubRemain ++ .align 4 ++ ++ LD $f10, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f11, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f12, 0($20) ++ SXADDQ 
$21, $20, $20 ++ ++ LD $f13, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f18, 0($24) ++ SXADDQ $23, $24, $22 ++ ++ LD $f19, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f20, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f21, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f14, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f15, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f16, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f17, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f22, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f23, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f24, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f25, 0($22) ++ SXADDQ $23, $22, $22 ++ unop ++ ble $4, $SubLoopEnd ++ .align 4 ++ ++$SubLoop: ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ LD $f10, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f11, $f27 ++ LD $f11, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f12, $f28 ++ LD $f12, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f13, $f29 ++ LD $f13, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ ADD $f18, $f26, $f0 ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ LD $f14, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ADD $f19, $f27, $f1 ++ MUL $f30, $f15, $f27 ++ LD $f15, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ADD $f20, $f28, $f2 ++ MUL $f30, $f16, $f28 ++ LD $f16, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ADD $f21, $f29, $f3 ++ MUL $f30, $f17, $f29 ++ LD $f17, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f22, $f26, $f0 ++ unop ++ ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f23, $f27, $f1 ++ unop ++ ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f24, $f28, $f2 ++ unop ++ ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f25, $f29, $f3 ++ unop ++ ++ LD $f18, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f19, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f20, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f21, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f22, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f23, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f24, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f25, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ subl $4, 1, $4 ++ bgt $4, $SubLoop ++ .align 4 ++ ++$SubLoopEnd: ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ MUL $f30, $f11, $f27 ++ MUL $f30, $f12, $f28 ++ MUL $f30, $f13, $f29 ++ ++ ADD $f18, $f26, $f0 ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ ADD $f19, $f27, $f1 ++ MUL $f30, $f15, $f27 ++ ++ ADD $f20, $f28, $f2 ++ MUL $f30, $f16, $f28 ++ ADD $f21, $f29, $f3 ++ MUL $f30, $f17, $f29 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ ADD $f22, $f26, $f0 ++ ADD $f23, $f27, $f1 ++ ADD $f24, $f28, $f2 ++ ADD $f25, $f29, $f3 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ .align 4 ++ ++$SubRemain: ++ ble $2, $SubEnd ++ .align 4 ++ ++$SubRemainLoop: ++ LD $f10, 0($20) ++ LD $f11, 0($24) ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f10, $f12 ++ subl $2, 1, $2 ++ ADD $f11, $f12, $f13 ++ ST $f13, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ bgt $2, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ ldi $sp, 16($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/axpy_simd.S b/kernel/sw_64/axpy_simd.S +new file mode 100644 +index 
0000000..3a2219c +--- /dev/null ++++ b/kernel/sw_64/axpy_simd.S +@@ -0,0 +1,655 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ ++#define PREFETCHSIZE 80 ++// #define PREFETCH_DISTANCE_BYTES 384 ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldl $24, 0($sp) ++ fmov $f19, $f30 ++ ldl $23, 8($sp) ++ ldi $sp, -16($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fstd $f2, 0($sp) ++ cmpeq $21, 1, $3 ++ fstd $f3, 8($sp) ++ cmpeq $23, 1, $4 ++ ++ ble $16, $End ++ fbeq $f30, $End ++ and $3, $4, $3 ++ beq $3, $Sub ++ ++/** ++ test the address of Y ++**/ ++ and $24, (VEC_LEN*SIZE-1), $4 ++ nop ++ nop ++ beq $4, $Align_Y_Access ++ .align 4 ++/** ++ process the unalign address of Y ++**/ ++ ++ sra $16, 4, $1 ++ and $16, 15, $2 ++ sra $4, BASE_SHIFT, $4 ++ ble $1, $Remain /*if N is too small(less then unroll size), don't need process unalign Y. 
Just jump to remain section.*/ ++ ++ ldi $3, VEC_LEN ++ subl $3, $4, $4 ++ ++$UnAlign_Y_Loop: ++ LD $f10, 0*SIZE($20) ++ LD $f11, 0*SIZE($24) ++ addl $20, SIZE, $20 ++ addl $24, SIZE, $24 ++ ++ MAD $f30, $f10, $f11, $f13 ++ subl $4, 1, $4 ++ subl $16, 1, $16 ++ ST $f13, -1*SIZE($24) ++ bgt $4, $UnAlign_Y_Loop ++ .align 4 ++ ++ ++$Align_Y_Access: ++ ++ nop ++ sra $16, 4, $1 ++ and $16, 15, $2 ++ ble $1, $Remain ++ ++/** ++ test the address of X ++**/ ++ ++ and $20, (VEC_LEN*SIZE-1), $3 ++ nop ++ nop ++ bne $3, $UnAlign_X_Access ++ ++ .align 4 ++$Align_Access: ++/*** ++ extern alpha from $f30 to vector 4 in $f13 ++ unloop 16 ++***/ ++ vcpyf $f30, $f13 ++ ++ VLD $f10, 0*VEC_LEN*SIZE($20) ++/* ++ LD $f10, 0*SIZE($20) ++ LD $f11, 1*SIZE($20) ++ LD $f12, 2*SIZE($20) ++ LD $f13, 3*SIZE($20) ++*/ ++ VLD $f18, 0*VEC_LEN*SIZE($24) ++/* ++ LD $f18, 0*SIZE($24) ++ LD $f19, 1*SIZE($24) ++ LD $f20, 2*SIZE($24) ++ LD $f21, 3*SIZE($24) ++*/ ++ VLD $f14, 1*VEC_LEN*SIZE($20) ++ VLD $f15, 2*VEC_LEN*SIZE($20) ++ VLD $f16, 3*VEC_LEN*SIZE($20) ++/* ++ LD $f14, 4*SIZE($20) ++ LD $f15, 5*SIZE($20) ++ LD $f16, 6*SIZE($20) ++ LD $f17, 7*SIZE($20) ++*/ ++ VLD $f22, 1*VEC_LEN*SIZE($24) ++ VLD $f23, 2*VEC_LEN*SIZE($24) ++ VLD $f24, 3*VEC_LEN*SIZE($24) ++/* ++ LD $f22, 4*SIZE($24) ++ LD $f23, 5*SIZE($24) ++ LD $f24, 6*SIZE($24) ++ LD $f25, 7*SIZE($24) ++*/ ++ ++ subl $1, 1, $1 ++ addl $20, 16*SIZE, $20 ++ unop ++ ble $1, $LoopEnd ++ .align 4 ++ ++$Loop: ++ ++ fillcs PREFETCHSIZE * SIZE($24) ++ fillcs PREFETCHSIZE * SIZE($20) ++/* ++ fillcs PREFETCH_DISTANCE_BYTES($24) ++ fillcs PREFETCH_DISTANCE_BYTES($20) ++*/ ++ ++ VMAD $f13, $f10, $f18, $f0 ++ VLD $f10, 0*VEC_LEN*SIZE($20) ++ VLD $f18, 4*VEC_LEN*SIZE($24) ++/* ++ MAD $f30, $f10, $f18, $f0 # y += alpha * x ++ LD $f10, 0*SIZE($20) ++ MAD $f30, $f11, $f19, $f1 ++ LD $f11, 1*SIZE($20) ++ ++ MAD $f30, $f12, $f20, $f2 ++ LD $f12, 2*SIZE($20) ++ MAD $f30, $f13, $f21, $f3 ++ LD $f13, 3*SIZE($20) ++*/ ++ ++ VMAD $f13, $f14, $f22, $f26 ++ VLD $f14, 1*VEC_LEN*SIZE($20) ++ VLD $f22, 5*VEC_LEN*SIZE($24) ++ ++ VMAD $f13, $f15, $f23, $f27 ++ VLD $f15, 2*VEC_LEN*SIZE($20) ++ VLD $f23, 6*VEC_LEN*SIZE($24) ++ ++ VMAD $f13, $f16, $f24, $f28 ++ VLD $f16, 3*VEC_LEN*SIZE($20) ++ VLD $f24, 7*VEC_LEN*SIZE($24) ++/* ++ MAD $f30, $f14, $f22, $f26 # y += alpha * x ++ LD $f14, 4*SIZE($20) ++ MAD $f30, $f15, $f23, $f27 ++ LD $f15, 5*SIZE($20) ++ ++ MAD $f30, $f16, $f24, $f28 ++ LD $f16, 6*SIZE($20) ++ MAD $f30, $f17, $f25, $f29 ++ LD $f17, 7*SIZE($20) ++*/ ++ ++/* ++ LD $f18, 8*SIZE($24) ++ LD $f19, 9*SIZE($24) ++ LD $f20, 10*SIZE($24) ++ LD $f21, 11*SIZE($24) ++ ++ LD $f22, 12*SIZE($24) ++ LD $f23, 13*SIZE($24) ++ LD $f24, 14*SIZE($24) ++ LD $f25, 15*SIZE($24) ++*/ ++ ++ ++ ++ VST $f0, 0*VEC_LEN*SIZE($24) ++ VST $f26, 1*VEC_LEN*SIZE($24) ++ VST $f27, 2*VEC_LEN*SIZE($24) ++ VST $f28, 3*VEC_LEN*SIZE($24) ++/* ++ ST $f0, 0*SIZE($24) ++ ST $f1, 1*SIZE($24) ++ ST $f2, 2*SIZE($24) ++ ST $f3, 3*SIZE($24) ++ ++ ST $f26, 4*SIZE($24) ++ ST $f27, 5*SIZE($24) ++ ST $f28, 6*SIZE($24) ++ ST $f29, 7*SIZE($24) ++*/ ++ subl $1, 1, $1 ++ addl $24, 16*SIZE, $24 ++ addl $20, 16*SIZE, $20 ++ bgt $1, $Loop ++ .align 4 ++ ++$LoopEnd: ++ VMAD $f13, $f10, $f18, $f0 ++ VST $f0, 0*VEC_LEN*SIZE($24) ++ VMAD $f13, $f14, $f22, $f26 ++ VST $f26, 1*VEC_LEN*SIZE($24) ++ VMAD $f13, $f15, $f23, $f27 ++ VST $f27, 2*VEC_LEN*SIZE($24) ++ VMAD $f13, $f16, $f24, $f28 ++ VST $f28, 3*VEC_LEN*SIZE($24) ++ ++/* ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ MUL $f30, $f11, $f27 ++ MUL $f30, $f12, $f28 ++ MUL $f30, $f13, $f29 ++ ++ 
ADD $f18, $f26, $f0 ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ ADD $f19, $f27, $f1 ++ MUL $f30, $f15, $f27 ++ ++ ADD $f20, $f28, $f2 ++ MUL $f30, $f16, $f28 ++ ADD $f21, $f29, $f3 ++ MUL $f30, $f17, $f29 ++ ++ ST $f0, 0*SIZE($24) ++ ADD $f22, $f26, $f0 ++ ST $f1, 1*SIZE($24) ++ ADD $f23, $f27, $f1 ++ ++ ST $f2, 2*SIZE($24) ++ ADD $f24, $f28, $f2 ++ ST $f3, 3*SIZE($24) ++ ADD $f25, $f29, $f3 ++ ++ ST $f0, 4*SIZE($24) ++ ST $f1, 5*SIZE($24) ++ ST $f2, 6*SIZE($24) ++ ST $f3, 7*SIZE($24) ++*/ ++ addl $24, 16*SIZE, $24 ++ ++ .align 4 ++ ++$Remain: ++ ble $2, $End ++ ++ .align 4 ++ ++$RemainLoop: ++ LD $f10, 0*SIZE($20) ++ LD $f11, 0*SIZE($24) ++ addl $20, SIZE, $20 ++ addl $24, SIZE, $24 ++ ++ MAD $f30, $f10, $f11, $f13 ++ subl $2, 1, $2 ++ ST $f13, -1*SIZE($24) ++ bgt $2, $RemainLoop ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ ldi $sp, 16($sp) ++ ret ++ .align 4 ++ ++$UnAlign_X_Access: ++/*** ++ extern alpha from $f30 to vector 4 in $f13 ++ unloop 16 ++ unalign access X ++ align access Y ++***/ ++ vcpyf $f30, $f13 ++ VLD_UL $f10, 0*VEC_LEN*SIZE($20) ++ VLD_UH $f2, 1*VEC_LEN*SIZE($20) ++ ++ VLD_UL $f14, 1*VEC_LEN*SIZE($20) ++ VLD_UH $f3, 2*VEC_LEN*SIZE($20) ++ ++ VLD_UL $f15, 2*VEC_LEN*SIZE($20) ++ VLD_UH $f11, 3*VEC_LEN*SIZE($20) ++ ++ VLD_UL $f16, 3*VEC_LEN*SIZE($20) ++ VLD_UH $f12, 4*VEC_LEN*SIZE($20) ++ ++ VLD $f18, 0*VEC_LEN*SIZE($24) ++ VLD $f22, 1*VEC_LEN*SIZE($24) ++ VLD $f23, 2*VEC_LEN*SIZE($24) ++ VLD $f24, 3*VEC_LEN*SIZE($24) ++ ++ vbisw $f10, $f2, $f10 ++ vbisw $f14, $f3, $f14 ++ vbisw $f15, $f11, $f15 ++ vbisw $f16, $f12, $f16 ++ ++ ++ subl $1, 1, $1 ++ addl $20, 16*SIZE, $20 ++ unop ++ ble $1, $UnAlign_X_LoopEnd ++ .align 4 ++ ++$UnAlign_X_Loop: ++ ++ fillcs PREFETCHSIZE * SIZE($24) ++ fillcs PREFETCHSIZE * SIZE($20) ++ ++ VMAD $f13, $f10, $f18, $f0 ++ VLD_UL $f10, 0*VEC_LEN*SIZE($20) ++ VLD_UH $f2, 1*VEC_LEN*SIZE($20) ++ ++ ++ VMAD $f13, $f14, $f22, $f26 ++ VLD_UL $f14, 1*VEC_LEN*SIZE($20) ++ VLD_UH $f3, 2*VEC_LEN*SIZE($20) ++ ++ VMAD $f13, $f15, $f23, $f27 ++ VLD_UL $f15, 2*VEC_LEN*SIZE($20) ++ VLD_UH $f11, 3*VEC_LEN*SIZE($20) ++ ++ VMAD $f13, $f16, $f24, $f28 ++ VLD_UL $f16, 3*VEC_LEN*SIZE($20) ++ VLD_UH $f12, 4*VEC_LEN*SIZE($20) ++ ++ ++ ++ ++ VLD $f18, 4*VEC_LEN*SIZE($24) ++ vbisw $f10, $f2, $f10 ++ VLD $f22, 5*VEC_LEN*SIZE($24) ++ vbisw $f14, $f3, $f14 ++ VLD $f23, 6*VEC_LEN*SIZE($24) ++ vbisw $f15, $f11, $f15 ++ VLD $f24, 7*VEC_LEN*SIZE($24) ++ vbisw $f16, $f12, $f16 ++ ++ ++ VST $f0, 0*VEC_LEN*SIZE($24) ++ VST $f26, 1*VEC_LEN*SIZE($24) ++ VST $f27, 2*VEC_LEN*SIZE($24) ++ VST $f28, 3*VEC_LEN*SIZE($24) ++ ++ ++ subl $1, 1, $1 ++ addl $24, 16*SIZE, $24 ++ addl $20, 16*SIZE, $20 ++ bgt $1, $UnAlign_X_Loop ++ .align 4 ++ ++$UnAlign_X_LoopEnd: ++ VMAD $f13, $f10, $f18, $f0 ++ VST $f0, 0*VEC_LEN*SIZE($24) ++ VMAD $f13, $f14, $f22, $f26 ++ VST $f26, 1*VEC_LEN*SIZE($24) ++ VMAD $f13, $f15, $f23, $f27 ++ VST $f27, 2*VEC_LEN*SIZE($24) ++ VMAD $f13, $f16, $f24, $f28 ++ VST $f28, 3*VEC_LEN*SIZE($24) ++ ++ addl $24, 16*SIZE, $24 ++ ++ .align 4 ++ ++$UnAlign_X_Remain: ++ ble $2, $UnAlign_X_End ++ ++ .align 4 ++ ++$UnAlign_X_RemainLoop: ++ LD $f10, 0*SIZE($20) ++ LD $f11, 0*SIZE($24) ++ addl $20, SIZE, $20 ++ addl $24, SIZE, $24 ++ ++ MAD $f30, $f10, $f11, $f13 ++ subl $2, 1, $2 ++ ST $f13, -1*SIZE($24) ++ bgt $2, $UnAlign_X_RemainLoop ++ .align 4 ++ ++$UnAlign_X_End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ ldi $sp, 16($sp) ++ ret ++ .align 4 ++ ++ ++$Sub: ++ sra $16, 3, $1 ++ and $16, 7, $2 ++ SXSUBL $16, SIZE, $22 ++ subl $1, 1, $4 ++ ++ ble $1, 
$SubRemain ++ .align 4 ++ ++ LD $f10, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f11, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f12, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f13, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f18, 0($24) ++ SXADDQ $23, $24, $22 ++ ++ LD $f19, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f20, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f21, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f14, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f15, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f16, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ LD $f17, 0($20) ++ SXADDQ $21, $20, $20 ++ LD $f22, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f23, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f24, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f25, 0($22) ++ SXADDQ $23, $22, $22 ++ unop ++ ble $4, $SubLoopEnd ++ .align 4 ++ ++$SubLoop: ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ LD $f10, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f11, $f27 ++ LD $f11, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f12, $f28 ++ LD $f12, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f13, $f29 ++ LD $f13, 0($20) ++ unop ++ SXADDQ $21, $20, $20 ++ ++ ADD $f18, $f26, $f0 ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ LD $f14, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ADD $f19, $f27, $f1 ++ MUL $f30, $f15, $f27 ++ LD $f15, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ADD $f20, $f28, $f2 ++ MUL $f30, $f16, $f28 ++ LD $f16, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ADD $f21, $f29, $f3 ++ MUL $f30, $f17, $f29 ++ LD $f17, 0($20) ++ SXADDQ $21, $20, $20 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f22, $f26, $f0 ++ unop ++ ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f23, $f27, $f1 ++ unop ++ ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f24, $f28, $f2 ++ unop ++ ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ ADD $f25, $f29, $f3 ++ unop ++ ++ LD $f18, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f19, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f20, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f21, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f22, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f23, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ LD $f24, 0($22) ++ SXADDQ $23, $22, $22 ++ LD $f25, 0($22) ++ SXADDQ $23, $22, $22 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ subl $4, 1, $4 ++ bgt $4, $SubLoop ++ .align 4 ++ ++$SubLoopEnd: ++ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 ++ MUL $f30, $f11, $f27 ++ MUL $f30, $f12, $f28 ++ MUL $f30, $f13, $f29 ++ ++ ADD $f18, $f26, $f0 ++ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ++ ADD $f19, $f27, $f1 ++ MUL $f30, $f15, $f27 ++ ++ ADD $f20, $f28, $f2 ++ MUL $f30, $f16, $f28 ++ ADD $f21, $f29, $f3 ++ MUL $f30, $f17, $f29 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ ADD $f22, $f26, $f0 ++ ADD $f23, $f27, $f1 ++ ADD $f24, $f28, $f2 ++ ADD $f25, $f29, $f3 ++ ++ ST $f0, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f1, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ ST $f2, 0($24) ++ SXADDQ $23, $24, $24 ++ ST $f3, 0($24) ++ SXADDQ $23, $24, $24 ++ .align 4 ++ ++$SubRemain: ++ ble $2, $SubEnd ++ .align 4 ++ ++$SubRemainLoop: ++ LD $f10, 0($20) ++ LD $f11, 0($24) ++ SXADDQ $21, $20, $20 ++ ++ MUL $f30, $f10, $f12 ++ subl $2, 1, $2 ++ ADD $f11, $f12, $f13 ++ ST $f13, 0($24) ++ SXADDQ $23, $24, $24 ++ ++ bgt $2, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) 
++ ldi $sp, 16($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/cabs.S b/kernel/sw_64/cabs.S +new file mode 100644 +index 0000000..3f9ed2c +--- /dev/null ++++ b/kernel/sw_64/cabs.S +@@ -0,0 +1,72 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl NAME ++ .ent NAME ++NAME: ++ .frame $sp, 0, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++#endif ++ ++ LD $f10, 0($16) ++ LD $f11, SIZE($16) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fabs $f10, $f12 ++ fabs $f11, $f0 ++ ADD $f12, $f0, $f29 ++ fmov $f29, $f0 ++ ret ++ .end NAME ++ .ident VERSION +diff --git a/kernel/sw_64/cabs.S.bak b/kernel/sw_64/cabs.S.bak +new file mode 100644 +index 0000000..5fa27af +--- /dev/null ++++ b/kernel/sw_64/cabs.S.bak +@@ -0,0 +1,71 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl NAME ++ .ent NAME ++NAME: ++ .frame $sp, 0, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ lda $28, _mcount ++ jsr $28, ($28), _mcount ++#endif ++ ++ LD $f10, 0($16) ++ LD $f11, SIZE($16) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fabs $f10, $f12 ++ fabs $f11, $f0 ++ ADD $f12, $f0, $f0 ++ ret ++ .end NAME ++ .ident VERSION +diff --git a/kernel/sw_64/cnrm2.S b/kernel/sw_64/cnrm2.S +new file mode 100644 +index 0000000..25eab03 +--- /dev/null ++++ b/kernel/sw_64/cnrm2.S +@@ -0,0 +1,440 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stl $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ sll INCX, ZBASE_SHIFT, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, 2 * SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, $f25 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, $f26 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, $f27 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, $f28 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd $f25, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd $f26, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd $f27, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd $f28, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, $f25 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, $f26 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, $f27 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, $f28 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd $f25, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd $f26, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd $f27, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd $f28, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, $f25 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * 
SIZE(X) ++ ++ faddd a1, t1, $f26 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, $f27 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, $f28 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd $f25, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd $f26, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd $f27, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd $f28, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, $f25 ++ fmuld x0, x0, t0 ++ faddd a1, t1, $f26 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, $f27 ++ fmuld x2, x2, t2 ++ faddd a3, t3, $f28 ++ fmuld x3, x3, t3 ++ ++ faddd $f25, t0, a0 ++ fmuld x4, x4, t0 ++ faddd $f26, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd $f27, t2, a2 ++ fmuld x6, x6, t2 ++ faddd $f28, t3, a2 ++ fmuld x7, x7, t3 ++ ++ faddd a2, t2, $f27 ++ fmov $f27, a2 ++ faddd a3, t3, $f28 ++ fmov $f28, a3 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ LD x1, 1 * SIZE(X) ++ ++ ldi X, 2 * SIZE(X) ++ ++ faddd a0, t0, $f25 ++ fmov $f25, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, $f26 ++ fmov $f26, a1 ++ fmuld x1, x1, t1 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 2, I ++ fclr t1 ++ ble I, $L25 ++ ++ LD x0, 0 * SIZE(X) ++ fclr t2 ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ fclr t3 ++ LD x3, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x5, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x6, 0 * SIZE(X) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, $f25 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, $f26 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, $f27 ++ LD x1, 1 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, $f28 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ unop ++ ++ faddd $f25, t0, a0 ++ LD x3, 1 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd $f26, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ ldi I, -1(I) ++ ++ faddd $f27, t2, a2 ++ LD x5, 1 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd $f28, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, $f25 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, $f26 ++ fmuld x1, x1, t1 ++ faddd a2, t2, $f27 ++ fmuld x2, x2, t2 ++ ++ faddd a3, t3, $f28 ++ fmuld x3, x3, t3 ++ faddd $f25, t0, a0 ++ fmuld x4, x4, t0 ++ ++ faddd $f26, t1, a1 ++ fmuld x5, x5, t1 ++ faddd $f27, t2, a2 ++ fmuld x6, x6, t2 ++ ++ faddd $f28, t3, a3 ++ fmuld x7, x7, t3 ++ faddd a2, t2, $f27 ++ fmov $f27, a2 ++ faddd a3, t3, $f28 ++ fmov $f28, a3 ++ .align 4 ++ ++$L25: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, $f25 ++ fmov $f25, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, $f26 ++ fmov $f26, a1 ++ fmuld x1, x1, t1 ++ ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, $f25 ++ fmov $f25, a0 ++ faddd a1, t1, $f26 ++ fmov $f26, a1 ++ ++ faddd a0, a1, $f25 ++ fmov $f25, a0 ++ faddd a2, a3, $f26 ++ fmov $f26, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, $f25 ++ fmov $f25, a0 ++ fsqrtd a0, $f25 ++ fmov $f25, a0 
++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/cnrm2.S.bak b/kernel/sw_64/cnrm2.S.bak +new file mode 100644 +index 0000000..b2e80e0 +--- /dev/null ++++ b/kernel/sw_64/cnrm2.S.bak +@@ -0,0 +1,426 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ sll INCX, ZBASE_SHIFT, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, 2 * SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, a0 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a2, t2, a2 ++ 
faddd a3, t3, a3 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ LD x1, 1 * SIZE(X) ++ ++ ldi X, 2 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 2, I ++ fclr t1 ++ ble I, $L25 ++ ++ LD x0, 0 * SIZE(X) ++ fclr t2 ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ fclr t3 ++ LD x3, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x5, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x6, 0 * SIZE(X) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, a2 ++ LD x1, 1 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ unop ++ ++ faddd a0, t0, a0 ++ LD x3, 1 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ ldi I, -1(I) ++ ++ faddd a2, t2, a2 ++ LD x5, 1 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, a0 ++ faddd a1, t1, a1 ++ ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/copy.S b/kernel/sw_64/copy.S +new file mode 100644 +index 0000000..c960ac1 +--- /dev/null ++++ b/kernel/sw_64/copy.S +@@ -0,0 +1,379 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ cmpeq INCX, 1, $0 ++ ble N, $End ++#ifndef COMPLEX ++ sra N, 4, $4 ++#else ++ sra N, 3, $4 ++#endif ++ cmpeq INCY, 1, $1 ++ ++ and $0, $1, $0 ++ beq $0, $Sub ++#ifndef COMPLEX ++ and N, 15, $5 ++#else ++ and N, 7, $5 ++#endif ++ ble $4, $Remain ++ ++ LD $f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ LD $f12, 2*SIZE(X) ++ LD $f13, 3*SIZE(X) ++ LD $f14, 4*SIZE(X) ++ LD $f15, 5*SIZE(X) ++ LD $f16, 6*SIZE(X) ++ LD $f17, 7*SIZE(X) ++ ++ LD $f18, 8*SIZE(X) ++ LD $f19, 9*SIZE(X) ++ LD $f20, 10*SIZE(X) ++ LD $f21, 11*SIZE(X) ++ LD $f22, 12*SIZE(X) ++ LD $f23, 13*SIZE(X) ++ LD $f24, 14*SIZE(X) ++ LD $f25, 15*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi X, 16*SIZE(X) ++ ble $4, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ST $f12, 2*SIZE(Y) ++ ST $f13, 3*SIZE(Y) ++ ++ LD $f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ LD $f12, 2*SIZE(X) ++ LD $f13, 3*SIZE(X) ++ ++ ST $f14, 4*SIZE(Y) ++ ST $f15, 5*SIZE(Y) ++ ST $f16, 6*SIZE(Y) ++ ST $f17, 7*SIZE(Y) ++ ++ LD $f14, 4*SIZE(X) ++ LD $f15, 5*SIZE(X) ++ LD $f16, 6*SIZE(X) ++ LD $f17, 7*SIZE(X) ++ ++ ST $f18, 8*SIZE(Y) ++ ST $f19, 9*SIZE(Y) ++ ST $f20, 10*SIZE(Y) ++ ST $f21, 11*SIZE(Y) ++ ++ LD $f18, 8*SIZE(X) ++ LD $f19, 9*SIZE(X) ++ LD $f20, 10*SIZE(X) ++ LD $f21, 11*SIZE(X) ++ ++ ST $f22, 12*SIZE(Y) ++ ST $f23, 13*SIZE(Y) ++ ST $f24, 14*SIZE(Y) ++ ST $f25, 15*SIZE(Y) ++ ++ LD $f22, 12*SIZE(X) ++ LD $f23, 13*SIZE(X) ++ LD $f24, 14*SIZE(X) ++ LD $f25, 15*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi Y, 16*SIZE(Y) ++ ldi X, 16*SIZE(X) ++ bgt $4, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ST $f12, 2*SIZE(Y) ++ ST $f13, 3*SIZE(Y) ++ ST $f14, 4*SIZE(Y) ++ ST $f15, 5*SIZE(Y) ++ ST $f16, 6*SIZE(Y) ++ ST $f17, 7*SIZE(Y) ++ ++ ST $f18, 8*SIZE(Y) ++ ST $f19, 9*SIZE(Y) ++ ST $f20, 10*SIZE(Y) ++ ST $f21, 11*SIZE(Y) ++ ST $f22, 12*SIZE(Y) ++ ST $f23, 13*SIZE(Y) ++ ST $f24, 14*SIZE(Y) ++ ST $f25, 15*SIZE(Y) ++ ++ ldi Y, 16*SIZE(Y) ++ .align 4 ++ ++$Remain: ++ ble $5, $End ++ .align 4 ++ ++$RemainLoop: ++#ifndef COMPLEX ++ LD $f10, 
0*SIZE(X) ++ ldi X, 1*SIZE(X) ++ ST $f10, 0*SIZE(Y) ++ ldi Y, 1*SIZE(Y) ++#else ++ LD $f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ ldi X, 2*SIZE(X) ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ldi Y, 2*SIZE(Y) ++#endif ++ subl $5, 1, $5 ++ bgt $5, $RemainLoop ++ .align 4 ++$End: ++ ret ++ .align 4 ++ ++$Sub: ++#ifdef COMPLEX ++ addl INCX, INCX, INCX ++ addl INCY, INCY, INCY ++ and N, 7, $5 ++#else ++ and N, 15, $5 ++#endif ++ ble $4, $SubRemain ++ .align 4 ++ ++$SubMainLoop: ++#ifndef COMPLEX ++ LD $f10, 0(X) ++ SXADDQ INCX, X, X ++ LD $f11, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f12, 0(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f14, 0(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f16, 0(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f18, 0(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f20, 0(X) ++ SXADDQ INCX, X, X ++ LD $f21, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f22, 0(X) ++ SXADDQ INCX, X, X ++ LD $f23, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f24, 0(X) ++ SXADDQ INCX, X, X ++ LD $f25, 0(X) ++ SXADDQ INCX, X, X ++ ++ ST $f10, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f11, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f12, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f13, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f14, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f15, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f16, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f17, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f18, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f19, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f20, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f21, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f22, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f23, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f24, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f25, 0(Y) ++ SXADDQ INCY, Y, Y ++#else ++ LD $f10, 0(X) ++ LD $f11, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f12, 0(X) ++ LD $f13, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f14, 0(X) ++ LD $f15, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f16, 0(X) ++ LD $f17, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f18, 0(X) ++ LD $f19, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f20, 0(X) ++ LD $f21, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f22, 0(X) ++ LD $f23, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f24, 0(X) ++ LD $f25, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST $f10, 0(Y) ++ ST $f11, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f12, 0(Y) ++ ST $f13, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f14, 0(Y) ++ ST $f15, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f16, 0(Y) ++ ST $f17, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f18, 0(Y) ++ ST $f19, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f20, 0(Y) ++ ST $f21, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f22, 0(Y) ++ ST $f23, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f24, 0(Y) ++ ST $f25, SIZE(Y) ++ SXADDQ INCY, Y, Y ++#endif ++ subl $4, 1, $4 ++ bgt $4, $SubMainLoop ++ .align 4 ++ ++$SubRemain: ++ ble $5, $SubEnd ++ .align 4 ++ ++ $SubRemainLoop: ++#ifndef COMPLEX ++ LD $f10, 0(X) ++ SXADDQ INCX, X, X ++ ST $f10, 0(Y) ++ SXADDQ INCY, Y, Y ++#else ++ LD $f10, 0(X) ++ LD $f11, SIZE(X) ++ SXADDQ INCX, X, X ++ ST $f10, 0(Y) ++ ST $f11, SIZE(Y) ++ SXADDQ INCY, Y, Y ++#endif ++ subl $5, 1, $5 ++ bgt $5, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/copy_simd.S b/kernel/sw_64/copy_simd.S +new file mode 100644 +index 0000000..84e96a9 +--- /dev/null ++++ b/kernel/sw_64/copy_simd.S +@@ -0,0 +1,563 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. 
*/ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ cmpeq INCX, 1, $0 ++ ble N, $End ++#ifndef COMPLEX ++ sra N, 4, $4 ++#else ++ sra N, 3, $4 ++#endif ++ cmpeq INCY, 1, $1 ++ ++ and $0, $1, $0 ++ beq $0, $Sub ++#ifndef COMPLEX ++ and N, 15, $5 ++#else ++ and N, 7, $5 ++#endif ++ ble $4, $Remain ++ ++/** ++ test the address of X & Y ++**/ ++ ++ and Y, (VEC_LEN*SIZE-1), $6 ++ and X, (VEC_LEN*SIZE-1), $7 ++ bgt $6, $UnAlign_Y_ACCESS ++ bgt $7, $UnAlign_X_ACCESS ++ ++ .align 4 ++ ++$Align: ++ VLD $f10, 0*VEC_LEN*SIZE(X) ++ VLD $f11, 1*VEC_LEN*SIZE(X) ++ VLD $f12, 2*VEC_LEN*SIZE(X) ++ VLD $f13, 3*VEC_LEN*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi X, 16*SIZE(X) ++ ble $4, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ fillcs PREFETCHSIZE * SIZE(X) ++ fillcs PREFETCHSIZE * SIZE(Y) ++ ++ VST $f10, 0*VEC_LEN*SIZE(Y) ++ VST $f11, 1*VEC_LEN*SIZE(Y) ++ VST $f12, 2*VEC_LEN*SIZE(Y) ++ VST $f13, 3*VEC_LEN*SIZE(Y) ++ ++ VLD $f10, 0*VEC_LEN*SIZE(X) ++ VLD $f11, 1*VEC_LEN*SIZE(X) ++ VLD $f12, 2*VEC_LEN*SIZE(X) ++ VLD $f13, 3*VEC_LEN*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi Y, 16*SIZE(Y) ++ ldi X, 16*SIZE(X) ++ bgt $4, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ ++ VST $f10, 0*VEC_LEN*SIZE(Y) ++ VST $f11, 1*VEC_LEN*SIZE(Y) ++ VST $f12, 2*VEC_LEN*SIZE(Y) ++ VST $f13, 3*VEC_LEN*SIZE(Y) ++ ++ ldi Y, 16*SIZE(Y) ++ .align 4 ++ ++$Remain: ++ ble $5, $End ++ .align 4 ++ ++$RemainLoop: ++#ifndef COMPLEX ++ LD $f10, 0*SIZE(X) ++ ldi X, 1*SIZE(X) ++ ST $f10, 0*SIZE(Y) ++ ldi Y, 1*SIZE(Y) ++#else ++ LD 
$f10, 0*SIZE(X) ++ LD $f11, 1*SIZE(X) ++ ldi X, 2*SIZE(X) ++ ST $f10, 0*SIZE(Y) ++ ST $f11, 1*SIZE(Y) ++ ldi Y, 2*SIZE(Y) ++#endif ++ subl $5, 1, $5 ++ bgt $5, $RemainLoop ++ .align 4 ++$End: ++ ret ++ .align 4 ++ ++$UnAlign_X_ACCESS: ++ and Y, (VEC_LEN*SIZE-1), $7 ++ nop ++ nop ++ bgt $7, $UnAlign_XY_ACCESS ++ .align 4 ++ ++ VLD_UL $f10, 0*VEC_LEN*SIZE(X) ++ VLD_UH $f14, 1*VEC_LEN*SIZE(X) ++ ++ VLD_UL $f11, 1*VEC_LEN*SIZE(X) ++ VLD_UH $f15, 2*VEC_LEN*SIZE(X) ++ ++ VLD_UL $f12, 2*VEC_LEN*SIZE(X) ++ VLD_UH $f16, 3*VEC_LEN*SIZE(X) ++ ++ ++ VLD_UL $f13, 3*VEC_LEN*SIZE(X) ++ VLD_UH $f17, 4*VEC_LEN*SIZE(X) ++ ++ subl $4, 1, $4 ++ vbisw $f10, $f14, $f10 ++ ldi X, 16*SIZE(X) ++ vbisw $f11, $f15, $f11 ++ ++ vbisw $f12, $f16, $f12 ++ vbisw $f13, $f17, $f13 ++ nop ++ ble $4, $UnAlign_X_MainLoopEnd ++ .align 4 ++ ++$UnAlign_X_MainLoop: ++ fillcs PREFETCHSIZE * SIZE(X) ++ fillcs PREFETCHSIZE * SIZE(Y) ++ ++ VST $f10, 0*VEC_LEN*SIZE(Y) ++ VST $f11, 1*VEC_LEN*SIZE(Y) ++ VST $f12, 2*VEC_LEN*SIZE(Y) ++ VST $f13, 3*VEC_LEN*SIZE(Y) ++ ++ VLD_UL $f10, 0*VEC_LEN*SIZE(X) ++ VLD_UH $f14, 1*VEC_LEN*SIZE(X) ++ VLD_UL $f11, 1*VEC_LEN*SIZE(X) ++ VLD_UH $f15, 2*VEC_LEN*SIZE(X) ++ ++ VLD_UL $f12, 2*VEC_LEN*SIZE(X) ++ VLD_UH $f16, 3*VEC_LEN*SIZE(X) ++ VLD_UL $f13, 3*VEC_LEN*SIZE(X) ++ VLD_UH $f17, 4*VEC_LEN*SIZE(X) ++ ++ subl $4, 1, $4 ++ vbisw $f10, $f14, $f10 ++ ldi Y, 16*SIZE(Y) ++ vbisw $f11, $f15, $f11 ++ ++ vbisw $f12, $f16, $f12 ++ ldi X, 16*SIZE(X) ++ vbisw $f13, $f17, $f13 ++ bgt $4, $UnAlign_X_MainLoop ++ .align 4 ++ ++$UnAlign_X_MainLoopEnd: ++ ++ VST $f10, 0*VEC_LEN*SIZE(Y) ++ VST $f11, 1*VEC_LEN*SIZE(Y) ++ VST $f12, 2*VEC_LEN*SIZE(Y) ++ VST $f13, 3*VEC_LEN*SIZE(Y) ++ ++ ldi Y, 16*SIZE(Y) ++ ble $5, $End ++ jmp $RemainLoop ++ ++ .align 4 ++ ++$UnAlign_Y_ACCESS: ++ and X, (VEC_LEN*SIZE-1), $7 ++ nop ++ nop ++ bgt $7, $UnAlign_XY_ACCESS ++ .align 4 ++ ++ VLD $f10, 0*VEC_LEN*SIZE(X) ++ VLD $f11, 1*VEC_LEN*SIZE(X) ++ VLD $f12, 2*VEC_LEN*SIZE(X) ++ VLD $f13, 3*VEC_LEN*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi X, 16*SIZE(X) ++ ble $4, $UnAlign_Y_MainLoopEnd ++ .align 4 ++ ++$UnAlign_Y_MainLoop: ++ fillcs PREFETCHSIZE * SIZE(X) ++ fillcs PREFETCHSIZE * SIZE(Y) ++ ++ VST_UL $f10, 0*VEC_LEN*SIZE(Y) ++ VST_UH $f10, 1*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f11, 1*VEC_LEN*SIZE(Y) ++ VST_UH $f11, 2*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f12, 2*VEC_LEN*SIZE(Y) ++ VST_UH $f12, 3*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f13, 3*VEC_LEN*SIZE(Y) ++ VST_UH $f13, 4*VEC_LEN*SIZE(Y) ++ ++ VLD $f10, 0*VEC_LEN*SIZE(X) ++ VLD $f11, 1*VEC_LEN*SIZE(X) ++ VLD $f12, 2*VEC_LEN*SIZE(X) ++ VLD $f13, 3*VEC_LEN*SIZE(X) ++ ++ subl $4, 1, $4 ++ ldi Y, 16*SIZE(Y) ++ ldi X, 16*SIZE(X) ++ bgt $4, $UnAlign_Y_MainLoop ++ .align 4 ++ ++$UnAlign_Y_MainLoopEnd: ++ ++ VST_UL $f10, 0*VEC_LEN*SIZE(Y) ++ VST_UH $f10, 1*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f11, 1*VEC_LEN*SIZE(Y) ++ VST_UH $f11, 2*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f12, 2*VEC_LEN*SIZE(Y) ++ VST_UH $f12, 3*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f13, 3*VEC_LEN*SIZE(Y) ++ VST_UH $f13, 4*VEC_LEN*SIZE(Y) ++ ++ ldi Y, 16*SIZE(Y) ++ ble $5, $End ++ jmp $RemainLoop ++ ++ .align 4 ++ ++$UnAlign_XY_ACCESS: ++ ++ VLD_UL $f10, 0*VEC_LEN*SIZE(X) ++ VLD_UH $f14, 1*VEC_LEN*SIZE(X) ++ ++ VLD_UL $f11, 1*VEC_LEN*SIZE(X) ++ VLD_UH $f15, 2*VEC_LEN*SIZE(X) ++ ++ VLD_UL $f12, 2*VEC_LEN*SIZE(X) ++ VLD_UH $f16, 3*VEC_LEN*SIZE(X) ++ ++ ++ VLD_UL $f13, 3*VEC_LEN*SIZE(X) ++ VLD_UH $f17, 4*VEC_LEN*SIZE(X) ++ ++ subl $4, 1, $4 ++ vbisw $f10, $f14, $f10 ++ ldi X, 16*SIZE(X) ++ vbisw $f11, $f15, $f11 ++ ++ vbisw $f12, $f16, $f12 ++ vbisw $f13, $f17, $f13 ++ nop ++ ble $4, 
$UnAlign_XY_MainLoopEnd ++ .align 4 ++ ++$UnAlign_XY_MainLoop: ++ fillcs PREFETCHSIZE * SIZE(X) ++ fillcs PREFETCHSIZE * SIZE(Y) ++ ++ VST_UL $f10, 0*VEC_LEN*SIZE(Y) ++ VST_UH $f10, 1*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f11, 1*VEC_LEN*SIZE(Y) ++ VST_UH $f11, 2*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f12, 2*VEC_LEN*SIZE(Y) ++ VST_UH $f12, 3*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f13, 3*VEC_LEN*SIZE(Y) ++ VST_UH $f13, 4*VEC_LEN*SIZE(Y) ++ ++ ++ VLD_UL $f10, 0*VEC_LEN*SIZE(X) ++ VLD_UH $f14, 1*VEC_LEN*SIZE(X) ++ VLD_UL $f11, 1*VEC_LEN*SIZE(X) ++ VLD_UH $f15, 2*VEC_LEN*SIZE(X) ++ ++ VLD_UL $f12, 2*VEC_LEN*SIZE(X) ++ VLD_UH $f16, 3*VEC_LEN*SIZE(X) ++ VLD_UL $f13, 3*VEC_LEN*SIZE(X) ++ VLD_UH $f17, 4*VEC_LEN*SIZE(X) ++ ++ subl $4, 1, $4 ++ vbisw $f10, $f14, $f10 ++ ldi Y, 16*SIZE(Y) ++ vbisw $f11, $f15, $f11 ++ ++ vbisw $f12, $f16, $f12 ++ ldi X, 16*SIZE(X) ++ vbisw $f13, $f17, $f13 ++ bgt $4, $UnAlign_XY_MainLoop ++ .align 4 ++ ++$UnAlign_XY_MainLoopEnd: ++ ++ VST_UL $f10, 0*VEC_LEN*SIZE(Y) ++ VST_UH $f10, 1*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f11, 1*VEC_LEN*SIZE(Y) ++ VST_UH $f11, 2*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f12, 2*VEC_LEN*SIZE(Y) ++ VST_UH $f12, 3*VEC_LEN*SIZE(Y) ++ ++ VST_UL $f13, 3*VEC_LEN*SIZE(Y) ++ VST_UH $f13, 4*VEC_LEN*SIZE(Y) ++ ++ ldi Y, 16*SIZE(Y) ++ ble $5, $End ++ jmp $RemainLoop ++ ++ .align 4 ++ ++$Sub: ++#ifdef COMPLEX ++ addl INCX, INCX, INCX ++ addl INCY, INCY, INCY ++ and N, 7, $5 ++#else ++ and N, 15, $5 ++#endif ++ ble $4, $SubRemain ++ .align 4 ++ ++$SubMainLoop: ++#ifndef COMPLEX ++ LD $f10, 0(X) ++ SXADDQ INCX, X, X ++ LD $f11, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f12, 0(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f14, 0(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f16, 0(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f18, 0(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f20, 0(X) ++ SXADDQ INCX, X, X ++ LD $f21, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f22, 0(X) ++ SXADDQ INCX, X, X ++ LD $f23, 0(X) ++ SXADDQ INCX, X, X ++ ++ LD $f24, 0(X) ++ SXADDQ INCX, X, X ++ LD $f25, 0(X) ++ SXADDQ INCX, X, X ++ ++ ST $f10, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f11, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f12, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f13, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f14, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f15, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f16, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f17, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f18, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f19, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f20, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f21, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f22, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f23, 0(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f24, 0(Y) ++ SXADDQ INCY, Y, Y ++ ST $f25, 0(Y) ++ SXADDQ INCY, Y, Y ++#else ++ LD $f10, 0(X) ++ LD $f11, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f12, 0(X) ++ LD $f13, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f14, 0(X) ++ LD $f15, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f16, 0(X) ++ LD $f17, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f18, 0(X) ++ LD $f19, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f20, 0(X) ++ LD $f21, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f22, 0(X) ++ LD $f23, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD $f24, 0(X) ++ LD $f25, SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST $f10, 0(Y) ++ ST $f11, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f12, 0(Y) ++ ST $f13, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f14, 0(Y) ++ ST $f15, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f16, 0(Y) ++ ST $f17, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f18, 0(Y) ++ ST $f19, SIZE(Y) ++ SXADDQ INCY, Y, Y 
++ ++ ST $f20, 0(Y) ++ ST $f21, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f22, 0(Y) ++ ST $f23, SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ST $f24, 0(Y) ++ ST $f25, SIZE(Y) ++ SXADDQ INCY, Y, Y ++#endif ++ subl $4, 1, $4 ++ bgt $4, $SubMainLoop ++ .align 4 ++ ++$SubRemain: ++ ble $5, $SubEnd ++ .align 4 ++ ++ $SubRemainLoop: ++#ifndef COMPLEX ++ LD $f10, 0(X) ++ SXADDQ INCX, X, X ++ ST $f10, 0(Y) ++ SXADDQ INCY, Y, Y ++#else ++ LD $f10, 0(X) ++ LD $f11, SIZE(X) ++ SXADDQ INCX, X, X ++ ST $f10, 0(Y) ++ ST $f11, SIZE(Y) ++ SXADDQ INCY, Y, Y ++#endif ++ subl $5, 1, $5 ++ bgt $5, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/cscal.S b/kernel/sw_64/cscal.S +new file mode 100644 +index 0000000..bba3137 +--- /dev/null ++++ b/kernel/sw_64/cscal.S +@@ -0,0 +1,217 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++ .set noat ++ .set noreorder ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++ .globl NAME ++ .ent NAME ++ ++NAME: ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ lda $28, _mcount ++ jsr $28, ($28), _mcount ++#endif ++ ++#ifndef C_INTERFACE ++ ldl $16, 0($16) # n ++ mov $18, $20 # Store Address ++ ldl $19, 0($19) # incx ++ nop ++ ++ LD $f1, 0($17) # alpha ++#else ++ mov $18, $20 # Store Address ++ fmov $f17, $f1 # alpha ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ sra $16, 1, $21 # 4-unrolling ++ ble $16, $End ++ ++ lda $23, -1($19) ++ ble $19, $End ++ ++ bgt $23, $INC_NOT_1 ++ .align 4 ++ ++ ble $21, $Sub ++ lda $21, -1($21) ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ ++ LD $f12, 2*SIZE($18) ++ LD $f13, 3*SIZE($18) ++ lda $18, 4*SIZE($18) ++ ble $21, $MainRemain ++ .align 4 ++ ++$MainLoop: ++ MUL $f10, $f1, $f20 ++ LD $f10, 0*SIZE($18) ++ MUL $f11, $f1, $f21 ++ LD $f11, 1*SIZE($18) ++ ++ MUL $f12, $f1, $f22 ++ LD $f12, 2*SIZE($18) ++ MUL $f13, $f1, $f23 ++ LD $f13, 3*SIZE($18) ++ ++ lda $18, 4*SIZE($18) ++ lda $21, -1($21) ++ ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ ST $f22, 2*SIZE($20) ++ ST $f23, 3*SIZE($20) ++ lda $20, 4*SIZE($20) ++ ++ bgt $21, $MainLoop ++ .align 4 ++ ++$MainRemain: ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ MUL $f12, $f1, $f22 ++ MUL $f13, $f1, $f23 ++ ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ ST $f22, 2*SIZE($20) ++ ST $f23, 3*SIZE($20) ++ lda $20, 4*SIZE($20) ++ .align 4 ++ ++$Sub: ++ blbc $16, $End ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ .align 4 ++ ++$End: ++ ret ++ .align 4 ++ ++$INC_NOT_1: ++ addl $19, $19, $19 ++ ble $21, $INC_Sub ++ lda $21, -1($21) ++ ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f12, 0*SIZE($18) ++ LD $f13, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ble $21, $INC_MainRemain ++ .align 4 ++ ++$INC_MainLoop: ++ MUL $f10, $f1, $f20 ++ LD $f10, 0*SIZE($18) ++ MUL $f11, $f1, $f21 ++ LD $f11, 1*SIZE($18) ++ ++ SXADDQ $19, $18, $18 ++ ++ MUL $f12, $f1, $f22 ++ LD $f12, 0*SIZE($18) ++ MUL $f13, $f1, $f23 ++ LD $f13, 1*SIZE($18) ++ ++ SXADDQ $19, $18, $18 ++ ++ ST $f20, 0*SIZE($20) ++ lda $21, -1($21) ++ ST $f21, 1*SIZE($20) ++ SXADDQ $19, $20, $20 ++ ++ ST $f22, 0*SIZE($20) ++ ST $f23, 1*SIZE($20) ++ SXADDQ $19, $20, $20 ++ unop ++ bgt $21, $INC_MainLoop ++ .align 4 ++ ++$INC_MainRemain: ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ MUL $f12, $f1, $f22 ++ MUL $f13, $f1, $f23 ++ ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ SXADDQ $19, $20, $20 ++ ++ ST $f22, 0*SIZE($20) ++ ST $f23, 1*SIZE($20) ++ SXADDQ $19, $20, $20 ++ .align 4 ++ ++$INC_Sub: ++ blbc $16, $INC_End ++ ++ LD $f10, 0*SIZE($18) ++ LD $f11, 1*SIZE($18) ++ MUL $f10, $f1, $f20 ++ MUL $f11, $f1, $f21 ++ ++ ST $f20, 0*SIZE($20) ++ ST $f21, 1*SIZE($20) ++ .align 4 ++ ++$INC_End: ++ ret ++ .end NAME ++ .ident VERSION +diff --git a/kernel/sw_64/dnrm2.S b/kernel/sw_64/dnrm2.S +new file mode 100644 +index 0000000..89cf787 +--- /dev/null ++++ b/kernel/sw_64/dnrm2.S +@@ -0,0 +1,490 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. 
*/ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stl $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ SXADDQ INCX, 0, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 4, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0,$f24 ++ fmov $f24,a0 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1,$f24 ++ fmov $f24,a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ #unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ #unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ #unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, $f24 ++ fmov 
$f24,a1 ++ #unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ #unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ #unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ #unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ #unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ #unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ #unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0,$f24 ++ fmov $f24,a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ #unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ #unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ #unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ #unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1,$f24 ++ fmov $f24,a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2,$f24 ++ fmov $f24,a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L25 ++ ++ fclr t2 ++ fclr t3 ++ ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x3, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x5, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x6, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0,$f24 ++ fmov $f24,a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ LD x1, 0 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ addl X, INCX, X ++ ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ LD x3, 0 * SIZE(X) ++ 
fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ LD x5, 0 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ unop ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, $f24 ++ fmov $f24,a1 ++ faddd a2, t2, $f24 ++ fmov $f24,a2 ++ faddd a3, t3, $f24 ++ fmov $f24,a3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0,$f24 ++ fmov $f24,a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, $f24 ++ fmov $f24,a0 ++ ++ faddd a0, a1, $f24 ++ fmov $f24,a1 ++ faddd a2, a3, $f24 ++ fmov $f24,a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, $f24 ++ fsqrtd $f24, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/dnrm2.S.bak b/kernel/sw_64/dnrm2.S.bak +new file mode 100644 +index 0000000..753c90b +--- /dev/null ++++ b/kernel/sw_64/dnrm2.S.bak +@@ -0,0 +1,431 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ SXADDQ INCX, 0, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 4, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, a0 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld 
x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, a1 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L25 ++ ++ fclr t2 ++ fclr t3 ++ ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x3, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x5, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x6, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, a2 ++ LD x1, 0 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ LD x3, 0 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, a2 ++ LD x5, 0 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, a1 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, a0 ++ ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/dot.S b/kernel/sw_64/dot.S +new file mode 100644 +index 0000000..513eada +--- /dev/null ++++ b/kernel/sw_64/dot.S +@@ -0,0 +1,607 @@ 
++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++ ++#define I $5 ++ ++#define s0 $f0 ++#define s1 $f30 ++#define s2 $f1 ++#define s3 $f2 ++ ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 ++ ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 ++ ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldi $sp, -16($sp) ++ fclr s0 ++ fstd $f2, 0($sp) ++#ifndef ZYX20220111 ++ fstd $f3, 8($sp) ++#endif ++ fclr s1 ++ ++ fclr s2 ++ nop ++ fclr s3 ++ ble N, $L999 ++ ++ fclr t0 ++ cmpeq INCX, 1, $21 ++ fclr t1 ++ cmpeq INCY, 1, $22 ++ fclr t2 ++ and $21, $22, $22 ++ fclr t3 ++ beq $22, $L20 ++ ++#ifndef DOUBLE ++ srl N, 4, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 16 * SIZE, X ++ subl I, 1, I ++ ++ addl Y, 16 * SIZE, Y ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ fillcs PREFETCHSIZE * 2 * SIZE(X) ++ subl I, 1, I ++ fillcs PREFETCHSIZE * 2 * SIZE(Y) ++ addl X, 16 * SIZE, X ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a0, -24 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -23 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, $f3 ++ fmov $f3, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a2, -22 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -21 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a4, -20 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -19 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a6, -18 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -17 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a0, -16 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -15 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a2, -14 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -13 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a4, -12 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -11 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a6, -10 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -9 * SIZE(X) ++ ++ addl Y, 16 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 ++ ++$L13: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6,-10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a0, -8 * 
SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a1, b1, t1 ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L15: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ and N, 15, I ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ ble I, $L18 ++ .align 4 ++ ++#else ++ ++ srl N, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 8 * SIZE, X ++ subl I, 1, I ++ ++ addl Y, 8 * SIZE, Y ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ fillcs PREFETCHSIZE * SIZE(X) ++ subl I, 1, I ++ fillcs PREFETCHSIZE * SIZE(Y) ++ addl X, 8 * SIZE, X ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a0, -8 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) ++ ++ addl Y, 8 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 ++ ++$L13: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a1, b1, t1 ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L15: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ and N, 7, I ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ ble I, $L18 ++ .align 4 ++ ++#endif ++ ++$L16: ++ LD a0, 0 * SIZE(X) ++ addl X, SIZE, X ++ LD b0, 0 * SIZE(Y) ++ addl Y, SIZE, Y ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L16 ++ .align 4 ++ 
++$L18: ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ br $L999 ++ .align 4 ++ ++$L20: ++ srl N, 2, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b2, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) ++ subl I, 1, I ++ ++ SXADDQ INCY, Y, Y ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a3, b3, t3 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b2, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ subl I, 1, I ++ bgt I, $L22 ++ nop ++ fnop ++ .align 4 ++ ++$L23: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a3, b3, t3 ++ .align 4 ++ ++$L25: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ and N, 3, I ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ ble I, $L28 ++ .align 4 ++ ++$L26: ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L26 ++ .align 4 ++ ++$L28: ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ .align 4 ++ ++$L999: ++ ADD s2, s3, $f3 ++ fmov $f3, s2 ++ fldd $f2, 0($sp) ++ ADD s0, s1, $f3 ++ fmov $f3, s0 ++ ADD s0, s2, $f3 ++ fmov $f3, s0 ++#ifndef ZYX20220111 ++ fldd $f3, 8($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/dot.S.bak b/kernel/sw_64/dot.S.bak +new file mode 100644 +index 0000000..cd96e21 +--- /dev/null ++++ b/kernel/sw_64/dot.S.bak +@@ -0,0 +1,602 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++ ++#define I $5 ++ ++#define s0 $f0 ++#define s1 $f30 ++#define s2 $f1 ++#define s3 $f2 ++ ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 ++ ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 ++ ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldi $sp, -16($sp) ++ fclr s0 ++ fstd $f2, 0($sp) ++ fclr s1 ++ ++ fclr s2 ++ nop ++ fclr s3 ++ ble N, $L999 ++ ++ fclr t0 ++ cmpeq INCX, 1, $21 ++ fclr t1 ++ cmpeq INCY, 1, $22 ++ fclr t2 ++ and $21, $22, $22 ++ fclr t3 ++ beq $22, $L20 ++ ++#ifndef DOUBLE ++ srl N, 4, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 16 * SIZE, X ++ subl I, 1, I ++ ++ addl Y, 16 * SIZE, Y ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ fillcs PREFETCHSIZE * 2 * SIZE(X) ++ subl I, 1, I ++ fillcs PREFETCHSIZE * 2 * SIZE(Y) ++ addl X, 16 * SIZE, X ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a0, -24 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -23 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, $f3 ++ fmov $f3, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a2, -22 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -21 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a4, -20 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -19 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a6, -18 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -17 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a0, -16 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -15 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov 
$f3, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a2, -14 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -13 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a4, -12 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -11 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a6, -10 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -9 * SIZE(X) ++ ++ addl Y, 16 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 ++ ++$L13: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6,-10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a0, -8 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a1, b1, t1 ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L15: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ and N, 15, I ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ ble I, $L18 ++ .align 4 ++ ++#else ++ ++ srl N, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 8 * SIZE, X ++ subl I, 1, I ++ ++ addl Y, 8 * SIZE, Y ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ fillcs PREFETCHSIZE * SIZE(X) ++ subl I, 1, I ++ fillcs PREFETCHSIZE * SIZE(Y) ++ addl X, 8 * SIZE, X ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a0, -8 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) ++ ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) ++ ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) ++ ++ addl Y, 8 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 ++ ++$L13: ++ 
ADD s0, t0, $f3 ++ fmov $f3, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a1, b1, t1 ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L15: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ and N, 7, I ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ ble I, $L18 ++ .align 4 ++ ++#endif ++ ++$L16: ++ LD a0, 0 * SIZE(X) ++ addl X, SIZE, X ++ LD b0, 0 * SIZE(Y) ++ addl Y, SIZE, Y ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L16 ++ .align 4 ++ ++$L18: ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ br $L999 ++ .align 4 ++ ++$L20: ++ srl N, 2, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b2, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) ++ subl I, 1, I ++ ++ SXADDQ INCY, Y, Y ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a3, b3, t3 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b2, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ subl I, 1, I ++ bgt I, $L22 ++ nop ++ fnop ++ .align 4 ++ ++$L23: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ MUL a3, b3, t3 ++ .align 4 ++ ++$L25: ++ ADD s0, t0, $f3 ++ fmov $f3, s0 ++ and N, 3, I ++ ADD s1, t1, $f3 ++ fmov $f3, s1 ++ ble I, $L28 ++ .align 4 ++ ++$L26: ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L26 ++ .align 4 ++ ++$L28: ++ ADD s2, t2, $f3 ++ fmov $f3, s2 ++ ADD s3, t3, $f3 ++ fmov $f3, s3 ++ .align 4 ++ ++$L999: ++ ADD s2, s3, $f3 ++ fmov $f3, s2 ++ fldd $f2, 0($sp) ++ ADD s0, s1, $f3 ++ fmov $f3, s0 ++ ldi $sp, 16($sp) ++ ++ ADD s0, s2, $f3 ++ fmov $f3, s0 ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/dot_simd.S b/kernel/sw_64/dot_simd.S +new file mode 100644 +index 0000000..3e2288d +--- /dev/null ++++ b/kernel/sw_64/dot_simd.S +@@ -0,0 +1,634 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++ ++#define I $5 ++ ++#define s0 $f0 ++#define s1 $f30 ++#define s2 $f1 ++#define s3 $f2 ++ ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 ++ ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 ++ ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldi $sp, -16($sp) ++ fclr s0 ++ fstd $f2, 0($sp) ++ fclr s1 ++ ++ fclr s2 ++ nop ++ fclr s3 ++ ble N, $L999 ++ ++ fclr t0 ++ cmpeq INCX, 1, $21 ++ fclr t1 ++ cmpeq INCY, 1, $22 ++ fclr t2 ++ and $21, $22, $22 ++ fclr t3 ++ beq $22, $L20 ++ ++ ++/* ++ test the address of Y & X ++*/ ++ and Y, (VEC_LEN*SIZE-1), $4 ++ and X, (VEC_LEN*SIZE-1), $3 ++ or $3, $4, $4 ++ bne $4, $UnAlign_ACCESS ++ ++/*Align Accessing*/ ++ sra N, 4, I ++ ble I, $Remain ++ ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, s0 #clear s0 vector ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, s1 ++ ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, s2 ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, s3 ++ ++ VLD b0, 0*VEC_LEN*SIZE(Y) ++ VLD b1, 1*VEC_LEN*SIZE(Y) ++ VLD b2, 2*VEC_LEN*SIZE(Y) ++ VLD b3, 3*VEC_LEN*SIZE(Y) ++ ++ addl X, 16 * SIZE, X ++ addl Y, 16 * SIZE, Y ++ subl I, 1, I ++ ble I, $MainLoopEnd ++$MainLoop: ++ VMAD a0, b0, s0, s0 ++ fillcs PREFETCHSIZE * SIZE(X) ++ VMAD a1, b1, s1, s1 ++ fillcs PREFETCHSIZE * SIZE(Y) ++ ++ subl I, 1, I ++ VMAD a2, b2, s2, s2 ++ addl X, 16 * SIZE, X ++ VMAD a3, b3, s3, s3 ++ ++ VLD a0, -4*VEC_LEN*SIZE(X) ++ VLD a1, -3*VEC_LEN*SIZE(X) ++ VLD a2, -2*VEC_LEN*SIZE(X) ++ VLD a3, -1*VEC_LEN*SIZE(X) ++ ++ VLD b0, 0*VEC_LEN*SIZE(Y) ++ VLD b1, 1*VEC_LEN*SIZE(Y) ++ VLD b2, 2*VEC_LEN*SIZE(Y) ++ VLD b3, 3*VEC_LEN*SIZE(Y) ++ ++ ++ addl Y, 16 * SIZE, Y ++ bgt I, 
$MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ VMAD a0, b0, s0, s0 ++ VMAD a1, b1, s1, s1 ++ VMAD a2, b2, s2, s2 ++ VMAD a3, b3, s3, s3 ++ ++ VADD s0, s1, t0 ++ VADD s2, s3, t1 ++ nop ++ VADD t0, t1, s0 ++ ++ vextf s0, 1, s1 ++ vextf s0, 2, s2 ++ vextf s0, 3, s3 ++ nop ++ ++ ADD s0, s1, t2 ++ ADD s2, s3, t3 ++ nop ++ ADD t2, t3, s0 ++ ++ .align 4 ++$Remain: ++ and N, 15, I ++ ble I, $End ++ .align 4 ++$Remain_Loop: ++ LD a0, 0 * SIZE(X) ++ addl X, SIZE, X ++ LD b0, 0 * SIZE(Y) ++ addl Y, SIZE, Y ++ ++ MAD a0, b0, s0, s0 ++ subl I, 1, I ++ bgt I, $Remain_Loop ++ .align 4 ++$End: ++ ++ fldd $f2, 0($sp) ++ ldi $sp, 16($sp) ++ ret ++ .align 4 ++ ++/*UnAlign Accessing*/ ++$UnAlign_ACCESS: ++ ++#ifndef DOUBLE ++ srl N, 4, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 16 * SIZE, X ++ subl I, 1, I ++ ++ addl Y, 16 * SIZE, Y ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ fillcs PREFETCHSIZE * 2 * SIZE(X) ++ subl I, 1, I ++ fillcs PREFETCHSIZE * 2 * SIZE(Y) ++ addl X, 16 * SIZE, X ++ ++ ADD s0, t0, s0 ++ LD b6, -10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -24 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -23 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -22 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -21 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -20 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -19 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -18 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -17 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -16 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -15 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -14 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -13 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -12 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -11 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -10 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -9 * SIZE(X) ++ ++ addl Y, 16 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD b6,-10 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -9 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -8 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, -8 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, -7 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, -6 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, -5 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, -4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, -3 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ++ ADD s2, t2, s2 ++ MUL 
a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L15: ++ ADD s0, t0, s0 ++ and N, 15, I ++ ADD s1, t1, s1 ++ ble I, $L18 ++ .align 4 ++ ++#else ++ ++ srl N, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ LD b2, 2 * SIZE(Y) ++ LD b3, 3 * SIZE(Y) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ LD b4, 4 * SIZE(Y) ++ LD b5, 5 * SIZE(Y) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ addl X, 8 * SIZE, X ++ subl I, 1, I ++ ++ addl Y, 8 * SIZE, Y ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ fillcs PREFETCHSIZE * SIZE(X) ++ subl I, 1, I ++ fillcs PREFETCHSIZE * SIZE(Y) ++ addl X, 8 * SIZE, X ++ ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a0, -8 * SIZE(X) ++ MUL a1, b1, t1 ++ LD a1, -7 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t2 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a2, -6 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, -5 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 2 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 3 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ LD a4, -4 * SIZE(X) ++ MUL a5, b5, t1 ++ LD a5, -3 * SIZE(X) ++ ++ ADD s2, t2, s2 ++ LD b4, 4 * SIZE(Y) ++ MUL a6, b6, t2 ++ LD b5, 5 * SIZE(Y) ++ ++ ADD s3, t3, s3 ++ LD a6, -2 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, -1 * SIZE(X) ++ ++ addl Y, 8 * SIZE, Y ++ bgt I, $L12 ++ nop ++ fnop ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD b6, -2 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, -1 * SIZE(Y) ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a5, b5, t1 ++ ADD s2, t2, s2 ++ MUL a6, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L15: ++ ADD s0, t0, s0 ++ and N, 7, I ++ ADD s1, t1, s1 ++ ble I, $L18 ++ .align 4 ++ ++#endif ++ ++$L16: ++ LD a0, 0 * SIZE(X) ++ addl X, SIZE, X ++ LD b0, 0 * SIZE(Y) ++ addl Y, SIZE, Y ++ ++ ADD s2, t2, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L16 ++ .align 4 ++ ++$L18: ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ br $L999 ++ .align 4 ++ ++$L20: ++ srl N, 2, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b2, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) ++ subl I, 1, I ++ ++ SXADDQ INCY, Y, Y ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b1, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b2, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b3, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ subl I, 1, I ++ bgt I, $L22 ++ nop ++ fnop ++ .align 4 ++ ++$L23: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a1, b1, t1 ++ ADD s2, t2, s2 ++ MUL a2, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ .align 4 ++ ++$L25: ++ ADD s0, t0, s0 ++ and N, 3, I ++ ADD s1, t1, s1 ++ 
ble I, $L28 ++ .align 4 ++ ++$L26: ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD b0, 0 * SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ MUL a0, b0, t2 ++ subl I, 1, I ++ bgt I, $L26 ++ .align 4 ++ ++$L28: ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ .align 4 ++ ++$L999: ++ ADD s2, s3, s2 ++ fldd $f2, 0($sp) ++ ADD s0, s1, s0 ++ ldi $sp, 16($sp) ++ ++ ADD s0, s2, s0 ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/gemm_beta.S b/kernel/sw_64/gemm_beta.S +new file mode 100644 +index 0000000..d9ea890 +--- /dev/null ++++ b/kernel/sw_64/gemm_beta.S +@@ -0,0 +1,179 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++CNAME: ++ .frame $sp, 0, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++#endif ++ ++ ldl $18, 16($sp) ++ ble $16, $End ++ ldl $19, 24($sp) ++ ble $17, $End ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO) ++ .align 4 ++ ++$BETA_NE_ZERO: ++ sra $16, 3, $2 # i = (m >> 3) ++ mov $18, $1 # c_offset = c ++ ldi $17, -1($17) # j -- ++ ble $2,$L52 ++ .align 4 ++ ++$L51: ++ fillcs 64($1) ++ ldi $2, -1($2) ++ ++ LD $f14, 0*SIZE($1) ++ LD $f15, 1*SIZE($1) ++ LD $f16, 2*SIZE($1) ++ LD $f17, 3*SIZE($1) ++ LD $f18, 4*SIZE($1) ++ LD $f11, 5*SIZE($1) ++ LD $f21, 6*SIZE($1) ++ LD $f22, 7*SIZE($1) ++ ++ MUL $f19, $f14, $f23 ++ MUL $f19, $f15, $f24 ++ MUL $f19, $f16, $f25 ++ MUL $f19, $f17, $f26 ++ MUL $f19, $f18, $f27 ++ MUL $f19, $f11, $f28 ++ MUL $f19, $f21, $f29 ++ MUL $f19, $f22, $f30 ++ ++ ST $f23, 0*SIZE($1) ++ ST $f24, 1*SIZE($1) ++ ST $f25, 2*SIZE($1) ++ ST $f26, 3*SIZE($1) ++ ST $f27, 4*SIZE($1) ++ ST $f28, 5*SIZE($1) ++ ST $f29, 6*SIZE($1) ++ ST $f30, 7*SIZE($1) ++ ++ ldi $1,8*SIZE($1) ++ bgt $2,$L51 ++ .align 4 ++ ++$L52: ++ and $16, 7, $2 ++ ble $2,$L54 ++ .align 4 ++ ++$L53: ++ LD $f12, 0($1) ++ ldi $2, -1($2) ++ MUL $f19, $f12, $f23 ++ ST $f23, 0($1) ++ ldi $1, SIZE($1) ++ bgt $2,$L53 ++ .align 4 ++ ++$L54: ++ SXADDQ $19, $18, $18 # c += ldc ++ bgt $17,$BETA_NE_ZERO ++ clr $0 ++ ret ++ .align 4 ++ ++$BETA_EQ_ZERO: ++ sra $16, 3, $2 # i = (m >> 3) ++ ldi $4, 8*SIZE($18) ++ mov $18, $1 # c_offset = c ++ ldi $17, -1($17) # j -- ++ ble $2,$L42 ++ .align 4 ++ ++$L41: ++ ST $f31, 0*SIZE($1) ++ ST $f31, 1*SIZE($1) ++ ST $f31, 2*SIZE($1) ++ ST $f31, 3*SIZE($1) ++ ST $f31, 4*SIZE($1) ++ ST $f31, 5*SIZE($1) ++ ST $f31, 6*SIZE($1) ++ ST $f31, 7*SIZE($1) ++ ldi $2, -1($2) ++ ++ ldi $4, 8*SIZE($4) ++ ldi $1, 8*SIZE($1) ++ bgt $2,$L41 ++ .align 4 ++ ++$L42: ++ and $16, 7, $2 ++ ble $2,$L44 ++ .align 4 ++ ++$L43: ++ ldi $2, -1($2) ++ ST $f31, 0($1) ++ ldi $1, SIZE($1) ++ bgt $2, $L43 ++ .align 4 ++ ++$L44: ++ SXADDQ $19, $18, $18 # c += ldc ++ bgt $17,$BETA_EQ_ZERO ++ clr $0 ++ .align 4 ++ ++$End: ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/gemm_kernel_4x4.S b/kernel/sw_64/gemm_kernel_4x4.S +new file mode 100644 +index 0000000..dd17554 +--- /dev/null ++++ b/kernel/sw_64/gemm_kernel_4x4.S +@@ -0,0 +1,3244 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 56 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++#define STACKSIZE 96 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define BB $3 ++#define OFFSET $4 ++ ++#define tmp $9 ++ ++#define ALPHA 64($sp) ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++#ifdef TRMMKERNEL ++ ldl OFFSET, 16 + STACKSIZE($sp) ++#endif ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ stl $9, 80($sp) ++ fstd $f19, ALPHA ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 ++ ++$L01: ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ s4addl K, 0, BB ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ addl C2, LDC, C3 ++ s4addl LDC, C, C ++ ++ SXADDQ BB, B, BB ++ fclr t1 ++ addl C3, 
LDC, C4 ++ fclr t2 ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(EV5) || defined(SW6A) ++ fillcs 0 * SIZE(BB) ++ fillcs 8 * SIZE(BB) ++ unop ++ ldi BB, 16 * SIZE(BB) ++#endif ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++#endif ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3,b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4,b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ FIMOVD b5, tmp ++/* 2 */ ++ ADD c01, t1,b5 ++ fmov b5, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2,b5 ++ fmov b5, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3,b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1,b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2,b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3,b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4,b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1,b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ ldi L, -2(L) ++ IFMOVD tmp, b5 ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL 
b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ fldd alpha, ALPHA ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L18 ++#else ++ blbs TMP1, $L18 ++#endif ++ .align 4 ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L18: ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ unop ++ MUL b1, a2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a1, t4 ++#ifndef TRMMKERNEL ++ LD b5, 1 * SIZE(C1) ++ FIMOVD b5, tmp ++#else ++ unop ++#endif ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL b1, a3, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL b1, a4, t2 ++#ifndef TRMMKERNEL ++ LD b1, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++#ifndef TRMMKERNEL ++ LD a1, 0 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++#ifndef TRMMKERNEL ++ LD a2, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c13, t4, b5 ++ 
fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++#ifndef TRMMKERNEL ++ LD b2, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ ldi I, -1(I) ++ MUL b3, a3, t1 ++ unop ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++#ifndef TRMMKERNEL ++ LD b3, 0 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++#ifndef TRMMKERNEL ++ LD a4, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++#ifndef TRMMKERNEL ++ LD a3, 2 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ unop ++ MUL alpha, c01, b5 ++ fmov b5, c01 ++#ifndef TRMMKERNEL ++ LD b4, 3 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ unop ++ MUL alpha, c02, b5 ++ fmov b5, c02 ++#ifndef TRMMKERNEL ++ LD t1, 1 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL alpha, c03, b5 ++ fmov b5, c03 ++#ifndef TRMMKERNEL ++ LD t2, 2 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL alpha, c04, b5 ++ fmov b5, c04 ++#ifndef TRMMKERNEL ++ LD t3, 3 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ MUL alpha, c05, b5 ++ fmov b5, c05 ++ unop ++#ifndef TRMMKERNEL ++ ADD c01, a5, b5 ++ fmov b5, c01 ++ LD t4, 1 * SIZE(C4) ++#else ++ unop ++ unop ++#endif ++ ++ MUL alpha, c06, b5 ++ fmov b5, c06 ++#ifndef TRMMKERNEL ++ unop ++ IFMOVD tmp, b5 ++ fstd b1, 88($sp) ++# FIMOVD b1, tmp ++ ADD c02, b5, b1 ++ fmov b1, c02 ++ fldd b1, 88($sp) ++# IFMOVD tmp, b1 ++ LD a5, 2 * SIZE(C4) ++#endif ++ ++ MUL alpha, c07, b5 ++ fmov b5, c07 ++#ifndef TRMMKERNEL ++ unop ++ ADD c03, a2, b5 ++ fmov b5, c03 ++ LD b5, 3 * SIZE(C4) ++ FIMOVD b5, tmp ++#endif ++ ++ MUL alpha, c08, b5 ++ fmov b5, c08 ++#ifndef TRMMKERNEL ++ unop ++ ADD c04, b2, b5 ++ fmov b5, c04 ++ unop ++#endif ++ ++ MUL alpha, c09, b5 ++ fmov b5, c09 ++ ST c01, 0 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c05, b1, b5 ++ fmov b5, c05 ++ unop ++#endif ++ ++ MUL alpha, c10, b5 ++ fmov b5, c10 ++ ST c02, 1 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c06, a4, b5 ++ fmov b5, c06 ++ unop ++#endif ++ ++ MUL alpha, c11, b5 ++ fmov b5, c11 ++ ST c03, 2 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c07, a3, b5 ++ fmov b5, c07 ++ unop ++#endif ++ ++ MUL alpha, c12, b5 ++ fmov b5, c12 ++ ST c04, 3 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c08, b4, b5 ++ fmov b5, c08 ++#else ++ unop ++#endif ++ ldi C1, 4 * SIZE(C1) ++ ++ MUL alpha, c13, b5 ++ fmov b5, c13 ++ ST c05, 0 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c09, a1, b5 ++ fmov b5, c09 ++ unop ++#endif ++ ++ MUL alpha, c14, b5 ++ fmov b5, c14 ++ ST c06, 1 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c10, t1, b5 ++ fmov b5, c10 ++ unop ++#endif ++ ++ MUL alpha, c15, b5 ++ fmov b5, c15 ++ ST c07, 2 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c11, t2, b5 ++ fmov b5, c11 ++ unop ++#endif ++ ++ MUL alpha, c16, b5 ++ fmov b5, c16 ++ ST c08, 3 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c12, t3, b5 ++ fmov b5, c12 ++#else ++ unop ++#endif ++ ldi C2, 4 * SIZE(C2) ++ ++#ifndef TRMMKERNEL ++ ADD c13, b3, b5 ++ fmov b5, c13 ++#else ++ unop ++#endif ++ ST c09, 0 * SIZE(C3) ++ fclr t1 ++ ldi C4, 4 * SIZE(C4) ++ ++#ifndef TRMMKERNEL ++ ADD c14, t4, b5 ++ fmov b5, c14 ++#else ++ unop ++#endif ++ ST c10, 1 * SIZE(C3) ++ fclr t2 ++ unop ++ ++#ifndef TRMMKERNEL ++ ADD c15, a5, b5 ++ fmov b5, c15 ++#else ++ unop ++#endif ++ ST c11, 2 * SIZE(C3) ++ fclr t3 ++ unop ++ ++#ifndef TRMMKERNEL ++ IFMOVD tmp, b5 ++# FIMOVD b1, tmp ++ fstd b1, 88($sp) ++ ADD c16, b5, b1 ++ fmov b1, c16 ++ 
fldd b1, 88($sp) ++# IFMOVD tmp, b1 ++#else ++ unop ++#endif ++ ST c12, 3 * SIZE(C3) ++ fclr t4 ++ ldi C3, 4 * SIZE(C3) ++ ++ ST c13, -4 * SIZE(C4) ++ ST c14, -3 * SIZE(C4) ++ ST c15, -2 * SIZE(C4) ++ ST c16, -1 * SIZE(C4) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble L, $L25 ++ ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ IFMOVD tmp, b5 ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ IFMOVD tmp, b5 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L28 ++#else ++ blbs TMP1, $L28 ++#endif ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ 
unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L28: ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD a4, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++#ifndef TRMMKERNEL ++ LD b5, 1 * SIZE(C2) ++ FIMOVD b5, tmp ++#else ++ unop ++#endif ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++#ifndef TRMMKERNEL ++ LD b1, 0 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++#ifndef TRMMKERNEL ++ LD b2, 1 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b4, t4 ++#ifndef TRMMKERNEL ++ LD b3, 0 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL alpha, c01, b5 ++ fmov b5, c01 ++#ifndef TRMMKERNEL ++ LD b4, 1 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL alpha, c02, b5 ++ fmov b5, c02 ++ unop ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ MUL alpha, c05, b5 ++ fmov b5, c05 ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ MUL alpha, c06, b5 ++ fmov b5, c06 ++ ++ MUL alpha, c09, b5 ++ fmov b5, c09 ++#ifndef TRMMKERNEL ++ ADD c01, a3, b5 ++ fmov b5, c01 ++#endif ++ MUL alpha, c10, b5 ++ fmov b5, c10 ++#ifndef TRMMKERNEL ++ ADD c02, a4, b5 ++ fmov b5, c02 ++#endif ++ ++ MUL alpha, c13, b5 ++ fmov b5, c13 ++#ifndef TRMMKERNEL ++ ADD c05, a5, b5 ++ fmov b5, c05 ++#endif ++ MUL alpha, c14, b5 ++ fmov b5, c14 ++#ifndef TRMMKERNEL ++ IFMOVD tmp, b5 ++ fstd b1, 88($sp) ++# FIMOVD b1, tmp ++ ADD c06, b5, b1 ++ fmov b1, c06 ++ fldd b1, 88($sp) ++# IFMOVD tmp, b1 ++#endif ++ ++#ifndef TRMMKERNEL ++ ADD c09, b1, b5 ++ fmov b5, c09 ++ unop ++#endif ++ ST c01, 0 * SIZE(C1) ++ fclr t1 ++ ++#ifndef TRMMKERNEL ++ ADD c10, b2, b5 ++ fmov b5, c10 ++ unop ++#endif ++ ST c02, 1 * SIZE(C1) ++ fclr t2 ++ ++#ifndef TRMMKERNEL ++ ADD c13, b3, b5 ++ fmov b5, c13 ++ unop ++#endif ++ ST c05, 0 * SIZE(C2) ++ fclr t3 ++ ++#ifndef TRMMKERNEL ++ ADD c14, b4, b5 ++ fmov b5, c14 ++ unop ++#endif ++ ST c06, 1 * SIZE(C2) ++ fclr t4 ++ ++ ST c09, 0 * SIZE(C3) ++ ldi C1, 2 * SIZE(C1) ++ ST c10, 1 * SIZE(C3) ++ ldi C2, 2 * SIZE(C2) ++ ++ ST c13, 0 * SIZE(C4) ++ ldi C3, 2 * SIZE(C3) ++ ST c14, 1 * SIZE(C4) ++ ldi C4, 2 * SIZE(C4) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if 
defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ and M, 1, I ++ ble I, $L39 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble L, $L35 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ LD b5, 3 * SIZE(BO) ++ FIMOVD b5, tmp ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ IFMOVD tmp, b5 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L38 ++#else ++ blbs TMP1, $L38 ++#endif ++ .align 4 ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L38: ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ unop ++ MUL a1, b3, t3 ++#ifndef TRMMKERNEL ++ LD b5, 0 * SIZE(C2) ++ FIMOVD b5, tmp ++#else ++ unop ++#endif ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b4, t4 ++#ifndef TRMMKERNEL ++ LD a2, 0 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL alpha, c01, b5 ++ fmov b5, c01 ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ unop ++ MUL alpha, c05, b5 ++ fmov b5, c05 ++ unop ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ MUL alpha, c09, b5 ++ fmov b5, c09 ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL alpha, c13, b5 ++ fmov b5, c13 ++ ++#ifndef TRMMKERNEL ++ IFMOVD tmp, b5 ++ fstd b1, 88($sp) ++# FIMOVD b1, tmp ++ ADD c01, a5, b1 ++ fmov b1, c01 ++ ADD c05, b5, b1 ++ fmov b1, c05 ++ ADD c09, a2, b1 ++ fmov b1, c09 ++ ADD c13, a3, 
b1 ++ fmov b1, c13 ++ fldd b1, 88($sp) ++# IFMOVD tmp, b1 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif ++ .align 4 ++ ++$L39: ++ mov BO, B ++ ldi J, -1(J) ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 4, KK ++#else ++ unop ++#endif ++ bgt J, $L01 ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ fclr t1 ++ addl C2, LDC, C ++ fclr t2 ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 ++ ++$L51: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ble L, $L55 ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD 
c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L58 ++#else ++ blbs TMP1, $L58 ++#endif ++ .align 4 ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L58: ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b1, t3 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, t4 ++#ifndef TRMMKERNEL ++ LD c11, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, t1 ++#ifndef TRMMKERNEL ++ LD c12, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, t2 ++#ifndef TRMMKERNEL ++ LD c13, 0 * SIZE(C2) ++ unop ++#endif ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, t3 ++#ifndef TRMMKERNEL ++ LD c14, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ unop ++ MUL a4, b2, t4 ++#ifndef TRMMKERNEL ++ LD c15, 2 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL alpha, c01, b5 ++ fmov b5, c01 ++#ifndef TRMMKERNEL ++ LD c16, 3 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ ldi I, -1(I) ++ MUL alpha, c02, b5 ++ fmov b5, c02 ++ unop ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ MUL alpha, c03, b5 ++ fmov b5, c03 ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ MUL alpha, c04, b5 ++ fmov b5, c04 ++ ++ MUL alpha, c05, b5 ++ fmov b5, c05 ++#ifndef TRMMKERNEL ++ ADD c01, c09, b5 ++ fmov b5, c01 ++#endif ++ MUL alpha, c06, b5 ++ fmov b5, c06 ++#ifndef TRMMKERNEL ++ ADD c02, c10, b5 ++ fmov b5, c02 ++#endif ++ ++ MUL alpha, c07, b5 ++ fmov b5, c07 ++#ifndef TRMMKERNEL ++ ADD c03, c11, b5 ++ fmov b5, c03 ++#endif ++ MUL alpha, c08, b5 ++ fmov b5, c08 ++#ifndef TRMMKERNEL ++ ADD c04, c12, b5 ++ fmov b5, c04 ++#endif ++ ++#ifndef TRMMKERNEL ++ ADD c05, c13, b5 ++ fmov b5, c05 ++#endif ++ ST c01, 0 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c06, c14, b5 ++ fmov b5, c06 ++#endif ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef TRMMKERNEL ++ ADD c07, c15, b5 ++ fmov b5, c07 ++#endif ++ ST c03, 2 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c08, c16, b5 ++ fmov b5, c08 ++#endif ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ fclr t1 ++ ST c06, 1 * SIZE(C2) ++ fclr t2 ++ ST c07, 2 * SIZE(C2) ++ fclr t3 ++ ST c08, 3 * SIZE(C2) ++ fclr t4 ++ ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ 
sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ bgt I, $L51 ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ble L, $L65 ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L68 ++#else ++ blbs TMP1, $L68 ++#endif ++ .align 4 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L68: ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c11, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL alpha, c01, b5 ++ fmov b5, c01 ++#ifndef TRMMKERNEL ++ LD c12, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi C1, 2 * SIZE(C1) ++ MUL alpha, c02, b5 ++ fmov b5, c02 ++ ldi C2, 2 * SIZE(C2) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ MUL alpha, c05, b5 ++ fmov b5, c05 ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL alpha, c06, b5 ++ fmov b5, c06 ++ ++#ifndef TRMMKERNEL ++ ADD c01, 
c09, b5 ++ fmov b5, c01 ++ ADD c02, c10, b5 ++ fmov b5, c02 ++ ADD c05, c11, b5 ++ fmov b5, c05 ++ ADD c06, c12, b5 ++ fmov b5, c06 ++#endif ++ ++ ST c01, -2 * SIZE(C1) ++ fclr t1 ++ ST c02, -1 * SIZE(C1) ++ fclr t2 ++ ST c05, -2 * SIZE(C2) ++ fclr t3 ++ ST c06, -1 * SIZE(C2) ++ fclr t4 ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ and M, 1, I ++ ble I, $L79 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ble L, $L75 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, b5 ++ fmov b5, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L78 ++#else ++ blbs TMP1, $L78 ++#endif ++ .align 4 ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L78: ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c02, t3, b5 ++ fmov b5, c02 ++ ADD c06, t4, b5 ++ fmov b5, c06 ++#ifndef TRMMKERNEL ++ LD b5, 0 * SIZE(C2) ++ FIMOVD b5, tmp ++#else ++ unop ++#endif ++ ++ ADD c01, c02, b5 ++ fmov b5, c01 ++ ADD c05, c06, b5 ++ fmov b5, c05 ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ++ MUL alpha, c01, b5 ++ fmov b5, c01 ++ MUL alpha, c05, b5 ++ fmov b5, c05 ++ ++#ifndef TRMMKERNEL ++ IFMOVD tmp ,b5 ++ fstd b1, 88($sp) ++# FIMOVD b1, tmp ++ ADD c01, a5, b1 ++ fmov b1, c01 ++ ADD c05, b5, b1 ++ fmov b1, c05 ++ fldd b1, 88($sp) ++# IFMOVD tmp ,b1 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) 
&& defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif ++ .align 4 ++ ++$L79: ++ mov BO, B ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 2, KK ++#else ++ unop ++#endif ++ unop ++ unop ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++ mov C, C1 ++ mov A, AO ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 ++ .align 4 ++ ++$L91: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L95 ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++ unop ++ 
ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++#ifndef TRMMKERNEL ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD c05, 0 * SIZE(C1) ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ LD c06, 1 * SIZE(C1) ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD c07, 2 * SIZE(C1) ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ LD c08, 3 * SIZE(C1) ++#else ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++#endif ++ ++ MUL alpha, c01, b5 ++ fmov b5, c01 ++ MUL alpha, c02, b5 ++ fmov b5, c02 ++ MUL alpha, c03, b5 ++ fmov b5, c03 ++ MUL alpha, c04, b5 ++ fmov b5, c04 ++ ++#ifndef TRMMKERNEL ++ ADD c01, c05, b5 ++ fmov b5, c01 ++ ADD c02, c06, b5 ++ fmov b5, c02 ++ ADD c03, c07, b5 ++ fmov b5, c03 ++ ADD c04, c08, b5 ++ fmov b5, c04 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ldi C1, 4 * SIZE(C1) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ unop ++ unop ++ ble I, $L110 ++ .align 4 ++ ++$L101: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L105 ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, 
b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C1) ++ LD a4, 1 * SIZE(C1) ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ fclr t1 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ fclr t2 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ fclr t3 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ fclr t4 ++ ++ ADD c01, c03, b5 ++ fmov b5, c01 ++ ADD c02, c04, b5 ++ fmov b5, c02 ++ ++ MUL alpha, c01, b5 ++ fmov b5, c01 ++ MUL alpha, c02, b5 ++ fmov b5, c02 ++ ++#ifndef TRMMKERNEL ++ ADD c01, a3, b5 ++ fmov b5, c01 ++ ADD c02, a4, b5 ++ fmov b5, c02 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ldi C1, 2 * SIZE(C1) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ and M, 1, I ++ ble I, $L999 ++ .align 4 ++ ++$L111: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 
* SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++#ifndef TRMMKERNEL ++ LD a2, 0 * SIZE(C1) ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++ ADD c01, c02, b5 ++ fmov b5, c01 ++ ADD c03, c04, b5 ++ fmov b5, c03 ++ ADD c01, c03, b5 ++ fmov b5, c01 ++ ++ MUL alpha, c01, b5 ++ fmov b5, c01 ++#ifndef TRMMKERNEL ++ ADD c01, a2, b5 ++ fmov b5, c01 ++#endif ++ ST c01, 0 * SIZE(C1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldl $9, 80($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/gemm_kernel_4x4.S.bak b/kernel/sw_64/gemm_kernel_4x4.S.bak +new file mode 100644 +index 0000000..10dc98d +--- /dev/null ++++ b/kernel/sw_64/gemm_kernel_4x4.S.bak +@@ -0,0 +1,2844 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." 
++#endif ++ ++#ifdef SW2B ++#define PREFETCHSIZE 56 ++#define UNOP nop ++#endif ++ ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define BB $3 ++#define OFFSET $4 ++ ++#define ALPHA 64($sp) ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++#ifdef TRMMKERNEL ++ ldl OFFSET, 16 + STACKSIZE($sp) ++#endif ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ fstd $f19, ALPHA ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 ++ ++$L01: ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ s4addl K, 0, BB ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ addl C2, LDC, C3 ++ s4addl LDC, C, C ++ ++ SXADDQ BB, B, BB ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(EV5) || defined(EV6) || defined(SW2B) ++ fillcs 0 * SIZE(BB) ++ fillcs 8 * SIZE(BB) ++ unop ++ ldi BB, 16 * SIZE(BB) ++#endif ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillcs 7 
* SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++#endif ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, c11 ++ fldd alpha, ALPHA ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L18 ++#else ++ blbs TMP1, $L18 ++#endif ++ .align 4 ++ ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ 
MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L18: ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++#ifndef TRMMKERNEL ++ LD b5, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL b1, a3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++#ifndef TRMMKERNEL ++ LD b1, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++#ifndef TRMMKERNEL ++ LD a1, 0 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++#ifndef TRMMKERNEL ++ LD a2, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++#ifndef TRMMKERNEL ++ LD b2, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c09, t1, c09 ++ ldi I, -1(I) ++ MUL b3, a3, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++#ifndef TRMMKERNEL ++ LD b3, 0 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++#ifndef TRMMKERNEL ++ LD a4, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++#ifndef TRMMKERNEL ++ LD a3, 2 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c11, t1, c11 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD b4, 3 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL alpha, c02, c02 ++#ifndef TRMMKERNEL ++ LD t1, 1 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c16, t3, c16 ++ unop ++ MUL alpha, c03, c03 ++#ifndef TRMMKERNEL ++ LD t2, 2 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c15, t4, c15 ++ unop ++ MUL alpha, c04, c04 ++#ifndef TRMMKERNEL ++ LD t3, 3 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ MUL alpha, c05, c05 ++ unop ++#ifndef TRMMKERNEL ++ ADD c01, a5, c01 ++ LD t4, 1 * SIZE(C4) ++#else ++ unop ++ unop ++#endif ++ ++ MUL alpha, c06, c06 ++#ifndef TRMMKERNEL ++ unop ++ ADD c02, b5, c02 ++ LD a5, 2 * SIZE(C4) ++#endif ++ ++ MUL alpha, c07, c07 ++#ifndef TRMMKERNEL ++ unop ++ ADD c03, a2, c03 ++ LD b5, 3 * SIZE(C4) ++#endif ++ ++ MUL alpha, c08, c08 ++#ifndef TRMMKERNEL ++ unop ++ ADD c04, b2, c04 ++ unop ++#endif ++ ++ MUL alpha, c09, c09 ++ ST c01, 0 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c05, b1, c05 ++ unop ++#endif ++ ++ MUL alpha, c10, c10 ++ ST c02, 1 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c06, a4, c06 ++ unop ++#endif ++ ++ MUL alpha, c11, c11 ++ ST c03, 2 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c07, a3, c07 ++ unop ++#endif ++ ++ MUL alpha, c12, c12 ++ ST c04, 3 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c08, b4, c08 ++#else ++ unop ++#endif ++ ldi C1, 4 * SIZE(C1) ++ ++ MUL alpha, c13, c13 ++ ST c05, 0 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c09, a1, c09 ++ unop ++#endif ++ ++ MUL alpha, c14, c14 ++ ST c06, 1 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c10, t1, c10 ++ unop ++#endif ++ ++ MUL alpha, c15, c15 ++ ST c07, 2 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c11, t2, c11 ++ unop ++#endif ++ ++ MUL alpha, c16, c16 ++ ST c08, 3 * SIZE(C2) ++#ifndef TRMMKERNEL ++ ADD c12, t3, c12 ++#else ++ unop ++#endif ++ ldi C2, 4 * SIZE(C2) ++ ++#ifndef TRMMKERNEL ++ ADD c13, b3, c13 ++#else ++ unop ++#endif ++ ST c09, 0 * SIZE(C3) ++ fclr t1 ++ ldi C4, 4 * SIZE(C4) ++ 
++#ifndef TRMMKERNEL ++ ADD c14, t4, c14 ++#else ++ unop ++#endif ++ ST c10, 1 * SIZE(C3) ++ fclr t2 ++ unop ++ ++#ifndef TRMMKERNEL ++ ADD c15, a5, c15 ++#else ++ unop ++#endif ++ ST c11, 2 * SIZE(C3) ++ fclr t3 ++ unop ++ ++#ifndef TRMMKERNEL ++ ADD c16, b5, c16 ++#else ++ unop ++#endif ++ ST c12, 3 * SIZE(C3) ++ fclr t4 ++ ldi C3, 4 * SIZE(C3) ++ ++ ST c13, -4 * SIZE(C4) ++ ST c14, -3 * SIZE(C4) ++ ST c15, -2 * SIZE(C4) ++ ST c16, -1 * SIZE(C4) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble L, $L25 ++ ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, c09 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L28 ++#else ++ blbs TMP1, $L28 ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ 
unop ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L28: ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD a4, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++#ifndef TRMMKERNEL ++ LD b5, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++#ifndef TRMMKERNEL ++ LD b1, 0 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++#ifndef TRMMKERNEL ++ LD b2, 1 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++#ifndef TRMMKERNEL ++ LD b3, 0 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c09, t1, c09 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD b4, 1 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL alpha, c02, c02 ++ unop ++ ++ ADD c13, t3, c13 ++ MUL alpha, c05, c05 ++ ADD c14, t4, c14 ++ MUL alpha, c06, c06 ++ ++ MUL alpha, c09, c09 ++#ifndef TRMMKERNEL ++ ADD c01, a3, c01 ++#endif ++ MUL alpha, c10, c10 ++#ifndef TRMMKERNEL ++ ADD c02, a4, c02 ++#endif ++ ++ MUL alpha, c13, c13 ++#ifndef TRMMKERNEL ++ ADD c05, a5, c05 ++#endif ++ MUL alpha, c14, c14 ++#ifndef TRMMKERNEL ++ ADD c06, b5, c06 ++#endif ++ ++#ifndef TRMMKERNEL ++ ADD c09, b1, c09 ++ unop ++#endif ++ ST c01, 0 * SIZE(C1) ++ fclr t1 ++ ++#ifndef TRMMKERNEL ++ ADD c10, b2, c10 ++ unop ++#endif ++ ST c02, 1 * SIZE(C1) ++ fclr t2 ++ ++#ifndef TRMMKERNEL ++ ADD c13, b3, c13 ++ unop ++#endif ++ ST c05, 0 * SIZE(C2) ++ fclr t3 ++ ++#ifndef TRMMKERNEL ++ ADD c14, b4, c14 ++ unop ++#endif ++ ST c06, 1 * SIZE(C2) ++ fclr t4 ++ ++ ST c09, 0 * SIZE(C3) ++ ldi C1, 2 * SIZE(C1) ++ ST c10, 1 * SIZE(C3) ++ ldi C2, 2 * SIZE(C2) ++ ++ ST c13, 0 * SIZE(C4) ++ ldi C3, 2 * SIZE(C3) ++ ST c14, 1 * SIZE(C4) ++ ldi C4, 2 * SIZE(C4) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ and M, 1, I ++ ble I, $L39 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 4, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble L, $L35 ++#else ++ sll KK, 
BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L38 ++#else ++ blbs TMP1, $L38 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L38: ++ ADD c05, t2, c05 ++ unop ++ MUL a1, b2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c09, t3, c09 ++ unop ++ MUL a1, b3, t3 ++#ifndef TRMMKERNEL ++ LD b5, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c13, t4, c13 ++ unop ++ MUL a1, b4, t4 ++#ifndef TRMMKERNEL ++ LD a2, 0 * SIZE(C3) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C4) ++#else ++ unop ++#endif ++ ++ ADD c05, t2, c05 ++ unop ++ MUL alpha, c05, c05 ++ unop ++ ++ ADD c09, t3, c09 ++ MUL alpha, c09, c09 ++ ADD c13, t4, c13 ++ MUL alpha, c13, c13 ++ ++#ifndef TRMMKERNEL ++ ADD c01, a5, c01 ++ ADD c05, b5, c05 ++ ADD c09, a2, c09 ++ ADD c13, a3, c13 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif ++ .align 4 ++ ++$L39: ++ mov BO, B ++ ldi J, -1(J) ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 4, KK ++#else ++ unop ++#endif ++ bgt J, $L01 ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ fclr t1 ++ addl C2, LDC, C ++ fclr t2 ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 ++ ++$L51: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ 
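++# mr=4, nr=2 block: clear the eight accumulators c01..c08 and preload a1..a4 / b1..b4 before the unrolled K loop at $L52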
++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ble L, $L55 ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, c05 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L58 ++#else ++ blbs TMP1, $L58 ++#endif ++ .align 4 ++ ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L58: ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++#ifndef TRMMKERNEL ++ LD c11, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++#ifndef TRMMKERNEL ++ LD c12, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++#ifndef TRMMKERNEL ++ LD c13, 0 * SIZE(C2) ++ unop ++#endif ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++#ifndef TRMMKERNEL ++ LD c14, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c04, t4, c04 
++ unop ++ MUL a4, b2, t4 ++#ifndef TRMMKERNEL ++ LD c15, 2 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c05, t1, c05 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD c16, 3 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c06, t2, c06 ++ ldi I, -1(I) ++ MUL alpha, c02, c02 ++ unop ++ ++ ADD c07, t3, c07 ++ MUL alpha, c03, c03 ++ ADD c08, t4, c08 ++ MUL alpha, c04, c04 ++ ++ MUL alpha, c05, c05 ++#ifndef TRMMKERNEL ++ ADD c01, c09, c01 ++#endif ++ MUL alpha, c06, c06 ++#ifndef TRMMKERNEL ++ ADD c02, c10, c02 ++#endif ++ ++ MUL alpha, c07, c07 ++#ifndef TRMMKERNEL ++ ADD c03, c11, c03 ++#endif ++ MUL alpha, c08, c08 ++#ifndef TRMMKERNEL ++ ADD c04, c12, c04 ++#endif ++ ++#ifndef TRMMKERNEL ++ ADD c05, c13, c05 ++#endif ++ ST c01, 0 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c06, c14, c06 ++#endif ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef TRMMKERNEL ++ ADD c07, c15, c07 ++#endif ++ ST c03, 2 * SIZE(C1) ++#ifndef TRMMKERNEL ++ ADD c08, c16, c08 ++#endif ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ fclr t1 ++ ST c06, 1 * SIZE(C2) ++ fclr t2 ++ ST c07, 2 * SIZE(C2) ++ fclr t3 ++ ST c08, 3 * SIZE(C2) ++ fclr t4 ++ ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ bgt I, $L51 ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ble L, $L65 ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef 
TRMMKERNEL ++ blbs K, $L68 ++#else ++ blbs TMP1, $L68 ++#endif ++ .align 4 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L68: ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c11, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c01, t1, c01 ++ unop ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ LD c12, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c02, t2, c02 ++ ldi C1, 2 * SIZE(C1) ++ MUL alpha, c02, c02 ++ ldi C2, 2 * SIZE(C2) ++ ++ ADD c05, t3, c05 ++ MUL alpha, c05, c05 ++ ADD c06, t4, c06 ++ MUL alpha, c06, c06 ++ ++#ifndef TRMMKERNEL ++ ADD c01, c09, c01 ++ ADD c02, c10, c02 ++ ADD c05, c11, c05 ++ ADD c06, c12, c06 ++#endif ++ ++ ST c01, -2 * SIZE(C1) ++ fclr t1 ++ ST c02, -1 * SIZE(C1) ++ fclr t2 ++ ST c05, -2 * SIZE(C2) ++ fclr t3 ++ ST c06, -1 * SIZE(C2) ++ fclr t4 ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ and M, 1, I ++ ble I, $L79 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ble L, $L75 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, c01 ++ fldd alpha, ALPHA ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L78 ++#else ++ blbs TMP1, $L78 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 
* SIZE(BO) ++ ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L78: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 ++#ifndef TRMMKERNEL ++ LD b5, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD c01, c02, c01 ++ ADD c05, c06, c05 ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ++ MUL alpha, c01, c01 ++ MUL alpha, c05, c05 ++ ++#ifndef TRMMKERNEL ++ ADD c01, a5, c01 ++ ADD c05, b5, c05 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif ++ .align 4 ++ ++$L79: ++ mov BO, B ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 2, KK ++#else ++ unop ++#endif ++ unop ++ unop ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++ mov C, C1 ++ mov A, AO ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 ++ .align 4 ++ ++$L91: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 4, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L95 ++#else ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b4, 
t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++#ifndef TRMMKERNEL ++ ADD c01, t1, c01 ++ LD c05, 0 * SIZE(C1) ++ ADD c02, t2, c02 ++ LD c06, 1 * SIZE(C1) ++ ADD c03, t3, c03 ++ LD c07, 2 * SIZE(C1) ++ ADD c04, t4, c04 ++ LD c08, 3 * SIZE(C1) ++#else ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++#endif ++ ++ MUL alpha, c01, c01 ++ MUL alpha, c02, c02 ++ MUL alpha, c03, c03 ++ MUL alpha, c04, c04 ++ ++#ifndef TRMMKERNEL ++ ADD c01, c05, c01 ++ ADD c02, c06, c02 ++ ADD c03, c07, c03 ++ ADD c04, c08, c04 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ldi C1, 4 * SIZE(C1) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 4, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ unop ++ unop ++ ble I, $L110 ++ .align 4 ++ ++$L101: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L105 ++#else ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) 
++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++#ifndef TRMMKERNEL ++ LD a3, 0 * SIZE(C1) ++ LD a4, 1 * SIZE(C1) ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, c01 ++ fclr t1 ++ ADD c02, t2, c02 ++ fclr t2 ++ ADD c03, t3, c03 ++ fclr t3 ++ ADD c04, t4, c04 ++ fclr t4 ++ ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 ++ ++ MUL alpha, c01, c01 ++ MUL alpha, c02, c02 ++ ++#ifndef TRMMKERNEL ++ ADD c01, a3, c01 ++ ADD c02, a4, c02 ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ldi C1, 2 * SIZE(C1) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ and M, 1, I ++ ble I, $L999 ++ .align 4 ++ ++$L111: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++#ifndef TRMMKERNEL ++ sra K, 2, L ++#else ++ sra TMP1, 2, L ++#endif ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#ifndef TRMMKERNEL ++ and K, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ fldd alpha, ALPHA ++#ifndef TRMMKERNEL ++ LD a2, 0 * SIZE(C1) ++#endif ++ ble L, $L118 ++ .align 4 ++ 
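++# K-remainder loop of the 1x1 tail: one multiply-accumulate into c01 per remaining iteration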
++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 ++ ++ MUL alpha, c01, c01 ++#ifndef TRMMKERNEL ++ ADD c01, a2, c01 ++#endif ++ ST c01, 0 * SIZE(C1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/gemm_kernel_simd_16x4.S b/kernel/sw_64/gemm_kernel_simd_16x4.S +new file mode 100644 +index 0000000..1acf679 +--- /dev/null ++++ b/kernel/sw_64/gemm_kernel_simd_16x4.S +@@ -0,0 +1,4054 @@ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." ++#endif ++ ++ ++#define STACKSIZE 336 ++ ++#define CO $1 ++#define C1 $2 ++#define C2 $3 ++#define C3 $4 ++ ++#define LDM $5 ++ ++#define PREB $7 ++#define SPANA $8 ++#define SPANB $9 ++#define NC1 $10 ++#define KC1 $11 ++#define MC1 $12 ++#define PREA $13 ++ ++#define A $20 ++#define B $21 ++#define C $19 ++#define MC $16 ++#define NC $17 ++#define KC $18 ++ ++#define A1 $22 ++#define B1 $23 ++ ++#define ALPHA $f8 ++ ++#define a0 $f0 ++#define a4 $f1 ++#define a8 $f2 ++#define a12 $f3 ++ ++#define b0 $f4 ++#define b1 $f5 ++#define b2 $f6 ++#define b3 $f7 ++ ++#define na0 $f0 ++#define na4 $f8 ++#define na8 $f9 ++#define na12 $f10 ++ ++#define nb0 $f11 ++#define nb1 $f12 ++#define nb2 $f13 ++#define nb3 $f14 ++ ++#define t00 $f15 ++#define t01 $f16 ++#define t02 $f17 ++#define t03 $f18 ++#define t04 $f19 ++#define t05 $f20 ++#define t06 $f21 ++#define t07 $f22 ++#define t08 $f23 ++#define t09 $f24 ++#define t10 $f25 ++#define t11 $f26 ++#define t12 $f27 ++#define t13 $f28 ++#define t14 $f29 ++#define t15 $f30 ++ ++#define c00 $f1 ++#define c01 $f2 ++#define c02 $f3 ++#define c03 $f4 ++ ++#define c04 $f5 ++#define c05 $f6 ++#define c06 $f7 ++#define c07 $f9 ++ ++#define c08 $f10 ++#define c09 $f11 ++#define c10 $f12 ++#define c11 $f13 ++ ++#define c12 $f1 ++#define c13 $f2 ++#define c14 $f3 ++#define c15 $f4 ++ ++#if defined(TRMMKERNEL) ++#define TEMP $14 ++#define KK $24 ++#define OFFSET $25 ++#endif ++ ++ PROLOGUE ++ PROFCODE ++ ++.frame $30,STACKSIZE,$26,0 ++ldi $sp,-STACKSIZE($sp) # # [2] ++ ++ stl $9,328($sp) # Integer Saved Register ++ stl $10,320($sp) ++ stl $11,312($sp) ++ stl $12,304($sp) ++ stl $13,296($sp) ++ stl $14,288($sp) ++ ++ ++ ST $f2,280($sp) # Float Saved Register ++ ST $f3,272($sp) ++ ST $f4,264($sp) ++ ST $f5,256($sp) ++ ST $f6,248($sp) ++ ST $f7,240($sp) ++ ST $f8,232($sp) ++ ST $f9,224($sp) ++ ++ ++ ++ .align 5 ++ ++$Begin_NC_Unroll4: ++ ldl C, 0 + STACKSIZE($sp) # load C ++ ldl LDM, 8 + STACKSIZE($sp) # load ldm ++ ++#ifdef TRMMKERNEL ++ ldl OFFSET, 16 + STACKSIZE($sp) # load offset ++ nop ++#endif ++ ++ ST $f19, 192($sp) # store alpha ++ SXADDQ LDM, 0, LDM # ldm*X+0 ++ ++ mov NC, NC1 # backup nc ++ mov KC, KC1 # backup kc ++ mov MC, MC1 # backup mc ++ ++ mov B, B1 # backup the initial address of b ++ sra NC1,2,NC # NC=NC1/4 Unroll N 4 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK # when trmm at right ++ nop ++#endif ++ ++ mov A, A1 # backup the initial address of a ++ sll KC1,1+BASE_SHIFT,SPANB # kc*2nr ++ ++ sll 
KC1,4+BASE_SHIFT,SPANA # kc*16mr ++ beq NC,$Begin_NC_Unroll2 ++ ++ ++ .align 5 ++ ++.L0: ++ sra MC1,4,MC # MC=MC1/16 ++ mov C, CO # compute c pointer ++ ++ addl B1,SPANB,PREB # prefetch B ++ addl A1,SPANA,PREA # prefetch A ++ ++ addl C, LDM, C1 ++ addl C1,LDM, C2 ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET,KK # Reset the left offset ++ nop ++#endif ++ ++ subl PREA,16*SIZE,PREA # prea=kc1*mc-mc ++ addl C2,LDM, C3 ++ ++ s4addl LDM,C,C # C=ldm*4+C ++ beq MC,.L15 # MC=0:MC1<16 ++ ++ ++ .align 5 # nr=4,mr=4----------------------------- ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B # LL && RU reset B ++ nop ++#else ++ sll KK, 4 + BASE_SHIFT, KC # KK*16 ++ sll KK, 2 + BASE_SHIFT, TEMP # KK*4 ++ ++ addl A, KC, A # mov A point to the data part ++ addl B1,TEMP,B # mov B point to the data part ++#endif ++ ++ vcpys $f31,$f31,t00 # CLEAR Results Register ++ fillcs 0(CO) # prefetch C ++ fillcs 0(C1) ++ ++ vcpys $f31,$f31,t01 # 64 results ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++ vcpys $f31,$f31,t02 ++ LDDE b0,0*SIZE(B) ++ LDDE b1,1*SIZE(B) ++ ++ vcpys $f31,$f31,t03 ++ LDDE b2,2*SIZE(B) ++ LDDE b3,3*SIZE(B) ++ ++ vcpys $f31,$f31,t04 ++ fillcs 4(CO) # prefetch C ++ fillcs 4(C1) ++ ++ vcpys $f31,$f31,t05 ++ fillcs 4(C2) ++ fillcs 4(C3) ++ ++ vcpys $f31,$f31,t06 ++ VLD a0, 0*SIZE(A) ++ VLD a4, 4*SIZE(A) ++ ++ vcpys $f31,$f31,t07 ++ VLD a8, 8*SIZE(A) ++ VLD a12,12*SIZE(A) ++ ++ vcpys $f31,$f31,t08 ++ fillcs 8*SIZE(CO) ++ fillcs 8*SIZE(C1) ++ ++ vcpys $f31,$f31,t09 ++ fillcs 8*SIZE(C2) ++ fillcs 8*SIZE(C3) ++ ++ vcpys $f31,$f31,t10 ++ fillcs 12*SIZE(CO) ++ fillcs 12*SIZE(C1) ++ ++ vcpys $f31,$f31,t11 ++ fillcs 12*SIZE(C2) ++ fillcs 12*SIZE(C3) ++ ++ vcpys $f31,$f31,t12 ++ vcpys $f31,$f31,t13 ++ vcpys $f31,$f31,t14 ++ vcpys $f31,$f31,t15 ++ ++ ++#if (defined(LEFT) && !defined(TRANSA)) \ ++ ||(!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP # temp is the length of data part ++#elif defined(LEFT) ++ addl KK, 16, TEMP # mr=16 ++#else ++ addl KK, 4, TEMP # right nr=4 ++#endif ++ sra TEMP, 1, KC # KC=TEMP/2 ++ ++ nop ++ beq KC, $Rest_16x4x1 ++ ++#else ++ ++ vcpys $f31,$f31,t00 # CLEAR Results Register ++ mov B1,B # Reset B ++ sra KC1,1,KC # Unroll Kr=2, KC=KC1/2 ++ ++ vcpys $f31,$f31,t01 # 64 results ++ fillcs 0(CO) # prefetch C ++ fillcs 0(C1) ++ ++ vcpys $f31,$f31,t02 ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++ vcpys $f31,$f31,t03 ++ LDDE b0,0*SIZE(B) ++ LDDE b1,1*SIZE(B) ++ ++ vcpys $f31,$f31,t04 ++ LDDE b2,2*SIZE(B) ++ LDDE b3,3*SIZE(B) ++ ++ vcpys $f31,$f31,t05 ++ fillcs 4(CO) # prefetch C ++ fillcs 4(C1) ++ ++ vcpys $f31,$f31,t06 ++ fillcs 4(C2) ++ fillcs 4(C3) ++ ++ vcpys $f31,$f31,t07 ++ VLD a0, 0*SIZE(A) ++ VLD a4, 4*SIZE(A) ++ ++ vcpys $f31,$f31,t08 ++ VLD a8, 8*SIZE(A) ++ VLD a12,12*SIZE(A) ++ ++ vcpys $f31,$f31,t09 ++ fillcs 8(CO) # prefetch C ++ fillcs 8(C1) ++ ++ vcpys $f31,$f31,t10 ++ fillcs 8(C2) ++ fillcs 8(C3) ++ ++ vcpys $f31,$f31,t11 ++ fillcs 12*SIZE(CO) ++ fillcs 12*SIZE(C1) ++ ++ vcpys $f31,$f31,t12 ++ fillcs 12*SIZE(C2) ++ fillcs 12*SIZE(C3) ++ ++ vcpys $f31,$f31,t13 ++ vcpys $f31,$f31,t14 ++ ++ vcpys $f31,$f31,t15 ++ beq KC,$Rest_16x4x1 # KC1<2 goto $Rest_16x4x1 ++ ++#endif ++ ++ .align 5 ++ ++$Panel_16x4x2: # nr=4,mr=4,kr=2------------------------ ++ ++ VMAD a0,b0,t00,t00 ++ addl A,16*SIZE,A # 16a*1k ++ LDDE nb0,4*SIZE(B) # get next 4b ++ ++ VMAD a0,b1,t04,t04 ++ LDDE nb1,5*SIZE(B) ++ ++ VMAD a4,b0,t01,t01 ++ VLD na12,12*SIZE(A) ++ ++ VMAD a4,b1,t05,t05 ++ VLD na8,8*SIZE(A) ++ ++ VMAD a0,b2,t08,t08 
++ LDDE nb2,6*SIZE(B) ++ ++ VMAD a0,b3,t12,t12 ++ LDDE nb3,7*SIZE(B) ++ ++ VMAD a8,b0,t02,t02 ++ VMAD a8,b1,t06,t06 ++ ++ VMAD a4,b2,t09,t09 ++ addl B,8*SIZE,B # 4b*2k ++ VLD na0,0*SIZE(A) # carefule na0=a0 use the same register ++ ++ VMAD a4,b3,t13,t13 ++ VLD na4,4*SIZE(A) # get next 16a ++ ++ VMAD a12,b0,t03,t03 ++ VMAD a12,b1,t07,t07 ++ ++ VMAD a8,b2,t10,t10 ++ fillcs 0(PREB) ++ ++ VMAD a8,b3,t14,t14 ++ fillcs 0(PREA) ++ ++ VMAD a12,b2,t11,t11 ++ fillcs 8*SIZE(PREA) ++ ++ VMAD a12,b3,t15,t15 ++ subl KC,1,KC # loop k -- ++ ++ ++ VMAD na12,nb0,t03,t03 ++ addl A,16*SIZE,A # ### next k ### ++ LDDE b0,0(B) # get 3rd 4b ++ ++ VMAD na12,nb1,t07,t07 ++ LDDE b1,1*SIZE(B) ++ ++ VMAD na8,nb0,t02,t02 ++ VLD a12,12*SIZE(A) ++ ++ VMAD na8,nb1,t06,t06 ++ VLD a8,8*SIZE(A) ++ ++ VMAD na0,nb0,t00,t00 ++ subl PREA,16*SIZE,PREA # prea-=16 ++ LDDE b2,2*SIZE(B) ++ ++ VMAD na0,nb1,t04,t04 ++ LDDE b3,3*SIZE(B) ++ ++ VMAD na12,nb2,t11,t11 ++ VMAD na12,nb3,t15,t15 ++ VMAD na8,nb2,t10,t10 ++ VMAD na8,nb3,t14,t14 ++ ++ VMAD na0,nb2,t08,t08 ++ fillcs 0(PREA) ++ ++ VMAD na0,nb3,t12,t12 ++ fillcs 4*SIZE(PREB) ++ ++ VMAD na4,nb0,t01,t01 ++ VLD a0,0(A) # get 3rd 16a ++ ++ VMAD na4,nb1,t05,t05 ++ VLD a4,4*SIZE(A) ++ ++ VMAD na4,nb2,t09,t09 ++ fillcs 8*SIZE(PREA) ++ addl PREB,8*SIZE,PREB # preb+=8 ++ ++ VMAD na4,nb3,t13,t13 ++ subl PREA,16*SIZE,PREA # prea-=16 ++ bne KC,$Panel_16x4x2 ++ ++ ++$Rest_16x4x1: ++ LDDE ALPHA, 192($sp) # get alpha ++#ifndef TRMMKERNEL ++ blbc KC1, $Write_16x4 ++#else ++ blbc TEMP,$Write_16x4 ++#endif ++ ++ VMAD a0,b0,t00,t00 ++ addl A,16*SIZE,A # 16a*1k ++ ++ VMAD a0,b1,t04,t04 ++ addl B,4*SIZE,B # 4b*1k ++ ++ VMAD a0,b2,t08,t08 ++ VMAD a0,b3,t12,t12 ++ ++ ++ VMAD a4,b0,t01,t01 ++ VMAD a4,b1,t05,t05 ++ VMAD a4,b2,t09,t09 ++ VMAD a4,b3,t13,t13 ++ ++ VMAD a8,b0,t02,t02 ++ VMAD a8,b1,t06,t06 ++ VMAD a8,b2,t10,t10 ++ VMAD a8,b3,t14,t14 ++ ++ VMAD a12,b0,t03,t03 ++ VMAD a12,b1,t07,t07 ++ VMAD a12,b2,t11,t11 ++ VMAD a12,b3,t15,t15 ++ ++ ++ .align 5 ++ ++$Write_16x4: ++ ++#ifndef TRMMKERNEL ++ and CO, (VEC_LEN*SIZE-1), $6 ### gemm part #### ++ bne $6, $UnAlign_CO_Access_16x4 ++ ++$Align_CO_Access_16x4: ++ VLD c00,0(CO) ++ VLD c01,4*SIZE(CO) ++ VLD c02,8*SIZE(CO) ++ VLD c03,12*SIZE(CO) ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ VMAD t02,ALPHA,c02,t02 ++ VMAD t03,ALPHA,c03,t03 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ VST t02,8*SIZE(CO) ++ VST t03,12*SIZE(CO) ++ jmp $Access_C1_16x4 ++ ++$UnAlign_CO_Access_16x4: ++ VLD_UL c00, 0*VEC_LEN*SIZE(CO) ++ VLD_UH c04, 1*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c01, 1*VEC_LEN*SIZE(CO) ++ VLD_UH c05, 2*VEC_LEN*SIZE(CO) ++ ++ vbisw c00,c04,c00 ++ VLD_UL c02, 2*VEC_LEN*SIZE(CO) ++ VLD_UH c06, 3*VEC_LEN*SIZE(CO) ++ ++ vbisw c01,c05,c01 ++ VLD_UL c03, 3*VEC_LEN*SIZE(CO) ++ VLD_UH c07, 4*VEC_LEN*SIZE(CO) ++ ++ vbisw c02,c06,c02 ++ vbisw c03,c07,c03 ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ ++ VMAD t02,ALPHA,c02,t02 ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VMAD t03,ALPHA,c03,t03 ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ VST_UL t02, 2*VEC_LEN*SIZE(CO) ++ VST_UH t02, 3*VEC_LEN*SIZE(CO) ++ ++ VST_UL t03, 3*VEC_LEN*SIZE(CO) ++ VST_UH t03, 4*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_16x4: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_C1_Access_16x4 ++ ++$Align_C1_Access_16x4: ++ VLD c04,0(C1) ++ VLD c05,4*SIZE(C1) ++ VLD c06,8*SIZE(C1) ++ VLD c07,12*SIZE(C1) ++ ++ VMAD t04,ALPHA,c04,t04 ++ VMAD t05,ALPHA,c05,t05 ++ VMAD t06,ALPHA,c06,t06 ++ VMAD t07,ALPHA,c07,t07 ++ ++ VST t04,0(C1) ++ VST 
t05,4*SIZE(C1) ++ VST t06,8*SIZE(C1) ++ VST t07,12*SIZE(C1) ++ jmp $Access_C2_16x4 ++ ++$UnAlign_C1_Access_16x4: ++ VLD_UL c04, 0*VEC_LEN*SIZE(C1) ++ VLD_UH t00, 1*VEC_LEN*SIZE(C1) ++ ++ VLD_UL c05, 1*VEC_LEN*SIZE(C1) ++ VLD_UH t01, 2*VEC_LEN*SIZE(C1) ++ ++ vbisw c04,t00,c04 ++ VLD_UL c06, 2*VEC_LEN*SIZE(C1) ++ VLD_UH t02, 3*VEC_LEN*SIZE(C1) ++ ++ vbisw c05,t01,c05 ++ VLD_UL c07, 3*VEC_LEN*SIZE(C1) ++ VLD_UH t03, 4*VEC_LEN*SIZE(C1) ++ ++ vbisw c06,t02,c06 ++ vbisw c07,t03,c07 ++ ++ VMAD t04,ALPHA,c04,t04 ++ VMAD t05,ALPHA,c05,t05 ++ ++ VMAD t06,ALPHA,c06,t06 ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ VMAD t07,ALPHA,c07,t07 ++ VST_UL t05, 1*VEC_LEN*SIZE(C1) ++ VST_UH t05, 2*VEC_LEN*SIZE(C1) ++ ++ VST_UL t06, 2*VEC_LEN*SIZE(C1) ++ VST_UH t06, 3*VEC_LEN*SIZE(C1) ++ ++ VST_UL t07, 3*VEC_LEN*SIZE(C1) ++ VST_UH t07, 4*VEC_LEN*SIZE(C1) ++ ++ ++$Access_C2_16x4: ++ and C2, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_C2_Access_16x4 ++ ++ $Align_C2_Access_16x4: ++ VLD c08,0(C2) ++ VLD c09,4*SIZE(C2) ++ VLD c10,8*SIZE(C2) ++ VLD c11,12*SIZE(C2) ++ ++ VMAD t08,ALPHA,c08,t08 ++ VMAD t09,ALPHA,c09,t09 ++ VMAD t10,ALPHA,c10,t10 ++ VMAD t11,ALPHA,c11,t11 ++ ++ VST t08,0(C2) ++ VST t09,4*SIZE(C2) ++ VST t10,8*SIZE(C2) ++ VST t11,12*SIZE(C2) ++ jmp $Access_C3_16x4 ++ ++$UnAlign_C2_Access_16x4: ++ VLD_UL c08, 0*VEC_LEN*SIZE(C2) ++ VLD_UH t00, 1*VEC_LEN*SIZE(C2) ++ ++ VLD_UL c09, 1*VEC_LEN*SIZE(C2) ++ VLD_UH t01, 2*VEC_LEN*SIZE(C2) ++ ++ vbisw c08,t00,c08 ++ VLD_UL c10, 2*VEC_LEN*SIZE(C2) ++ VLD_UH t02, 3*VEC_LEN*SIZE(C2) ++ ++ vbisw c09,t01,c09 ++ VLD_UL c11, 3*VEC_LEN*SIZE(C2) ++ VLD_UH t03, 4*VEC_LEN*SIZE(C2) ++ ++ vbisw c10,t02,c10 ++ vbisw c11,t03,c11 ++ ++ VMAD t08,ALPHA,c08,t08 ++ VMAD t09,ALPHA,c09,t09 ++ ++ VMAD t10,ALPHA,c10,t10 ++ VST_UL t08, 0*VEC_LEN*SIZE(C2) ++ VST_UH t08, 1*VEC_LEN*SIZE(C2) ++ ++ VMAD t11,ALPHA,c11,t11 ++ VST_UL t09, 1*VEC_LEN*SIZE(C2) ++ VST_UH t09, 2*VEC_LEN*SIZE(C2) ++ ++ VST_UL t10, 2*VEC_LEN*SIZE(C2) ++ VST_UH t10, 3*VEC_LEN*SIZE(C2) ++ ++ VST_UL t11, 3*VEC_LEN*SIZE(C2) ++ VST_UH t11, 4*VEC_LEN*SIZE(C2) ++ ++ ++$Access_C3_16x4: ++ and C3, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_C3_Access_16x4 ++ ++$Align_C3_Access_16x4: ++ VLD c12,0(C3) ++ VLD c13,4*SIZE(C3) ++ VLD c14,8*SIZE(C3) ++ VLD c15,12*SIZE(C3) ++ ++ VMAD t12,ALPHA,c12,t12 ++ VMAD t13,ALPHA,c13,t13 ++ VMAD t14,ALPHA,c14,t14 ++ VMAD t15,ALPHA,c15,t15 ++ ++ VST t12,0(C3) ++ VST t13,4*SIZE(C3) ++ VST t14,8*SIZE(C3) ++ VST t15,12*SIZE(C3) ++ jmp $End_NC_Unroll4 ++ ++$UnAlign_C3_Access_16x4: ++ VLD_UL c12, 0*VEC_LEN*SIZE(C3) ++ VLD_UH t04, 1*VEC_LEN*SIZE(C3) ++ ++ VLD_UL c13, 1*VEC_LEN*SIZE(C3) ++ VLD_UH t05, 2*VEC_LEN*SIZE(C3) ++ ++ vbisw c12,t04,c12 ++ VLD_UL c14, 2*VEC_LEN*SIZE(C3) ++ VLD_UH t06, 3*VEC_LEN*SIZE(C3) ++ ++ vbisw c13,t05,c13 ++ VLD_UL c15, 3*VEC_LEN*SIZE(C3) ++ VLD_UH t07, 4*VEC_LEN*SIZE(C3) ++ ++ vbisw c14,t06,c14 ++ vbisw c15,t07,c15 ++ ++ VMAD t12,ALPHA,c12,t12 ++ VMAD t13,ALPHA,c13,t13 ++ ++ VMAD t14,ALPHA,c14,t14 ++ VST_UL t12, 0*VEC_LEN*SIZE(C3) ++ VST_UH t12, 1*VEC_LEN*SIZE(C3) ++ ++ VMAD t15,ALPHA,c15,t15 ++ VST_UL t13, 1*VEC_LEN*SIZE(C3) ++ VST_UH t13, 2*VEC_LEN*SIZE(C3) ++ ++ VST_UL t14, 2*VEC_LEN*SIZE(C3) ++ VST_UH t14, 3*VEC_LEN*SIZE(C3) ++ ++ VST_UL t15, 3*VEC_LEN*SIZE(C3) ++ VST_UH t15, 4*VEC_LEN*SIZE(C3) ++ jmp $End_NC_Unroll4 ++ ++#else ++ and CO, (VEC_LEN*SIZE-1),$6 ### trmm part ### ++ bne $6,$UnAlign_CO_Access_16x4 ++ ++$Align_CO_Access_16x4: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ VMUL t02,ALPHA,t02 ++ VMUL t03,ALPHA,t03 ++ ++ VST t00,0(CO) ++ VST 
t01,4*SIZE(CO) ++ VST t02,8*SIZE(CO) ++ VST t03,12*SIZE(CO) ++ jmp $Access_C1_16x4 ++ ++$UnAlign_CO_Access_16x4: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ ++ VMUL t02,ALPHA,t02 ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VMUL t03,ALPHA,t03 ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ VST_UL t02, 2*VEC_LEN*SIZE(CO) ++ VST_UH t02, 3*VEC_LEN*SIZE(CO) ++ ++ VST_UL t03, 3*VEC_LEN*SIZE(CO) ++ VST_UH t03, 4*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_16x4: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_C1_Access_16x4 ++ ++$Align_C1_Access_16x4: ++ VMUL t04,ALPHA,t04 ++ VMUL t05,ALPHA,t05 ++ VMUL t06,ALPHA,t06 ++ VMUL t07,ALPHA,t07 ++ ++ VST t04,0(C1) ++ VST t05,4*SIZE(C1) ++ VST t06,8*SIZE(C1) ++ VST t07,12*SIZE(C1) ++ jmp $Access_C2_16x4 ++ ++$UnAlign_C1_Access_16x4: ++ VMUL t04,ALPHA,t04 ++ VMUL t05,ALPHA,t05 ++ ++ VMUL t06,ALPHA,t06 ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ VMUL t07,ALPHA,t07 ++ VST_UL t05, 1*VEC_LEN*SIZE(C1) ++ VST_UH t05, 2*VEC_LEN*SIZE(C1) ++ ++ VST_UL t06, 2*VEC_LEN*SIZE(C1) ++ VST_UH t06, 3*VEC_LEN*SIZE(C1) ++ ++ VST_UL t07, 3*VEC_LEN*SIZE(C1) ++ VST_UH t07, 4*VEC_LEN*SIZE(C1) ++ ++ ++$Access_C2_16x4: ++ and C2, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_C2_Access_16x4 ++ ++$Align_C2_Access_16x4: ++ VMUL t08,ALPHA,t08 ++ VMUL t09,ALPHA,t09 ++ VMUL t10,ALPHA,t10 ++ VMUL t11,ALPHA,t11 ++ ++ VST t08,0(C2) ++ VST t09,4*SIZE(C2) ++ VST t10,8*SIZE(C2) ++ VST t11,12*SIZE(C2) ++ jmp $Access_C3_16x4 ++ ++$UnAlign_C2_Access_16x4: ++ VMUL t08,ALPHA,t08 ++ VMUL t09,ALPHA,t09 ++ ++ VMUL t10,ALPHA,t10 ++ VST_UL t08, 0*VEC_LEN*SIZE(C2) ++ VST_UH t08, 1*VEC_LEN*SIZE(C2) ++ ++ VMUL t11,ALPHA,t11 ++ VST_UL t09, 1*VEC_LEN*SIZE(C2) ++ VST_UH t09, 2*VEC_LEN*SIZE(C2) ++ ++ VST_UL t10, 2*VEC_LEN*SIZE(C2) ++ VST_UH t10, 3*VEC_LEN*SIZE(C2) ++ ++ VST_UL t11, 3*VEC_LEN*SIZE(C2) ++ VST_UH t11, 4*VEC_LEN*SIZE(C2) ++ ++ ++$Access_C3_16x4: ++ and C3, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_C3_Access_16x4 ++ ++$Align_C3_Access_16x4: ++ VMUL t12,ALPHA,t12 ++ VMUL t13,ALPHA,t13 ++ VMUL t14,ALPHA,t14 ++ VMUL t15,ALPHA,t15 ++ ++ VST t12,0(C3) ++ VST t13,4*SIZE(C3) ++ VST t14,8*SIZE(C3) ++ VST t15,12*SIZE(C3) ++ jmp $TRMMKERNEL_16x4 ++ ++$UnAlign_C3_Access_16x4: ++ VMUL t12,ALPHA,t12 ++ VMUL t13,ALPHA,t13 ++ ++ VMUL t14,ALPHA,t14 ++ VST_UL t12, 0*VEC_LEN*SIZE(C3) ++ VST_UH t12, 1*VEC_LEN*SIZE(C3) ++ ++ VMUL t15,ALPHA,t15 ++ VST_UL t13, 1*VEC_LEN*SIZE(C3) ++ VST_UH t13, 2*VEC_LEN*SIZE(C3) ++ ++ VST_UL t14, 2*VEC_LEN*SIZE(C3) ++ VST_UH t14, 3*VEC_LEN*SIZE(C3) ++ ++ VST_UL t15, 3*VEC_LEN*SIZE(C3) ++ VST_UH t15, 4*VEC_LEN*SIZE(C3) ++ ++ ++$TRMMKERNEL_16x4: ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP # nodata length ++#ifdef LEFT ++ subl TEMP, 16, TEMP # mr=16 ++#else ++ subl TEMP, 4, TEMP # nr=4 ++#endif ++ ++ sll TEMP, 4 + BASE_SHIFT,KC # mr=16 ++ sll TEMP, 2 + BASE_SHIFT,TEMP # nr=4 ++ ++ addl A, KC, A # mov A to the end of this panel ++ addl B, TEMP,B # mov B to the end of this panel ++#endif ++ ++#ifdef LEFT ++ addl KK, 16 ,KK ++#endif ++ nop ++ jmp $End_NC_Unroll4 ++#endif ++ ++ ++ .align 5 ++ ++.L15: # n=4,m=8----------------------------- ++ and MC1,8,MC ++ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc ++ nop ++ beq MC,.L16 ++ ++ addl A1,SPANA,PREA ++ subl PREA,8*SIZE,PREA # PREA-=MC ++ ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA))\ ++ || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B # set B ++ nop ++#else ++ sll KK, 3 + BASE_SHIFT,KC 
# mr=8 ++ sll KK, 2 + BASE_SHIFT,TEMP # nr=4 ++ ++ addl A,KC,A ++ addl B1,TEMP,B ++#endif ++ ++ vcpys $f31,$f31,t00 # clear (32 results) ++ vcpys $f31,$f31,t01 ++ vcpys $f31,$f31,t04 ++ vcpys $f31,$f31,t05 ++ ++ LDDE b0,0(B) ++ LDDE b1,1*SIZE(B) ++ LDDE b2,2*SIZE(B) ++ LDDE b3,3*SIZE(B) ++ ++ vcpys $f31,$f31,t08 ++ vcpys $f31,$f31,t09 ++ vcpys $f31,$f31,t12 ++ vcpys $f31,$f31,t13 ++ ++ VLD a0,0(A) # get 8 A ++ VLD a4,4*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++ fillcs 4*SIZE(CO) # ++ fillcs 4*SIZE(C1) ++ fillcs 4*SIZE(C2) ++ fillcs 4*SIZE(C3) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP # temp is the length of the data part ++#elif defined(LEFT) ++ addl KK, 8, TEMP # mr=8 ++#else ++ addl KK, 4, TEMP # nr=4 ++#endif ++ sra TEMP,1, KC # kc/2 ++ beq KC,$Rest_8x4x1 ++ ++#else ++ ++ mov B1,B # Reset B ++ sra KC1,1,KC # unroll kc as 2, kc=kc1/2 ++ vcpys $f31,$f31,t00 # clear (32 results) ++ vcpys $f31,$f31,t01 ++ vcpys $f31,$f31,t04 ++ vcpys $f31,$f31,t05 ++ ++ LDDE b0,0(B) ++ LDDE b1,1*SIZE(B) ++ LDDE b2,2*SIZE(B) ++ LDDE b3,3*SIZE(B) ++ ++ vcpys $f31,$f31,t08 ++ vcpys $f31,$f31,t09 ++ vcpys $f31,$f31,t12 ++ vcpys $f31,$f31,t13 ++ ++ VLD a0,0(A) # get 8 A ++ VLD a4,4*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++ fillcs 4*SIZE(CO) # ++ fillcs 4*SIZE(C1) ++ fillcs 4*SIZE(C2) ++ fillcs 4*SIZE(C3) ++ ++ beq KC,$Rest_8x4x1 ++#endif ++ ++ .align 5 ++ ++$Panel_8x4x2: ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ VMAD a0,b2,t08,t08 ++ VMAD a0,b3,t12,t12 ++ ++ LDDE nb0,4*SIZE(B) # get next 4b ++ LDDE nb1,5*SIZE(B) ++ LDDE nb2,6*SIZE(B) ++ LDDE nb3,7*SIZE(B) ++ ++ addl B,8*SIZE,B # 4n*2k ++ VMAD a4,b0,t01,t01 ++ VMAD a4,b1,t05,t05 ++ VMAD a4,b2,t09,t09 ++ VMAD a4,b3,t13,t13 ++ ++ VLD na8,8*SIZE(A) # get next 8a ++ VLD na12,12*SIZE(A) ++ ++ fillcs 0(PREA) ++ fillcs 4*SIZE(PREA) ++ subl PREA,8*SIZE,PREA # prea -= 8 ++ ++ subl KC,1,KC ++ addl A,16*SIZE,A # ### next k ###8m*2k ++ VMAD na8,nb0,t00,t00 ++ VMAD na8,nb1,t04,t04 ++ VMAD na8,nb2,t08,t08 ++ VMAD na8,nb3,t12,t12 ++ ++ LDDE b0,0(B) # get 3rd 4b ++ LDDE b1,1*SIZE(B) ++ LDDE b2,2*SIZE(B) ++ LDDE b3,3*SIZE(B) ++ ++ VMAD na12,nb0,t01,t01 ++ VMAD na12,nb1,t05,t05 ++ VMAD na12,nb2,t09,t09 ++ VMAD na12,nb3,t13,t13 ++ ++ VLD a0,0(A) # get 3rd 8a ++ VLD a4,4*SIZE(A) ++ ++ fillcs 0(PREA) ++ fillcs 4*SIZE(PREA) ++ subl PREA,8*SIZE,PREA # prea -= mc ++ bne KC,$Panel_8x4x2 # loop k-- ++ ++$Rest_8x4x1: ++ LDDE ALPHA, 192($sp) # get alpha ++#ifndef TRMMKERNEL ++ blbc KC1, $Write_8x4 ++#else ++ blbc TEMP, $Write_8x4 ++#endif ++ ++ addl A,8*SIZE,A # 8a*1k ++ addl B,4*SIZE,B # 4b*1K ++ ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ VMAD a0,b2,t08,t08 ++ VMAD a0,b3,t12,t12 ++ ++ fillcs 0(PREA) ++ fillcs 4*SIZE(PREA) ++ subl PREA,8*SIZE,PREA ++ ++ VMAD a4,b0,t01,t01 ++ VMAD a4,b1,t05,t05 ++ VMAD a4,b2,t09,t09 ++ VMAD a4,b3,t13,t13 ++ ++$Write_8x4: ++ ++#ifndef TRMMKERNEL ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_8x4 ++ ++$Align_CO_Access_8x4: ++ VLD c00,0(CO) # get 1st colum of 16c ++ VLD c01,4*SIZE(CO) ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ jmp $Access_C1_8x4 ++ ++$UnAlign_CO_Access_8x4: ++ VLD_UL c00, 0*VEC_LEN*SIZE(CO) ++ VLD_UH c02, 1*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c01, 1*VEC_LEN*SIZE(CO) ++ VLD_UH c03, 2*VEC_LEN*SIZE(CO) ++ ++ vbisw c00,c02,c00 ++ vbisw c01,c03,c01 ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ ++ VST_UL t00, 
0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_8x4: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ addl CO,8*SIZE,CO ++ nop ++ bne $6,$UnAlign_C1_Access_8x4 ++ ++$Align_C1_Access_8x4: ++ VLD c04,0(C1) ++ VLD c05,4*SIZE(C1) ++ ++ VMAD t04,ALPHA,c04,t04 ++ VMAD t05,ALPHA,c05,t05 ++ ++ VST t04,0(C1) ++ VST t05,4*SIZE(C1) ++ jmp $Access_C2_8x4 ++ ++$UnAlign_C1_Access_8x4: ++ VLD_UL c04, 0*VEC_LEN*SIZE(C1) ++ VLD_UH c06, 1*VEC_LEN*SIZE(C1) ++ ++ VLD_UL c05, 1*VEC_LEN*SIZE(C1) ++ VLD_UH c07, 2*VEC_LEN*SIZE(C1) ++ ++ vbisw c04,c06,c04 ++ vbisw c05,c07,c05 ++ ++ VMAD t04,ALPHA,c04,t04 ++ VMAD t05,ALPHA,c05,t05 ++ ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ VST_UL t05, 1*VEC_LEN*SIZE(C1) ++ VST_UH t05, 2*VEC_LEN*SIZE(C1) ++ ++ ++$Access_C2_8x4: ++ and C2, (VEC_LEN*SIZE-1),$6 ++ addl C1,8*SIZE,C1 ++ nop ++ bne $6,$UnAlign_C2_Access_8x4 ++ ++$Align_C2_Access_8x4: ++ VLD c08,0(C2) ++ VLD c09,4*SIZE(C2) ++ ++ VMAD t08,ALPHA,c08,t08 ++ VMAD t09,ALPHA,c09,t09 ++ ++ VST t08,0(C2) ++ VST t09,4*SIZE(C2) ++ jmp $Access_C3_8x4 ++ ++$UnAlign_C2_Access_8x4: ++ VLD_UL c08, 0*VEC_LEN*SIZE(C2) ++ VLD_UH c10, 1*VEC_LEN*SIZE(C2) ++ ++ VLD_UL c09, 1*VEC_LEN*SIZE(C2) ++ VLD_UH c11, 2*VEC_LEN*SIZE(C2) ++ ++ vbisw c08,c10,c08 ++ vbisw c09,c11,c09 ++ ++ VMAD t08,ALPHA,c08,t08 ++ VMAD t09,ALPHA,c09,t09 ++ ++ VST_UL t08, 0*VEC_LEN*SIZE(C2) ++ VST_UH t08, 1*VEC_LEN*SIZE(C2) ++ ++ VST_UL t09, 1*VEC_LEN*SIZE(C2) ++ VST_UH t09, 2*VEC_LEN*SIZE(C2) ++ ++ ++$Access_C3_8x4: ++ and C3, (VEC_LEN*SIZE-1),$6 ++ addl C2,8*SIZE,C2 ++ nop ++ bne $6,$UnAlign_C3_Access_8x4 ++ ++$Align_C3_Access_8x4: ++ VLD c12,0(C3) ++ VLD c13,4*SIZE(C3) ++ ++ VMAD t12,ALPHA,c12,t12 ++ VMAD t13,ALPHA,c13,t13 ++ ++ VST t12,0(C3) ++ VST t13,4*SIZE(C3) ++ addl C3,8*SIZE,C3 ++ jmp .L16 ++ ++ ++$UnAlign_C3_Access_8x4: ++ VLD_UL c12, 0*VEC_LEN*SIZE(C3) ++ VLD_UH c14, 1*VEC_LEN*SIZE(C3) ++ ++ VLD_UL c13, 1*VEC_LEN*SIZE(C3) ++ VLD_UH c15, 2*VEC_LEN*SIZE(C3) ++ ++ vbisw c12,c14,c12 ++ vbisw c13,c15,c13 ++ ++ VMAD t12,ALPHA,c12,t12 ++ VMAD t13,ALPHA,c13,t13 ++ ++ VST_UL t12, 0*VEC_LEN*SIZE(C3) ++ VST_UH t12, 1*VEC_LEN*SIZE(C3) ++ ++ VST_UL t13, 1*VEC_LEN*SIZE(C3) ++ VST_UH t13, 2*VEC_LEN*SIZE(C3) ++ addl C3,8*SIZE,C3 ++ ++#else ++ ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_8x4 ++ ++$Align_CO_Access_8x4: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ jmp $Access_C1_8x4 ++ ++$UnAlign_CO_Access_8x4: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_8x4: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ addl CO,8*SIZE,CO # 8c ++ nop ++ bne $6,$UnAlign_C1_Access_8x4 ++ ++$Align_C1_Access_8x4: ++ VMUL t04,ALPHA,t04 ++ VMUL t05,ALPHA,t05 ++ ++ VST t04,0(C1) ++ VST t05,4*SIZE(C1) ++ jmp $Access_C2_8x4 ++ ++$UnAlign_C1_Access_8x4: ++ VMUL t04,ALPHA,t04 ++ VMUL t05,ALPHA,t05 ++ ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ VST_UL t05, 1*VEC_LEN*SIZE(C1) ++ VST_UH t05, 2*VEC_LEN*SIZE(C1) ++ ++ ++$Access_C2_8x4: ++ and C2, (VEC_LEN*SIZE-1),$6 ++ addl C1,8*SIZE,C1 # 8c ++ nop ++ bne $6,$UnAlign_C2_Access_8x4 ++ ++$Align_C2_Access_8x4: ++ VMUL t08,ALPHA,t08 ++ VMUL t09,ALPHA,t09 ++ ++ VST t08,0(C2) ++ VST t09,4*SIZE(C2) ++ jmp $Access_C3_8x4 ++ ++$UnAlign_C2_Access_8x4: ++ VMUL t08,ALPHA,t08 ++ VMUL t09,ALPHA,t09 ++ ++ VST_UL t08, 
0*VEC_LEN*SIZE(C2) ++ VST_UH t08, 1*VEC_LEN*SIZE(C2) ++ ++ VST_UL t09, 1*VEC_LEN*SIZE(C2) ++ VST_UH t09, 2*VEC_LEN*SIZE(C2) ++ ++ ++$Access_C3_8x4: ++ and C3, (VEC_LEN*SIZE-1),$6 ++ addl C2,8*SIZE,C2 # 8c ++ nop ++ bne $6,$UnAlign_C3_Access_8x4 ++ ++$Align_C3_Access_8x4: ++ VMUL t12,ALPHA,t12 ++ VMUL t13,ALPHA,t13 ++ ++ VST t12,0(C3) ++ VST t13,4*SIZE(C3) ++ addl C3,8*SIZE,C3 ++ jmp $TRMMKERNEL_8x4 ++ ++$UnAlign_C3_Access_8x4: ++ VMUL t12,ALPHA,t12 ++ VMUL t13,ALPHA,t13 ++ ++ VST_UL t12, 0*VEC_LEN*SIZE(C3) ++ VST_UH t12, 1*VEC_LEN*SIZE(C3) ++ ++ VST_UL t13, 1*VEC_LEN*SIZE(C3) ++ VST_UH t13, 2*VEC_LEN*SIZE(C3) ++ addl C3,8*SIZE,C3 ++ ++$TRMMKERNEL_8x4: ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 8,TEMP # mr=8 ++#else ++ subl TEMP, 4,TEMP # nr=4 ++#endif ++ ++ sll TEMP, 3 + BASE_SHIFT,KC ++ sll TEMP, 2 + BASE_SHIFT,TEMP ++ ++ addl A, KC, A # move A, B to the end of this panel ++ addl B, TEMP, B ++#endif ++ ++#ifdef LEFT ++ addl KK, 8, KK ++#endif ++#endif ++ ++ ++ ++ .align 5 ++ ++.L16: ++ and MC1,4,MC # nr=4,mr=4---------------------------- ++ sll KC1,2+BASE_SHIFT,SPANA # spana=kc1*mc ++ nop ++ beq MC,.L17 ++ ++ addl A1,SPANA,PREA ++ subl PREA,4*SIZE,PREA # PREA-=MC ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1,B # Set B ++ nop ++#else ++ sll KK, 2 + BASE_SHIFT,KC # mr=nr=4 ++ nop ++ ++ addl A, KC, A ++ addl B1,KC, B ++#endif ++ ++ vcpys $f31,$f31,t00 # clear 16 register ++ vcpys $f31,$f31,t04 ++ vcpys $f31,$f31,t08 ++ vcpys $f31,$f31,t12 ++ ++ LDDE b0,0(B) # get 4b ++ LDDE b1,1*SIZE(B) ++ LDDE b2,2*SIZE(B) ++ LDDE b3,3*SIZE(B) ++ ++ VLD a0,0(A) # get 4a ++ ++ fillcs 0(CO) # prefetch C ++ fillcs 0(C1) ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#else ++ addl KK, 4, TEMP ++#endif ++ sra TEMP,1,KC ++ nop ++ beq KC,$Rest_4x4x1 ++ ++#else ++ mov B1,B # Reset B ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ vcpys $f31,$f31,t00 # clear 16 register ++ vcpys $f31,$f31,t04 ++ vcpys $f31,$f31,t08 ++ vcpys $f31,$f31,t12 ++ ++ LDDE b0,0(B) # get 4b ++ LDDE b1,1*SIZE(B) ++ LDDE b2,2*SIZE(B) ++ LDDE b3,3*SIZE(B) ++ ++ VLD a0,0(A) # get 4a ++ ++ fillcs 0(CO) # prefetch C ++ fillcs 0(C1) ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++ beq KC,$Rest_4x4x1 ++ ++#endif ++ ++ ++$Panel_4x4x2: ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ VMAD a0,b2,t08,t08 ++ VMAD a0,b3,t12,t12 ++ ++ VLD a4,4*SIZE(A) ++ LDDE nb0,4*SIZE(B) # get next 4b and 4a ++ LDDE nb1,5*SIZE(B) ++ LDDE nb2,6*SIZE(B) ++ LDDE nb3,7*SIZE(B) ++ addl B,8*SIZE,B # 4b*2k ++ ++ fillcs 0(PREA) ++ subl PREA,4*SIZE,PREA ++ ++ subl KC,1,KC ++ VMAD a4,nb0,t00,t00 ++ VMAD a4,nb1,t04,t04 ++ VMAD a4,nb2,t08,t08 ++ VMAD a4,nb3,t12,t12 ++ ++ addl A,8*SIZE,A # 4a*2k ++ LDDE b0,0(B) # get 3rd 4b and 4a ++ LDDE b1,1*SIZE(B) ++ LDDE b2,2*SIZE(B) ++ LDDE b3,3*SIZE(B) ++ VLD a0,0(A) ++ ++ fillcs 0(PREA) ++ subl PREA,4*SIZE,PREA ++ bne KC,$Panel_4x4x2 ++ ++ ++$Rest_4x4x1: ++ LDDE ALPHA, 192($sp) # Get ALPHA ++#ifndef TRMMKERNEL ++ blbc KC1, $Write_4x4 ++#else ++ blbc TEMP, $Write_4x4 ++#endif ++ ++ addl A,4*SIZE,A # 4a*1k ++ addl B,4*SIZE,B # 4b*1K ++ ++ fillcs 0(PREA) ++ subl PREA,4*SIZE,PREA ++ ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ VMAD a0,b2,t08,t08 ++ VMAD a0,b3,t12,t12 ++ ++ ++$Write_4x4: ++ ++#ifndef TRMMKERNEL ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_4x4 ++ ++$Align_CO_Access_4x4: ++ VLD 
c00,0(CO) # get 1st colum of 16c ++ VMAD t00,ALPHA,c00,t00 ++ VST t00,0(CO) ++ jmp $Access_C1_4x4 ++ ++$UnAlign_CO_Access_4x4: ++ VLD_UL c00, 0*VEC_LEN*SIZE(CO) ++ VLD_UH c02, 1*VEC_LEN*SIZE(CO) ++ ++ vbisw c00,c02,c00 ++ ++ VMAD t00,ALPHA,c00,t00 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_4x4: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ addl CO,4*SIZE,CO # 4c ++ nop ++ bne $6,$UnAlign_C1_Access_4x4 ++ ++$Align_C1_Access_4x4: ++ VLD c04,0(C1) ++ VMAD t04,ALPHA,c04,t04 ++ VST t04,0(C1) ++ jmp $Access_C2_4x4 ++ ++$UnAlign_C1_Access_4x4: ++ VLD_UL c04, 0*VEC_LEN*SIZE(C1) ++ VLD_UH c06, 1*VEC_LEN*SIZE(C1) ++ ++ vbisw c04,c06,c04 ++ ++ VMAD t04,ALPHA,c04,t04 ++ ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ ++$Access_C2_4x4: ++ and C2, (VEC_LEN*SIZE-1),$6 ++ addl C1,4*SIZE,C1 # 4c ++ nop ++ bne $6,$UnAlign_C2_Access_4x4 ++ ++$Align_C2_Access_4x4: ++ VLD c08,0(C2) ++ VMAD t08,ALPHA,c08,t08 ++ VST t08,0(C2) ++ jmp $Access_C3_4x4 ++ ++$UnAlign_C2_Access_4x4: ++ VLD_UL c08, 0*VEC_LEN*SIZE(C2) ++ VLD_UH c10, 1*VEC_LEN*SIZE(C2) ++ ++ vbisw c08,c10,c08 ++ ++ VMAD t08,ALPHA,c08,t08 ++ ++ VST_UL t08, 0*VEC_LEN*SIZE(C2) ++ VST_UH t08, 1*VEC_LEN*SIZE(C2) ++ ++ ++$Access_C3_4x4: ++ and C3, (VEC_LEN*SIZE-1),$6 ++ addl C2,4*SIZE,C2 # 4c ++ nop ++ bne $6,$UnAlign_C3_Access_4x4 ++ ++$Align_C3_Access_4x4: ++ VLD c12,0(C3) ++ VMAD t12,ALPHA,c12,t12 ++ VST t12,0(C3) ++ addl C3,4*SIZE,C3 ++ jmp .L17 ++ ++$UnAlign_C3_Access_4x4: ++ VLD_UL c12, 0*VEC_LEN*SIZE(C3) ++ VLD_UH c14, 1*VEC_LEN*SIZE(C3) ++ ++ vbisw c12,c14,c12 ++ ++ VMAD t12,ALPHA,c12,t12 ++ ++ VST_UL t12, 0*VEC_LEN*SIZE(C3) ++ VST_UH t12, 1*VEC_LEN*SIZE(C3) ++ addl C3,4*SIZE,C3 ++ ++ ++#else ++ ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_4x4 ++ ++$Align_CO_Access_4x4: ++ VMUL t00,ALPHA,t00 ++ VST t00,0(CO) ++ jmp $Access_C1_4x4 ++ ++$UnAlign_CO_Access_4x4: ++ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_4x4: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ addl CO,4*SIZE,CO # 4c ++ nop ++ bne $6,$UnAlign_C1_Access_4x4 ++ ++$Align_C1_Access_4x4: ++ VMUL t04,ALPHA,t04 ++ VST t04,0(C1) ++ jmp $Access_C2_4x4 ++ ++$UnAlign_C1_Access_4x4: ++ VMUL t04,ALPHA,t04 ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ ++$Access_C2_4x4: ++ and C2, (VEC_LEN*SIZE-1),$6 ++ addl C1,4*SIZE,C1 # 4c ++ nop ++ bne $6,$UnAlign_C2_Access_4x4 ++ ++$Align_C2_Access_4x4: ++ VMUL t08,ALPHA,t08 ++ VST t08,0(C2) ++ jmp $Access_C3_4x4 ++ ++$UnAlign_C2_Access_4x4: ++ VMUL t08,ALPHA,t08 ++ VST_UL t08, 0*VEC_LEN*SIZE(C2) ++ VST_UH t08, 1*VEC_LEN*SIZE(C2) ++ ++ ++$Access_C3_4x4: ++ and C3, (VEC_LEN*SIZE-1),$6 ++ addl C2,4*SIZE,C2 # 4c ++ nop ++ bne $6,$UnAlign_C3_Access_4x4 ++ ++$Align_C3_Access_4x4: ++ VMUL t12,ALPHA,t12 ++ VST t12,0(C3) ++ addl C3,4*SIZE,C3 ++ jmp $TRMMKERNEL_4x4 ++ ++$UnAlign_C3_Access_4x4: ++ VMUL t12,ALPHA,t12 ++ VST_UL t12, 0*VEC_LEN*SIZE(C3) ++ VST_UH t12, 1*VEC_LEN*SIZE(C3) ++ addl C3,4*SIZE,C3 ++ ++$TRMMKERNEL_4x4: ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++ subl TEMP, 4, TEMP # mr=nr=4 ++ ++ sll TEMP, 2 + BASE_SHIFT,KC ++ nop ++ ++ addl A, KC, A # move A B to the end of this panel ++ addl B, KC, B ++#endif ++ ++#ifdef LEFT ++ addl KK, 4, KK ++#endif ++#endif ++ ++ ++ ++ ++ .align 5 ++.L17: # nr=4,mr=2-------------------- ++ and MC1,2,MC ++ beq MC,.L18 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && 
defined(TRANSA))\ ++ || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++#else ++ sll KK, 1 + BASE_SHIFT, KC # mr=2 ++ sll KK, 2 + BASE_SHIFT, TEMP # nr=4 ++ ++ addl A, KC, A ++ addl B1,TEMP, B ++#endif ++ ++ fclr t00 # CLEAR 8 register ++ fclr t01 ++ fclr t04 ++ fclr t05 ++ fclr t08 ++ fclr t09 ++ fclr t12 ++ fclr t13 ++ ++ LD b0,0(B) # get 4b ++ LD b1,1*SIZE(B) ++ LD a0,0(A) # get 2a ++ LD b2,2*SIZE(B) ++ LD b3,3*SIZE(B) ++ LD a4,1*SIZE(A) ++ ++ fillcs 0(CO) # prefetch C ++ fillcs 0(C1) ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 2, TEMP # mr=2 ++#else ++ addl KK, 4, TEMP # nr=4 ++#endif ++ sra TEMP, 1, KC ++ beq KC,$Rest_2x4x1 ++ ++#else ++ mov B1,B # reset B ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ fclr t00 # CLEAR 8 register ++ fclr t01 ++ fclr t04 ++ fclr t05 ++ fclr t08 ++ fclr t09 ++ fclr t12 ++ fclr t13 ++ ++ LD b0,0(B) # get 4b ++ LD b1,1*SIZE(B) ++ LD a0,0(A) # get 2a ++ LD b2,2*SIZE(B) ++ LD b3,3*SIZE(B) ++ LD a4,1*SIZE(A) ++ ++ fillcs 0(CO) # prefetch C ++ fillcs 0(C1) ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++ beq KC,$Rest_2x4x1 ++#endif ++ ++ ++$Panel_2x4x2: ++ MAD a0,b0,t00,t00 ++ MAD a0,b1,t04,t04 ++ MAD a0,b2,t08,t08 ++ MAD a0,b3,t12,t12 ++ ++ LD nb0,4*SIZE(B) # get next 4b and 2a ++ LD nb1,5*SIZE(B) ++ LD a8,2*SIZE(A) ++ LD nb2,6*SIZE(B) ++ LD nb3,7*SIZE(B) ++ LD a12,3*SIZE(A) ++ addl B,8*SIZE,B # 4b*2k ++ ++ MAD a4,b0,t01,t01 ++ MAD a4,b1,t05,t05 ++ MAD a4,b2,t09,t09 ++ MAD a4,b3,t13,t13 ++ ++ subl KC,1,KC ++ MAD a8,nb0,t00,t00 ++ MAD a8,nb1,t04,t04 ++ MAD a8,nb2,t08,t08 ++ MAD a8,nb3,t12,t12 ++ ++ addl A,4*SIZE,A # 2a*2k ++ LD b0,0(B) # get 3rd 4b and 2a ++ LD b1,1*SIZE(B) ++ LD a0,0(A) ++ LD b2,2*SIZE(B) ++ LD b3,3*SIZE(B) ++ LD a4,1*SIZE(A) ++ ++ MAD a12,nb0,t01,t01 ++ MAD a12,nb1,t05,t05 ++ MAD a12,nb2,t09,t09 ++ MAD a12,nb3,t13,t13 ++ ++ bne KC,$Panel_2x4x2 ++ ++ ++$Rest_2x4x1: ++ LD ALPHA, 192($sp) # get alpha ++#ifndef TRMMKERNEL ++ blbc KC1, $Write_2x4 ++#else ++ blbc TEMP, $Write_2x4 ++#endif ++ ++ addl A,2*SIZE,A # 2a*1k ++ addl B,4*SIZE,B # 4b*1K ++ ++ MAD a0,b0,t00,t00 ++ MAD a0,b1,t04,t04 ++ MAD a0,b2,t08,t08 ++ MAD a0,b3,t12,t12 ++ ++ MAD a4,b0,t01,t01 ++ MAD a4,b1,t05,t05 ++ MAD a4,b2,t09,t09 ++ MAD a4,b3,t13,t13 ++ ++$Write_2x4: ++#ifndef TRMMKERNEL ++ LD c00,0(CO) ++ LD c01,1*SIZE(CO) ++ LD c04,0(C1) ++ LD c05,1*SIZE(C1) ++ ++ MAD t00,ALPHA,c00,t00 ++ MAD t01,ALPHA,c01,t01 ++ ++ LD c08,0(C2) ++ LD c09,1*SIZE(C2) ++ ++ MAD t04,ALPHA,c04,t04 ++ MAD t05,ALPHA,c05,t05 ++ ++ LD c12,0(C3) ++ LD c13,1*SIZE(C3) ++ ++ MAD t08,ALPHA,c08,t08 ++ MAD t09,ALPHA,c09,t09 ++ ++ addl CO,2*SIZE,CO # 2c ++ addl C1,2*SIZE,C1 ++ addl C2,2*SIZE,C2 ++ addl C3,2*SIZE,C3 ++ ++ ST t00,-2*SIZE(CO) # 2c ++ ST t01,-1*SIZE(CO) ++ ++ MAD t12,ALPHA,c12,t12 ++ MAD t13,ALPHA,c13,t13 ++ ++ ST t04,-2*SIZE(C1) ++ ST t05,-1*SIZE(C1) ++ ++ ST t08,-2*SIZE(C2) ++ ST t09,-1*SIZE(C2) ++ ++ ST t12,-2*SIZE(C3) ++ ST t13,-1*SIZE(C3) ++ ++#else ++ MUL t00,ALPHA,t00 ++ MUL t01,ALPHA,t01 ++ ++ MUL t04,ALPHA,t04 ++ MUL t05,ALPHA,t05 ++ ++ MUL t08,ALPHA,t08 ++ MUL t09,ALPHA,t09 ++ ++ addl CO,2*SIZE,CO # 2c ++ addl C1,2*SIZE,C1 ++ addl C2,2*SIZE,C2 ++ addl C3,2*SIZE,C3 ++ ++ ST t00,-2*SIZE(CO) # 2c ++ ST t01,-1*SIZE(CO) ++ ++ MUL t12,ALPHA,t12 ++ MUL t13,ALPHA,t13 ++ ++ ST t04,-2*SIZE(C1) ++ ST t05,-1*SIZE(C1) ++ ++ ST t08,-2*SIZE(C2) ++ ST t09,-1*SIZE(C2) ++ ++ ST t12,-2*SIZE(C3) ++ ST t13,-1*SIZE(C3) ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && 
!defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 2, TEMP ++#else ++ subl TEMP, 4, TEMP ++#endif ++ ++ sll TEMP, 1 + BASE_SHIFT,KC ++ sll TEMP, 2 + BASE_SHIFT,TEMP ++ ++ addl A, KC, A ++ addl B, TEMP, B ++#endif ++ ++#ifdef LEFT ++ addl KK,2,KK ++#endif ++#endif ++ ++ ++ ++.align 5 ++.L18: # nr=4,mr=1--------------------------- ++ and MC1,1,MC ++ beq MC,$End_NC_Unroll4 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++ nop ++#else ++ sll KK, BASE_SHIFT, KC # mr=1 ++ sll KK, 2 + BASE_SHIFT,TEMP # nr=4 ++ ++ addl A, KC, A ++ addl B1,TEMP, B ++#endif ++ ++ fclr t00 # clear 4 regitster ++ fclr t04 ++ fclr t08 ++ fclr t12 ++ ++ LD b0,0(B) # get 4b ++ LD b1,1*SIZE(B) ++ LD b2,2*SIZE(B) ++ LD b3,3*SIZE(B) ++ ++ LD a0,0(A) # get 1 a ++ ++ fillcs 0(CO) # prefetch C ++ fillcs 0(C1) ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 1, TEMP # mr=1 ++#else ++ addl KK, 4,TEMP # nr=4 ++#endif ++ sra TEMP,1,KC ++ beq KC,$Rest_1x4x1 ++ ++#else ++ mov B1,B # Reset B ++ fclr t00 # clear 4 regitster ++ fclr t04 ++ fclr t08 ++ fclr t12 ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ ++ LD b0,0(B) # get 4b ++ LD b1,1*SIZE(B) ++ LD b2,2*SIZE(B) ++ LD b3,3*SIZE(B) ++ ++ LD a0,0(A) # get 1 a ++ ++ fillcs 0(CO) # prefetch C ++ fillcs 0(C1) ++ fillcs 0(C2) ++ fillcs 0(C3) ++ ++ beq KC,$Rest_1x4x1 ++ ++#endif ++ ++ ++$Panel_1x4x2: ++ MAD a0,b0,t00,t00 ++ MAD a0,b1,t04,t04 ++ MAD a0,b2,t08,t08 ++ MAD a0,b3,t12,t12 ++ ++ LD a8,1*SIZE(A) ++ LD nb0,4*SIZE(B) ++ LD nb1,5*SIZE(B) ++ LD nb2,6*SIZE(B) ++ LD nb3,7*SIZE(B) ++ ++ addl B,8*SIZE,B # 4b*2k ++ ++ subl KC,1,KC ++ MAD a8,nb0,t00,t00 ++ MAD a8,nb1,t04,t04 ++ MAD a8,nb2,t08,t08 ++ MAD a8,nb3,t12,t12 ++ ++ addl A,2*SIZE,A # 1a*2k ++ LD a0,0(A) # get 3rd 4b and 1a ++ LD b0,0(B) ++ LD b1,1*SIZE(B) ++ LD b2,2*SIZE(B) ++ LD b3,3*SIZE(B) ++ bne KC,$Panel_1x4x2 ++ ++ ++$Rest_1x4x1: ++ LD ALPHA,192($sp) # get alpha ++#ifndef TRMMKERNEL ++ blbc KC1, $Write_1x4 ++#else ++ blbc TEMP, $Write_1x4 ++#endif ++ ++ addl A,1*SIZE,A # 1m*1k*8Byte ++ addl B,4*SIZE,B # 4n*1K*8Byte ++ ++ MAD a0,b0,t00,t00 ++ MAD a0,b1,t04,t04 ++ MAD a0,b2,t08,t08 ++ MAD a0,b3,t12,t12 ++ ++ ++$Write_1x4: ++#ifndef TRMMKERNEL ++ LD c00,0(CO) ++ LD c04,0(C1) ++ MAD t00,ALPHA,c00,t00 ++ MAD t04,ALPHA,c04,t04 ++ LD c08,0(C2) ++ LD c12,0(C3) ++ MAD t08,ALPHA,c08,t08 ++ MAD t12,ALPHA,c12,t12 ++ ST t00,0(CO) ++ ST t04,0(C1) ++ ST t08,0(C2) ++ ST t12,0(C3) ++ ++#else ++ MUL t00,ALPHA,t00 ++ MUL t04,ALPHA,t04 ++ MUL t08,ALPHA,t08 ++ MUL t12,ALPHA,t12 ++ ++ ST t00,0(CO) ++ ST t04,0(C1) ++ ST t08,0(C2) ++ ST t12,0(C3) ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 1, TEMP ++#else ++ subl TEMP, 4, TEMP ++#endif ++ ++ sll TEMP, BASE_SHIFT, KC ++ sll TEMP, 2 + BASE_SHIFT, TEMP ++ ++ addl A, KC, A ++ addl B, TEMP,B ++#endif ++ ++#ifdef LEFT ++ addl KK, 1,KK ++#endif ++#endif ++ ++ ++ .align 5 ++ ++$End_NC_Unroll4: ++ subl NC,1,NC # Loop N -- ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 4, KK ++ nop ++#endif ++ mov A1,A # Reset A ++ mov B, B1 # mov B1 to the next panel ++ bne NC,.L0 ++ ++ ++ ++ ++ .align 5 ++$Begin_NC_Unroll2: ++ ++ and NC1, 2, NC ++ beq NC, $Begin_NC_Unroll1 ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK # reset KK ++#endif ++ ++ mov C,CO ++ addl C,LDM,C1 ++ ++ sra MC1,4,MC # 
MC=MC1/16 ++ sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC ++ ++ addl A1,SPANA,PREA ++ subl PREA,16*SIZE,PREA ++ ++ addl C1,LDM,C # C=C1+LDM, Mov C to Next Panel ++ beq MC,.L25 # MC=0:MC1<16 ++ ++ ++ .align 5 ++.L2: # nr=2,mr=16------------------- ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA))\ ++ || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1,B ++#else ++ sll KK, 4 + BASE_SHIFT,KC # mr=16 ++ sll KK, 1 + BASE_SHIFT,TEMP # nr=2 ++ ++ addl A,KC,A ++ addl B1,TEMP,B ++#endif ++ ++ vcpys $f31,$f31,t00 # CLEAR Results Register ++ vcpys $f31,$f31,t01 ++ vcpys $f31,$f31,t02 ++ vcpys $f31,$f31,t03 ++ ++ LDDE b0,0(B) ++ LDDE b1,1*SIZE(B) ++ ++ VLD a0,0(A) # Get 16 A and 2 B ++ VLD a4,4*SIZE(A) ++ VLD a8,8*SIZE(A) ++ VLD a12,12*SIZE(A) ++ ++ vcpys $f31,$f31,t04 ++ vcpys $f31,$f31,t06 ++ vcpys $f31,$f31,t05 ++ vcpys $f31,$f31,t07 ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ fillcs 8*SIZE(CO) ++ fillcs 8*SIZE(C1) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 16, TEMP # mr=16 ++#else ++ addl KK, 2, TEMP # nr=2 ++#endif ++ sra TEMP, 1, KC ++ nop ++ beq KC,$Rest_16x2x1 ++ ++#else ++ ++ mov B1,B # Set B ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ vcpys $f31,$f31,t00 # CLEAR Results Register ++ vcpys $f31,$f31,t01 ++ vcpys $f31,$f31,t02 ++ vcpys $f31,$f31,t03 ++ ++ LDDE b0,0(B) ++ LDDE b1,1*SIZE(B) ++ ++ VLD a0,0(A) # Get 16 A and 2 B ++ VLD a4,4*SIZE(A) ++ VLD a8,8*SIZE(A) ++ VLD a12,12*SIZE(A) ++ ++ vcpys $f31,$f31,t04 ++ vcpys $f31,$f31,t06 ++ vcpys $f31,$f31,t05 ++ vcpys $f31,$f31,t07 ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ fillcs 8*SIZE(CO) ++ fillcs 8*SIZE(C1) ++ ++ beq KC,$Rest_16x2x1 ++ ++#endif ++ ++ ++$Panel_16x2x2: ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ ++ addl A,16*SIZE,A # 16m*1k ++ LDDE nb0,2*SIZE(B) ++ LDDE nb1,3*SIZE(B) ++ ++ VMAD a4,b0,t01,t01 ++ VMAD a4,b1,t05,t05 ++ ++ addl B,4*SIZE,B # 2n*2k ++ VLD na0,0(A) ++ VLD na4,4*SIZE(A) ++ VLD na8,8*SIZE(A) ++ VLD na12,12*SIZE(A) ++ ++ VMAD a8,b0,t02,t02 ++ VMAD a8,b1,t06,t06 ++ ++ VMAD a12,b0,t03,t03 ++ VMAD a12,b1,t07,t07 ++ ++ fillcs 0(PREA) ++ fillcs 8*SIZE(PREA) ++ subl PREA,16*SIZE,PREA ++ ++ subl KC,1,KC ++ VMAD na0,nb0,t00,t00 ++ VMAD na0,nb1,t04,t04 ++ ++ addl A,16*SIZE,A # 16m*1k ++ LDDE b0,0(B) ++ LDDE b1,1*SIZE(B) ++ ++ VMAD na4,nb0,t01,t01 ++ VMAD na4,nb1,t05,t05 ++ ++ VLD a0,0(A) # get 3rd 16a ++ VLD a4,4*SIZE(A) ++ VLD a8,8*SIZE(A) ++ VLD a12,12*SIZE(A) ++ ++ VMAD na8,nb0,t02,t02 ++ VMAD na8,nb1,t06,t06 ++ ++ VMAD na12,nb0,t03,t03 ++ VMAD na12,nb1,t07,t07 ++ ++ fillcs 0(PREA) ++ fillcs 8*SIZE(PREA) ++ subl PREA,16*SIZE,PREA ++ bne KC,$Panel_16x2x2 ++ ++ ++$Rest_16x2x1: ++ LDDE ALPHA, 192($sp) # get alpha ++#ifndef TRMMKERNEL ++ blbc KC1, $Write_16x2 ++#else ++ blbc TEMP, $Write_16x2 ++#endif ++ ++ addl A,16*SIZE,A # 16m*1k ++ addl B,2*SIZE,B # 2n*1k ++ ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ ++ fillcs 0(PREA) ++ fillcs 8*SIZE(PREA) ++ subl PREA,16*SIZE,PREA ++ ++ VMAD a4,b0,t01,t01 ++ VMAD a4,b1,t05,t05 ++ VMAD a8,b0,t02,t02 ++ VMAD a8,b1,t06,t06 ++ VMAD a12,b0,t03,t03 ++ VMAD a12,b1,t07,t07 ++ ++ ++$Write_16x2: ++ ++#ifndef TRMMKERNEL ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_16x2 ++ ++$Align_CO_Access_16x2: ++ VLD c00,0(CO) # get 1st colum of 16c ++ VLD c01,4*SIZE(CO) ++ VLD c02,8*SIZE(CO) ++ VLD c03,12*SIZE(CO) ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ VMAD t02,ALPHA,c02,t02 ++ VMAD t03,ALPHA,c03,t03 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ VST 
t02,8*SIZE(CO) ++ VST t03,12*SIZE(CO) ++ jmp $Access_C1_16x2 ++ ++$UnAlign_CO_Access_16x2: ++ VLD_UL c00, 0*VEC_LEN*SIZE(CO) ++ VLD_UH c04, 1*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c01, 1*VEC_LEN*SIZE(CO) ++ VLD_UH c05, 2*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c02, 2*VEC_LEN*SIZE(CO) ++ VLD_UH c06, 3*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c03, 3*VEC_LEN*SIZE(CO) ++ VLD_UH c07, 4*VEC_LEN*SIZE(CO) ++ ++ vbisw c00,c04,c00 ++ vbisw c01,c05,c01 ++ vbisw c02,c06,c02 ++ vbisw c03,c07,c03 ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ VMAD t02,ALPHA,c02,t02 ++ VMAD t03,ALPHA,c03,t03 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ VST_UL t02, 2*VEC_LEN*SIZE(CO) ++ VST_UH t02, 3*VEC_LEN*SIZE(CO) ++ ++ VST_UL t03, 3*VEC_LEN*SIZE(CO) ++ VST_UH t03, 4*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_16x2: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_C1_Access_16x2 ++ ++$Align_C1_Access_16x2: ++ VLD c04,0(C1) ++ VLD c05,4*SIZE(C1) ++ VLD c06,8*SIZE(C1) ++ VLD c07,12*SIZE(C1) ++ ++ VMAD t04,ALPHA,c04,t04 ++ VMAD t05,ALPHA,c05,t05 ++ VMAD t06,ALPHA,c06,t06 ++ VMAD t07,ALPHA,c07,t07 ++ ++ VST t04,0(C1) ++ VST t05,4*SIZE(C1) ++ VST t06,8*SIZE(C1) ++ VST t07,12*SIZE(C1) ++ jmp $End_NC_Unroll2 ++ ++$UnAlign_C1_Access_16x2: ++ VLD_UL c04, 0*VEC_LEN*SIZE(C1) ++ VLD_UH t00, 1*VEC_LEN*SIZE(C1) ++ ++ VLD_UL c05, 1*VEC_LEN*SIZE(C1) ++ VLD_UH t01, 2*VEC_LEN*SIZE(C1) ++ ++ VLD_UL c06, 2*VEC_LEN*SIZE(C1) ++ VLD_UH t02, 3*VEC_LEN*SIZE(C1) ++ ++ VLD_UL c07, 3*VEC_LEN*SIZE(C1) ++ VLD_UH t03, 4*VEC_LEN*SIZE(C1) ++ ++ vbisw c04,t00,c04 ++ vbisw c05,t01,c05 ++ vbisw c06,t02,c06 ++ vbisw c07,t03,c07 ++ ++ VMAD t04,ALPHA,c04,t04 ++ VMAD t05,ALPHA,c05,t05 ++ VMAD t06,ALPHA,c06,t06 ++ VMAD t07,ALPHA,c07,t07 ++ ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ VST_UL t05, 1*VEC_LEN*SIZE(C1) ++ VST_UH t05, 2*VEC_LEN*SIZE(C1) ++ ++ VST_UL t06, 2*VEC_LEN*SIZE(C1) ++ VST_UH t06, 3*VEC_LEN*SIZE(C1) ++ ++ VST_UL t07, 3*VEC_LEN*SIZE(C1) ++ VST_UH t07, 4*VEC_LEN*SIZE(C1) ++ jmp $End_NC_Unroll2 # loop m finished ++ ++ ++#else ++ ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_16x2 ++ ++$Align_CO_Access_16x2: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ VMUL t02,ALPHA,t02 ++ VMUL t03,ALPHA,t03 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ VST t02,8*SIZE(CO) ++ VST t03,12*SIZE(CO) ++ jmp $Access_C1_16x2 ++ ++$UnAlign_CO_Access_16x2: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ VMUL t02,ALPHA,t02 ++ VMUL t03,ALPHA,t03 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ VST_UL t02, 2*VEC_LEN*SIZE(CO) ++ VST_UH t02, 3*VEC_LEN*SIZE(CO) ++ ++ VST_UL t03, 3*VEC_LEN*SIZE(CO) ++ VST_UH t03, 4*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_16x2: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_C1_Access_16x2 ++ ++$Align_C1_Access_16x2: ++ VMUL t04,ALPHA,t04 ++ VMUL t05,ALPHA,t05 ++ VMUL t06,ALPHA,t06 ++ VMUL t07,ALPHA,t07 ++ ++ VST t04,0(C1) ++ VST t05,4*SIZE(C1) ++ VST t06,8*SIZE(C1) ++ VST t07,12*SIZE(C1) ++ jmp $TRMMKERNEL_16x2 ++ ++$UnAlign_C1_Access_16x2: ++ VMUL t04,ALPHA,t04 ++ VMUL t05,ALPHA,t05 ++ VMUL t06,ALPHA,t06 ++ VMUL t07,ALPHA,t07 ++ ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ VST_UL t05, 1*VEC_LEN*SIZE(C1) ++ VST_UH t05, 2*VEC_LEN*SIZE(C1) ++ ++ VST_UL t06, 2*VEC_LEN*SIZE(C1) ++ VST_UH t06, 3*VEC_LEN*SIZE(C1) ++ ++ VST_UL t07, 3*VEC_LEN*SIZE(C1) ++ VST_UH t07, 4*VEC_LEN*SIZE(C1) ++ ++$TRMMKERNEL_16x2: 
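++# TRMM bookkeeping for the 16x2 tile (summary comment): move A and B to the end of the processed panel under the LEFT/TRANSA guard, and bump KK by mr=16 when LEFT is defined.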
++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 16, TEMP ++#else ++ subl TEMP, 2, TEMP ++#endif ++ ++ sll TEMP, 4 + BASE_SHIFT,KC ++ sll TEMP, 1 + BASE_SHIFT,TEMP ++ ++ addl A, KC, A ++ addl B, TEMP,B ++#endif ++ ++#ifdef LEFT ++ addl KK, 16, KK ++ nop ++#endif ++ ++ jmp $End_NC_Unroll2 # loop m finished ++#endif ++ ++ ++ ++ .align 5 ++ ++.L25: ++ and MC1,8,MC ++ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc ++ nop ++ beq MC,.L26 ++ ++ addl A1,SPANA,PREA ++ subl PREA,8*SIZE,PREA # PREA-=MC ++ ++ ++ .align 5 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA))\ ++ || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++#else ++ sll KK, 3 + BASE_SHIFT,KC # mr=8 ++ sll KK, 1 + BASE_SHIFT,TEMP # nr=2 ++ ++ addl A,KC, A ++ addl B1,TEMP,B ++#endif ++ ++ vcpys $f31,$f31,t00 # clear 16 registers ++ vcpys $f31,$f31,t01 ++ ++ LDDE b0,0(B) # Get 2b ++ LDDE b1,1*SIZE(B) ++ ++ vcpys $f31,$f31,t04 ++ vcpys $f31,$f31,t05 ++ ++ VLD a0,0(A) # Get 8a ++ VLD a4,4*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ fillcs 4*SIZE(CO) ++ fillcs 4*SIZE(C1) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 8, TEMP # mr=8 ++#else ++ addl KK, 2, TEMP # nr=2 ++#endif ++ sra TEMP, 1,KC ++ nop ++ beq KC,$Rest_8x2x1 ++ ++#else ++ ++ mov B1, B ++ sra KC1,1,KC ++ vcpys $f31,$f31,t00 # clear 16 registers ++ vcpys $f31,$f31,t01 ++ ++ LDDE b0,0(B) # Get 2b ++ LDDE b1,1*SIZE(B) ++ ++ vcpys $f31,$f31,t04 ++ vcpys $f31,$f31,t05 ++ ++ VLD a0,0(A) # Get 8a ++ VLD a4,4*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ fillcs 4*SIZE(CO) ++ fillcs 4*SIZE(C1) ++ ++ beq KC,$Rest_8x2x1 ++#endif ++ ++ ++$Panel_8x2x2: ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ ++ LDDE nb0,2*SIZE(B) # get next 2b ++ LDDE nb1,3*SIZE(B) ++ ++ VMAD a4,b0,t01,t01 ++ VMAD a4,b1,t05,t05 ++ ++ addl B,4*SIZE,B # 2n*2k ++ VLD na8,8*SIZE(A) # get next 8a ++ VLD na12,12*SIZE(A) ++ ++ fillcs 0(PREA) ++ fillcs 4*SIZE(PREA) ++ subl PREA,8*SIZE,PREA ++ ++ subl KC,1,KC ++ VMAD na8,nb0,t00,t00 ++ VMAD na8,nb1,t04,t04 ++ ++ addl A,16*SIZE,A # 8m*2k ++ LDDE b0,0(B) ++ LDDE b1,1*SIZE(B) # get 3rd 2b ++ ++ VMAD na12,nb0,t01,t01 ++ VMAD na12,nb1,t05,t05 ++ ++ VLD a0,0(A) # get 3rd 8a ++ VLD a4,4*SIZE(A) ++ ++ fillcs 0(PREA) ++ fillcs 4*SIZE(PREA) ++ subl PREA,8*SIZE,PREA ++ bne KC,$Panel_8x2x2 ++ ++ ++$Rest_8x2x1: ++ LDDE ALPHA,192($sp) # get alpha ++#ifndef TRMMKERNEL ++ blbc KC1,$Write_8x2 ++#else ++ blbc TEMP,$Write_8x2 ++#endif ++ ++ addl A,8*SIZE,A # 8m*1k ++ addl B,2*SIZE,B # 2n*1K ++ ++ fillcs 0(PREA) ++ fillcs 4*SIZE(PREA) ++ subl PREA,8*SIZE,PREA ++ ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ VMAD a4,b0,t01,t01 ++ VMAD a4,b1,t05,t05 ++ ++ ++$Write_8x2: ++ ++#ifndef TRMMKERNEL ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_8x2 ++ ++$Align_CO_Access_8x2: ++ VLD c00,0(CO) # get 1st colum of 16c ++ VLD c01,4*SIZE(CO) ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ jmp $Access_C1_8x2 ++ ++$UnAlign_CO_Access_8x2: ++ VLD_UL c00, 0*VEC_LEN*SIZE(CO) ++ VLD_UH c02, 1*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c01, 1*VEC_LEN*SIZE(CO) ++ VLD_UH c03, 2*VEC_LEN*SIZE(CO) ++ ++ vbisw c00,c02,c00 ++ vbisw c01,c03,c01 ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ 
++$Access_C1_8x2: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ addl CO,8*SIZE,CO # 8c ++ nop ++ bne $6,$UnAlign_C1_Access_8x2 ++ ++$Align_C1_Access_8x2: ++ VLD c04,0(C1) ++ VLD c05,4*SIZE(C1) ++ ++ VMAD t04,ALPHA,c04,t04 ++ VMAD t05,ALPHA,c05,t05 ++ ++ VST t04,0(C1) ++ VST t05,4*SIZE(C1) ++ addl C1,8*SIZE,C1 ++ jmp .L26 ++ ++$UnAlign_C1_Access_8x2: ++ VLD_UL c04, 0*VEC_LEN*SIZE(C1) ++ VLD_UH c06, 1*VEC_LEN*SIZE(C1) ++ ++ VLD_UL c05, 1*VEC_LEN*SIZE(C1) ++ VLD_UH c07, 2*VEC_LEN*SIZE(C1) ++ ++ vbisw c04,c06,c04 ++ vbisw c05,c07,c05 ++ ++ VMAD t04,ALPHA,c04,t04 ++ VMAD t05,ALPHA,c05,t05 ++ ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ VST_UL t05, 1*VEC_LEN*SIZE(C1) ++ VST_UH t05, 2*VEC_LEN*SIZE(C1) ++ addl C1,8*SIZE,C1 ++ ++#else ++ ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_8x2 ++ ++$Align_CO_Access_8x2: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ jmp $Access_C1_8x2 ++ ++$UnAlign_CO_Access_8x2: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_8x2: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ addl CO,8*SIZE,CO # 8c ++ nop ++ bne $6,$UnAlign_C1_Access_8x2 ++ ++$Align_C1_Access_8x2: ++ VMUL t04,ALPHA,t04 ++ VMUL t05,ALPHA,t05 ++ ++ VST t04,0(C1) ++ VST t05,4*SIZE(C1) ++ addl C1,8*SIZE,C1 ++ jmp $TRMMKERNEL_8x2 ++ ++$UnAlign_C1_Access_8x2: ++ VMUL t04,ALPHA,t04 ++ VMUL t05,ALPHA,t05 ++ ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ ++ VST_UL t05, 1*VEC_LEN*SIZE(C1) ++ VST_UH t05, 2*VEC_LEN*SIZE(C1) ++ addl C1,8*SIZE,C1 ++ ++$TRMMKERNEL_8x2: ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK,TEMP ++#ifdef LEFT ++ subl TEMP, 8,TEMP # mr=8 ++#else ++ subl TEMP, 2,TEMP # nr=2 ++#endif ++ ++ sll TEMP, 3 + BASE_SHIFT,KC ++ sll TEMP, 1 + BASE_SHIFT,TEMP ++ ++ addl A,KC,A ++ addl B,TEMP,B ++#endif ++ ++#ifdef LEFT ++ addl KK,8,KK ++ nop ++#endif ++#endif ++ ++ ++ ++ .align 5 ++ ++.L26: # nr=2,mr=4------------------ ++ and MC1,4,MC # MC1&4 ++ beq MC,.L27 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++ nop ++#else ++ sll KK, 2 + BASE_SHIFT,KC # mr=4 ++ sll KK, 1 + BASE_SHIFT,TEMP # nr=2 ++ ++ addl A,KC,A ++ addl B1,TEMP,B ++#endif ++ ++ vcpys $f31,$f31,t00 # clear 2vector registers ++ vcpys $f31,$f31,t04 ++ ++ LDDE b0,0(B) # get 2b ++ LDDE b1,1*SIZE(B) ++ ++ VLD a0,0(A) # Get 4 a ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 4, TEMP ++#else ++ addl KK, 2, TEMP ++#endif ++ sra TEMP,1,KC ++ beq KC,$Rest_4x2x1 ++ ++#else ++ ++ mov B1,B ++ sra KC1,1,KC ++ vcpys $f31,$f31,t00 # clear 2vector registers ++ vcpys $f31,$f31,t04 ++ ++ LDDE b0,0(B) # get 2b ++ LDDE b1,1*SIZE(B) ++ ++ VLD a0,0(A) # Get 4 a ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ ++ beq KC,$Rest_4x2x1 ++#endif ++ ++$Panel_4x2x2: ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ ++ LDDE nb0,2*SIZE(B) # get next 2b ++ LDDE nb1,3*SIZE(B) ++ ++ addl B,4*SIZE,B # 2n*2K ++ VLD a4,4*SIZE(A) # get next 4a ++ ++ subl KC,1,KC ++ VMAD a4,nb0,t00,t00 ++ VMAD a4,nb1,t04,t04 ++ ++ addl A,8*SIZE,A # 4m*2k ++ LDDE b0,0(B) # get 3rd 2b ++ LDDE b1,1*SIZE(B) ++ ++ VLD a0,0(A) # get 3rd 4a ++ bne KC,$Panel_4x2x2 ++ ++ ++$Rest_4x2x1: ++ LDDE ALPHA,192($sp) # 
get alpha ++#ifndef TRMMKERNEL ++ blbc KC1,$Write_4x2 ++#else ++ blbc TEMP,$Write_4x2 ++#endif ++ ++ addl A,4*SIZE,A # 4m*1k ++ addl B,2*SIZE,B # 2n*1K ++ ++ VMAD a0,b0,t00,t00 ++ VMAD a0,b1,t04,t04 ++ ++ ++$Write_4x2: ++ ++#ifndef TRMMKERNEL ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_4x2 ++ ++$Align_CO_Access_4x2: ++ VLD c00,0(CO) # get 1st colum of 16c ++ VMAD t00,ALPHA,c00,t00 ++ VST t00,0(CO) ++ jmp $Access_C1_4x2 ++ ++$UnAlign_CO_Access_4x2: ++ VLD_UL c00, 0*VEC_LEN*SIZE(CO) ++ VLD_UH c01, 1*VEC_LEN*SIZE(CO) ++ ++ vbisw c00,c01,c00 ++ ++ VMAD t00,ALPHA,c00,t00 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_4x2: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ addl CO,4*SIZE,CO # 4c ++ nop ++ bne $6,$UnAlign_C1_Access_4x2 ++ ++$Align_C1_Access_4x2: ++ VLD c04,0(C1) ++ VMAD t04,ALPHA,c04,t04 ++ VST t04,0(C1) ++ addl C1,4*SIZE,C1 ++ jmp .L27 ++ ++$UnAlign_C1_Access_4x2: ++ VLD_UL c04, 0*VEC_LEN*SIZE(C1) ++ VLD_UH c05, 1*VEC_LEN*SIZE(C1) ++ ++ vbisw c04,c05,c04 ++ ++ VMAD t04,ALPHA,c04,t04 ++ ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ addl C1,4*SIZE,C1 ++ ++#else ++ ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_4x2 ++ ++$Align_CO_Access_4x2: ++ VMUL t00,ALPHA,t00 ++ VST t00,0(CO) ++ jmp $Access_C1_4x2 ++ ++$UnAlign_CO_Access_4x2: ++ VMUL t00,ALPHA,t00 ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ ++$Access_C1_4x2: ++ and C1, (VEC_LEN*SIZE-1),$6 ++ addl CO,4*SIZE,CO # 4c ++ nop ++ bne $6,$UnAlign_C1_Access_4x2 ++ ++$Align_C1_Access_4x2: ++ VMUL t04,ALPHA,t04 ++ VST t04,0(C1) ++ addl C1,4*SIZE,C1 ++ jmp $TRMMKERNEL_4x2 ++ ++$UnAlign_C1_Access_4x2: ++ VMUL t04,ALPHA,t04 ++ VST_UL t04, 0*VEC_LEN*SIZE(C1) ++ VST_UH t04, 1*VEC_LEN*SIZE(C1) ++ addl C1,4*SIZE,C1 ++ ++$TRMMKERNEL_4x2: ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 4, TEMP ++#else ++ subl TEMP, 2, TEMP ++#endif ++ ++ sll TEMP, 2 + BASE_SHIFT,KC ++ sll TEMP, 1 + BASE_SHIFT,TEMP ++ ++ addl A, KC, A ++ addl B, TEMP, B ++#endif ++ ++#ifdef LEFT ++ addl KK, 4, KK ++ nop ++#endif ++#endif ++ ++ ++ ++ .align 5 ++ ++.L27: # nr=2,mr=2-------------- ++ and MC1,2,MC ++ beq MC,.L28 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++#else ++ sll KK, 1 + BASE_SHIFT,KC # mr=nr=2 ++ nop ++ addl A,KC,A ++ addl B1,KC,B ++#endif ++ ++ fclr t00 # clear 4 register ++ fclr t01 ++ fclr t04 ++ fclr t05 ++ ++ LD b0,0(B) # get 2b ++ LD b1,1*SIZE(B) ++ ++ LD a0,0(A) # get 2a ++ LD a4,1*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#else ++ addl KK, 2, TEMP # mr=nr=2 ++#endif ++ sra TEMP,1, KC ++ nop ++ nop ++ beq KC,$Rest_2x2x1 ++ ++#else ++ ++ mov B1,B # Reset B ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ fclr t00 # clear 4 register ++ fclr t01 ++ fclr t04 ++ fclr t05 ++ ++ LD b0,0(B) # get 2b ++ LD b1,1*SIZE(B) ++ ++ LD a0,0(A) # get 2a ++ LD a4,1*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 0(C1) ++ beq KC,$Rest_2x2x1 ++ ++#endif ++ ++ ++$Panel_2x2x2: ++ MAD a0,b0,t00,t00 ++ MAD a0,b1,t04,t04 ++ ++ LD nb0,2*SIZE(B) # get next 2b ++ LD nb1,3*SIZE(B) ++ ++ MAD a4,b0,t01,t01 ++ MAD a4,b1,t05,t05 ++ ++ addl B,4*SIZE,B # 2(n)*2(k) ++ LD a8,2*SIZE(A) # get next 2a ++ LD a12,3*SIZE(A) ++ ++ subl KC,1,KC ++ MAD a8,nb0,t00,t00 ++ MAD a8,nb1,t04,t04 ++ ++ addl A,4*SIZE,A # 
2m*2k ++ LD b0,0(B) ++ LD b1,1*SIZE(B) ++ ++ MAD a12,nb0,t01,t01 ++ MAD a12,nb1,t05,t05 ++ ++ LD a0,0(A) ++ LD a4,1*SIZE(A) ++ bne KC,$Panel_2x2x2 ++ ++ ++$Rest_2x2x1: ++ LD ALPHA,192($sp) # Get ALPHA ++#ifndef TRMMKERNEL ++ blbc KC1,$Write_2x2 ++#else ++ blbc TEMP,$Write_2x2 ++#endif ++ ++ addl A,2*SIZE,A # 2m*1k ++ addl B,2*SIZE,B # 2n*1K ++ ++ MAD a0,b0,t00,t00 ++ MAD a0,b1,t04,t04 ++ MAD a4,b0,t01,t01 ++ MAD a4,b1,t05,t05 ++ ++ ++$Write_2x2: ++ ++#ifndef TRMMKERNEL ++ LD c00,0(CO) ++ LD c04,0(C1) ++ LD c01,1*SIZE(CO) ++ LD c05,1*SIZE(C1) ++ ++ MAD t00,ALPHA,c00,t00 ++ MAD t04,ALPHA,c04,t04 ++ MAD t01,ALPHA,c01,t01 ++ MAD t05,ALPHA,c05,t05 ++ ++ ST t00,0(CO) ++ ST t04,0(C1) ++ ST t01,1*SIZE(CO) ++ ST t05,1*SIZE(C1) ++ ++ addl CO,2*SIZE,CO # 2c ++ addl C1,2*SIZE,C1 ++ ++#else ++ ++ MUL t00,ALPHA,t00 ++ MUL t04,ALPHA,t04 ++ MUL t01,ALPHA,t01 ++ MUL t05,ALPHA,t05 ++ ++ ST t00,0(CO) ++ ST t04,0(C1) ++ ST t01,1*SIZE(CO) ++ ST t05,1*SIZE(C1) ++ ++ addl CO,2*SIZE,CO # 2c ++ addl C1,2*SIZE,C1 ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++ subl TEMP, 2, TEMP ++ ++ sll TEMP, 1 + BASE_SHIFT, KC ++ nop ++ ++ addl A,KC, A ++ addl B,KC, B ++#endif ++ ++#ifdef LEFT ++ addl KK, 2, KK ++#endif ++#endif ++ ++ ++ ++ .align 5 ++.L28: ++ and MC1,1,MC # nr=2,mr=1------------------- ++ beq MC,$End_NC_Unroll2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++#else ++ sll KK, BASE_SHIFT,KC # mr=1 ++ sll KK, 1 + BASE_SHIFT,TEMP # nr=2 ++ ++ addl A,KC,A ++ addl B1,TEMP,B ++#endif ++ ++ fclr t00 # clear 2 registers ++ fclr t04 ++ ++ LD b0,0(B) # 2b ++ LD b1,1*SIZE(B) ++ ++ LD a0,0(A) # 1a ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 1, TEMP ++#else ++ addl KK, 2, TEMP ++#endif ++ sra TEMP,1,KC ++ nop ++ beq KC,$Rest_1x2x1 ++ ++#else ++ mov B1,B # Reset B ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ fclr t00 # clear 2 registers ++ fclr t04 ++ ++ LD b0,0(B) # 2b ++ LD b1,1*SIZE(B) ++ ++ LD a0,0(A) # 1a ++ beq KC,$Rest_1x2x1 ++#endif ++ ++ ++ .align 5 ++ ++$Panel_1x2x2: ++ MAD a0,b0,t00,t00 ++ MAD a0,b1,t04,t04 ++ ++ LD nb0,2*SIZE(B) # get next 2b ++ LD nb1,3*SIZE(B) ++ ++ addl B,4*SIZE,B # 2(n)*2(k) ++ LD a8,1*SIZE(A) # get next 1a ++ ++ subl KC,1,KC ++ MAD a8,nb0,t00,t00 ++ MAD a8,nb1,t04,t04 ++ ++ addl A,2*SIZE,A # 1m*2k ++ LD b0,0(B) # get 3rd 2b ++ LD b1,1*SIZE(B) ++ ++ LD a0,0(A) # get 3rd 1a ++ bne KC,$Panel_1x2x2 ++ ++ ++$Rest_1x2x1: ++ LD ALPHA,192($sp) # Get ALPHA ++#ifndef TRMMKERNEL ++ blbc KC1,$Write_1x2 ++#else ++ blbc TEMP,$Write_1x2 ++#endif ++ ++ addl A,1*SIZE,A # 1m*1k ++ addl B,2*SIZE,B # 2n*1K ++ ++ MAD a0,b0,t00,t00 ++ MAD a0,b1,t04,t04 ++ ++ ++$Write_1x2: # Write back 2 results ++#ifndef TRMMKERNEL ++ LD c00,0(CO) ++ LD c04,0(C1) ++ ++ MAD t00,ALPHA,c00,t00 ++ MAD t04,ALPHA,c04,t04 ++ ++ ST t00,0(CO) ++ ST t04,0(C1) ++ ++#else ++ ++ MUL t00,ALPHA,t00 ++ MUL t04,ALPHA,t04 ++ ++ ST t00,0(CO) ++ ST t04,0(C1) ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 1,TEMP ++#else ++ subl TEMP, 2,TEMP ++#endif ++ ++ sll TEMP, BASE_SHIFT,KC ++ sll TEMP, 1 + BASE_SHIFT,TEMP ++ ++ addl A,KC,A ++ addl B,TEMP,B ++#endif ++ ++#ifdef LEFT ++ addl KK,1,KK ++#endif ++#endif ++ ++ ++ .align 5 ++ ++$End_NC_Unroll2: ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 2,KK ++#endif ++ mov B, B1 
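++# nr=2 panels done; the nr=1 tail below walks the same mr=16/8/4/2/1 cascade over the last remaining column of C.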
++ ++ ++ .align 5 ++$Begin_NC_Unroll1: # Nr=1 ++ and NC1,1,NC # NC=NC1&1 ++ beq NC,$Kernel_End ++ ++ mov A1,A # Reset A ++ mov C,CO # Reset C ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET,KK # reset offset ++#endif ++ ++ sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC ++ subl PREA,16*SIZE,PREA ++ ++ sra MC1,4,MC # MC=MC1/16 ++ beq MC,.L35 # MC=0:MC1<16 ++ ++ ++.L3: # nr=1,mr=16 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1,B ++#else ++ sll KK, 4 + BASE_SHIFT, KC # mr=16 ++ sll KK, BASE_SHIFT,TEMP # nr=1 ++ ++ addl A,KC,A ++ addl B1,TEMP,B ++#endif ++ ++ vcpys $f31,$f31,t00 # CLEAR 16 Register ++ vcpys $f31,$f31,t01 ++ vcpys $f31,$f31,t02 ++ vcpys $f31,$f31,t03 ++ ++ LDDE b0,0(B) # get 1b and 16a ++ ++ VLD a0,0(A) ++ VLD a4,4*SIZE(A) ++ VLD a8,8*SIZE(A) ++ VLD a12,12*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 4*SIZE(CO) ++ fillcs 8*SIZE(CO) ++ fillcs 12*SIZE(CO) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 16, TEMP ++#else ++ addl KK, 1, TEMP ++#endif ++ sra TEMP, 1, KC ++ beq KC,$Rest_16x1x1 ++ ++#else ++ ++ mov B1,B # Set B ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ vcpys $f31,$f31,t00 # CLEAR 16 Register ++ vcpys $f31,$f31,t01 ++ vcpys $f31,$f31,t02 ++ vcpys $f31,$f31,t03 ++ ++ LDDE b0,0(B) # get 1b and 16a ++ ++ VLD a0,0(A) ++ VLD a4,4*SIZE(A) ++ VLD a8,8*SIZE(A) ++ VLD a12,12*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 4*SIZE(CO) ++ fillcs 8*SIZE(CO) ++ fillcs 12*SIZE(CO) ++ ++ beq KC,$Rest_16x1x1 ++ ++#endif ++ ++$Panel_16x1x2: ++ addl A,16*SIZE,A # 16(m)*1(k) ++ LDDE b1,1*SIZE(B) # get next 1b ++ ++ VMAD a0,b0,t00,t00 ++ VMAD a4,b0,t01,t01 ++ ++ addl B,2*SIZE,B # 1(n)*2(k) ++ VLD na0,0(A) # get next 16a ++ VLD na4,4*SIZE(A) ++ VLD na8,8*SIZE(A) ++ VLD na12,12*SIZE(A) ++ ++ VMAD a8,b0,t02,t02 ++ VMAD a12,b0,t03,t03 ++ ++ subl KC,1,KC ++ addl A,16*SIZE,A # 16m*1k ++ LDDE b0,0(B) ++ ++ VMAD na0,b1,t00,t00 ++ VMAD na4,b1,t01,t01 ++ ++ VLD a0,0(A) ++ VLD a4,4*SIZE(A) ++ VLD a8,8*SIZE(A) ++ VLD a12,12*SIZE(A) ++ ++ VMAD na8,b1,t02,t02 ++ VMAD na12,b1,t03,t03 ++ bne KC,$Panel_16x1x2 ++ ++ ++$Rest_16x1x1: ++ LDDE ALPHA,192($sp) ++#ifndef TRMMKERNEL ++ blbc KC1,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1 ++#else ++ blbc TEMP,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1 ++#endif ++ ++ addl A,16*SIZE,A # 16a*1k ++ addl B,1*SIZE,B # 1b*1k ++ ++ VMAD a0,b0,t00,t00 ++ VMAD a4,b0,t01,t01 ++ VMAD a8,b0,t02,t02 ++ VMAD a12,b0,t03,t03 ++ ++ ++$Write_16x1: ++ ++#ifndef TRMMKERNEL ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_16x1 ++ ++$Align_CO_Access_16x1: ++ VLD c00,0(CO) # get 1st colum of 16c ++ VLD c01,4*SIZE(CO) ++ VLD c02,8*SIZE(CO) ++ VLD c03,12*SIZE(CO) ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ VMAD t02,ALPHA,c02,t02 ++ VMAD t03,ALPHA,c03,t03 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ VST t02,8*SIZE(CO) ++ VST t03,12*SIZE(CO) ++ jmp $Kernel_End ++ ++$UnAlign_CO_Access_16x1: ++ VLD_UL c00, 0*VEC_LEN*SIZE(CO) ++ VLD_UH c04, 1*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c01, 1*VEC_LEN*SIZE(CO) ++ VLD_UH c05, 2*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c02, 2*VEC_LEN*SIZE(CO) ++ VLD_UH c06, 3*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c03, 3*VEC_LEN*SIZE(CO) ++ VLD_UH c07, 4*VEC_LEN*SIZE(CO) ++ ++ vbisw c00,c04,c00 ++ vbisw c01,c05,c01 ++ vbisw c02,c06,c02 ++ vbisw c03,c07,c03 ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ VMAD t02,ALPHA,c02,t02 ++ VMAD t03,ALPHA,c03,t03 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ 
VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ VST_UL t02, 2*VEC_LEN*SIZE(CO) ++ VST_UH t02, 3*VEC_LEN*SIZE(CO) ++ ++ VST_UL t03, 3*VEC_LEN*SIZE(CO) ++ VST_UH t03, 4*VEC_LEN*SIZE(CO) ++ jmp $Kernel_End ++ ++#else ++ ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_16x1 ++ ++$Align_CO_Access_16x1: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ VMUL t02,ALPHA,t02 ++ VMUL t03,ALPHA,t03 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ VST t02,8*SIZE(CO) ++ VST t03,12*SIZE(CO) ++ jmp $TRMMKERNEL_16x1 ++ ++$UnAlign_CO_Access_16x1: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ VMUL t02,ALPHA,t02 ++ VMUL t03,ALPHA,t03 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++ VST_UL t02, 2*VEC_LEN*SIZE(CO) ++ VST_UH t02, 3*VEC_LEN*SIZE(CO) ++ ++ VST_UL t03, 3*VEC_LEN*SIZE(CO) ++ VST_UH t03, 4*VEC_LEN*SIZE(CO) ++ ++$TRMMKERNEL_16x1: ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 16, TEMP ++#else ++ subl TEMP, 1,TEMP ++#endif ++ ++ sll TEMP, 4 + BASE_SHIFT,KC ++ sll TEMP, BASE_SHIFT, TEMP ++ ++ addl A,KC,A ++ addl B,TEMP,B ++#endif ++ ++#ifdef LEFT ++ addl KK, 16, KK ++ nop ++#endif ++ ++ jmp $Kernel_End ++#endif ++ ++ ++ ++ .align 5 ++.L35: # nr=1,mr=8------------------ ++ and MC1,8,MC ++ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc ++ nop ++ beq MC,.L36 # MC1<8 ++ ++ addl A1,SPANA,PREA ++ subl PREA,8*SIZE,PREA # PREA-=MC ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++#else ++ sll KK, 3 + BASE_SHIFT,KC # mr=8 ++ sll KK, BASE_SHIFT,TEMP # nr=1 ++ ++ addl A,KC, A ++ addl B1,TEMP,B ++#endif ++ ++ vcpys $f31,$f31,t00 # CLEAR 8Register ++ vcpys $f31,$f31,t01 ++ ++ LDDE b0,0(B) # get 1b ++ ++ VLD a0,0(A) # get 8a ++ VLD a4,4*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 4*SIZE(CO) ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK,TEMP ++#elif defined(LEFT) ++ addl KK, 8,TEMP ++#else ++ addl KK, 1,TEMP ++#endif ++ sra TEMP,1,KC ++ nop ++ beq KC,$Rest_8x1x1 ++ ++#else ++ ++ mov B1, B ++ sra KC1,1,KC ++ vcpys $f31,$f31,t00 # CLEAR 8Register ++ vcpys $f31,$f31,t01 ++ ++ LDDE b0,0(B) # get 1b ++ ++ VLD a0,0(A) # get 8a ++ VLD a4,4*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ fillcs 4*SIZE(CO) ++ beq KC,$Rest_8x1x1 ++ ++#endif ++ ++ ++$Panel_8x1x2: ++ VMAD a0,b0,t00,t00 ++ VMAD a4,b0,t01,t01 ++ ++ LDDE nb0,1*SIZE(B) # get next 1b ++ ++ addl B,2*SIZE,B # 1(n)*2k ++ VLD na8,8*SIZE(A) # get next 8a ++ VLD na12,12*SIZE(A) ++ ++ fillcs 0(PREA) ++ subl PREA,8*SIZE,PREA ++ ++ subl KC,1,KC ++ VMAD na8,nb0,t00,t00 ++ VMAD na12,nb0,t01,t01 ++ ++ addl A,16*SIZE,A # 8m*2k ++ LDDE b0,0(B) # get 3rd 1b ++ ++ VLD a0,0(A) # get 3rd 8a ++ VLD a4,4*SIZE(A) ++ ++ fillcs 0(PREA) ++ subl PREA,8*SIZE,PREA ++ bne KC,$Panel_8x1x2 ++ ++ ++$Rest_8x1x1: ++ LDDE ALPHA,192($sp) # Get ALPHA ++#ifndef TRMMKERNEL ++ blbc KC1,$Write_8x1 ++#else ++ blbc TEMP,$Write_8x1 ++#endif ++ ++ addl A,8*SIZE,A # 8m*1k ++ addl B,1*SIZE,B # 1n*1k ++ ++ VMAD a0,b0,t00,t00 ++ VMAD a4,b0,t01,t01 ++ ++ ++$Write_8x1: ++ ++#ifndef TRMMKERNEL ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_8x1 ++ ++$Align_CO_Access_8x1: ++ VLD c00,0(CO) # get 1st colum of 16c ++ VLD c01,4*SIZE(CO) ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ addl 
CO,8*SIZE,CO # 8c ++ jmp .L36 ++ ++$UnAlign_CO_Access_8x1: ++ VLD_UL c00, 0*VEC_LEN*SIZE(CO) ++ VLD_UH c02, 1*VEC_LEN*SIZE(CO) ++ ++ VLD_UL c01, 1*VEC_LEN*SIZE(CO) ++ VLD_UH c03, 2*VEC_LEN*SIZE(CO) ++ ++ vbisw c00,c02,c00 ++ vbisw c01,c03,c01 ++ ++ VMAD t00,ALPHA,c00,t00 ++ VMAD t01,ALPHA,c01,t01 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ addl CO,8*SIZE,CO # 8c ++ ++#else ++ ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_8x1 ++ ++$Align_CO_Access_8x1: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ ++ VST t00,0(CO) ++ VST t01,4*SIZE(CO) ++ jmp $TRMMKERNEL_8x1 ++ ++$UnAlign_CO_Access_8x1: ++ VMUL t00,ALPHA,t00 ++ VMUL t01,ALPHA,t01 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ ++ VST_UL t01, 1*VEC_LEN*SIZE(CO) ++ VST_UH t01, 2*VEC_LEN*SIZE(CO) ++ ++$TRMMKERNEL_8x1: ++ addl CO,8*SIZE,CO # 8c ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 8, TEMP ++#else ++ subl TEMP, 1, TEMP ++#endif ++ ++ sll TEMP, 3 + BASE_SHIFT, KC ++ sll TEMP, BASE_SHIFT,TEMP ++ ++ addl A,KC, A ++ addl B,TEMP,B ++#endif ++ ++#ifdef LEFT ++ addl KK,8, KK ++#endif ++#endif ++ ++ ++ ++ .align 5 ++.L36: # nr=1,mr=4--------------- ++ and MC1,4,MC # MC1&4 ++ beq MC,.L37 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA))\ ++ || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++#else ++ sll KK, 2 + BASE_SHIFT, KC # mr=4 ++ sll KK, BASE_SHIFT, TEMP # nr=1 ++ ++ addl A,KC,A ++ addl B1,TEMP,B ++#endif ++ ++ vcpys $f31,$f31,t00 # CLEAR 4 Register ++ ++ LDDE b0,0(B) ++ VLD a0,0(A) ++ ++ fillcs 0(CO) # fetch C ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 4, TEMP # mr=4 ++#else ++ addl KK, 1, TEMP # nr=1 ++#endif ++ sra TEMP,1, KC ++ beq KC,$Rest_4x1x1 ++ ++#else ++ ++ mov B1,B # Reset B ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ vcpys $f31,$f31,t00 # CLEAR 4 Register ++ ++ LDDE b0,0(B) ++ VLD a0,0(A) ++ ++ fillcs 0(CO) # fetch C ++ beq KC,$Rest_4x1x1 ++#endif ++ ++ ++$Panel_4x1x2: ++ VMAD a0,b0,t00,t00 ++ ++ LDDE nb0,1*SIZE(B) ++ VLD a4,4*SIZE(A) ++ addl B,2*SIZE,B # 1(n)*2(k)*8Byte ++ ++ subl KC,1,KC ++ VMAD a4,nb0,t00,t00 ++ ++ addl A,8*SIZE,A # 4m*2k ++ LDDE b0,0(B) ++ VLD a0,0(A) ++ ++ bne KC,$Panel_4x1x2 ++ ++ ++$Rest_4x1x1: ++ LDDE ALPHA,192($sp) # Get ALPHA ++#ifndef TRMMKERNEL ++ blbc KC1,$Write_4x1 ++#else ++ blbc TEMP,$Write_4x1 ++#endif ++ ++ addl A,4*SIZE,A # 4m*1k ++ addl B,1*SIZE,B # 1n*1K ++ ++ VMAD a0,b0,t00,t00 ++ ++ ++$Write_4x1: # Write back 4 results ++ ++#ifndef TRMMKERNEL ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_4x1 ++ ++$Align_CO_Access_4x1: ++ VLD c00,0(CO) # get 1st colum of 16c ++ VMAD t00,ALPHA,c00,t00 ++ VST t00,0(CO) ++ addl CO,4*SIZE,CO # 4c ++ jmp .L37 ++ ++$UnAlign_CO_Access_4x1: ++ VLD_UL c00, 0*VEC_LEN*SIZE(CO) ++ VLD_UH c01, 1*VEC_LEN*SIZE(CO) ++ ++ vbisw c00,c01,c00 ++ ++ VMAD t00,ALPHA,c00,t00 ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 1*VEC_LEN*SIZE(CO) ++ addl CO,4*SIZE,CO # 4c ++ ++ ++#else ++ and CO, (VEC_LEN*SIZE-1),$6 ++ bne $6,$UnAlign_CO_Access_4x1 ++ ++$Align_CO_Access_4x1: ++ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register ++ VST t00,0(CO) ++ jmp $TRMMKERNEL_4x1 ++ ++$UnAlign_CO_Access_4x1: ++ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register ++ ++ VST_UL t00, 0*VEC_LEN*SIZE(CO) ++ VST_UH t00, 
1*VEC_LEN*SIZE(CO) ++ ++$TRMMKERNEL_4x1: ++ addl CO,4*SIZE,CO # 4c ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 4, TEMP # mr=4 ++#else ++ subl TEMP, 1, TEMP ++#endif ++ ++ sll TEMP, 2 + BASE_SHIFT, KC ++ sll TEMP, BASE_SHIFT, TEMP ++ ++ addl A, KC, A ++ addl B, TEMP,B ++#endif ++ ++#ifdef LEFT ++ addl KK, 4, KK ++#endif ++#endif ++ ++ ++ ++ ++ .align 5 ++.L37: # nr=1,mr=2------------------------- ++ and MC1,2,MC ++ beq MC,.L38 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++#else ++ sll KK, 1 + BASE_SHIFT,KC # mr=2 ++ sll KK, BASE_SHIFT, TEMP # nr=1 ++ ++ addl A,KC, A ++ addl B1,TEMP,B ++#endif ++ ++ fclr t00 # CLEAR 2 Register ++ fclr t01 ++ ++ LD b0,0(B) ++ ++ LD a0,0(A) ++ LD a4,1*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 2,TEMP ++#else ++ addl KK, 1,TEMP ++#endif ++ sra TEMP,1,KC ++ beq KC,.L373 ++ ++#else ++ ++ mov B1,B # Reset B ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ fclr t00 # CLEAR 2 Register ++ fclr t01 ++ ++ LD b0,0(B) ++ ++ LD a0,0(A) ++ LD a4,1*SIZE(A) ++ ++ fillcs 0(CO) # fetch C ++ beq KC,.L373 ++ ++#endif ++ ++.L371: ++ MAD a0,b0,t00,t00 ++ MAD a4,b0,t01,t01 ++ ++ LD nb0,1*SIZE(B) ++ ++ addl B,2*SIZE,B # 1(n)*2(k) ++ LD a8,2*SIZE(A) ++ LD a12,3*SIZE(A) ++ ++ subl KC,1,KC ++ MAD a8,nb0,t00,t00 ++ MAD a12,nb0,t01,t01 ++ ++ addl A,4*SIZE,A # 2m*2k ++ LD b0,0(B) ++ ++ LD a0,0(A) ++ LD a4,1*SIZE(A) ++ bne KC,.L371 ++ ++.L373: ++ LD ALPHA,192($sp) # Get ALPHA ++#ifndef TRMMKERNEL ++ blbc KC1,.L374 ++#else ++ blbc TEMP,.L374 ++#endif ++ ++ addl A,2*SIZE,A # 2m*1k*8Byte ++ addl B,1*SIZE,B # 1n*1K*8Byte ++ ++ MAD a0,b0,t00,t00 ++ MAD a4,b0,t01,t01 ++ ++.L374: # Write back 2 results ++ ++#ifndef TRMMKERNEL ++ LD c00,0(CO) ++ LD c01,1*SIZE(CO) ++ ++ MAD t00,ALPHA,c00,t00 ++ MAD t01,ALPHA,c01,t01 ++ ++ ST t00,0(CO) ++ ST t01,1*SIZE(CO) ++ addl CO,2*SIZE,CO # 2c ++ ++#else ++ ++ MUL t00,ALPHA,t00 ++ MUL t01,ALPHA,t01 ++ ++ ST t00,0(CO) ++ ST t01,1*SIZE(CO) ++ ++ addl CO,2*SIZE,CO # 2c ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl KC1, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 2, TEMP ++#else ++ subl TEMP, 1, TEMP ++#endif ++ ++ sll TEMP, 1 + BASE_SHIFT,KC ++ sll TEMP, BASE_SHIFT,TEMP ++ ++ addl A,KC,A ++ addl B,TEMP,B ++#endif ++ ++#ifdef LEFT ++ addl KK, 2, KK ++#endif ++#endif ++ ++ ++ ++ .align 5 ++.L38: ++ and MC1,1,MC ++ beq MC,$Kernel_End ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B1, B ++#else ++ sll KK, BASE_SHIFT,KC # mr=nr=1 ++ nop ++ ++ addl A,KC,A ++ addl B1,KC,B ++#endif ++ ++ fclr t00 # CLEAR Results Register ++ ++ LD b0,0(B) ++ LD a0,0(A) # Get 16 A and 4 B ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl KC1, KK, TEMP ++#else ++ addl KK, 1, TEMP # mr=nr=1 ++#endif ++ sra TEMP,1,KC ++ nop ++ beq KC,.L383 ++ ++#else ++ ++ mov B1,B # Reset B ++ sra KC1,1,KC # Unroll KC 2, KC=KC1/2 ++ fclr t00 # CLEAR Results Register ++ ++ LD b0,0(B) ++ LD a0,0(A) # Get 16 A and 4 B ++ ++ beq KC,.L383 ++#endif ++ ++.L381: ++ MAD a0,b0,t00,t00 ++ LD nb0,1*SIZE(B) ++ ++ addl B,2*SIZE,B # 1n*2k ++ LD a8,1*SIZE(A) ++ ++ ++ subl KC,1,KC ++ MAD a8,nb0,t00,t00 ++ ++ addl A,2*SIZE,A # 1m*2k ++ LD b0,0(B) ++ ++ LD a0,0(A) ++ bne KC,.L381 
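++# .L381 runs the k loop two iterations at a time; .L383 below folds in the odd trailing k step, and .L384 writes back the single 1x1 result.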
++ ++ ++.L383: ++ LD ALPHA,192($sp) # get alpha ++#ifndef TRMMKERNEL ++ blbc KC1,.L384 ++#else ++ blbc TEMP,.L384 ++#endif ++ ++ addl A,1*SIZE,A # 1m*1k ++ addl B,1*SIZE,B # 1n*1K ++ ++ MAD a0,b0,t00,t00 ++ ++ ++.L384: # Write back 1 results ++ ++#ifndef TRMMKERNEL ++ LD c00,0(CO) ++ MAD t00,ALPHA,c00,t00 ++ ST t00,0(CO) ++ ++#else ++ MUL t00,ALPHA,t00 ++ ST t00,0(CO) ++#endif ++ ++ ++ ++$Kernel_End: ++ ldl $9,328($sp) # Integer Saved Register ++ ldl $10,320($sp) ++ ldl $11,312($sp) ++ ldl $12,304($sp) ++ ldl $13,296($sp) ++ldl $14,288($sp) ++# Float Saved Register ++ LD $f2,280($sp) ++ LD $f3,272($sp) ++ LD $f4,264($sp) ++ LD $f5,256($sp) ++ LD $f6,248($sp) ++ LD $f7,240($sp) ++ LD $f8,232($sp) ++LD $f9,224($sp) ++ ++ ldi $sp,STACKSIZE($sp) # ++ ret $31,($26),1 # ++ ++ EPILOGUE ++ ++ +diff --git a/kernel/sw_64/gemv_n.S b/kernel/sw_64/gemv_n.S +new file mode 100644 +index 0000000..90284db +--- /dev/null ++++ b/kernel/sw_64/gemv_n.S +@@ -0,0 +1,1647 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define STACKSIZE 72 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $20 ++#define LDA $21 ++ ++#define X $18 ++#define INCX $19 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define Y1 $4 ++ ++#define A1 $5 ++#define A2 $6 ++#define A3 $7 ++#define A4 $8 ++ ++#define alpha $f19 ++ ++#define alpha1 $f0 ++#define alpha2 $f1 ++#define alpha3 $f10 ++#define alpha4 $f11 ++ ++#define y0 $f12 ++#define y1 $f13 ++#define y2 $f14 ++#define y3 $f15 ++ ++#define y4 $f16 ++#define y5 $f17 ++#define y6 $f18 ++#define y7 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++#define tmp $f20 ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl X, 0 + STACKSIZE($sp) ++ ldl INCX, 8 + STACKSIZE($sp) ++ ldl Y, 16 + STACKSIZE($sp) ++ ldl INCY, 24 + STACKSIZE($sp) ++ ldl BUFFER, 32 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ fstd tmp, 64($sp) ++ PROFCODE ++ ++ cmple M, 0, $0 ++ SXADDQ INCX, 0, INCX ++ cmple N, 0, $1 ++ SXADDQ INCY, 0, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ SXADDQ LDA, 0, LDA ++ ++ cmpeq INCY, SIZE, $0 ++ bne $0, $L10 ++ ++ mov BUFFER, Y1 ++ ++ mov Y, BUFFER ++ mov Y1, Y ++ ++ sra M, 3, I ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ ST $f31, 2 * SIZE(Y1) ++ ST $f31, 3 * SIZE(Y1) ++ ST $f31, 4 * SIZE(Y1) ++ ST $f31, 5 * SIZE(Y1) ++ ST $f31, 6 * SIZE(Y1) ++ ST $f31, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ ldi I, -1(I) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 7, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ ST $f31, 0 * SIZE(Y1) ++ addl Y1, SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ sra N, 2, J ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ LD alpha1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha3, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha4, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ MUL alpha, alpha1, tmp ++ fmov tmp, alpha1 ++ MUL alpha, alpha2, tmp ++ fmov tmp, alpha2 ++ MUL alpha, alpha3, tmp ++ fmov tmp, alpha3 ++ MUL alpha, alpha4, tmp ++ fmov tmp, alpha4 ++ ++ mov A, A1 ++ addl A, LDA, A2 ++ addl A2, LDA, A3 ++ addl A3, LDA, A4 ++ s4addl LDA, A, A ++ ++ mov Y, Y1 ++ ldw $31, 4 * SIZE(X) ++ ++ sra M, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a8, 0 * SIZE(A3) ++ LD a9, 1 * SIZE(A3) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 3 * SIZE(A3) ++ ++ LD y4, 4 * SIZE(Y1) ++ LD y5, 5 * SIZE(Y1) ++ LD y6, 6 * SIZE(Y1) ++ LD y7, 7 * SIZE(Y1) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD a12, 0 * SIZE(A4) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ LD a13, 1 * SIZE(A4) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ LD a14, 2 * SIZE(A4) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ LD a15, 3 * SIZE(A4) ++ ++ ADD y0, a0, tmp ++ fmov 
tmp, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ unop ++ ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ unop ++ ++ ADD y2, a2, tmp ++ fmov tmp, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ unop ++ ++ ADD y3, a3, tmp ++ fmov tmp, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ unop ++ ++ ADD y0, a4, tmp ++ fmov tmp, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha3, a8, tmp ++ fmov tmp, a8 ++ unop ++ ++ ADD y1, a5, tmp ++ fmov tmp, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha3, a9, tmp ++ fmov tmp, a9 ++ ldi I, -1(I) ++ ++ ADD y2, a6, tmp ++ fmov tmp, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha3, a10, tmp ++ fmov tmp, a10 ++ unop ++ ++ ADD y3, a7, tmp ++ fmov tmp, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha3, a11, tmp ++ fmov tmp, a11 ++ unop ++ ++ ADD y0, a8, tmp ++ fmov tmp, y0 ++ LD a8, 4 * SIZE(A3) ++ MUL alpha4, a12, tmp ++ fmov tmp, a12 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD y1, a9, tmp ++ fmov tmp, y1 ++ LD a9, 5 * SIZE(A3) ++ MUL alpha4, a13, tmp ++ fmov tmp, a13 ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) ++ ++ ADD y2, a10, tmp ++ fmov tmp, y2 ++ LD a10, 6 * SIZE(A3) ++ MUL alpha4, a14, tmp ++ fmov tmp, a14 ++ unop ++ ++ ADD y3, a11, tmp ++ fmov tmp, y3 ++ LD a11, 7 * SIZE(A3) ++ MUL alpha4, a15, tmp ++ fmov tmp, a15 ++ ldi I, -1(I) ++ ++ ADD y0, a12, tmp ++ fmov tmp, y0 ++ LD a12, 4 * SIZE(A4) ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) ++ ++ ADD y1, a13, tmp ++ fmov tmp, y1 ++ LD a13, 5 * SIZE(A4) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ unop ++ ++ ADD y2, a14, tmp ++ fmov tmp, y2 ++ LD a14, 6 * SIZE(A4) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ unop ++ ++ ADD y3, a15, tmp ++ fmov tmp, y3 ++ LD a15, 7 * SIZE(A4) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) ++ ++ ADD y4, a0, tmp ++ fmov tmp, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD y5, a1, tmp ++ fmov tmp, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD y6, a2, tmp ++ fmov tmp, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD y7, a3, tmp ++ fmov tmp, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD y4, a4, tmp ++ fmov tmp, y4 ++ LD a4, 8 * SIZE(A2) ++ MUL alpha3, a8, tmp ++ fmov tmp, a8 ++ LD y0, 8 * SIZE(Y1) ++ ++ ADD y5, a5, tmp ++ fmov tmp, y5 ++ LD a5, 9 * SIZE(A2) ++ MUL alpha3, a9, tmp ++ fmov tmp, a9 ++ LD y1, 9 * SIZE(Y1) ++ ++ ADD y6, a6, tmp ++ fmov tmp, y6 ++ LD a6, 10 * SIZE(A2) ++ MUL alpha3, a10, tmp ++ fmov tmp, a10 ++ LD y2, 10 * SIZE(Y1) ++ ++ ADD y7, a7, tmp ++ fmov tmp, y7 ++ LD a7, 11 * SIZE(A2) ++ MUL alpha3, a11, tmp ++ fmov tmp, a11 ++ LD y3, 11 * SIZE(Y1) ++ ++ ADD y4, a8, tmp ++ fmov tmp, y4 ++ LD a8, 8 * SIZE(A3) ++ MUL alpha4, a12, tmp ++ fmov tmp, a12 ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A3) ++ ++ ADD y5, a9, tmp ++ fmov tmp, y5 ++ LD a9, 9 * SIZE(A3) ++ MUL alpha4, a13, tmp ++ fmov tmp, a13 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD y6, a10, tmp ++ fmov tmp, y6 ++ LD a10, 10 * SIZE(A3) ++ MUL alpha4, a14, tmp ++ fmov tmp, a14 ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD y7, a11, tmp ++ fmov tmp, y7 ++ LD a11, 11 * SIZE(A3) ++ MUL alpha4, a15, tmp ++ fmov tmp, a15 ++ ldi Y1, 8 * SIZE(Y1) ++ ++ ADD y4, a12, tmp ++ fmov tmp, y4 ++ LD a12, 8 * SIZE(A4) ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ unop ++ ++ ADD y5, a13, tmp ++ fmov tmp, y5 ++ LD a13, 9 * 
SIZE(A4) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ ldi A3, 8 * SIZE(A3) ++ ++ ADD y6, a14, tmp ++ fmov tmp, y6 ++ LD a14, 10 * SIZE(A4) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A4) ++ ++ ADD y7, a15, tmp ++ fmov tmp, y7 ++ LD a15, 11 * SIZE(A4) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ ldi A4, 8 * SIZE(A4) ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ ST y4, -4 * SIZE(Y1) ++ ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ ST y5, -3 * SIZE(Y1) ++ ++ ADD y2, a2, tmp ++ fmov tmp, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ ST y6, -2 * SIZE(Y1) ++ ++ ADD y3, a3, tmp ++ fmov tmp, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ ST y7, -1 * SIZE(Y1) ++ ++ ADD y0, a4, tmp ++ fmov tmp, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha3, a8, tmp ++ fmov tmp, a8 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD y1, a5, tmp ++ fmov tmp, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha3, a9, tmp ++ fmov tmp, a9 ++ LD y5, 5 * SIZE(Y1) ++ ++ ADD y2, a6, tmp ++ fmov tmp, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha3, a10, tmp ++ fmov tmp, a10 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD y3, a7, tmp ++ fmov tmp, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha3, a11, tmp ++ fmov tmp, a11 ++ LD y7, 7 * SIZE(Y1) ++ ++ ADD y0, a8, tmp ++ fmov tmp, y0 ++ LD a8, 4 * SIZE(A3) ++ MUL alpha4, a12, tmp ++ fmov tmp, a12 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD y1, a9, tmp ++ fmov tmp, y1 ++ LD a9, 5 * SIZE(A3) ++ MUL alpha4, a13, tmp ++ fmov tmp, a13 ++ unop ++ ++ ADD y2, a10, tmp ++ fmov tmp, y2 ++ LD a10, 6 * SIZE(A3) ++ MUL alpha4, a14, tmp ++ fmov tmp, a14 ++ unop ++ ++ ADD y3, a11, tmp ++ fmov tmp, y3 ++ LD a11, 7 * SIZE(A3) ++ MUL alpha4, a15, tmp ++ fmov tmp, a15 ++ unop ++ ++ ADD y0, a12, tmp ++ fmov tmp, y0 ++ LD a12, 4 * SIZE(A4) ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ unop ++ ++ ADD y1, a13, tmp ++ fmov tmp, y1 ++ LD a13, 5 * SIZE(A4) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ unop ++ ++ ADD y2, a14, tmp ++ fmov tmp, y2 ++ LD a14, 6 * SIZE(A4) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ unop ++ ++ ADD y3, a15, tmp ++ fmov tmp, y3 ++ LD a15, 7 * SIZE(A4) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ unop ++ ++ ST y0, 0 * SIZE(Y1) ++ ADD y4, a0, tmp ++ fmov tmp, y4 ++ unop ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ ++ ST y1, 1 * SIZE(Y1) ++ ADD y5, a1, tmp ++ fmov tmp, y5 ++ unop ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ ++ ST y2, 2 * SIZE(Y1) ++ ADD y6, a2, tmp ++ fmov tmp, y6 ++ unop ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ ++ ST y3, 3 * SIZE(Y1) ++ ADD y7, a3, tmp ++ fmov tmp, y7 ++ ldi Y1, 8 * SIZE(Y1) ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ ++ ADD y4, a4, tmp ++ fmov tmp, y4 ++ MUL alpha3, a8, tmp ++ fmov tmp, a8 ++ ADD y5, a5, tmp ++ fmov tmp, y5 ++ MUL alpha3, a9, tmp ++ fmov tmp, a9 ++ ADD y6, a6, tmp ++ fmov tmp, y6 ++ MUL alpha3, a10, tmp ++ fmov tmp, a10 ++ ADD y7, a7, tmp ++ fmov tmp, y7 ++ MUL alpha3, a11, tmp ++ fmov tmp, a11 ++ ++ ADD y4, a8, tmp ++ fmov tmp, y4 ++ MUL alpha4, a12, tmp ++ fmov tmp, a12 ++ ADD y5, a9, tmp ++ fmov tmp, y5 ++ MUL alpha4, a13, tmp ++ fmov tmp, a13 ++ ADD y6, a10, tmp ++ fmov tmp, y6 ++ MUL alpha4, a14, tmp ++ fmov tmp, a14 ++ ADD y7, a11, tmp ++ fmov tmp, y7 ++ MUL alpha4, a15, tmp ++ fmov tmp, a15 ++ ++ ADD y4, a12, tmp ++ fmov tmp, y4 ++ ADD y5, a13, tmp ++ fmov tmp, y5 ++ ADD y6, a14, tmp ++ fmov tmp, y6 ++ ADD y7, a15, tmp ++ fmov tmp, y7 ++ ++ ST y4, -4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, -3 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) 
++ ST y6, -2 * SIZE(Y1) ++ ldi A3, 8 * SIZE(A3) ++ ST y7, -1 * SIZE(Y1) ++ ldi A4, 8 * SIZE(A4) ++ .align 4 ++ ++$L15: ++ and M, 4, I ++ ble I, $L16 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 0 * SIZE(A3) ++ LD a9, 1 * SIZE(A3) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 3 * SIZE(A3) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD a12, 0 * SIZE(A4) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ LD a13, 1 * SIZE(A4) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ LD a14, 2 * SIZE(A4) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ LD a15, 3 * SIZE(A4) ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ ADD y2, a2, tmp ++ fmov tmp, y2 ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ ADD y3, a3, tmp ++ fmov tmp, y3 ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ ++ ADD y0, a4, tmp ++ fmov tmp, y0 ++ MUL alpha3, a8, tmp ++ fmov tmp, a8 ++ ADD y1, a5, tmp ++ fmov tmp, y1 ++ MUL alpha3, a9, tmp ++ fmov tmp, a9 ++ ADD y2, a6, tmp ++ fmov tmp, y2 ++ MUL alpha3, a10, tmp ++ fmov tmp, a10 ++ ADD y3, a7, tmp ++ fmov tmp, y3 ++ MUL alpha3, a11, tmp ++ fmov tmp, a11 ++ ++ ADD y0, a8, tmp ++ fmov tmp, y0 ++ MUL alpha4, a12, tmp ++ fmov tmp, a12 ++ ADD y1, a9, tmp ++ fmov tmp, y1 ++ MUL alpha4, a13, tmp ++ fmov tmp, a13 ++ ADD y2, a10, tmp ++ fmov tmp, y2 ++ MUL alpha4, a14, tmp ++ fmov tmp, a14 ++ ADD y3, a11, tmp ++ fmov tmp, y3 ++ MUL alpha4, a15, tmp ++ fmov tmp, a15 ++ ++ ADD y0, a12, tmp ++ fmov tmp, y0 ++ ldi Y1, 4 * SIZE(Y1) ++ ADD y1, a13, tmp ++ fmov tmp, y1 ++ unop ++ ++ ADD y2, a14, tmp ++ fmov tmp, y2 ++ unop ++ ADD y3, a15, tmp ++ fmov tmp, y3 ++ unop ++ ++ ST y0, -4 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, -3 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, -2 * SIZE(Y1) ++ ldi A3, 4 * SIZE(A3) ++ ST y3, -1 * SIZE(Y1) ++ ldi A4, 4 * SIZE(A4) ++ .align 4 ++ ++$L16: ++ and M, 2, I ++ ble I, $L17 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(A3) ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD a5, 1 * SIZE(A3) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ LD a6, 0 * SIZE(A4) ++ MUL alpha2, a2, tmp ++ fmov tmp, a2 ++ LD a7, 1 * SIZE(A4) ++ MUL alpha2, a3, tmp ++ fmov tmp, a3 ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ MUL alpha3, a4, tmp ++ fmov tmp, a4 ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ MUL alpha3, a5, tmp ++ fmov tmp, a5 ++ ADD y0, a2, tmp ++ fmov tmp, y0 ++ MUL alpha4, a6, tmp ++ fmov tmp, a6 ++ ADD y1, a3, tmp ++ fmov tmp, y1 ++ MUL alpha4, a7, tmp ++ fmov tmp, a7 ++ ++ ADD y0, a4, tmp ++ fmov tmp, y0 ++ ldi A1, 2 * SIZE(A1) ++ ADD y1, a5, tmp ++ fmov tmp, y1 ++ ldi A2, 2 * SIZE(A2) ++ ADD y0, a6, tmp ++ fmov tmp, y0 ++ ldi A3, 2 * SIZE(A3) ++ ADD y1, a7, tmp ++ fmov tmp, y1 ++ ldi A4, 2 * SIZE(A4) ++ ++ ST y0, 0 * SIZE(Y1) ++ unop ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 ++ ++$L17: ++ blbc M, $L18 ++ ++ LD y0, 0 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ MUL alpha2, a1, tmp ++ fmov tmp, a1 ++ MUL alpha3, a2, tmp ++ fmov tmp, a2 ++ MUL alpha4, a3, tmp ++ fmov tmp, a3 ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ ADD y0, a1, tmp ++ fmov tmp, y0 ++ 
ADD y0, a2, tmp ++ fmov tmp, y0 ++ ADD y0, a3, tmp ++ fmov tmp, y0 ++ ++ ST y0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L18: ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ and N, 2, J ++ ble J, $L30 ++ ++ LD alpha1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha2, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ mov A, A1 ++ MUL alpha, alpha1, tmp ++ fmov tmp, alpha1 ++ addl A, LDA, A2 ++ MUL alpha, alpha2, tmp ++ fmov tmp, alpha2 ++ ++ addl A2, LDA, A ++ mov Y, Y1 ++ ++ sra M, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ LD y7, 7 * SIZE(Y1) ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ ++ ADD y2, a2, tmp ++ fmov tmp, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ ++ ADD y3, a3, tmp ++ fmov tmp, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ ++ ADD y0, a4, tmp ++ fmov tmp, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ ++ ADD y1, a5, tmp ++ fmov tmp, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ ++ ADD y2, a6, tmp ++ fmov tmp, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ ++ ADD y3, a7, tmp ++ fmov tmp, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) ++ ldi I, -1(I) ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD y4, a0, tmp ++ fmov tmp, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD y5, a1, tmp ++ fmov tmp, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD y6, a2, tmp ++ fmov tmp, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD y7, a3, tmp ++ fmov tmp, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD y4, a4, tmp ++ fmov tmp, y4 ++ LD a4, 0 * SIZE(A2) ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD y0, 8 * SIZE(Y1) ++ ++ ADD y5, a5, tmp ++ fmov tmp, y5 ++ LD a5, 1 * SIZE(A2) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ LD y1, 9 * SIZE(Y1) ++ ++ ADD y6, a6, tmp ++ fmov tmp, y6 ++ LD a6, 2 * SIZE(A2) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ LD y2, 10 * SIZE(Y1) ++ ++ ADD y7, a7, tmp ++ fmov tmp, y7 ++ LD a7, 3 * SIZE(A2) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ LD y3, 11 * SIZE(Y1) ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ LD a0, 12 * SIZE(A1) ++ ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ LD a1, 13 * SIZE(A1) ++ ++ ADD y2, a2, tmp ++ fmov tmp, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ LD a2, 14 * SIZE(A1) ++ ++ ADD y3, a3, tmp ++ fmov tmp, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ LD a3, 15 * SIZE(A1) ++ ++ ADD y0, a4, tmp ++ fmov tmp, y0 ++ LD a4, 4 * SIZE(A2) ++ 
MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD y4, 12 * SIZE(Y1) ++ ++ ADD y1, a5, tmp ++ fmov tmp, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ LD y5, 13 * SIZE(Y1) ++ ++ ADD y2, a6, tmp ++ fmov tmp, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ LD y6, 14 * SIZE(Y1) ++ ++ ADD y3, a7, tmp ++ fmov tmp, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ LD y7, 15 * SIZE(Y1) ++ ++ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD y4, a0, tmp ++ fmov tmp, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ unop ++ ++ ADD y5, a1, tmp ++ fmov tmp, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ unop ++ ++ ADD y6, a2, tmp ++ fmov tmp, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ unop ++ ++ ADD y7, a3, tmp ++ fmov tmp, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ unop ++ ++ ADD y4, a4, tmp ++ fmov tmp, y4 ++ ADD y5, a5, tmp ++ fmov tmp, y5 ++ ADD y6, a6, tmp ++ fmov tmp, y6 ++ ADD y7, a7, tmp ++ fmov tmp, y7 ++ ++ ST y4, 4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, 5 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) ++ ++ ST y6, 6 * SIZE(Y1) ++ unop ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L25: ++ and M, 4, I ++ ble I, $L26 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD a4, 0 * SIZE(A2) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ LD a5, 1 * SIZE(A2) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ LD a6, 2 * SIZE(A2) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ LD a7, 3 * SIZE(A2) ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ MUL alpha2, a4, tmp ++ fmov tmp, a4 ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ MUL alpha2, a5, tmp ++ fmov tmp, a5 ++ ADD y2, a2, tmp ++ fmov tmp, y2 ++ MUL alpha2, a6, tmp ++ fmov tmp, a6 ++ ADD y3, a3, tmp ++ fmov tmp, y3 ++ MUL alpha2, a7, tmp ++ fmov tmp, a7 ++ ++ ADD y0, a4, tmp ++ fmov tmp, y0 ++ ldi Y1, 4 * SIZE(Y1) ++ ADD y1, a5, tmp ++ fmov tmp, y1 ++ unop ++ ADD y2, a6, tmp ++ fmov tmp, y2 ++ unop ++ ADD y3, a7, tmp ++ fmov tmp, y3 ++ unop ++ ++ ST y0, -4 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, -3 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, -2 * SIZE(Y1) ++ ldi A3, 4 * SIZE(A3) ++ ST y3, -1 * SIZE(Y1) ++ ldi A4, 4 * SIZE(A4) ++ .align 4 ++ ++$L26: ++ and M, 2, I ++ ble I, $L27 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ MUL alpha2, a2, tmp ++ fmov tmp, a2 ++ MUL alpha2, a3, tmp ++ fmov tmp, a3 ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ ldi A1, 2 * SIZE(A1) ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ ldi A2, 2 * SIZE(A2) ++ ADD y0, a2, tmp ++ fmov tmp, y0 ++ unop ++ ADD y1, a3, tmp ++ fmov tmp, y1 ++ unop ++ ++ ST y0, 0 * SIZE(Y1) ++ unop ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 ++ ++$L27: ++ blbc M, $L30 ++ ++ LD y0, 0 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ MUL alpha2, a1, tmp ++ fmov tmp, a1 ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ ADD y0, a1, tmp ++ fmov tmp, y0 ++ ++ ST y0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L30: ++ blbc N, $L990 ++ ++ LD alpha1, 0 * SIZE(X) ++ mov A, A1 ++ MUL alpha, alpha1, tmp ++ fmov tmp, 
alpha1 ++ mov Y, Y1 ++ ++ sra M, 3, I ++ ble I, $L35 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ LD a4, 4 * SIZE(A1) ++ LD a5, 5 * SIZE(A1) ++ LD a6, 6 * SIZE(A1) ++ LD a7, 7 * SIZE(A1) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ LD y4, 4 * SIZE(Y1) ++ LD y5, 5 * SIZE(Y1) ++ LD y6, 6 * SIZE(Y1) ++ LD y7, 7 * SIZE(Y1) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ ++ ldi I, -1(I) ++ ble I, $L33 ++ .align 4 ++ ++$L32: ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, tmp ++ fmov tmp, a4 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, tmp ++ fmov tmp, a5 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD y2, a2, tmp ++ fmov tmp, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, tmp ++ fmov tmp, a6 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD y3, a3, tmp ++ fmov tmp, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, tmp ++ fmov tmp, a7 ++ LD a3, 11 * SIZE(A1) ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ ST y2, 2 * SIZE(Y1) ++ ST y3, 3 * SIZE(Y1) ++ ++ ADD y4, a4, tmp ++ fmov tmp, y4 ++ LD y0, 8 * SIZE(Y1) ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD a4, 12 * SIZE(A1) ++ ++ ADD y5, a5, tmp ++ fmov tmp, y5 ++ LD y1, 9 * SIZE(Y1) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ LD a5, 13 * SIZE(A1) ++ ++ ADD y6, a6, tmp ++ fmov tmp, y6 ++ LD y2, 10 * SIZE(Y1) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ LD a6, 14 * SIZE(A1) ++ ++ ADD y7, a7, tmp ++ fmov tmp, y7 ++ LD y3, 11 * SIZE(Y1) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ LD a7, 15 * SIZE(A1) ++ ++ ST y4, 4 * SIZE(Y1) ++ ldi I, -1(I) ++ ST y5, 5 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ++ ST y6, 6 * SIZE(Y1) ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L32 ++ .align 4 ++ ++$L33: ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, tmp ++ fmov tmp, a4 ++ unop ++ ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, tmp ++ fmov tmp, a5 ++ unop ++ ++ ADD y2, a2, tmp ++ fmov tmp, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, tmp ++ fmov tmp, a6 ++ unop ++ ++ ADD y3, a3, tmp ++ fmov tmp, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, tmp ++ fmov tmp, a7 ++ unop ++ ++ ADD y4, a4, tmp ++ fmov tmp, y4 ++ ST y0, 0 * SIZE(Y1) ++ ADD y5, a5, tmp ++ fmov tmp, y5 ++ ST y1, 1 * SIZE(Y1) ++ ADD y6, a6, tmp ++ fmov tmp, y6 ++ ST y2, 2 * SIZE(Y1) ++ ADD y7, a7, tmp ++ fmov tmp, y7 ++ ST y3, 3 * SIZE(Y1) ++ ++ ST y4, 4 * SIZE(Y1) ++ unop ++ ST y5, 5 * SIZE(Y1) ++ unop ++ ++ ST y6, 6 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L35: ++ and M, 4, I ++ ble I, $L36 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, tmp ++ fmov tmp, a2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, tmp ++ fmov tmp, a3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ ADD y2, a2, tmp ++ fmov tmp, y2 ++ ADD y3, a3, tmp ++ fmov tmp, y3 ++ ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, 2 * SIZE(Y1) ++ unop ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * 
SIZE(Y1) ++ .align 4 ++ ++$L36: ++ and M, 2, I ++ ble I, $L37 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a1, tmp ++ fmov tmp, a1 ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ ADD y1, a1, tmp ++ fmov tmp, y1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 2 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 ++ ++$L37: ++ blbc M, $L990 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD a0, 0 * SIZE(A1) ++ ++ MUL alpha1, a0, tmp ++ fmov tmp, a0 ++ ++ ADD y0, a0, tmp ++ fmov tmp, y0 ++ ST y0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L990: ++ cmpeq INCY, SIZE, $0 ++ bne $0, $L999 ++ ++ mov BUFFER, Y1 ++ ++ sra M, 3, I ++ ble I, $L995 ++ .align 4 ++ ++$L992: ++ LD a0, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a1, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a2, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a3, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ LD y2, 2 * SIZE(Y) ++ LD y3, 3 * SIZE(Y) ++ ++ LD a4, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a5, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a6, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a7, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y4, 4 * SIZE(Y) ++ LD y5, 5 * SIZE(Y) ++ LD y6, 6 * SIZE(Y) ++ LD y7, 7 * SIZE(Y) ++ ++ ADD a0, y0, tmp ++ fmov tmp, a0 ++ ADD a1, y1, tmp ++ fmov tmp, a1 ++ ADD a2, y2, tmp ++ fmov tmp, a2 ++ ADD a3, y3, tmp ++ fmov tmp, a3 ++ ADD a4, y4, tmp ++ fmov tmp, a4 ++ ADD a5, y5, tmp ++ fmov tmp, a5 ++ ADD a6, y6, tmp ++ fmov tmp, a6 ++ ADD a7, y7, tmp ++ fmov tmp, a7 ++ ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a1, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a2, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a3, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ST a4, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a5, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a6, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a7, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ ldi Y, 8 * SIZE(Y) ++ bgt I, $L992 ++ .align 4 ++ ++$L995: ++ and M, 7, I ++ ble I, $L999 ++ .align 4 ++ ++$L996: ++ LD a0, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ ldi Y, 1 * SIZE(Y) ++ ++ ADD a0, y0, tmp ++ fmov tmp, a0 ++ ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L996 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ fldd $f20, 64($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/gemv_n.S.bak b/kernel/sw_64/gemv_n.S.bak +new file mode 100644 +index 0000000..f90abdf +--- /dev/null ++++ b/kernel/sw_64/gemv_n.S.bak +@@ -0,0 +1,1307 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $20 ++#define LDA $21 ++ ++#define X $18 ++#define INCX $19 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define Y1 $4 ++ ++#define A1 $5 ++#define A2 $6 ++#define A3 $7 ++#define A4 $8 ++ ++#define alpha $f19 ++ ++#define alpha1 $f0 ++#define alpha2 $f1 ++#define alpha3 $f10 ++#define alpha4 $f11 ++ ++#define y0 $f12 ++#define y1 $f13 ++#define y2 $f14 ++#define y3 $f15 ++ ++#define y4 $f16 ++#define y5 $f17 ++#define y6 $f18 ++#define y7 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl X, 0 + STACKSIZE($sp) ++ ldl INCX, 8 + STACKSIZE($sp) ++ ldl Y, 16 + STACKSIZE($sp) ++ ldl INCY, 24 + STACKSIZE($sp) ++ ldl BUFFER, 32 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ SXADDQ INCX, 0, INCX ++ cmple N, 0, $1 ++ SXADDQ INCY, 0, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ SXADDQ LDA, 0, LDA ++ ++ cmpeq INCY, SIZE, $0 ++ bne $0, $L10 ++ ++ mov BUFFER, Y1 ++ ++ mov Y, BUFFER ++ mov Y1, Y ++ ++ sra M, 3, I ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ ST $f31, 2 * SIZE(Y1) ++ ST $f31, 3 * SIZE(Y1) ++ ST $f31, 4 * SIZE(Y1) ++ ST $f31, 5 * SIZE(Y1) ++ ST $f31, 6 * SIZE(Y1) ++ ST $f31, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ ldi I, -1(I) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 7, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ ST $f31, 0 * SIZE(Y1) ++ addl Y1, SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ sra 
N, 2, J ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ LD alpha1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha3, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha4, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ MUL alpha, alpha1, alpha1 ++ MUL alpha, alpha2, alpha2 ++ MUL alpha, alpha3, alpha3 ++ MUL alpha, alpha4, alpha4 ++ ++ mov A, A1 ++ addl A, LDA, A2 ++ addl A2, LDA, A3 ++ addl A3, LDA, A4 ++ s4addl LDA, A, A ++ ++ mov Y, Y1 ++ fillcs 4 * SIZE(X) ++ ++ sra M, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a8, 0 * SIZE(A3) ++ LD a9, 1 * SIZE(A3) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 3 * SIZE(A3) ++ ++ LD y4, 4 * SIZE(Y1) ++ LD y5, 5 * SIZE(Y1) ++ LD y6, 6 * SIZE(Y1) ++ LD y7, 7 * SIZE(Y1) ++ ++ MUL alpha1, a0, a0 ++ LD a12, 0 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ LD a13, 1 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ LD a14, 2 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ LD a15, 3 * SIZE(A4) ++ ++ ADD y0, a0, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, a4 ++ unop ++ ++ ADD y1, a1, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, a5 ++ unop ++ ++ ADD y2, a2, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, a6 ++ unop ++ ++ ADD y3, a3, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, a7 ++ unop ++ ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha3, a8, a8 ++ unop ++ ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha3, a9, a9 ++ ldi I, -1(I) ++ ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha3, a10, a10 ++ unop ++ ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha3, a11, a11 ++ unop ++ ++ ADD y0, a8, y0 ++ LD a8, 4 * SIZE(A3) ++ MUL alpha4, a12, a12 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD y1, a9, y1 ++ LD a9, 5 * SIZE(A3) ++ MUL alpha4, a13, a13 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ++ ADD y2, a10, y2 ++ LD a10, 6 * SIZE(A3) ++ MUL alpha4, a14, a14 ++ unop ++ ++ ADD y3, a11, y3 ++ LD a11, 7 * SIZE(A3) ++ MUL alpha4, a15, a15 ++ ldi I, -1(I) ++ ++ ADD y0, a12, y0 ++ LD a12, 4 * SIZE(A4) ++ MUL alpha1, a0, a0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++ ++ ADD y1, a13, y1 ++ LD a13, 5 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ unop ++ ++ ADD y2, a14, y2 ++ LD a14, 6 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ unop ++ ++ ADD y3, a15, y3 ++ LD a15, 7 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ ++ ADD y4, a0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD y5, a1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD y6, a2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD y7, a3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD y4, a4, y4 ++ LD a4, 8 * SIZE(A2) ++ MUL alpha3, a8, a8 ++ LD y0, 8 * SIZE(Y1) ++ ++ ADD y5, a5, y5 ++ LD a5, 9 * SIZE(A2) ++ MUL alpha3, a9, a9 ++ LD y1, 9 * SIZE(Y1) ++ ++ ADD y6, a6, y6 ++ LD a6, 10 * SIZE(A2) ++ MUL alpha3, a10, a10 ++ LD y2, 10 * SIZE(Y1) ++ ++ ADD y7, a7, y7 ++ LD a7, 11 * SIZE(A2) ++ MUL alpha3, a11, a11 ++ LD y3, 11 * SIZE(Y1) ++ ++ ADD y4, a8, y4 ++ LD a8, 8 * SIZE(A3) ++ MUL alpha4, a12, a12 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A3) ++ ++ ADD y5, a9, y5 ++ LD a9, 9 * SIZE(A3) ++ MUL alpha4, a13, a13 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD y6, a10, y6 ++ LD a10, 10 * SIZE(A3) ++ MUL alpha4, a14, a14 ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD y7, 
a11, y7 ++ LD a11, 11 * SIZE(A3) ++ MUL alpha4, a15, a15 ++ ldi Y1, 8 * SIZE(Y1) ++ ++ ADD y4, a12, y4 ++ LD a12, 8 * SIZE(A4) ++ MUL alpha1, a0, a0 ++ unop ++ ++ ADD y5, a13, y5 ++ LD a13, 9 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ ldi A3, 8 * SIZE(A3) ++ ++ ADD y6, a14, y6 ++ LD a14, 10 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A4) ++ ++ ADD y7, a15, y7 ++ LD a15, 11 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ ldi A4, 8 * SIZE(A4) ++ ++ ADD y0, a0, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, a4 ++ ST y4, -4 * SIZE(Y1) ++ ++ ADD y1, a1, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, a5 ++ ST y5, -3 * SIZE(Y1) ++ ++ ADD y2, a2, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, a6 ++ ST y6, -2 * SIZE(Y1) ++ ++ ADD y3, a3, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, a7 ++ ST y7, -1 * SIZE(Y1) ++ ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha3, a8, a8 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha3, a9, a9 ++ LD y5, 5 * SIZE(Y1) ++ ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha3, a10, a10 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha3, a11, a11 ++ LD y7, 7 * SIZE(Y1) ++ ++ ADD y0, a8, y0 ++ LD a8, 4 * SIZE(A3) ++ MUL alpha4, a12, a12 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD y1, a9, y1 ++ LD a9, 5 * SIZE(A3) ++ MUL alpha4, a13, a13 ++ unop ++ ++ ADD y2, a10, y2 ++ LD a10, 6 * SIZE(A3) ++ MUL alpha4, a14, a14 ++ unop ++ ++ ADD y3, a11, y3 ++ LD a11, 7 * SIZE(A3) ++ MUL alpha4, a15, a15 ++ unop ++ ++ ADD y0, a12, y0 ++ LD a12, 4 * SIZE(A4) ++ MUL alpha1, a0, a0 ++ unop ++ ++ ADD y1, a13, y1 ++ LD a13, 5 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ unop ++ ++ ADD y2, a14, y2 ++ LD a14, 6 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ unop ++ ++ ADD y3, a15, y3 ++ LD a15, 7 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ unop ++ ++ ST y0, 0 * SIZE(Y1) ++ ADD y4, a0, y4 ++ unop ++ MUL alpha2, a4, a4 ++ ++ ST y1, 1 * SIZE(Y1) ++ ADD y5, a1, y5 ++ unop ++ MUL alpha2, a5, a5 ++ ++ ST y2, 2 * SIZE(Y1) ++ ADD y6, a2, y6 ++ unop ++ MUL alpha2, a6, a6 ++ ++ ST y3, 3 * SIZE(Y1) ++ ADD y7, a3, y7 ++ ldi Y1, 8 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ ++ ADD y4, a4, y4 ++ MUL alpha3, a8, a8 ++ ADD y5, a5, y5 ++ MUL alpha3, a9, a9 ++ ADD y6, a6, y6 ++ MUL alpha3, a10, a10 ++ ADD y7, a7, y7 ++ MUL alpha3, a11, a11 ++ ++ ADD y4, a8, y4 ++ MUL alpha4, a12, a12 ++ ADD y5, a9, y5 ++ MUL alpha4, a13, a13 ++ ADD y6, a10, y6 ++ MUL alpha4, a14, a14 ++ ADD y7, a11, y7 ++ MUL alpha4, a15, a15 ++ ++ ADD y4, a12, y4 ++ ADD y5, a13, y5 ++ ADD y6, a14, y6 ++ ADD y7, a15, y7 ++ ++ ST y4, -4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, -3 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) ++ ST y6, -2 * SIZE(Y1) ++ ldi A3, 8 * SIZE(A3) ++ ST y7, -1 * SIZE(Y1) ++ ldi A4, 8 * SIZE(A4) ++ .align 4 ++ ++$L15: ++ and M, 4, I ++ ble I, $L16 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 0 * SIZE(A3) ++ LD a9, 1 * SIZE(A3) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 3 * SIZE(A3) ++ ++ MUL alpha1, a0, a0 ++ LD a12, 0 * SIZE(A4) ++ MUL alpha1, a1, a1 ++ LD a13, 1 * SIZE(A4) ++ MUL alpha1, a2, a2 ++ LD a14, 2 * SIZE(A4) ++ MUL alpha1, a3, a3 ++ LD a15, 3 * SIZE(A4) ++ ++ ADD y0, a0, y0 ++ MUL alpha2, a4, a4 ++ ADD y1, a1, y1 ++ MUL alpha2, a5, a5 ++ ADD y2, a2, y2 ++ MUL alpha2, a6, a6 ++ ADD y3, a3, y3 ++ MUL alpha2, a7, a7 ++ ++ ADD y0, a4, y0 ++ MUL 
alpha3, a8, a8 ++ ADD y1, a5, y1 ++ MUL alpha3, a9, a9 ++ ADD y2, a6, y2 ++ MUL alpha3, a10, a10 ++ ADD y3, a7, y3 ++ MUL alpha3, a11, a11 ++ ++ ADD y0, a8, y0 ++ MUL alpha4, a12, a12 ++ ADD y1, a9, y1 ++ MUL alpha4, a13, a13 ++ ADD y2, a10, y2 ++ MUL alpha4, a14, a14 ++ ADD y3, a11, y3 ++ MUL alpha4, a15, a15 ++ ++ ADD y0, a12, y0 ++ ldi Y1, 4 * SIZE(Y1) ++ ADD y1, a13, y1 ++ unop ++ ++ ADD y2, a14, y2 ++ unop ++ ADD y3, a15, y3 ++ unop ++ ++ ST y0, -4 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, -3 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, -2 * SIZE(Y1) ++ ldi A3, 4 * SIZE(A3) ++ ST y3, -1 * SIZE(Y1) ++ ldi A4, 4 * SIZE(A4) ++ .align 4 ++ ++$L16: ++ and M, 2, I ++ ble I, $L17 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(A3) ++ MUL alpha1, a0, a0 ++ LD a5, 1 * SIZE(A3) ++ MUL alpha1, a1, a1 ++ LD a6, 0 * SIZE(A4) ++ MUL alpha2, a2, a2 ++ LD a7, 1 * SIZE(A4) ++ MUL alpha2, a3, a3 ++ ++ ADD y0, a0, y0 ++ MUL alpha3, a4, a4 ++ ADD y1, a1, y1 ++ MUL alpha3, a5, a5 ++ ADD y0, a2, y0 ++ MUL alpha4, a6, a6 ++ ADD y1, a3, y1 ++ MUL alpha4, a7, a7 ++ ++ ADD y0, a4, y0 ++ ldi A1, 2 * SIZE(A1) ++ ADD y1, a5, y1 ++ ldi A2, 2 * SIZE(A2) ++ ADD y0, a6, y0 ++ ldi A3, 2 * SIZE(A3) ++ ADD y1, a7, y1 ++ ldi A4, 2 * SIZE(A4) ++ ++ ST y0, 0 * SIZE(Y1) ++ unop ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 ++ ++$L17: ++ blbc M, $L18 ++ ++ LD y0, 0 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ ++ MUL alpha1, a0, a0 ++ MUL alpha2, a1, a1 ++ MUL alpha3, a2, a2 ++ MUL alpha4, a3, a3 ++ ++ ADD y0, a0, y0 ++ ADD y0, a1, y0 ++ ADD y0, a2, y0 ++ ADD y0, a3, y0 ++ ++ ST y0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L18: ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ and N, 2, J ++ ble J, $L30 ++ ++ LD alpha1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD alpha2, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ mov A, A1 ++ MUL alpha, alpha1, alpha1 ++ addl A, LDA, A2 ++ MUL alpha, alpha2, alpha2 ++ ++ addl A2, LDA, A ++ mov Y, Y1 ++ ++ sra M, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ MUL alpha1, a0, a0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a2, a2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a3, a3 ++ LD y7, 7 * SIZE(Y1) ++ ++ ADD y0, a0, y0 ++ LD a0, 4 * SIZE(A1) ++ MUL alpha2, a4, a4 ++ ++ ADD y1, a1, y1 ++ LD a1, 5 * SIZE(A1) ++ MUL alpha2, a5, a5 ++ ++ ADD y2, a2, y2 ++ LD a2, 6 * SIZE(A1) ++ MUL alpha2, a6, a6 ++ ++ ADD y3, a3, y3 ++ LD a3, 7 * SIZE(A1) ++ MUL alpha2, a7, a7 ++ ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha1, a0, a0 ++ ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ldi I, -1(I) ++ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD y4, a0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD y5, a1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD y6, a2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, 
a6, a6 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD y7, a3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD y4, a4, y4 ++ LD a4, 0 * SIZE(A2) ++ MUL alpha1, a0, a0 ++ LD y0, 8 * SIZE(Y1) ++ ++ ADD y5, a5, y5 ++ LD a5, 1 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ LD y1, 9 * SIZE(Y1) ++ ++ ADD y6, a6, y6 ++ LD a6, 2 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ LD y2, 10 * SIZE(Y1) ++ ++ ADD y7, a7, y7 ++ LD a7, 3 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ LD y3, 11 * SIZE(Y1) ++ ++ ADD y0, a0, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ LD a0, 12 * SIZE(A1) ++ ++ ADD y1, a1, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ LD a1, 13 * SIZE(A1) ++ ++ ADD y2, a2, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ LD a2, 14 * SIZE(A1) ++ ++ ADD y3, a3, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ LD a3, 15 * SIZE(A1) ++ ++ ADD y0, a4, y0 ++ LD a4, 4 * SIZE(A2) ++ MUL alpha1, a0, a0 ++ LD y4, 12 * SIZE(Y1) ++ ++ ADD y1, a5, y1 ++ LD a5, 5 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ LD y5, 13 * SIZE(Y1) ++ ++ ADD y2, a6, y2 ++ LD a6, 6 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ LD y6, 14 * SIZE(Y1) ++ ++ ADD y3, a7, y3 ++ LD a7, 7 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ LD y7, 15 * SIZE(Y1) ++ ++ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD y4, a0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a4, a4 ++ unop ++ ++ ADD y5, a1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a5, a5 ++ unop ++ ++ ADD y6, a2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a6, a6 ++ unop ++ ++ ADD y7, a3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a7, a7 ++ unop ++ ++ ADD y4, a4, y4 ++ ADD y5, a5, y5 ++ ADD y6, a6, y6 ++ ADD y7, a7, y7 ++ ++ ST y4, 4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, 5 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) ++ ++ ST y6, 6 * SIZE(Y1) ++ unop ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L25: ++ and M, 4, I ++ ble I, $L26 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ MUL alpha1, a0, a0 ++ LD a4, 0 * SIZE(A2) ++ MUL alpha1, a1, a1 ++ LD a5, 1 * SIZE(A2) ++ MUL alpha1, a2, a2 ++ LD a6, 2 * SIZE(A2) ++ MUL alpha1, a3, a3 ++ LD a7, 3 * SIZE(A2) ++ ++ ADD y0, a0, y0 ++ MUL alpha2, a4, a4 ++ ADD y1, a1, y1 ++ MUL alpha2, a5, a5 ++ ADD y2, a2, y2 ++ MUL alpha2, a6, a6 ++ ADD y3, a3, y3 ++ MUL alpha2, a7, a7 ++ ++ ADD y0, a4, y0 ++ ldi Y1, 4 * SIZE(Y1) ++ ADD y1, a5, y1 ++ unop ++ ADD y2, a6, y2 ++ unop ++ ADD y3, a7, y3 ++ unop ++ ++ ST y0, -4 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, -3 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, -2 * SIZE(Y1) ++ ldi A3, 4 * SIZE(A3) ++ ST y3, -1 * SIZE(Y1) ++ ldi A4, 4 * SIZE(A4) ++ .align 4 ++ ++$L26: ++ and M, 2, I ++ ble I, $L27 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ ++ MUL alpha1, a0, a0 ++ MUL alpha1, a1, a1 ++ MUL alpha2, a2, a2 ++ MUL alpha2, a3, a3 ++ ++ ADD y0, a0, y0 ++ ldi A1, 2 * SIZE(A1) ++ ADD y1, a1, y1 ++ ldi A2, 2 * SIZE(A2) ++ ADD y0, a2, y0 ++ unop ++ ADD y1, a3, y1 ++ unop ++ ++ ST y0, 0 * SIZE(Y1) ++ unop ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 ++ ++$L27: ++ blbc M, $L30 ++ ++ LD y0, 0 * SIZE(Y1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ ++ MUL alpha1, a0, a0 ++ MUL alpha2, a1, a1 ++ ++ ADD y0, a0, y0 ++ ADD y0, a1, y0 ++ ++ ST y0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L30: ++ blbc N, $L990 
++ ++ LD alpha1, 0 * SIZE(X) ++ mov A, A1 ++ MUL alpha, alpha1, alpha1 ++ mov Y, Y1 ++ ++ sra M, 3, I ++ ble I, $L35 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ LD a4, 4 * SIZE(A1) ++ LD a5, 5 * SIZE(A1) ++ LD a6, 6 * SIZE(A1) ++ LD a7, 7 * SIZE(A1) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ LD y4, 4 * SIZE(Y1) ++ LD y5, 5 * SIZE(Y1) ++ LD y6, 6 * SIZE(Y1) ++ LD y7, 7 * SIZE(Y1) ++ ++ MUL alpha1, a0, a0 ++ MUL alpha1, a1, a1 ++ MUL alpha1, a2, a2 ++ MUL alpha1, a3, a3 ++ ++ ldi I, -1(I) ++ ble I, $L33 ++ .align 4 ++ ++$L32: ++ ADD y0, a0, y0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, a4 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD y1, a1, y1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, a5 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD y2, a2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, a6 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD y3, a3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, a7 ++ LD a3, 11 * SIZE(A1) ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ ST y2, 2 * SIZE(Y1) ++ ST y3, 3 * SIZE(Y1) ++ ++ ADD y4, a4, y4 ++ LD y0, 8 * SIZE(Y1) ++ MUL alpha1, a0, a0 ++ LD a4, 12 * SIZE(A1) ++ ++ ADD y5, a5, y5 ++ LD y1, 9 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ LD a5, 13 * SIZE(A1) ++ ++ ADD y6, a6, y6 ++ LD y2, 10 * SIZE(Y1) ++ MUL alpha1, a2, a2 ++ LD a6, 14 * SIZE(A1) ++ ++ ADD y7, a7, y7 ++ LD y3, 11 * SIZE(Y1) ++ MUL alpha1, a3, a3 ++ LD a7, 15 * SIZE(A1) ++ ++ ST y4, 4 * SIZE(Y1) ++ ldi I, -1(I) ++ ST y5, 5 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ++ ST y6, 6 * SIZE(Y1) ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L32 ++ .align 4 ++ ++$L33: ++ ADD y0, a0, y0 ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, a4 ++ unop ++ ++ ADD y1, a1, y1 ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, a5 ++ unop ++ ++ ADD y2, a2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, a6 ++ unop ++ ++ ADD y3, a3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, a7 ++ unop ++ ++ ADD y4, a4, y4 ++ ST y0, 0 * SIZE(Y1) ++ ADD y5, a5, y5 ++ ST y1, 1 * SIZE(Y1) ++ ADD y6, a6, y6 ++ ST y2, 2 * SIZE(Y1) ++ ADD y7, a7, y7 ++ ST y3, 3 * SIZE(Y1) ++ ++ ST y4, 4 * SIZE(Y1) ++ unop ++ ST y5, 5 * SIZE(Y1) ++ unop ++ ++ ST y6, 6 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L35: ++ and M, 4, I ++ ble I, $L36 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ MUL alpha1, a0, a0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, a2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, a3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD y0, a0, y0 ++ ADD y1, a1, y1 ++ ADD y2, a2, y2 ++ ADD y3, a3, y3 ++ ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ST y2, 2 * SIZE(Y1) ++ unop ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 ++ ++$L36: ++ and M, 2, I ++ ble I, $L37 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a0, a0 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a1, a1 ++ ++ ADD y0, a0, y0 ++ ADD y1, a1, y1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 2 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ .align 4 ++ ++$L37: ++ blbc M, $L990 ++ ++ LD y0, 0 * SIZE(Y1) ++ LD a0, 0 * SIZE(A1) ++ ++ MUL alpha1, a0, a0 ++ ++ ADD y0, a0, y0 ++ ST y0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L990: ++ cmpeq INCY, SIZE, $0 ++ bne $0, $L999 ++ ++ mov BUFFER, Y1 ++ ++ sra M, 3, I ++ ble I, $L995 ++ 
.align 4 ++ ++$L992: ++ LD a0, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a1, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a2, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a3, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ LD y2, 2 * SIZE(Y) ++ LD y3, 3 * SIZE(Y) ++ ++ LD a4, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a5, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a6, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a7, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y4, 4 * SIZE(Y) ++ LD y5, 5 * SIZE(Y) ++ LD y6, 6 * SIZE(Y) ++ LD y7, 7 * SIZE(Y) ++ ++ ADD a0, y0, a0 ++ ADD a1, y1, a1 ++ ADD a2, y2, a2 ++ ADD a3, y3, a3 ++ ADD a4, y4, a4 ++ ADD a5, y5, a5 ++ ADD a6, y6, a6 ++ ADD a7, y7, a7 ++ ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a1, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a2, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a3, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ST a4, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a5, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a6, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a7, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ ldi Y, 8 * SIZE(Y) ++ bgt I, $L992 ++ .align 4 ++ ++$L995: ++ and M, 7, I ++ ble I, $L999 ++ .align 4 ++ ++$L996: ++ LD a0, 0 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ ldi Y, 1 * SIZE(Y) ++ ++ ADD a0, y0, a0 ++ ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L996 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/gemv_t.S b/kernel/sw_64/gemv_t.S +new file mode 100644 +index 0000000..4d8f130 +--- /dev/null ++++ b/kernel/sw_64/gemv_t.S +@@ -0,0 +1,1222 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. 
*/ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define STACKSIZE 72 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $20 ++#define LDA $21 ++ ++#define X $18 ++#define INCX $19 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define X1 $3 ++#define Y1 $4 ++ ++#define A1 $5 ++#define A2 $6 ++#define A3 $7 ++#define A4 $8 ++ ++#define alpha $f19 ++#define f20 $f20 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl X, 0 + STACKSIZE($sp) ++ ldl INCX, 8 + STACKSIZE($sp) ++ ldl Y, 16 + STACKSIZE($sp) ++ ldl INCY, 24 + STACKSIZE($sp) ++ ldl BUFFER, 32 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ fstd f20, 64($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ SXADDQ INCX, 0, INCX ++ cmple N, 0, $1 ++ SXADDQ INCY, 0, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ cmpeq INCX, SIZE, $0 ++ mov X, X1 ++ SXADDQ LDA, 0, LDA ++ bne $0, $L10 ++ ++ sra M, 3, I ++ mov BUFFER, Y1 ++ mov BUFFER, X ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(X1) ++ ldi I, -1(I) ++ ++ LD a0, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a1, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a2, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a3, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ST a2, 2 * SIZE(Y1) ++ ST a3, 3 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a5, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a6, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a7, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a4, 4 * SIZE(Y1) ++ ST a5, 5 * SIZE(Y1) ++ ST a6, 6 * SIZE(Y1) ++ ST a7, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 7, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ LD a0, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ mov Y, Y1 ++ fclr t0 ++ unop ++ fclr t1 ++ ++ sra N, 2, J ++ fclr t2 ++ fclr t3 ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ mov A, A1 ++ fclr s0 ++ addl A, LDA, A2 ++ fclr s1 ++ ++ addl A2, LDA, A3 ++ fclr s2 ++ addl A3, LDA, A4 ++ fclr s3 ++ ++ s4addl LDA, A, A ++ unop ++ mov X, X1 ++ flds $f31, 3 * SIZE(Y) ++ ++ sra M, 3, I ++ ble I, $L15 ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ LD a4, 1 * SIZE(A1) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 1 * SIZE(A3) ++ LD a7, 1 * SIZE(A4) ++ LD a8, 2 * SIZE(A1) ++ LD a9, 2 * SIZE(A2) ++ LD 
a10, 2 * SIZE(A3) ++ LD a11, 2 * SIZE(A4) ++ LD a12, 3 * SIZE(A1) ++ LD a13, 3 * SIZE(A2) ++ LD a14, 3 * SIZE(A3) ++ LD a15, 3 * SIZE(A4) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 4 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ unop ++ MUL x0, a2, t2 ++ LD a2, 4 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20, s3 ++ LD a0, 4 * SIZE(A1) ++ unop ++ MUL x0, a3, t3 ++ LD a3, 4 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a4, t0 ++ ++ ADD s1, t1, f20 ++ fmov f20, s1 ++ LD a4, 5 * SIZE(A1) ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a5, t1 ++ LD a5, 5 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ #unop ++ MUL x1, a6, t2 ++ LD a6, 5 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ #unop ++ MUL x1, a7, t3 ++ LD a7, 5 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, -2 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x2, a9, t1 ++ LD a9, 6 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ ldi A2, 8 * SIZE(A2) ++ MUL x2, a10, t2 ++ LD a10, 6 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ ldi A3, 8 * SIZE(A3) ++ MUL x2, a11, t3 ++ LD a11, 6 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, -1 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldi A4, 8 * SIZE(A4) ++ MUL x3, a13, t1 ++ LD a13, -1 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ unop ++ MUL x3, a14, t2 ++ LD a14, -1 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ unop ++ MUL x3, a15, t3 ++ LD a15, -1 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 0 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldw $31, (PREFETCHSIZE - 8) * SIZE(A3) ++ MUL x0, a1, t1 ++ LD a1, 0 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ unop ++ MUL x0, a2, t2 ++ LD a2, 0 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ unop ++ MUL x0, a3, t3 ++ LD a3, 0 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x0, 8 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 1 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ unop ++ MUL x1, a5, t1 ++ LD a5, 1 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 1 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ unop ++ MUL x1, a7, t3 ++ LD a7, 1 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x1, 9 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, 2 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldw $31, (PREFETCHSIZE - 8) * SIZE(A4) ++ MUL x2, a9, t1 ++ LD a9, 2 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ ldi X1, 8 * SIZE(X1) ++ MUL x2, a10, t2 ++ LD a10, 2 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ ldi I, -1(I) ++ MUL x2, a11, t3 ++ LD a11, 2 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x2, 2 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, 3 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldw $31, (PREFETCHSIZE - 8) * SIZE(X1) ++ MUL x3, a13, t1 ++ LD a13, 3 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ unop ++ MUL x3, a14, t2 ++ LD a14, 3 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ MUL x3, a15, t3 ++ LD a15, 3 * SIZE(A4) ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 4 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ #unop ++ MUL x0, a1, t1 ++ LD 
a1, 4 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ #unop ++ MUL x0, a2, t2 ++ LD a2, 4 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ #unop ++ MUL x0, a3, t3 ++ LD a3, 4 * SIZE(A4) ++ ++ ADD s0, t0, x0 ++ fmov x0,s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 5 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ #unop ++ MUL x1, a5, t1 ++ LD a5, 5 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ #unop ++ MUL x1, a6, t2 ++ LD a6, 5 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ #unop ++ MUL x1, a7, t3 ++ LD a7, 5 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, 6 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ #unop ++ MUL x2, a9, t1 ++ LD a9, 6 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ #unop ++ MUL x2, a10, t2 ++ LD a10, 6 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ #unop ++ MUL x2, a11, t3 ++ LD a11, 6 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, 7 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x3, a13, t1 ++ LD a13, 7 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ ldi A2, 8 * SIZE(A2) ++ MUL x3, a14, t2 ++ LD a14, 7 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ ldi A3, 8 * SIZE(A3) ++ MUL x3, a15, t3 ++ LD a15, 7 * SIZE(A4) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a0, t0 ++ unop ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x0, a1, t1 ++ ldi A4, 8 * SIZE(A4) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ MUL x0, a2, t2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ MUL x0, a3, t3 ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ MUL x1, a4, t0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x1, a5, t1 ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ MUL x1, a6, t2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ MUL x1, a7, t3 ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ MUL x2, a8, t0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x2, a9, t1 ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ MUL x2, a10, t2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ MUL x2, a11, t3 ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ MUL x3, a12, t0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x3, a13, t1 ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ MUL x3, a14, t2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ MUL x3, a15, t3 ++ .align 4 ++ ++$L15: ++ and M, 7, I ++ ble I, $L18 ++ ++ LD x0, 0 * SIZE(X1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ ++ ldi I, -1(I) ++ ble I, $L17 ++ .align 4 ++ ++$L16: ++ ADD s0, t0,f20 ++ fmov f20,s0 ++ ldi A4, 1 * SIZE(A4) ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldi A1, 1 * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 1 * SIZE(A2) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ ldi A2, 1 * SIZE(A2) ++ MUL x0, a2, t2 ++ LD a2, 1 * SIZE(A3) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ ldi A3, 1 * SIZE(A3) ++ MUL x0, a3, t3 ++ LD a3, 0 * SIZE(A4) ++ ++ LD x0, 1 * SIZE(X1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L16 ++ .align 4 ++ ++$L17: ++ ADD s0, t0,f20 ++ fmov f20,s0 ++ MUL x0, a0, t0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x0, a1, t1 ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ MUL x0, a2, t2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ MUL x0, a3, t3 ++ .align 4 ++ ++$L18: ++ LD a0, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a1, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a2, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a3, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ ADD s0, t0,f20 ++ fmov f20,s0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ 
++ MUL alpha, s0,f20 ++ fmov f20,s0 ++ MUL alpha, s1, f20 ++ fmov f20,s1 ++ MUL alpha, s2, f20 ++ fmov f20,s2 ++ MUL alpha, s3, f20 ++ fmov f20,s3 ++ ++ ADD a0, s0,f20 ++ fmov f20,a0 ++ fclr t0 ++ ADD a1, s1, f20 ++ fmov f20,a1 ++ fclr t1 ++ ADD a2, s2, f20 ++ fmov f20,a2 ++ fclr t2 ++ ADD a3, s3, f20 ++ fmov f20,a3 ++ fclr t3 ++ ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a1, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a2, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a3, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ and N, 2, J ++ ble J, $L30 ++ mov A, A1 ++ addl A, LDA, A2 ++ ++ addl A2, LDA, A ++ fclr s0 ++ mov X, X1 ++ fclr s1 ++ ++ sra M, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 1 * SIZE(A1) ++ LD a3, 1 * SIZE(A2) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 2 * SIZE(A2) ++ LD a6, 3 * SIZE(A1) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 4 * SIZE(A1) ++ LD a9, 4 * SIZE(A2) ++ LD a10, 5 * SIZE(A1) ++ LD a11, 5 * SIZE(A2) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 6 * SIZE(A2) ++ LD a14, 7 * SIZE(A1) ++ LD a15, 7 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, x3 ++ fmov x3,s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 8 * SIZE(A2) ++ ++ ADD s0, t2, x0 ++ fmov x0,s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a2, t2 ++ LD a2, 9 * SIZE(A1) ++ ++ ADD s1, t3, f20 ++ fmov f20,s1 ++ #unop ++ MUL x1, a3, t3 ++ LD a3, 9 * SIZE(A2) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a4, t0 ++ LD a4, 10 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldi I, -1(I) ++ MUL x2, a5, t1 ++ LD a5, 10 * SIZE(A2) ++ ++ ADD s0, t2, f20 ++ fmov f20,s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a6, t2 ++ LD a6, 11 * SIZE(A1) ++ ++ ADD s1, t3, f20 ++ fmov f20,s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a7, t3 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x3, -1 * SIZE(X1) ++ MUL x0, a8, t0 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x0, a9, t1 ++ LD a9, 12 * SIZE(A2) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x0, 0 * SIZE(X1) ++ MUL x1, a10, t0 ++ LD a10, 13 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a11, t1 ++ LD a11, 13 * SIZE(A2) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x1, 1 * SIZE(X1) ++ MUL x2, a12, t0 ++ LD a12, 6 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x2, a13, t1 ++ LD a13, 14 * SIZE(A2) ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x2, 2 * SIZE(X1) ++ MUL x3, a14, t0 ++ LD a14, 7 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x3, a15, t1 ++ LD a15, 7 * SIZE(A2) ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ unop ++ MUL x0, a1, t1 ++ unop ++ ++ ADD s0, t2, f20 ++ fmov f20,s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a2, t2 ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD s1, t3, f20 ++ fmov f20,s1 ++ unop ++ MUL x1, a3, t3 ++ unop ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a4, t0 ++ unop ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ unop ++ MUL x2, a5, t1 ++ unop ++ ++ ADD s0, t2, f20 ++ fmov f20,s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a6, t2 ++ unop ++ ++ ADD s1, t3, f20 ++ fmov f20,s1 ++ unop ++ MUL 
x3, a7, t3 ++ unop ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a8, t0 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ unop ++ MUL x0, a9, t1 ++ unop ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ MUL x1, a10, t0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x1, a11, t1 ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ MUL x2, a12, t0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x2, a13, t1 ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ MUL x3, a14, t0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x3, a15, t1 ++ .align 4 ++ ++$L25: ++ and M, 7, I ++ ble I, $L28 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L27 ++ .align 4 ++ ++$L26: ++ ADD s0, t0,f20 ++ fmov f20,s0 ++ ldi A2, 1 * SIZE(A2) ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ ++ ADD s1, t1,f20 ++ fmov f20,s1 ++ ldi A1, 1 * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 0 * SIZE(A2) ++ ++ LD x0, 1 * SIZE(X1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++$L27: ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ MUL x0, a0, t0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL x0, a1, t1 ++ .align 4 ++ ++$L28: ++ LD a0, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a1, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ ++ ADD s0, s2, f20 ++ fmov f20,s0 ++ ADD s1, s3, f20 ++ fmov f20,s1 ++ ++ MUL alpha, s0, f20 ++ fmov f20,s0 ++ MUL alpha, s1,f20 ++ fmov f20,s1 ++ ++ ADD a0, s0, f20 ++ fmov f20,a0 ++ ADD a1, s1, f20 ++ fmov f20,a1 ++ ++ ST a0, 0 * SIZE(Y1) ++ fclr t0 ++ addl Y1, INCY, Y1 ++ fclr t1 ++ ++ ST a1, 0 * SIZE(Y1) ++ fclr t2 ++ addl Y1, INCY, Y1 ++ fclr t3 ++ .align 4 ++ ++$L30: ++ blbc N, $L999 ++ ++ mov A, A1 ++ fclr s0 ++ mov X, X1 ++ fclr s1 ++ ++ sra M, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $L35 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a8, 0 * SIZE(X1) ++ LD a9, 1 * SIZE(X1) ++ ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ LD a10, 2 * SIZE(X1) ++ LD a11, 3 * SIZE(X1) ++ ++ LD a4, 4 * SIZE(A1) ++ LD a5, 5 * SIZE(A1) ++ LD a12, 4 * SIZE(X1) ++ LD a13, 5 * SIZE(X1) ++ ++ LD a6, 6 * SIZE(A1) ++ LD a7, 7 * SIZE(A1) ++ LD a14, 6 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L33 ++ .align 4 ++ ++$L32: ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD a15, 7 * SIZE(X1) ++ MUL a0, a8, f20 ++ fmov f20,t0 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ LD a8, 8 * SIZE(X1) ++ MUL a1, a9, t1 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ LD a9, 9 * SIZE(X1) ++ MUL a2, a10, t2 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ LD a10, 10 * SIZE(X1) ++ MUL a3, a11, t3 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD a11, 11 * SIZE(X1) ++ MUL a4, a12, t0 ++ LD a4, 12 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ LD a12, 12 * SIZE(X1) ++ MUL a5, a13, t1 ++ LD a5, 13 * SIZE(A1) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ LD a13, 13 * SIZE(X1) ++ MUL a6, a14, t2 ++ LD a6, 14 * SIZE(A1) ++ ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ LD a14, 14 * SIZE(X1) ++ MUL a7, a15, t3 ++ LD a7, 15 * SIZE(A1) ++ ++ ldi A1, 8 * SIZE(A1) ++ ldi I, -1(I) ++ ldi X1, 8 * SIZE(X1) ++ bgt I, $L32 ++ .align 4 ++ ++$L33: ++ ADD s0, t0, f20 ++ fmov f20,s0 ++ LD a15, 7 * SIZE(X1) ++ MUL a0, a8, t0 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ unop ++ MUL a1, a9, t1 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ MUL a2, a10, t2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ MUL a3, a11, t3 ++ ++ ADD s0, t0, f20 ++ fmov f20,s0 
++ MUL a4, a12, t0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ MUL a5, a13, t1 ++ ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ MUL a6, a14, t2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ MUL a7, a15, t3 ++ .align 4 ++ ++$L35: ++ and M, 7, I ++ ble I, $L38 ++ ++ LD a0, 0 * SIZE(A1) ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L37 ++ .align 4 ++ ++$L36: ++ ADD s0, t0,f20 ++ fmov f20,s0 ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ LD x0, 1 * SIZE(X1) ++ ++ ldi A1, 1 * SIZE(A1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L36 ++ .align 4 ++ ++$L37: ++ ADD s0, t0,f20 ++ fmov f20,s0 ++ MUL x0, a0, t0 ++ .align 4 ++ ++$L38: ++ LD a0, 0 * SIZE(Y) ++ ++ ADD s0, t0,f20 ++ fmov f20,s0 ++ ADD s1, t1, f20 ++ fmov f20,s1 ++ ADD s2, t2, f20 ++ fmov f20,s2 ++ ADD s3, t3, f20 ++ fmov f20,s3 ++ ++ ADD s0, s2, f20 ++ fmov f20,s0 ++ ADD s1, s3, f20 ++ fmov f20,s1 ++ ADD s0, s1, f20 ++ fmov f20,s0 ++ ++ MUL alpha, s0, f20 ++ fmov f20,s0 ++ ADD a0, s0, f20 ++ fmov f20,a0 ++ ++ ST a0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ fldd f20, 64($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/gemv_t.S.bak b/kernel/sw_64/gemv_t.S.bak +new file mode 100644 +index 0000000..068e463 +--- /dev/null ++++ b/kernel/sw_64/gemv_t.S.bak +@@ -0,0 +1,1061 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $20 ++#define LDA $21 ++ ++#define X $18 ++#define INCX $19 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define X1 $3 ++#define Y1 $4 ++ ++#define A1 $5 ++#define A2 $6 ++#define A3 $7 ++#define A4 $8 ++ ++#define alpha $f19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl X, 0 + STACKSIZE($sp) ++ ldl INCX, 8 + STACKSIZE($sp) ++ ldl Y, 16 + STACKSIZE($sp) ++ ldl INCY, 24 + STACKSIZE($sp) ++ ldl BUFFER, 32 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ SXADDQ INCX, 0, INCX ++ cmple N, 0, $1 ++ SXADDQ INCY, 0, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ cmpeq INCX, SIZE, $0 ++ mov X, X1 ++ SXADDQ LDA, 0, LDA ++ bne $0, $L10 ++ ++ sra M, 3, I ++ mov BUFFER, Y1 ++ mov BUFFER, X ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ ldi I, -1(I) ++ ++ LD a0, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a1, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a2, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a3, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ST a2, 2 * SIZE(Y1) ++ ST a3, 3 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a5, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a6, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a7, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a4, 4 * SIZE(Y1) ++ ST a5, 5 * SIZE(Y1) ++ ST a6, 6 * SIZE(Y1) ++ ST a7, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 7, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ LD a0, 0 * SIZE(X1) ++ addl X1, INCX, X1 ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ mov Y, Y1 ++ fclr t0 ++ unop ++ fclr t1 ++ ++ sra N, 2, J ++ fclr t2 ++ fclr t3 ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ mov A, A1 ++ fclr s0 ++ addl A, LDA, A2 ++ fclr s1 ++ ++ addl A2, LDA, A3 ++ fclr s2 ++ addl A3, LDA, A4 ++ fclr s3 ++ ++ s4addl LDA, A, A ++ unop ++ mov X, X1 ++ fillcs 3 * SIZE(Y) ++ ++ sra M, 3, I ++ ble I, $L15 ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ LD a4, 1 * SIZE(A1) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 1 * SIZE(A3) ++ LD a7, 1 * SIZE(A4) ++ LD a8, 2 * SIZE(A1) ++ LD a9, 2 * SIZE(A2) ++ LD a10, 2 * SIZE(A3) ++ LD a11, 2 * SIZE(A4) ++ LD a12, 3 * SIZE(A1) ++ LD a13, 3 * SIZE(A2) ++ LD a14, 3 * SIZE(A3) ++ LD a15, 3 * SIZE(A4) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 4 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ fillcs 
(PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 4 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x0, a2, t2 ++ LD a2, 4 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD a3, 4 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 5 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a5, t1 ++ LD a5, 5 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 5 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x1, a7, t3 ++ LD a7, 5 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, -2 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x2, a9, t1 ++ LD a9, 6 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi A2, 8 * SIZE(A2) ++ MUL x2, a10, t2 ++ LD a10, 6 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ ldi A3, 8 * SIZE(A3) ++ MUL x2, a11, t3 ++ LD a11, 6 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, -1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A4, 8 * SIZE(A4) ++ MUL x3, a13, t1 ++ LD a13, -1 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x3, a14, t2 ++ LD a14, -1 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x3, a15, t3 ++ LD a15, -1 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 0 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ fillcs (PREFETCHSIZE - 8) * SIZE(A3) ++ MUL x0, a1, t1 ++ LD a1, 0 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x0, a2, t2 ++ LD a2, 0 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD a3, 0 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x0, 8 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x1, a5, t1 ++ LD a5, 1 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 1 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x1, a7, t3 ++ LD a7, 1 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x1, 9 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, 2 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ fillcs (PREFETCHSIZE - 8) * SIZE(A4) ++ MUL x2, a9, t1 ++ LD a9, 2 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi X1, 8 * SIZE(X1) ++ MUL x2, a10, t2 ++ LD a10, 2 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ ldi I, -1(I) ++ MUL x2, a11, t3 ++ LD a11, 2 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x2, 2 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, 3 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ fillcs (PREFETCHSIZE - 8) * SIZE(X1) ++ MUL x3, a13, t1 ++ LD a13, 3 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x3, a14, t2 ++ LD a14, 3 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ MUL x3, a15, t3 ++ LD a15, 3 * SIZE(A4) ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 4 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ LD a1, 4 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x0, a2, t2 ++ LD a2, 4 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD a3, 4 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a4, t0 ++ LD a4, 5 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x1, a5, t1 ++ LD a5, 5 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x1, a6, t2 ++ LD a6, 5 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x1, a7, t3 ++ LD a7, 5 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a8, t0 ++ LD a8, 6 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x2, a9, t1 ++ LD a9, 6 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ unop ++ MUL x2, a10, t2 ++ LD a10, 6 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ unop ++ MUL x2, a11, t3 ++ LD a11, 6 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a12, t0 ++ LD a12, 7 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi 
A1, 8 * SIZE(A1) ++ MUL x3, a13, t1 ++ LD a13, 7 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi A2, 8 * SIZE(A2) ++ MUL x3, a14, t2 ++ LD a14, 7 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ ldi A3, 8 * SIZE(A3) ++ MUL x3, a15, t3 ++ LD a15, 7 * SIZE(A4) ++ ++ ADD s0, t0, s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a0, t0 ++ unop ++ ++ ADD s1, t1, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x0, a1, t1 ++ ldi A4, 8 * SIZE(A4) ++ ++ ADD s2, t2, s2 ++ MUL x0, a2, t2 ++ ADD s3, t3, s3 ++ MUL x0, a3, t3 ++ ++ ADD s0, t0, s0 ++ MUL x1, a4, t0 ++ ADD s1, t1, s1 ++ MUL x1, a5, t1 ++ ++ ADD s2, t2, s2 ++ MUL x1, a6, t2 ++ ADD s3, t3, s3 ++ MUL x1, a7, t3 ++ ++ ADD s0, t0, s0 ++ MUL x2, a8, t0 ++ ADD s1, t1, s1 ++ MUL x2, a9, t1 ++ ++ ADD s2, t2, s2 ++ MUL x2, a10, t2 ++ ADD s3, t3, s3 ++ MUL x2, a11, t3 ++ ++ ADD s0, t0, s0 ++ MUL x3, a12, t0 ++ ADD s1, t1, s1 ++ MUL x3, a13, t1 ++ ++ ADD s2, t2, s2 ++ MUL x3, a14, t2 ++ ADD s3, t3, s3 ++ MUL x3, a15, t3 ++ .align 4 ++ ++$L15: ++ and M, 7, I ++ ble I, $L18 ++ ++ LD x0, 0 * SIZE(X1) ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 0 * SIZE(A3) ++ LD a3, 0 * SIZE(A4) ++ ++ ldi I, -1(I) ++ ble I, $L17 ++ .align 4 ++ ++$L16: ++ ADD s0, t0, s0 ++ ldi A4, 1 * SIZE(A4) ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A1, 1 * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 1 * SIZE(A2) ++ ++ ADD s2, t2, s2 ++ ldi A2, 1 * SIZE(A2) ++ MUL x0, a2, t2 ++ LD a2, 1 * SIZE(A3) ++ ++ ADD s3, t3, s3 ++ ldi A3, 1 * SIZE(A3) ++ MUL x0, a3, t3 ++ LD a3, 0 * SIZE(A4) ++ ++ LD x0, 1 * SIZE(X1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L16 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ ADD s1, t1, s1 ++ MUL x0, a1, t1 ++ ++ ADD s2, t2, s2 ++ MUL x0, a2, t2 ++ ADD s3, t3, s3 ++ MUL x0, a3, t3 ++ .align 4 ++ ++$L18: ++ LD a0, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a1, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a2, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a3, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ MUL alpha, s0, s0 ++ MUL alpha, s1, s1 ++ MUL alpha, s2, s2 ++ MUL alpha, s3, s3 ++ ++ ADD a0, s0, a0 ++ fclr t0 ++ ADD a1, s1, a1 ++ fclr t1 ++ ADD a2, s2, a2 ++ fclr t2 ++ ADD a3, s3, a3 ++ fclr t3 ++ ++ ST a0, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a1, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a2, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a3, 0 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ and N, 2, J ++ ble J, $L30 ++ mov A, A1 ++ addl A, LDA, A2 ++ ++ addl A2, LDA, A ++ fclr s0 ++ mov X, X1 ++ fclr s1 ++ ++ sra M, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD a2, 1 * SIZE(A1) ++ LD a3, 1 * SIZE(A2) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 2 * SIZE(A2) ++ LD a6, 3 * SIZE(A1) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 4 * SIZE(A1) ++ LD a9, 4 * SIZE(A2) ++ LD a10, 5 * SIZE(A1) ++ LD a11, 5 * SIZE(A2) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 6 * SIZE(A2) ++ LD a14, 7 * SIZE(A1) ++ LD a15, 7 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 8 * SIZE(A2) ++ ++ ADD s0, t2, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a2, t2 ++ LD a2, 9 * SIZE(A1) ++ ++ ADD s1, t3, s1 ++ unop ++ MUL x1, a3, t3 ++ LD a3, 9 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a4, t0 ++ LD a4, 10 * 
SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi I, -1(I) ++ MUL x2, a5, t1 ++ LD a5, 10 * SIZE(A2) ++ ++ ADD s0, t2, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a6, t2 ++ LD a6, 11 * SIZE(A1) ++ ++ ADD s1, t3, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a7, t3 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x3, -1 * SIZE(X1) ++ MUL x0, a8, t0 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x0, a9, t1 ++ LD a9, 12 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x0, 0 * SIZE(X1) ++ MUL x1, a10, t0 ++ LD a10, 13 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a11, t1 ++ LD a11, 13 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x1, 1 * SIZE(X1) ++ MUL x2, a12, t0 ++ LD a12, 6 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ MUL x2, a13, t1 ++ LD a13, 14 * SIZE(A2) ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD s0, t0, s0 ++ LD x2, 2 * SIZE(X1) ++ MUL x3, a14, t0 ++ LD a14, 7 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ MUL x3, a15, t1 ++ LD a15, 7 * SIZE(A2) ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD s0, t0, s0 ++ LD x3, 3 * SIZE(X1) ++ MUL x0, a0, t0 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ unop ++ ++ ADD s0, t2, s0 ++ LD x0, 4 * SIZE(X1) ++ MUL x1, a2, t2 ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD s1, t3, s1 ++ unop ++ MUL x1, a3, t3 ++ unop ++ ++ ADD s0, t0, s0 ++ LD x1, 5 * SIZE(X1) ++ MUL x2, a4, t0 ++ unop ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x2, a5, t1 ++ unop ++ ++ ADD s0, t2, s0 ++ LD x2, 6 * SIZE(X1) ++ MUL x3, a6, t2 ++ unop ++ ++ ADD s1, t3, s1 ++ unop ++ MUL x3, a7, t3 ++ unop ++ ++ ADD s0, t0, s0 ++ LD x3, 7 * SIZE(X1) ++ MUL x0, a8, t0 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL x0, a9, t1 ++ unop ++ ++ ADD s0, t0, s0 ++ MUL x1, a10, t0 ++ ADD s1, t1, s1 ++ MUL x1, a11, t1 ++ ++ ADD s0, t0, s0 ++ MUL x2, a12, t0 ++ ADD s1, t1, s1 ++ MUL x2, a13, t1 ++ ++ ADD s0, t0, s0 ++ MUL x3, a14, t0 ++ ADD s1, t1, s1 ++ MUL x3, a15, t1 ++ .align 4 ++ ++$L25: ++ and M, 7, I ++ ble I, $L28 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 0 * SIZE(A2) ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L27 ++ .align 4 ++ ++$L26: ++ ADD s0, t0, s0 ++ ldi A2, 1 * SIZE(A2) ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ ldi A1, 1 * SIZE(A1) ++ MUL x0, a1, t1 ++ LD a1, 0 * SIZE(A2) ++ ++ LD x0, 1 * SIZE(X1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++$L27: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ ADD s1, t1, s1 ++ MUL x0, a1, t1 ++ .align 4 ++ ++$L28: ++ LD a0, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ LD a1, 0 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 ++ ++ MUL alpha, s0, s0 ++ MUL alpha, s1, s1 ++ ++ ADD a0, s0, a0 ++ ADD a1, s1, a1 ++ ++ ST a0, 0 * SIZE(Y1) ++ fclr t0 ++ addl Y1, INCY, Y1 ++ fclr t1 ++ ++ ST a1, 0 * SIZE(Y1) ++ fclr t2 ++ addl Y1, INCY, Y1 ++ fclr t3 ++ .align 4 ++ ++$L30: ++ blbc N, $L999 ++ ++ mov A, A1 ++ fclr s0 ++ mov X, X1 ++ fclr s1 ++ ++ sra M, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $L35 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a8, 0 * SIZE(X1) ++ LD a9, 1 * SIZE(X1) ++ ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ LD a10, 2 * SIZE(X1) ++ LD a11, 3 * SIZE(X1) ++ ++ LD a4, 4 * SIZE(A1) ++ LD a5, 5 * SIZE(A1) ++ LD a12, 4 * SIZE(X1) ++ LD a13, 5 * SIZE(X1) ++ ++ LD a6, 6 * SIZE(A1) ++ LD a7, 7 * SIZE(A1) ++ LD a14, 6 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L33 ++ .align 4 ++ ++$L32: ++ ADD s0, t0, s0 ++ LD a15, 7 * SIZE(X1) ++ MUL a0, a8, t0 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ LD a8, 8 * SIZE(X1) 
++ MUL a1, a9, t1 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD s2, t2, s2 ++ LD a9, 9 * SIZE(X1) ++ MUL a2, a10, t2 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD s3, t3, s3 ++ LD a10, 10 * SIZE(X1) ++ MUL a3, a11, t3 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD s0, t0, s0 ++ LD a11, 11 * SIZE(X1) ++ MUL a4, a12, t0 ++ LD a4, 12 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ LD a12, 12 * SIZE(X1) ++ MUL a5, a13, t1 ++ LD a5, 13 * SIZE(A1) ++ ++ ADD s2, t2, s2 ++ LD a13, 13 * SIZE(X1) ++ MUL a6, a14, t2 ++ LD a6, 14 * SIZE(A1) ++ ++ ADD s3, t3, s3 ++ LD a14, 14 * SIZE(X1) ++ MUL a7, a15, t3 ++ LD a7, 15 * SIZE(A1) ++ ++ ldi A1, 8 * SIZE(A1) ++ ldi I, -1(I) ++ ldi X1, 8 * SIZE(X1) ++ bgt I, $L32 ++ .align 4 ++ ++$L33: ++ ADD s0, t0, s0 ++ LD a15, 7 * SIZE(X1) ++ MUL a0, a8, t0 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD s1, t1, s1 ++ unop ++ MUL a1, a9, t1 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD s2, t2, s2 ++ MUL a2, a10, t2 ++ ADD s3, t3, s3 ++ MUL a3, a11, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, a12, t0 ++ ADD s1, t1, s1 ++ MUL a5, a13, t1 ++ ++ ADD s2, t2, s2 ++ MUL a6, a14, t2 ++ ADD s3, t3, s3 ++ MUL a7, a15, t3 ++ .align 4 ++ ++$L35: ++ and M, 7, I ++ ble I, $L38 ++ ++ LD a0, 0 * SIZE(A1) ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L37 ++ .align 4 ++ ++$L36: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ LD a0, 1 * SIZE(A1) ++ LD x0, 1 * SIZE(X1) ++ ++ ldi A1, 1 * SIZE(A1) ++ ldi X1, 1 * SIZE(X1) ++ ldi I, -1(I) ++ bgt I, $L36 ++ .align 4 ++ ++$L37: ++ ADD s0, t0, s0 ++ MUL x0, a0, t0 ++ .align 4 ++ ++$L38: ++ LD a0, 0 * SIZE(Y) ++ ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 ++ ADD s0, s1, s0 ++ ++ MUL alpha, s0, s0 ++ ADD a0, s0, a0 ++ ++ ST a0, 0 * SIZE(Y1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/iamax.S b/kernel/sw_64/iamax.S +new file mode 100644 +index 0000000..f3b2909 +--- /dev/null ++++ b/kernel/sw_64/iamax.S +@@ -0,0 +1,440 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 6 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef F_INTERFACE ++ ldl N, 0(N) # n ++ ldl INCX, 0(INCX) # incx ++#endif ++ ldi $sp, -STACKSIZE($sp) ++ mov X, XX ++ .align 4 ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $2 ++ clr $0 ++ ++ fstd $f6, 32($sp) ++ fclr $f0 ++ sra N, 3, $1 ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ unop ++ fabs $f20, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ fabs $f20, $f1 ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f21, 0 * SIZE(X) ++ fabs $f20, $f2 ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fabs $f20, $f3 ++ addl X, INCX, X ++ unop ++ ++ LD $f23, 0 * SIZE(X) ++ fabs $f20, $f4 ++ addl X, INCX, X ++ unop ++ ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ fabs $f20, $f5 ++ unop ++ ++ LD $f25, 0 * SIZE(X) ++ fabs $f20, $f6 ++ addl X, INCX, X ++ unop ++ ++ LD $f26, 0 * SIZE(X) ++ fabs $f20, $f28 ++ addl X, INCX, X ++ ldi $1, -1($1) ++ ++ LD $f27, 0 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ fselne $f16, $f12, $f4, $f4 ++ unop ++ fabs $f20, $f29 ++ fillcs 56 * SIZE(X) ++ ++ fselne $f17, $f13, $f5, $f5 ++ LD $f20, 0 * SIZE(X) ++ fabs $f21, $f30 ++ addl X, INCX, X ++ ++ fselne $f18, $f14, $f6, $f6 ++ LD $f21, 0 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ fselne $f19, $f15, $f28, $f28 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ addl X, INCX, X ++ ++ fabs $f24, $f12 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f0, $f29), $f16 ++ addl X, INCX, X ++ ++ fabs $f25, $f13 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f1, $f30), $f17 ++ addl X, INCX, X ++ ++ fabs $f26, $f14 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f2, $f10), $f18 ++ addl X, INCX, X ++ ++ fabs $f27, $f15 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f3, $f11), $f19 ++ addl X, INCX, X ++ ++ fselne $f16, $f29, $f0, $f0 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f4, $f12), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f30, $f1, $f1 ++ unop ++ CMPLT($f5, $f13), $f17 ++ ldi $1, -1($1) # i -- ++ ++ fselne $f18, $f10, $f2, $f2 ++ unop ++ CMPLT($f6, $f14), $f18 ++ unop ++ ++ fselne $f19, $f11, $f3, $f3 ++ unop ++ CMPLT($f28, $f15), $f19 ++ bgt 
$1,$L12 ++ .align 4 ++ ++$L13: ++ fselne $f16, $f12, $f4, $f4 ++ fabs $f20, $f29 ++ fselne $f17, $f13, $f5, $f5 ++ fabs $f21, $f30 ++ ++ fselne $f18, $f14, $f6, $f6 ++ fabs $f22, $f10 ++ fselne $f19, $f15, $f28, $f28 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ CMPLT($f0, $f29), $f16 ++ fabs $f25, $f13 ++ CMPLT($f1, $f30), $f17 ++ ++ fabs $f26, $f14 ++ CMPLT($f2, $f10), $f18 ++ fabs $f27, $f15 ++ CMPLT($f3, $f11), $f19 ++ ++ fselne $f16, $f29, $f0, $f0 ++ CMPLT($f4, $f12), $f16 ++ fselne $f17, $f30, $f1, $f1 ++ CMPLT($f5, $f13), $f17 ++ ++ fselne $f18, $f10, $f2, $f2 ++ CMPLT($f6, $f14), $f18 ++ fselne $f19, $f11, $f3, $f3 ++ CMPLT($f28, $f15), $f19 ++ ++ fselne $f16, $f12, $f4, $f4 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f13, $f5, $f5 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f18, $f14, $f6, $f6 ++ CMPLT($f4, $f5), $f18 ++ fselne $f19, $f15, $f28, $f28 ++ CMPLT($f6, $f28), $f19 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ fselne $f18, $f5, $f4, $f4 ++ fselne $f19, $f28, $f6, $f6 ++ ++ CMPLT($f0, $f2), $f16 ++ CMPLT($f4, $f6), $f17 ++ ++ fselne $f16, $f2, $f0, $f0 ++ fselne $f17, $f6, $f4, $f4 ++ ++ CMPLT($f0, $f4), $f16 ++ fselne $f16, $f4, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $L20 ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$L20: ++ sra N, 3, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f11, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f13, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f15, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f17, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 ++ ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f18, $f2 ++ ++ LD $f11, 0 * SIZE(XX) ++ fabs $f15, $f23 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f19, $f3 ++ ++ LD $f12, 0 * SIZE(XX) ++ fabs $f16, $f24 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f20, $f4 ++ ++ LD $f13, 0 * SIZE(XX) ++ fabs $f17, $f25 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f21, $f5 ++ ++ LD $f14, 0 * SIZE(XX) ++ ldi $1, -1($1) # i -- ++ fcmpeq $f0, $f22, $f26 ++ addl XX, INCX, XX ++ ++ ldi $0, 1($0) ++ fbne $f2, $End ++ ++ LD $f15, 0 * SIZE(XX) ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ fbne $f3, $End ++ ++ addl XX, INCX, XX ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ fbne $f4, $End ++ ++ LD $f16, 0 * SIZE(XX) ++ fcmpeq $f0, $f25, $f29 ++ ldi $0, 1($0) ++ fbne $f5, $End ++ ++ addl XX, INCX, XX ++ ldi $0, 1($0) ++ fabs $f10, $f18 ++ fbne $f26, $End ++ ++ LD $f17, 0 * SIZE(XX) ++ ldi $0, 1($0) ++ fabs $f11, $f19 ++ fbne $f27, $End ++ ++ addl XX, INCX, XX ++ ldi $0, 1($0) ++ fabs $f12, $f20 ++ fbne $f28, $End ++ ++ ldi $0, 1($0) ++ fabs $f13, $f21 ++ fbne $f29, $End ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ fabs $f14, $f22 ++ fcmpeq $f0, $f18, $f2 ++ fabs $f15, $f23 ++ fcmpeq $f0, $f19, $f3 ++ ++ fabs $f16, $f24 ++ fcmpeq $f0, $f20, $f4 ++ fabs $f17, $f25 ++ fcmpeq $f0, $f21, $f5 ++ ++ fcmpeq $f0, $f22, $f26 ++ ldi $0, 1($0) ++ unop ++ fbne $f2, $End ++ ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ unop ++ fbne $f3, $End ++ ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ unop ++ fbne $f4, $End ++ ++ fcmpeq $f0, 
$f25, $f29 ++ ldi $0, 1($0) ++ unop ++ fbne $f5, $End ++ ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End ++ .align 4 ++ ++$L40: ++ LD $f20, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f20, $f25 ++ fcmpeq $f0, $f25, $f29 ++ ++ ldi $0, 1($0) ++ fbne $f29, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/iamax_simd.S b/kernel/sw_64/iamax_simd.S +new file mode 100644 +index 0000000..c7c6c27 +--- /dev/null ++++ b/kernel/sw_64/iamax_simd.S +@@ -0,0 +1,732 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 96 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++#define I $1 ++#define NN $22 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#ifndef USE_MIN ++#define VCMPLT(a, b) vfcmplt a, b ++#else ++#define VCMPLT(a, b) vfcmplt b, a ++#endif ++ ++#define STACKSIZE 6 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef F_INTERFACE ++ ldl N, 0(N) # n ++ ldl INCX, 0(INCX) # incx ++#endif ++ ldi $sp, -STACKSIZE($sp) ++ mov X, XX ++ mov N, NN ++ .align 4 ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $2 ++ clr $0 ++ ++ fstd $f6, 32($sp) ++ fclr $f0 ++ unop ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ cmpeq INCX, SIZE, $3 ++ beq $3, $Sub ++ .align 4 ++ ++ ++/** ++ test the address of Y ++**/ ++ ++ and X, (VEC_LEN*SIZE-1), $3 ++ LD $f10, 0*SIZE(X) ++ fabs $f10, $f0 # init temp max/min result value ++ beq $3, $Align_Access ++ .align 4 ++/** ++ process the unalign address of X ++**/ ++ ++/*if N is too small(less then unroll size), don't need process unalign X. Just jump to remain section.*/ ++ sra NN, 4, I ++ and NN, 15, $3 ++ ble I, $Remain ++ nop ++ ++ sra $3, BASE_SHIFT, $3 ++ ldi $2, VEC_LEN ++ subl $2, $3, $3 ++ nop ++$UnAlign_Y_Loop: ++ LD $f10, 0*SIZE(X) ++ addl X, SIZE, X ++ fabs $f10, $f29 ++ CMPLT($f0, $f29), $f16 ++ ++ fseleq $f16, $f0, $f29, $f0 ++ subl $3, 1, $3 ++ subl NN, 1, NN ++ bgt $3, $UnAlign_Y_Loop ++ .align 4 ++ ++ ++$Align_Access: ++/*search max or min. 
Unloop 16 */ ++ sra NN, 4, I ++ and NN, 15, $3 ++ ble I, $Remain ++ nop ++ ++ VLD $f10, 0*VEC_LEN*SIZE(X) ++ VLD $f11, 1*VEC_LEN*SIZE(X) ++ VLD $f12, 2*VEC_LEN*SIZE(X) ++ VLD $f13, 3*VEC_LEN*SIZE(X) ++ ++ /*vfabs*/ ++ vcpys $f31, $f10, $f22 ++ vcpys $f31, $f11, $f23 ++ vcpys $f31, $f12, $f24 ++ vcpys $f31, $f13, $f25 ++ ++ vcpyf $f0, $f0 ++ vcpys $f22, $f22, $f1 # copy $f22 -> $f1 ++ vcpys $f22, $f22, $f2 ++ vcpys $f22, $f22, $f3 ++ ++ subl I, 1, I ++ addl X, 16*SIZE, X ++ nop ++ ble I, $MainLoopEnd ++ .align 4 ++$MainLoop: ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ VCMPLT($f0, $f22), $f26 ++ subl I, 1, I ++ VCMPLT($f1, $f23), $f27 ++ ++ VLD $f10, 0*VEC_LEN*SIZE(X) ++ VLD $f11, 1*VEC_LEN*SIZE(X) ++ VLD $f12, 2*VEC_LEN*SIZE(X) ++ VLD $f13, 3*VEC_LEN*SIZE(X) ++ ++ VCMPLT($f2, $f24), $f28 ++ addl X, 16 * SIZE, X ++ nop ++ VCMPLT($f3, $f25), $f29 ++ ++ vfseleq $f26, $f0, $f22, $f0 ++ vfseleq $f27, $f1, $f23, $f1 ++ vfseleq $f28, $f2, $f24, $f2 ++ vfseleq $f29, $f3, $f25, $f3 ++ ++ vcpys $f31, $f10, $f22 ++ vcpys $f31, $f11, $f23 ++ vcpys $f31, $f12, $f24 ++ vcpys $f31, $f13, $f25 ++ ++ bne I, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ VCMPLT($f0, $f22), $f26 ++ VCMPLT($f1, $f23), $f27 ++ VCMPLT($f2, $f24), $f28 ++ VCMPLT($f3, $f25), $f29 ++ ++ vfseleq $f26, $f0, $f22, $f0 ++ vfseleq $f27, $f1, $f23, $f1 ++ vfseleq $f28, $f2, $f24, $f2 ++ vfseleq $f29, $f3, $f25, $f3 ++ ++ /*find the max or min among f0, f1 ,f2 and f3*/ ++ VCMPLT($f0, $f1), $f26 ++ VCMPLT($f2, $f3), $f27 ++ vfseleq $f26, $f0, $f1, $f0 ++ vfseleq $f27, $f2, $f3, $f2 ++ ++ VCMPLT($f0, $f2), $f26 ++ vfseleq $f26, $f0, $f2, $f0 ++ vextf $f0, 1, $f22 ++ vextf $f0, 2, $f23 ++ ++ vextf $f0, 3, $f24 ++ CMPLT($f0, $f22), $f16 ++ CMPLT($f23, $f24), $f17 ++ fseleq $f16, $f0, $f22, $f0 ++ ++ fseleq $f17, $f23, $f24, $f23 ++ CMPLT($f0, $f23), $f18 ++ fseleq $f18, $f0, $f23, $f0 ++ nop ++$Remain: ++ ble $3, $Continuous_FindIndex ++ .align 4 ++$RemainLoop: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ CMPLT($f0, $f29), $f16 ++ fseleq $f16, $f0, $f29, $f0 ++ ++ subl $3, 1, $3 ++ bgt $3, $RemainLoop ++ .align 4 ++ /*find index*/ ++$Continuous_FindIndex: ++ sra N, 3, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ LD $f12, 2 * SIZE(XX) ++ LD $f13, 3 * SIZE(XX) ++ ++ ++ LD $f14, 4 * SIZE(XX) ++ LD $f15, 5 * SIZE(XX) ++ LD $f16, 6 * SIZE(XX) ++ LD $f17, 7 * SIZE(XX) ++ ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 ++ ++ addl XX, 8*SIZE, XX ++ ldi $1, -1($1) ++ ble $1, $Continuous_FindIndex_Loop ++ .align 4 ++ ++$Continuous_FindIndex_Loop: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ LD $f11, 1 * SIZE(XX) ++ fcmpeq $f0, $f18, $f2 ++ ++ LD $f12, 2 * SIZE(XX) ++ fabs $f15, $f23 ++ LD $f13, 3 * SIZE(XX) ++ fcmpeq $f0, $f19, $f3 ++ ++ LD $f14, 4 * SIZE(XX) ++ fabs $f16, $f24 ++ ldi $1, -1($1) # i -- ++ fcmpeq $f0, $f20, $f4 ++ ++ LD $f15, 5 * SIZE(XX) ++ fabs $f17, $f25 ++ fcmpeq $f0, $f21, $f5 ++ fillcs PREFETCHSIZE * SIZE(X) ++ ++ LD $f16, 6 * SIZE(XX) ++ fcmpeq $f0, $f22, $f26 ++ ldi $0, 1($0) ++ fbne $f2, $End ++ ++ LD $f17, 7 * SIZE(XX) ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ fbne $f3, $End ++ ++ addl XX, 8*SIZE, XX ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ fbne $f4, $End ++ ++ fcmpeq $f0, $f25, $f29 ++ ldi $0, 1($0) ++ nop ++ fbne $f5, $End ++ ++ ldi $0, 1($0) ++ fabs $f10, $f18 ++ nop ++ fbne $f26, $End ++ ++ ldi $0, 1($0) ++ fabs $f11, $f19 ++ nop ++ fbne $f27, $End ++ ++ ldi $0, 1($0) ++ fabs $f12, $f20 ++ nop ++ fbne $f28, $End ++ ++ ldi $0, 
1($0) ++ fabs $f13, $f21 ++ fbne $f29, $End ++ bgt $1, $Continuous_FindIndex_Loop ++ .align 4 ++ ++$Continuous_FindIndex_LoopEnd: ++ fabs $f14, $f22 ++ fcmpeq $f0, $f18, $f2 ++ fabs $f15, $f23 ++ fcmpeq $f0, $f19, $f3 ++ ++ fabs $f16, $f24 ++ fcmpeq $f0, $f20, $f4 ++ fabs $f17, $f25 ++ fcmpeq $f0, $f21, $f5 ++ ++ fcmpeq $f0, $f22, $f26 ++ ldi $0, 1($0) ++ unop ++ fbne $f2, $End ++ ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ unop ++ fbne $f3, $End ++ ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ unop ++ fbne $f4, $End ++ ++ fcmpeq $f0, $f25, $f29 ++ ldi $0, 1($0) ++ unop ++ fbne $f5, $End ++ ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End ++ .align 4 ++ ++ jmp $L40 ++ .align 4 ++$Sub: ++ sra N, 3, $1 ++ LD $f20, 0 * SIZE(X) ++ fabs $f20, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ fabs $f20, $f1 ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f21, 0 * SIZE(X) ++ fabs $f20, $f2 ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fabs $f20, $f3 ++ addl X, INCX, X ++ unop ++ ++ LD $f23, 0 * SIZE(X) ++ fabs $f20, $f4 ++ addl X, INCX, X ++ unop ++ ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ fabs $f20, $f5 ++ unop ++ ++ LD $f25, 0 * SIZE(X) ++ fabs $f20, $f6 ++ addl X, INCX, X ++ unop ++ ++ LD $f26, 0 * SIZE(X) ++ fabs $f20, $f28 ++ addl X, INCX, X ++ ldi $1, -1($1) ++ ++ LD $f27, 0 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ fselne $f16, $f12, $f4, $f4 ++ unop ++ fabs $f20, $f29 ++ fillcs 56 * SIZE(X) ++ ++ fselne $f17, $f13, $f5, $f5 ++ LD $f20, 0 * SIZE(X) ++ fabs $f21, $f30 ++ addl X, INCX, X ++ ++ fselne $f18, $f14, $f6, $f6 ++ LD $f21, 0 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ fselne $f19, $f15, $f28, $f28 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ addl X, INCX, X ++ ++ fabs $f24, $f12 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f0, $f29), $f16 ++ addl X, INCX, X ++ ++ fabs $f25, $f13 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f1, $f30), $f17 ++ addl X, INCX, X ++ ++ fabs $f26, $f14 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f2, $f10), $f18 ++ addl X, INCX, X ++ ++ fabs $f27, $f15 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f3, $f11), $f19 ++ addl X, INCX, X ++ ++ fselne $f16, $f29, $f0, $f0 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f4, $f12), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f30, $f1, $f1 ++ unop ++ CMPLT($f5, $f13), $f17 ++ ldi $1, -1($1) # i -- ++ ++ fselne $f18, $f10, $f2, $f2 ++ unop ++ CMPLT($f6, $f14), $f18 ++ unop ++ ++ fselne $f19, $f11, $f3, $f3 ++ unop ++ CMPLT($f28, $f15), $f19 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ fselne $f16, $f12, $f4, $f4 ++ fabs $f20, $f29 ++ fselne $f17, $f13, $f5, $f5 ++ fabs $f21, $f30 ++ ++ fselne $f18, $f14, $f6, $f6 ++ fabs $f22, $f10 ++ fselne $f19, $f15, $f28, $f28 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ CMPLT($f0, $f29), $f16 ++ fabs $f25, $f13 ++ CMPLT($f1, $f30), $f17 ++ ++ fabs $f26, $f14 ++ CMPLT($f2, $f10), $f18 ++ fabs $f27, $f15 ++ CMPLT($f3, $f11), $f19 ++ ++ fselne $f16, $f29, $f0, $f0 ++ CMPLT($f4, $f12), $f16 ++ fselne $f17, $f30, $f1, $f1 ++ CMPLT($f5, $f13), $f17 ++ ++ fselne $f18, $f10, $f2, $f2 ++ CMPLT($f6, $f14), $f18 ++ fselne $f19, $f11, $f3, $f3 ++ CMPLT($f28, $f15), $f19 ++ ++ fselne $f16, $f12, $f4, $f4 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f13, $f5, $f5 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f18, $f14, $f6, $f6 ++ CMPLT($f4, $f5), $f18 ++ fselne $f19, $f15, $f28, $f28 ++ CMPLT($f6, $f28), $f19 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ fselne $f18, $f5, $f4, $f4 ++ fselne $f19, 
$f28, $f6, $f6 ++ ++ CMPLT($f0, $f2), $f16 ++ CMPLT($f4, $f6), $f17 ++ ++ fselne $f16, $f2, $f0, $f0 ++ fselne $f17, $f6, $f4, $f4 ++ ++ CMPLT($f0, $f4), $f16 ++ fselne $f16, $f4, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $L20 ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++/* ++ find the index ++*/ ++$L20: ++ sra N, 3, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f11, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f13, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f15, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ LD $f17, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 ++ ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f18, $f2 ++ ++ LD $f11, 0 * SIZE(XX) ++ fabs $f15, $f23 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f19, $f3 ++ ++ LD $f12, 0 * SIZE(XX) ++ fabs $f16, $f24 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f20, $f4 ++ ++ LD $f13, 0 * SIZE(XX) ++ fabs $f17, $f25 ++ addl XX, INCX, XX ++ fcmpeq $f0, $f21, $f5 ++ ++ LD $f14, 0 * SIZE(XX) ++ ldi $1, -1($1) # i -- ++ fcmpeq $f0, $f22, $f26 ++ addl XX, INCX, XX ++ ++ ldi $0, 1($0) ++ fbne $f2, $End ++ ++ LD $f15, 0 * SIZE(XX) ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ fbne $f3, $End ++ ++ addl XX, INCX, XX ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ fbne $f4, $End ++ ++ LD $f16, 0 * SIZE(XX) ++ fcmpeq $f0, $f25, $f29 ++ ldi $0, 1($0) ++ fbne $f5, $End ++ ++ addl XX, INCX, XX ++ ldi $0, 1($0) ++ fabs $f10, $f18 ++ fbne $f26, $End ++ ++ LD $f17, 0 * SIZE(XX) ++ ldi $0, 1($0) ++ fabs $f11, $f19 ++ fbne $f27, $End ++ ++ addl XX, INCX, XX ++ ldi $0, 1($0) ++ fabs $f12, $f20 ++ fbne $f28, $End ++ ++ ldi $0, 1($0) ++ fabs $f13, $f21 ++ fbne $f29, $End ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ fabs $f14, $f22 ++ fcmpeq $f0, $f18, $f2 ++ fabs $f15, $f23 ++ fcmpeq $f0, $f19, $f3 ++ ++ fabs $f16, $f24 ++ fcmpeq $f0, $f20, $f4 ++ fabs $f17, $f25 ++ fcmpeq $f0, $f21, $f5 ++ ++ fcmpeq $f0, $f22, $f26 ++ ldi $0, 1($0) ++ unop ++ fbne $f2, $End ++ ++ fcmpeq $f0, $f23, $f27 ++ ldi $0, 1($0) ++ unop ++ fbne $f3, $End ++ ++ fcmpeq $f0, $f24, $f28 ++ ldi $0, 1($0) ++ unop ++ fbne $f4, $End ++ ++ fcmpeq $f0, $f25, $f29 ++ ldi $0, 1($0) ++ unop ++ fbne $f5, $End ++ ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End ++ .align 4 ++ ++$L40: ++ LD $f20, 0 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f20, $f25 ++ fcmpeq $f0, $f25, $f29 ++ ++ ldi $0, 1($0) ++ fbne $f29, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/imax.S b/kernel/sw_64/imax.S +new file mode 100644 +index 0000000..b0cf5c8 +--- /dev/null ++++ b/kernel/sw_64/imax.S +@@ -0,0 +1,351 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. 
*/ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) cmptlt a, b ++#else ++#define CMPLT(a, b) cmptlt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ ++ clr $0 ++ mov X, XX ++ .align 4 ++ ++ cmplt $31, N, $2 ++ cmplt $31, INCX, $3 ++ SXADDQ INCX, $31, INCX ++ and $2, $3, $2 ++ ++ sra N, 3, $1 ++ fclr $f0 ++ unop ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f0, 0 * SIZE(X) ++ unop ++ unop ++ ble $1, $L15 ++ .align 4 ++ ++ fmov $f0, $f1 ++ addq X, INCX, X ++ fmov $f0, $f10 ++ lda $1, -1($1) ++ ++ LD $f21, 0 * SIZE(X) ++ fmov $f0, $f11 ++ addq X, INCX, X ++ fmov $f0, $f12 ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f13 ++ addq X, INCX, X ++ fmov $f0, $f14 ++ ++ LD $f23, 0 * SIZE(X) ++ fmov $f0, $f15 ++ addq X, INCX, X ++ fmov $f0, $f20 ++ ++ LD $f24, 0 * SIZE(X) ++ addq X, INCX, X ++ LD $f25, 0 * SIZE(X) ++ addq X, INCX, X ++ LD $f26, 0 * SIZE(X) ++ addq X, INCX, X ++ LD $f27, 0 * SIZE(X) ++ addq X, INCX, X ++ ++ CMPLT($f0, $f20), $f16 ++ CMPLT($f1, $f21), $f17 ++ CMPLT($f10, $f22), $f18 ++ CMPLT($f11, $f23), $f19 ++ ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ fcmovne $f16, $f20, $f0 ++ LD $f20, 0 * SIZE(X) ++ CMPLT($f12, $f24), $f16 ++ addq X, INCX, X ++ ++ fcmovne $f17, $f21, $f1 ++ LD $f21, 0 * SIZE(X) ++ CMPLT($f13, $f25), $f17 ++ addq X, INCX, X ++ ++ fcmovne $f18, $f22, $f10 ++ LD $f22, 0 * SIZE(X) ++ CMPLT($f14, $f26), $f18 ++ addq X, INCX, X ++ ++ fcmovne $f19, $f23, $f11 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f15, $f27), $f19 ++ addq X, INCX, X ++ ++ fcmovne $f16, $f24, $f12 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f0, $f20), $f16 ++ addq X, 
INCX, X ++ ++ fcmovne $f17, $f25, $f13 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f1, $f21), $f17 ++ addq X, INCX, X ++ ++ fcmovne $f18, $f26, $f14 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f10, $f22), $f18 ++ addq X, INCX, X ++ ++ fcmovne $f19, $f27, $f15 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f11, $f23), $f19 ++ lda $1, -1($1) # i -- ++ ++ addq X, INCX, X ++ unop ++ unop ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ fcmovne $f16, $f20, $f0 ++ CMPLT($f12, $f24), $f16 ++ ++ fcmovne $f17, $f21, $f1 ++ CMPLT($f13, $f25), $f17 ++ ++ fcmovne $f18, $f22, $f10 ++ CMPLT($f14, $f26), $f18 ++ ++ fcmovne $f19, $f23, $f11 ++ CMPLT($f15, $f27), $f19 ++ ++ fcmovne $f16, $f24, $f12 ++ CMPLT($f0, $f1), $f16 ++ fcmovne $f17, $f25, $f13 ++ CMPLT($f10, $f11), $f17 ++ ++ fcmovne $f18, $f26, $f14 ++ CMPLT($f12, $f13), $f18 ++ fcmovne $f19, $f27, $f15 ++ CMPLT($f14, $f15), $f19 ++ ++ fcmovne $f16, $f1, $f0 ++ fcmovne $f17, $f11, $f10 ++ fcmovne $f18, $f13, $f12 ++ fcmovne $f19, $f15, $f14 ++ ++ CMPLT($f0, $f10), $f16 ++ CMPLT($f12, $f14), $f17 ++ ++ fcmovne $f16, $f10, $f0 ++ fcmovne $f17, $f14, $f12 ++ ++ CMPLT($f0, $f12), $f16 ++ fcmovne $f16, $f12, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $L20 ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addq X, INCX, X ++ ++ CMPLT($f0, $f20), $f16 ++ fcmovne $f16, $f20, $f0 ++ lda $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$L20: ++ sra N, 3, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ addq XX, INCX, XX ++ LD $f11, 0 * SIZE(XX) ++ addq XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ addq XX, INCX, XX ++ LD $f13, 0 * SIZE(XX) ++ addq XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ addq XX, INCX, XX ++ LD $f15, 0 * SIZE(XX) ++ addq XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ addq XX, INCX, XX ++ LD $f17, 0 * SIZE(XX) ++ addq XX, INCX, XX ++ ++ cmpteq $f0, $f10, $f20 ++ cmpteq $f0, $f11, $f21 ++ cmpteq $f0, $f12, $f22 ++ cmpteq $f0, $f13, $f23 ++ ++ lda $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ cmpteq $f0, $f14, $f24 ++ lda $0, 1($0) ++ addq XX, INCX, XX ++ fbne $f20, $End ++ ++ LD $f11, 0 * SIZE(XX) ++ cmpteq $f0, $f15, $f25 ++ lda $0, 1($0) ++ addq XX, INCX, XX ++ fbne $f21, $End ++ ++ LD $f12, 0 * SIZE(XX) ++ cmpteq $f0, $f16, $f26 ++ lda $0, 1($0) ++ addq XX, INCX, XX ++ fbne $f22, $End ++ ++ LD $f13, 0 * SIZE(XX) ++ cmpteq $f0, $f17, $f27 ++ lda $0, 1($0) ++ addq XX, INCX, XX ++ fbne $f23, $End ++ ++ LD $f14, 0 * SIZE(XX) ++ cmpteq $f0, $f10, $f20 ++ lda $0, 1($0) ++ addq XX, INCX, XX ++ fbne $f24, $End ++ ++ LD $f15, 0 * SIZE(XX) ++ cmpteq $f0, $f11, $f21 ++ lda $0, 1($0) ++ addq XX, INCX, XX ++ fbne $f25, $End ++ ++ LD $f16, 0 * SIZE(XX) ++ lda $1, -1($1) # i -- ++ cmpteq $f0, $f12, $f22 ++ lda $0, 1($0) ++ addq XX, INCX, XX ++ fbne $f26, $End ++ ++ LD $f17, 0 * SIZE(XX) ++ cmpteq $f0, $f13, $f23 ++ lda $0, 1($0) ++ addq XX, INCX, XX ++ fbne $f27, $End ++ ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ lda $0, 1($0) ++ cmpteq $f0, $f14, $f24 ++ unop ++ fbne $f20, $End ++ ++ lda $0, 1($0) ++ cmpteq $f0, $f15, $f25 ++ unop ++ fbne $f21, $End ++ ++ lda $0, 1($0) ++ cmpteq $f0, $f16, $f26 ++ unop ++ fbne $f22, $End ++ ++ lda $0, 1($0) ++ cmpteq $f0, $f17, $f27 ++ unop ++ fbne $f23, $End ++ ++ lda $0, 1($0) ++ fbne $f24, $End ++ lda $0, 1($0) ++ fbne $f25, $End ++ lda $0, 1($0) ++ fbne $f26, $End ++ lda $0, 1($0) ++ fbne $f27, $End ++ .align 4 ++ ++$L40: ++ LD $f20, 0 * SIZE(XX) ++ addq XX, INCX, XX ++ ++ cmpteq $f0, $f20, $f29 ++ ++ lda $0, 1($0) ++ fbne $f29, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ ret ++ ++ 
EPILOGUE +diff --git a/kernel/sw_64/imax.c b/kernel/sw_64/imax.c +new file mode 100644 +index 0000000..5072dd1 +--- /dev/null ++++ b/kernel/sw_64/imax.c +@@ -0,0 +1,69 @@ ++/*************************************************************************** ++Copyright (c) 2013, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++ ++/************************************************************************************** ++* 2013/09/14 Saar ++* BLASTEST float : NoTest ++* BLASTEST double : NoTest ++* CTEST : NoTest ++* TEST : NoTest ++* ++**************************************************************************************/ ++ ++#include "common.h" ++#include ++ ++ ++ ++BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) ++{ ++ BLASLONG i=0; ++ BLASLONG ix=0; ++ FLOAT maxf=0.0; ++ BLASLONG max=0; ++ ++ if (n <= 0 || inc_x <= 0) return(max); ++ ++ maxf=x[0]; ++ ix += inc_x; ++ i++; ++ ++ while(i < n) ++ { ++ if( x[ix] > maxf ) ++ { ++ max = i; ++ maxf = x[ix]; ++ } ++ ix += inc_x; ++ i++; ++ } ++ return(max+1); ++} ++ ++ +diff --git a/kernel/sw_64/imin.c b/kernel/sw_64/imin.c +new file mode 100644 +index 0000000..ffc6522 +--- /dev/null ++++ b/kernel/sw_64/imin.c +@@ -0,0 +1,67 @@ ++/*************************************************************************** ++Copyright (c) 2013, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. 
Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++ ++/************************************************************************************** ++* 2013/08/19 Saar ++* BLASTEST float ++* BLASTEST double ++* ++**************************************************************************************/ ++ ++#include "common.h" ++#include ++ ++ ++ ++BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) ++{ ++ BLASLONG i=0; ++ BLASLONG ix=0; ++ FLOAT minf=0.0; ++ BLASLONG min=0; ++ ++ if (n <= 0 || inc_x <= 0) return(min); ++ ++ minf=x[0]; ++ ix += inc_x; ++ i++; ++ ++ while(i < n) ++ { ++ if( x[ix] < minf ) ++ { ++ min = i; ++ minf = x[ix]; ++ } ++ ix += inc_x; ++ i++; ++ } ++ return(min+1); ++} ++ ++ +diff --git a/kernel/sw_64/izamax.S b/kernel/sw_64/izamax.S +new file mode 100644 +index 0000000..5ccc60e +--- /dev/null ++++ b/kernel/sw_64/izamax.S +@@ -0,0 +1,429 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. 
*/ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $2 ++ clr $0 ++ ++ fstd $f6, 32($sp) ++ mov X, XX ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ fclr $f0 ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ sra N, 2, $1 ++ addl INCX, INCX, INCX ++ ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ faddd $f20, $f21, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ ldi $1, -1($1) ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f1 ++ LD $f23, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ fmov $f0, $f2 ++ LD $f25, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ fmov $f0, $f3 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f8 ++ fabs $f21, $f9 ++ fabs $f22, $f10 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ fabs $f25, $f13 ++ fabs $f26, $f14 ++ fabs $f27, $f15 ++ ++ ble $1, $L14 ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ ldi $1, -1($1) ++ addl X, INCX, X ++ ++ LD $f22, 0 * SIZE(X) ++ LD $f23, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ LD $f25, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ faddd $f8, $f9, $f16 ++ unop ++ fabs $f20, $f8 ++ fillcs 64 * SIZE(X) ++ ++ faddd $f10, $f11, $f17 ++ unop ++ fabs $f21, $f9 ++ LD $f20, 0 * SIZE(X) ++ ++ faddd $f12, $f13, $f18 ++ LD $f21, 1 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ faddd $f14, $f15, $f19 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ unop ++ ++ CMPLT($f0, $f16), $f4 ++ LD $f23, 1 * SIZE(X) ++ fabs $f24, $f12 ++ addl X, INCX, X ++ ++ CMPLT($f1, $f17), $f5 ++ LD $f24, 0 * SIZE(X) ++ fabs $f25, $f13 ++ unop ++ ++ CMPLT($f2, $f18), $f6 ++ LD $f25, 1 * SIZE(X) ++ fabs $f26, $f14 ++ addl X, INCX, X ++ ++ CMPLT($f3, $f19), $f7 ++ LD $f26, 0 * SIZE(X) ++ fabs $f27, $f15 ++ unop ++ ++ fselne $f4, $f16, $f0, $f0 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ldi $1, -1($1) # i -- ++ ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ faddd $f8, $f9, $f16 ++ fabs $f20, $f8 ++ ++ faddd $f10, $f11, $f17 ++ fabs $f21, $f9 ++ ++ faddd $f12, $f13, $f18 ++ fabs $f22, $f10 ++ ++ faddd $f14, $f15, $f19 ++ fabs $f23, $f11 ++ ++ CMPLT($f0, $f16), $f4 ++ fabs $f24, $f12 ++ ++ CMPLT($f1, $f17), $f5 ++ fabs $f25, $f13 ++ ++ CMPLT($f2, $f18), $f6 ++ fabs $f26, $f14 ++ CMPLT($f3, $f19), $f7 ++ fabs $f27, $f15 ++ ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 
++ .align 4 ++ ++$L14: ++ faddd $f8, $f9, $f16 ++ faddd $f10, $f11, $f17 ++ faddd $f12, $f13, $f18 ++ faddd $f14, $f15, $f19 ++ ++ CMPLT($f0, $f16), $f4 ++ CMPLT($f1, $f17), $f5 ++ CMPLT($f2, $f18), $f6 ++ CMPLT($f3, $f19), $f7 ++ ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ ++ CMPLT($f0, $f1), $f16 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ ++ CMPLT($f0, $f2), $f16 ++ fselne $f16, $f2, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 3, $1 ++ unop ++ unop ++ ble $1, $L20 ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ faddd $f29, $f30, $f24 ++ fmov $f24,$f29 ++ ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$L20: ++ sra N, 2, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 ++ ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ fabs $f15, $f23 ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ fabs $f16, $f24 ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ fabs $f17, $f25 ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ faddd $f18, $f19, $f4 ++ faddd $f20, $f21, $f5 ++ faddd $f22, $f23, $f6 ++ faddd $f24, $f25, $f7 ++ ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 ++ ++ fabs $f10, $f18 ++ ldi $0, 1($0) ++ ldi $1, -1($1) # i -- ++ fbne $f26, $End ++ ++ fabs $f11, $f19 ++ ldi $0, 1($0) ++ unop ++ fbne $f27, $End ++ ++ fabs $f12, $f20 ++ ldi $0, 1($0) ++ unop ++ fbne $f28, $End ++ ++ fabs $f13, $f21 ++ ldi $0, 1($0) ++ fbne $f29, $End ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ fabs $f14, $f22 ++ fabs $f15, $f23 ++ fabs $f16, $f24 ++ fabs $f17, $f25 ++ ++ faddd $f18, $f19, $f4 ++ faddd $f20, $f21, $f5 ++ faddd $f22, $f23, $f6 ++ faddd $f24, $f25, $f7 ++ ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 ++ ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End ++ .align 4 ++ ++$L40: ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ ++ faddd $f18, $f19, $f2 ++ fmov $f2,$f18 ++ fcmpeq $f0, $f18, $f2 ++ ++ ldi $0, 1($0) ++ fbne $f2, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/izamax.S.bak b/kernel/sw_64/izamax.S.bak +new file mode 100644 +index 0000000..34e4c88 +--- /dev/null ++++ b/kernel/sw_64/izamax.S.bak +@@ -0,0 +1,427 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. 
*/ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $2 ++ clr $0 ++ ++ fstd $f6, 32($sp) ++ mov X, XX ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ fclr $f0 ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ sra N, 2, $1 ++ addl INCX, INCX, INCX ++ ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ faddd $f20, $f21, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ ldi $1, -1($1) ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f1 ++ LD $f23, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ fmov $f0, $f2 ++ LD $f25, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ fmov $f0, $f3 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f8 ++ fabs $f21, $f9 ++ fabs $f22, $f10 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ fabs $f25, $f13 ++ fabs $f26, $f14 ++ fabs $f27, $f15 ++ ++ ble $1, $L14 ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ ldi $1, -1($1) ++ addl X, INCX, X ++ ++ LD $f22, 0 * SIZE(X) ++ LD $f23, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ LD $f25, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) 
++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ faddd $f8, $f9, $f16 ++ unop ++ fabs $f20, $f8 ++ fillcs 64 * SIZE(X) ++ ++ faddd $f10, $f11, $f17 ++ unop ++ fabs $f21, $f9 ++ LD $f20, 0 * SIZE(X) ++ ++ faddd $f12, $f13, $f18 ++ LD $f21, 1 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ faddd $f14, $f15, $f19 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ unop ++ ++ CMPLT($f0, $f16), $f4 ++ LD $f23, 1 * SIZE(X) ++ fabs $f24, $f12 ++ addl X, INCX, X ++ ++ CMPLT($f1, $f17), $f5 ++ LD $f24, 0 * SIZE(X) ++ fabs $f25, $f13 ++ unop ++ ++ CMPLT($f2, $f18), $f6 ++ LD $f25, 1 * SIZE(X) ++ fabs $f26, $f14 ++ addl X, INCX, X ++ ++ CMPLT($f3, $f19), $f7 ++ LD $f26, 0 * SIZE(X) ++ fabs $f27, $f15 ++ unop ++ ++fselne $f4,$f16,$f0, $f0 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ldi $1, -1($1) # i -- ++ ++fselne $f5,$f17,$f1, $f1 ++fselne $f6,$f18,$f2, $f2 ++fselne $f7,$f19,$f3, $f3 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ faddd $f8, $f9, $f16 ++ fabs $f20, $f8 ++ ++ faddd $f10, $f11, $f17 ++ fabs $f21, $f9 ++ ++ faddd $f12, $f13, $f18 ++ fabs $f22, $f10 ++ ++ faddd $f14, $f15, $f19 ++ fabs $f23, $f11 ++ ++ CMPLT($f0, $f16), $f4 ++ fabs $f24, $f12 ++ ++ CMPLT($f1, $f17), $f5 ++ fabs $f25, $f13 ++ ++ CMPLT($f2, $f18), $f6 ++ fabs $f26, $f14 ++ CMPLT($f3, $f19), $f7 ++ fabs $f27, $f15 ++ ++fselne $f4,$f16,$f0, $f0 ++fselne $f5,$f17,$f1, $f1 ++fselne $f6,$f18,$f2, $f2 ++fselne $f7,$f19,$f3, $f3 ++ .align 4 ++ ++$L14: ++ faddd $f8, $f9, $f16 ++ faddd $f10, $f11, $f17 ++ faddd $f12, $f13, $f18 ++ faddd $f14, $f15, $f19 ++ ++ CMPLT($f0, $f16), $f4 ++ CMPLT($f1, $f17), $f5 ++ CMPLT($f2, $f18), $f6 ++ CMPLT($f3, $f19), $f7 ++ ++fselne $f4,$f16,$f0, $f0 ++fselne $f5,$f17,$f1, $f1 ++fselne $f6,$f18,$f2, $f2 ++fselne $f7,$f19,$f3, $f3 ++ ++ CMPLT($f0, $f1), $f16 ++ CMPLT($f2, $f3), $f17 ++ ++fselne $f16,$f1,$f0, $f0 ++fselne $f17,$f3,$f2, $f2 ++ ++ CMPLT($f0, $f2), $f16 ++fselne $f16,$f2,$f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 3, $1 ++ unop ++ unop ++ ble $1, $L20 ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ faddd $f29, $f30, $f29 ++ ++ CMPLT($f0, $f29), $f16 ++fselne $f16,$f29,$f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$L20: ++ sra N, 2, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 ++ ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ fabs $f15, $f23 ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ fabs $f16, $f24 ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ fabs $f17, $f25 ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ faddd $f18, $f19, $f4 ++ faddd $f20, $f21, $f5 ++ faddd $f22, $f23, $f6 ++ faddd $f24, $f25, $f7 ++ ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 ++ ++ fabs $f10, $f18 ++ ldi $0, 1($0) ++ ldi $1, -1($1) # i -- ++ fbne $f26, $End ++ ++ fabs $f11, $f19 ++ ldi $0, 1($0) ++ unop ++ fbne $f27, $End ++ ++ fabs $f12, $f20 ++ ldi $0, 1($0) ++ 
unop ++ fbne $f28, $End ++ ++ fabs $f13, $f21 ++ ldi $0, 1($0) ++ fbne $f29, $End ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ fabs $f14, $f22 ++ fabs $f15, $f23 ++ fabs $f16, $f24 ++ fabs $f17, $f25 ++ ++ faddd $f18, $f19, $f4 ++ faddd $f20, $f21, $f5 ++ faddd $f22, $f23, $f6 ++ faddd $f24, $f25, $f7 ++ ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 ++ ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End ++ .align 4 ++ ++$L40: ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ ++ faddd $f18, $f19, $f18 ++ fcmpeq $f0, $f18, $f2 ++ ++ ldi $0, 1($0) ++ fbne $f2, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/izamax_simd.S b/kernel/sw_64/izamax_simd.S +new file mode 100644 +index 0000000..8b00f60 +--- /dev/null ++++ b/kernel/sw_64/izamax_simd.S +@@ -0,0 +1,609 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 96 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $2 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#define VCMPLT(a, b) vfcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#define VCMPLT(a, b) vfcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ unop ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $2 ++ clr $0 ++ ++ fstd $f6, 32($sp) ++ mov X, XX ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ fclr $f0 ++ cmpeq INCX, SIZE, $3 ++ and X, (VEC_LEN*SIZE-1), $4 # test the address of X (aligment) ++ beq $2, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ bic $3, $4, $3 ++ nop ++ nop ++ beq $3, $Sub ++ .align 4 ++ ++$Align_Access: ++/* ++ Unloop 8*2=16 reals ++*/ ++#ifdef USE_MIN ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ ADD $f20, $f21, $f0 # init temp min result value ++#endif ++ sra N, 3, I ++ and N, 7, $3 ++ addl INCX, INCX, INCX ++ ble I, $Remain ++ .align 4 ++/* ++ Init max or min value ++*/ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ ++ ADD $f20, $f21, $f4 ++ nop ++ vcpyf $f4, $f0 ++ vcpyf $f4, $f1 ++ ++ ++ VLD $f22, 0*VEC_LEN*SIZE(X) ++ VLD $f23, 1*VEC_LEN*SIZE(X) ++ VLD $f24, 2*VEC_LEN*SIZE(X) ++ VLD $f25, 3*VEC_LEN*SIZE(X) ++ ++ /*vfabs*/ ++ vcpys $f31, $f22, $f10 ++ subl I, 1, I ++ vcpys $f31, $f23, $f11 ++ addl X, 16*SIZE, X ++ ++ vcpys $f31, $f24, $f12 ++ nop ++ vcpys $f31, $f25, $f13 ++ ble I, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ vextf $f10, 1, $f4 ++ VLD $f22, 0*VEC_LEN*SIZE(X) ++ vextf $f10, 3, $f5 ++ VLD $f23, 1*VEC_LEN*SIZE(X) ++ ++ vextf $f11, 0, $f6 ++ VLD $f24, 2*VEC_LEN*SIZE(X) ++ vextf $f11, 2, $f7 ++ VLD $f25, 3*VEC_LEN*SIZE(X) ++ ++ vextf $f12, 1, $f14 ++ vextf $f12, 3, $f15 ++ vextf $f13, 0, $f16 ++ vextf $f13, 2, $f17 ++ ++ vinsf $f4, $f11, 0, $f11 ++ vinsf $f6, $f10, 1, $f10 ++ vinsf $f14, $f13, 0, $f13 ++ vinsf $f16, $f12, 1, $f12 ++ ++ vinsf $f5, $f11, 2, $f11 ++ vinsf $f7, $f10, 3, $f10 ++ vinsf $f15, $f13, 2, $f13 ++ vinsf $f17, $f12, 3, $f12 ++ ++ VADD $f10, $f11, $f2 ++ addl X, 16*SIZE, X ++ VADD $f12, $f13, $f3 ++ subl I, 1, I ++ ++ vcpys $f31, $f22, $f10 ++ vcpys $f31, $f23, $f11 ++ VCMPLT($f0, $f2), $f18 ++ VCMPLT($f1, $f3), $f19 ++ ++ vcpys $f31, $f24, $f12 ++ fillcs PREFETCHSIZE * SIZE(X) ++ vcpys $f31, $f25, $f13 ++ nop ++ ++ vfseleq $f18, $f0, $f2, $f0 ++ vfseleq $f19, $f1, $f3, $f1 ++ nop ++ bgt I, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++/*spilt the complex vector to real vector($f10,$f12) and image vector ($f11,$f13)*/ ++ vextf $f10, 1, $f4 ++ vextf $f10, 3, $f5 ++ vextf $f11, 0, $f6 ++ vextf $f11, 2, $f7 ++ ++ vextf $f12, 1, $f14 ++ vextf $f12, 3, $f15 ++ vextf $f13, 0, $f16 ++ vextf $f13, 2, $f17 ++ ++ vinsf $f4, $f11, 0, $f11 ++ vinsf $f6, $f10, 1, $f10 ++ vinsf $f14, $f13, 0, $f13 ++ vinsf $f16, $f12, 1, $f12 ++ ++ vinsf $f5, $f11, 2, $f11 ++ vinsf $f7, $f10, 3, $f10 ++ vinsf $f15, $f13, 2, $f13 ++ vinsf $f17, $f12, 3, $f12 ++ ++ VADD $f10, $f11, $f2 ++ VADD $f12, $f13, $f3 ++ VCMPLT($f0, $f2), $f18 ++ 
VCMPLT($f1, $f3), $f19 ++ ++ vfseleq $f18, $f0, $f2, $f0 ++ vfseleq $f19, $f1, $f3, $f1 ++/*find the max or min between f0 and f1*/ ++ VCMPLT($f0, $f1), $f18 ++ vfseleq $f18, $f0, $f1, $f0 ++ ++ ++ vextf $f0, 1, $f22 ++ vextf $f0, 2, $f23 ++ vextf $f0, 3, $f24 ++ CMPLT($f0, $f22), $f16 ++ ++ CMPLT($f23, $f24), $f17 ++ fseleq $f16, $f0, $f22, $f0 ++ fseleq $f17, $f23, $f24, $f23 ++ CMPLT($f0, $f23), $f18 ++ ++ fseleq $f18, $f0, $f23, $f0 ++ nop ++ .align 4 ++$Remain: ++ ble $3, $Continuous_FindIndex ++ .align 4 ++$RemainLoop: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ unop ++ addl X, 2*SIZE, X ++ ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ ADD $f29, $f30, $f29 ++ ++ CMPLT($f0, $f29), $f16 ++ fselne $f16,$f29,$f0, $f0 ++ ++ subl $3, 1, $3 ++ bgt $3, $RemainLoop ++ .align 4 ++ ++ /*find index*/ ++$Continuous_FindIndex: ++ ++ jmp $L20 ++ ++$Sub: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ sra N, 2, $1 ++ addl INCX, INCX, INCX ++ ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ ADD $f20, $f21, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ ldi $1, -1($1) ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f1 ++ LD $f23, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ fmov $f0, $f2 ++ LD $f25, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ fmov $f0, $f3 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f8 ++ fabs $f21, $f9 ++ fabs $f22, $f10 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ fabs $f25, $f13 ++ fabs $f26, $f14 ++ fabs $f27, $f15 ++ ++ ble $1, $L14 ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ ldi $1, -1($1) ++ addl X, INCX, X ++ ++ LD $f22, 0 * SIZE(X) ++ LD $f23, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ LD $f25, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ ADD $f8, $f9, $f16 ++ fillcs PREFETCHSIZE * SIZE(X) ++ fabs $f20, $f8 ++ fillcs 64 * SIZE(X) ++ ++ ADD $f10, $f11, $f17 ++ unop ++ fabs $f21, $f9 ++ LD $f20, 0 * SIZE(X) ++ ++ ADD $f12, $f13, $f18 ++ LD $f21, 1 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ ADD $f14, $f15, $f19 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ unop ++ ++ CMPLT($f0, $f16), $f4 ++ LD $f23, 1 * SIZE(X) ++ fabs $f24, $f12 ++ addl X, INCX, X ++ ++ CMPLT($f1, $f17), $f5 ++ LD $f24, 0 * SIZE(X) ++ fabs $f25, $f13 ++ unop ++ ++ CMPLT($f2, $f18), $f6 ++ LD $f25, 1 * SIZE(X) ++ fabs $f26, $f14 ++ addl X, INCX, X ++ ++ CMPLT($f3, $f19), $f7 ++ LD $f26, 0 * SIZE(X) ++ fabs $f27, $f15 ++ unop ++ ++ fselne $f4,$f16,$f0, $f0 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ldi $1, -1($1) # i -- ++ ++ fselne $f5,$f17,$f1, $f1 ++ fselne $f6,$f18,$f2, $f2 ++ fselne $f7,$f19,$f3, $f3 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ ADD $f8, $f9, $f16 ++ fabs $f20, $f8 ++ ++ ADD $f10, $f11, $f17 ++ fabs $f21, $f9 ++ ++ ADD $f12, $f13, $f18 ++ fabs $f22, $f10 ++ ++ ADD $f14, $f15, $f19 ++ fabs $f23, $f11 ++ ++ CMPLT($f0, $f16), $f4 ++ fabs $f24, $f12 ++ ++ CMPLT($f1, $f17), $f5 ++ fabs $f25, $f13 ++ ++ CMPLT($f2, $f18), $f6 ++ fabs $f26, $f14 ++ CMPLT($f3, $f19), $f7 ++ fabs $f27, $f15 ++ ++ fselne $f4,$f16,$f0, $f0 ++ fselne $f5,$f17,$f1, $f1 ++ fselne $f6,$f18,$f2, $f2 ++ fselne $f7,$f19,$f3, $f3 ++ .align 4 ++ ++$L14: ++ ADD $f8, $f9, $f16 ++ ADD $f10, $f11, $f17 ++ ADD $f12, $f13, $f18 ++ ADD $f14, $f15, $f19 ++ ++ CMPLT($f0, $f16), $f4 ++ CMPLT($f1, $f17), $f5 ++ CMPLT($f2, $f18), $f6 ++ CMPLT($f3, $f19), $f7 ++ ++ fselne $f4,$f16,$f0, $f0 ++ fselne $f5,$f17,$f1, $f1 
++ fselne $f6,$f18,$f2, $f2 ++ fselne $f7,$f19,$f3, $f3 ++ ++ CMPLT($f0, $f1), $f16 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f16,$f1,$f0, $f0 ++ fselne $f17,$f3,$f2, $f2 ++ ++ CMPLT($f0, $f2), $f16 ++ fselne $f16,$f2,$f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 3, $1 ++ unop ++ unop ++ ble $1, $L20 ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ ADD $f29, $f30, $f29 ++ ++ CMPLT($f0, $f29), $f16 ++ fselne $f16,$f29,$f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$L20: ++ sra N, 2, $1 ++ ble $1, $L40 ++ .align 4 ++ ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ fabs $f12, $f20 ++ fabs $f13, $f21 ++ ++ ldi $1, -1($1) ++ ble $1, $L23 ++ .align 4 ++ ++$L22: ++ LD $f10, 0 * SIZE(XX) ++ fabs $f14, $f22 ++ LD $f11, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f12, 0 * SIZE(XX) ++ fabs $f15, $f23 ++ LD $f13, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f14, 0 * SIZE(XX) ++ fabs $f16, $f24 ++ LD $f15, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ LD $f16, 0 * SIZE(XX) ++ fabs $f17, $f25 ++ LD $f17, 1 * SIZE(XX) ++ addl XX, INCX, XX ++ ++ ADD $f18, $f19, $f4 ++ ADD $f20, $f21, $f5 ++ ADD $f22, $f23, $f6 ++ ADD $f24, $f25, $f7 ++ ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 ++ ++ fabs $f10, $f18 ++ ldi $0, 1($0) ++ ldi $1, -1($1) # i -- ++ fbne $f26, $End ++ ++ fabs $f11, $f19 ++ ldi $0, 1($0) ++ unop ++ fbne $f27, $End ++ ++ fabs $f12, $f20 ++ ldi $0, 1($0) ++ fillcs PREFETCHSIZE * SIZE(X) ++ fbne $f28, $End ++ ++ fabs $f13, $f21 ++ ldi $0, 1($0) ++ fbne $f29, $End ++ bgt $1, $L22 ++ .align 4 ++ ++$L23: ++ fabs $f14, $f22 ++ fabs $f15, $f23 ++ fabs $f16, $f24 ++ fabs $f17, $f25 ++ ++ ADD $f18, $f19, $f4 ++ ADD $f20, $f21, $f5 ++ ADD $f22, $f23, $f6 ++ ADD $f24, $f25, $f7 ++ ++ fcmpeq $f0, $f4, $f26 ++ fcmpeq $f0, $f5, $f27 ++ fcmpeq $f0, $f6, $f28 ++ fcmpeq $f0, $f7, $f29 ++ ++ ldi $0, 1($0) ++ fbne $f26, $End ++ ldi $0, 1($0) ++ fbne $f27, $End ++ ldi $0, 1($0) ++ fbne $f28, $End ++ ldi $0, 1($0) ++ fbne $f29, $End ++ .align 4 ++ ++$L40: ++ LD $f10, 0 * SIZE(XX) ++ LD $f11, 1 * SIZE(XX) ++ ++ addl XX, INCX, XX ++ ++ fabs $f10, $f18 ++ fabs $f11, $f19 ++ ++ ADD $f18, $f19, $f18 ++ fcmpeq $f0, $f18, $f2 ++ ++ ldi $0, 1($0) ++ fbne $f2, $End ++ br $31, $L40 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/lsame.S b/kernel/sw_64/lsame.S +new file mode 100644 +index 0000000..c2c0863 +--- /dev/null ++++ b/kernel/sw_64/lsame.S +@@ -0,0 +1,77 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#include "version.h" ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl lsame_ ++ .ent lsame_ ++lsame_: ++ .frame $sp,0,$26,0 ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ lda $28, _mcount ++ jsr $28, ($28), _mcount ++ .prologue 1 ++#else ++ .prologue 0 ++#endif ++ ++ ldbu $5, 0($16) ++ ldbu $6, 0($17) ++// extb $2, $5 ++// extbl $3, $6 ++ ++ subl $5, 96, $1 ++ subl $6, 96, $2 ++ subl $5, 32, $3 ++ subl $6, 32, $4 ++ ++ ++ selgt $1, $3, $5, $5 ++ selgt $2, $4, $6, $6 ++ cmpeq $5, $6, $0 ++ .align 4 ++ ++$End: ++ ret ++ .end lsame_ ++ .ident VERSION +diff --git a/kernel/sw_64/max.S b/kernel/sw_64/max.S +new file mode 100644 +index 0000000..07925d1 +--- /dev/null ++++ b/kernel/sw_64/max.S +@@ -0,0 +1,227 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef F_INTERFACE ++ ldl N, 0(N) # n ++ ldl INCX, 0(INCX) # incx ++#endif ++ ldi $sp, -STACKSIZE($sp) ++ nop ++ .align 4 ++ ++ cmplt $31, N, $2 ++ cmplt $31, INCX, $3 ++ SXADDQ INCX, $31, INCX ++ and $2, $3, $0 ++ ++ sra N, 3, $1 ++ fclr $f0 ++ unop ++ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f0, 0 * SIZE(X) ++ unop ++ unop ++ ble $1, $L15 ++ .align 4 ++ ++ fmov $f0, $f1 ++ addl X, INCX, X ++ fmov $f0, $f10 ++ ldi $1, -1($1) ++ ++ LD $f21, 0 * SIZE(X) ++ fmov $f0, $f11 ++ addl X, INCX, X ++ fmov $f0, $f12 ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f13 ++ addl X, INCX, X ++ fmov $f0, $f14 ++ ++ LD $f23, 0 * SIZE(X) ++ fmov $f0, $f15 ++ addl X, INCX, X ++ fmov $f0, $f20 ++ ++ LD $f24, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f25, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f26, 0 * SIZE(X) ++ addl X, INCX, X ++ LD $f27, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ CMPLT($f0, $f20), $f16 ++ CMPLT($f1, $f21), $f17 ++ CMPLT($f10, $f22), $f18 ++ CMPLT($f11, $f23), $f19 ++ ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ fselne $f16, $f20, $f0, $f0 ++ LD $f20, 0 * SIZE(X) ++ CMPLT($f12, $f24), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f21, $f1, $f1 ++ LD $f21, 0 * SIZE(X) ++ CMPLT($f13, $f25), $f17 ++ addl X, INCX, X ++ ++ fselne $f18, $f22, $f10, $f10 ++ LD $f22, 0 * SIZE(X) ++ CMPLT($f14, $f26), $f18 ++ addl X, INCX, X ++ ++ fselne $f19, $f23, $f11, $f11 ++ LD $f23, 0 * SIZE(X) ++ CMPLT($f15, $f27), $f19 ++ addl X, INCX, X ++ ++ fselne $f16, $f24, $f12, $f12 ++ LD $f24, 0 * SIZE(X) ++ CMPLT($f0, $f20), $f16 ++ addl X, INCX, X ++ ++ fselne $f17, $f25, $f13, $f13 ++ LD $f25, 0 * SIZE(X) ++ CMPLT($f1, $f21), $f17 ++ addl X, INCX, X ++ ++ fselne $f18, $f26, $f14, $f14 ++ LD $f26, 0 * SIZE(X) ++ CMPLT($f10, $f22), $f18 ++ addl X, INCX, X ++ ++ fselne $f19, $f27, $f15, $f15 ++ LD $f27, 0 * SIZE(X) ++ CMPLT($f11, $f23), $f19 ++ ldi $1, -1($1) # i -- ++ ++ addl X, INCX, X ++ unop ++ unop ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ fselne $f16, $f20, $f0, $f0 ++ CMPLT($f12, $f24), $f16 ++ ++ fselne $f17, $f21, $f1, $f1 ++ CMPLT($f13, $f25), $f17 ++ ++ fselne $f18, $f22, $f10, $f10 ++ CMPLT($f14, $f26), $f18 ++ ++ fselne $f19, $f23, $f11, $f11 ++ CMPLT($f15, $f27), $f19 ++ ++ fselne $f16, $f24, $f12, $f12 ++ CMPLT($f0, $f1), $f16 ++ fselne $f17, $f25, $f13, 
$f13 ++ CMPLT($f10, $f11), $f17 ++ ++ fselne $f18, $f26, $f14, $f14 ++ CMPLT($f12, $f13), $f18 ++ fselne $f19, $f27, $f15, $f15 ++ CMPLT($f14, $f15), $f19 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f11, $f10, $f10 ++ fselne $f18, $f13, $f12, $f12 ++ fselne $f19, $f15, $f14, $f14 ++ ++ CMPLT($f0, $f10), $f16 ++ CMPLT($f12, $f14), $f17 ++ ++ fselne $f16, $f10, $f0, $f0 ++ fselne $f17, $f14, $f12, $f12 ++ ++ CMPLT($f0, $f12), $f16 ++ fselne $f16, $f12, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 7, $1 ++ unop ++ unop ++ ble $1, $End ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ CMPLT($f0, $f20), $f16 ++ fselne $f16, $f20, $f0, $f0 ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$End: ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/nrm2_simd.S b/kernel/sw_64/nrm2_simd.S +new file mode 100644 +index 0000000..0888454 +--- /dev/null ++++ b/kernel/sw_64/nrm2_simd.S +@@ -0,0 +1,493 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++ ++ PROFCODE ++ ++ ++ fclr a0 ++ SXADDQ INCX, 0, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, SIZE, $0 ++ fclr a3 ++ beq $0, $L20 #stride access ++ ++/* test the address of X */ ++ and X, (VEC_LEN*SIZE-1), $3 ++ fclr t0 ++ nop ++ bne $3, $UnAlign_ACCESS ++/*Align access. Use simd instructions.*/ ++ sra N, 4, I ++ ble I, $Remain ++ ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t0 #clear s0 vector ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t1 ++ ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t2 ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t3 ++ ++ addl X, 16 * SIZE, X ++ subl I, 1, I ++ nop ++ ble I, $MainLoopEnd ++$MainLoop: ++ fillcs PREFETCHSIZE * SIZE(X) ++ VMAD a0, a0, t0, t0 ++ subl I, 1, I ++ VMAD a1, a1, t1, t1 ++ ++ addl X, 16 * SIZE, X ++ VMAD a2, a2, t2, t2 ++ nop ++ VMAD a3, a3, t3, t3 ++ ++ VLD a0, -4*VEC_LEN*SIZE(X) ++ VLD a1, -3*VEC_LEN*SIZE(X) ++ VLD a2, -2*VEC_LEN*SIZE(X) ++ VLD a3, -1*VEC_LEN*SIZE(X) ++ ++ bgt I, $MainLoop ++ .align 4 ++$MainLoopEnd: ++ VMAD a0, a0, t0, t0 ++ VMAD a1, a1, t1, t1 ++ VMAD a2, a2, t2, t2 ++ VMAD a3, a3, t3, t3 ++ ++ VADD t0, t1, a0 ++ VADD t2, t3, a1 ++ nop ++ VADD a0, a1, t0 ++ ++ vextf t0, 1, t1 ++ vextf t0, 2, t2 ++ vextf t0, 3, t3 ++ nop ++ ++ ADD t0, t1, a2 ++ ADD t2, t3, a3 ++ nop ++ ADD a2, a3, t0 ++ ++ .align 4 ++$Remain: ++ and N, 15, I ++ ble I, $End ++ .align 4 ++$RemainLoop: ++ LD a0, 0 * SIZE(X) ++ addl X, SIZE, X ++ MAD a0, a0, t0, t0 ++ subl I, 1, I ++ ++ bgt I, $RemainLoop ++ .align 4 ++$End: ++ SQRT t0, a0 ++ ret ++ .align 4 ++ ++/*Don't use simd*/ ++ ++$UnAlign_ACCESS: ++ ++ fclr t0 ++ sra N, 4, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ ADD a0, t0, a0 ++ fillcs (PREFETCHSIZE) * SIZE(X) ++ MUL x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ ADD a1, t1, a1 ++ mov X, XX ++ MUL x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ ADD a3, t3, a3 ++ unop ++ MUL x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ ADD a0, t0, a0 ++ unop ++ MUL x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ ADD a1, t1, a1 ++ unop ++ MUL x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ ADD a3, t3, a3 ++ unop ++ MUL x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ ADD a0, t0, a0 ++ unop ++ MUL x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ ADD a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ MUL x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ ADD a3, t3, a3 ++ unop ++ MUL x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ ADD a0, t0, a0 ++ unop ++ MUL x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ ADD a1, t1, a1 ++ ldi I, -1(I) ++ MUL x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ ADD a2, t2, a2 ++ unop ++ 
MUL x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ ADD a3, t3, a3 ++ MUL x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ ADD a0, t0, a0 ++ mov X, XX ++ MUL x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ ADD a1, t1, a1 ++ unop ++ MUL x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ ADD a3, t3, a3 ++ unop ++ MUL x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ ADD a0, t0, a0 ++ unop ++ MUL x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ ADD a1, t1, a1 ++ unop ++ MUL x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ ADD a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ MUL x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ ADD a0, t0, a0 ++ MUL x0, x0, t0 ++ ADD a1, t1, a1 ++ MUL x1, x1, t1 ++ ++ ADD a2, t2, a2 ++ MUL x2, x2, t2 ++ ADD a3, t3, a3 ++ MUL x3, x3, t3 ++ ++ ADD a0, t0, a0 ++ MUL x4, x4, t0 ++ ADD a1, t1, a1 ++ MUL x5, x5, t1 ++ ++ ADD a2, t2, a2 ++ MUL x6, x6, t2 ++ ADD a3, t3, a3 ++ MUL x7, x7, t3 ++ ++ ADD a1, t1, a1 ++ ADD a2, t2, a2 ++ ADD a3, t3, a3 ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ++ ADD a0, t0, a0 ++ MUL x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L25 ++ ++ fclr t2 ++ fclr t3 ++ ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x3, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x5, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x6, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ ADD a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ MUL x0, x0, t0 ++ addl X, INCX, X ++ ++ ADD a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ MUL x1, x1, t1 ++ addl X, INCX, X ++ ++ ADD a2, t2, a2 ++ LD x1, 0 * SIZE(X) ++ MUL x2, x2, t2 ++ addl X, INCX, X ++ ++ ADD a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ MUL x3, x3, t3 ++ addl X, INCX, X ++ ++ ADD a0, t0, a0 ++ LD x3, 0 * SIZE(X) ++ MUL x4, x4, t0 ++ addl X, INCX, X ++ ++ ADD a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ MUL x5, x5, t1 ++ addl X, INCX, X ++ ++ ADD a2, t2, a2 ++ LD x5, 0 * SIZE(X) ++ MUL x6, x6, t2 ++ addl X, INCX, X ++ ++ ADD a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ MUL x7, x7, t3 ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ ADD a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ MUL x0, x0, t0 ++ addl X, INCX, X ++ ++ ADD a1, t1, a1 ++ unop ++ MUL x1, x1, t1 ++ unop ++ ++ ADD a2, t2, a2 ++ MUL x2, x2, t2 ++ ADD a3, t3, a3 ++ MUL x3, x3, t3 ++ ++ ADD a0, t0, a0 ++ MUL x4, x4, t0 ++ ADD a1, t1, a1 ++ MUL x5, x5, t1 ++ ++ ADD a2, t2, a2 ++ MUL x6, x6, t2 ++ ADD a3, t3, a3 ++ MUL x7, x7, t3 ++ ++ ADD a1, t1, a1 ++ ADD a2, t2, a2 ++ ADD a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ ADD a0, t0, a0 ++ MUL x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ ADD a0, t0, a0 ++ ++ ADD a0, a1, a0 ++ ADD a2, a3, a2 ++ ++ ++ ADD a0, a2, a0 ++ SQRT a0, a0 ++ ++ .align 4 ++ ++$L999: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/rot.S b/kernel/sw_64/rot.S +new file mode 100644 +index 0000000..3c8624e +--- /dev/null ++++ b/kernel/sw_64/rot.S +@@ -0,0 +1,680 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. 
*/ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 ++ ++#define C $f10 ++#define S $f11 ++ ++#define PREFETCH_SIZE 80 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ldi $sp, -16($sp) ++ fstd $f20, 8($sp) ++ ++ fmov $f21, C ++ LD S, 16($sp) ++ cmpeq INCX, 1, $23 ++ cmpeq INCY, 1, $24 ++ ble N, $L998 ++ ++ ++ and $23, $24, $23 ++ beq $23, $L50 ++ ++ sra N, 3, I ++ ble I, $L15 ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 ++ ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ MUL C, $f15, $f27 ++ ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ MUL C, $f16, $f21 ++ flds $f31, (PREFETCH_SIZE) * SIZE(X) ++ unop ++ LD $f14, 5*SIZE(X) ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ ++ MUL C, $f17, $f23 ++ flds $f31, (PREFETCH_SIZE) * SIZE(Y) ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ ++ MUL C, $f19, $f27 
++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop ++ ++ ST $f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop ++ ++ ST $f26, -1*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ MUL C, $f16, $f21 ++ LD $f14, 5*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ SUB $f23, $f24, $f18 ++ fmov $f18,$f24 ++ LD $f18, 7*SIZE(X) ++ ++ MUL C, $f12, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ ++ MUL C, $f13, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ ++ MUL C, $f15, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ++ MUL C, $f16, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, 
$f24 ++ unop ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) ++ .align 4 ++ ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ST $f26, 0*SIZE(Y) ++ ldi Y, 1 * SIZE(Y) ++ ++ bgt I, $L16 ++ .align 4 ++ ++$L998: ++ clr $0 ++ fldd $f20, 8($sp) ++ ldi $sp, 16($sp) ++ ret ++ .align 4 ++ ++$L50: ++ mov X, XX ++ mov Y, YY ++ ++ sra N, 3, I ++ ble I, $L55 ++ .align 4 ++ ++$L51: ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 ++ ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 ++ ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 ++ ++ 
ADD $f21, $f22, $f20 ++ fmov $f20,$f22 ++ SUB $f23, $f24, $f20 ++ fmov $f20,$f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 ++ ++ ADD $f25, $f26, $f20 ++ fmov $f20,$f26 ++ SUB $f27, $f28, $f20 ++ fmov $f20,$f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 ++ ++$L55: ++ and N, 7, I ++ ble I, $L999 ++ .align 4 ++ ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ ST $f26, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 ++ .align 4 ++ ++$L999: ++ fldd $f20, 8($sp) ++ ldi $sp, 16($sp) ++ ++ clr $0 ++# fldd $f20, 8($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/rot.S.bak b/kernel/sw_64/rot.S.bak +new file mode 100644 +index 0000000..62e9ff9 +--- /dev/null ++++ b/kernel/sw_64/rot.S.bak +@@ -0,0 +1,624 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 ++ ++#define C $f10 ++#define S $f11 ++ ++#define PREFETCH_SIZE 80 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fmov $f21, C ++ LD S, 0($sp) ++ ++ cmpeq INCX, 1, $23 ++ cmpeq INCY, 1, $24 ++ ble N, $L998 ++ ++ and $23, $24, $23 ++ beq $23, $L50 ++ ++ sra N, 3, I ++ ble I, $L15 ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 ++ ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, $f22 ++ MUL C, $f15, $f27 ++ ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ MUL C, $f16, $f21 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ unop ++ LD $f14, 5*SIZE(X) ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ fillcs (PREFETCH_SIZE) * SIZE(Y) ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop ++ ++ ST $f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop ++ ++ ST $f26, -1*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ MUL C, 
$f16, $f21 ++ LD $f14, 5*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ LD $f18, 7*SIZE(X) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, $f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) ++ .align 4 ++ ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ST $f26, 0*SIZE(Y) ++ ldi Y, 1 * SIZE(Y) ++ ++ bgt I, $L16 ++ .align 4 ++ ++$L998: ++ clr $0 ++ ret ++ .align 4 ++ ++$L50: ++ mov X, XX ++ mov Y, YY ++ ++ sra N, 3, I ++ ble I, $L55 ++ .align 4 ++ ++$L51: ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ 
SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 ++ ++$L55: ++ and N, 7, I ++ ble I, $L999 ++ .align 4 ++ ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ ST $f26, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 ++ .align 4 ++ ++$L999: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/rot_simd.S b/kernel/sw_64/rot_simd.S +new file mode 100644 +index 0000000..99f3e05 +--- /dev/null ++++ b/kernel/sw_64/rot_simd.S +@@ -0,0 +1,783 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 ++ ++#define C $f10 ++#define S $f11 ++ ++#define x0 $f12 ++#define x1 $f14 ++#define x2 $f16 ++#define x3 $f18 ++ ++#define y0 $f13 ++#define y1 $f15 ++#define y2 $f17 ++#define y3 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++#define t4 $f24 ++#define t5 $f25 ++#define t6 $f26 ++#define t7 $f27 ++ ++#define PREFETCHSIZE 80 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fmov $f21, C ++ LD S, 0($sp) ++ ++ cmpeq INCX, 1, $23 ++ cmpeq INCY, 1, $24 ++ ble N, $L998 ++ ++ and $23, $24, $23 ++ beq $23, $L50 #incx!=1 or incy !=1 ++ ++/* test the address of X */ ++ and X, (VEC_LEN*SIZE-1), $3 ++ and Y, (VEC_LEN*SIZE-1), $4 ++ or $3, $4, $4 ++ bne $4, $UnAlign_ACCESS ++ ++/*Align Accessing*/ ++ sra N, 4, I ++ ble I, $Remain ++ ++ vcpyf C, C ++ vcpyf S, S ++ ++ VLD x0, 0*VEC_LEN*SIZE(X) ++ VLD x1, 1*VEC_LEN*SIZE(X) ++ VLD x2, 2*VEC_LEN*SIZE(X) ++ VLD x3, 3*VEC_LEN*SIZE(X) ++ ++ VLD y0, 0*VEC_LEN*SIZE(Y) ++ VLD y1, 1*VEC_LEN*SIZE(Y) ++ VLD y2, 2*VEC_LEN*SIZE(Y) ++ VLD y3, 3*VEC_LEN*SIZE(Y) ++ ++ addl X, 16 * SIZE, X ++ addl Y, 16 * SIZE, Y ++ subl I, 1, I ++ ble I, $MainLoopEnd ++ .align 4 ++$MainLoop: ++ VMUL C, x0, t0 ++ fillcs (PREFETCHSIZE) * SIZE(X) ++ VMUL C, x1, t1 ++ fillcs (PREFETCHSIZE) * SIZE(Y) ++ ++ VMUL C, x2, t2 ++ subl I, 1, I ++ VMUL C, x3, t3 ++ nop ++ ++ VMUL S, x0, t4 ++ VLD x0, 0*VEC_LEN*SIZE(X) ++ VMUL S, x1, t5 ++ VLD x1, 1*VEC_LEN*SIZE(X) ++ ++ VMUL S, x2, t6 ++ VLD x2, 2*VEC_LEN*SIZE(X) ++ VMUL S, x3, t7 ++ VLD x3, 3*VEC_LEN*SIZE(X) ++ ++ VMAD S, y0, t0, t0 ++ VMAD S, y1, t1, t1 ++ VMAD S, y2, t2, t2 ++ VMAD S, y3, t3, t3 ++ ++ VMSUB C, y0, t4, t4 ++ VLD y0, 0*VEC_LEN*SIZE(Y) ++ VMSUB C, y1, t5, t5 ++ VLD y1, 1*VEC_LEN*SIZE(Y) ++ ++ VMSUB C, y2, t6, t6 ++ VLD y2, 2*VEC_LEN*SIZE(Y) ++ VMSUB C, y3, t7, t7 ++ VLD y3, 3*VEC_LEN*SIZE(Y) ++ ++ VST t0, -4*VEC_LEN*SIZE(X) ++ VST t1, -3*VEC_LEN*SIZE(X) ++ VST t2, -2*VEC_LEN*SIZE(X) ++ VST t3, -1*VEC_LEN*SIZE(X) ++ ++ VST t4, -4*VEC_LEN*SIZE(Y) ++ VST t5, -3*VEC_LEN*SIZE(Y) ++ VST t6, -2*VEC_LEN*SIZE(Y) ++ VST t7, -1*VEC_LEN*SIZE(Y) ++ ++ addl X, 16 * SIZE, X ++ addl Y, 16 * SIZE, Y ++ nop ++ bgt I, $MainLoop ++ .align 4 ++$MainLoopEnd: ++ VMUL C, x0, t0 ++ VMUL C, x1, t1 ++ VMUL C, x2, t2 ++ VMUL C, x3, t3 ++ ++ VMUL S, x0, t4 ++ VMUL S, x1, t5 ++ VMUL S, x2, t6 ++ VMUL S, x3, t7 ++ ++ VMAD S, 
y0, t0, t0 ++ VMAD S, y1, t1, t1 ++ VMAD S, y2, t2, t2 ++ VMAD S, y3, t3, t3 ++ ++ VMSUB C, y0, t4, t4 ++ VMSUB C, y1, t5, t5 ++ VMSUB C, y2, t6, t6 ++ VMSUB C, y3, t7, t7 ++ ++ VST t0, -4*VEC_LEN*SIZE(X) ++ VST t1, -3*VEC_LEN*SIZE(X) ++ VST t2, -2*VEC_LEN*SIZE(X) ++ VST t3, -1*VEC_LEN*SIZE(X) ++ ++ VST t4, -4*VEC_LEN*SIZE(Y) ++ VST t5, -3*VEC_LEN*SIZE(Y) ++ VST t6, -2*VEC_LEN*SIZE(Y) ++ VST t7, -1*VEC_LEN*SIZE(Y) ++ ++ .align 4 ++$Remain: ++ and N, 15, I ++ ble I, $End ++$RemainLoop: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f12, $f24 ++ MAD S, $f13, $f21, $f25 ++ MSUB C, $f13, $f24, $f26 ++ ++ ++ ldi I, -1(I) ++ ST $f25, 0*SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ST $f26, 0*SIZE(Y) ++ ++ ldi Y, 1 * SIZE(Y) ++ bgt I, $RemainLoop ++ ++ .align 4 ++$End: ++ clr $0 ++ ret ++ .align 4 ++ ++$UnAlign_ACCESS: ++ ++ sra N, 3, I ++ ble I, $L15 ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 ++ ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, $f22 ++ MUL C, $f15, $f27 ++ ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ MUL C, $f16, $f21 ++ fillcs (PREFETCHSIZE) * SIZE(X) ++ unop ++ LD $f14, 5*SIZE(X) ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ fillcs (PREFETCHSIZE) * SIZE(Y) ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop ++ ++ ST $f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop ++ ++ ST $f26, -1*SIZE(X) ++ 
MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ MUL C, $f16, $f21 ++ LD $f14, 5*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ LD $f18, 7*SIZE(X) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, $f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) ++ .align 4 ++ ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ST $f26, 0*SIZE(Y) ++ ldi Y, 1 * SIZE(Y) ++ ++ bgt I, $L16 ++ .align 4 ++ ++$L998: ++ clr $0 ++ ret ++ .align 4 ++ ++$L50: ++ mov X, XX ++ mov Y, YY ++ ++ sra N, 3, I ++ ble I, $L55 ++ .align 4 ++ ++$L51: ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ 
INCY, YY, YY ++ ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f13, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f14, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f16, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f17, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ LD $f18, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f19, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f16, $f21 ++ MUL S, $f17, $f22 ++ MUL C, $f17, $f23 ++ MUL S, $f16, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f24, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ MUL C, $f18, $f25 ++ MUL S, $f19, $f26 ++ MUL C, $f19, $f27 ++ MUL S, $f18, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 0*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 0*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 ++ ++$L55: ++ and N, 7, I ++ ble I, $L999 ++ .align 4 ++ ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f25 ++ SUB $f23, $f24, $f26 ++ ldi I, -1(I) ++ ++ ST $f25, 0*SIZE(X) ++ SXADDQ INCX, X, X ++ ST $f26, 0*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 ++ .align 4 ++ ++$L999: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/scal-sw.S.bak b/kernel/sw_64/scal-sw.S.bak +new file mode 100644 +index 0000000..f8da324 +--- /dev/null ++++ b/kernel/sw_64/scal-sw.S.bak +@@ -0,0 +1,480 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $20 ++#define INCX $21 ++ ++#define XX $18 ++#define I $19 ++ ++#define ALPHA $f19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 ++ ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 ++ ++ PROLOGUE ++ PROFCODE ++ ++ mov X, XX ++ ble N, $L999 ++ ++ cmpeq INCX, 1, $0 ++ beq $0, $L20 ++ ++#ifndef DOUBLE ++ sra N, 4, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ LD a4, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ LD a5, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ LD a6, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ LD a7, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 8 * SIZE(X) ++ LD a1, 9 * SIZE(X) ++ LD a2, 10 * SIZE(X) ++ LD a3, 11 * SIZE(X) ++ ++ ST t0, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 12 * SIZE(X) ++ LD a5, 13 * SIZE(X) ++ LD a6, 14 * SIZE(X) ++ LD a7, 15 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t0, 8 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 9 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 10 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 11 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 16 * SIZE(X) ++ LD a1, 17 * SIZE(X) ++ LD a2, 18 * SIZE(X) ++ LD a3, 19 * SIZE(X) ++ ++ ST t0, 12 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 13 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 14 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 15 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 20 * SIZE(X) ++ LD a5, 21 * SIZE(X) ++ LD a6, 22 * SIZE(X) ++ LD a7, 23 * SIZE(X) ++ ++ ST t0, 16 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 17 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 18 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 19 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 24 * SIZE(X) ++ LD a1, 25 * SIZE(X) ++ LD a2, 26 * SIZE(X) ++ LD a3, 27 * SIZE(X) ++ ++ ST t0, 20 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 21 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 22 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 23 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 28 * SIZE(X) ++ LD a5, 29 * SIZE(X) ++ LD a6, 30 * SIZE(X) ++ LD 
a7, 31 * SIZE(X) ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ ldi I, -1(I) ++ addl X, 16 * SIZE, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ST t0, 8 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 9 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 10 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 11 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ ST t0, 12 * SIZE(X) ++ ST t1, 13 * SIZE(X) ++ ST t2, 14 * SIZE(X) ++ ST t3, 15 * SIZE(X) ++ addl X, 16 * SIZE, X ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ++#else ++ ++ sra N, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ LD a4, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ LD a5, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ LD a6, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ LD a7, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 8 * SIZE(X) ++ ldi I, -1(I) ++ LD a1, 9 * SIZE(X) ++ addl X, 8 * SIZE, X ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ ST t0, -4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, -3 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, -2 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, -1 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ fillcs PREFETCHSIZE * SIZE(X) ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ ST t0, 4 * SIZE(X) ++ ST t1, 5 * SIZE(X) ++ ST t2, 6 * SIZE(X) ++ ST t3, 7 * SIZE(X) ++ addl X, 8 * SIZE, X ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ++#endif ++ ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ LD a0, 0 * SIZE(X) ++ ++ MUL a0, ALPHA, t0 ++ ++ ST t0, 0 * SIZE(X) ++ ++ addl X, SIZE, X ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ ret ++ .align 4 ++ ++$L20: ++ sra N, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ldi I, -1(I) ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, X, X ++ unop ++ ++ LD a6, 0 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ SXADDQ INCX, X, X ++ unop ++ ++ LD a7, 0 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ SXADDQ INCX, X, X ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++ fillcs PREFETCHSIZE * SIZE(X) ++ SXADDQ INCX, XX, XX ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ unop ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t0, 0 * SIZE(XX) ++ MUL a0, ALPHA, t0 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a2, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a6, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a3, ALPHA, t3 ++ 
SXADDQ INCX, XX, XX ++ unop ++ ++ LD a7, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ unop ++ bne I, $L22 ++ .align 4 ++ ++$L23: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++ SXADDQ INCX, XX, XX ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L27: ++ LD a0, 0 * SIZE(X) ++ ++ MUL a0, ALPHA, t0 ++ ++ ST t0, 0 * SIZE(XX) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L27 ++ .align 4 ++ ++$L999: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/scal.S b/kernel/sw_64/scal.S +new file mode 100644 +index 0000000..87b89c9 +--- /dev/null ++++ b/kernel/sw_64/scal.S +@@ -0,0 +1,480 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $20 ++#define INCX $21 ++ ++#define XX $18 ++#define I $19 ++ ++#define ALPHA $f19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 ++ ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 ++ ++ PROLOGUE ++ PROFCODE ++ ++ mov X, XX ++ ble N, $L999 ++ ++ cmpeq INCX, 1, $0 ++ beq $0, $L20 ++ ++#ifndef DOUBLE ++ sra N, 4, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ LD a4, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ LD a5, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ LD a6, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ LD a7, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 8 * SIZE(X) ++ LD a1, 9 * SIZE(X) ++ LD a2, 10 * SIZE(X) ++ LD a3, 11 * SIZE(X) ++ ++ ST t0, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 12 * SIZE(X) ++ LD a5, 13 * SIZE(X) ++ LD a6, 14 * SIZE(X) ++ LD a7, 15 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t0, 8 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 9 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 10 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 11 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 16 * SIZE(X) ++ LD a1, 17 * SIZE(X) ++ LD a2, 18 * SIZE(X) ++ LD a3, 19 * SIZE(X) ++ ++ ST t0, 12 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 13 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 14 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 15 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 20 * SIZE(X) ++ LD a5, 21 * SIZE(X) ++ LD a6, 22 * SIZE(X) ++ LD a7, 23 * SIZE(X) ++ ++ ST t0, 16 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 17 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 18 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 19 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 24 * SIZE(X) ++ LD a1, 25 * SIZE(X) ++ LD a2, 26 * SIZE(X) ++ LD a3, 27 * SIZE(X) ++ ++ ST t0, 20 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, 21 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, 22 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, 23 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 28 * SIZE(X) ++ LD a5, 29 * SIZE(X) ++ LD a6, 30 * SIZE(X) ++ LD a7, 31 * SIZE(X) ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ ldi I, -1(I) ++ addl X, 16 * SIZE, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ST t0, 8 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 9 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 10 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 11 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ ST t0, 12 * SIZE(X) ++ ST t1, 13 * SIZE(X) ++ ST t2, 14 * SIZE(X) ++ ST t3, 15 * SIZE(X) ++ addl X, 16 * SIZE, X ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ++#else ++ ++ sra N, 3, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ LD a4, 4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ LD a5, 5 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ LD a6, 6 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ LD a7, 7 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t0, 0 * SIZE(X) ++ MUL a4, 
ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ LD a0, 8 * SIZE(X) ++ ldi I, -1(I) ++ LD a1, 9 * SIZE(X) ++ addl X, 8 * SIZE, X ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ ST t0, -4 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t1, -3 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ ++ ST t2, -2 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ ST t3, -1 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ fillcs PREFETCHSIZE * SIZE(X) ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ST t0, 0 * SIZE(X) ++ MUL a4, ALPHA, t0 ++ ST t1, 1 * SIZE(X) ++ MUL a5, ALPHA, t1 ++ ++ ST t2, 2 * SIZE(X) ++ MUL a6, ALPHA, t2 ++ ST t3, 3 * SIZE(X) ++ MUL a7, ALPHA, t3 ++ ++ ST t0, 4 * SIZE(X) ++ ST t1, 5 * SIZE(X) ++ ST t2, 6 * SIZE(X) ++ ST t3, 7 * SIZE(X) ++ addl X, 8 * SIZE, X ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ++#endif ++ ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ LD a0, 0 * SIZE(X) ++ ++ MUL a0, ALPHA, t0 ++ ++ ST t0, 0 * SIZE(X) ++ ++ addl X, SIZE, X ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ ret ++ .align 4 ++ ++$L20: ++ sra N, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ldi I, -1(I) ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, X, X ++ unop ++ ++ LD a6, 0 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ SXADDQ INCX, X, X ++ unop ++ ++ LD a7, 0 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ SXADDQ INCX, X, X ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++ fillcs PREFETCHSIZE * SIZE(X) ++ SXADDQ INCX, XX, XX ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ unop ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t0, 0 * SIZE(XX) ++ MUL a0, ALPHA, t0 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a2, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a6, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a3, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a7, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ unop ++ bne I, $L22 ++ .align 4 ++ ++$L23: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++ SXADDQ INCX, XX, XX ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L27: ++ LD a0, 0 * SIZE(X) ++ ++ MUL a0, ALPHA, t0 ++ ++ ST t0, 0 * SIZE(XX) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L27 ++ .align 4 ++ ++$L999: ++ ret ++ 
EPILOGUE +diff --git a/kernel/sw_64/scal_simd.S b/kernel/sw_64/scal_simd.S +new file mode 100644 +index 0000000..7462e99 +--- /dev/null ++++ b/kernel/sw_64/scal_simd.S +@@ -0,0 +1,344 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 144 ++ ++#define N $16 ++#define X $20 ++#define INCX $21 ++ ++#define XX $18 ++#define I $19 ++ ++#define ALPHA $f19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 ++ ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++ mov X, XX ++ ble N, $L999 ++ ++ cmpeq INCX, 1, $0 ++ beq $0, $L20 ++ ++/** ++ test the address of X ++**/ ++ and X, (VEC_LEN*SIZE-1), $4 ++ beq $4, $Align_X_Access ++ ++ .align 5 ++/** ++ process the unalign address of X ++**/ ++ sra N, 4, I ++ ble I, $Remain /*if N is too small(less then unroll size), don't need process unalign X. 
Just jump to remain section.*/ ++ ++ sra $4, BASE_SHIFT, $4 ++ ldi $3, VEC_LEN ++ subl $3, $4, $4 ++ subl N, $4, N ++ ++$UnAlign_X_Loop: ++ LD a0, 0*SIZE(X) ++ MUL a0, ALPHA, t0 ++ ST t0, 0*SIZE(X) ++ addl X, SIZE, X ++ ++ ++ ++ subl $4, 1, $4 ++ bgt $4, $UnAlign_X_Loop ++ .align 5 ++ ++$Align_X_Access: ++ ++/* ++ Unloop 16 ++*/ ++ sra N, 4, I ++ vcpyf ALPHA, ALPHA ++ ble I, $Remain ++ ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $MainLoop_End ++ .align 5 ++$MainLoop: ++ VMUL a0, ALPHA, t0 ++ VLD a0, 4*VEC_LEN*SIZE(X) ++ VMUL a1, ALPHA, t1 ++ VLD a1, 5*VEC_LEN*SIZE(X) ++ ++ VMUL a2, ALPHA, t2 ++ VLD a2, 6*VEC_LEN*SIZE(X) ++ VMUL a3, ALPHA, t3 ++ VLD a3, 7*VEC_LEN*SIZE(X) ++ ++ VST t0, 0*VEC_LEN*SIZE(X) ++ VST t1, 1*VEC_LEN*SIZE(X) ++ VST t2, 2*VEC_LEN*SIZE(X) ++ VST t3, 3*VEC_LEN*SIZE(X) ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ ldi I, -1(I) ++ addl X, 16 * SIZE, X ++ bne I, $MainLoop ++ .align 5 ++ ++$MainLoop_End: ++ VMUL a0, ALPHA, t0 ++ VST t0, 0*VEC_LEN*SIZE(X) ++ VMUL a1, ALPHA, t1 ++ VST t1, 1*VEC_LEN*SIZE(X) ++ ++ VMUL a2, ALPHA, t2 ++ VST t2, 2*VEC_LEN*SIZE(X) ++ VMUL a3, ALPHA, t3 ++ VST t3, 3*VEC_LEN*SIZE(X) ++ ++ addl X, 16 * SIZE, X ++ .align 5 ++ ++$Remain: ++ and N, 15, I ++ unop ++ unop ++ ble I, $L999 ++ .align 5 ++ ++$L17: ++ LD a0, 0 * SIZE(X) ++ ++ MUL a0, ALPHA, t0 ++ ++ ST t0, 0 * SIZE(X) ++ ++ addl X, SIZE, X ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ ret ++ .align 5 ++ ++$L20: ++ sra N, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ MUL a0, ALPHA, t0 ++ ldi I, -1(I) ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, X, X ++ unop ++ ++ LD a6, 0 * SIZE(X) ++ MUL a2, ALPHA, t2 ++ SXADDQ INCX, X, X ++ unop ++ ++ LD a7, 0 * SIZE(X) ++ MUL a3, ALPHA, t3 ++ SXADDQ INCX, X, X ++ ble I, $L23 ++ .align 5 ++ ++$L22: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++/* ++ fillcs PREFETCHSIZE * SIZE(X) ++*/ ++ fillcs PREFETCHSIZE * SIZE(X) ++ SXADDQ INCX, XX, XX ++ ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ unop ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a1, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t0, 0 * SIZE(XX) ++ MUL a0, ALPHA, t0 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a1, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a2, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a6, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a3, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ unop ++ ++ LD a7, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ unop ++ bne I, $L22 ++ .align 5 ++ ++$L23: ++ ST t0, 0 * SIZE(XX) ++ MUL a4, ALPHA, t0 ++ SXADDQ INCX, XX, XX ++ ++ ST t1, 0 * SIZE(XX) ++ MUL a5, ALPHA, t1 ++ SXADDQ INCX, XX, XX ++ ++ ST t2, 0 * SIZE(XX) ++ MUL a6, ALPHA, t2 ++ SXADDQ INCX, XX, XX ++ ++ ST t3, 0 * SIZE(XX) ++ MUL a7, ALPHA, t3 ++ SXADDQ INCX, XX, XX ++ ++ ST t0, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t1, 0 * SIZE(XX) ++ SXADDQ INCX, XX, 
XX ++ ST t2, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST t3, 0 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ .align 5 ++ ++$L25: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L999 ++ .align 5 ++ ++$L27: ++ LD a0, 0 * SIZE(X) ++ ++ MUL a0, ALPHA, t0 ++ ++ ST t0, 0 * SIZE(XX) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L27 ++ .align 5 ++ ++$L999: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/snrm2.S b/kernel/sw_64/snrm2.S +new file mode 100644 +index 0000000..ff1ec57 +--- /dev/null ++++ b/kernel/sw_64/snrm2.S +@@ -0,0 +1,491 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++#define x8 $f24 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stl $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ SXADDQ INCX, 0, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 4, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, x8 ++ fmov x8,a0 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1,x8 ++ fmov x8,a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2,x8 ++ fmov x8,a2 ++ #unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3,x8 ++ fmov x8,a3 ++ #unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, x8 ++ fmov x8,a0 ++ #unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, x8 ++ fmov x8,a1 ++ #unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, x8 ++ fmov x8,a2 ++ #unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, x8 ++ fmov x8,a3 ++ #unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, x8 ++ fmov x8,a0 ++ #unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1,x8 ++ fmov x8,a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, x8 ++ fmov x8,a2 ++ #unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3,x8 ++ fmov x8,a3 ++ #unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, x8 ++ fmov x8,a0 ++ #unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1,x8 ++ fmov x8,a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, x8 ++ fmov x8,a2 ++ #unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3,x8 ++ fmov x8,a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0,x8 ++ fmov x8,a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1,x8 ++ fmov x8,a1 ++ #unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2,x8 ++ fmov x8,a2 ++ #unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, x8 ++ fmov x8,a3 ++ #unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, x8 ++ fmov x8,a0 ++ #unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1, x8 ++ fmov x8,a1 ++ #unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2, x8 ++ fmov x8,a2 ++ #unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3,x8 ++ fmov x8,a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd 
a0, t0,x8 ++ fmov x8,a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, x8 ++ fmov x8,a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, x8 ++ fmov x8,a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, x8 ++ fmov x8,a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, x8 ++ fmov x8,a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, x8 ++ fmov x8,a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, x8 ++ fmov x8,a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, x8 ++ fmov x8,a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, x8 ++ fmov x8,a1 ++ faddd a2, t2, x8 ++ fmov x8,a2 ++ faddd a3, t3, x8 ++ fmov x8,a3 ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ++ faddd a0, t0,x8 ++ fmov x8,a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L25 ++ ++ fclr t2 ++ fclr t3 ++ ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x3, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x5, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x6, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, x8 ++ fmov x8,a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1,x8 ++ fmov x8,a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2,x8 ++ fmov x8,a2 ++ LD x1, 0 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3,x8 ++ fmov x8,a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ addl X, INCX, X ++ ++ faddd a0, t0,x8 ++ fmov x8,a0 ++ LD x3, 0 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1,x8 ++ fmov x8,a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2,x8 ++ fmov x8,a2 ++ LD x5, 0 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, x8 ++ fmov x8,a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0,x8 ++ fmov x8,a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, x8 ++ fmov x8,a1 ++ unop ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2,x8 ++ fmov x8,a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, x8 ++ fmov x8,a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, x8 ++ fmov x8,a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, x8 ++ fmov x8,a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, x8 ++ fmov x8,a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, x8 ++ fmov x8,a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, x8 ++ fmov x8,a1 ++ faddd a2, t2, x8 ++ fmov x8,a2 ++ faddd a3, t3, x8 ++ fmov x8,a3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0,x8 ++ fmov x8,a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0,x8 ++ fmov x8,a0 ++ ++ faddd a0, a1, x8 ++ fmov x8,a1 ++ faddd a2, a3, x8 ++ fmov x8,a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2,x8 ++ fsqrtd x8, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/snrm2.S.bak b/kernel/sw_64/snrm2.S.bak +new file mode 100644 +index 0000000..753c90b +--- /dev/null ++++ b/kernel/sw_64/snrm2.S.bak +@@ -0,0 +1,431 @@ 
++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ SXADDQ INCX, 0, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 4, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, a0 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, a1 ++ faddd a2, t2, 
a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L15: ++ and N, 15, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ ldi X, 1 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L25 ++ ++ fclr t2 ++ fclr t3 ++ ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x1, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x3, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x5, 0 * SIZE(X) ++ addl X, INCX, X ++ LD x6, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, a2 ++ LD x1, 0 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ LD x3, 0 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ addl X, INCX, X ++ ++ faddd a2, t2, a2 ++ LD x5, 0 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ addl X, INCX, X ++ ++ ldi I, -1(I) ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 0 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a1, t1, a1 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ ++ ldi I, -1(I) ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, a0 ++ ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/staticbuffer.S b/kernel/sw_64/staticbuffer.S +new file mode 100644 +index 0000000..7bbd23d +--- /dev/null ++++ b/kernel/sw_64/staticbuffer.S +@@ -0,0 +1,45 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++ ++#ifdef ALLOC_STATIC ++ .align 8 ++ .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 ++#endif +diff --git a/kernel/sw_64/sum.S b/kernel/sw_64/sum.S +new file mode 100644 +index 0000000..0be6d53 +--- /dev/null ++++ b/kernel/sw_64/sum.S +@@ -0,0 +1,230 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ ble N, $L999 ++ ++ sra N, 3, I ++ fclr s1 ++ fclr s2 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t1 ++ SXADDQ INCX, X, X ++ fclr t2 ++ ++ LD a1, 0 * SIZE(X) ++ fclr t3 ++ SXADDQ INCX, X, X ++ fclr s3 ++ ++ LD a2, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a3, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a5, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ ldw $31, PREFETCHSIZE * 2 * SIZE(X) ++ fmov a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ LD a6, 0 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ LD a7, 0 * SIZE(X) ++ fmov a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ LD a0, 0 * SIZE(X) ++ fmov a3, t3 ++ SXADDQ INCX, X, X ++ ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ LD a1, 0 * SIZE(X) ++ fmov a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ LD a2, 0 * SIZE(X) ++ fmov a5, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ LD a3, 0 * SIZE(X) ++ fmov a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ LD a4, 0 * SIZE(X) ++ fmov a7, t3 ++ SXADDQ INCX, X, X ++ ++ LD a5, 0 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ LD a6, 0 * SIZE(X) ++ fmov a0, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ LD a7, 0 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ fmov a2, t2 ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ fmov a3, t3 ++ ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ fmov a4, t0 ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ fmov a5, t1 ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ fmov a6, t2 ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ fmov a7, t3 ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ ++ ADD s0, s1, $f24 ++ fmov $f24,s0 ++ ADD s2, s3, $f24 ++ fmov $f24,s2 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ADD s0, s2, $f24 ++ fmov $f24,s0 ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ LD a0, 0 * SIZE(X) ++ SXADDQ INCX, X, X ++ fmov a0, t0 ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/sw_fpcr.S b/kernel/sw_64/sw_fpcr.S +new file mode 100644 +index 0000000..5dee238 +--- /dev/null ++++ b/kernel/sw_64/sw_fpcr.S +@@ -0,0 +1,39 @@ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ .arch sw2b ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl read_fpcr ++ .ent read_fpcr ++read_fpcr: ++ .frame $sp, 0, $26, 0 ++ RFPCR $f10 ++ fstd $f10, 0($16) ++ ret ++ .end read_fpcr ++ ++ .globl write_fpcr ++ .ent write_fpcr ++write_fpcr: ++ .frame $sp, 0, $26, 0 ++ fldd $f10, 0($16) ++ WFPCR $f10 ++ ret ++ .end write_fpcr ++/** ++ .globl fadd_test ++ .ent 
fadd_test ++ ++fadd_test: ++ .frame $sp, 0, $26, 0 ++ faddd $f16, $f17, $f16 ++ fmov $f16, $f0 ++ ret ++ .end fadd_test ++**/ ++ .ident VERSION ++ +diff --git a/kernel/sw_64/sw_fpcr_inline.c b/kernel/sw_64/sw_fpcr_inline.c +new file mode 100644 +index 0000000..1943e3e +--- /dev/null ++++ b/kernel/sw_64/sw_fpcr_inline.c +@@ -0,0 +1,13 @@ ++#include "common.h" ++ ++void read_fpcr(long * test){ ++ ++ __asm__("rfpcr $f10 \n fstd $f10, %0":"=m"(*test):); ++ return; ++} ++ ++void write_fpcr(long * test){ ++ ++ __asm__("fldd $f10, %0\nwfpcr $f10"::"m"(*test)); ++ return; ++} +diff --git a/kernel/sw_64/swap.S b/kernel/sw_64/swap.S +new file mode 100644 +index 0000000..5c8b679 +--- /dev/null ++++ b/kernel/sw_64/swap.S +@@ -0,0 +1,249 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++ mov $20, $17 ++ mov $21, $18 ++ ldl $19, 0($sp) ++ ldl $20, 8($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ subl $18, 1, $1 ++ subl $20, 1, $2 ++ ble $16, $SubEnd # if n <= 0 goto $End ++ or $1, $2, $1 ++ ++ sra $16, 3, $21 ++ ++ and $16, 7, $22 ++ bne $1, $Sub ++ ble $21, $MainRemain ++ .align 4 ++ ++$MainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f12, 2*SIZE($19) ++ LD $f13, 3*SIZE($19) ++ LD $f14, 4*SIZE($19) ++ LD $f15, 5*SIZE($19) ++ LD $f16, 6*SIZE($19) ++ LD $f17, 7*SIZE($19) ++ ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ LD $f22, 2*SIZE($17) ++ LD $f23, 3*SIZE($17) ++ LD $f24, 4*SIZE($17) ++ LD $f25, 5*SIZE($17) ++ LD $f26, 6*SIZE($17) ++ LD $f27, 7*SIZE($17) ++ ++ fillcs 32*SIZE($17) ++ unop ++ fillcs 32*SIZE($19) ++ subl $21, 1, $21 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f12, 2*SIZE($17) ++ ST $f13, 3*SIZE($17) ++ ST $f14, 4*SIZE($17) ++ ST $f15, 5*SIZE($17) ++ ST $f16, 6*SIZE($17) ++ ST $f17, 7*SIZE($17) ++ ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) ++ ST $f22, 2*SIZE($19) ++ ST $f23, 3*SIZE($19) ++ ST $f24, 4*SIZE($19) ++ ST $f25, 5*SIZE($19) ++ ST $f26, 6*SIZE($19) ++ ST $f27, 7*SIZE($19) ++ ++ ldi $17, 8*SIZE($17) ++ ldi $19, 8*SIZE($19) ++ bgt $21, $MainLoop ++ .align 4 ++ ++$MainRemain: ++ ble $22, $MainEnd ++ .align 4 ++ ++$MainRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ ldi $17, 1*SIZE($17) ++ ldi $19, 1*SIZE($19) ++ subl $22, 1, $22 ++ ST $f10, -1*SIZE($17) ++ ST $f20, -1*SIZE($19) ++ bgt $22, $MainRemainLoop ++ .align 4 ++ ++$MainEnd: ++ clr $0 ++ ret ++ .align 4 ++ ++$Sub: ++ mov $17, $23 ++ mov $19, $24 ++ ++ ble $21, $SubRemain ++ .align 4 ++ ++$SubLoop: ++ LD $f10, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f11, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f12, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f13, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f14, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f15, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f16, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f17, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f20, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f21, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f22, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f23, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f24, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f25, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f26, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f27, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ ST $f10, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f11, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f12, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f13, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f14, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f15, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f16, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f17, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f20, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f21, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f22, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f23, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f24, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f25, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f26, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f27, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ subl $21, 1, $21 ++ bgt $21, $SubLoop ++ .align 4 ++ 
++$SubRemain: ++ ble $22, $SubEnd ++ .align 4 ++ ++$SubRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ ++ subl $22, 1, $22 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f20, 0*SIZE($19) ++ ++ SXADDQ $18, $17, $17 ++ SXADDQ $20, $19, $19 ++ bgt $22, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/swap_simd.S b/kernel/sw_64/swap_simd.S +new file mode 100644 +index 0000000..8a6141d +--- /dev/null ++++ b/kernel/sw_64/swap_simd.S +@@ -0,0 +1,327 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 64 ++#define X $17 ++#define Y $19 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++ mov $20, $17 ++ mov $21, $18 ++ ldl $19, 0($sp) ++ ldl $20, 8($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ subl $18, 1, $1 ++ subl $20, 1, $2 ++ ble $16, $SubEnd # if n <= 0 goto $End ++ or $1, $2, $1 ++ ++/* ++ Unloop 16 ++*/ ++ sra $16, 4, $21 ++ and $16, 15, $22 ++ bne $1, $Sub ++ ble $21, $MainRemain ++ .align 4 ++ ++/* ++ test the address of Y & X ++*/ ++ and Y, (VEC_LEN*SIZE-1), $4 ++ and X, (VEC_LEN*SIZE-1), $3 ++ or $3, $4, $4 ++ bne $4, $UnAlign_ACCESS ++ ++/* align access*/ ++ ++$MainLoop: ++ VLD $f10, 0*VEC_LEN*SIZE(Y) ++ VLD $f11, 1*VEC_LEN*SIZE(Y) ++ VLD $f12, 2*VEC_LEN*SIZE(Y) ++ VLD $f13, 3*VEC_LEN*SIZE(Y) ++ ++ ++ VLD $f20, 0*VEC_LEN*SIZE(X) ++ VLD $f21, 1*VEC_LEN*SIZE(X) ++ VLD $f22, 2*VEC_LEN*SIZE(X) ++ VLD $f23, 3*VEC_LEN*SIZE(X) ++ ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ unop ++ fillcs PREFETCHSIZE * SIZE(Y) ++ subl $21, 1, $21 ++ ++ VST $f10, 0*VEC_LEN*SIZE(X) ++ VST $f11, 1*VEC_LEN*SIZE(X) ++ VST $f12, 2*VEC_LEN*SIZE(X) ++ VST $f13, 3*VEC_LEN*SIZE(X) ++ ++ VST $f20, 0*VEC_LEN*SIZE(Y) ++ VST $f21, 1*VEC_LEN*SIZE(Y) ++ VST $f22, 2*VEC_LEN*SIZE(Y) ++ VST $f23, 3*VEC_LEN*SIZE(Y) ++ ++ ldi $17, 16*SIZE(X) ++ ldi $19, 16*SIZE(Y) ++ bgt $21, $MainLoop ++ .align 4 ++ ++$MainRemain: ++ ble $22, $MainEnd ++ .align 4 ++ ++$MainRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ ldi $17, 1*SIZE($17) ++ ldi $19, 1*SIZE($19) ++ subl $22, 1, $22 ++ ST $f10, -1*SIZE($17) ++ ST $f20, -1*SIZE($19) ++ bgt $22, $MainRemainLoop ++ .align 4 ++ ++$MainEnd: ++ clr $0 ++ ret ++ .align 4 ++ ++$UnAlign_ACCESS: ++ sra $16, 3, $21 ++ and $16, 7, $22 ++ nop ++ ble $21, $UnAlign_ACCESS_MainRemain ++ .align 4 ++$UnAlign_ACCESS_MainLoop: ++ LD $f10, 0*SIZE(Y) ++ LD $f11, 1*SIZE(Y) ++ LD $f12, 2*SIZE(Y) ++ LD $f13, 3*SIZE(Y) ++ LD $f14, 4*SIZE(Y) ++ LD $f15, 5*SIZE(Y) ++ LD $f16, 6*SIZE(Y) ++ LD $f17, 7*SIZE(Y) ++ ++ LD $f20, 0*SIZE(X) ++ LD $f21, 1*SIZE(X) ++ LD $f22, 2*SIZE(X) ++ LD $f23, 3*SIZE(X) ++ LD $f24, 4*SIZE(X) ++ LD $f25, 5*SIZE(X) ++ LD $f26, 6*SIZE(X) ++ LD $f27, 7*SIZE(X) ++ ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ unop ++ fillcs PREFETCHSIZE * SIZE(Y) ++ subl $21, 1, $21 ++ ++ ST $f10, 0*SIZE(X) ++ ST $f11, 1*SIZE(X) ++ ST $f12, 2*SIZE(X) ++ ST $f13, 3*SIZE(X) ++ ST $f14, 4*SIZE(X) ++ ST $f15, 5*SIZE(X) ++ ST $f16, 6*SIZE(X) ++ ST $f17, 7*SIZE(X) ++ ++ ST $f20, 0*SIZE(Y) ++ ST $f21, 1*SIZE(Y) ++ ST $f22, 2*SIZE(Y) ++ ST $f23, 3*SIZE(Y) ++ ST $f24, 4*SIZE(Y) ++ ST $f25, 5*SIZE(Y) ++ ST $f26, 6*SIZE(Y) ++ ST $f27, 7*SIZE(Y) ++ ++ ldi X, 8*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ bgt $21, $UnAlign_ACCESS_MainLoop ++ .align 4 ++ ++$UnAlign_ACCESS_MainRemain: ++ ble $22, $UnAlign_ACCESS_MainEnd ++ .align 4 ++ ++$UnAlign_ACCESS_MainRemainLoop: ++ LD $f10, 0*SIZE(Y) ++ LD $f20, 0*SIZE(X) ++ ldi X, 1*SIZE(X) ++ ldi Y, 1*SIZE(Y) ++ subl $22, 1, $22 ++ ST $f10, -1*SIZE(X) ++ ST $f20, -1*SIZE(Y) ++ bgt $22, $UnAlign_ACCESS_MainRemainLoop ++ .align 4 ++ ++$UnAlign_ACCESS_MainEnd: ++ clr $0 ++ ret ++ .align 4 ++ ++$Sub: ++ sra $16, 3, $21 ++ and $16, 7, $22 ++ mov $17, $23 ++ mov $19, $24 ++ ++ ble $21, $SubRemain ++ .align 4 ++ ++$SubLoop: ++ LD $f10, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f11, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f12, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f13, 0*SIZE($19) ++ 
SXADDQ $20, $19, $19 ++ ++ LD $f14, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f15, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f16, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ LD $f17, 0*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f20, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f21, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f22, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f23, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f24, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f25, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f26, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ LD $f27, 0*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ ST $f10, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f11, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f12, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f13, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f14, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f15, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f16, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ST $f17, 0*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f20, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f21, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f22, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f23, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f24, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f25, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f26, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ST $f27, 0*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ subl $21, 1, $21 ++ bgt $21, $SubLoop ++ .align 4 ++ ++$SubRemain: ++ ble $22, $SubEnd ++ .align 4 ++ ++$SubRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ ++ subl $22, 1, $22 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f20, 0*SIZE($19) ++ ++ SXADDQ $18, $17, $17 ++ SXADDQ $20, $19, $19 ++ bgt $22, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S b/kernel/sw_64/trsm_kernel_4x4_LN.S +new file mode 100644 +index 0000000..109c471 +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_LN.S +@@ -0,0 +1,5144 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 56 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++#define tmp $9 ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ stl tmp, 64($sp) ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif ++ ++#ifdef RN ++ negq OFFSET, KK ++#endif ++ ++#ifdef RT ++ mulq N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mulq N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif ++ ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO 
++#endif ++ ++ fclr t3 ++ fclr t4 ++ ++ and M, 1, I ++ ble I, $L20 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 ++ ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 ++ ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ LD b5, 3 * SIZE(BO) ++ FIMOVD b5, tmp ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ IFMOVD tmp, b5 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ MUL a1, b3, b5 ++ fmov b5, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL a1, b4, b5 ++ fmov b5, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L37: ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ MUL a1, b3, b5 ++ fmov b5, t3 ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, b5 ++ fmov b5, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 
++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, b5 ++ fmov b5, c05 ++ MUL b2, c05, b5 ++ fmov b5, t1 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ MUL b3, c05, b5 ++ fmov b5, t1 ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a2, c09, b5 ++ fmov b5, t1 ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ MUL a3, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ MUL a2, c13, b5 ++ fmov b5, t1 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ MUL a3, c13, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL a4, c13, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, b5 ++ fmov b5, c09 ++ MUL b2, c09, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL b3, c09, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 ++ ++ ble L, $L25 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD 
a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 ++ ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ IFMOVD tmp, b5 ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ IFMOVD tmp, b5 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ unop ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, b5 ++ fmov b5, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b4, b5 ++ fmov b5, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b4, b5 ++ fmov b5, t3 ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, b5 ++ fmov b5, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif 
++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++ ++ SUB b1, c02, b5 ++ fmov b5, c02 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c10, b5 ++ fmov b5, c10 ++ SUB b4, c14, b5 ++ fmov b5, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c05, b5 ++ fmov b5, c05 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++ ++ SUB b1, c09, b5 ++ fmov b5, c09 ++ SUB b2, c10, b5 ++ fmov b5, c10 ++ SUB b3, c13, b5 ++ fmov b5, c13 ++ SUB b4, c14, b5 ++ fmov b5, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c10, b5 ++ fmov b5, t3 ++ MUL a2, c14, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c09, b5 ++ fmov b5, c09 ++ MUL a3, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ MUL a2, c09, b5 ++ fmov b5, t3 ++ MUL a2, c13, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++ MUL a3, c10, b5 ++ fmov b5, c10 ++ MUL a3, c14, b5 ++ fmov b5, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c02, b5 ++ fmov b5, t2 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c02, b5 ++ fmov b5, t2 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, b5 ++ fmov b5, c05 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ ++ MUL b2, c05, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL b3, c05, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ ++ 
LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ MUL a2, c09, b5 ++ fmov b5, t1 ++ MUL a2, c10, b5 ++ fmov b5, t2 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ ++ MUL a3, c13, b5 ++ fmov b5, c13 ++ MUL a3, c14, b5 ++ fmov b5, c14 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ ++ MUL a2, c13, b5 ++ fmov b5, t1 ++ MUL a2, c14, b5 ++ fmov b5, t2 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a3, c13, b5 ++ fmov b5, t1 ++ MUL a3, c14, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a4, c13, b5 ++ fmov b5, t1 ++ MUL a4, c14, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, b5 ++ fmov b5, c09 ++ MUL b1, c10, b5 ++ fmov b5, c10 ++ ++ MUL b2, c09, b5 ++ fmov b5, t1 ++ MUL b2, c10, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL b3, c09, b5 ++ fmov b5, t1 ++ MUL b3, c10, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ sra M, 2, I ++ ble I, $L39 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) 
++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else ++ ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif ++ ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++/* 2 */ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ ldi L, -2(L) ++ IFMOVD tmp, b5 ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD 
c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ MUL b1, a1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, b5 ++ fmov b5, t2 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, b5 ++ fmov b5, t3 ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL b1, a4, b5 ++ fmov b5, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, b5 ++ fmov b5, t3 ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, b5 ++ fmov b5, t4 ++ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, b5 ++ fmov b5, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, b5 ++ fmov b5, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, b5 ++ fmov b5, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, b5 ++ fmov b5, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, b5 ++ fmov b5, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, b5 ++ fmov b5, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, b5 ++ fmov b5, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, b5 ++ fmov b5, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, b5 ++ fmov b5, t2 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, b5 ++ fmov b5, t3 ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL b1, a4, b5 ++ fmov b5, t2 ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, b5 ++ fmov b5, t3 ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, b5 ++ fmov b5, t4 ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ MUL b3, a1, b5 ++ fmov b5, t1 ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ MUL b3, a2, b5 ++ fmov b5, t2 ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ MUL b4, a2, b5 ++ fmov b5, t3 ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL b2, a3, b5 ++ fmov b5, t4 ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ MUL b3, a3, b5 ++ fmov b5, t1 ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ MUL b3, a4, b5 ++ fmov b5, t2 ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ MUL b4, a4, b5 ++ fmov b5, t3 ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, b5 ++ fmov b5, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if 
defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++ ++ SUB b1, c02, b5 ++ fmov b5, c02 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c10, b5 ++ fmov b5, c10 ++ SUB b4, c14, b5 ++ fmov b5, c14 ++ ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, b5 ++ fmov b5, c03 ++ SUB a2, c07, b5 ++ fmov b5, c07 ++ SUB a3, c11, b5 ++ fmov b5, c11 ++ SUB a4, c15, b5 ++ fmov b5, c15 ++ ++ SUB b1, c04, b5 ++ fmov b5, c04 ++ SUB b2, c08, b5 ++ fmov b5, c08 ++ SUB b3, c12, b5 ++ fmov b5, c12 ++ SUB b4, c16, b5 ++ fmov b5, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++ ++ SUB b1, c05, b5 ++ fmov b5, c05 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c07, b5 ++ fmov b5, c07 ++ SUB b4, c08, b5 ++ fmov b5, c08 ++ ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) ++ ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) ++ ++ SUB a1, c09, b5 ++ fmov b5, c09 ++ SUB a2, c10, b5 ++ fmov b5, c10 ++ SUB a3, c11, b5 ++ fmov b5, c11 ++ SUB a4, c12, b5 ++ fmov b5, c12 ++ ++ SUB b1, c13, b5 ++ fmov b5, c13 ++ SUB b2, c14, b5 ++ fmov b5, c14 ++ SUB b3, c15, b5 ++ fmov b5, c15 ++ SUB b4, c16, b5 ++ fmov b5, c16 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ MUL a1, c16, b5 ++ fmov b5, c16 ++ ++ MUL a2, c04, b5 ++ fmov b5, t1 ++ MUL a2, c08, b5 ++ fmov b5, t2 ++ MUL a2, c12, b5 ++ fmov b5, t3 ++ MUL a2, c16, b5 ++ fmov b5, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c15, t4, b5 ++ fmov b5, c15 ++ ++ MUL a3, c04, b5 ++ fmov b5, t1 ++ MUL a3, c08, b5 ++ fmov b5, t2 ++ MUL a3, c12, b5 ++ fmov b5, t3 ++ MUL a3, c16, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL a4, c04, b5 ++ fmov b5, t1 ++ MUL a4, c08, b5 ++ fmov b5, t2 ++ MUL a4, c12, b5 ++ fmov b5, t3 ++ MUL a4, c16, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, b5 ++ fmov b5, c03 ++ MUL b1, c07, b5 ++ fmov b5, c07 ++ MUL b1, c11, b5 ++ fmov b5, c11 ++ MUL b1, c15, b5 ++ fmov b5, c15 ++ ++ MUL b2, c03, b5 ++ fmov b5, t1 ++ MUL b2, c07, b5 ++ fmov b5, t2 ++ MUL b2, c11, b5 ++ fmov b5, t3 ++ MUL b2, c15, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ 
++ MUL b3, c03, b5 ++ fmov b5, t1 ++ MUL b3, c07, b5 ++ fmov b5, t2 ++ MUL b3, c11, b5 ++ fmov b5, t3 ++ MUL b3, c15, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c10, b5 ++ fmov b5, t3 ++ MUL a2, c14, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c09, b5 ++ fmov b5, c09 ++ MUL a3, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ MUL a2, c09, b5 ++ fmov b5, t3 ++ MUL a2, c13, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c05, b5 ++ fmov b5, t2 ++ MUL a3, c09, b5 ++ fmov b5, t3 ++ MUL a3, c13, b5 ++ fmov b5, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c15, t4, b5 ++ fmov b5, c15 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c05, b5 ++ fmov b5, t2 ++ MUL a4, c09, b5 ++ fmov b5, t3 ++ MUL a4, c13, b5 ++ fmov b5, t4 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ SUB c12, t3, b5 ++ fmov b5, c12 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, b5 ++ fmov b5, c02 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ MUL b1, c10, b5 ++ fmov b5, c10 ++ MUL b1, c14, b5 ++ fmov b5, c14 ++ ++ MUL b2, c02, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ MUL b2, c10, b5 ++ fmov b5, t3 ++ MUL b2, c14, b5 ++ fmov b5, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c15, t4, b5 ++ fmov b5, c15 ++ ++ MUL b3, c02, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ MUL b3, c10, b5 ++ fmov b5, t3 ++ MUL b3, c14, b5 ++ fmov b5, t4 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ SUB c12, t3, b5 ++ fmov b5, c12 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c15, b5 ++ fmov b5, c15 ++ ++ MUL a2, c03, b5 ++ fmov b5, t1 ++ MUL a2, c07, b5 ++ fmov b5, t2 ++ MUL a2, c11, b5 ++ fmov b5, t3 ++ MUL a2, c15, b5 ++ fmov b5, t4 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ SUB c12, t3, b5 ++ fmov b5, c12 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ MUL a3, c04, b5 ++ fmov b5, c04 ++ MUL a3, c08, b5 ++ fmov b5, c08 ++ MUL a3, c12, b5 ++ fmov b5, c12 ++ MUL a3, c16, b5 ++ fmov b5, c16 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * 
SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ MUL a2, c03, b5 ++ fmov b5, t3 ++ MUL a2, c04, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c02, b5 ++ fmov b5, t2 ++ MUL a3, c03, b5 ++ fmov b5, t3 ++ MUL a3, c04, b5 ++ fmov b5, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c02, b5 ++ fmov b5, t2 ++ MUL a4, c03, b5 ++ fmov b5, t3 ++ MUL a4, c04, b5 ++ fmov b5, t4 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ SUB c15, t3, b5 ++ fmov b5, c15 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, b5 ++ fmov b5, c05 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ MUL b1, c07, b5 ++ fmov b5, c07 ++ MUL b1, c08, b5 ++ fmov b5, c08 ++ ++ MUL b2, c05, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ MUL b2, c07, b5 ++ fmov b5, t3 ++ MUL b2, c08, b5 ++ fmov b5, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL b3, c05, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ MUL b3, c07, b5 ++ fmov b5, t3 ++ MUL b3, c08, b5 ++ fmov b5, t4 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ SUB c15, t3, b5 ++ fmov b5, c15 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ MUL a2, c09, b5 ++ fmov b5, t1 ++ MUL a2, c10, b5 ++ fmov b5, t2 ++ MUL a2, c11, b5 ++ fmov b5, t3 ++ MUL a2, c12, b5 ++ fmov b5, t4 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ SUB c15, t3, b5 ++ fmov b5, c15 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ MUL a3, c13, b5 ++ fmov b5, c13 ++ MUL a3, c14, b5 ++ fmov b5, c14 ++ MUL a3, c15, b5 ++ fmov b5, c15 ++ MUL a3, c16, b5 ++ fmov b5, c16 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ MUL a1, c15, b5 ++ fmov b5, c15 ++ MUL a1, c16, b5 ++ fmov b5, c16 ++ ++ MUL a2, c13, b5 ++ fmov b5, t1 ++ MUL a2, c14, b5 ++ fmov b5, t2 ++ MUL a2, c15, b5 ++ fmov b5, t3 ++ MUL a2, c16, b5 ++ fmov b5, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a3, c13, b5 ++ fmov b5, t1 ++ MUL a3, c14, b5 ++ fmov b5, t2 ++ MUL a3, c15, b5 ++ fmov b5, t3 ++ MUL a3, c16, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL a4, c13, b5 ++ fmov b5, t1 ++ MUL a4, c14, b5 ++ fmov b5, t2 ++ MUL a4, c15, b5 ++ fmov b5, t3 ++ MUL a4, c16, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ LD b1, 10 * 
SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, b5 ++ fmov b5, c09 ++ MUL b1, c10, b5 ++ fmov b5, c10 ++ MUL b1, c11, b5 ++ fmov b5, c11 ++ MUL b1, c12, b5 ++ fmov b5, c12 ++ ++ MUL b2, c09, b5 ++ fmov b5, t1 ++ MUL b2, c10, b5 ++ fmov b5, t2 ++ MUL b2, c11, b5 ++ fmov b5, t3 ++ MUL b2, c12, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL b3, c09, b5 ++ fmov b5, t1 ++ MUL b3, c10, b5 ++ fmov b5, t2 ++ MUL b3, c11, b5 ++ fmov b5, t3 ++ MUL b3, c12, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c07, b5 ++ fmov b5, t3 ++ MUL a2, c08, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c03, b5 ++ fmov b5, c03 ++ MUL a3, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) ++ ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++ ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) ++ ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) ++ ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 4, KK ++#endif ++ ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt 
J, $L01 ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ fclr t3 ++ fclr t4 ++ ++ and M, 1, I ++ ble I, $L60 ++ ++#if defined(LT) || defined(RN) ++ ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L78 ++ ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L78 ++ ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, b5 ++ fmov b5, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, b5 ++ fmov b5, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, b5 ++ fmov b5, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif ++ .align 4 ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L77: ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ ADD c02, t3, b5 ++ fmov b5, c02 ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ ++ ADD c01, c02, b5 ++ fmov b5, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ++ .align 4 ++ ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL 
a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L68 ++ ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L68 ++ ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ldi L, -2(L) ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, b5 ++ fmov b5, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a4, b4, b5 ++ fmov b5, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 ++#else ++ blbs TMP1, $L67 ++#endif ++ .align 4 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, b5 
++ fmov b5, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L67: ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c02, b5 ++ fmov b5, c02 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c05, b5 ++ fmov b5, c05 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ 
sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ sra M, 2, I ++ ble I, $L79 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble KK, $L58 ++ ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble TMP1, $L58 ++ ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ unop ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ ldi L, -2(L) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ unop ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ unop ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ unop ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ unop ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b3, b5 ++ fmov b5, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a5, b3, b5 ++ fmov b5, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b4, b5 ++ fmov b5, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b4, b5 ++ fmov b5, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, b5 ++ fmov b5, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ 
ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L57: ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c02, b5 ++ fmov b5, c02 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++ ++ SUB b1, c03, b5 ++ fmov b5, c03 ++ SUB b2, c07, b5 ++ fmov b5, c07 ++ SUB b3, c04, b5 ++ fmov b5, c04 ++ SUB b4, c08, b5 ++ fmov b5, c08 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++ ++ SUB b1, c05, b5 ++ fmov b5, c05 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c07, b5 ++ fmov b5, c07 ++ SUB b4, c08, b5 ++ fmov b5, c08 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ ++ MUL a2, c04, b5 ++ fmov b5, t1 ++ MUL a2, c08, b5 ++ fmov b5, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ ++ MUL a3, c04, b5 ++ fmov b5, t1 ++ MUL a3, c08, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a4, c04, b5 ++ fmov b5, t1 ++ MUL a4, c08, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, b5 ++ fmov b5, c03 ++ MUL b1, c07, b5 ++ fmov b5, c07 ++ ++ MUL b2, c03, b5 ++ fmov b5, t1 ++ MUL b2, c07, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL b3, c03, b5 ++ fmov b5, t1 ++ MUL b3, c07, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c02, b5 ++ 
fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c05, b5 ++ fmov b5, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c05, b5 ++ fmov b5, t2 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, b5 ++ fmov b5, c02 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ ++ MUL b2, c02, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ ++ MUL b3, c02, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ ++ MUL a2, c03, b5 ++ fmov b5, t1 ++ MUL a2, c07, b5 ++ fmov b5, t2 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ ++ MUL a3, c04, b5 ++ fmov b5, c04 ++ MUL a3, c08, b5 ++ fmov b5, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ MUL a2, c03, b5 ++ fmov b5, t3 ++ MUL a2, c04, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++ MUL a3, c07, b5 ++ fmov b5, c07 ++ MUL a3, c08, b5 ++ fmov b5, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c07, b5 ++ fmov b5, t3 ++ MUL a2, c08, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c03, b5 ++ fmov b5, c03 ++ MUL a3, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * 
SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and M, 1, I ++ ble I, $L100 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b3, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b4, b5 ++ fmov b5, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++ ADD c01, c02, b5 ++ fmov b5, c01 ++ ADD c03, c04, b5 ++ fmov b5, c03 ++ ADD c01, c03, b5 ++ fmov b5, c01 ++ ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 
0 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++#else ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ ble I, $L110 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, b5 ++ fmov b5, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++ ADD c01, c03, b5 ++ fmov b5, c01 ++ ADD c02, c04, b5 ++ fmov b5, c02 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll 
TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ sra M, 2, I ++ ble I, $L119 ++ .align 4 ++ ++$L91: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi L, -1(L) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a4, 11 * SIZE(AO) ++ 
LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b3, b5 ++ fmov b5, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b3, b5 ++ fmov b5, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b4, b5 ++ fmov b5, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, b5 ++ fmov b5, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, b5 ++ fmov b5, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a2, c04, b5 ++ fmov b5, t1 ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ MUL a3, c04, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL a4, c04, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, b5 ++ fmov b5, c03 ++ MUL b2, c03, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL b3, c03, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ SUB c03, t1, 
b5 ++ fmov b5, c03 ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, b5 ++ fmov b5, c02 ++ MUL b2, c02, b5 ++ fmov b5, t1 ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ MUL b3, c02, b5 ++ fmov b5, t1 ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a2, c03, b5 ++ fmov b5, t1 ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ MUL a3, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldl tmp, 64($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S.bak b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak +new file mode 100644 +index 0000000..8405570 +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak +@@ -0,0 +1,4073 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW2B ++#define PREFETCHSIZE 56 ++#define UNOP nop ++#endif ++ ++#ifdef EV6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 56 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif ++ ++#ifdef RN ++ negq OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif ++ ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ 
mov A, AO ++#endif ++ ++ fclr t3 ++ fclr t4 ++ ++ and M, 1, I ++ ble I, $L20 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 ++ ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 ++ ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L37: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ ++ ADD c13, t4, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ADD c09, t3, c09 ++ ADD c13, t4, c13 ++ ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c01, t1 ++ SUB c09, t1, c09 ++ MUL a4, c01, t1 ++ SUB c13, t1, c13 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b2, 
c05, t1 ++ SUB c09, t1, c09 ++ MUL b3, c05, t1 ++ SUB c13, t1, c13 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a2, c09, t1 ++ SUB c13, t1, c13 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a2, c13, t1 ++ SUB c09, t1, c09 ++ MUL a3, c13, t1 ++ SUB c05, t1, c05 ++ MUL a4, c13, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b2, c09, t1 ++ SUB c05, t1, c05 ++ MUL b3, c09, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 ++ ++ ble L, $L25 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 ++ ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, c14 ++ MUL a4, 
b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ ADD c10, t2, c10 ++ ADD c13, t3, c13 ++ ADD c14, t4, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c13, c13 ++ SUB b4, c14, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++ MUL a3, c10, c10 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ 
++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ sra M, 2, I ++ ble I, $L39 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillcs 4 * 
SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else ++ ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif ++ ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, c11 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD 
c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c11, t1, c11 ++ ADD c12, t2, c12 ++ ADD c16, t3, c16 ++ ADD c15, t4, c15 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, c03 ++ SUB a2, c07, c07 ++ SUB a3, c11, c11 ++ SUB a4, c15, c15 ++ ++ SUB b1, c04, c04 ++ SUB b2, c08, c08 ++ SUB b3, c12, c12 ++ SUB b4, c16, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++ ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) ++ ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) ++ ++ SUB a1, c09, c09 ++ SUB a2, c10, c10 ++ SUB a3, c11, c11 ++ SUB a4, c12, c12 ++ ++ SUB b1, c13, c13 ++ SUB b2, c14, c14 ++ SUB b3, c15, c15 ++ SUB b4, c16, c16 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ MUL a1, c12, c12 ++ MUL a1, 
c16, c16 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ MUL a2, c12, t3 ++ MUL a2, c16, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ MUL a3, c12, t3 ++ MUL a3, c16, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ MUL a4, c12, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ MUL b1, c11, c11 ++ MUL b1, c15, c15 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ MUL b2, c11, t3 ++ MUL b2, c15, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ MUL b3, c11, t3 ++ MUL b3, c15, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ MUL a3, c09, t3 ++ MUL a3, c13, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ MUL a4, c09, t3 ++ MUL a4, c13, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ MUL b1, c10, c10 ++ MUL b1, c14, c14 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ MUL b2, c10, t3 ++ MUL b2, c14, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ MUL b3, c10, t3 ++ MUL b3, c14, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ MUL a1, c11, c11 ++ MUL a1, c15, c15 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ MUL a2, c11, t3 ++ MUL a2, c15, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++ MUL a3, c12, c12 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ MUL a4, c03, t3 ++ MUL a4, c04, t4 ++ ++ SUB 
c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ MUL b1, c07, c07 ++ MUL b1, c08, c08 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ MUL b2, c07, t3 ++ MUL b2, c08, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ MUL b3, c07, t3 ++ MUL b3, c08, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ MUL a2, c11, t3 ++ MUL a2, c12, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++ MUL a3, c15, c15 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ MUL a1, c15, c15 ++ MUL a1, c16, c16 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ MUL a2, c15, t3 ++ MUL a2, c16, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ MUL a3, c15, t3 ++ MUL a3, c16, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ MUL a4, c15, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ MUL b1, c11, c11 ++ MUL b1, c12, c12 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ MUL b2, c11, t3 ++ MUL b2, c12, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ MUL b3, c11, t3 ++ MUL b3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) ++ ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++ ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) ++ ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * 
SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) ++ ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 4, KK ++#endif ++ ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ fclr t3 ++ fclr t4 ++ ++ and M, 1, I ++ ble I, $L60 ++ ++#if defined(LT) || defined(RN) ++ ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L78 ++ ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L78 ++ ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L77: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 ++ ++ ADD c01, c02, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ++ .align 4 ++ ++$L78: ++#if 
defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L68 ++ ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L68 ++ ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 ++#else ++ blbs TMP1, $L67 ++#endif ++ .align 4 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD 
c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L67: ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c05, t3, c05 ++ ADD c06, t4, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ sra M, 2, I ++ ble I, $L79 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi L, 
-2(KK) ++ ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble KK, $L58 ++ ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble TMP1, $L58 ++ ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, c05 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L57: ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, c05 ++ ADD c06, t2, c06 ++ ADD c07, t3, c07 ++ ADD c08, t4, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, 
c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++ ++ SUB b1, c03, c03 ++ SUB b2, c07, c07 ++ SUB b3, c04, c04 ++ SUB b4, c08, c08 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++ MUL a3, c07, c07 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) 
++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and M, 1, I ++ ble I, $L100 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 ++ ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif ++ 
++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++#else ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ ble I, $L110 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ 
LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ sra M, 2, I ++ ble I, $L119 ++ .align 4 ++ ++$L91: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) 
++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a2, c04, t1 ++ SUB c03, t1, c03 ++ MUL a3, c04, t1 ++ SUB c02, t1, c02 ++ MUL a4, c04, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b2, c03, t1 ++ SUB c02, t1, c02 ++ MUL b3, c03, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c01, t1 ++ SUB c03, t1, c03 ++ MUL a4, c01, t1 ++ SUB c04, t1, c04 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b2, c02, t1 ++ SUB c03, t1, c03 ++ MUL b3, c02, t1 ++ SUB c04, t1, c04 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a2, c03, t1 ++ SUB c04, t1, c04 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 
24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S b/kernel/sw_64/trsm_kernel_4x4_LT.S +new file mode 100644 +index 0000000..54f8a51 +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_LT.S +@@ -0,0 +1,5145 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++#error "Architecture is not specified." 
++#endif ++ ++#ifdef SW6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 56 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++#define STACKSIZE 88 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define tmp $9 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ stl $9, 64($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ mulq M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mulq N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mulq N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif ++ ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ flds $f31, 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else ++ ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 
* SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif ++ ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ FIMOVD b5, tmp ++/* 2 */ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ ldi L, -2(L) ++ IFMOVD tmp, b5 ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ MUL b1, a1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ 
MUL b1, a2, b5 ++ fmov b5, t2 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, b5 ++ fmov b5, t3 ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL b1, a4, b5 ++ fmov b5, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, b5 ++ fmov b5, t3 ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, b5 ++ fmov b5, t4 ++ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, b5 ++ fmov b5, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, b5 ++ fmov b5, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, b5 ++ fmov b5, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, b5 ++ fmov b5, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, b5 ++ fmov b5, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, b5 ++ fmov b5, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, b5 ++ fmov b5, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, b5 ++ fmov b5, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, b5 ++ fmov b5, t2 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, b5 ++ fmov b5, t3 ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL b1, a4, b5 ++ fmov b5, t2 ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, b5 ++ fmov b5, t3 ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, b5 ++ fmov b5, t4 ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ MUL b3, a1, b5 ++ fmov b5, t1 ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ MUL b3, a2, b5 ++ fmov b5, t2 ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ MUL b4, a2, b5 ++ fmov b5, t3 ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL b2, a3, b5 ++ fmov b5, t4 ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ MUL b3, a3, b5 ++ fmov b5, t1 ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ MUL b3, a4, b5 ++ fmov b5, t2 ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ MUL b4, a4, b5 ++ fmov b5, t3 ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, b5 ++ fmov b5, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++ ++ SUB b1, c02, b5 ++ fmov b5, c02 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c10, b5 ++ fmov b5, c10 ++ SUB b4, c14, b5 ++ fmov b5, c14 ++ ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 
* SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, b5 ++ fmov b5, c03 ++ SUB a2, c07, b5 ++ fmov b5, c07 ++ SUB a3, c11, b5 ++ fmov b5, c11 ++ SUB a4, c15, b5 ++ fmov b5, c15 ++ ++ SUB b1, c04, b5 ++ fmov b5, c04 ++ SUB b2, c08, b5 ++ fmov b5, c08 ++ SUB b3, c12, b5 ++ fmov b5, c12 ++ SUB b4, c16, b5 ++ fmov b5, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++ ++ SUB b1, c05, b5 ++ fmov b5, c05 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c07, b5 ++ fmov b5, c07 ++ SUB b4, c08, b5 ++ fmov b5, c08 ++ ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) ++ ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) ++ ++ SUB a1, c09, b5 ++ fmov b5, c09 ++ SUB a2, c10, b5 ++ fmov b5, c10 ++ SUB a3, c11, b5 ++ fmov b5, c11 ++ SUB a4, c12, b5 ++ fmov b5, c12 ++ ++ SUB b1, c13, b5 ++ fmov b5, c13 ++ SUB b2, c14, b5 ++ fmov b5, c14 ++ SUB b3, c15, b5 ++ fmov b5, c15 ++ SUB b4, c16, b5 ++ fmov b5, c16 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ MUL a1, c16, b5 ++ fmov b5, c16 ++ ++ MUL a2, c04, b5 ++ fmov b5, t1 ++ MUL a2, c08, b5 ++ fmov b5, t2 ++ MUL a2, c12, b5 ++ fmov b5, t3 ++ MUL a2, c16, b5 ++ fmov b5, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c15, t4, b5 ++ fmov b5, c15 ++ ++ MUL a3, c04, b5 ++ fmov b5, t1 ++ MUL a3, c08, b5 ++ fmov b5, t2 ++ MUL a3, c12, b5 ++ fmov b5, t3 ++ MUL a3, c16, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL a4, c04, b5 ++ fmov b5, t1 ++ MUL a4, c08, b5 ++ fmov b5, t2 ++ MUL a4, c12, b5 ++ fmov b5, t3 ++ MUL a4, c16, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, b5 ++ fmov b5, c03 ++ MUL b1, c07, b5 ++ fmov b5, c07 ++ MUL b1, c11, b5 ++ fmov b5, c11 ++ MUL b1, c15, b5 ++ fmov b5, c15 ++ ++ MUL b2, c03, b5 ++ fmov b5, t1 ++ MUL b2, c07, b5 ++ fmov b5, t2 ++ MUL b2, c11, b5 ++ fmov b5, t3 ++ MUL b2, c15, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL b3, c03, b5 ++ fmov b5, t1 ++ MUL b3, c07, b5 ++ fmov b5, t2 ++ MUL b3, c11, b5 ++ fmov b5, t3 ++ MUL b3, c15, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c10, b5 
++ fmov b5, t3 ++ MUL a2, c14, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c09, b5 ++ fmov b5, c09 ++ MUL a3, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ MUL a2, c09, b5 ++ fmov b5, t3 ++ MUL a2, c13, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c05, b5 ++ fmov b5, t2 ++ MUL a3, c09, b5 ++ fmov b5, t3 ++ MUL a3, c13, b5 ++ fmov b5, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c15, t4, b5 ++ fmov b5, c15 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c05, b5 ++ fmov b5, t2 ++ MUL a4, c09, b5 ++ fmov b5, t3 ++ MUL a4, c13, b5 ++ fmov b5, t4 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ SUB c12, t3, b5 ++ fmov b5, c12 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, b5 ++ fmov b5, c02 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ MUL b1, c10, b5 ++ fmov b5, c10 ++ MUL b1, c14, b5 ++ fmov b5, c14 ++ ++ MUL b2, c02, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ MUL b2, c10, b5 ++ fmov b5, t3 ++ MUL b2, c14, b5 ++ fmov b5, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c15, t4, b5 ++ fmov b5, c15 ++ ++ MUL b3, c02, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ MUL b3, c10, b5 ++ fmov b5, t3 ++ MUL b3, c14, b5 ++ fmov b5, t4 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ SUB c12, t3, b5 ++ fmov b5, c12 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c15, b5 ++ fmov b5, c15 ++ ++ MUL a2, c03, b5 ++ fmov b5, t1 ++ MUL a2, c07, b5 ++ fmov b5, t2 ++ MUL a2, c11, b5 ++ fmov b5, t3 ++ MUL a2, c15, b5 ++ fmov b5, t4 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ SUB c12, t3, b5 ++ fmov b5, c12 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ MUL a3, c04, b5 ++ fmov b5, c04 ++ MUL a3, c08, b5 ++ fmov b5, c08 ++ MUL a3, c12, b5 ++ fmov b5, c12 ++ MUL a3, c16, b5 ++ fmov b5, c16 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ MUL a2, c03, b5 ++ fmov b5, t3 ++ MUL a2, c04, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c02, b5 ++ fmov b5, t2 ++ MUL a3, c03, b5 ++ fmov b5, t3 ++ MUL a3, c04, b5 ++ fmov b5, t4 ++ ++ SUB 
c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c02, b5 ++ fmov b5, t2 ++ MUL a4, c03, b5 ++ fmov b5, t3 ++ MUL a4, c04, b5 ++ fmov b5, t4 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ SUB c15, t3, b5 ++ fmov b5, c15 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, b5 ++ fmov b5, c05 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ MUL b1, c07, b5 ++ fmov b5, c07 ++ MUL b1, c08, b5 ++ fmov b5, c08 ++ ++ MUL b2, c05, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ MUL b2, c07, b5 ++ fmov b5, t3 ++ MUL b2, c08, b5 ++ fmov b5, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL b3, c05, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ MUL b3, c07, b5 ++ fmov b5, t3 ++ MUL b3, c08, b5 ++ fmov b5, t4 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ SUB c15, t3, b5 ++ fmov b5, c15 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ MUL a2, c09, b5 ++ fmov b5, t1 ++ MUL a2, c10, b5 ++ fmov b5, t2 ++ MUL a2, c11, b5 ++ fmov b5, t3 ++ MUL a2, c12, b5 ++ fmov b5, t4 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ SUB c15, t3, b5 ++ fmov b5, c15 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ MUL a3, c13, b5 ++ fmov b5, c13 ++ MUL a3, c14, b5 ++ fmov b5, c14 ++ MUL a3, c15, b5 ++ fmov b5, c15 ++ MUL a3, c16, b5 ++ fmov b5, c16 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ MUL a1, c15, b5 ++ fmov b5, c15 ++ MUL a1, c16, b5 ++ fmov b5, c16 ++ ++ MUL a2, c13, b5 ++ fmov b5, t1 ++ MUL a2, c14, b5 ++ fmov b5, t2 ++ MUL a2, c15, b5 ++ fmov b5, t3 ++ MUL a2, c16, b5 ++ fmov b5, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a3, c13, b5 ++ fmov b5, t1 ++ MUL a3, c14, b5 ++ fmov b5, t2 ++ MUL a3, c15, b5 ++ fmov b5, t3 ++ MUL a3, c16, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL a4, c13, b5 ++ fmov b5, t1 ++ MUL a4, c14, b5 ++ fmov b5, t2 ++ MUL a4, c15, b5 ++ fmov b5, t3 ++ MUL a4, c16, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, b5 ++ fmov b5, c09 ++ MUL b1, c10, b5 ++ fmov b5, c10 ++ MUL b1, c11, b5 ++ fmov b5, c11 ++ MUL b1, c12, b5 ++ fmov b5, c12 ++ ++ MUL b2, c09, b5 ++ fmov b5, t1 ++ MUL b2, c10, b5 ++ fmov b5, t2 ++ MUL b2, c11, b5 ++ fmov b5, t3 ++ MUL b2, c12, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL b3, c09, b5 ++ fmov b5, t1 ++ MUL b3, c10, b5 ++ fmov b5, t2 ++ MUL b3, c11, b5 ++ fmov b5, t3 ++ 
MUL b3, c12, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c07, b5 ++ fmov b5, t3 ++ MUL a2, c08, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c03, b5 ++ fmov b5, c03 ++ MUL a3, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) ++ ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++ ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) ++ ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) ++ ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 ++ ++ ble L, $L25 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 
++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 ++ ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ IFMOVD tmp, b5 ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ IFMOVD tmp, b5 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ unop ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, b5 ++ fmov b5, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b4, b5 ++ fmov b5, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b4, b5 ++ fmov b5, t3 ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, b5 ++ fmov b5, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, 
TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++ ++ SUB b1, c02, b5 ++ fmov b5, c02 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c10, b5 ++ fmov b5, c10 ++ SUB b4, c14, b5 ++ fmov b5, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c05, b5 ++ fmov b5, c05 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++ ++ SUB b1, c09, b5 ++ fmov b5, c09 ++ SUB b2, c10, b5 ++ fmov b5, c10 ++ SUB b3, c13, b5 ++ fmov b5, c13 ++ SUB b4, c14, b5 ++ fmov b5, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c10, b5 ++ fmov b5, t3 ++ MUL a2, c14, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c09, b5 ++ fmov b5, c09 ++ MUL a3, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ MUL a2, c09, b5 ++ fmov b5, t3 ++ MUL a2, c13, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++ MUL a3, c10, b5 ++ fmov b5, c10 ++ MUL a3, c14, b5 ++ fmov b5, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c02, b5 ++ fmov b5, t2 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c02, b5 ++ fmov b5, t2 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, b5 ++ fmov b5, c05 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ ++ MUL b2, c05, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL b3, c05, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ ++ LD a1, 10 * SIZE(BO) ++ LD 
a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ MUL a2, c09, b5 ++ fmov b5, t1 ++ MUL a2, c10, b5 ++ fmov b5, t2 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ ++ MUL a3, c13, b5 ++ fmov b5, c13 ++ MUL a3, c14, b5 ++ fmov b5, c14 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ ++ MUL a2, c13, b5 ++ fmov b5, t1 ++ MUL a2, c14, b5 ++ fmov b5, t2 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a3, c13, b5 ++ fmov b5, t1 ++ MUL a3, c14, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a4, c13, b5 ++ fmov b5, t1 ++ MUL a4, c14, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, b5 ++ fmov b5, c09 ++ MUL b1, c10, b5 ++ fmov b5, c10 ++ ++ MUL b2, c09, b5 ++ fmov b5, t1 ++ MUL b2, c10, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL b3, c09, b5 ++ fmov b5, t1 ++ MUL b3, c10, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ and M, 1, I ++ ble I, $L39 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 ++ ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, 
TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 ++ ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, b5 ++ fmov b5,c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ LD b5, 3 * SIZE(BO) ++ FIMOVD b5, tmp ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ IFMOVD tmp, b5 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ MUL a1, b3, b5 ++ fmov b5, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL a1, b4, b5 ++ fmov b5, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L37: ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ MUL a1, b3, b5 ++ fmov b5, t3 ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, b5 ++ fmov b5, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ SUB c09, 
t1, b5 ++ fmov b5, c09 ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, b5 ++ fmov b5, c05 ++ MUL b2, c05, b5 ++ fmov b5, t1 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ MUL b3, c05, b5 ++ fmov b5, t1 ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a2, c09, b5 ++ fmov b5, t1 ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ MUL a3, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ MUL a2, c13, b5 ++ fmov b5, t1 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ MUL a3, c13, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL a4, c13, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, b5 ++ fmov b5, c09 ++ MUL b2, c09, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL b3, c09, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 4, KK ++#endif ++ ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble KK, $L58 ++ ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, 
AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble TMP1, $L58 ++ ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ unop ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ ldi L, -2(L) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ unop ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ unop ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ unop ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ unop ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b3, b5 ++ fmov b5, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a5, b3, b5 ++ fmov b5, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b4, b5 ++ fmov b5, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b4, b5 ++ fmov b5, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, b5 ++ fmov b5, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L57: ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ ++ ADD c04, t4, 
b5 ++ fmov b5, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c02, b5 ++ fmov b5, c02 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++ ++ SUB b1, c03, b5 ++ fmov b5, c03 ++ SUB b2, c07, b5 ++ fmov b5, c07 ++ SUB b3, c04, b5 ++ fmov b5, c04 ++ SUB b4, c08, b5 ++ fmov b5, c08 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++ ++ SUB b1, c05, b5 ++ fmov b5, c05 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c07, b5 ++ fmov b5, c07 ++ SUB b4, c08, b5 ++ fmov b5, c08 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ ++ MUL a2, c04, b5 ++ fmov b5, t1 ++ MUL a2, c08, b5 ++ fmov b5, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ ++ MUL a3, c04, b5 ++ fmov b5, t1 ++ MUL a3, c08, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a4, c04, b5 ++ fmov b5, t1 ++ MUL a4, c08, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, b5 ++ fmov b5, c03 ++ MUL b1, c07, b5 ++ fmov b5, c07 ++ ++ MUL b2, c03, b5 ++ fmov b5, t1 ++ MUL b2, c07, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL b3, c03, b5 ++ fmov b5, t1 ++ MUL b3, c07, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c05, b5 ++ fmov b5, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c05, b5 ++ fmov b5, t2 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB 
c08, t2, b5 ++ fmov b5, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, b5 ++ fmov b5, c02 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ ++ MUL b2, c02, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ ++ MUL b3, c02, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ ++ MUL a2, c03, b5 ++ fmov b5, t1 ++ MUL a2, c07, b5 ++ fmov b5, t2 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ ++ MUL a3, c04, b5 ++ fmov b5, c04 ++ MUL a3, c08, b5 ++ fmov b5, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ MUL a2, c03, b5 ++ fmov b5, t3 ++ MUL a2, c04, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++ MUL a3, c07, b5 ++ fmov b5, c07 ++ MUL a3, c08, b5 ++ fmov b5, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c07, b5 ++ fmov b5, t3 ++ MUL a2, c08, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c03, b5 ++ fmov b5, c03 ++ MUL a3, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) 
++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L68 ++ ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L68 ++ ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ldi L, -2(L) ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, b5 ++ fmov b5, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a4, b4, b5 ++ fmov b5, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 ++#else ++ blbs TMP1, $L67 ++#endif ++ .align 4 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L67: ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c02, b5 ++ fmov b5, c02 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c05, 
b5 ++ fmov b5, c05 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ and M, 1, I ++ ble I, $L79 ++ ++#if defined(LT) || defined(RN) ++ ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L78 ++ ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L78 ++ ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov 
b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, b5 ++ fmov b5, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, b5 ++ fmov b5, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, b5 ++ fmov b5, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif ++ .align 4 ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L77: ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ ADD c02, t3, b5 ++ fmov b5, c02 ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ ++ ADD c01, c02, b5 ++ fmov b5, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ++ .align 4 ++ ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT 
++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 ++ .align 4 ++ ++$L91: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi L, -1(L) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b3, b5 ++ fmov b5, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b3, b5 ++ fmov b5, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b4, b5 ++ fmov b5, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, b5 ++ fmov b5, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, b5 ++ fmov b5, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD 
c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a2, c04, b5 ++ fmov b5, t1 ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ MUL a3, c04, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL a4, c04, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, b5 ++ fmov b5, c03 ++ MUL b2, c03, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL b3, c03, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, b5 ++ fmov b5, c02 ++ MUL b2, c02, b5 ++ fmov b5, t1 ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ MUL b3, c02, b5 ++ fmov b5, t1 ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a2, c03, b5 ++ fmov b5, t1 ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ MUL a3, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, 
-1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ ble I, $L110 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, b5 ++ fmov b5, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++ ADD c01, c03, b5 ++ fmov b5, c01 ++ ADD c02, c04, b5 ++ fmov b5, c02 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov 
b5, c02 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ and M, 1, I ++ ble I, $L119 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b3, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b4, b5 ++ fmov b5, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++ ADD c01, c02, b5 ++ fmov b5, c01 ++ ADD c03, c04, b5 ++ fmov b5, c03 ++ ADD c01, c03, b5 ++ fmov b5, c01 ++ ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++#else ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, 
b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldl $9, 64($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S.bak b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak +new file mode 100644 +index 0000000..86136ae +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak +@@ -0,0 +1,4072 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." 
++#endif ++ ++#ifdef SW2B ++#define PREFETCHSIZE 56 ++#define UNOP nop ++#endif ++ ++#ifdef EV6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 56 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 2, J ++ ble J, $L40 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif ++ ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else ++ ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD 
b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif ++ ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, c11 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ 
ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c11, t1, c11 ++ ADD c12, t2, c12 ++ ADD c16, t3, c16 ++ ADD c15, t4, c15 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, c03 ++ SUB a2, c07, c07 ++ SUB a3, c11, c11 ++ SUB a4, c15, c15 ++ ++ SUB b1, c04, c04 ++ SUB b2, c08, c08 ++ SUB b3, c12, c12 ++ SUB b4, c16, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++ ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) ++ ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) ++ ++ SUB a1, c09, c09 ++ SUB a2, c10, c10 ++ SUB a3, c11, c11 ++ SUB a4, c12, c12 ++ ++ SUB b1, c13, c13 ++ SUB b2, c14, c14 ++ SUB b3, c15, c15 ++ SUB b4, c16, c16 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ MUL a1, c12, c12 ++ MUL a1, c16, c16 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ MUL a2, c12, t3 ++ MUL a2, c16, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ MUL a3, c12, t3 ++ MUL a3, c16, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ MUL a4, c12, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD b1, 10 
* SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ MUL b1, c11, c11 ++ MUL b1, c15, c15 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ MUL b2, c11, t3 ++ MUL b2, c15, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ MUL b3, c11, t3 ++ MUL b3, c15, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ MUL a3, c09, t3 ++ MUL a3, c13, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ MUL a4, c09, t3 ++ MUL a4, c13, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ MUL b1, c10, c10 ++ MUL b1, c14, c14 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ MUL b2, c10, t3 ++ MUL b2, c14, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ MUL b3, c10, t3 ++ MUL b3, c14, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ MUL a1, c11, c11 ++ MUL a1, c15, c15 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ MUL a2, c11, t3 ++ MUL a2, c15, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++ MUL a3, c12, c12 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ MUL a4, c03, t3 ++ MUL a4, c04, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ MUL b1, c07, c07 ++ MUL b1, c08, c08 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ MUL b2, c07, t3 ++ MUL b2, c08, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ MUL b3, c07, t3 ++ MUL b3, c08, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 
++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ MUL a2, c11, t3 ++ MUL a2, c12, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++ MUL a3, c15, c15 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ MUL a1, c15, c15 ++ MUL a1, c16, c16 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ MUL a2, c15, t3 ++ MUL a2, c16, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ MUL a3, c15, t3 ++ MUL a3, c16, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ MUL a4, c15, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ MUL b1, c11, c11 ++ MUL b1, c12, c12 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ MUL b2, c11, t3 ++ MUL b2, c12, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ MUL b3, c11, t3 ++ MUL b3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) ++ ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++ ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) ++ ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) ++ ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ 
fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 ++ ++ ble L, $L25 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 ++ ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, 
t2 ++ ADD c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ ADD c10, t2, c10 ++ ADD c13, t3, c13 ++ ADD c14, t4, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c13, c13 ++ SUB b4, c14, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++ MUL a3, c10, c10 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, 
c09, c09 ++ MUL b1, c10, c10 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ and M, 1, I ++ ble I, $L39 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 ++ ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 ++ ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD 
c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L37: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ ++ ADD c13, t4, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ADD c09, t3, c09 ++ ADD c13, t4, c13 ++ ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c01, t1 ++ SUB c09, t1, c09 ++ MUL a4, c01, t1 ++ SUB c13, t1, c13 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b2, c05, t1 ++ SUB c09, t1, c09 ++ MUL b3, c05, t1 ++ SUB c13, t1, c13 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a2, c09, t1 ++ SUB c13, t1, c13 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a2, c13, t1 ++ SUB c09, t1, c09 ++ MUL a3, c13, t1 ++ SUB c05, t1, c05 ++ MUL a4, c13, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b2, c09, t1 ++ SUB c05, t1, c05 ++ MUL b3, c09, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 4, KK ++#endif ++ ++#ifdef RT ++ subl KK, 4, 
KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble KK, $L58 ++ ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble TMP1, $L58 ++ ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, c05 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L57: ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 
++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, c05 ++ ADD c06, t2, c06 ++ ADD c07, t3, c07 ++ ADD c08, t4, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++ ++ SUB b1, c03, c03 ++ SUB b2, c07, c07 ++ SUB b3, c04, c04 ++ SUB b4, c08, c08 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, 
t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++ MUL a3, c07, c07 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L68 ++ ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L68 ++ ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if 
defined(LT) || defined(RN) ++ blbs KK, $L67 ++#else ++ blbs TMP1, $L67 ++#endif ++ .align 4 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L67: ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c05, t3, c05 ++ ADD c06, t4, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ and M, 1, I ++ ble I, $L79 ++ ++#if defined(LT) || defined(RN) ++ ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr 
c06 ++ ++ ldi L, -2(KK) ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L78 ++ ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L78 ++ ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L77: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 ++ ++ ADD c01, c02, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ++ .align 4 ++ ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK 
++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 ++ .align 4 ++ ++$L91: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || 
defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a2, c04, t1 ++ SUB c03, t1, c03 ++ MUL a3, c04, t1 ++ SUB c02, t1, c02 ++ MUL a4, c04, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b2, c03, t1 ++ SUB c02, t1, c02 ++ MUL b3, c03, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c01, t1 ++ SUB c03, t1, c03 ++ MUL a4, c01, t1 ++ SUB c04, t1, c04 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b2, c02, t1 ++ SUB c03, t1, c03 ++ MUL b3, c02, t1 ++ SUB c04, t1, c04 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a2, c03, t1 ++ SUB c04, t1, c04 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ ble I, $L110 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 ++ 
++$L102: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ and M, 1, I ++ ble I, $L119 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD 
a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 ++ ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++#else ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S b/kernel/sw_64/trsm_kernel_4x4_RT.S +new file mode 100644 +index 0000000..b9a1975 +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_RT.S +@@ -0,0 +1,5148 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 56 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++#define STACKSIZE 88 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define tmp $9 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ stl $9, 64($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ mulq M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif ++ ++#ifdef RN ++ negq OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ 
addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ and N, 1, J ++ ble J, $L40 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 ++ .align 4 ++ ++$L91: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi L, -1(L) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b3, b5 ++ fmov b5, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b3, b5 ++ fmov b5, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b4, b5 ++ fmov b5, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, b5 ++ fmov b5, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, b5 ++ fmov b5, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, 
c03 ++ unop ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a2, c04, b5 ++ fmov b5, t1 ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ MUL a3, c04, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL a4, c04, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, b5 ++ fmov b5, c03 ++ MUL b2, c03, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL b3, c03, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, b5 ++ fmov b5, c02 ++ MUL b2, c02, b5 ++ fmov b5, t1 ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ MUL b3, c02, b5 ++ fmov b5, t1 ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a2, c03, b5 ++ fmov b5, t1 ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ MUL a3, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, 
TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ ble I, $L110 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, b5 ++ fmov b5, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -1(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++ ADD c01, c03, b5 ++ fmov b5, c01 ++ ADD c02, c04, b5 ++ fmov b5, c02 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ 
MUL a2, c02, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ and M, 1, I ++ ble I, $L119 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b3, b5 ++ fmov b5, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b4, b5 ++ fmov b5, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ++ ADD c01, c02, b5 ++ fmov b5, c01 ++ ADD c03, c04, b5 ++ fmov b5, c03 ++ ADD c01, c03, b5 ++ fmov b5, c01 ++ ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ ++ 
SUB a1, c01, b5 ++ fmov b5, c01 ++#else ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble KK, $L58 ++ ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble TMP1, $L58 ++ ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ unop ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ ldi L, -2(L) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ unop ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ unop ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ unop ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ 
unop ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ unop ++ ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b3, b5 ++ fmov b5, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a5, b3, b5 ++ fmov b5, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b4, b5 ++ fmov b5, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b4, b5 ++ fmov b5, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, b5 ++ fmov b5, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L57: ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, b5 ++ fmov b5, t3 ++ ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ MUL a4, b1, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b2, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, b5 ++ fmov b5, t2 ++ ADD c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b2, b5 ++ fmov b5, t3 ++ ++ ADD c04, t4, b5 ++ fmov b5, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, b5 ++ fmov b5, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, b5 ++ fmov b5, c05 ++ ADD c06, t2, b5 ++ fmov b5, c06 ++ ADD c07, t3, b5 ++ fmov b5, c07 ++ ADD c08, t4, b5 ++ fmov b5, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c02, b5 ++ fmov b5, c02 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++ ++ SUB b1, c03, b5 ++ fmov b5, c03 ++ SUB b2, c07, b5 ++ fmov b5, c07 ++ SUB b3, c04, b5 ++ fmov b5, c04 ++ SUB b4, c08, b5 ++ fmov b5, c08 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++ ++ SUB b1, c05, b5 ++ fmov b5, c05 ++ SUB 
b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c07, b5 ++ fmov b5, c07 ++ SUB b4, c08, b5 ++ fmov b5, c08 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ ++ MUL a2, c04, b5 ++ fmov b5, t1 ++ MUL a2, c08, b5 ++ fmov b5, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ ++ MUL a3, c04, b5 ++ fmov b5, t1 ++ MUL a3, c08, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a4, c04, b5 ++ fmov b5, t1 ++ MUL a4, c08, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, b5 ++ fmov b5, c03 ++ MUL b1, c07, b5 ++ fmov b5, c07 ++ ++ MUL b2, c03, b5 ++ fmov b5, t1 ++ MUL b2, c07, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL b3, c03, b5 ++ fmov b5, t1 ++ MUL b3, c07, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c05, b5 ++ fmov b5, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c05, b5 ++ fmov b5, t2 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, b5 ++ fmov b5, c02 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ ++ MUL b2, c02, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ ++ MUL b3, c02, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ ++ MUL a2, c03, b5 ++ fmov b5, t1 ++ MUL a2, c07, b5 ++ fmov b5, t2 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ ++ MUL a3, c04, b5 ++ fmov b5, c04 ++ MUL a3, c08, b5 ++ fmov b5, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ MUL a2, c03, b5 ++ fmov b5, t3 ++ MUL a2, c04, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++ MUL a3, c07, b5 ++ 
fmov b5, c07 ++ MUL a3, c08, b5 ++ fmov b5, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c07, b5 ++ fmov b5, t3 ++ MUL a2, c08, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c03, b5 ++ fmov b5, c03 ++ MUL a3, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L68 ++ ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L68 ++ ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ldi L, -2(L) ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, 
b5 ++ fmov b5, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a3, b4, b5 ++ fmov b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a4, b4, b5 ++ fmov b5, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 ++#else ++ blbs TMP1, $L67 ++#endif ++ .align 4 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L67: ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c02, b5 ++ fmov b5, c02 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c05, b5 ++ fmov b5, c05 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, 
t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ and M, 1, I ++ ble I, $L79 ++ ++#if defined(LT) || defined(RN) ++ ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L78 ++ ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L78 ++ ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, b5 ++ fmov b5, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, b5 ++ fmov b5, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, b5 ++ fmov b5, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif ++ .align 4 ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L77: ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ ADD c02, t3, b5 ++ fmov b5, c02 ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ ++ ADD c01, c02, b5 ++ fmov b5, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ++ .align 4 ++ ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ 
ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ sra N, 2, J ++ ble J, $L999 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif ++ ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ flds $f31, 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else ++ ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ 
++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif ++ ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, b5 ++ fmov b5, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++/* 2 */ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ ldi L, -2(L) ++ IFMOVD tmp, b5 ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ MUL b1, a1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, b5 ++ fmov b5, t2 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, b5 
++ fmov b5, t3 ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL b1, a4, b5 ++ fmov b5, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, b5 ++ fmov b5, t3 ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, b5 ++ fmov b5, t4 ++ ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, b5 ++ fmov b5, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, b5 ++ fmov b5, t2 ++ unop ++ ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, b5 ++ fmov b5, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, b5 ++ fmov b5, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, b5 ++ fmov b5, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, b5 ++ fmov b5, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, b5 ++ fmov b5, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, b5 ++ fmov b5, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, b5 ++ fmov b5, t2 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, b5 ++ fmov b5, t3 ++ ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL b1, a4, b5 ++ fmov b5, t2 ++ ADD c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, b5 ++ fmov b5, t3 ++ ++ ADD c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, b5 ++ fmov b5, t4 ++ ADD c03, t1, b5 ++ fmov b5, c03 ++ MUL b3, a1, b5 ++ fmov b5, t1 ++ ++ ADD c04, t2, b5 ++ fmov b5, c04 ++ MUL b3, a2, b5 ++ fmov b5, t2 ++ ADD c08, t3, b5 ++ fmov b5, c08 ++ MUL b4, a2, b5 ++ fmov b5, t3 ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL b2, a3, b5 ++ fmov b5, t4 ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ MUL b3, a3, b5 ++ fmov b5, t1 ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ MUL b3, a4, b5 ++ fmov b5, t2 ++ ADD c14, t3, b5 ++ fmov b5, c14 ++ MUL b4, a4, b5 ++ fmov b5, t3 ++ ++ ADD c07, t4, b5 ++ fmov b5, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, b5 ++ fmov b5, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c11, t1, b5 ++ fmov b5, c11 ++ ADD c12, t2, b5 ++ fmov b5, c12 ++ ADD c16, t3, b5 ++ fmov b5, c16 ++ ADD c15, t4, b5 ++ fmov b5, c15 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++ ++ SUB b1, c02, b5 ++ fmov b5, c02 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c10, b5 ++ fmov b5, c10 ++ SUB b4, c14, b5 ++ fmov b5, c14 ++ ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * 
SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, b5 ++ fmov b5, c03 ++ SUB a2, c07, b5 ++ fmov b5, c07 ++ SUB a3, c11, b5 ++ fmov b5, c11 ++ SUB a4, c15, b5 ++ fmov b5, c15 ++ ++ SUB b1, c04, b5 ++ fmov b5, c04 ++ SUB b2, c08, b5 ++ fmov b5, c08 ++ SUB b3, c12, b5 ++ fmov b5, c12 ++ SUB b4, c16, b5 ++ fmov b5, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++ ++ SUB b1, c05, b5 ++ fmov b5, c05 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c07, b5 ++ fmov b5, c07 ++ SUB b4, c08, b5 ++ fmov b5, c08 ++ ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) ++ ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) ++ ++ SUB a1, c09, b5 ++ fmov b5, c09 ++ SUB a2, c10, b5 ++ fmov b5, c10 ++ SUB a3, c11, b5 ++ fmov b5, c11 ++ SUB a4, c12, b5 ++ fmov b5, c12 ++ ++ SUB b1, c13, b5 ++ fmov b5, c13 ++ SUB b2, c14, b5 ++ fmov b5, c14 ++ SUB b3, c15, b5 ++ fmov b5, c15 ++ SUB b4, c16, b5 ++ fmov b5, c16 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ MUL a1, c16, b5 ++ fmov b5, c16 ++ ++ MUL a2, c04, b5 ++ fmov b5, t1 ++ MUL a2, c08, b5 ++ fmov b5, t2 ++ MUL a2, c12, b5 ++ fmov b5, t3 ++ MUL a2, c16, b5 ++ fmov b5, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c15, t4, b5 ++ fmov b5, c15 ++ ++ MUL a3, c04, b5 ++ fmov b5, t1 ++ MUL a3, c08, b5 ++ fmov b5, t2 ++ MUL a3, c12, b5 ++ fmov b5, t3 ++ MUL a3, c16, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL a4, c04, b5 ++ fmov b5, t1 ++ MUL a4, c08, b5 ++ fmov b5, t2 ++ MUL a4, c12, b5 ++ fmov b5, t3 ++ MUL a4, c16, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, b5 ++ fmov b5, c03 ++ MUL b1, c07, b5 ++ fmov b5, c07 ++ MUL b1, c11, b5 ++ fmov b5, c11 ++ MUL b1, c15, b5 ++ fmov b5, c15 ++ ++ MUL b2, c03, b5 ++ fmov b5, t1 ++ MUL b2, c07, b5 ++ fmov b5, t2 ++ MUL b2, c11, b5 ++ fmov b5, t3 ++ MUL b2, c15, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL b3, c03, b5 ++ fmov b5, t1 ++ MUL b3, c07, b5 ++ fmov b5, t2 ++ MUL b3, c11, b5 ++ fmov b5, t3 ++ MUL b3, c15, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c10, b5 ++ fmov b5, t3 ++ MUL a2, c14, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov 
b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c09, b5 ++ fmov b5, c09 ++ MUL a3, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ MUL a2, c09, b5 ++ fmov b5, t3 ++ MUL a2, c13, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c05, b5 ++ fmov b5, t2 ++ MUL a3, c09, b5 ++ fmov b5, t3 ++ MUL a3, c13, b5 ++ fmov b5, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c15, t4, b5 ++ fmov b5, c15 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c05, b5 ++ fmov b5, t2 ++ MUL a4, c09, b5 ++ fmov b5, t3 ++ MUL a4, c13, b5 ++ fmov b5, t4 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ SUB c12, t3, b5 ++ fmov b5, c12 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, b5 ++ fmov b5, c02 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ MUL b1, c10, b5 ++ fmov b5, c10 ++ MUL b1, c14, b5 ++ fmov b5, c14 ++ ++ MUL b2, c02, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ MUL b2, c10, b5 ++ fmov b5, t3 ++ MUL b2, c14, b5 ++ fmov b5, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c07, t2, b5 ++ fmov b5, c07 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c15, t4, b5 ++ fmov b5, c15 ++ ++ MUL b3, c02, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ MUL b3, c10, b5 ++ fmov b5, t3 ++ MUL b3, c14, b5 ++ fmov b5, t4 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ SUB c12, t3, b5 ++ fmov b5, c12 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c15, b5 ++ fmov b5, c15 ++ ++ MUL a2, c03, b5 ++ fmov b5, t1 ++ MUL a2, c07, b5 ++ fmov b5, t2 ++ MUL a2, c11, b5 ++ fmov b5, t3 ++ MUL a2, c15, b5 ++ fmov b5, t4 ++ ++ SUB c04, t1, b5 ++ fmov b5, c04 ++ SUB c08, t2, b5 ++ fmov b5, c08 ++ SUB c12, t3, b5 ++ fmov b5, c12 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ MUL a3, c04, b5 ++ fmov b5, c04 ++ MUL a3, c08, b5 ++ fmov b5, c08 ++ MUL a3, c12, b5 ++ fmov b5, c12 ++ MUL a3, c16, b5 ++ fmov b5, c16 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ MUL a2, c03, b5 ++ fmov b5, t3 ++ MUL a2, c04, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c02, b5 ++ fmov b5, t2 ++ MUL a3, c03, b5 ++ fmov b5, t3 ++ MUL a3, c04, b5 ++ fmov b5, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 
++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c02, b5 ++ fmov b5, t2 ++ MUL a4, c03, b5 ++ fmov b5, t3 ++ MUL a4, c04, b5 ++ fmov b5, t4 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ SUB c15, t3, b5 ++ fmov b5, c15 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, b5 ++ fmov b5, c05 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ MUL b1, c07, b5 ++ fmov b5, c07 ++ MUL b1, c08, b5 ++ fmov b5, c08 ++ ++ MUL b2, c05, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ MUL b2, c07, b5 ++ fmov b5, t3 ++ MUL b2, c08, b5 ++ fmov b5, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL b3, c05, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ MUL b3, c07, b5 ++ fmov b5, t3 ++ MUL b3, c08, b5 ++ fmov b5, t4 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ SUB c15, t3, b5 ++ fmov b5, c15 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ MUL a2, c09, b5 ++ fmov b5, t1 ++ MUL a2, c10, b5 ++ fmov b5, t2 ++ MUL a2, c11, b5 ++ fmov b5, t3 ++ MUL a2, c12, b5 ++ fmov b5, t4 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ SUB c15, t3, b5 ++ fmov b5, c15 ++ SUB c16, t4, b5 ++ fmov b5, c16 ++ ++ MUL a3, c13, b5 ++ fmov b5, c13 ++ MUL a3, c14, b5 ++ fmov b5, c14 ++ MUL a3, c15, b5 ++ fmov b5, c15 ++ MUL a3, c16, b5 ++ fmov b5, c16 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ MUL a1, c15, b5 ++ fmov b5, c15 ++ MUL a1, c16, b5 ++ fmov b5, c16 ++ ++ MUL a2, c13, b5 ++ fmov b5, t1 ++ MUL a2, c14, b5 ++ fmov b5, t2 ++ MUL a2, c15, b5 ++ fmov b5, t3 ++ MUL a2, c16, b5 ++ fmov b5, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a3, c13, b5 ++ fmov b5, t1 ++ MUL a3, c14, b5 ++ fmov b5, t2 ++ MUL a3, c15, b5 ++ fmov b5, t3 ++ MUL a3, c16, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL a4, c13, b5 ++ fmov b5, t1 ++ MUL a4, c14, b5 ++ fmov b5, t2 ++ MUL a4, c15, b5 ++ fmov b5, t3 ++ MUL a4, c16, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, b5 ++ fmov b5, c09 ++ MUL b1, c10, b5 ++ fmov b5, c10 ++ MUL b1, c11, b5 ++ fmov b5, c11 ++ MUL b1, c12, b5 ++ fmov b5, c12 ++ ++ MUL b2, c09, b5 ++ fmov b5, t1 ++ MUL b2, c10, b5 ++ fmov b5, t2 ++ MUL b2, c11, b5 ++ fmov b5, t3 ++ MUL b2, c12, b5 ++ fmov b5, t4 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c07, t3, b5 ++ fmov b5, c07 ++ SUB c08, t4, b5 ++ fmov b5, c08 ++ ++ MUL b3, c09, b5 ++ fmov b5, t1 ++ MUL b3, c10, b5 ++ fmov b5, t2 ++ MUL b3, c11, b5 ++ fmov b5, t3 ++ MUL b3, c12, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, 
b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c07, b5 ++ fmov b5, c07 ++ MUL a1, c08, b5 ++ fmov b5, c08 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c07, b5 ++ fmov b5, t3 ++ MUL a2, c08, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c03, b5 ++ fmov b5, c03 ++ MUL a3, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) ++ ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++ ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) ++ ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) ++ ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 ++ ++ ble L, $L25 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * 
SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 ++ ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ IFMOVD tmp, b5 ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ IFMOVD tmp, b5 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++ ++$L25: ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ unop ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, b5 ++ fmov b5, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b4, b5 ++ fmov b5, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ MUL a2, b1, b5 ++ fmov b5, t2 ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ MUL a1, b2, b5 ++ fmov b5, t3 ++ ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ MUL a2, b2, b5 ++ fmov b5, t4 ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b3, b5 ++ fmov b5, t1 ++ ++ ADD c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, b5 ++ fmov b5, t2 ++ ADD c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b4, b5 ++ fmov b5, t3 ++ ++ ADD c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, b5 ++ fmov b5, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, b5 ++ fmov b5, c09 ++ ADD c10, t2, b5 ++ fmov b5, c10 ++ ADD c13, t3, b5 ++ fmov b5, c13 ++ ADD c14, t4, b5 ++ fmov b5, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, 
BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++ ++ SUB b1, c02, b5 ++ fmov b5, c02 ++ SUB b2, c06, b5 ++ fmov b5, c06 ++ SUB b3, c10, b5 ++ fmov b5, c10 ++ SUB b4, c14, b5 ++ fmov b5, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c05, b5 ++ fmov b5, c05 ++ SUB a4, c06, b5 ++ fmov b5, c06 ++ ++ SUB b1, c09, b5 ++ fmov b5, c09 ++ SUB b2, c10, b5 ++ fmov b5, c10 ++ SUB b3, c13, b5 ++ fmov b5, c13 ++ SUB b4, c14, b5 ++ fmov b5, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ ++ MUL a2, c02, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ MUL a2, c10, b5 ++ fmov b5, t3 ++ MUL a2, c14, b5 ++ fmov b5, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c05, t2, b5 ++ fmov b5, c05 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c13, t4, b5 ++ fmov b5, c13 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c05, b5 ++ fmov b5, c05 ++ MUL a3, c09, b5 ++ fmov b5, c09 ++ MUL a3, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c05, b5 ++ fmov b5, t2 ++ MUL a2, c09, b5 ++ fmov b5, t3 ++ MUL a2, c13, b5 ++ fmov b5, t4 ++ ++ SUB c02, t1, b5 ++ fmov b5, c02 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ SUB c10, t3, b5 ++ fmov b5, c10 ++ SUB c14, t4, b5 ++ fmov b5, c14 ++ ++ MUL a3, c02, b5 ++ fmov b5, c02 ++ MUL a3, c06, b5 ++ fmov b5, c06 ++ MUL a3, c10, b5 ++ fmov b5, c10 ++ MUL a3, c14, b5 ++ fmov b5, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ MUL a2, c02, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ MUL a3, c02, b5 ++ fmov b5, t2 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ MUL a4, c02, b5 ++ fmov b5, t2 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, b5 ++ fmov b5, c05 ++ MUL b1, c06, b5 ++ fmov b5, c06 ++ ++ MUL b2, c05, b5 ++ fmov b5, t1 ++ MUL b2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL b3, c05, b5 ++ fmov b5, t1 ++ MUL b3, c06, b5 ++ fmov b5, t2 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ 
MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ MUL a2, c09, b5 ++ fmov b5, t1 ++ MUL a2, c10, b5 ++ fmov b5, t2 ++ ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ SUB c14, t2, b5 ++ fmov b5, c14 ++ ++ MUL a3, c13, b5 ++ fmov b5, c13 ++ MUL a3, c14, b5 ++ fmov b5, c14 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ MUL a1, c14, b5 ++ fmov b5, c14 ++ ++ MUL a2, c13, b5 ++ fmov b5, t1 ++ MUL a2, c14, b5 ++ fmov b5, t2 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a3, c13, b5 ++ fmov b5, t1 ++ MUL a3, c14, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL a4, c13, b5 ++ fmov b5, t1 ++ MUL a4, c14, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, b5 ++ fmov b5, c09 ++ MUL b1, c10, b5 ++ fmov b5, c10 ++ ++ MUL b2, c09, b5 ++ fmov b5, t1 ++ MUL b2, c10, b5 ++ fmov b5, t2 ++ ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ SUB c06, t2, b5 ++ fmov b5, c06 ++ ++ MUL b3, c09, b5 ++ fmov b5, t1 ++ MUL b3, c10, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c06, b5 ++ fmov b5, c06 ++ ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ MUL a2, c06, b5 ++ fmov b5, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a3, c01, b5 ++ fmov b5, c01 ++ MUL a3, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ and M, 1, I ++ ble I, $L39 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 ++ ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 
++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 ++ ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ LD b5, 3 * SIZE(BO) ++ FIMOVD b5, tmp ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ IFMOVD tmp, b5 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, b5 ++ fmov b5, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 ++ ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ MUL a1, b3, b5 ++ fmov b5, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ MUL a1, b4, b5 ++ fmov b5, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, b5 ++ fmov b5, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L37: ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ MUL a1, b2, b5 ++ fmov b5, t2 ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ MUL a1, b3, b5 ++ fmov b5, t3 ++ ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, b5 ++ fmov b5, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c01, t1, b5 ++ fmov b5, c01 ++ ADD c05, t2, b5 ++ fmov b5, c05 ++ ADD c09, t3, b5 ++ fmov b5, c09 ++ ADD c13, t4, b5 ++ fmov b5, c13 ++ ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c05, b5 ++ fmov b5, c05 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a2, c01, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL a3, c01, b5 ++ fmov b5, t1 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ MUL a4, c01, b5 ++ fmov b5, t1 ++ SUB c13, t1, b5 ++ fmov 
b5, c13 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, b5 ++ fmov b5, c05 ++ MUL b2, c05, b5 ++ fmov b5, t1 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ MUL b3, c05, b5 ++ fmov b5, t1 ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a2, c09, b5 ++ fmov b5, t1 ++ SUB c13, t1, b5 ++ fmov b5, c13 ++ MUL a3, c13, b5 ++ fmov b5, c13 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, b5 ++ fmov b5, c13 ++ MUL a2, c13, b5 ++ fmov b5, t1 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ MUL a3, c13, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL a4, c13, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, b5 ++ fmov b5, c09 ++ MUL b2, c09, b5 ++ fmov b5, t1 ++ SUB c05, t1, b5 ++ fmov b5, c05 ++ MUL b3, c09, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, b5 ++ fmov b5, c05 ++ MUL a2, c05, b5 ++ fmov b5, t1 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ MUL a3, c01, b5 ++ fmov b5, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 4, KK ++#endif ++ ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldl $9, 64($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S.bak b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak +new file mode 100644 +index 0000000..af57279 +--- /dev/null ++++ b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak +@@ -0,0 +1,4072 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW2B ++#define PREFETCHSIZE 56 ++#define UNOP nop ++#endif ++ ++#ifdef EV6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 56 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $20 ++#define B $21 ++#define C $22 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++#define C3 $25 ++#define C4 $27 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl C, 0 + STACKSIZE($sp) ++ ldl LDC, 8 + STACKSIZE($sp) ++ ldl OFFSET, 16 + STACKSIZE($sp) ++ ++ SXADDQ LDC, 0, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ mull M, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ M, C, C ++#endif ++ ++#ifdef RN ++ negq OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ 
addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ and N, 1, J ++ ble J, $L40 ++ ++#ifdef RT ++ sll K, BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C ++#endif ++ ++ mov C, C1 ++#ifndef RT ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ ble I, $L100 ++ .align 4 ++ ++$L91: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L95 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L95 ++#endif ++ .align 5 ++ ++$L92: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi L, -1(L) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 9 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 10 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 11 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ LD a1, 12 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD a2, 13 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b3, t3 ++ LD a3, 14 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b3, t4 ++ LD a5, 15 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b4, t1 ++ LD a1, 16 * SIZE(AO) ++ ldi AO, 16 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b4, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L92 ++ .align 4 ++ ++$L95: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ unop ++ ble L, $L98 ++ .align 4 ++ ++$L96: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 1 * SIZE(BO) ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b1, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b1, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ldi AO, 4 * SIZE(AO) ++ bgt L, $L96 ++ .align 4 ++ ++$L98: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 
0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a2, c04, t1 ++ SUB c03, t1, c03 ++ MUL a3, c04, t1 ++ SUB c02, t1, c02 ++ MUL a4, c04, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b2, c03, t1 ++ SUB c02, t1, c02 ++ MUL b3, c03, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c01, t1 ++ SUB c03, t1, c03 ++ MUL a4, c01, t1 ++ SUB c04, t1, c04 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b2, c02, t1 ++ SUB c03, t1, c03 ++ MUL b3, c02, t1 ++ SUB c04, t1, c04 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a2, c03, t1 ++ SUB c04, t1, c04 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L91 ++ .align 4 ++ ++$L100: ++ and M, 2, I ++ ble I, $L110 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ ble L, $L105 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ ble L, $L105 ++#endif ++ .align 5 ++ ++$L102: ++ ADD c01, t1, 
c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ ldi BO, 4 * SIZE(BO) ++ MUL a3, b2, t3 ++ LD a3, 6 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a5, 7 * SIZE(AO) ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ LD a1, 8 * SIZE(AO) ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, 3 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 3 * SIZE(BO) ++ bgt L, $L102 ++ .align 4 ++ ++$L105: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L108 ++ .align 4 ++ ++$L106: ++ ADD c01, t1, c01 ++ ldi L, -1(L) ++ MUL a1, b1, t1 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ LD a2, 3 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi AO, 2 * SIZE(AO) ++ unop ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L106 ++ .align 4 ++ ++$L108: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c03, c01 ++ ADD c02, c04, c02 ++ ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a2, c02, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c02, t1, c02 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L110: ++ and M, 1, I ++ ble I, $L119 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c04 ++ ++ sra KK, 2, L ++ mov B, BO ++ unop ++ ble L, $L115 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr 
t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c04 ++ ++ sra TMP1, 2, L ++ unop ++ ble L, $L115 ++#endif ++ .align 4 ++ ++$L112: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 4 * SIZE(AO) ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ LD a2, 5 * SIZE(AO) ++ LD b2, 5 * SIZE(BO) ++ ++ ADD c03, t3, c03 ++ MUL a3, b3, t3 ++ LD a3, 6 * SIZE(AO) ++ LD b3, 6 * SIZE(BO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b4, t4 ++ LD a4, 7 * SIZE(AO) ++ LD b4, 7 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 4 * SIZE(AO) ++ ldi BO, 4 * SIZE(BO) ++ bgt L, $L112 ++ .align 4 ++ ++$L115: ++#if defined(LT) || defined(RN) ++ and KK, 3, L ++#else ++ and TMP1, 3, L ++#endif ++ ble L, $L118 ++ .align 4 ++ ++$L116: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++ LD a1, 1 * SIZE(AO) ++ LD b1, 1 * SIZE(BO) ++ ++ ldi L, -1(L) ++ ldi AO, 1 * SIZE(AO) ++ ldi BO, 1 * SIZE(BO) ++ bgt L, $L116 ++ .align 4 ++ ++$L118: ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ADD c01, c02, c01 ++ ADD c03, c04, c03 ++ ADD c01, c03, c01 ++ ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ addl B, TMP2, BO ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++#else ++ LD a1, 0 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 1 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ SXADDQ K, AORIG, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L119: ++#ifdef LN ++ SXADDQ K, B, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L40: ++ and N, 2, J ++ ble J, $L80 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ addl LDC, LDC, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ fclr t1 ++#ifndef RT ++ addl C2, LDC, C ++#endif ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L60 ++ .align 4 ++ ++$L51: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ ldi BO, 2 * SIZE(B) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble KK, $L58 ++ ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c03 ++ LD 
a2, 1 * SIZE(AO) ++ fclr c07 ++ LD a3, 2 * SIZE(AO) ++ fclr c04 ++ LD a4, 3 * SIZE(AO) ++ fclr c08 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ldi BO, 2 * SIZE(BO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ble TMP1, $L58 ++ ++ ble L, $L55 ++#endif ++ .align 4 ++ ++$L52: ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD c05, t1, c05 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L57: ++ ADD c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c05, t1, c05 ++ ADD c06, t2, c06 ++ ADD c07, t3, c07 ++ ADD c08, t4, c08 ++ .align 4 ++ ++$L58: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++ ++ SUB b1, c03, c03 ++ SUB b2, c07, c07 ++ SUB b3, c04, c04 ++ SUB b4, c08, c08 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB 
a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++ MUL a3, c07, c07 ++ MUL a3, c08, c08 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c07, 5 * SIZE(BO) ++ ST c04, 6 * SIZE(BO) ++ ST c08, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) 
++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L51 ++ .align 4 ++ ++$L60: ++ and M, 2, I ++ ble I, $L70 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L68 ++ ++ ble L, $L65 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L68 ++ ++ ble L, $L65 ++#endif ++ .align 4 ++ ++$L62: ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L62 ++ .align 4 ++ ++$L65: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L67 ++#else ++ blbs TMP1, $L67 ++#endif ++ .align 4 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L67: ++ ADD c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c02, t2, c02 ++ ADD c05, t3, c05 ++ ADD c06, t4, c06 ++ .align 4 ++ ++$L68: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c02, c02 ++ SUB a4, c06, c06 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ 
LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c05, c05 ++ MUL a3, c06, c06 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c02, 2 * SIZE(BO) ++ ST c06, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L70: ++ and M, 1, I ++ ble I, $L79 ++ ++#if defined(LT) || defined(RN) ++ ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c02 ++ LD b2, 1 * SIZE(B) ++ fclr c06 ++ ++ ldi L, -2(KK) ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 2 * SIZE(B) ++ ++ ble KK, $L78 ++ ++ ble L, $L75 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c02 ++ LD b2, 1 * SIZE(BO) ++ fclr c06 ++ ++ ldi L, -2(TMP1) ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ble TMP1, $L78 ++ ++ ble L, $L75 ++#endif ++ .align 4 ++ ++$L72: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 1 * SIZE(AO) ++ LD b2, 3 * SIZE(BO) ++ ++ ADD c02, t3, c02 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b3, t3 ++ LD b3, 4 * SIZE(BO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD a2, 0 * SIZE(AO) ++ LD b4, 5 * SIZE(BO) ++ ++ ldi BO, 4 * SIZE(BO) ++ unop ++ unop ++ bgt L, $L72 ++ .align 4 ++ ++$L75: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L77 ++#else ++ blbs TMP1, $L77 ++#endif 
++ .align 4 ++ ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ LD a1, 0 * SIZE(AO) ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L77: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c02, t3, c02 ++ ADD c06, t4, c06 ++ ++ ADD c01, c02, c01 ++ ldi AO, 1 * SIZE(AO) ++ ADD c05, c06, c05 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ++ .align 4 ++ ++$L78: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c05, c05 ++#endif ++ ++#ifdef RT ++ LD a1, 3 * SIZE(BO) ++ LD a2, 2 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L79: ++#ifdef LN ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L80: ++ sra N, 2, J ++ ble J, $L999 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ s4addl LDC, 0, TMP1 ++ subl C, TMP1, C ++#endif ++ ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C3 ++#ifndef RT ++ s4addl LDC, C, C ++#endif ++ ++ fclr t1 ++ addl C3, LDC, C4 ++ fclr t2 ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 2, I ++ fclr t3 ++ fclr t4 ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c06 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(KK) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(B) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble KK, $L18 ++#else ++ ++#ifdef LN ++ 
sll K, BASE_SHIFT + 2, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 2, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c11 ++ LD a2, 1 * SIZE(AO) ++ fclr c12 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c16 ++ LD a4, 3 * SIZE(AO) ++ fclr c15 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c02 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c06 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c03 ++ ldi L, -2(TMP1) ++ fclr c04 ++ ++ fillcs 7 * SIZE(C2) ++ fclr c08 ++ ldi BO, 4 * SIZE(BO) ++ fclr c13 ++ ++ fillcs 4 * SIZE(C3) ++ fclr c09 ++ ldi AO, 4 * SIZE(AO) ++ fclr c10 ++ ++ fillcs 7 * SIZE(C4) ++ fclr c14 ++ fclr c07 ++ ble TMP1, $L18 ++#endif ++ ++ ble L, $L15 ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD c11, t1, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD c11, t1, c11 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD 
c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c11, t1, c11 ++ ADD c12, t2, c12 ++ ADD c16, t3, c16 ++ ADD c15, t4, c15 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 4, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++ LD a1, 8 * SIZE(BO) ++ LD a2, 9 * SIZE(BO) ++ LD a3, 10 * SIZE(BO) ++ LD a4, 11 * SIZE(BO) ++ ++ LD b1, 12 * SIZE(BO) ++ LD b2, 13 * SIZE(BO) ++ LD b3, 14 * SIZE(BO) ++ LD b4, 15 * SIZE(BO) ++ ++ SUB a1, c03, c03 ++ SUB a2, c07, c07 ++ SUB a3, c11, c11 ++ SUB a4, c15, c15 ++ ++ SUB b1, c04, c04 ++ SUB b2, c08, c08 ++ SUB b3, c12, c12 ++ SUB b4, c16, c16 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c05, c05 ++ SUB b2, c06, c06 ++ SUB b3, c07, c07 ++ SUB b4, c08, c08 ++ ++ LD a1, 8 * SIZE(AO) ++ LD a2, 9 * SIZE(AO) ++ LD a3, 10 * SIZE(AO) ++ LD a4, 11 * SIZE(AO) ++ ++ LD b1, 12 * SIZE(AO) ++ LD b2, 13 * SIZE(AO) ++ LD b3, 14 * SIZE(AO) ++ LD b4, 15 * SIZE(AO) ++ ++ SUB a1, c09, c09 ++ SUB a2, c10, c10 ++ SUB a3, c11, c11 ++ SUB a4, c12, c12 ++ ++ SUB b1, c13, c13 ++ SUB b2, c14, c14 ++ SUB b3, c15, c15 ++ SUB b4, c16, c16 ++#endif ++ ++#ifdef LN ++ LD a1, 15 * SIZE(AO) ++ LD a2, 14 * SIZE(AO) ++ LD a3, 13 * SIZE(AO) ++ LD a4, 12 * SIZE(AO) ++ ++ MUL a1, c04, c04 ++ MUL a1, c08, c08 ++ MUL a1, c12, c12 ++ MUL a1, c16, c16 ++ ++ MUL a2, c04, t1 ++ MUL a2, c08, t2 ++ MUL a2, c12, t3 ++ MUL a2, c16, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ 
SUB c15, t4, c15 ++ ++ MUL a3, c04, t1 ++ MUL a3, c08, t2 ++ MUL a3, c12, t3 ++ MUL a3, c16, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a4, c04, t1 ++ MUL a4, c08, t2 ++ MUL a4, c12, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD b1, 10 * SIZE(AO) ++ LD b2, 9 * SIZE(AO) ++ LD b3, 8 * SIZE(AO) ++ ++ MUL b1, c03, c03 ++ MUL b1, c07, c07 ++ MUL b1, c11, c11 ++ MUL b1, c15, c15 ++ ++ MUL b2, c03, t1 ++ MUL b2, c07, t2 ++ MUL b2, c11, t3 ++ MUL b2, c15, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL b3, c03, t1 ++ MUL b3, c07, t2 ++ MUL b3, c11, t3 ++ MUL b3, c15, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ LD a1, 5 * SIZE(AO) ++ LD a2, 4 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c01, t1 ++ MUL a3, c05, t2 ++ MUL a3, c09, t3 ++ MUL a3, c13, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL a4, c01, t1 ++ MUL a4, c05, t2 ++ MUL a4, c09, t3 ++ MUL a4, c13, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(AO) ++ LD b2, 6 * SIZE(AO) ++ LD b3, 7 * SIZE(AO) ++ ++ MUL b1, c02, c02 ++ MUL b1, c06, c06 ++ MUL b1, c10, c10 ++ MUL b1, c14, c14 ++ ++ MUL b2, c02, t1 ++ MUL b2, c06, t2 ++ MUL b2, c10, t3 ++ MUL b2, c14, t4 ++ ++ SUB c03, t1, c03 ++ SUB c07, t2, c07 ++ SUB c11, t3, c11 ++ SUB c15, t4, c15 ++ ++ MUL b3, c02, t1 ++ MUL b3, c06, t2 ++ MUL b3, c10, t3 ++ MUL b3, c14, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(AO) ++ LD a2, 11 * SIZE(AO) ++ LD a3, 15 * SIZE(AO) ++ ++ MUL a1, c03, c03 ++ MUL a1, c07, c07 ++ MUL a1, c11, c11 ++ MUL a1, c15, c15 ++ ++ MUL a2, c03, t1 ++ MUL a2, c07, t2 ++ MUL a2, c11, t3 ++ MUL a2, c15, t4 ++ ++ SUB c04, t1, c04 ++ SUB c08, t2, c08 ++ SUB c12, t3, c12 ++ SUB c16, t4, c16 ++ ++ MUL a3, c04, c04 ++ MUL a3, c08, c08 ++ MUL a3, c12, c12 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ MUL a2, c03, t3 ++ MUL a2, c04, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ MUL a4, c03, t3 ++ MUL a4, c04, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL 
b1, c05, c05 ++ MUL b1, c06, c06 ++ MUL b1, c07, c07 ++ MUL b1, c08, c08 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ MUL b2, c07, t3 ++ MUL b2, c08, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ MUL b3, c07, t3 ++ MUL b3, c08, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ MUL a2, c11, t3 ++ MUL a2, c12, t4 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ SUB c15, t3, c15 ++ SUB c16, t4, c16 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++ MUL a3, c15, c15 ++ MUL a3, c16, c16 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a1, c14, c14 ++ MUL a1, c15, c15 ++ MUL a1, c16, c16 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ MUL a2, c15, t3 ++ MUL a2, c16, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ MUL a3, c15, t3 ++ MUL a3, c16, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ MUL a4, c15, t3 ++ MUL a4, c16, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ MUL b1, c11, c11 ++ MUL b1, c12, c12 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ MUL b2, c11, t3 ++ MUL b2, c12, t4 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ SUB c07, t3, c07 ++ SUB c08, t4, c08 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ MUL b3, c11, t3 ++ MUL b3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ MUL a1, c07, c07 ++ MUL a1, c08, c08 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ MUL a2, c07, t3 ++ MUL a2, c08, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++ MUL a3, c03, c03 ++ MUL a3, c04, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++ ++ ST c03, 8 * SIZE(BO) ++ ST c07, 9 * SIZE(BO) ++ ST c11, 10 * SIZE(BO) ++ ST c15, 11 * SIZE(BO) ++ ++ ST c04, 12 * SIZE(BO) ++ ST c08, 13 * SIZE(BO) ++ ST c12, 14 * SIZE(BO) ++ ST c16, 15 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c05, 4 * SIZE(AO) ++ ST c06, 5 * SIZE(AO) ++ ST c07, 6 * SIZE(AO) ++ ST c08, 7 * SIZE(AO) ++ ++ ST c09, 8 * SIZE(AO) ++ ST c10, 9 * SIZE(AO) ++ ST c11, 10 * SIZE(AO) ++ ST c12, 11 * SIZE(AO) ++ ++ ST c13, 12 * SIZE(AO) ++ ST c14, 13 * SIZE(AO) ++ ST c15, 14 * SIZE(AO) ++ ST c16, 15 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++ ldi C3, -4 * SIZE(C3) ++ ldi C4, -4 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ST c07, 2 * SIZE(C2) ++ ST c08, 3 * SIZE(C2) ++ 
++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c11, 2 * SIZE(C3) ++ ST c12, 3 * SIZE(C3) ++ ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ST c15, 2 * SIZE(C4) ++ ST c16, 3 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++ ldi C3, 4 * SIZE(C3) ++ ldi C4, 4 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 2, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 4, KK ++#endif ++ ++#ifdef LN ++ subl KK, 4, KK ++#endif ++ ++ ldi I, -1(I) ++ ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 2, I ++ ble I, $L30 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c01 ++ LD b4, 3 * SIZE(B) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c02 ++ fclr c06 ++ ble KK, $L28 ++ ++ ble L, $L25 ++ ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c10 ++ LD a4, 3 * SIZE(AO) ++ fclr c14 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c01 ++ LD b4, 3 * SIZE(BO) ++ fclr c05 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c02 ++ fclr c06 ++ ble TMP1, $L28 ++ ++ ble L, $L25 ++#endif ++ .align 4 ++ ++$L22: ++ ADD c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ ++ ADD c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD c06, t4, c06 
++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c09, t1, c09 ++ ADD c10, t2, c10 ++ ADD c13, t3, c13 ++ ADD c14, t4, c14 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++ ++ SUB b1, c02, c02 ++ SUB b2, c06, c06 ++ SUB b3, c10, c10 ++ SUB b4, c14, c14 ++ ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c05, c05 ++ SUB a4, c06, c06 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c13, c13 ++ SUB b4, c14, c14 ++#endif ++ ++#ifdef LN ++ LD a1, 3 * SIZE(AO) ++ LD a2, 2 * SIZE(AO) ++ LD a3, 0 * SIZE(AO) ++ ++ MUL a1, c02, c02 ++ MUL a1, c06, c06 ++ MUL a1, c10, c10 ++ MUL a1, c14, c14 ++ ++ MUL a2, c02, t1 ++ MUL a2, c06, t2 ++ MUL a2, c10, t3 ++ MUL a2, c14, t4 ++ ++ SUB c01, t1, c01 ++ SUB c05, t2, c05 ++ SUB c09, t3, c09 ++ SUB c13, t4, c13 ++ ++ MUL a3, c01, c01 ++ MUL a3, c05, c05 ++ MUL a3, c09, c09 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 3 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++ ++ MUL a2, c01, t1 ++ MUL a2, c05, t2 ++ MUL a2, c09, t3 ++ MUL a2, c13, t4 ++ ++ SUB c02, t1, c02 ++ SUB c06, t2, c06 ++ SUB c10, t3, c10 ++ SUB c14, t4, c14 ++ ++ MUL a3, c02, c02 ++ MUL a3, c06, c06 ++ MUL a3, c10, c10 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ MUL a2, c01, t1 ++ MUL a2, c02, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c01, t1 ++ MUL a4, c02, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b1, c06, c06 ++ ++ MUL b2, c05, t1 ++ MUL b2, c06, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL b3, c05, t1 ++ MUL b3, c06, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ MUL a2, c09, t1 ++ MUL a2, c10, t2 ++ ++ SUB c13, t1, c13 ++ SUB c14, t2, c14 ++ ++ MUL a3, c13, c13 ++ MUL a3, c14, c14 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL 
a1, c14, c14 ++ ++ MUL a2, c13, t1 ++ MUL a2, c14, t2 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a3, c13, t1 ++ MUL a3, c14, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL a4, c13, t1 ++ MUL a4, c14, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b1, c10, c10 ++ ++ MUL b2, c09, t1 ++ MUL b2, c10, t2 ++ ++ SUB c05, t1, c05 ++ SUB c06, t2, c06 ++ ++ MUL b3, c09, t1 ++ MUL b3, c10, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a1, c06, c06 ++ ++ MUL a2, c05, t1 ++ MUL a2, c06, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a3, c01, c01 ++ MUL a3, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++ ++ ST c02, 4 * SIZE(BO) ++ ST c06, 5 * SIZE(BO) ++ ST c10, 6 * SIZE(BO) ++ ST c14, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c05, 2 * SIZE(AO) ++ ST c06, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c13, 6 * SIZE(AO) ++ ST c14, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++ ldi C3, -2 * SIZE(C3) ++ ldi C4, -2 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c06, 1 * SIZE(C2) ++ ++ ST c09, 0 * SIZE(C3) ++ ST c10, 1 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ST c14, 1 * SIZE(C4) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++ ldi C3, 2 * SIZE(C3) ++ ldi C4, 2 * SIZE(C4) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, 1 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ .align 4 ++ ++$L30: ++ and M, 1, I ++ ble I, $L39 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(B) ++ ldi L, -2(KK) ++ LD b2, 1 * SIZE(B) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(B) ++ fclr c09 ++ LD b4, 3 * SIZE(B) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(B) ++ ble KK, $L38 ++ ++ ble L, $L35 ++#else ++#ifdef LN ++ sll K, BASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, BASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c01 ++ LD a2, 1 * SIZE(AO) ++ fclr c05 ++ ++ LD b1, 0 * SIZE(BO) ++ ldi L, -2(TMP1) ++ LD b2, 1 * SIZE(BO) ++ ldi AO, 1 * SIZE(AO) ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c09 ++ LD b4, 3 * SIZE(BO) ++ fclr c13 ++ ++ ldi BO, 4 * SIZE(BO) ++ ble TMP1, $L38 ++ ++ ble L, $L35 ++#endif ++ .align 4 ++ ++$L32: ++ ADD c01, t1, c01 ++ ldi L, -2(L) ++ MUL a1, b1, t1 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ ldi AO, 2 * SIZE(AO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b5, 3 * SIZE(BO) ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, -1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ MUL a2, b1, t1 ++ LD b1, 4 * SIZE(BO) ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD c05, t2, c05 ++ MUL a2, b2, t2 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ LD b4, -1 * SIZE(BO) ++ MUL a2, b3, t3 ++ LD b3, 
-2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a2, b5, t4 ++ LD a2, 0 * SIZE(AO) ++ bgt L, $L32 ++ .align 4 ++ ++$L35: ++ ADD c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L37 ++#else ++ blbs TMP1, $L37 ++#endif ++ .align 4 ++ ++ ADD c05, t2, c05 ++ LD b1, 0 * SIZE(BO) ++ MUL a1, b2, t2 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD c13, t4, c13 ++ MUL a1, b4, t4 ++ LD a1, 0 * SIZE(AO) ++ ldi AO, 1 * SIZE(AO) ++ ++ ADD c01, t1, c01 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L37: ++ ADD c05, t2, c05 ++ MUL a1, b2, t2 ++ ADD c09, t3, c09 ++ MUL a1, b3, t3 ++ ++ ADD c13, t4, c13 ++ ldi AO, 1 * SIZE(AO) ++ MUL a1, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD c01, t1, c01 ++ ADD c05, t2, c05 ++ ADD c09, t3, c09 ++ ADD c13, t4, c13 ++ ++$L38: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 4, TMP1 ++#endif ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, BASE_SHIFT + 2, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -1 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c05, c05 ++ SUB a3, c09, c09 ++ SUB a4, c13, c13 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ ++ MUL a1, c01, c01 ++ MUL a1, c05, c05 ++ MUL a1, c09, c09 ++ MUL a1, c13, c13 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a1, c01, c01 ++ MUL a2, c01, t1 ++ SUB c05, t1, c05 ++ MUL a3, c01, t1 ++ SUB c09, t1, c09 ++ MUL a4, c01, t1 ++ SUB c13, t1, c13 ++ ++ LD b1, 5 * SIZE(BO) ++ LD b2, 6 * SIZE(BO) ++ LD b3, 7 * SIZE(BO) ++ ++ MUL b1, c05, c05 ++ MUL b2, c05, t1 ++ SUB c09, t1, c09 ++ MUL b3, c05, t1 ++ SUB c13, t1, c13 ++ ++ LD a1, 10 * SIZE(BO) ++ LD a2, 11 * SIZE(BO) ++ LD a3, 15 * SIZE(BO) ++ ++ MUL a1, c09, c09 ++ MUL a2, c09, t1 ++ SUB c13, t1, c13 ++ MUL a3, c13, c13 ++#endif ++ ++#ifdef RT ++ LD a1, 15 * SIZE(BO) ++ LD a2, 14 * SIZE(BO) ++ LD a3, 13 * SIZE(BO) ++ LD a4, 12 * SIZE(BO) ++ ++ MUL a1, c13, c13 ++ MUL a2, c13, t1 ++ SUB c09, t1, c09 ++ MUL a3, c13, t1 ++ SUB c05, t1, c05 ++ MUL a4, c13, t1 ++ SUB c01, t1, c01 ++ ++ LD b1, 10 * SIZE(BO) ++ LD b2, 9 * SIZE(BO) ++ LD b3, 8 * SIZE(BO) ++ ++ MUL b1, c09, c09 ++ MUL b2, c09, t1 ++ SUB c05, t1, c05 ++ MUL b3, c09, t1 ++ SUB c01, t1, c01 ++ ++ LD a1, 5 * SIZE(BO) ++ LD a2, 4 * SIZE(BO) ++ LD a3, 0 * SIZE(BO) ++ ++ MUL a1, c05, c05 ++ MUL a2, c05, t1 ++ SUB c01, t1, c01 ++ MUL a3, c01, c01 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c05, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c13, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c05, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c13, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -1 * SIZE(C1) ++ ldi C2, -1 * SIZE(C2) ++ ldi C3, -1 * SIZE(C3) ++ ldi C4, -1 * SIZE(C4) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c05, 0 * SIZE(C2) ++ ST c09, 0 * SIZE(C3) ++ ST c13, 0 * SIZE(C4) ++ ++#ifdef RT ++ sll K, 0 + BASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, BASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, BASE_SHIFT 
+ 2, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L39: ++#ifdef LN ++ sll K, 2 + BASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 4, KK ++#endif ++ ++#ifdef RT ++ subl KK, 4, KK ++#endif ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zamax.S b/kernel/sw_64/zamax.S +new file mode 100644 +index 0000000..c453e9d +--- /dev/null ++++ b/kernel/sw_64/zamax.S +@@ -0,0 +1,302 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $0 ++ unop ++ ++ fstd $f6, 32($sp) ++ unop ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ fclr $f0 ++ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ sra N, 2, $1 ++ addl INCX, INCX, INCX ++ ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ faddd $f20, $f21, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ ldi $1, -1($1) ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f1 ++ LD $f23, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ fmov $f0, $f2 ++ LD $f25, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ fmov $f0, $f3 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f8 ++ fabs $f21, $f9 ++ fabs $f22, $f10 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ fabs $f25, $f13 ++ fabs $f26, $f14 ++ fabs $f27, $f15 ++ ++ ble $1, $L14 ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ ldi $1, -1($1) ++ addl X, INCX, X ++ ++ LD $f22, 0 * SIZE(X) ++ LD $f23, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ LD $f25, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ faddd $f8, $f9, $f16 ++ unop ++ fabs $f20, $f8 ++ fillcs 64 * SIZE(X) ++ ++ faddd $f10, $f11, $f17 ++ unop ++ fabs $f21, $f9 ++ LD $f20, 0 * SIZE(X) ++ ++ faddd $f12, $f13, $f18 ++ LD $f21, 1 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ faddd $f14, $f15, $f19 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ unop ++ ++ CMPLT($f0, $f16), $f4 ++ LD $f23, 1 * SIZE(X) ++ fabs $f24, $f12 ++ addl X, INCX, X ++ ++ CMPLT($f1, $f17), $f5 ++ LD $f24, 0 * SIZE(X) ++ fabs $f25, $f13 ++ unop ++ ++ CMPLT($f2, $f18), $f6 ++ LD $f25, 1 * SIZE(X) ++ fabs $f26, $f14 ++ addl X, INCX, X ++ ++ CMPLT($f3, $f19), $f7 ++ LD $f26, 0 * SIZE(X) ++ fabs $f27, $f15 ++ unop ++ ++ fselne $f4, $f16, $f0, $f0 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ldi $1, -1($1) # i -- ++ ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ faddd $f8, $f9, $f16 ++ fabs $f20, $f8 ++ ++ faddd $f10, $f11, $f17 ++ fabs $f21, $f9 ++ ++ faddd $f12, $f13, $f18 ++ fabs $f22, $f10 ++ ++ faddd $f14, $f15, $f19 ++ fabs $f23, $f11 ++ ++ CMPLT($f0, $f16), $f4 ++ fabs $f24, $f12 ++ ++ CMPLT($f1, $f17), $f5 ++ fabs $f25, $f13 ++ ++ CMPLT($f2, $f18), $f6 ++ fabs $f26, $f14 ++ CMPLT($f3, $f19), $f7 ++ fabs $f27, $f15 ++ ++ fselne $f4, $f16, $f0, $f0 ++ fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ .align 4 ++ ++$L14: ++ faddd $f8, $f9, $f16 ++ faddd $f10, $f11, $f17 ++ faddd $f12, $f13, $f18 ++ faddd $f14, $f15, $f19 ++ ++ CMPLT($f0, $f16), $f4 ++ CMPLT($f1, $f17), $f5 ++ CMPLT($f2, $f18), $f6 ++ CMPLT($f3, $f19), $f7 ++ ++ fselne $f4, $f16, $f0, $f0 ++ 
fselne $f5, $f17, $f1, $f1 ++ fselne $f6, $f18, $f2, $f2 ++ fselne $f7, $f19, $f3, $f3 ++ ++ CMPLT($f0, $f1), $f16 ++ CMPLT($f2, $f3), $f17 ++ ++ fselne $f16, $f1, $f0, $f0 ++ fselne $f17, $f3, $f2, $f2 ++ ++ CMPLT($f0, $f2), $f16 ++ fselne $f16, $f2, $f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 3, $1 ++ unop ++ unop ++ ble $1, $End ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ faddd $f29, $f30, $f20 ++ fmov $f20,$f29 ++ ++ CMPLT($f0, $f29), $f16 ++ fselne $f16, $f29, $f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/zamax.S.bak b/kernel/sw_64/zamax.S.bak +new file mode 100644 +index 0000000..74b9331 +--- /dev/null ++++ b/kernel/sw_64/zamax.S.bak +@@ -0,0 +1,301 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++ ++#ifndef USE_MIN ++#define CMPLT(a, b) fcmplt a, b ++#else ++#define CMPLT(a, b) fcmplt b, a ++#endif ++ ++#define STACKSIZE 8 * 8 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, STACKSIZE, $26, 0 ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fclr $f16 ++ cmplt $31, N, $2 ++ ++ fstd $f3, 8($sp) ++ fclr $f17 ++ cmplt $31, INCX, $3 ++ unop ++ ++ fstd $f4, 16($sp) ++ fclr $f18 ++ SXADDQ INCX, $31, INCX ++ unop ++ ++ fstd $f5, 24($sp) ++ fclr $f19 ++ and $2, $3, $0 ++ unop ++ ++ fstd $f6, 32($sp) ++ unop ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ fclr $f0 ++ beq $0, $End # if (n <= 0) or (incx <= 0) return ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ sra N, 2, $1 ++ addl INCX, INCX, INCX ++ ++ fabs $f20, $f20 ++ fabs $f21, $f21 ++ faddd $f20, $f21, $f0 ++ ble $1, $L15 ++ .align 4 ++ ++ ldi $1, -1($1) ++ unop ++ addl X, INCX, X ++ unop ++ ++ LD $f22, 0 * SIZE(X) ++ fmov $f0, $f1 ++ LD $f23, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ fmov $f0, $f2 ++ LD $f25, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ fmov $f0, $f3 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ fabs $f20, $f8 ++ fabs $f21, $f9 ++ fabs $f22, $f10 ++ fabs $f23, $f11 ++ ++ fabs $f24, $f12 ++ fabs $f25, $f13 ++ fabs $f26, $f14 ++ fabs $f27, $f15 ++ ++ ble $1, $L14 ++ .align 4 ++ ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ ldi $1, -1($1) ++ addl X, INCX, X ++ ++ LD $f22, 0 * SIZE(X) ++ LD $f23, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f24, 0 * SIZE(X) ++ LD $f25, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ LD $f26, 0 * SIZE(X) ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ble $1, $L13 ++ .align 4 ++ ++$L12: ++ faddd $f8, $f9, $f16 ++ unop ++ fabs $f20, $f8 ++ fillcs 64 * SIZE(X) ++ ++ faddd $f10, $f11, $f17 ++ unop ++ fabs $f21, $f9 ++ LD $f20, 0 * SIZE(X) ++ ++ faddd $f12, $f13, $f18 ++ LD $f21, 1 * SIZE(X) ++ fabs $f22, $f10 ++ addl X, INCX, X ++ ++ faddd $f14, $f15, $f19 ++ LD $f22, 0 * SIZE(X) ++ fabs $f23, $f11 ++ unop ++ ++ CMPLT($f0, $f16), $f4 ++ LD $f23, 1 * SIZE(X) ++ fabs $f24, $f12 ++ addl X, INCX, X ++ ++ CMPLT($f1, $f17), $f5 ++ LD $f24, 0 * SIZE(X) ++ fabs $f25, $f13 ++ unop ++ ++ CMPLT($f2, $f18), $f6 ++ LD $f25, 1 * SIZE(X) ++ fabs $f26, $f14 ++ addl X, INCX, X ++ ++ CMPLT($f3, $f19), $f7 ++ LD $f26, 0 * SIZE(X) ++ fabs $f27, $f15 ++ unop ++ ++fselne $f4,$f16,$f0, $f0 ++ LD $f27, 1 * SIZE(X) ++ addl X, INCX, X ++ ldi $1, -1($1) # i -- ++ ++fselne $f5,$f17,$f1, $f1 ++fselne $f6,$f18,$f2, $f2 ++fselne $f7,$f19,$f3, $f3 ++ bgt $1,$L12 ++ .align 4 ++ ++$L13: ++ faddd $f8, $f9, $f16 ++ fabs $f20, $f8 ++ ++ faddd $f10, $f11, $f17 ++ fabs $f21, $f9 ++ ++ faddd $f12, $f13, $f18 ++ fabs $f22, $f10 ++ ++ faddd $f14, $f15, $f19 ++ fabs $f23, $f11 ++ ++ CMPLT($f0, $f16), $f4 ++ fabs $f24, $f12 ++ ++ CMPLT($f1, $f17), $f5 ++ fabs $f25, $f13 ++ ++ CMPLT($f2, $f18), $f6 ++ fabs $f26, $f14 ++ CMPLT($f3, $f19), $f7 ++ fabs $f27, $f15 ++ ++fselne $f4,$f16,$f0, $f0 ++fselne $f5,$f17,$f1, $f1 ++fselne $f6,$f18,$f2, $f2 ++fselne $f7,$f19,$f3, $f3 ++ .align 4 ++ ++$L14: ++ faddd $f8, $f9, $f16 ++ faddd $f10, $f11, $f17 ++ faddd $f12, $f13, $f18 ++ faddd $f14, $f15, $f19 ++ ++ CMPLT($f0, $f16), $f4 ++ CMPLT($f1, $f17), $f5 ++ CMPLT($f2, $f18), $f6 ++ CMPLT($f3, $f19), $f7 ++ ++fselne $f4,$f16,$f0, $f0 ++fselne $f5,$f17,$f1, $f1 
++fselne $f6,$f18,$f2, $f2 ++fselne $f7,$f19,$f3, $f3 ++ ++ CMPLT($f0, $f1), $f16 ++ CMPLT($f2, $f3), $f17 ++ ++fselne $f16,$f1,$f0, $f0 ++fselne $f17,$f3,$f2, $f2 ++ ++ CMPLT($f0, $f2), $f16 ++fselne $f16,$f2,$f0, $f0 ++ .align 4 ++ ++$L15: ++ and N, 3, $1 ++ unop ++ unop ++ ble $1, $End ++ .align 4 ++ ++$L16: ++ LD $f20, 0 * SIZE(X) ++ LD $f21, 1 * SIZE(X) ++ unop ++ addl X, INCX, X ++ ++ fabs $f20, $f29 ++ fabs $f21, $f30 ++ faddd $f29, $f30, $f29 ++ ++ CMPLT($f0, $f29), $f16 ++fselne $f16,$f29,$f0, $f0 ++ ++ ldi $1, -1($1) # i -- ++ bgt $1, $L16 ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldi $sp, STACKSIZE($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/zasum.S b/kernel/sw_64/zasum.S +new file mode 100644 +index 0000000..72e120c +--- /dev/null ++++ b/kernel/sw_64/zasum.S +@@ -0,0 +1,231 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++#define t4 $f24 ++#define s4 $f27 ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ addl INCX, INCX, INCX ++ ++ fclr s1 ++ unop ++ fclr t1 ++ ble N, $L999 ++ ++ fclr s2 ++ sra N, 2, I ++ fclr s3 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t2 ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a2, 0 * SIZE(X) ++ fclr t3 ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ fillcs PREFETCHSIZE * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ LD a6, 0 * SIZE(X) ++ fabs a1, t1 ++ unop ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ LD a7, 1 * SIZE(X) ++ fabs a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a0, 0 * SIZE(X) ++ fabs a3, t3 ++ unop ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD a1, 1 * SIZE(X) ++ fabs a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ LD a2, 0 * SIZE(X) ++ fabs a5, t1 ++ unop ++ ++ fadds s2, t2, s4 ++ fmov s4,s2 ++ LD a3, 1 * SIZE(X) ++ fabs a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a4, 0 * SIZE(X) ++ fabs a7, t3 ++ unop ++ ++ LD a5, 1 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD a6, 0 * SIZE(X) ++ fabs a0, t0 ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ LD a7, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ fabs a2, t2 ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ fabs a3, t3 ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ fabs a4, t0 ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ fabs a5, t1 ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ fabs a6, t2 ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ fabs a7, t3 ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ ++ .align 4 ++ ++$L15: ++ ADD s0, s2, $f25 ++ fmov $f25, s0 ++ and N, 3, I ++ ADD s1, s3, $f25 ++ fmov $f25, s1 ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, $f25 ++ fmov $f25, s0 ++ LD a0, 0 * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, $f25 ++ fmov $f25, s1 ++ LD a1, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, $f25 ++ ADD s1, t1, $f26 ++ ++ ADD $f25, $f26, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zasum.S.bak b/kernel/sw_64/zasum.S.bak +new file mode 100644 +index 0000000..db79771 +--- /dev/null ++++ b/kernel/sw_64/zasum.S.bak +@@ -0,0 +1,208 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ addl INCX, INCX, INCX ++ ++ fclr s1 ++ unop ++ fclr t1 ++ ble N, $L999 ++ ++ fclr s2 ++ sra N, 2, I ++ fclr s3 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t2 ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a2, 0 * SIZE(X) ++ fclr t3 ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ fillcs PREFETCHSIZE * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fabs a1, t1 ++ unop ++ ++ ADD s2, t2, s2 ++ LD a7, 1 * SIZE(X) ++ fabs a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fabs a3, t3 ++ unop ++ ++ ADD s0, t0, s0 ++ LD a1, 1 * SIZE(X) ++ fabs a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fabs a5, t1 ++ unop ++ ++ ADD s2, t2, s2 ++ LD a3, 1 * SIZE(X) ++ fabs a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fabs a7, t3 ++ unop ++ ++ LD a5, 1 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fabs a0, t0 ++ ++ ADD s1, t1, s1 ++ LD a7, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fabs a2, t2 ++ ADD s3, t3, s3 ++ fabs a3, t3 ++ ++ ADD s0, t0, s0 ++ fabs a4, t0 ++ ADD s1, t1, s1 ++ fabs a5, t1 ++ ADD s2, t2, s2 ++ fabs a6, t2 ++ ADD s3, t3, s3 ++ fabs a7, t3 ++ ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ .align 4 ++ ++$L15: ++ ADD s0, s2, s0 ++ and N, 3, I ++ ADD s1, s3, s1 ++ ble I, $L999 ++ .align 4 ++ 
++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a1, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ++ ADD s0, s1, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zasum_simd.S b/kernel/sw_64/zasum_simd.S +new file mode 100644 +index 0000000..5606fdf +--- /dev/null ++++ b/kernel/sw_64/zasum_simd.S +@@ -0,0 +1,385 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 96 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++#define t4 $f24 ++#define t5 $f25 ++#define t6 $f26 ++#define t7 $f27 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ fclr s0 ++ unop ++ fclr t0 ++ addl INCX, INCX, INCX ++ ++ fclr s1 ++ unop ++ fclr t1 ++ ble N, $L999 ++ ++ cmpeq INCX, 2, $3 ++ beq $3, $Sub ++ .align 4 ++ ++ and X, (VEC_LEN*SIZE-1), $6 ++ bgt $6, $UnAlign_X_ACCESS ++ .align 4 ++$Align_Access: ++ ++/* ++ Unloop 8*2= 16 reals ++*/ ++ sra N, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $Remain ++ ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t0 ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t1 ++ ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t2 ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t3 ++ ++ subl I, 1, I ++ addl X, 16*SIZE, X ++ unop ++ ble I, $MainLoopEnd ++ ++$MainLoop: ++ vcpys $f31, a0, a4 ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ vcpys $f31, a1, a5 ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ ++ vcpys $f31, a2, a6 ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ vcpys $f31, a3, a7 ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ ++ VADD t0, a4, t0 ++ subl I, 1, I ++ VADD t1, a5, t1 ++ fillcs PREFETCHSIZE * SIZE(X) ++ ++ VADD t2, a6, t2 ++ addl X, 16*SIZE, X ++ VADD t3, a7, t3 ++ bgt I, $MainLoop ++ ++$MainLoopEnd: ++ /*fabs*/ ++ ++ vcpys $f31, a0, a4 ++ vcpys $f31, a1, a5 ++ vcpys $f31, a2, a6 ++ vcpys $f31, a3, a7 ++ ++ VADD t0, a4, t0 ++ VADD t1, a5, t1 ++ VADD t2, a6, t2 ++ VADD t3, a7, t3 ++ ++ VADD t0, t1, t0 ++ VADD t2, t3, t2 ++ VADD t0, t2, t0 ++ nop ++ ++ vextf t0, 0, s0 ++ vextf t0, 1, s1 ++ vextf t0, 2, s2 ++ vextf t0, 3, s3 ++ ++$Remain: ++ and N, 7, I ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 ++ ble I, $End ++ .align 4 ++ ++$RemainLoop: ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ fabs a0, t0 ++ addl X, 2*SIZE, X ++ ++ fabs a1, t1 ++ ldi I, -1(I) ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ++ bne I, $RemainLoop ++ .align 4 ++$End: ++ ADD s0, s1, s0 ++ ret ++ .align 4 ++ ++$UnAlign_X_ACCESS: ++ sra N, 3, I ++ fclr s2 ++ fclr s3 ++ ble I, $Remain ++ ++ VLD_UL a0, 0*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t0 ++ VLD_UH t4, 1*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t1 ++ ++ VLD_UL a1, 1*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t2 ++ VLD_UH t5, 2*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t3 ++ ++ VLD_UL a2, 2*VEC_LEN*SIZE(X) ++ VLD_UH t6, 3*VEC_LEN*SIZE(X) ++ VLD_UL a3, 3*VEC_LEN*SIZE(X) ++ VLD_UH t7, 4*VEC_LEN*SIZE(X) ++ ++ vbisw a0, t4, a0 ++ subl I, 1, I ++ vbisw a1, t5, a1 ++ addl X, 16*SIZE, X ++ ++ vbisw a2, t6, a2 ++ unop ++ vbisw a3, t7, a3 ++ ble I, $MainLoopEnd ++ ++$UnAlign_X_ACCESS_MainLoop: ++/*fabs*/ ++ vcpys $f31, a0, a4 ++ VLD_UL a0, 0*VEC_LEN*SIZE(X) ++ vcpys $f31, a1, a5 ++ VLD_UH t4, 1*VEC_LEN*SIZE(X) ++ ++ vcpys $f31, a2, a6 ++ VLD_UL a1, 1*VEC_LEN*SIZE(X) ++ vcpys $f31, a3, a7 ++ VLD_UH t5, 2*VEC_LEN*SIZE(X) ++ ++ VADD t0, a4, t0 ++ VLD_UL a2, 2*VEC_LEN*SIZE(X) ++ VADD t1, a5, t1 ++ VLD_UH t6, 3*VEC_LEN*SIZE(X) ++ ++ VADD t2, a6, t2 ++ VLD_UL a3, 3*VEC_LEN*SIZE(X) ++ VADD t3, a7, t3 ++ VLD_UH t7, 4*VEC_LEN*SIZE(X) ++ ++ ++ vbisw a0, t4, a0 ++ subl I, 1, I ++ vbisw a1, t5, a1 ++ fillcs PREFETCHSIZE * SIZE(X) ++ 
++ vbisw a2, t6, a2 ++ addl X, 16*SIZE, X ++ vbisw a3, t7, a3 ++ bgt I, $UnAlign_X_ACCESS_MainLoop ++ ++ jmp $MainLoopEnd ++ .align 4 ++ ++ ++$Sub: ++ fclr s2 ++ sra N, 2, I ++ fclr s3 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t2 ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a2, 0 * SIZE(X) ++ fclr t3 ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, s0 ++ fillcs PREFETCHSIZE * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a6, 0 * SIZE(X) ++ fabs a1, t1 ++ unop ++ ++ ADD s2, t2, s2 ++ LD a7, 1 * SIZE(X) ++ fabs a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ fabs a3, t3 ++ unop ++ ++ ADD s0, t0, s0 ++ LD a1, 1 * SIZE(X) ++ fabs a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, s1 ++ LD a2, 0 * SIZE(X) ++ fabs a5, t1 ++ unop ++ ++ ADD s2, t2, s2 ++ LD a3, 1 * SIZE(X) ++ fabs a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ fabs a7, t3 ++ unop ++ ++ LD a5, 1 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, s0 ++ LD a6, 0 * SIZE(X) ++ fabs a0, t0 ++ ++ ADD s1, t1, s1 ++ LD a7, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fabs a2, t2 ++ ADD s3, t3, s3 ++ fabs a3, t3 ++ ++ ADD s0, t0, s0 ++ fabs a4, t0 ++ ADD s1, t1, s1 ++ fabs a5, t1 ++ ADD s2, t2, s2 ++ fabs a6, t2 ++ ADD s3, t3, s3 ++ fabs a7, t3 ++ ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++ .align 4 ++ ++$L15: ++ ADD s0, s2, s0 ++ and N, 3, I ++ ADD s1, s3, s1 ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, s0 ++ LD a0, 0 * SIZE(X) ++ fabs a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, s1 ++ LD a1, 1 * SIZE(X) ++ fabs a1, t1 ++ SXADDQ INCX, X, X ++ ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ++ ADD s0, s1, s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zaxpy.S b/kernel/sw_64/zaxpy.S +new file mode 100644 +index 0000000..19b6398 +--- /dev/null ++++ b/kernel/sw_64/zaxpy.S +@@ -0,0 +1,654 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 40 ++ ++#ifndef CONJ ++#define ADD1 SUB ++#define ADD2 ADD ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#endif ++ ++#define tmp $f9 ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldw $19, 0($sp) ++ fmov $f19, $f29 ++ ldl $20, 8($sp) ++ fmov $f20, $f30 ++ ++ mov $21, $18 ++ ldw $21, 16($sp) ++ ldi $sp, -64($sp) ++ nop ++ ++ fstd $f2, 0($sp) ++ cmpeq $19, 1, $1 ++ fstd $f3, 8($sp) ++ cmpeq $21, 1, $2 ++ ++ fstd $f4, 16($sp) ++ and $16, 3, $5 ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd tmp, 56($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ and $1, $2, $1 ++ ble $16, $End ++ sra $16, 2, $4 ++ beq $1, $Sub ++ ++ ble $4, $Remain ++ subl $4, 1, $4 ++ ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ LD $f2, 2*SIZE($18) ++ LD $f3, 3*SIZE($18) ++ LD $f4, 4*SIZE($18) ++ LD $f5, 5*SIZE($18) ++ LD $f6, 6*SIZE($18) ++ LD $f7, 7*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ LD $f10, 2*SIZE($20) ++ LD $f11, 3*SIZE($20) ++ LD $f12, 4*SIZE($20) ++ LD $f13, 5*SIZE($20) ++ LD $f14, 6*SIZE($20) ++ LD $f15, 7*SIZE($20) ++ ++ addl $18, 8*SIZE, $18 ++ ble $4, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ fillcs PREFETCHSIZE * SIZE($20) ++ fillcs PREFETCHSIZE * SIZE($18) ++ ++ MUL $f29, $f0, $f20 ++ fillcs 9*SIZE($18) ++ MUL $f30, $f1, $f21 ++ unop ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ MUL $f29, $f2, $f24 ++ unop ++ MUL $f30, $f3, $f25 ++ nop ++ ++ MUL $f30, $f2, $f26 ++ LD $f2, 2*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 3*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 4*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 ++ addl $20, 8*SIZE, $20 ++ MUL $f29, $f5, $f23 ++ LD $f5, 5*SIZE($18) ++ ++ ADD $f16, $f8, tmp ++ fmov tmp, $f16 ++ LD $f8, 0*SIZE($20) ++ MUL $f29, $f6, $f24 ++ unop ++ ++ ADD $f17, $f28, tmp ++ fmov tmp, $f17 ++ LD $f28, 1*SIZE($20) ++ MUL $f30, $f7, $f25 ++ unop ++ ++ ADD $f18, $f10, tmp ++ fmov tmp, $f18 ++ LD $f10, 2*SIZE($20) ++ MUL $f30, $f6, $f26 ++ LD $f6, 6*SIZE($18) ++ ++ ADD $f19, $f11, tmp ++ fmov tmp, $f19 ++ LD $f11, 3*SIZE($20) ++ MUL $f29, $f7, $f27 ++ LD $f7, 7*SIZE($18) ++ ++ ST $f16,-8*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17,-7*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18,-6*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19,-5*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ 
ADD $f16, $f12, tmp ++ fmov tmp, $f16 ++ LD $f12, 4*SIZE($20) ++ ADD $f17, $f13, tmp ++ fmov tmp, $f17 ++ LD $f13, 5*SIZE($20) ++ ADD $f18, $f14, tmp ++ fmov tmp, $f18 ++ LD $f14, 6*SIZE($20) ++ ADD $f19, $f15, tmp ++ fmov tmp, $f19 ++ LD $f15, 7*SIZE($20) ++ ++ ST $f16,-4*SIZE($20) ++ addl $18, 8*SIZE, $18 ++ ST $f17,-3*SIZE($20) ++ subl $4, 1, $4 ++ ++ ST $f18,-2*SIZE($20) ++ nop ++ ST $f19,-1*SIZE($20) ++ bgt $4, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 ++ ++ ADD $f16, $f8, tmp ++ fmov tmp, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, tmp ++ fmov tmp, $f17 ++ MUL $f30, $f7, $f25 ++ ++ ADD $f18, $f10, tmp ++ fmov tmp, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, tmp ++ fmov tmp, $f19 ++ MUL $f29, $f7, $f27 ++ ++ ST $f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18, 2*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19, 3*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ ADD $f16, $f12, tmp ++ fmov tmp, $f16 ++ ADD $f17, $f13, tmp ++ fmov tmp, $f17 ++ ADD $f18, $f14, tmp ++ fmov tmp, $f18 ++ ADD $f19, $f15, tmp ++ fmov tmp, $f19 ++ ++ ST $f16, 4*SIZE($20) ++ ST $f17, 5*SIZE($20) ++ ST $f18, 6*SIZE($20) ++ ST $f19, 7*SIZE($20) ++ ++ unop ++ addl $20, 8*SIZE, $20 ++ unop ++ ble $5, $End ++ .align 4 ++ ++$Remain: ++ subl $5, 1, $6 ++ ble $5, $End ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ble $6, $RemainLoopEnd ++ .align 4 ++ ++$RemainLoop: ++ MUL $f29, $f0, $f20 ++ subl $6, 1, $6 ++ MUL $f30, $f1, $f21 ++ addl $20, 2*SIZE, $20 ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, tmp ++ fmov tmp, $f16 ++ LD $f8, 0*SIZE($20) ++ ADD $f17, $f28, tmp ++ fmov tmp, $f17 ++ LD $f28, 1*SIZE($20) ++ ++ ST $f16,-2*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ST $f17,-1*SIZE($20) ++ bgt $6, $RemainLoop ++ .align 4 ++ ++$RemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, tmp ++ fmov tmp, $f16 ++ ADD $f17, $f28, tmp ++ fmov tmp, $f17 ++ ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd tmp, 56($sp) ++ ldi $sp, 64($sp) ++ ret ++ .align 4 ++ ++$Sub: ++ SXSUBL $16, SIZE, $22 ++ addl $22, $22, $22 # Complex ++ .align 4 ++ ++ addl $19, $19, $19 # Complex ++ addl $21, $21, $21 # Complex ++ ++ ble $4, $SubRemain ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f2, 0*SIZE($18) ++ LD $f3, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f4, 0*SIZE($18) ++ LD $f5, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f6, 0*SIZE($18) ++ LD $f7, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $21, $20, $24 ++ ++ LD $f10, 0*SIZE($24) ++ LD $f11, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ LD $f12, 0*SIZE($24) ++ LD $f13, 1*SIZE($24) ++ 
SXADDQ $21, $24, $24 ++ ++ LD $f14, 0*SIZE($24) ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ subl $4, 1, $4 ++ ble $4, $SubMainLoopEnd ++ .align 4 ++ ++$SubMainLoop: ++ MUL $f29, $f0, $f20 ++ unop ++ MUL $f30, $f1, $f21 ++ unop ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ MUL $f29, $f2, $f24 ++ SXADDQ $19, $18, $18 ++ MUL $f30, $f3, $f25 ++ unop ++ ++ MUL $f30, $f2, $f26 ++ LD $f2, 0*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 1*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 ++ MUL $f29, $f4, $f20 ++ unop ++ ++ ADD2 $f22, $f23, $f17 ++ unop ++ MUL $f30, $f5, $f21 ++ unop ++ ++ ADD1 $f24, $f25, $f18 ++ unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 0*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 ++ unop ++ MUL $f29, $f5, $f23 ++ LD $f5, 1*SIZE($18) ++ ++ ADD $f16, $f8, tmp ++ fmov tmp, $f16 ++ LD $f8, 0*SIZE($24) ++ MUL $f29, $f6, $f24 ++ SXADDQ $19, $18, $18 ++ ++ ADD $f17, $f28, tmp ++ fmov tmp, $f17 ++ LD $f28, 1*SIZE($24) ++ MUL $f30, $f7, $f25 ++ SXADDQ $21, $24, $24 ++ ++ ADD $f18, $f10, tmp ++ fmov tmp, $f18 ++ LD $f10, 0*SIZE($24) ++ MUL $f30, $f6, $f26 ++ LD $f6, 0*SIZE($18) ++ ++ ADD $f19, $f11, tmp ++ fmov tmp, $f19 ++ LD $f11, 1*SIZE($24) ++ MUL $f29, $f7, $f27 ++ LD $f7, 1*SIZE($18) ++ ++ ST $f16, 0*SIZE($20) ++ SXADDQ $19, $18, $18 ++ ADD1 $f20, $f21, $f16 ++ unop ++ ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ADD2 $f22, $f23, $f17 ++ unop ++ ++ ST $f18, 0*SIZE($20) ++ SXADDQ $21, $24, $24 ++ ADD1 $f24, $f25, $f18 ++ unop ++ ++ ST $f19, 1*SIZE($20) ++ unop ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 ++ ++ ADD $f16, $f12, tmp ++ fmov tmp, $f16 ++ unop ++ LD $f12, 0*SIZE($24) ++ unop ++ ++ ADD $f17, $f13, tmp ++ fmov tmp, $f17 ++ unop ++ LD $f13, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ ADD $f18, $f14, tmp ++ fmov tmp, $f18 ++ subl $4, 1, $4 ++ LD $f14, 0*SIZE($24) ++ unop ++ ++ ADD $f19, $f15, tmp ++ fmov tmp, $f19 ++ unop ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ unop ++ ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $4, $SubMainLoop ++ .align 4 ++ ++$SubMainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 ++ ++ ADD $f16, $f8, tmp ++ fmov tmp, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, tmp ++ fmov tmp, $f17 ++ MUL $f30, $f7, $f25 ++ ++ ADD $f18, $f10, tmp ++ fmov tmp, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, tmp ++ fmov tmp, $f19 ++ MUL $f29, $f7, $f27 ++ ++ ST $f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ SXADDQ $21, $20, $20 ++ nop ++ ST $f18, 0*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ++ ST $f19, 1*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 ++ ADD $f16, $f12, tmp ++ fmov tmp, $f16 ++ ++ ADD $f17, $f13, tmp ++ fmov tmp, $f17 ++ ADD $f18, $f14, tmp ++ fmov tmp, $f18 ++ ADD $f19, $f15, tmp ++ fmov tmp, $f19 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ble $5, $SubEnd ++ .align 4 ++ ++$SubRemain: ++ subl $5, 1, $6 ++ ble $5, $SubEnd ++ LD $f0, 0*SIZE($18) ++ LD $f1, 
1*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $19, $18, $18 ++ SXADDQ $21, $20, $24 ++ ble $6, $SubRemainLoopEnd ++ .align 4 ++ ++$SubRemainLoop: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 ++ ++ ADD2 $f22, $f23, $f17 ++ nop ++ ADD $f16, $f8, tmp ++ fmov tmp, $f16 ++ LD $f8, 0*SIZE($24) ++ ++ ADD $f17, $f28, tmp ++ fmov tmp, $f17 ++ LD $f28, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ subl $6, 1, $6 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $6, $SubRemainLoop ++ .align 4 ++ ++$SubRemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, tmp ++ fmov tmp, $f16 ++ ADD $f17, $f28, tmp ++ fmov tmp, $f17 ++ ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop ++ .align 4 ++ ++$SubEnd: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd tmp, 56($sp) ++ ldi $sp, 64($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zaxpy.S.bak b/kernel/sw_64/zaxpy.S.bak +new file mode 100644 +index 0000000..c6cd44b +--- /dev/null ++++ b/kernel/sw_64/zaxpy.S.bak +@@ -0,0 +1,611 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 40 ++ ++#ifndef CONJ ++#define ADD1 SUB ++#define ADD2 ADD ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#endif ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldl $19, 0($sp) ++ fmov $f19, $f29 ++ ldl $20, 8($sp) ++ fmov $f20, $f30 ++ ++ mov $21, $18 ++ ldl $21, 16($sp) ++ ldi $sp, -64($sp) ++ nop ++ ++ fstd $f2, 0($sp) ++ cmpeq $19, 1, $1 ++ fstd $f3, 8($sp) ++ cmpeq $21, 1, $2 ++ ++ fstd $f4, 16($sp) ++ and $16, 3, $5 ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ and $1, $2, $1 ++ ble $16, $End ++ sra $16, 2, $4 ++ beq $1, $Sub ++ ++ ble $4, $Remain ++ subl $4, 1, $4 ++ ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ LD $f2, 2*SIZE($18) ++ LD $f3, 3*SIZE($18) ++ LD $f4, 4*SIZE($18) ++ LD $f5, 5*SIZE($18) ++ LD $f6, 6*SIZE($18) ++ LD $f7, 7*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ LD $f10, 2*SIZE($20) ++ LD $f11, 3*SIZE($20) ++ LD $f12, 4*SIZE($20) ++ LD $f13, 5*SIZE($20) ++ LD $f14, 6*SIZE($20) ++ LD $f15, 7*SIZE($20) ++ ++ addl $18, 8*SIZE, $18 ++ ble $4, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ fillcs PREFETCHSIZE * SIZE($20) ++ fillcs PREFETCHSIZE * SIZE($18) ++ ++ MUL $f29, $f0, $f20 ++ fillcs 9*SIZE($18) ++ MUL $f30, $f1, $f21 ++ unop ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ MUL $f29, $f2, $f24 ++ unop ++ MUL $f30, $f3, $f25 ++ nop ++ ++ MUL $f30, $f2, $f26 ++ LD $f2, 2*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 3*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 4*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 ++ addl $20, 8*SIZE, $20 ++ MUL $f29, $f5, $f23 ++ LD $f5, 5*SIZE($18) ++ ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($20) ++ MUL $f29, $f6, $f24 ++ unop ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($20) ++ MUL $f30, $f7, $f25 ++ unop ++ ++ ADD $f18, $f10, $f18 ++ LD $f10, 2*SIZE($20) ++ MUL $f30, $f6, $f26 ++ LD $f6, 6*SIZE($18) ++ ++ ADD $f19, $f11, $f19 ++ LD $f11, 3*SIZE($20) ++ MUL $f29, $f7, $f27 ++ LD $f7, 7*SIZE($18) ++ ++ ST $f16,-8*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17,-7*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18,-6*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19,-5*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ ADD $f16, $f12, $f16 ++ LD $f12, 4*SIZE($20) ++ ADD $f17, $f13, $f17 ++ LD $f13, 5*SIZE($20) ++ ADD $f18, $f14, $f18 ++ LD $f14, 6*SIZE($20) ++ ADD $f19, $f15, $f19 ++ LD $f15, 7*SIZE($20) ++ ++ ST $f16,-4*SIZE($20) ++ addl $18, 8*SIZE, $18 ++ ST $f17,-3*SIZE($20) ++ subl $4, 1, $4 ++ ++ ST $f18,-2*SIZE($20) ++ nop ++ ST $f19,-1*SIZE($20) ++ bgt $4, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 ++ ++ ADD $f16, $f8, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, $f17 ++ MUL $f30, $f7, $f25 ++ ++ ADD $f18, $f10, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, $f19 ++ MUL $f29, $f7, $f27 
++ ++ ST $f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18, 2*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19, 3*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ ADD $f16, $f12, $f16 ++ ADD $f17, $f13, $f17 ++ ADD $f18, $f14, $f18 ++ ADD $f19, $f15, $f19 ++ ++ ST $f16, 4*SIZE($20) ++ ST $f17, 5*SIZE($20) ++ ST $f18, 6*SIZE($20) ++ ST $f19, 7*SIZE($20) ++ ++ unop ++ addl $20, 8*SIZE, $20 ++ unop ++ ble $5, $End ++ .align 4 ++ ++$Remain: ++ subl $5, 1, $6 ++ ble $5, $End ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ble $6, $RemainLoopEnd ++ .align 4 ++ ++$RemainLoop: ++ MUL $f29, $f0, $f20 ++ subl $6, 1, $6 ++ MUL $f30, $f1, $f21 ++ addl $20, 2*SIZE, $20 ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($20) ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($20) ++ ++ ST $f16,-2*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ST $f17,-1*SIZE($20) ++ bgt $6, $RemainLoop ++ .align 4 ++ ++$RemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ ADD $f17, $f28, $f17 ++ ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ ldi $sp, 64($sp) ++ ret ++ .align 4 ++ ++$Sub: ++ SXSUBL $16, SIZE, $22 ++ addl $22, $22, $22 # Complex ++ .align 4 ++ ++ addl $19, $19, $19 # Complex ++ addl $21, $21, $21 # Complex ++ ++ ble $4, $SubRemain ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f2, 0*SIZE($18) ++ LD $f3, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f4, 0*SIZE($18) ++ LD $f5, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f6, 0*SIZE($18) ++ LD $f7, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $21, $20, $24 ++ ++ LD $f10, 0*SIZE($24) ++ LD $f11, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ LD $f12, 0*SIZE($24) ++ LD $f13, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ LD $f14, 0*SIZE($24) ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ subl $4, 1, $4 ++ ble $4, $SubMainLoopEnd ++ .align 4 ++ ++$SubMainLoop: ++ MUL $f29, $f0, $f20 ++ unop ++ MUL $f30, $f1, $f21 ++ unop ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ MUL $f29, $f2, $f24 ++ SXADDQ $19, $18, $18 ++ MUL $f30, $f3, $f25 ++ unop ++ ++ MUL $f30, $f2, $f26 ++ LD $f2, 0*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 1*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 ++ MUL $f29, $f4, $f20 ++ unop ++ ++ ADD2 $f22, $f23, $f17 ++ unop ++ MUL $f30, $f5, $f21 ++ unop ++ ++ ADD1 $f24, $f25, $f18 ++ unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 0*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 ++ unop ++ MUL $f29, $f5, $f23 ++ LD $f5, 1*SIZE($18) ++ ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($24) ++ MUL $f29, $f6, $f24 ++ SXADDQ $19, $18, $18 ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($24) ++ MUL $f30, $f7, $f25 ++ SXADDQ $21, $24, $24 ++ ++ ADD $f18, $f10, $f18 ++ LD $f10, 0*SIZE($24) ++ MUL $f30, $f6, $f26 ++ LD $f6, 0*SIZE($18) ++ ++ ADD $f19, $f11, $f19 ++ LD $f11, 1*SIZE($24) ++ MUL $f29, $f7, $f27 ++ LD $f7, 1*SIZE($18) ++ ++ ST $f16, 0*SIZE($20) ++ SXADDQ $19, $18, $18 ++ 
ADD1 $f20, $f21, $f16 ++ unop ++ ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ADD2 $f22, $f23, $f17 ++ unop ++ ++ ST $f18, 0*SIZE($20) ++ SXADDQ $21, $24, $24 ++ ADD1 $f24, $f25, $f18 ++ unop ++ ++ ST $f19, 1*SIZE($20) ++ unop ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 ++ ++ ADD $f16, $f12, $f16 ++ unop ++ LD $f12, 0*SIZE($24) ++ unop ++ ++ ADD $f17, $f13, $f17 ++ unop ++ LD $f13, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ ADD $f18, $f14, $f18 ++ subl $4, 1, $4 ++ LD $f14, 0*SIZE($24) ++ unop ++ ++ ADD $f19, $f15, $f19 ++ unop ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ unop ++ ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $4, $SubMainLoop ++ .align 4 ++ ++$SubMainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 ++ ++ ADD $f16, $f8, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, $f17 ++ MUL $f30, $f7, $f25 ++ ++ ADD $f18, $f10, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, $f19 ++ MUL $f29, $f7, $f27 ++ ++ ST $f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ SXADDQ $21, $20, $20 ++ nop ++ ST $f18, 0*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ++ ST $f19, 1*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 ++ ADD $f16, $f12, $f16 ++ ++ ADD $f17, $f13, $f17 ++ ADD $f18, $f14, $f18 ++ ADD $f19, $f15, $f19 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ble $5, $SubEnd ++ .align 4 ++ ++$SubRemain: ++ subl $5, 1, $6 ++ ble $5, $SubEnd ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $19, $18, $18 ++ SXADDQ $21, $20, $24 ++ ble $6, $SubRemainLoopEnd ++ .align 4 ++ ++$SubRemainLoop: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 ++ ++ ADD2 $f22, $f23, $f17 ++ nop ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($24) ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ subl $6, 1, $6 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $6, $SubRemainLoop ++ .align 4 ++ ++$SubRemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ ADD $f17, $f28, $f17 ++ ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop ++ .align 4 ++ ++$SubEnd: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ ldi $sp, 64($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zaxpy_simd.S b/kernel/sw_64/zaxpy_simd.S +new file mode 100644 +index 0000000..a823ebf +--- /dev/null ++++ b/kernel/sw_64/zaxpy_simd.S +@@ -0,0 +1,1479 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. 
*/ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 128 ++ ++#ifndef CONJ ++#define ADD1 SUB ++#define ADD2 ADD ++ ++#define VADD1 VSUB ++#define VADD2 VADD ++#define VMAD1 VNMAD ++#define VMAD2 VMAD ++ ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++ ++#define VADD1 VADD ++#define VADD2 VSUB ++#define VMAD1 VMAD ++#define VMAD2 VNMAD ++ ++#endif ++ ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 64, $26, 0 ++ ++ ldl $19, 0($sp) ++ fmov $f19, $f29 ++ ldl $20, 8($sp) ++ fmov $f20, $f30 ++ ++ mov $21, $18 ++ ldl $21, 16($sp) ++ ldi $sp, -64($sp) ++ nop ++ ++ fstd $f2, 0($sp) ++ cmpeq $19, 1, $1 ++ fstd $f3, 8($sp) ++ cmpeq $21, 1, $2 ++ ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ nop ++ ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++/* ++ unloop 8: process 8 complex=16 float/double ++*/ ++ and $1, $2, $1 ++ ble $16, $End ++ sra $16, 3, $4 ++ and $16, 7, $5 ++ ++ beq $1, $Sub ++ ble $4, $Remain ++ subl $4, 1, $4 ++ nop ++/*extern alpha_r alpha_i to vector*/ ++ ++ vcpyf $f29, $f29 ++ vcpyf $f30, $f30 ++ ++/** ++ align ? 
++ test the address of Y & X ++**/ ++ and $20, (VEC_LEN*SIZE-1), $6 ++ bgt $6, $UnAlign_Y_ACCESS ++ ++ and $18, (VEC_LEN*SIZE-1), $7 ++ nop ++ nop ++ bgt $7, $UnAlign_X_ACCESS ++ ++ .align 4 ++ ++ VLD $f0, 0*VEC_LEN*SIZE($18) ++ VLD $f1, 1*VEC_LEN*SIZE($18) ++ VLD $f2, 2*VEC_LEN*SIZE($18) ++ VLD $f3, 3*VEC_LEN*SIZE($18) ++ ++/* ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ LD $f2, 2*SIZE($18) ++ LD $f3, 3*SIZE($18) ++ ++ LD $f4, 4*SIZE($18) ++ LD $f5, 5*SIZE($18) ++ LD $f6, 6*SIZE($18) ++ LD $f7, 7*SIZE($18) ++*/ ++ ++ VLD $f8, 0*VEC_LEN*SIZE($20) ++ VLD $f28, 1*VEC_LEN*SIZE($20) ++ VLD $f10, 2*VEC_LEN*SIZE($20) ++ VLD $f11, 3*VEC_LEN*SIZE($20) ++ ++/* ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ LD $f10, 2*SIZE($20) ++ LD $f11, 3*SIZE($20) ++ LD $f12, 4*SIZE($20) ++ LD $f13, 5*SIZE($20) ++ LD $f14, 6*SIZE($20) ++ LD $f15, 7*SIZE($20) ++*/ ++ addl $18, 16*SIZE, $18 ++ ble $4, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++/* ++ fillcs PREFETCHSIZE * SIZE($20) ++ fillcs PREFETCHSIZE * SIZE($18) ++*/ ++ fillcs PREFETCHSIZE * SIZE($20) ++ fillcs PREFETCHSIZE * SIZE($18) ++ ++/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ ++ vextf $f0, 1, $f4 ++ vextf $f0, 3, $f5 ++ vextf $f1, 0, $f6 ++ vextf $f1, 2, $f7 ++ ++ vextf $f2, 1, $f12 ++ vextf $f2, 3, $f13 ++ vextf $f3, 0, $f14 ++ vextf $f3, 2, $f15 ++ ++ vinsf $f4, $f1, 0, $f1 ++ vinsf $f5, $f1, 2, $f1 ++ vinsf $f6, $f0, 1, $f0 ++ vinsf $f7, $f0, 3, $f0 ++ ++ vinsf $f12, $f3, 0, $f3 ++ vinsf $f13, $f3, 2, $f3 ++ vinsf $f14, $f2, 1, $f2 ++ vinsf $f15, $f2, 3, $f2 ++ ++/*Compute*/ ++ VMUL $f29, $f0, $f20 ++ VMUL $f30, $f0, $f21 ++ VMUL $f29, $f2, $f22 ++ VMUL $f30, $f2, $f23 ++ ++ ++ VMAD1 $f30, $f1, $f20, $f16 ++ VMAD2 $f29, $f1, $f21, $f17 ++ VMAD1 $f30, $f3, $f22, $f18 ++ VMAD2 $f29, $f3, $f23, $f19 ++ ++ VLD $f0, 0*VEC_LEN*SIZE($18) ++ VLD $f1, 1*VEC_LEN*SIZE($18) ++ VLD $f2, 2*VEC_LEN*SIZE($18) ++ VLD $f3, 3*VEC_LEN*SIZE($18) ++ ++/*combine the real & image vector to complex vector*/ ++ vextf $f16, 1, $f24 ++ vextf $f16, 3, $f25 ++ vextf $f17, 0, $f26 ++ vextf $f17, 2, $f27 ++ ++ vextf $f18, 1, $f12 ++ vextf $f18, 3, $f13 ++ vextf $f19, 0, $f14 ++ vextf $f19, 2, $f15 ++ ++ vinsf $f24, $f17, 0, $f17 ++ addl $20, 16*SIZE, $20 ++ vinsf $f25, $f17, 2, $f17 ++ addl $18, 16*SIZE, $18 ++ ++ vinsf $f26, $f16, 1, $f16 ++ subl $4, 1, $4 ++ vinsf $f27, $f16, 3, $f16 ++ nop ++ ++ vinsf $f12, $f19, 0, $f19 ++ vinsf $f13, $f19, 2, $f19 ++ vinsf $f14, $f18, 1, $f18 ++ vinsf $f15, $f18, 3, $f18 ++ ++ VADD $f16, $f8, $f16 ++ VLD $f8, 0*VEC_LEN*SIZE($20) ++ VADD $f17, $f28, $f17 ++ VLD $f28, 1*VEC_LEN*SIZE($20) ++ ++ VADD $f18, $f10, $f18 ++ VLD $f10, 2*VEC_LEN*SIZE($20) ++ VADD $f19, $f11, $f19 ++ VLD $f11, 3*VEC_LEN*SIZE($20) ++ ++ VST $f16, -4*VEC_LEN*SIZE($20) ++ VST $f17, -3*VEC_LEN*SIZE($20) ++ VST $f18, -2*VEC_LEN*SIZE($20) ++ VST $f19, -1*VEC_LEN*SIZE($20) ++ ++/* ++ MUL $f29, $f0, $f20 ++ fillcs 9*SIZE($18) ++ MUL $f30, $f1, $f21 ++ unop ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ MUL $f29, $f2, $f24 ++ unop ++ MUL $f30, $f3, $f25 ++ nop ++ ++ MUL $f30, $f2, $f26 ++ LD $f2, 2*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 3*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 4*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 ++ addl $20, 8*SIZE, $20 ++ MUL $f29, $f5, $f23 ++ LD $f5, 5*SIZE($18) ++ ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($20) ++ MUL $f29, 
$f6, $f24 ++ unop ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($20) ++ MUL $f30, $f7, $f25 ++ unop ++ ++ ADD $f18, $f10, $f18 ++ LD $f10, 2*SIZE($20) ++ MUL $f30, $f6, $f26 ++ LD $f6, 6*SIZE($18) ++ ++ ADD $f19, $f11, $f19 ++ LD $f11, 3*SIZE($20) ++ MUL $f29, $f7, $f27 ++ LD $f7, 7*SIZE($18) ++ ++ ST $f16,-8*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17,-7*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18,-6*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19,-5*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ ADD $f16, $f12, $f16 ++ LD $f12, 4*SIZE($20) ++ ADD $f17, $f13, $f17 ++ LD $f13, 5*SIZE($20) ++ ADD $f18, $f14, $f18 ++ LD $f14, 6*SIZE($20) ++ ADD $f19, $f15, $f19 ++ LD $f15, 7*SIZE($20) ++ ++ ST $f16,-4*SIZE($20) ++ ++ ST $f17,-3*SIZE($20) ++ ++ ++ ST $f18,-2*SIZE($20) ++ nop ++ ST $f19,-1*SIZE($20) ++*/ ++ bgt $4, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ ++/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ ++ vextf $f0, 1, $f4 ++ vextf $f0, 3, $f5 ++ vextf $f1, 0, $f6 ++ vextf $f1, 2, $f7 ++ ++ vextf $f2, 1, $f12 ++ vextf $f2, 3, $f13 ++ vextf $f3, 0, $f14 ++ vextf $f3, 2, $f15 ++ ++ vinsf $f4, $f1, 0, $f1 ++ vinsf $f5, $f1, 2, $f1 ++ vinsf $f6, $f0, 1, $f0 ++ vinsf $f7, $f0, 3, $f0 ++ ++ vinsf $f12, $f3, 0, $f3 ++ vinsf $f13, $f3, 2, $f3 ++ vinsf $f14, $f2, 1, $f2 ++ vinsf $f15, $f2, 3, $f2 ++ ++ VMUL $f29, $f0, $f20 ++ VMUL $f30, $f0, $f21 ++ VMUL $f29, $f2, $f22 ++ VMUL $f30, $f2, $f23 ++ ++ VMAD1 $f30, $f1, $f20, $f16 ++ VMAD2 $f29, $f1, $f21, $f17 ++ VMAD1 $f30, $f3, $f22, $f18 ++ VMAD2 $f29, $f3, $f23, $f19 ++ ++/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ ++ vextf $f16, 1, $f24 ++ vextf $f16, 3, $f25 ++ vextf $f17, 0, $f26 ++ vextf $f17, 2, $f27 ++ ++ vextf $f18, 1, $f12 ++ vextf $f18, 3, $f13 ++ vextf $f19, 0, $f14 ++ vextf $f19, 2, $f15 ++ ++ vinsf $f24, $f17, 0, $f17 ++ vinsf $f25, $f17, 2, $f17 ++ vinsf $f26, $f16, 1, $f16 ++ vinsf $f27, $f16, 3, $f16 ++ ++ vinsf $f12, $f19, 0, $f19 ++ vinsf $f13, $f19, 2, $f19 ++ vinsf $f14, $f18, 1, $f18 ++ vinsf $f15, $f18, 3, $f18 ++ ++ VADD $f16, $f8, $f16 ++ VADD $f17, $f28, $f17 ++ VADD $f18, $f10, $f18 ++ VADD $f19, $f11, $f19 ++ ++ VST $f16, 0*VEC_LEN*SIZE($20) ++ VST $f17, 1*VEC_LEN*SIZE($20) ++ VST $f18, 2*VEC_LEN*SIZE($20) ++ VST $f19, 3*VEC_LEN*SIZE($20) ++ ++ addl $20, 16*SIZE, $20 ++ ble $5, $End ++ ++/* MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 ++ ++ ADD $f16, $f8, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, $f17 ++ MUL $f30, $f7, $f25 ++ ++ ADD $f18, $f10, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, $f19 ++ MUL $f29, $f7, $f27 ++ ++ ST $f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ ST $f18, 2*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ST $f19, 3*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ ++ ADD $f16, $f12, $f16 ++ ADD $f17, $f13, $f17 ++ ADD $f18, $f14, $f18 ++ ADD $f19, $f15, $f19 ++ ++ ST $f16, 4*SIZE($20) ++ ST $f17, 5*SIZE($20) ++ ST $f18, 6*SIZE($20) ++ ST $f19, 7*SIZE($20) ++ ++ unop ++ unop ++*/ ++ .align 4 ++ ++$Remain: ++ subl $5, 1, $6 ++ ble $5, $End ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ble $6, 
$RemainLoopEnd ++ .align 4 ++ ++$RemainLoop: ++ MUL $f29, $f0, $f20 ++ subl $6, 1, $6 ++ MUL $f30, $f1, $f21 ++ addl $20, 2*SIZE, $20 ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($20) ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($20) ++ ++ ST $f16,-2*SIZE($20) ++ addl $18, 2*SIZE, $18 ++ ST $f17,-1*SIZE($20) ++ bgt $6, $RemainLoop ++ .align 4 ++ ++$RemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ ADD $f17, $f28, $f17 ++ ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop ++ .align 4 ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ ldi $sp, 64($sp) ++ ret ++ .align 4 ++ ++$UnAlign_Y_ACCESS: ++ and $18, (VEC_LEN*SIZE-1), $7 ++ nop ++ nop ++ bgt $7, $UnAlign_XY_ACCESS ++ .align 4 ++/* ++ Unalign access Y, Align access X ++*/ ++ ++ VLD_UL $f8, 0*VEC_LEN*SIZE($20) ++ VLD_UH $f12, 1*VEC_LEN*SIZE($20) ++ ++ VLD_UL $f28, 1*VEC_LEN*SIZE($20) ++ VLD_UH $f13, 2*VEC_LEN*SIZE($20) ++ ++ VLD_UL $f10, 2*VEC_LEN*SIZE($20) ++ VLD_UH $f14, 3*VEC_LEN*SIZE($20) ++ ++ VLD_UL $f11, 3*VEC_LEN*SIZE($20) ++ VLD_UH $f15, 4*VEC_LEN*SIZE($20) ++ ++ VLD $f0, 0*VEC_LEN*SIZE($18) ++ VLD $f1, 1*VEC_LEN*SIZE($18) ++ VLD $f2, 2*VEC_LEN*SIZE($18) ++ VLD $f3, 3*VEC_LEN*SIZE($18) ++ ++ vbisw $f8, $f12, $f8 ++ vbisw $f28, $f13, $f28 ++ vbisw $f10, $f14, $f10 ++ vbisw $f11, $f15, $f11 ++ ++ addl $18, 16*SIZE, $18 ++ ble $4, $UnAlign_Y_MainLoopEnd ++ .align 4 ++$UnAlign_Y_MainLoop: ++ fillcs PREFETCHSIZE * SIZE($20) ++ fillcs PREFETCHSIZE * SIZE($18) ++ ++/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ ++ vextf $f0, 1, $f4 ++ vextf $f0, 3, $f5 ++ vextf $f1, 0, $f6 ++ vextf $f1, 2, $f7 ++ ++ vextf $f2, 1, $f12 ++ vextf $f2, 3, $f13 ++ vextf $f3, 0, $f14 ++ vextf $f3, 2, $f15 ++ ++ vinsf $f4, $f1, 0, $f1 ++ vinsf $f5, $f1, 2, $f1 ++ vinsf $f6, $f0, 1, $f0 ++ vinsf $f7, $f0, 3, $f0 ++ ++ vinsf $f12, $f3, 0, $f3 ++ vinsf $f13, $f3, 2, $f3 ++ vinsf $f14, $f2, 1, $f2 ++ vinsf $f15, $f2, 3, $f2 ++ ++/*Compute*/ ++ VMUL $f29, $f0, $f20 ++ VMUL $f30, $f0, $f21 ++ VMUL $f29, $f2, $f22 ++ VMUL $f30, $f2, $f23 ++ ++ ++ VMAD1 $f30, $f1, $f20, $f16 ++ VMAD2 $f29, $f1, $f21, $f17 ++ VMAD1 $f30, $f3, $f22, $f18 ++ VMAD2 $f29, $f3, $f23, $f19 ++ ++ VLD $f0, 0*VEC_LEN*SIZE($18) ++ VLD $f1, 1*VEC_LEN*SIZE($18) ++ VLD $f2, 2*VEC_LEN*SIZE($18) ++ VLD $f3, 3*VEC_LEN*SIZE($18) ++ ++ ++/*combine the real & image vector to complex vector*/ ++ vextf $f16, 1, $f24 ++ vextf $f16, 3, $f25 ++ vextf $f17, 0, $f26 ++ vextf $f17, 2, $f27 ++ ++ vextf $f18, 1, $f12 ++ vextf $f18, 3, $f13 ++ vextf $f19, 0, $f14 ++ vextf $f19, 2, $f15 ++ ++ vinsf $f24, $f17, 0, $f17 ++ addl $20, 16*SIZE, $20 ++ vinsf $f25, $f17, 2, $f17 ++ addl $18, 16*SIZE, $18 ++ ++ vinsf $f26, $f16, 1, $f16 ++ subl $4, 1, $4 ++ vinsf $f27, $f16, 3, $f16 ++ nop ++ ++ vinsf $f12, $f19, 0, $f19 ++ vinsf $f13, $f19, 2, $f19 ++ vinsf $f14, $f18, 1, $f18 ++ vinsf $f15, $f18, 3, $f18 ++ ++ VADD $f16, $f8, $f16 ++ VLD_UL $f8, 0*VEC_LEN*SIZE($20) ++ VLD_UH $f12, 1*VEC_LEN*SIZE($20) ++ ++ VADD $f17, $f28, $f17 ++ VLD_UL $f28, 1*VEC_LEN*SIZE($20) ++ VLD_UH $f13, 2*VEC_LEN*SIZE($20) ++ ++ ++ VADD $f18, $f10, $f18 ++ VLD_UL $f10, 2*VEC_LEN*SIZE($20) ++ VLD_UH $f14, 3*VEC_LEN*SIZE($20) ++ ++ 
VADD $f19, $f11, $f19 ++ VLD_UL $f11, 3*VEC_LEN*SIZE($20) ++ VLD_UH $f15, 4*VEC_LEN*SIZE($20) ++ ++ ++ vbisw $f8, $f12, $f8 ++ VST_UL $f16, -4*VEC_LEN*SIZE($20) ++ VST_UH $f16, -3*VEC_LEN*SIZE($20) ++ ++ vbisw $f28, $f13, $f28 ++ VST_UL $f17, -3*VEC_LEN*SIZE($20) ++ VST_UH $f17, -2*VEC_LEN*SIZE($20) ++ ++ vbisw $f10, $f14, $f10 ++ VST_UL $f18, -2*VEC_LEN*SIZE($20) ++ VST_UH $f18, -1*VEC_LEN*SIZE($20) ++ ++ vbisw $f11, $f15, $f11 ++ VST_UL $f19, -1*VEC_LEN*SIZE($20) ++ VST_UH $f19, 0*VEC_LEN*SIZE($20) ++ ++ bgt $4, $UnAlign_Y_MainLoop ++ ++$UnAlign_Y_MainLoopEnd: ++/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ ++ vextf $f0, 1, $f4 ++ vextf $f0, 3, $f5 ++ vextf $f1, 0, $f6 ++ vextf $f1, 2, $f7 ++ ++ vextf $f2, 1, $f12 ++ vextf $f2, 3, $f13 ++ vextf $f3, 0, $f14 ++ vextf $f3, 2, $f15 ++ ++ vinsf $f4, $f1, 0, $f1 ++ vinsf $f5, $f1, 2, $f1 ++ vinsf $f6, $f0, 1, $f0 ++ vinsf $f7, $f0, 3, $f0 ++ ++ vinsf $f12, $f3, 0, $f3 ++ vinsf $f13, $f3, 2, $f3 ++ vinsf $f14, $f2, 1, $f2 ++ vinsf $f15, $f2, 3, $f2 ++ ++ VMUL $f29, $f0, $f20 ++ VMUL $f30, $f0, $f21 ++ VMUL $f29, $f2, $f22 ++ VMUL $f30, $f2, $f23 ++ ++ VMAD1 $f30, $f1, $f20, $f16 ++ VMAD2 $f29, $f1, $f21, $f17 ++ VMAD1 $f30, $f3, $f22, $f18 ++ VMAD2 $f29, $f3, $f23, $f19 ++ ++/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ ++ vextf $f16, 1, $f24 ++ vextf $f16, 3, $f25 ++ vextf $f17, 0, $f26 ++ vextf $f17, 2, $f27 ++ ++ vextf $f18, 1, $f12 ++ vextf $f18, 3, $f13 ++ vextf $f19, 0, $f14 ++ vextf $f19, 2, $f15 ++ ++ vinsf $f24, $f17, 0, $f17 ++ vinsf $f25, $f17, 2, $f17 ++ vinsf $f26, $f16, 1, $f16 ++ vinsf $f27, $f16, 3, $f16 ++ ++ vinsf $f12, $f19, 0, $f19 ++ vinsf $f13, $f19, 2, $f19 ++ vinsf $f14, $f18, 1, $f18 ++ vinsf $f15, $f18, 3, $f18 ++ ++ VADD $f16, $f8, $f16 ++ VADD $f17, $f28, $f17 ++ VADD $f18, $f10, $f18 ++ VADD $f19, $f11, $f19 ++ ++ VST_UL $f16, 0*VEC_LEN*SIZE($20) ++ VST_UH $f16, 1*VEC_LEN*SIZE($20) ++ VST_UL $f17, 1*VEC_LEN*SIZE($20) ++ VST_UH $f17, 2*VEC_LEN*SIZE($20) ++ ++ VST_UL $f18, 2*VEC_LEN*SIZE($20) ++ VST_UH $f18, 3*VEC_LEN*SIZE($20) ++ VST_UL $f19, 3*VEC_LEN*SIZE($20) ++ VST_UH $f19, 4*VEC_LEN*SIZE($20) ++ ++ addl $20, 16*SIZE, $20 ++ ble $5, $End ++ ++ jmp $Remain ++ ++ .align 4 ++ ++ ++$UnAlign_X_ACCESS: ++ and $20, (VEC_LEN*SIZE-1), $6 ++ nop ++ nop ++ bgt $6, $UnAlign_XY_ACCESS ++ ++ .align 4 ++/* ++ Unalign access X, Align access Y ++*/ ++ VLD_UL $f0, 0*VEC_LEN*SIZE($18) ++ VLD_UH $f4, 1*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f1, 1*VEC_LEN*SIZE($18) ++ VLD_UH $f5, 2*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f2, 2*VEC_LEN*SIZE($18) ++ VLD_UH $f6, 3*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f3, 3*VEC_LEN*SIZE($18) ++ VLD_UH $f7, 4*VEC_LEN*SIZE($18) ++ ++ VLD $f8, 0*VEC_LEN*SIZE($20) ++ VLD $f28, 1*VEC_LEN*SIZE($20) ++ VLD $f10, 2*VEC_LEN*SIZE($20) ++ VLD $f11, 3*VEC_LEN*SIZE($20) ++ ++ vbisw $f0, $f4, $f0 ++ vbisw $f1, $f5, $f1 ++ vbisw $f2, $f6, $f2 ++ vbisw $f3, $f7, $f3 ++ ++ addl $18, 16*SIZE, $18 ++ ble $4, $UnAlign_X_MainLoopEnd ++ .align 4 ++$UnAlign_X_MainLoop: ++ fillcs PREFETCHSIZE * SIZE($20) ++ fillcs PREFETCHSIZE * SIZE($18) ++ ++/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ ++ vextf $f0, 1, $f4 ++ vextf $f0, 3, $f5 ++ vextf $f1, 0, $f6 ++ vextf $f1, 2, $f7 ++ ++ vextf $f2, 1, $f12 ++ vextf $f2, 3, $f13 ++ vextf $f3, 0, $f14 ++ vextf $f3, 2, $f15 ++ ++ vinsf $f4, $f1, 0, $f1 ++ vinsf $f5, $f1, 2, $f1 ++ vinsf $f6, $f0, 1, $f0 ++ vinsf $f7, $f0, 3, $f0 ++ ++ vinsf $f12, $f3, 0, $f3 ++ vinsf $f13, $f3, 2, $f3 ++ vinsf $f14, $f2, 1, $f2 ++ vinsf $f15, $f2, 3, 
$f2 ++ ++/*Compute*/ ++ VMUL $f29, $f0, $f20 ++ VMUL $f30, $f0, $f21 ++ VMUL $f29, $f2, $f22 ++ VMUL $f30, $f2, $f23 ++ ++ ++ VMAD1 $f30, $f1, $f20, $f16 ++ VMAD2 $f29, $f1, $f21, $f17 ++ VMAD1 $f30, $f3, $f22, $f18 ++ VMAD2 $f29, $f3, $f23, $f19 ++/* ++ VLD $f0, 0*VEC_LEN*SIZE($18) ++ VLD $f1, 1*VEC_LEN*SIZE($18) ++ VLD $f2, 2*VEC_LEN*SIZE($18) ++ VLD $f3, 3*VEC_LEN*SIZE($18) ++*/ ++ VLD_UL $f0, 0*VEC_LEN*SIZE($18) ++ VLD_UH $f4, 1*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f1, 1*VEC_LEN*SIZE($18) ++ VLD_UH $f5, 2*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f2, 2*VEC_LEN*SIZE($18) ++ VLD_UH $f6, 3*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f3, 3*VEC_LEN*SIZE($18) ++ VLD_UH $f7, 4*VEC_LEN*SIZE($18) ++ ++/*combine the real & image vector to complex vector*/ ++ vextf $f16, 1, $f24 ++ vextf $f16, 3, $f25 ++ vextf $f17, 0, $f26 ++ vextf $f17, 2, $f27 ++ ++ vextf $f18, 1, $f12 ++ vextf $f18, 3, $f13 ++ vextf $f19, 0, $f14 ++ vextf $f19, 2, $f15 ++ ++ vbisw $f0, $f4, $f0 ++ vbisw $f1, $f5, $f1 ++ vbisw $f2, $f6, $f2 ++ vbisw $f3, $f7, $f3 ++ ++ vinsf $f24, $f17, 0, $f17 ++ addl $20, 16*SIZE, $20 ++ vinsf $f25, $f17, 2, $f17 ++ addl $18, 16*SIZE, $18 ++ ++ vinsf $f26, $f16, 1, $f16 ++ subl $4, 1, $4 ++ vinsf $f27, $f16, 3, $f16 ++ nop ++ ++ vinsf $f12, $f19, 0, $f19 ++ vinsf $f13, $f19, 2, $f19 ++ vinsf $f14, $f18, 1, $f18 ++ vinsf $f15, $f18, 3, $f18 ++ ++ VADD $f16, $f8, $f16 ++ VLD $f8, 0*VEC_LEN*SIZE($20) ++ VADD $f17, $f28, $f17 ++ VLD $f28, 1*VEC_LEN*SIZE($20) ++ ++ VADD $f18, $f10, $f18 ++ VLD $f10, 2*VEC_LEN*SIZE($20) ++ VADD $f19, $f11, $f19 ++ VLD $f11, 3*VEC_LEN*SIZE($20) ++ ++ VST $f16, -4*VEC_LEN*SIZE($20) ++ VST $f17, -3*VEC_LEN*SIZE($20) ++ VST $f18, -2*VEC_LEN*SIZE($20) ++ VST $f19, -1*VEC_LEN*SIZE($20) ++ ++ bgt $4, $UnAlign_X_MainLoop ++ .align 4 ++ ++$UnAlign_X_MainLoopEnd: ++/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ ++ vextf $f0, 1, $f4 ++ vextf $f0, 3, $f5 ++ vextf $f1, 0, $f6 ++ vextf $f1, 2, $f7 ++ ++ vextf $f2, 1, $f12 ++ vextf $f2, 3, $f13 ++ vextf $f3, 0, $f14 ++ vextf $f3, 2, $f15 ++ ++ vinsf $f4, $f1, 0, $f1 ++ vinsf $f5, $f1, 2, $f1 ++ vinsf $f6, $f0, 1, $f0 ++ vinsf $f7, $f0, 3, $f0 ++ ++ vinsf $f12, $f3, 0, $f3 ++ vinsf $f13, $f3, 2, $f3 ++ vinsf $f14, $f2, 1, $f2 ++ vinsf $f15, $f2, 3, $f2 ++ ++ VMUL $f29, $f0, $f20 ++ VMUL $f30, $f0, $f21 ++ VMUL $f29, $f2, $f22 ++ VMUL $f30, $f2, $f23 ++ ++ VMAD1 $f30, $f1, $f20, $f16 ++ VMAD2 $f29, $f1, $f21, $f17 ++ VMAD1 $f30, $f3, $f22, $f18 ++ VMAD2 $f29, $f3, $f23, $f19 ++ ++/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ ++ vextf $f16, 1, $f24 ++ vextf $f16, 3, $f25 ++ vextf $f17, 0, $f26 ++ vextf $f17, 2, $f27 ++ ++ vextf $f18, 1, $f12 ++ vextf $f18, 3, $f13 ++ vextf $f19, 0, $f14 ++ vextf $f19, 2, $f15 ++ ++ vinsf $f24, $f17, 0, $f17 ++ vinsf $f25, $f17, 2, $f17 ++ vinsf $f26, $f16, 1, $f16 ++ vinsf $f27, $f16, 3, $f16 ++ ++ vinsf $f12, $f19, 0, $f19 ++ vinsf $f13, $f19, 2, $f19 ++ vinsf $f14, $f18, 1, $f18 ++ vinsf $f15, $f18, 3, $f18 ++ ++ VADD $f16, $f8, $f16 ++ VADD $f17, $f28, $f17 ++ VADD $f18, $f10, $f18 ++ VADD $f19, $f11, $f19 ++ ++ VST $f16, 0*VEC_LEN*SIZE($20) ++ VST $f17, 1*VEC_LEN*SIZE($20) ++ VST $f18, 2*VEC_LEN*SIZE($20) ++ VST $f19, 3*VEC_LEN*SIZE($20) ++ ++ addl $20, 16*SIZE, $20 ++ ble $5, $End ++ ++ jmp $Remain ++ .align 4 ++ ++$UnAlign_XY_ACCESS: ++/* ++ Unalign access X & Y ++*/ ++ VLD_UL $f0, 0*VEC_LEN*SIZE($18) ++ VLD_UH $f4, 1*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f1, 1*VEC_LEN*SIZE($18) ++ VLD_UH $f5, 2*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f2, 2*VEC_LEN*SIZE($18) ++ VLD_UH $f6, 
3*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f3, 3*VEC_LEN*SIZE($18) ++ VLD_UH $f7, 4*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f8, 0*VEC_LEN*SIZE($20) ++ VLD_UH $f12, 1*VEC_LEN*SIZE($20) ++ ++ VLD_UL $f28, 1*VEC_LEN*SIZE($20) ++ VLD_UH $f13, 2*VEC_LEN*SIZE($20) ++ ++ VLD_UL $f10, 2*VEC_LEN*SIZE($20) ++ VLD_UH $f14, 3*VEC_LEN*SIZE($20) ++ ++ VLD_UL $f11, 3*VEC_LEN*SIZE($20) ++ VLD_UH $f15, 4*VEC_LEN*SIZE($20) ++ ++ vbisw $f0, $f4, $f0 ++ vbisw $f1, $f5, $f1 ++ vbisw $f2, $f6, $f2 ++ vbisw $f3, $f7, $f3 ++ ++ vbisw $f8, $f12, $f8 ++ vbisw $f28, $f13, $f28 ++ vbisw $f10, $f14, $f10 ++ vbisw $f11, $f15, $f11 ++ ++ addl $18, 16*SIZE, $18 ++ ble $4, $UnAlign_MainLoopEnd ++ .align 4 ++ ++$UnAlign_MainLoop: ++ fillcs PREFETCHSIZE * SIZE($20) ++ fillcs PREFETCHSIZE * SIZE($18) ++ ++/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ ++ vextf $f0, 1, $f4 ++ vextf $f0, 3, $f5 ++ vextf $f1, 0, $f6 ++ vextf $f1, 2, $f7 ++ ++ vextf $f2, 1, $f12 ++ vextf $f2, 3, $f13 ++ vextf $f3, 0, $f14 ++ vextf $f3, 2, $f15 ++ ++ vinsf $f4, $f1, 0, $f1 ++ vinsf $f5, $f1, 2, $f1 ++ vinsf $f6, $f0, 1, $f0 ++ vinsf $f7, $f0, 3, $f0 ++ ++ vinsf $f12, $f3, 0, $f3 ++ vinsf $f13, $f3, 2, $f3 ++ vinsf $f14, $f2, 1, $f2 ++ vinsf $f15, $f2, 3, $f2 ++ ++/*Compute*/ ++ VMUL $f29, $f0, $f20 ++ VMUL $f30, $f0, $f21 ++ VMUL $f29, $f2, $f22 ++ VMUL $f30, $f2, $f23 ++ ++ ++ VMAD1 $f30, $f1, $f20, $f16 ++ VMAD2 $f29, $f1, $f21, $f17 ++ VMAD1 $f30, $f3, $f22, $f18 ++ VMAD2 $f29, $f3, $f23, $f19 ++/* ++ VLD $f0, 0*VEC_LEN*SIZE($18) ++ VLD $f1, 1*VEC_LEN*SIZE($18) ++ VLD $f2, 2*VEC_LEN*SIZE($18) ++ VLD $f3, 3*VEC_LEN*SIZE($18) ++*/ ++ VLD_UL $f0, 0*VEC_LEN*SIZE($18) ++ VLD_UH $f4, 1*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f1, 1*VEC_LEN*SIZE($18) ++ VLD_UH $f5, 2*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f2, 2*VEC_LEN*SIZE($18) ++ VLD_UH $f6, 3*VEC_LEN*SIZE($18) ++ ++ VLD_UL $f3, 3*VEC_LEN*SIZE($18) ++ VLD_UH $f7, 4*VEC_LEN*SIZE($18) ++ ++/*combine the real & image vector to complex vector*/ ++ vextf $f16, 1, $f24 ++ vextf $f16, 3, $f25 ++ vextf $f17, 0, $f26 ++ vextf $f17, 2, $f27 ++ ++ vextf $f18, 1, $f12 ++ vextf $f18, 3, $f13 ++ vextf $f19, 0, $f14 ++ vextf $f19, 2, $f15 ++ ++ vbisw $f0, $f4, $f0 ++ vbisw $f1, $f5, $f1 ++ vbisw $f2, $f6, $f2 ++ vbisw $f3, $f7, $f3 ++ ++ vinsf $f24, $f17, 0, $f17 ++ addl $20, 16*SIZE, $20 ++ vinsf $f25, $f17, 2, $f17 ++ addl $18, 16*SIZE, $18 ++ ++ vinsf $f26, $f16, 1, $f16 ++ subl $4, 1, $4 ++ vinsf $f27, $f16, 3, $f16 ++ nop ++ ++ vinsf $f12, $f19, 0, $f19 ++ vinsf $f13, $f19, 2, $f19 ++ vinsf $f14, $f18, 1, $f18 ++ vinsf $f15, $f18, 3, $f18 ++ ++ VADD $f16, $f8, $f16 ++ VLD_UL $f8, 0*VEC_LEN*SIZE($20) ++ VLD_UH $f12, 1*VEC_LEN*SIZE($20) ++ ++ VADD $f17, $f28, $f17 ++ VLD_UL $f28, 1*VEC_LEN*SIZE($20) ++ VLD_UH $f13, 2*VEC_LEN*SIZE($20) ++ ++ ++ VADD $f18, $f10, $f18 ++ VLD_UL $f10, 2*VEC_LEN*SIZE($20) ++ VLD_UH $f14, 3*VEC_LEN*SIZE($20) ++ ++ VADD $f19, $f11, $f19 ++ VLD_UL $f11, 3*VEC_LEN*SIZE($20) ++ VLD_UH $f15, 4*VEC_LEN*SIZE($20) ++ ++/* ++ VST $f16, -4*VEC_LEN*SIZE($20) ++ VST $f17, -3*VEC_LEN*SIZE($20) ++ VST $f18, -2*VEC_LEN*SIZE($20) ++ VST $f19, -1*VEC_LEN*SIZE($20) ++*/ ++ ++ vbisw $f8, $f12, $f8 ++ VST_UL $f16, -4*VEC_LEN*SIZE($20) ++ VST_UH $f16, -3*VEC_LEN*SIZE($20) ++ ++ vbisw $f28, $f13, $f28 ++ VST_UL $f17, -3*VEC_LEN*SIZE($20) ++ VST_UH $f17, -2*VEC_LEN*SIZE($20) ++ ++ vbisw $f10, $f14, $f10 ++ VST_UL $f18, -2*VEC_LEN*SIZE($20) ++ VST_UH $f18, -1*VEC_LEN*SIZE($20) ++ ++ vbisw $f11, $f15, $f11 ++ VST_UL $f19, -1*VEC_LEN*SIZE($20) ++ VST_UH $f19, 0*VEC_LEN*SIZE($20) ++ ++ bgt $4, $UnAlign_MainLoop ++ 
.align 4 ++ ++$UnAlign_MainLoopEnd: ++ ++/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ ++ vextf $f0, 1, $f4 ++ vextf $f0, 3, $f5 ++ vextf $f1, 0, $f6 ++ vextf $f1, 2, $f7 ++ ++ vextf $f2, 1, $f12 ++ vextf $f2, 3, $f13 ++ vextf $f3, 0, $f14 ++ vextf $f3, 2, $f15 ++ ++ vinsf $f4, $f1, 0, $f1 ++ vinsf $f5, $f1, 2, $f1 ++ vinsf $f6, $f0, 1, $f0 ++ vinsf $f7, $f0, 3, $f0 ++ ++ vinsf $f12, $f3, 0, $f3 ++ vinsf $f13, $f3, 2, $f3 ++ vinsf $f14, $f2, 1, $f2 ++ vinsf $f15, $f2, 3, $f2 ++ ++ VMUL $f29, $f0, $f20 ++ VMUL $f30, $f0, $f21 ++ VMUL $f29, $f2, $f22 ++ VMUL $f30, $f2, $f23 ++ ++ VMAD1 $f30, $f1, $f20, $f16 ++ VMAD2 $f29, $f1, $f21, $f17 ++ VMAD1 $f30, $f3, $f22, $f18 ++ VMAD2 $f29, $f3, $f23, $f19 ++ ++/*combine the real(f16,f18) & image(f17,f19) vector to complex vector*/ ++ vextf $f16, 1, $f24 ++ vextf $f16, 3, $f25 ++ vextf $f17, 0, $f26 ++ vextf $f17, 2, $f27 ++ ++ vextf $f18, 1, $f12 ++ vextf $f18, 3, $f13 ++ vextf $f19, 0, $f14 ++ vextf $f19, 2, $f15 ++ ++ vinsf $f24, $f17, 0, $f17 ++ vinsf $f25, $f17, 2, $f17 ++ vinsf $f26, $f16, 1, $f16 ++ vinsf $f27, $f16, 3, $f16 ++ ++ vinsf $f12, $f19, 0, $f19 ++ vinsf $f13, $f19, 2, $f19 ++ vinsf $f14, $f18, 1, $f18 ++ vinsf $f15, $f18, 3, $f18 ++ ++ VADD $f16, $f8, $f16 ++ VADD $f17, $f28, $f17 ++ VADD $f18, $f10, $f18 ++ VADD $f19, $f11, $f19 ++ ++ VST_UL $f16, 0*VEC_LEN*SIZE($20) ++ VST_UH $f16, 1*VEC_LEN*SIZE($20) ++ VST_UL $f17, 1*VEC_LEN*SIZE($20) ++ VST_UH $f17, 2*VEC_LEN*SIZE($20) ++ ++ VST_UL $f18, 2*VEC_LEN*SIZE($20) ++ VST_UH $f18, 3*VEC_LEN*SIZE($20) ++ VST_UL $f19, 3*VEC_LEN*SIZE($20) ++ VST_UH $f19, 4*VEC_LEN*SIZE($20) ++ ++ addl $20, 16*SIZE, $20 ++ ble $5, $End ++ ++ jmp $Remain ++ .align 4 ++/*Unloop 4 complex = 8 float/double*/ ++$Sub: ++ sra $16, 2, $4 ++ and $16, 3, $5 ++ SXSUBL $16, SIZE, $22 ++ addl $22, $22, $22 # Complex ++ .align 4 ++ ++ addl $19, $19, $19 # Complex ++ addl $21, $21, $21 # Complex ++ ++ ble $4, $SubRemain ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f2, 0*SIZE($18) ++ LD $f3, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f4, 0*SIZE($18) ++ LD $f5, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f6, 0*SIZE($18) ++ LD $f7, 1*SIZE($18) ++ SXADDQ $19, $18, $18 ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $21, $20, $24 ++ ++ LD $f10, 0*SIZE($24) ++ LD $f11, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ LD $f12, 0*SIZE($24) ++ LD $f13, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ LD $f14, 0*SIZE($24) ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ subl $4, 1, $4 ++ ble $4, $SubMainLoopEnd ++ .align 4 ++ ++$SubMainLoop: ++ MUL $f29, $f0, $f20 ++ unop ++ MUL $f30, $f1, $f21 ++ unop ++ ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ++ MUL $f29, $f2, $f24 ++ SXADDQ $19, $18, $18 ++ MUL $f30, $f3, $f25 ++ unop ++ ++ MUL $f30, $f2, $f26 ++ LD $f2, 0*SIZE($18) ++ MUL $f29, $f3, $f27 ++ LD $f3, 1*SIZE($18) ++ ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 ++ MUL $f29, $f4, $f20 ++ unop ++ ++ ADD2 $f22, $f23, $f17 ++ unop ++ MUL $f30, $f5, $f21 ++ unop ++ ++ ADD1 $f24, $f25, $f18 ++ unop ++ MUL $f30, $f4, $f22 ++ LD $f4, 0*SIZE($18) ++ ++ ADD2 $f26, $f27, $f19 ++ unop ++ MUL $f29, $f5, $f23 ++ LD $f5, 1*SIZE($18) ++ ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($24) ++ MUL $f29, $f6, $f24 ++ SXADDQ $19, $18, $18 ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($24) ++ MUL $f30, $f7, $f25 ++ SXADDQ $21, $24, $24 ++ ++ ADD $f18, $f10, $f18 ++ LD $f10, 0*SIZE($24) ++ MUL $f30, $f6, $f26 ++ LD $f6, 0*SIZE($18) 
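++/* The scalar $Sub path implements the same complex AXPY as the vector loops
++ * above: per element, assuming the non-conjugated variant,
++ *     y_re += alpha_re * x_re - alpha_im * x_im;
++ *     y_im += alpha_re * x_im + alpha_im * x_re;
++ * ADD1/ADD2 (presumably defined near the top of this file) select the signs
++ * for the conjugated variants; $f29/$f30 hold the two components of alpha,
++ * and the X/Y pointers advance via SXADDQ with the doubled complex strides. */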
++ ++ ADD $f19, $f11, $f19 ++ LD $f11, 1*SIZE($24) ++ MUL $f29, $f7, $f27 ++ LD $f7, 1*SIZE($18) ++ ++ ST $f16, 0*SIZE($20) ++ SXADDQ $19, $18, $18 ++ ADD1 $f20, $f21, $f16 ++ unop ++ ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ADD2 $f22, $f23, $f17 ++ unop ++ ++ ST $f18, 0*SIZE($20) ++ SXADDQ $21, $24, $24 ++ ADD1 $f24, $f25, $f18 ++ unop ++ ++ ST $f19, 1*SIZE($20) ++ unop ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 ++ ++ ADD $f16, $f12, $f16 ++ unop ++ LD $f12, 0*SIZE($24) ++ unop ++ ++ ADD $f17, $f13, $f17 ++ unop ++ LD $f13, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ ADD $f18, $f14, $f18 ++ subl $4, 1, $4 ++ LD $f14, 0*SIZE($24) ++ unop ++ ++ ADD $f19, $f15, $f19 ++ unop ++ LD $f15, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ unop ++ ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $4, $SubMainLoop ++ .align 4 ++ ++$SubMainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ MUL $f29, $f2, $f24 ++ MUL $f30, $f3, $f25 ++ MUL $f30, $f2, $f26 ++ MUL $f29, $f3, $f27 ++ ++ ADD1 $f20, $f21, $f16 ++ MUL $f29, $f4, $f20 ++ ADD2 $f22, $f23, $f17 ++ MUL $f30, $f5, $f21 ++ ++ ADD1 $f24, $f25, $f18 ++ MUL $f30, $f4, $f22 ++ ADD2 $f26, $f27, $f19 ++ MUL $f29, $f5, $f23 ++ ++ ADD $f16, $f8, $f16 ++ MUL $f29, $f6, $f24 ++ ADD $f17, $f28, $f17 ++ MUL $f30, $f7, $f25 ++ ++ ADD $f18, $f10, $f18 ++ MUL $f30, $f6, $f26 ++ ADD $f19, $f11, $f19 ++ MUL $f29, $f7, $f27 ++ ++ ST $f16, 0*SIZE($20) ++ ADD1 $f20, $f21, $f16 ++ ST $f17, 1*SIZE($20) ++ ADD2 $f22, $f23, $f17 ++ ++ SXADDQ $21, $20, $20 ++ nop ++ ST $f18, 0*SIZE($20) ++ ADD1 $f24, $f25, $f18 ++ ++ ST $f19, 1*SIZE($20) ++ ADD2 $f26, $f27, $f19 ++ SXADDQ $21, $20, $20 ++ ADD $f16, $f12, $f16 ++ ++ ADD $f17, $f13, $f17 ++ ADD $f18, $f14, $f18 ++ ADD $f19, $f15, $f19 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ++ ST $f18, 0*SIZE($20) ++ ST $f19, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ ble $5, $SubEnd ++ .align 4 ++ ++$SubRemain: ++ subl $5, 1, $6 ++ ble $5, $SubEnd ++ LD $f0, 0*SIZE($18) ++ LD $f1, 1*SIZE($18) ++ ++ LD $f8, 0*SIZE($20) ++ LD $f28, 1*SIZE($20) ++ SXADDQ $19, $18, $18 ++ SXADDQ $21, $20, $24 ++ ble $6, $SubRemainLoopEnd ++ .align 4 ++ ++$SubRemainLoop: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ LD $f0, 0*SIZE($18) ++ ++ MUL $f29, $f1, $f23 ++ LD $f1, 1*SIZE($18) ++ ADD1 $f20, $f21, $f16 ++ SXADDQ $19, $18, $18 ++ ++ ADD2 $f22, $f23, $f17 ++ nop ++ ADD $f16, $f8, $f16 ++ LD $f8, 0*SIZE($24) ++ ++ ADD $f17, $f28, $f17 ++ LD $f28, 1*SIZE($24) ++ SXADDQ $21, $24, $24 ++ subl $6, 1, $6 ++ ++ ST $f16, 0*SIZE($20) ++ ST $f17, 1*SIZE($20) ++ SXADDQ $21, $20, $20 ++ bgt $6, $SubRemainLoop ++ .align 4 ++ ++$SubRemainLoopEnd: ++ MUL $f29, $f0, $f20 ++ MUL $f30, $f1, $f21 ++ MUL $f30, $f0, $f22 ++ MUL $f29, $f1, $f23 ++ ++ ADD1 $f20, $f21, $f16 ++ ADD2 $f22, $f23, $f17 ++ ADD $f16, $f8, $f16 ++ ADD $f17, $f28, $f17 ++ ++ ST $f16, 0*SIZE($20) ++ nop ++ ST $f17, 1*SIZE($20) ++ nop ++ .align 4 ++ ++$SubEnd: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ ldi $sp, 64($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zdot.S b/kernel/sw_64/zdot.S +new file mode 100644 +index 0000000..114a7a3 +--- /dev/null ++++ b/kernel/sw_64/zdot.S +@@ -0,0 +1,583 @@ ++/*********************************************************************/ ++/* 
Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define XX $21 ++#define YY $23 ++ ++#define I $5 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f2 ++#define s3 $f30 ++#define s4 $f3 ++ ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 ++ ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 ++ ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 24, $26, 0 ++ ++ ldi $sp, -24($sp) ++ fclr s0 ++ fstd $f2, 0($sp) ++ fstd $f3, 16($sp) ++ fclr s1 ++ ++ fclr s2 ++ addl INCX, INCX, INCX ++ fclr s3 ++ ble N, $L999 ++ ++ addl INCY, INCY, INCY ++ fclr t0 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ ++ srl N, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ LD b2, 0 * SIZE(Y) ++ LD b3, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ LD b4, 0 * SIZE(Y) ++ LD b5, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a6, 0 * SIZE(X) ++ LD b6, 0 * SIZE(Y) ++ ++ subl I, 1, I ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ fillcs PREFETCHSIZE * SIZE(X) ++ MUL a0, b1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ fillcs PREFETCHSIZE * SIZE(Y) ++ MUL a1, b0, t2 ++ SXADDQ INCY, Y, Y ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ #unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ unop ++ MUL a5, b4, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ unop ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ unop ++ MUL a1, b0, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ 
fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ unop ++ MUL a5, b4, t2 ++ subl I, 1, I ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD b6, 0 * SIZE(Y) ++ MUL a7, b7, t3 ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ unop ++ MUL a1, b0, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ unop ++ MUL a5, b4, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ unop ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ MUL a1, b1, t3 ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ MUL a2, b2, t0 ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ MUL a2, b3, t1 ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ MUL a3, b2, t2 ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ MUL a4, b5, t1 ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ MUL a5, b4, t2 ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ MUL a5, b5, t3 ++ ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ MUL a6, b6, t0 ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ MUL a6, b7, t1 ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ MUL a7, b6, t2 ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L998 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ subl I, 1, I ++ SXADDQ INCY, Y, Y 
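++/* Remainder path: the final (N & 7) complex elements are accumulated one
++ * pair of products per iteration in the $L26 loop below. */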
++ ble I, $L28 ++ .align 4 ++ ++$L26: ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ mov X, XX ++ MUL a0, b0, t0 ++ mov Y, YY ++ ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ LD a0, 0 * SIZE(XX) ++ MUL a1, b0, t2 ++ LD b0, 0 * SIZE(YY) ++ ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ subl I, 1, I ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(XX) ++ ++ LD b1, 1 * SIZE(YY) ++ bgt I, $L26 ++ .align 4 ++ ++$L28: ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ MUL a0, b1, t1 ++ ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ MUL a1, b1, t3 ++ .align 4 ++ ++$L998: ++ ADD s0, t0, s4 ++ fmov s4,s0 ++ ADD s1, t1, s4 ++ fmov s4,s1 ++ ADD s2, t2, s4 ++ fmov s4,s2 ++ ADD s3, t3, s4 ++ fmov s4,s3 ++ ++#ifndef CONJ ++ SUB s0, s3, s4 ++ fmov s4,s0 ++ ADD s1, s2, s4 ++ fmov s4,s1 ++#else ++ ADD s0, s3, s4 ++ fmov s4,s0 ++ SUB s1, s2, s4 ++ fmov s4,s1 ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 16($sp) ++ ldi $sp, 24($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/zdot.S.bak b/kernel/sw_64/zdot.S.bak +new file mode 100644 +index 0000000..d10673c +--- /dev/null ++++ b/kernel/sw_64/zdot.S.bak +@@ -0,0 +1,500 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define XX $21 ++#define YY $23 ++ ++#define I $5 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f2 ++#define s3 $f30 ++ ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 ++ ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 ++ ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 16, $26, 0 ++ ++ ldi $sp, -16($sp) ++ fclr s0 ++ fstd $f2, 0($sp) ++ fclr s1 ++ ++ fclr s2 ++ addl INCX, INCX, INCX ++ fclr s3 ++ ble N, $L999 ++ ++ addl INCY, INCY, INCY ++ fclr t0 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ ++ srl N, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ LD b2, 0 * SIZE(Y) ++ LD b3, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ LD b4, 0 * SIZE(Y) ++ LD b5, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a6, 0 * SIZE(X) ++ LD b6, 0 * SIZE(Y) ++ ++ subl I, 1, I ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ fillcs PREFETCHSIZE * SIZE(X) ++ MUL a0, b1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fillcs PREFETCHSIZE * SIZE(Y) ++ MUL a1, b0, t2 ++ SXADDQ INCY, Y, Y ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a1, b0, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, 
b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ subl I, 1, I ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD b6, 0 * SIZE(Y) ++ MUL a7, b7, t3 ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD s0, t0, s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a1, b0, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s3 ++ MUL a1, b1, t3 ++ ++ ADD s0, t0, s0 ++ MUL a2, b2, t0 ++ ADD s1, t1, s1 ++ MUL a2, b3, t1 ++ ++ ADD s2, t2, s2 ++ MUL a3, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a4, b5, t1 ++ ++ ADD s2, t2, s2 ++ MUL a5, b4, t2 ++ ADD s3, t3, s3 ++ MUL a5, b5, t3 ++ ++ ADD s0, t0, s0 ++ MUL a6, b6, t0 ++ ADD s1, t1, s1 ++ MUL a6, b7, t1 ++ ++ ADD s2, t2, s2 ++ MUL a7, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L998 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ subl I, 1, I ++ SXADDQ INCY, Y, Y ++ ble I, $L28 ++ .align 4 ++ ++$L26: ++ ADD s0, t0, s0 ++ mov X, XX ++ MUL a0, b0, t0 ++ mov Y, YY ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ LD a0, 0 * SIZE(XX) ++ MUL a1, b0, t2 ++ LD b0, 0 * SIZE(YY) ++ ++ ADD s3, t3, s3 ++ subl I, 1, I ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(XX) ++ ++ LD b1, 1 * SIZE(YY) ++ bgt I, $L26 ++ .align 4 ++ ++$L28: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a0, b1, t1 ++ ++ ADD s2, t2, s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s3 ++ MUL a1, b1, t3 ++ .align 4 ++ ++$L998: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++#ifndef CONJ ++ SUB s0, s3, s0 ++ ADD s1, s2, s1 ++#else ++ ADD s0, s3, s0 ++ SUB s1, s2, s1 ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ ldi $sp, 16($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/zdot_simd.S b/kernel/sw_64/zdot_simd.S +new file mode 100644 
+index 0000000..ed775e6 +--- /dev/null ++++ b/kernel/sw_64/zdot_simd.S +@@ -0,0 +1,699 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define XX $21 ++#define YY $23 ++ ++#define I $5 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f2 ++#define s3 $f30 ++ ++#define a0 $f10 ++#define a1 $f11 ++#define a2 $f12 ++#define a3 $f13 ++#define a4 $f14 ++#define a5 $f15 ++#define a6 $f16 ++#define a7 $f17 ++ ++#define b0 $f18 ++#define b1 $f19 ++#define b2 $f20 ++#define b3 $f21 ++#define b4 $f22 ++#define b5 $f23 ++#define b6 $f24 ++#define b7 $f25 ++ ++#define t0 $f26 ++#define t1 $f27 ++#define t2 $f28 ++#define t3 $f29 ++ ++#define t4 $f3 ++#define t5 $f4 ++#define t6 $f5 ++#define t7 $f6 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 40, $26, 0 ++ ++ ldi $sp, -40($sp) ++ fclr s0 ++ fstd $f2, 0($sp) ++ fclr s1 ++ ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ ++ fclr s2 ++ addl INCX, INCX, INCX ++ fclr s3 ++ ble N, $L999 ++ ++ addl INCY, INCY, INCY ++ fclr t0 ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ ++ cmpeq INCX, 2, $21 ++ cmpeq INCY, 2, $22 ++ and $21, $22, $22 ++ beq $22, $Sub ++ ++/* ++ test the address of Y & X ++*/ ++ and Y, (VEC_LEN*SIZE-1), $4 ++ and X, (VEC_LEN*SIZE-1), $3 ++ or $3, $4, $4 ++ bne $4, $UnAlign_ACCESS ++ ++/*Align access*/ ++/*UnLoop 8*/ ++ srl N, 3, I ++ ble I, $Remain ++ .align 4 ++ vcpys $f31, $f31, s0 #clear s0 vector ++ vcpys $f31, $f31, s1 #clear s0 vector ++ vcpys $f31, $f31, s2 #clear s0 vector ++ vcpys $f31, $f31, s3 #clear s0 vector ++ ++ vcpys $f31, $f31, t0 ++ vcpys $f31, $f31, t1 ++ vcpys $f31, $f31, t2 ++ vcpys $f31, $f31, t3 ++ ++$MainLoop: ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ ++ VLD b0, 0*VEC_LEN*SIZE(Y) ++ VADD s0, t0, s0 ++ VLD b1, 1*VEC_LEN*SIZE(Y) ++ VADD s1, t1, s1 ++ ++ VLD b2, 2*VEC_LEN*SIZE(Y) ++ VADD s2, t2, s2 ++ VLD b3, 3*VEC_LEN*SIZE(Y) ++ VADD s3, t3, s3 ++ ++/*spilt the X complex vector to real vector(a0, a2) and image vector (a1, a3) ++ Y complex vectory to real vector(b0, b2) and image vector (b1, b3) ++*/ ++ vextf a0, 1, a4 ++ vextf a0, 3, a5 ++ vextf a1, 0, a6 ++ vextf a1, 2, a7 ++ ++ vextf a2, 1, t0 ++ vextf a2, 3, t1 ++ vextf a3, 0, t2 ++ vextf a3, 2, t3 ++ ++ vextf b0, 1, b4 ++ vextf b0, 3, b5 ++ vextf b1, 0, b6 ++ vextf b1, 2, b7 ++ ++ vextf b2, 1, t4 ++ vextf b2, 3, t5 ++ vextf b3, 0, t6 ++ vextf b3, 2, t7 ++ ++ vinsf a4, a1, 0, a1 ++ vinsf a6, a0, 1, a0 ++ vinsf t0, a3, 0, a3 ++ vinsf t2, a2, 1, a2 ++ ++ vinsf b4, b1, 0, b1 ++ addl X, 16 * SIZE, X ++ vinsf b6, b0, 1, b0 ++ addl Y, 16 * SIZE, Y ++ ++ vinsf t4, b3, 0, b3 ++ subl I, 1, I ++ vinsf t6, b2, 1, b2 ++ nop ++ ++ vinsf a5, a1, 2, a1 ++ vinsf a7, a0, 3, a0 ++ vinsf t1, a3, 2, a3 ++ vinsf t3, a2, 3, a2 ++ ++ vinsf b5, b1, 2, b1 ++ vinsf b7, b0, 3, b0 ++ vinsf t5, b3, 2, b3 ++ vinsf t7, b2, 3, b2 ++ ++ /*Computing*/ ++ ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ VMAD a0, b0, s0, s0 ++ fillcs PREFETCHSIZE * SIZE(Y) ++ VMAD a0, b1, s1, s1 ++ ++ VMAD a1, b0, s2, s2 ++ VMAD a1, b1, s3, s3 ++ VMUL a2, b2, t0 /*Just multiply. 
Add it in next loop.*/ ++ VMUL a2, b3, t1 ++ ++ VMUL a3, b2, t2 ++ VMUL a3, b3, t3 ++ nop ++ bgt I, $MainLoop ++ .align 4 ++$MainLoopEnd: ++ VADD s0, t0, s0 ++ VADD s1, t1, s1 ++ VADD s2, t2, s2 ++ VADD s3, t3, s3 ++ ++#ifndef CONJ ++ VSUB s0, s3, s0 ++ VADD s1, s2, s1 ++#else ++ VADD s0, s3, s0 ++ VSUB s1, s2, s1 ++#endif ++ vcpys $f31, $f31, s2 #clear s0 vector ++ vcpys $f31, $f31, s3 #clear s0 vector ++ ++ vextf s0, 1, t1 ++ vextf s0, 2, t2 ++ vextf s0, 3, t3 ++ vextf s1, 1, t5 ++ ++ vextf s1, 2, t6 ++ vextf s1, 3, t7 ++ ADD s0, t1, s0 ++ ADD t2, t3, t0 ++ ++ ADD s1, t5, s1 ++ ADD t6, t7, t4 ++ ADD s0, t0, s0 ++ ADD s1, t4, s1 ++$Remain: ++ and N, 7, I ++ ble I, $End ++ .align 4 ++$RemainLoop: ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ subl I, 1, I ++ SXADDQ INCY, Y, Y ++ MAD a0, b0, s0, s0 ++ ++ MAD a0, b1, s1, s1 ++ MAD a1, b0, s2, s2 ++ MAD a1, b1, s3, s3 ++ bgt I, $RemainLoop ++ .align 4 ++ ++#ifndef CONJ ++ SUB s0, s3, s0 ++ ADD s1, s2, s1 ++#else ++ ADD s0, s3, s0 ++ SUB s1, s2, s1 ++#endif ++ ++$End: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ ldi $sp, 40($sp) ++ ret ++ ++ .align 4 ++ ++$UnAlign_ACCESS: ++$Sub: ++ srl N, 3, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ LD b2, 0 * SIZE(Y) ++ LD b3, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ LD b4, 0 * SIZE(Y) ++ LD b5, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ LD a6, 0 * SIZE(X) ++ LD b6, 0 * SIZE(Y) ++ ++ subl I, 1, I ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD s0, t0, s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ fillcs PREFETCHSIZE * SIZE(X) ++ MUL a0, b1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, s2 ++ fillcs PREFETCHSIZE * SIZE(Y) ++ MUL a1, b0, t2 ++ SXADDQ INCY, Y, Y ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a1, b0, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD 
s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ subl I, 1, I ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD b6, 0 * SIZE(Y) ++ MUL a7, b7, t3 ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD s0, t0, s0 ++ LD a7, 1 * SIZE(X) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a1, b0, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a0, 0 * SIZE(X) ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b0, 0 * SIZE(Y) ++ MUL a2, b2, t0 ++ LD b1, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a2, b3, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a3, b2, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a2, 0 * SIZE(X) ++ MUL a3, b3, t3 ++ LD a3, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b2, 0 * SIZE(Y) ++ MUL a4, b4, t0 ++ LD b3, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a4, b5, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a5, b4, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a4, 0 * SIZE(X) ++ MUL a5, b5, t3 ++ LD a5, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b4, 0 * SIZE(Y) ++ MUL a6, b6, t0 ++ LD b5, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a6, b7, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ unop ++ MUL a7, b6, t2 ++ unop ++ ++ ADD s3, t3, s3 ++ LD a6, 0 * SIZE(X) ++ MUL a7, b7, t3 ++ LD a7, 1 * SIZE(X) ++ ++ ADD s0, t0, s0 ++ LD b6, 0 * SIZE(Y) ++ MUL a0, b0, t0 ++ LD b7, 1 * SIZE(Y) ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s3 ++ MUL a1, b1, t3 ++ ++ ADD s0, t0, s0 ++ MUL a2, b2, t0 ++ ADD s1, t1, s1 ++ MUL a2, b3, t1 ++ ++ ADD s2, t2, s2 ++ MUL a3, b2, t2 ++ ADD s3, t3, s3 ++ MUL a3, b3, t3 ++ ++ ADD s0, t0, s0 ++ MUL a4, b4, t0 ++ ADD s1, t1, s1 ++ MUL a4, b5, t1 ++ ++ ADD s2, t2, s2 ++ MUL a5, b4, t2 ++ ADD s3, t3, s3 ++ MUL a5, b5, t3 ++ ++ ADD s0, t0, s0 ++ MUL a6, b6, t0 ++ ADD s1, t1, s1 ++ MUL a6, b7, t1 ++ ++ ADD s2, t2, s2 ++ MUL a7, b6, t2 ++ ADD s3, t3, s3 ++ MUL a7, b7, t3 ++ .align 4 ++ ++$L25: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L998 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ LD b0, 0 * SIZE(Y) ++ LD b1, 1 * SIZE(Y) ++ ++ SXADDQ INCX, X, X ++ subl I, 1, I ++ SXADDQ INCY, Y, Y ++ ble I, $L28 ++ .align 4 ++ ++$L26: ++ ADD s0, t0, s0 ++ mov X, XX ++ MUL a0, b0, t0 ++ mov Y, YY ++ ++ ADD s1, t1, s1 ++ SXADDQ INCX, X, X ++ MUL a0, b1, t1 ++ SXADDQ INCY, Y, Y ++ ++ ADD s2, t2, s2 ++ LD a0, 0 * SIZE(XX) ++ MUL a1, b0, t2 ++ LD b0, 0 * SIZE(YY) ++ ++ ADD s3, t3, s3 ++ subl I, 1, I ++ MUL a1, b1, t3 ++ LD a1, 1 * SIZE(XX) ++ ++ LD b1, 1 * SIZE(YY) ++ bgt I, $L26 ++ .align 4 ++ ++$L28: ++ ADD s0, t0, s0 ++ MUL a0, b0, t0 ++ ADD s1, t1, s1 ++ MUL a0, b1, t1 ++ ++ ADD s2, t2, s2 ++ MUL a1, b0, t2 ++ ADD s3, t3, s3 ++ MUL a1, b1, t3 ++ .align 4 ++ ++$L998: ++ ADD s0, t0, s0 ++ ADD s1, t1, s1 ++ ADD s2, t2, s2 ++ ADD s3, t3, s3 ++ ++#ifndef CONJ 
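++/* Reference for the reduction below (illustrative C, not part of the build):
++ *     double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
++ *     for (i = 0; i < n; i++) {
++ *         s0 += x_re[i] * y_re[i];  s1 += x_re[i] * y_im[i];
++ *         s2 += x_im[i] * y_re[i];  s3 += x_im[i] * y_im[i];
++ *     }
++ *     dotu (no CONJ): result = (s0 - s3) + i*(s1 + s2)
++ *     dotc (CONJ)   : result = (s0 + s3) + i*(s1 - s2)
++ */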
++ SUB s0, s3, s0 ++ ADD s1, s2, s1 ++#else ++ ADD s0, s3, s0 ++ SUB s1, s2, s1 ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ ++ fldd $f6, 32($sp) ++ ldi $sp, 40($sp) ++ ret ++ ++ EPILOGUE +diff --git a/kernel/sw_64/zgemm_beta.S b/kernel/sw_64/zgemm_beta.S +new file mode 100644 +index 0000000..18f845c +--- /dev/null ++++ b/kernel/sw_64/zgemm_beta.S +@@ -0,0 +1,192 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ .set noat ++ .set noreorder ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++CNAME: ++ .frame $sp, 0, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $28, _mcount ++ jsr $28, ($28), _mcount ++ .prologue 1 ++#else ++ .prologue 0 ++#endif ++ ++ ldl $18, 24($sp) ++ ble $16, $End ++ ldl $19, 32($sp) ++ ble $17, $End ++ ++ addl $19, $19, $19 ++ fbne $f19,$Main ++ fbne $f20,$Main ++ .align 4 ++ ++$L13: ++ mov $18, $1 ++ ldi $17, -1($17) ++ SXADDQ $19, $18, $18 ++ mov $16, $2 ++ .align 4 ++ ++$L12: ++ ST $f31, 0*SIZE($1) ++ ST $f31, 1*SIZE($1) ++ ldi $2, -1($2) ++ ldi $1, 2*SIZE($1) ++ bgt $2, $L12 ++ bgt $17,$L13 ++ clr $0 ++ ret ++ .align 4 ++ ++/* Main Routine */ ++$Main: ++ sra $16, 1, $2 # $2 = (m >> 1) ++ mov $18, $1 # c_offset = c ++ ldi $17, -1($17) # n -- ++ SXADDQ $19, $18, $18 # c += ldc ++ beq $2, $L18 ++ ++ LD $f14, 0*SIZE($1) ++ LD $f15, 1*SIZE($1) ++ LD $f24, 2*SIZE($1) ++ LD $f25, 3*SIZE($1) ++ ldi $2, -1($2) # $2 -- ++ ble $2, $L19 ++ .align 4 ++ ++ ++$L23: ++ MUL $f19, $f14, $f10 ++ fillcs 9*SIZE($1) ++ MUL $f20, $f15, $f11 ++ ldi $2, -1($2) ++ ++ MUL $f19, $f15, $f12 ++ LD $f15, 5*SIZE($1) ++ MUL $f20, $f14, $f13 ++ LD $f14, 4*SIZE($1) ++ ++ MUL $f19, $f24, $f16 ++ unop ++ MUL $f20, $f25, $f17 ++ unop ++ ++ MUL $f19, $f25, $f18 ++ LD $f25, 7*SIZE($1) ++ SUB $f10, $f11, $f22 ++ unop ++ ++ MUL $f20, $f24, $f21 ++ LD $f24, 6*SIZE($1) ++ ADD $f12, $f13, $f23 ++ ldi $1, 4*SIZE($1) ++ ++ SUB $f16, $f17, $f26 ++ ADD $f18, $f21, $f27 ++ ST $f22,-4*SIZE($1) ++ ST $f23,-3*SIZE($1) ++ ++ ST $f26,-2*SIZE($1) ++ ST $f27,-1*SIZE($1) ++ unop ++ bgt $2,$L23 ++ .align 4 ++ ++$L19: ++ MUL $f19, $f14, $f10 ++ MUL $f20, $f15, $f11 ++ MUL $f19, $f15, $f12 ++ MUL $f20, $f14, $f13 ++ ++ MUL $f19, $f24, $f16 ++ MUL $f20, $f25, $f17 ++ MUL $f19, $f25, $f18 ++ MUL $f20, $f24, $f21 ++ ++ SUB $f10, $f11, $f22 ++ ADD $f12, $f13, $f23 ++ SUB $f16, $f17, $f26 ++ ADD $f18, $f21, $f27 ++ ldi $1, 4*SIZE($1) ++ ++ ST $f22, -4*SIZE($1) ++ ST $f23, -3*SIZE($1) ++ ST $f26, -2*SIZE($1) ++ ST $f27, -1*SIZE($1) ++ ++ blbs $16, $L18 ++ bgt $17, $Main ++ clr $0 ++ ret ++ .align 4 ++ ++$L18: ++ LD $f14, 0*SIZE($1) ++ LD $f15, 1*SIZE($1) ++ MUL $f19, $f15, $f13 ++ MUL $f20, $f14, $f10 ++ ++ MUL $f19, $f14, $f12 ++ MUL $f20, $f15, $f11 ++ ADD $f13, $f10, $f26 ++ SUB $f12, $f11, $f27 ++ ++ ST $f26, 1*SIZE($1) ++ ST $f27, 0*SIZE($1) ++ ldi $1, 2*SIZE($1) ++ bgt $17, $Main ++ .align 4 ++ ++$End: ++ clr $0 ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/zgemm_kernel_2x2.S b/kernel/sw_64/zgemm_kernel_2x2.S +new file mode 100644 +index 0000000..6cf954b +--- /dev/null ++++ b/kernel/sw_64/zgemm_kernel_2x2.S +@@ -0,0 +1,1949 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 48 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++ .set noat ++ .set noreorder ++ .arch sw6a ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 88 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define BB $3 ++#define OFFSET $4 ++ ++#define tmp $9 ++ ++#define ALPHA_R 64($sp) ++#define ALPHA_I 72($sp) ++ ++#if defined(NN) || defined(NT) || defined(TN) || defined(TT) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef 
PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++#ifdef TRMMKERNEL ++ ldl OFFSET, 24 + STACKSIZE($sp) ++#endif ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ fstd $f19, ALPHA_R ++ fstd $f20, ALPHA_I ++ stl tmp, 80($sp) ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK ++#endif ++ ++ sra N, 1, J ++ ble J, $L30 ++ .align 4 ++ ++$L01: ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ s4addl K, 0, BB ++ ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ SXADDQ BB, B, BB ++ addl C2, LDC, C ++ unop ++ ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#ifndef EV4 ++ fillcs 0 * SIZE(BB) ++ fillcs 8 * SIZE(BB) ++ unop ++ ldi BB, 16 * SIZE(BB) ++#endif ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble L, $L15 ++#else ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, a6 ++ fmov a6, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, a6 ++ fmov a6, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, a6 ++ fmov a6, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, a6 ++ fmov a6, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, a6 ++ fmov a6, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, a6 ++ fmov a6, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, a6 ++ fmov a6, c03 ++ unop ++ MUL b3, 
a1, t1 ++ unop ++ ++ ADD3 c04, t2, a6 ++ fmov a6, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, a6 ++ fmov a6, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, a6 ++ fmov a6, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, a6 ++ fmov a6, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ FIMOVD a6, tmp ++ ++ ADD3 c10, t2, a6 ++ fmov a6, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, a6 ++ fmov a6, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, a6 ++ fmov a6, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, a6 ++ fmov a6, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, a6 ++ fmov a6, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, a6 ++ fmov a6, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, a6 ++ fmov a6, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ unop ++ IFMOVD tmp, a6 ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, a6 ++ fmov a6, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, a6 ++ fmov a6, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, a6 ++ fmov a6, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, a6 ++ fmov a6, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, a6 ++ fmov a6, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, a6 ++ fmov a6, c13 ++ unop ++ IFMOVD tmp, a6 ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, a6 ++ fmov a6, c09 ++ unop ++ IFMOVD tmp, a6 ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, a6 ++ fmov a6, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, a6 ++ fmov a6, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, a6 ++ fmov a6, c07 ++ IFMOVD tmp, a6 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, a6 ++ fmov a6, c11 ++ fldd alpha_r, ALPHA_R ++ FIMOVD alpha_r, tmp ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L18 ++#else ++ blbs TMP1, $L18 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, a6 ++ fmov a6, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, a6 ++ fmov a6, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, a6 ++ fmov a6, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, a6 ++ fmov a6, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, a6 ++ fmov a6, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, a6 ++ fmov a6, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, a6 ++ fmov a6, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, a6 ++ fmov a6, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, a6 ++ fmov a6, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, a6 ++ fmov a6, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, a6 ++ fmov a6, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, a6 ++ fmov a6, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, a6 ++ fmov a6, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, a6 ++ fmov a6, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L18: ++ ADD3 
c12, t2, a6 ++ fmov a6, c12 ++ unop ++ MUL b1, a2, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD2 c16, t3, a6 ++ fmov a6, c16 ++ unop ++ MUL b2, a2, t3 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD4 c15, t4, a6 ++ fmov a6, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL b1, a4, t2 ++#ifndef TRMMKERNEL ++ LD b1, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c06, t3, a6 ++ fmov a6, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, a6 ++ fmov a6, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, a6 ++ fmov a6, c03 ++ unop ++ MUL b3, a1, t1 ++#ifndef TRMMKERNEL ++ LD a1, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD3 c04, t2, a6 ++ fmov a6, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, a6 ++ fmov a6, c08 ++ unop ++ MUL b4, a2, t3 ++#ifndef TRMMKERNEL ++ LD a2, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD4 c13, t4, a6 ++ fmov a6, c13 ++ unop ++ MUL b2, a3, t4 ++#ifndef TRMMKERNEL ++ LD b2, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD1 c09, t1, a6 ++ fmov a6, c09 ++ ldi I, -1(I) ++ MUL b3, a3, t1 ++ unop ++ ++ ADD3 c10, t2, a6 ++ fmov a6, c10 ++ unop ++ MUL b3, a4, t2 ++#ifndef TRMMKERNEL ++ LD b3, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD2 c14, t3, a6 ++ fmov a6, c14 ++ unop ++ MUL b4, a4, t3 ++#ifndef TRMMKERNEL ++ LD a4, 2 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD4 c07, t4, a6 ++ fmov a6, c07 ++ unop ++ MUL b4, a3, t4 ++#ifndef TRMMKERNEL ++ LD a3, 3 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD1 c11, t1, a6 ++ fmov a6, c11 ++ ADD3 c12, t2, a6 ++ fmov a6, c12 ++ ADD2 c16, t3, a6 ++ fmov a6, c16 ++ ADD4 c15, t4, a6 ++ fmov a6, c15 ++ ++ ADD c01, c06, a6 ++ fmov a6, c01 ++ ADD c02, c05, a6 ++ fmov a6, c02 ++ ADD c03, c08, a6 ++ fmov a6, c03 ++ ADD c04, c07, a6 ++ fmov a6, c04 ++ ++ ADD c09, c14, a6 ++ fmov a6, c09 ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c01, t1 ++ ADD c10, c13, a6 ++ fmov a6, c10 ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c02, t2 ++ ++ ADD c11, c16, a6 ++ fmov a6, c11 ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c03, t3 ++ ADD c12, c15, a6 ++ fmov a6, c12 ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c04, t4 ++ ++#ifndef TRMMKERNEL ++ ADD a5, t1, a6 ++ fmov a6, a5 ++ MUL alpha_i, c02, t1 ++ ADD b1, t2, a6 ++ fmov a6, b1 ++ MUL alpha_i, c01, t2 ++ ++ ADD a1, t3, a6 ++ fmov a6, a1 ++ MUL alpha_i, c04, t3 ++ ADD a2, t4, a6 ++ fmov a6, a2 ++ MUL alpha_i, c03, t4 ++#else ++ ADD $f31, t1, a5 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, b1 ++ MUL alpha_i, c01, t2 ++ ++ ADD $f31, t3, a1 ++ MUL alpha_i, c04, t3 ++ ADD $f31, t4, a2 ++ MUL alpha_i, c03, t4 ++#endif ++ ++ SUB a5, t1, a6 ++ fmov a6, a5 ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c09, t1 ++ ADD b1, t2, a6 ++ fmov a6, b1 ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c10, t2 ++ ++ SUB a1, t3, a6 ++ fmov a6, a1 ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c11, t3 ++ ADD a2, t4, a6 ++ fmov a6, a2 ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c12, t4 ++ ++#ifndef TRMMKERNEL ++ ADD b2, t1, a6 ++ fmov a6, b2 ++ MUL alpha_i, c10, t1 ++ ADD b3, t2, a6 ++ fmov a6, b3 ++ MUL alpha_i, c09, t2 ++ ++ ADD a4, t3, a6 ++ fmov a6, a4 ++ MUL alpha_i, c12, t3 ++ ADD a3, t4, a6 ++ fmov a6, a3 ++ MUL alpha_i, c11, t4 ++#else ++ ADD $f31, t1, b2 ++ MUL alpha_i, c10, t1 ++ ADD $f31, t2, b3 ++ MUL alpha_i, c09, t2 ++ ++ ADD $f31, t3, a4 ++ MUL alpha_i, c12, t3 ++ ADD $f31, t4, a3 ++ MUL alpha_i, c11, t4 ++#endif ++ ++ SUB b2, t1, a6 ++ fmov a6, b2 ++ ST a5, 0 * SIZE(C1) ++ fclr t1 ++ unop ++ ++ ADD b3, t2, a6 ++ fmov a6, b3 ++ ST b1, 1 * SIZE(C1) ++ fclr t2 ++ unop ++ ++ SUB 
a4, t3, a6 ++ fmov a6, a4 ++ ST a1, 2 * SIZE(C1) ++ fclr t3 ++ unop ++ ++ ADD a3, t4, a6 ++ fmov a6, a3 ++ ST a2, 3 * SIZE(C1) ++ fclr t4 ++ unop ++ ++ ST b2, 0 * SIZE(C2) ++ fclr c01 ++ ST b3, 1 * SIZE(C2) ++ fclr c05 ++ ++ ST a4, 2 * SIZE(C2) ++ ldi C1, 4 * SIZE(C1) ++ ST a3, 3 * SIZE(C2) ++ ldi C2, 4 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 1, I ++ ble I, $L29 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ble L, $L25 ++#else ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, a6 ++ fmov a6, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, a6 ++ fmov a6, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, a6 ++ fmov a6, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, a6 ++ fmov a6, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, a6 ++ fmov a6, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, a6 ++ fmov a6, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, a6 ++ fmov a6, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, a6 ++ fmov a6, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, a6 ++ fmov a6, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, a6 ++ fmov a6, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, a6 ++ fmov a6, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, a6 ++ fmov a6, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, a6 ++ fmov a6, c09 ++ fldd alpha_r, ALPHA_R ++ FIMOVD alpha_r, tmp ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L28 ++#else ++ blbs TMP1, $L28 ++#endif ++ .align 4 
++ ++ ADD3 c10, t2, a6 ++ fmov a6, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, a6 ++ fmov a6, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, a6 ++ fmov a6, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, a6 ++ fmov a6, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, a6 ++ fmov a6, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, a6 ++ fmov a6, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L28: ++ ADD3 c10, t2, a6 ++ fmov a6, c10 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c13, t3, a6 ++ fmov a6, c13 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c03, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c14, t4, a6 ++ fmov a6, c14 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c04, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ unop ++ MUL a1, b3, t1 ++#ifndef TRMMKERNEL ++ LD c11, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a2, b3, t2 ++#ifndef TRMMKERNEL ++ LD c12, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD4 c05, t3, a6 ++ fmov a6, c05 ++ MUL a1, b4, t3 ++ ADD2 c06, t4, a6 ++ fmov a6, c06 ++ MUL a2, b4, t4 ++ ++ ADD1 c09, t1, a6 ++ fmov a6, c09 ++ ADD3 c10, t2, a6 ++ fmov a6, c10 ++ ADD4 c13, t3, a6 ++ fmov a6, c13 ++ ADD2 c14, t4, a6 ++ fmov a6, c14 ++ ++ ADD c01, c06, a6 ++ fmov a6, c01 ++ ADD c02, c05, a6 ++ fmov a6, c02 ++ ADD c09, c14, a6 ++ fmov a6, c09 ++ ADD c10, c13, a6 ++ fmov a6, c10 ++ ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_r, c09, t3 ++ MUL alpha_r, c10, t4 ++ ++#ifndef TRMMKERNEL ++ ADD c03, t1, a6 ++ fmov a6, c03 ++ MUL alpha_i, c02, t1 ++ ADD c04, t2, a6 ++ fmov a6, c04 ++ MUL alpha_i, c01, t2 ++ ++ ADD c11, t3, a6 ++ fmov a6, c11 ++ MUL alpha_i, c10, t3 ++ ADD c12, t4, a6 ++ fmov a6, c12 ++ MUL alpha_i, c09, t4 ++#else ++ ADD $f31, t1, c03 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, c04 ++ MUL alpha_i, c01, t2 ++ ++ ADD $f31, t3, c11 ++ MUL alpha_i, c10, t3 ++ ADD $f31, t4, c12 ++ MUL alpha_i, c09, t4 ++#endif ++ ++ SUB c03, t1, a6 ++ fmov a6, c03 ++ ADD c04, t2, a6 ++ fmov a6, c04 ++ SUB c11, t3, a6 ++ fmov a6, c11 ++ ADD c12, t4, a6 ++ fmov a6, c12 ++ ++ ST c03, 0 * SIZE(C1) ++ ST c04, 1 * SIZE(C1) ++ ST c11, 0 * SIZE(C2) ++ ST c12, 1 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif ++ .align 4 ++ ++$L29: ++ mov BO, B ++ ldi J, -1(J) ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 2, KK ++#else ++ unop ++#endif ++ bgt J, $L01 ++ .align 4 ++ ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++ mov C, C1 ++ mov A, AO ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 1, I ++ ble I, $L50 ++ .align 4 ++ ++$L41: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef 
TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c04 ++ fclr c08 ++ ble L, $L45 ++#else ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, a6 ++ fmov a6, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, a6 ++ fmov a6, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, a6 ++ fmov a6, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, a6 ++ fmov a6, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, a6 ++ fmov a6, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, a6 ++ fmov a6, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, a6 ++ fmov a6, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, a6 ++ fmov a6, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, a6 ++ fmov a6, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, a6 ++ fmov a6, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, a6 ++ fmov a6, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, a6 ++ fmov a6, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, a6 ++ fmov a6, c05 ++ fldd alpha_r, ALPHA_R ++ FIMOVD alpha_r, tmp ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L48 ++#else ++ blbs TMP1, $L48 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, a6 ++ fmov a6, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, a6 ++ fmov a6, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, a6 ++ fmov a6, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, a6 ++ fmov a6, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, a6 ++ fmov a6, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, a6 ++ fmov a6, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L48: ++ ADD2 c06, t2, a6 ++ fmov a6, c06 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c07, t3, a6 
++ fmov a6, c07 ++ ldi I, -1(I) ++ MUL a3, b1, t3 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c08, t4, a6 ++ fmov a6, c08 ++ unop ++ MUL a4, b1, t4 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ unop ++ MUL a1, b2, t1 ++#ifndef TRMMKERNEL ++ LD c11, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a2, b2, t2 ++#ifndef TRMMKERNEL ++ LD c12, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c03, t3, a6 ++ fmov a6, c03 ++ MUL a3, b2, t3 ++ ADD3 c04, t4, a6 ++ fmov a6, c04 ++ MUL a4, b2, t4 ++ ++ ADD4 c05, t1, a6 ++ fmov a6, c05 ++ ADD2 c06, t2, a6 ++ fmov a6, c06 ++ ADD4 c07, t3, a6 ++ fmov a6, c07 ++ ADD2 c08, t4, a6 ++ fmov a6, c08 ++ ++ ADD c01, c06, a6 ++ fmov a6, c01 ++ ADD c02, c05, a6 ++ fmov a6, c02 ++ ADD c03, c08, a6 ++ fmov a6, c03 ++ ADD c04, c07, a6 ++ fmov a6, c04 ++ ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_r, c03, t3 ++ MUL alpha_r, c04, t4 ++ ++#ifndef TRMMKERNEL ++ ADD c09, t1, a6 ++ fmov a6, c09 ++ MUL alpha_i, c02, t1 ++ ADD c10, t2, a6 ++ fmov a6, c10 ++ MUL alpha_i, c01, t2 ++ ++ ADD c11, t3, a6 ++ fmov a6, c11 ++ MUL alpha_i, c04, t3 ++ ADD c12, t4, a6 ++ fmov a6, c12 ++ MUL alpha_i, c03, t4 ++#else ++ ADD $f31, t1, c09 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, c10 ++ MUL alpha_i, c01, t2 ++ ++ ADD $f31, t3, c11 ++ MUL alpha_i, c04, t3 ++ ADD $f31, t4, c12 ++ MUL alpha_i, c03, t4 ++#endif ++ ++ SUB c09, t1, a6 ++ fmov a6, c09 ++ ADD c10, t2, a6 ++ fmov a6, c10 ++ SUB c11, t3, a6 ++ fmov a6, c11 ++ ADD c12, t4, a6 ++ fmov a6, c12 ++ ++ ST c09, 0 * SIZE(C1) ++ ST c10, 1 * SIZE(C1) ++ ST c11, 2 * SIZE(C1) ++ ST c12, 3 * SIZE(C1) ++ ++ ldi C1, 4 * SIZE(C1) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ ++ bgt I, $L41 ++ .align 4 ++ ++$L50: ++ and M, 1, I ++ ble I, $L999 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ble L, $L55 ++#else ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ 
ADD3 c02, t2, a6 ++ fmov a6, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, a6 ++ fmov a6, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, a6 ++ fmov a6, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, a6 ++ fmov a6, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, a6 ++ fmov a6, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ fldd alpha_r, ALPHA_R ++ FIMOVD alpha_r, tmp ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L58 ++#else ++ blbs TMP1, $L58 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, a6 ++ fmov a6, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, a6 ++ fmov a6, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L58: ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c05, t3, a6 ++ fmov a6, c05 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c03, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c06, t4, a6 ++ fmov a6, c06 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c04, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c01, t1, a6 ++ fmov a6, c01 ++ ADD3 c02, t2, a6 ++ fmov a6, c02 ++ ADD4 c05, t3, a6 ++ fmov a6, c05 ++ ADD2 c06, t4, a6 ++ fmov a6, c06 ++ ++ ADD c01, c06, a6 ++ fmov a6, c01 ++ ADD c02, c05, a6 ++ fmov a6, c02 ++ ++ IFMOVD tmp, alpha_r ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_i, c02, t3 ++ MUL alpha_i, c01, t4 ++ ++#ifndef TRMMKERNEL ++ ADD c03, t1, a6 ++ fmov a6, c03 ++ ADD c04, t2, a6 ++ fmov a6, c04 ++#else ++ ADD $f31, t1, c03 ++ ADD $f31, t2, c04 ++#endif ++ ++ SUB c03, t3, a6 ++ fmov a6, c03 ++ ADD c04, t4, a6 ++ fmov a6, c04 ++ ++ ST c03, 0 * SIZE(C1) ++ ST c04, 1 * SIZE(C1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldl $9, 80($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/zgemm_kernel_2x2.S.bak b/kernel/sw_64/zgemm_kernel_2x2.S.bak +new file mode 100644 +index 0000000..2133673 +--- /dev/null ++++ b/kernel/sw_64/zgemm_kernel_2x2.S.bak +@@ -0,0 +1,1704 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW2B ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ .set noat ++ .set noreorder ++ .arch ev6 ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define BB $3 ++#define OFFSET $4 ++ ++#define ALPHA_R 64($sp) ++#define ALPHA_I 72($sp) ++ ++#if defined(NN) || defined(NT) || defined(TN) || defined(TT) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++#ifdef TRMMKERNEL ++ ldl OFFSET, 24 + STACKSIZE($sp) ++#endif ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 
0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ fstd $f19, ALPHA_R ++ fstd $f20, ALPHA_I ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK ++#endif ++ ++ sra N, 1, J ++ ble J, $L30 ++ .align 4 ++ ++$L01: ++ mov C, C1 ++ addl C, LDC, C2 ++ mov A, AO ++ s4addl K, 0, BB ++ ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ SXADDQ BB, B, BB ++ addl C2, LDC, C ++ unop ++ ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#ifndef EV4 ++ fillcs 0 * SIZE(BB) ++ fillcs 8 * SIZE(BB) ++ unop ++ ldi BB, 16 * SIZE(BB) ++#endif ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble L, $L15 ++#else ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 
c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, c11 ++ fldd alpha_r, ALPHA_R ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L18 ++#else ++ blbs TMP1, $L18 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L18: ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++#ifndef TRMMKERNEL ++ LD a5, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++#ifndef TRMMKERNEL ++ LD b1, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++#ifndef TRMMKERNEL ++ LD a1, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++#ifndef TRMMKERNEL ++ LD a2, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++#ifndef TRMMKERNEL ++ LD b2, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD1 c09, t1, c09 ++ ldi I, -1(I) ++ MUL b3, a3, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++#ifndef TRMMKERNEL ++ LD b3, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++#ifndef TRMMKERNEL ++ LD a4, 2 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++#ifndef TRMMKERNEL 
++ LD a3, 3 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++ ADD c09, c14, c09 ++ MUL alpha_r, c01, t1 ++ ADD c10, c13, c10 ++ MUL alpha_r, c02, t2 ++ ++ ADD c11, c16, c11 ++ MUL alpha_r, c03, t3 ++ ADD c12, c15, c12 ++ MUL alpha_r, c04, t4 ++ ++#ifndef TRMMKERNEL ++ ADD a5, t1, a5 ++ MUL alpha_i, c02, t1 ++ ADD b1, t2, b1 ++ MUL alpha_i, c01, t2 ++ ++ ADD a1, t3, a1 ++ MUL alpha_i, c04, t3 ++ ADD a2, t4, a2 ++ MUL alpha_i, c03, t4 ++#else ++ ADD $f31, t1, a5 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, b1 ++ MUL alpha_i, c01, t2 ++ ++ ADD $f31, t3, a1 ++ MUL alpha_i, c04, t3 ++ ADD $f31, t4, a2 ++ MUL alpha_i, c03, t4 ++#endif ++ ++ SUB a5, t1, a5 ++ MUL alpha_r, c09, t1 ++ ADD b1, t2, b1 ++ MUL alpha_r, c10, t2 ++ ++ SUB a1, t3, a1 ++ MUL alpha_r, c11, t3 ++ ADD a2, t4, a2 ++ MUL alpha_r, c12, t4 ++ ++#ifndef TRMMKERNEL ++ ADD b2, t1, b2 ++ MUL alpha_i, c10, t1 ++ ADD b3, t2, b3 ++ MUL alpha_i, c09, t2 ++ ++ ADD a4, t3, a4 ++ MUL alpha_i, c12, t3 ++ ADD a3, t4, a3 ++ MUL alpha_i, c11, t4 ++#else ++ ADD $f31, t1, b2 ++ MUL alpha_i, c10, t1 ++ ADD $f31, t2, b3 ++ MUL alpha_i, c09, t2 ++ ++ ADD $f31, t3, a4 ++ MUL alpha_i, c12, t3 ++ ADD $f31, t4, a3 ++ MUL alpha_i, c11, t4 ++#endif ++ ++ SUB b2, t1, b2 ++ ST a5, 0 * SIZE(C1) ++ fclr t1 ++ unop ++ ++ ADD b3, t2, b3 ++ ST b1, 1 * SIZE(C1) ++ fclr t2 ++ unop ++ ++ SUB a4, t3, a4 ++ ST a1, 2 * SIZE(C1) ++ fclr t3 ++ unop ++ ++ ADD a3, t4, a3 ++ ST a2, 3 * SIZE(C1) ++ fclr t4 ++ unop ++ ++ ST b2, 0 * SIZE(C2) ++ fclr c01 ++ ST b3, 1 * SIZE(C2) ++ fclr c05 ++ ++ ST a4, 2 * SIZE(C2) ++ ldi C1, 4 * SIZE(C1) ++ ST a3, 3 * SIZE(C2) ++ ldi C2, 4 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 1, I ++ ble I, $L29 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 2, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ble L, $L25 ++#else ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * 
SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, c09 ++ fldd alpha_r, ALPHA_R ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L28 ++#else ++ blbs TMP1, $L28 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L28: ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c03, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c04, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++#ifndef TRMMKERNEL ++ LD c11, 0 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++#ifndef TRMMKERNEL ++ LD c12, 1 * SIZE(C2) ++#else ++ unop ++#endif ++ ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_r, c09, t3 ++ MUL alpha_r, c10, t4 ++ ++#ifndef TRMMKERNEL ++ ADD c03, t1, c03 ++ MUL alpha_i, c02, t1 ++ ADD c04, t2, c04 ++ MUL alpha_i, c01, t2 ++ ++ ADD c11, t3, c11 ++ MUL alpha_i, c10, t3 ++ ADD c12, t4, c12 ++ MUL alpha_i, c09, t4 ++#else ++ ADD $f31, t1, c03 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, c04 ++ MUL alpha_i, c01, t2 ++ ++ ADD $f31, t3, c11 ++ MUL alpha_i, c10, t3 ++ ADD $f31, t4, c12 ++ MUL alpha_i, c09, t4 ++#endif ++ ++ SUB c03, t1, c03 ++ ADD c04, t2, c04 ++ SUB c11, t3, c11 ++ ADD c12, t4, c12 ++ ++ ST c03, 0 * SIZE(C1) ++ ST c04, 1 * SIZE(C1) ++ ST c11, 0 * SIZE(C2) ++ ST c12, 1 * SIZE(C2) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 1, TMP1 ++#else ++ subl TMP1, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 
0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 1, KK ++#endif ++ .align 4 ++ ++$L29: ++ mov BO, B ++ ldi J, -1(J) ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 2, KK ++#else ++ unop ++#endif ++ bgt J, $L01 ++ .align 4 ++ ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++ mov C, C1 ++ mov A, AO ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 1, I ++ ble I, $L50 ++ .align 4 ++ ++$L41: ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 2, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ fclr c04 ++ fclr c08 ++ ble L, $L45 ++#else ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, c05 ++ fldd alpha_r, ALPHA_R ++ MUL b1, a1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L48 ++#else ++ blbs TMP1, $L48 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ 
++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L48: ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c07, t3, c07 ++ ldi I, -1(I) ++ MUL a3, b1, t3 ++#ifndef TRMMKERNEL ++ LD c09, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++#ifndef TRMMKERNEL ++ LD c10, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++#ifndef TRMMKERNEL ++ LD c11, 2 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++#ifndef TRMMKERNEL ++ LD c12, 3 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_r, c03, t3 ++ MUL alpha_r, c04, t4 ++ ++#ifndef TRMMKERNEL ++ ADD c09, t1, c09 ++ MUL alpha_i, c02, t1 ++ ADD c10, t2, c10 ++ MUL alpha_i, c01, t2 ++ ++ ADD c11, t3, c11 ++ MUL alpha_i, c04, t3 ++ ADD c12, t4, c12 ++ MUL alpha_i, c03, t4 ++#else ++ ADD $f31, t1, c09 ++ MUL alpha_i, c02, t1 ++ ADD $f31, t2, c10 ++ MUL alpha_i, c01, t2 ++ ++ ADD $f31, t3, c11 ++ MUL alpha_i, c04, t3 ++ ADD $f31, t4, c12 ++ MUL alpha_i, c03, t4 ++#endif ++ ++ SUB c09, t1, c09 ++ ADD c10, t2, c10 ++ SUB c11, t3, c11 ++ ADD c12, t4, c12 ++ ++ ST c09, 0 * SIZE(C1) ++ ST c10, 1 * SIZE(C1) ++ ST c11, 2 * SIZE(C1) ++ ST c12, 3 * SIZE(C1) ++ ++ ldi C1, 4 * SIZE(C1) ++ ++#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TMP1 ++#ifdef LEFT ++ subl TMP1, 2, TMP1 ++#else ++ subl TMP1, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ addl KK, 2, KK ++#endif ++ ++ bgt I, $L41 ++ .align 4 ++ ++$L50: ++ and M, 1, I ++ ble I, $L999 ++ ++#if !defined(TRMMKERNEL) || \ ++ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ ++ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) ++ ++#ifdef TRMMKERNEL ++#ifdef LEFT ++ addl KK, 1, TMP1 ++#else ++ addl KK, 1, TMP1 ++#endif ++#endif ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++#ifndef TRMMKERNEL ++ ldi L, -2(K) ++#else ++ ldi L, -2(TMP1) ++#endif ++ ble L, $L55 ++#else ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AO, TMP1, AO ++ addl B, TMP1, BO ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * 
SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, c01 ++ fldd alpha_r, ALPHA_R ++ MUL a1, b1, t1 ++#ifndef TRMMKERNEL ++ blbs K, $L58 ++#else ++ blbs TMP1, $L58 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L58: ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ fldd alpha_i, ALPHA_I ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b2, t3 ++#ifndef TRMMKERNEL ++ LD c03, 0 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++#ifndef TRMMKERNEL ++ LD c04, 1 * SIZE(C1) ++#else ++ unop ++#endif ++ ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ++ MUL alpha_r, c01, t1 ++ MUL alpha_r, c02, t2 ++ MUL alpha_i, c02, t3 ++ MUL alpha_i, c01, t4 ++ ++#ifndef TRMMKERNEL ++ ADD c03, t1, c03 ++ ADD c04, t2, c04 ++#else ++ ADD $f31, t1, c03 ++ ADD $f31, t2, c04 ++#endif ++ ++ SUB c03, t3, c03 ++ ADD c04, t4, c04 ++ ++ ST c03, 0 * SIZE(C1) ++ ST c04, 1 * SIZE(C1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/zgemm_kernel_simd_8x2.S b/kernel/sw_64/zgemm_kernel_simd_8x2.S +new file mode 100644 +index 0000000..f6a36fb +--- /dev/null ++++ b/kernel/sw_64/zgemm_kernel_simd_8x2.S +@@ -0,0 +1,3189 @@ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." 
++#endif ++ ++#define STACKSIZE 128 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define PREA $10 ++#define PREB $11 ++ ++#define AO $9 ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define a5 $f16 ++#define a6 $f24 ++#define a7 $f25 ++#define a8 $f26 ++ ++#define b5 $f27 ++#define b6 $f28 ++#define b7 $f29 ++#define b8 $f30 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TEMP $1 ++#define KK $2 ++#define BB $3 ++#define OFFSET $4 ++ ++#define ALPHA_R 64($sp) ++#define ALPHA_I 72($sp) ++ ++/* ++ *=================== ++ * (a+bi)*(c+di) ++ * ADD1 ac '+' bd ++ * ADD2 ad '+' bc ++ * FMAD5 a*alpha_r + real part ++ * FMAD6 a*alpha_i + image part ++ * FMAD7 b*alpha_r + image part ++ * FMAD8 b*alpha_i + real part ++ ++ *=================== ++ */ ++ ++/* ++ *=================== ++ * (a+bi) * (c+di) ++ * (a+bi) * (alpha_r+alpha_i) ++ *=================== ++ */ ++#if defined(NN) || defined(NT) || defined(TN) || defined(TT) ++#define ADD1 SUB ++#define ADD2 ADD ++#define FMAD5 MAD ++#define FMAD6 MAD ++#define FMAD7 MAD ++#define FMAD8 NMAD ++#endif ++ ++/* ++ *=================== ++ * (a-bi) * (c+di) ++ * (a+bi) * (alpha_r+alpha_i) ++ *=================== ++ */ ++ ++#if defined(NR) || defined(NC) || defined(TR) || defined(TC) ++#define ADD1 ADD ++#define ADD2 SUB ++#define FMAD5 MAD ++#define FMAD6 MAD ++#define FMAD7 MAD ++#define FMAD8 NMAD ++#endif ++ ++/* ++ *=================== ++ * (a+bi) * (c-di) ++ * (a-bi) * (alpha_r+alpha_i) ++ *=================== ++ */ ++ ++#if defined(RN) || defined(RT) || defined(CN) || defined(CT) ++#define ADD1 ADD ++#define ADD2 SUB ++#define FMAD5 MAD ++#define FMAD6 MAD ++#define FMAD7 NMAD ++#define FMAD8 MAD ++#endif ++ ++/* ++ *=================== ++ * (a-bi) * (c-di) ++ * (a-bi) * (alpha_r+alpha_i) ++ *=================== ++ */ ++#if defined(RR) || defined(RC) || defined(CR) || defined(CC) ++#define ADD1 SUB ++#define ADD2 ADD ++#define FMAD5 MAD ++#define FMAD6 MAD ++#define FMAD7 NMAD ++#define FMAD8 MAD ++#endif ++ ++ ++ ++ PROLOGUE ++ PROFCODE ++ ++ .frame $30, STACKSIZE, $26, 0 ++ ldi $sp, -STACKSIZE($sp) ++ ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++#ifdef TRMMKERNEL ++ ldl OFFSET, 24 + STACKSIZE($sp) ++#endif ++ ++ sll LDC, ZBASE_SHIFT, LDC # LDC*sizebyte ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ST $f19, ALPHA_R ++ ST $f20, ALPHA_I ++ ++ stl $9, 80($sp) # Integer Saved Register ++ stl $10,88($sp) ++ stl $11,96($sp) ++ stl $12,104($sp) ++ stl $13,112($sp) ++ stl $14,120($sp) ++ ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ subl $31, OFFSET, KK ++#endif ++ ++ sra N, 1, J # J=N/2 ++ ble J, $L50 ++ .align 4 ++ ++$L01: ++#if defined(TRMMKERNEL) && 
defined(LEFT) ++ mov OFFSET, KK ++#endif ++ ++ sra M, 3, I # I=M/8 ++ sll K, ZBASE_SHIFT, PREB ++ ++ sll K, 2+ZBASE_SHIFT, PREA ++ mov C, C1 ++ ++ addl C, LDC, C2 ++ mov A, AO # Reset A ++ ++ addl PREB, B, PREB ++ addl C2, LDC, C # Change C to next panel ++ ++ addl PREA, A, PREA ++ beq I, $L20 # GEMM_MR=8 ++ ++$L11: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B, BO # LL && RU reset B ++ nop ++#else ++ sll KK, 3 + ZBASE_SHIFT, L # KK*8mr ++ sll KK, 1 + ZBASE_SHIFT, TEMP # KK*2nr ++ ++ addl AO, L, AO # mov AO point to the data part ++ addl B,TEMP,BO # mov BO point to the data part ++#endif ++ ++ vcpys $f31,$f31,c01 # Clear result regs ++ fillcs 0(C1) ++ fillcs 4*SIZE(C1) ++ ++ vcpys $f31,$f31,c02 ++ fillcs 8*SIZE(C1) ++ fillcs 12*SIZE(C1) ++ ++ vcpys $f31,$f31,c03 ++ fillcs 0(C2) ++ fillcs 4*SIZE(C2) ++ ++ vcpys $f31,$f31,c04 ++ fillcs 8*SIZE(C2) ++ fillcs 12*SIZE(C2) ++ ++ vcpys $f31,$f31,c05 ++ vcpys $f31,$f31,c06 ++ vcpys $f31,$f31,c07 ++ vcpys $f31,$f31,c08 ++ ++ vcpys $f31,$f31,c09 ++ LDDE b1, 0 * SIZE(BO) # B1R ++ LDDE b2, 1 * SIZE(BO) # B1I ++ ++ vcpys $f31,$f31,c10 ++ VLD a1, 0 * SIZE(AO) # A1, A2 ++ VLD a2, 4 * SIZE(AO) # A3, A4 ++ ++ vcpys $f31,$f31,c11 ++ LDDE b3, 2 * SIZE(BO) # B2R ++ LDDE b4, 3 * SIZE(BO) # B2I ++ ++ vcpys $f31,$f31,c12 ++ VLD a3, 8 * SIZE(AO) # A5, A6 ++ VLD a4,12 * SIZE(AO) # A7, A8 ++ ++ vcpys $f31,$f31,c13 ++ vcpys $f31,$f31,c14 ++ vcpys $f31,$f31,c15 ++ vcpys $f31,$f31,c16 ++ ++ ++ ++ ++#if (defined(LEFT) && !defined(TRANSA)) \ ++ ||(!defined(LEFT) && defined(TRANSA)) ++ subl K, KK, TEMP # temp is the length of data part ++#elif defined(LEFT) ++ addl KK, 8, TEMP # mr=8, careful about complex ++#else ++ addl KK, 2, TEMP # nr=2 ++#endif ++ sra TEMP, 1, L # L=TEMP/2 ++ ble L, $L15 ++ ++#else ++ vcpys $f31,$f31,c01 # Clear result regs ++ mov B, BO # Set B, (block A x panel Bj) ++ sra K, 1, L # Unroll K as 2 ++ ++ vcpys $f31,$f31,c02 ++ fillcs 0(C1) ++ fillcs 4*SIZE(C1) ++ ++ vcpys $f31,$f31,c03 ++ fillcs 8*SIZE(C1) ++ fillcs 12*SIZE(C1) ++ ++ vcpys $f31,$f31,c04 ++ fillcs 0(C2) ++ fillcs 4*SIZE(C2) ++ ++ vcpys $f31,$f31,c05 ++ fillcs 8*SIZE(C2) ++ fillcs 12*SIZE(C2) ++ ++ vcpys $f31,$f31,c06 ++ vcpys $f31,$f31,c07 ++ vcpys $f31,$f31,c08 ++ vcpys $f31,$f31,c09 ++ ++ vcpys $f31,$f31,c10 ++ LDDE b1, 0 * SIZE(BO) # B1R ++ LDDE b2, 1 * SIZE(BO) # B1I ++ ++ vcpys $f31,$f31,c11 ++ VLD a1, 0 * SIZE(AO) # A1, A2 ++ VLD a2, 4 * SIZE(AO) # A3, A4 ++ ++ vcpys $f31,$f31,c12 ++ LDDE b3, 2 * SIZE(BO) # B2R ++ LDDE b4, 3 * SIZE(BO) # B2I ++ ++ vcpys $f31,$f31,c13 ++ VLD a3, 8 * SIZE(AO) # A5, A6 ++ VLD a4,12 * SIZE(AO) # A7, A8 ++ ++ vcpys $f31,$f31,c14 ++ vcpys $f31,$f31,c15 ++ ++ vcpys $f31,$f31,c16 ++ ble L, $L15 ++#endif ++ ++ .align 4 ++$L12: ++ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE ++ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) ++ LDDE b5, 4 * SIZE(BO) # next B1R ++ ++ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) ++ LDDE b6, 5 * SIZE(BO) # next B1I ++ ++ VMAD a2,b1,c05,c05 # C31, C41 ++ VLD a8,12 * SIZE(AO) # next A7, A8 ++ ++ VMAD a2,b2,c06,c06 # C31, C41 ++ VLD a7, 8 * SIZE(AO) # next A5, A6 ++ ++ VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc) ++ VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd) ++ VMAD a3,b1,c09,c09 # C51, C61 ++ VMAD a3,b2,c10,c10 # C51, C61 ++ ++ ++ VMAD a2,b3,c07,c07 # C32, C42 ++ LDDE b7, 6 * SIZE(BO) # next B2R ++ ++ VMAD a2,b4,c08,c08 # C32, C42 ++ LDDE b8, 7 * SIZE(BO) # next B2I ++ ++ VMAD a4,b1,c13,c13 # C71, C81 ++ VLD a5, 0 * SIZE(AO) # next A1, A2, a5==a0 ++ ++ VMAD 
a4,b2,c14,c14 # C71, C81 ++ VLD a6, 4 * SIZE(AO) # next A3, A4 ++ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE ++ ++ ++ VMAD a3,b3,c11,c11 # C52, C62 ++ fillcs 0(PREB) ++ ++ VMAD a3,b4,c12,c12 # C52, C62 ++ fillcs 0(PREA) ++ ++ VMAD a4,b3,c15,c15 # C72, C82 ++ fillcs 8*SIZE(PREA) ++ ++ VMAD a4,b4,c16,c16 # C72, C82 ++ subl L, 1, L # ++ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE ++ ++ VMAD a8,b5,c13,c13 ++ LDDE b1, 0 * SIZE(BO) ++ ++ VMAD a8,b6,c14,c14 ++ LDDE b2, 1 * SIZE(BO) ++ ++ VMAD a7,b5,c09,c09 ++ addl PREA, 16*SIZE, PREA ++ VLD a4,12 * SIZE(AO) ++ ++ VMAD a7,b6,c10,c10 ++ VLD a3, 8 * SIZE(AO) ++ ++ VMAD a5,b5,c01,c01 ++ VMAD a5,b6,c02,c02 ++ VMAD a5,b7,c03,c03 ++ VMAD a5,b8,c04,c04 ++ ++ VMAD a8,b7,c15,c15 ++ LDDE b3, 2 * SIZE(BO) ++ ++ VMAD a8,b8,c16,c16 ++ LDDE b4, 3 * SIZE(BO) ++ ++ VMAD a6,b5,c05,c05 ++ VLD a1, 0 * SIZE(AO) ++ ++ VMAD a6,b6,c06,c06 ++ VLD a2, 4 * SIZE(AO) ++ ++ ++ VMAD a7,b7,c11,c11 ++ fillcs 4*SIZE(PREB) ++ ++ VMAD a7,b8,c12,c12 ++ fillcs 0(PREA) ++ ++ VMAD a6,b7,c07,c07 ++ addl PREB, 8*SIZE, PREB ++ fillcs 8*SIZE(PREA) ++ ++ VMAD a6,b8,c08,c08 ++ addl PREA, 16*SIZE, PREA ++ bne L, $L12 # continue K ++ ++$L15: ++ LD alpha_r, ALPHA_R # $f30==b8 ++#ifndef TRMMKERNEL ++ blbc K, $L18 # if(K&1) ++#else ++ blbc TEMP, $L18 ++#endif ++ ++$L16: ++ VMAD a1,b1,c01,c01 # C11R C21R ++ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE ++ ++ VMAD a1,b2,c02,c02 # C11I C21I ++ addl BO, 4*SIZE, BO ++ ++ VMAD a1,b3,c03,c03 # C12R c22R ++ VMAD a1,b4,c04,c04 # C12I C22I ++ ++ VMAD a2,b1,c05,c05 # C31R C41R ++ VMAD a2,b2,c06,c06 # C31I C41I ++ VMAD a2,b3,c07,c07 # C32R C42R ++ VMAD a2,b4,c08,c08 # C32I C42I ++ ++ VMAD a3,b1,c09,c09 # C51R C61R ++ VMAD a3,b2,c10,c10 # C51I C61I ++ VMAD a3,b3,c11,c11 # C52R C62R ++ VMAD a3,b4,c12,c12 # C52I C62I ++ ++ VMAD a4,b1,c13,c13 # C71R C81R ++ VMAD a4,b2,c14,c14 # C71I C81I ++ VMAD a4,b3,c15,c15 # C72R C82R ++ VMAD a4,b4,c16,c16 # C72I C82I ++ ++$L18: # Write back ++ LD alpha_i, ALPHA_I # $f29==b7 ++#ifndef TRMMKERNEL ++ vextf c01, 0, a1 # a1=C11R_ac ++ vextf c01, 1, a2 # a2=C11I_bc ++ vextf c01, 2, a3 # a3=C21R_ac ++ vextf c01, 3, a4 # a4=C21I_bc ++ ++ vextf c02, 0, b1 # b1=C11I_ad ++ vextf c02, 1, b2 # b2=C11R_bd ++ vextf c02, 2, b3 # b3=C21I_ad ++ vextf c02, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 0 * SIZE(C1) ++ LD a2, 1 * SIZE(C1) ++ LD a3, 2 * SIZE(C1) ++ LD a4, 3 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 0 * SIZE(C1) ++ ST c01, 1 * SIZE(C1) ++ ST b6, 2 * SIZE(C1) ++ ST c02, 3 * SIZE(C1) ++ ++ vextf c05, 0, a1 # a1=C11R_ac ++ vextf c05, 1, a2 # a2=C11I_bc ++ vextf c05, 2, a3 # a3=C21R_ac ++ vextf c05, 3, a4 # a4=C21I_bc ++ ++ vextf c06, 0, b1 # b1=C11I_ad ++ vextf c06, 1, b2 # b2=C11R_bd ++ vextf c06, 2, b3 # b3=C21I_ad ++ vextf c06, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 4 * SIZE(C1) ++ LD a2, 5 * SIZE(C1) ++ LD a3, 6 * SIZE(C1) ++ LD a4, 7 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 
4 * SIZE(C1) ++ ST c01, 5 * SIZE(C1) ++ ST b6, 6 * SIZE(C1) ++ ST c02, 7 * SIZE(C1) ++ ++ vextf c09, 0, a1 # a1=C11R_ac ++ vextf c09, 1, a2 # a2=C11I_bc ++ vextf c09, 2, a3 # a3=C21R_ac ++ vextf c09, 3, a4 # a4=C21I_bc ++ ++ vextf c10, 0, b1 # b1=C11I_ad ++ vextf c10, 1, b2 # b2=C11R_bd ++ vextf c10, 2, b3 # b3=C21I_ad ++ vextf c10, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 8 * SIZE(C1) ++ LD a2, 9 * SIZE(C1) ++ LD a3, 10 * SIZE(C1) ++ LD a4, 11 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 8 * SIZE(C1) ++ ST c01, 9 * SIZE(C1) ++ ST b6, 10 * SIZE(C1) ++ ST c02, 11 * SIZE(C1) ++ ++ vextf c13, 0, a1 # a1=C11R_ac ++ vextf c13, 1, a2 # a2=C11I_bc ++ vextf c13, 2, a3 # a3=C21R_ac ++ vextf c13, 3, a4 # a4=C21I_bc ++ ++ vextf c14, 0, b1 # b1=C11I_ad ++ vextf c14, 1, b2 # b2=C11R_bd ++ vextf c14, 2, b3 # b3=C21I_ad ++ vextf c14, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 12 * SIZE(C1) ++ LD a2, 13 * SIZE(C1) ++ LD a3, 14 * SIZE(C1) ++ LD a4, 15 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 12 * SIZE(C1) ++ ST c01, 13 * SIZE(C1) ++ ST b6, 14 * SIZE(C1) ++ ST c02, 15 * SIZE(C1) ++ ++ ++ vextf c03, 0, a1 # a1=C11R_ac ++ vextf c03, 1, a2 # a2=C11I_bc ++ vextf c03, 2, a3 # a3=C21R_ac ++ vextf c03, 3, a4 # a4=C21I_bc ++ ++ vextf c04, 0, b1 # b1=C11I_ad ++ vextf c04, 1, b2 # b2=C11R_bd ++ vextf c04, 2, b3 # b3=C21I_ad ++ vextf c04, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 0 * SIZE(C2) ++ LD a2, 1 * SIZE(C2) ++ LD a3, 2 * SIZE(C2) ++ LD a4, 3 * SIZE(C2) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 0 * SIZE(C2) ++ ST c02, 1 * SIZE(C2) ++ ST c05, 2 * SIZE(C2) ++ ST c06, 3 * SIZE(C2) ++ ++ vextf c07, 0, a1 # a1=C11R_ac ++ vextf c07, 1, a2 # a2=C11I_bc ++ vextf c07, 2, a3 # a3=C21R_ac ++ vextf c07, 3, a4 # a4=C21I_bc ++ ++ vextf c08, 0, b1 # b1=C11I_ad ++ vextf c08, 1, b2 # b2=C11R_bd ++ vextf c08, 2, b3 # b3=C21I_ad ++ vextf c08, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 4 * SIZE(C2) ++ LD a2, 5 * SIZE(C2) ++ LD a3, 6 * SIZE(C2) ++ LD a4, 7 * SIZE(C2) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 4 * SIZE(C2) ++ ST c02, 5 * SIZE(C2) ++ ST c05, 6 * SIZE(C2) ++ ST c06, 7 * SIZE(C2) ++ ++ vextf c11, 0, a1 # a1=C11R_ac ++ vextf c11, 1, a2 # a2=C11I_bc ++ vextf c11, 2, a3 # a3=C21R_ac ++ vextf c11, 3, a4 # a4=C21I_bc ++ ++ vextf c12, 
0, b1 # b1=C11I_ad ++ vextf c12, 1, b2 # b2=C11R_bd ++ vextf c12, 2, b3 # b3=C21I_ad ++ vextf c12, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 8 * SIZE(C2) ++ LD a2, 9 * SIZE(C2) ++ LD a3, 10 * SIZE(C2) ++ LD a4, 11 * SIZE(C2) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 8 * SIZE(C2) ++ ST c02, 9 * SIZE(C2) ++ ST c05, 10 * SIZE(C2) ++ ST c06, 11 * SIZE(C2) ++ ++ vextf c15, 0, a1 # a1=C11R_ac ++ vextf c15, 1, a2 # a2=C11I_bc ++ vextf c15, 2, a3 # a3=C21R_ac ++ vextf c15, 3, a4 # a4=C21I_bc ++ ++ vextf c16, 0, b1 # b1=C11I_ad ++ vextf c16, 1, b2 # b2=C11R_bd ++ vextf c16, 2, b3 # b3=C21I_ad ++ vextf c16, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 12 * SIZE(C2) ++ LD a2, 13 * SIZE(C2) ++ LD a3, 14 * SIZE(C2) ++ LD a4, 15 * SIZE(C2) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 12 * SIZE(C2) ++ ST c02, 13 * SIZE(C2) ++ ST c05, 14 * SIZE(C2) ++ ST c06, 15 * SIZE(C2) ++ ++#else ++ vextf c01, 0, a1 # a1=C11R_ac ++ vextf c01, 1, a2 # a2=C11I_bc ++ vextf c01, 2, a3 # a3=C21R_ac ++ vextf c01, 3, a4 # a4=C21I_bc ++ ++ vextf c02, 0, b1 # b1=C11I_ad ++ vextf c02, 1, b2 # b2=C11R_bd ++ vextf c02, 2, b3 # b3=C21I_ad ++ vextf c02, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 0 * SIZE(C1) ++ ST c01, 1 * SIZE(C1) ++ ST b6, 2 * SIZE(C1) ++ ST c02, 3 * SIZE(C1) ++ ++ vextf c05, 0, a1 # a1=C11R_ac ++ vextf c05, 1, a2 # a2=C11I_bc ++ vextf c05, 2, a3 # a3=C21R_ac ++ vextf c05, 3, a4 # a4=C21I_bc ++ ++ vextf c06, 0, b1 # b1=C11I_ad ++ vextf c06, 1, b2 # b2=C11R_bd ++ vextf c06, 2, b3 # b3=C21I_ad ++ vextf c06, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 4 * SIZE(C1) ++ ST c01, 5 * SIZE(C1) ++ ST b6, 6 * SIZE(C1) ++ ST c02, 7 * SIZE(C1) ++ ++ vextf c09, 0, a1 # a1=C11R_ac ++ vextf c09, 1, a2 # a2=C11I_bc ++ vextf c09, 2, a3 # a3=C21R_ac ++ vextf c09, 3, a4 # a4=C21I_bc ++ ++ vextf c10, 0, b1 # b1=C11I_ad ++ vextf c10, 1, b2 # b2=C11R_bd ++ vextf c10, 2, b3 # b3=C21I_ad ++ vextf c10, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ 
FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 8 * SIZE(C1) ++ ST c01, 9 * SIZE(C1) ++ ST b6, 10 * SIZE(C1) ++ ST c02, 11 * SIZE(C1) ++ ++ vextf c13, 0, a1 # a1=C11R_ac ++ vextf c13, 1, a2 # a2=C11I_bc ++ vextf c13, 2, a3 # a3=C21R_ac ++ vextf c13, 3, a4 # a4=C21I_bc ++ ++ vextf c14, 0, b1 # b1=C11I_ad ++ vextf c14, 1, b2 # b2=C11R_bd ++ vextf c14, 2, b3 # b3=C21I_ad ++ vextf c14, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 12 * SIZE(C1) ++ ST c01, 13 * SIZE(C1) ++ ST b6, 14 * SIZE(C1) ++ ST c02, 15 * SIZE(C1) ++ ++ ++ vextf c03, 0, a1 # a1=C11R_ac ++ vextf c03, 1, a2 # a2=C11I_bc ++ vextf c03, 2, a3 # a3=C21R_ac ++ vextf c03, 3, a4 # a4=C21I_bc ++ ++ vextf c04, 0, b1 # b1=C11I_ad ++ vextf c04, 1, b2 # b2=C11R_bd ++ vextf c04, 2, b3 # b3=C21I_ad ++ vextf c04, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 0 * SIZE(C2) ++ ST c02, 1 * SIZE(C2) ++ ST c05, 2 * SIZE(C2) ++ ST c06, 3 * SIZE(C2) ++ ++ vextf c07, 0, a1 # a1=C11R_ac ++ vextf c07, 1, a2 # a2=C11I_bc ++ vextf c07, 2, a3 # a3=C21R_ac ++ vextf c07, 3, a4 # a4=C21I_bc ++ ++ vextf c08, 0, b1 # b1=C11I_ad ++ vextf c08, 1, b2 # b2=C11R_bd ++ vextf c08, 2, b3 # b3=C21I_ad ++ vextf c08, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 4 * SIZE(C2) ++ ST c02, 5 * SIZE(C2) ++ ST c05, 6 * SIZE(C2) ++ ST c06, 7 * SIZE(C2) ++ ++ vextf c11, 0, a1 # a1=C11R_ac ++ vextf c11, 1, a2 # a2=C11I_bc ++ vextf c11, 2, a3 # a3=C21R_ac ++ vextf c11, 3, a4 # a4=C21I_bc ++ ++ vextf c12, 0, b1 # b1=C11I_ad ++ vextf c12, 1, b2 # b2=C11R_bd ++ vextf c12, 2, b3 # b3=C21I_ad ++ vextf c12, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 8 * SIZE(C2) ++ ST c02, 9 * SIZE(C2) ++ ST c05, 10 * SIZE(C2) ++ ST c06, 11 * SIZE(C2) ++ ++ vextf c15, 0, a1 # a1=C11R_ac ++ vextf c15, 1, a2 # a2=C11I_bc ++ vextf c15, 2, a3 # a3=C21R_ac ++ vextf c15, 3, a4 # a4=C21I_bc ++ ++ vextf c16, 0, b1 # b1=C11I_ad ++ vextf c16, 1, b2 # b2=C11R_bd ++ vextf c16, 2, b3 # b3=C21I_ad ++ vextf c16, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, 
$f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 12 * SIZE(C2) ++ ST c02, 13 * SIZE(C2) ++ ST c05, 14 * SIZE(C2) ++ ST c06, 15 * SIZE(C2) ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 8, TEMP ++#else ++ subl TEMP, 2, TEMP ++#endif ++ ++ sll TEMP, 3 + ZBASE_SHIFT,L # mr=8 ++ sll TEMP, 1 + ZBASE_SHIFT,TEMP # nr=2 ++ ++ addl AO, L, AO ++ addl BO, TEMP, BO ++#endif ++ ++#ifdef LEFT ++ addl KK,8,KK ++#endif ++#endif ++ ++ jmp $L09 ++ ++ ++ .align 4 ++ ++$L20: # N=2, M=4 ++ and M, 4, I # I=M&4 ++ ble I, $L30 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B, BO ++ nop ++#else ++ sll KK, 2 + ZBASE_SHIFT, L # mr=4 ++ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2 ++ ++ addl AO, L, AO ++ addl B, TEMP, BO ++#endif ++ fillcs 0(C1) ++ fillcs 4*SIZE(C1) ++ fillcs 8*SIZE(C1) ++ ++ vcpys $f31,$f31,c01 # Clear result regs ++ vcpys $f31,$f31,c02 ++ vcpys $f31,$f31,c03 ++ vcpys $f31,$f31,c04 ++ ++ fillcs 0(C2) ++ fillcs 4*SIZE(C2) ++ fillcs 8*SIZE(C2) ++ ++ vcpys $f31,$f31,c05 ++ vcpys $f31,$f31,c06 ++ vcpys $f31,$f31,c07 ++ vcpys $f31,$f31,c08 ++ ++ LDDE b1, 0 * SIZE(BO) # B1R ++ LDDE b2, 1 * SIZE(BO) # B1I ++ LDDE b3, 2 * SIZE(BO) # B2R ++ LDDE b4, 3 * SIZE(BO) # B2I ++ ++ VLD a1, 0 * SIZE(AO) # A1, A2 ++ VLD a2, 4 * SIZE(AO) # A3, A4 ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl K, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 4, TEMP # mr=4 ++#else ++ addl KK, 2,TEMP # nr=2 ++#endif ++ sra TEMP, 1, L ++ ble L, $L25 ++ ++#else ++ mov B, BO # Set B, (block A x panel Bj) ++ sra K, 1, L # Unroll K as 2 ++ ++ fillcs 0(C1) ++ fillcs 4*SIZE(C1) ++ fillcs 8*SIZE(C1) ++ ++ vcpys $f31,$f31,c01 # Clear result regs ++ vcpys $f31,$f31,c02 ++ vcpys $f31,$f31,c03 ++ vcpys $f31,$f31,c04 ++ ++ fillcs 0(C2) ++ fillcs 4*SIZE(C2) ++ fillcs 8*SIZE(C2) ++ ++ vcpys $f31,$f31,c05 ++ vcpys $f31,$f31,c06 ++ vcpys $f31,$f31,c07 ++ vcpys $f31,$f31,c08 ++ ++ LDDE b1, 0 * SIZE(BO) # B1R ++ LDDE b2, 1 * SIZE(BO) # B1I ++ LDDE b3, 2 * SIZE(BO) # B2R ++ LDDE b4, 3 * SIZE(BO) # B2I ++ ++ VLD a1, 0 * SIZE(AO) # A1, A2 ++ VLD a2, 4 * SIZE(AO) # A3, A4 ++ ++ ble L, $L25 ++#endif ++ ++ .align 4 ++$L22: ++ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) ++ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) ++ VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc) ++ VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd) ++ ++ LDDE b5, 4 * SIZE(BO) # next B1R ++ LDDE b6, 5 * SIZE(BO) # next B1I ++ LDDE b7, 6 * SIZE(BO) # next B2R ++ LDDE b8, 7 * SIZE(BO) # next B2I ++ ++ fillcs 0(PREB) ++ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE ++ VMAD a2,b1,c05,c05 # C31, C41 ++ VMAD a2,b2,c06,c06 # C31, C41 ++ ++ fillcs 0(PREA) ++ VMAD a2,b3,c07,c07 # C32, C42 ++ VMAD a2,b4,c08,c08 # C32, C42 ++ ++ VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0 ++ VLD a6, 12 * SIZE(AO) # next A3, A4 ++ ++ subl L, 1, L # ++ ++ addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE ++ VMAD a5,b5,c01,c01 ++ VMAD a5,b6,c02,c02 ++ ++ addl PREA, 16*SIZE, PREA ++ VMAD a5,b7,c03,c03 ++ VMAD a5,b8,c04,c04 ++ ++ LDDE b1, 0 * SIZE(BO) ++ LDDE b2, 1 * SIZE(BO) ++ LDDE b3, 2 * SIZE(BO) ++ LDDE b4, 3 * SIZE(BO) ++ ++ fillcs 4*SIZE(PREB) ++ VMAD a6,b5,c05,c05 ++ VMAD a6,b6,c06,c06 ++ ++ fillcs 0(PREA) ++ VMAD a6,b7,c07,c07 ++ VMAD 
a6,b8,c08,c08 ++ ++ VLD a1, 0 * SIZE(AO) ++ VLD a2, 4 * SIZE(AO) ++ ++ addl PREB, 8*SIZE, PREB ++ addl PREA, 16*SIZE, PREA ++ bne L, $L22 # continue K ++ ++$L25: ++ LD alpha_r, ALPHA_R # $f30==b8 ++#ifndef TRMMKERNEL ++ blbc K, $L28 # if(K&1) ++#else ++ blbc TEMP, $L28 ++#endif ++ ++$L26: ++ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE ++ VMAD a1,b1,c01,c01 # C11R C21R ++ VMAD a1,b2,c02,c02 # C11I C21I ++ VMAD a1,b3,c03,c03 # C12R c22R ++ VMAD a1,b4,c04,c04 # C12I C22I ++ ++ addl BO, 4*SIZE, BO ++ VMAD a2,b1,c05,c05 # C31R C41R ++ VMAD a2,b2,c06,c06 # C31I C41I ++ VMAD a2,b3,c07,c07 # C32R C42R ++ VMAD a2,b4,c08,c08 # C32I C42I ++ ++$L28: # Write back ++ LD alpha_i, ALPHA_I # $f29==b7 ++#ifndef TRMMKERNEL ++ vextf c01, 0, a1 # a1=C11R_ac ++ vextf c01, 1, a2 # a2=C11I_bc ++ vextf c01, 2, a3 # a3=C21R_ac ++ vextf c01, 3, a4 # a4=C21I_bc ++ ++ vextf c02, 0, b1 # b1=C11I_ad ++ vextf c02, 1, b2 # b2=C11R_bd ++ vextf c02, 2, b3 # b3=C21I_ad ++ vextf c02, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 0 * SIZE(C1) ++ LD a2, 1 * SIZE(C1) ++ LD a3, 2 * SIZE(C1) ++ LD a4, 3 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 0 * SIZE(C1) ++ ST c01, 1 * SIZE(C1) ++ ST b6, 2 * SIZE(C1) ++ ST c02, 3 * SIZE(C1) ++ ++ vextf c05, 0, a1 # a1=C11R_ac ++ vextf c05, 1, a2 # a2=C11I_bc ++ vextf c05, 2, a3 # a3=C21R_ac ++ vextf c05, 3, a4 # a4=C21I_bc ++ ++ vextf c06, 0, b1 # b1=C11I_ad ++ vextf c06, 1, b2 # b2=C11R_bd ++ vextf c06, 2, b3 # b3=C21I_ad ++ vextf c06, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 4 * SIZE(C1) ++ LD a2, 5 * SIZE(C1) ++ LD a3, 6 * SIZE(C1) ++ LD a4, 7 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 4 * SIZE(C1) ++ ST c01, 5 * SIZE(C1) ++ ST b6, 6 * SIZE(C1) ++ ST c02, 7 * SIZE(C1) ++ ++ ++ vextf c03, 0, a1 # a1=C11R_ac ++ vextf c03, 1, a2 # a2=C11I_bc ++ vextf c03, 2, a3 # a3=C21R_ac ++ vextf c03, 3, a4 # a4=C21I_bc ++ ++ vextf c04, 0, b1 # b1=C11I_ad ++ vextf c04, 1, b2 # b2=C11R_bd ++ vextf c04, 2, b3 # b3=C21I_ad ++ vextf c04, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 0 * SIZE(C2) ++ LD a2, 1 * SIZE(C2) ++ LD a3, 2 * SIZE(C2) ++ LD a4, 3 * SIZE(C2) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 0 * SIZE(C2) ++ ST c02, 1 * SIZE(C2) ++ ST c05, 2 * SIZE(C2) ++ ST c06, 3 * SIZE(C2) ++ ++ vextf c07, 0, a1 # a1=C11R_ac ++ vextf c07, 1, a2 # a2=C11I_bc ++ vextf c07, 2, a3 # a3=C21R_ac ++ vextf c07, 3, a4 # a4=C21I_bc ++ ++ vextf c08, 0, b1 # b1=C11I_ad ++ vextf c08, 1, b2 # b2=C11R_bd ++ vextf c08, 2, b3 # b3=C21I_ad ++ vextf c08, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, 
b3, a8 ++ ++ LD a1, 4 * SIZE(C2) ++ LD a2, 5 * SIZE(C2) ++ LD a3, 6 * SIZE(C2) ++ LD a4, 7 * SIZE(C2) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 4 * SIZE(C2) ++ ST c02, 5 * SIZE(C2) ++ ST c05, 6 * SIZE(C2) ++ ST c06, 7 * SIZE(C2) ++ ++#else ++ ++ vextf c01, 0, a1 # a1=C11R_ac ++ vextf c01, 1, a2 # a2=C11I_bc ++ vextf c01, 2, a3 # a3=C21R_ac ++ vextf c01, 3, a4 # a4=C21I_bc ++ ++ vextf c02, 0, b1 # b1=C11I_ad ++ vextf c02, 1, b2 # b2=C11R_bd ++ vextf c02, 2, b3 # b3=C21I_ad ++ vextf c02, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 0 * SIZE(C1) ++ ST c01, 1 * SIZE(C1) ++ ST b6, 2 * SIZE(C1) ++ ST c02, 3 * SIZE(C1) ++ ++ vextf c05, 0, a1 # a1=C11R_ac ++ vextf c05, 1, a2 # a2=C11I_bc ++ vextf c05, 2, a3 # a3=C21R_ac ++ vextf c05, 3, a4 # a4=C21I_bc ++ ++ vextf c06, 0, b1 # b1=C11I_ad ++ vextf c06, 1, b2 # b2=C11R_bd ++ vextf c06, 2, b3 # b3=C21I_ad ++ vextf c06, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 4 * SIZE(C1) ++ ST c01, 5 * SIZE(C1) ++ ST b6, 6 * SIZE(C1) ++ ST c02, 7 * SIZE(C1) ++ ++ ++ vextf c03, 0, a1 # a1=C11R_ac ++ vextf c03, 1, a2 # a2=C11I_bc ++ vextf c03, 2, a3 # a3=C21R_ac ++ vextf c03, 3, a4 # a4=C21I_bc ++ ++ vextf c04, 0, b1 # b1=C11I_ad ++ vextf c04, 1, b2 # b2=C11R_bd ++ vextf c04, 2, b3 # b3=C21I_ad ++ vextf c04, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 0 * SIZE(C2) ++ ST c02, 1 * SIZE(C2) ++ ST c05, 2 * SIZE(C2) ++ ST c06, 3 * SIZE(C2) ++ ++ vextf c07, 0, a1 # a1=C11R_ac ++ vextf c07, 1, a2 # a2=C11I_bc ++ vextf c07, 2, a3 # a3=C21R_ac ++ vextf c07, 3, a4 # a4=C21I_bc ++ ++ vextf c08, 0, b1 # b1=C11I_ad ++ vextf c08, 1, b2 # b2=C11R_bd ++ vextf c08, 2, b3 # b3=C21I_ad ++ vextf c08, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, c01 ++ FMAD8 a8, alpha_i, a3, c05 ++ FMAD6 b5, alpha_i, a2, c02 ++ FMAD6 a6, alpha_i, a4, c06 ++ ++ ST c01, 4 * SIZE(C2) ++ ST c02, 5 * SIZE(C2) ++ ST c05, 6 * SIZE(C2) ++ ST c06, 7 * SIZE(C2) ++ ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TEMP ++#ifdef 
LEFT ++ subl TEMP, 4, TEMP ++#else ++ subl TEMP, 2, TEMP ++#endif ++ ++ sll TEMP, 2 + ZBASE_SHIFT, L ++ sll TEMP, 1 + ZBASE_SHIFT, TEMP ++ ++ addl AO, L, AO ++ addl BO, TEMP,BO ++#endif ++ ++#ifdef LEFT ++ addl KK, 4,KK ++#endif ++#endif ++ ++ addl C1, 8*SIZE, C1 ++ addl C2, 8*SIZE, C2 ++ ++ ++ .align 4 ++$L30: ++ and M, 2, I # I=M&2 ++ ble I, $L40 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B, BO ++ nop ++#else ++ sll KK, 1 + ZBASE_SHIFT, L # mr=2 ++ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2 ++ ++ addl AO, L, AO ++ addl B, TEMP, BO ++#endif ++ ++ fclr c01 ++ fclr c02 ++ fclr c03 ++ fclr c04 ++ fclr c05 ++ fclr c06 ++ fclr c07 ++ fclr c08 # CLEAR 8 register ++ fclr c09 ++ fclr c10 ++ fclr c11 ++ fclr c12 ++ fclr c13 ++ fclr c14 ++ fclr c15 ++ fclr c16 ++ ++ fillcs 0*SIZE(C1) ++ fillcs 4*SIZE(C1) ++ ++ LD b1, 0*SIZE(BO) # b1 real part ++ LD b2, 1*SIZE(BO) # b1 image part ++ LD b3, 2*SIZE(BO) # b2 real part ++ LD b4, 3*SIZE(BO) # b2 image part ++ ++ fillcs 0*SIZE(C2) ++ fillcs 4*SIZE(C2) ++ ++ LD a1, 0*SIZE(AO) # a1 real part ++ LD a2, 1*SIZE(AO) # a1 image part ++ LD a3, 2*SIZE(AO) # a2 real part ++ LD a4, 3*SIZE(AO) # a2 image part ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl K, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 2, TEMP # mr=2 ++#else ++ addl KK, 2, TEMP # nr=2 ++#endif ++ sra TEMP, 1, L ++ ble L, $L35 ++ ++#else ++ ++ mov B, BO # Set B, (block A x panel Bj) ++ sra K, 1, L # Unroll K as 2 ++ ++ fclr c01 ++ fclr c02 ++ fclr c03 ++ fclr c04 ++ fclr c05 ++ fclr c06 ++ fclr c07 ++ fclr c08 # CLEAR 8 register ++ fclr c09 ++ fclr c10 ++ fclr c11 ++ fclr c12 ++ fclr c13 ++ fclr c14 ++ fclr c15 ++ fclr c16 ++ ++ fillcs 0*SIZE(C1) ++ fillcs 4*SIZE(C1) ++ ++ LD b1, 0*SIZE(BO) # b1 real part ++ LD b2, 1*SIZE(BO) # b1 image part ++ LD b3, 2*SIZE(BO) # b2 real part ++ LD b4, 3*SIZE(BO) # b2 image part ++ ++ fillcs 0*SIZE(C2) ++ fillcs 4*SIZE(C2) ++ ++ LD a1, 0*SIZE(AO) # a1 real part ++ LD a2, 1*SIZE(AO) # a1 image part ++ LD a3, 2*SIZE(AO) # a2 real part ++ LD a4, 3*SIZE(AO) # a2 image part ++ ++ ble L, $L35 ++#endif ++ ++ .align 4 ++$L32: ++ MAD a1,b1,c01,c01 # a1*c1 ++ MAD a1,b2,c02,c02 # a1*d1 ++ MAD a1,b3,c03,c03 # a1*c2 ++ MAD a1,b4,c04,c04 # a1*d2 ++ ++ LD b5, 4 * SIZE(BO) # next B1R ++ LD b6, 5 * SIZE(BO) # next B1I ++ LD b7, 6 * SIZE(BO) # next B2R ++ LD b8, 7 * SIZE(BO) # next B2I ++ ++ LD a5, 4 * SIZE(AO) # next A1-A4 real part ++ LD a6, 5 * SIZE(AO) # next A1-A4 image part ++ LD a7, 6 * SIZE(AO) ++ LD a8, 7 * SIZE(AO) ++ ++ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE ++ MAD a2,b1,c05,c05 # b1*c1 ++ MAD a2,b2,c06,c06 # b1*d1 ++ MAD a2,b3,c07,c07 # b1*c2 ++ MAD a2,b4,c08,c08 # b1*d2 ++ ++ MAD a3,b1,c09,c09 # a2*c1 ++ MAD a3,b2,c10,c10 # a2*d1 ++ MAD a3,b3,c11,c11 # a2*c2 ++ MAD a3,b4,c12,c12 # a2*d2 ++ ++ MAD a4,b1,c13,c13 # b2*c1 ++ MAD a4,b2,c14,c14 # b2*d1 ++ MAD a4,b3,c15,c15 # b2*c2 ++ MAD a4,b4,c16,c16 # b2*d2 ++ ++ subl L, 1, L # ++ ++ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE ++ MAD a5,b5,c01,c01 ++ MAD a5,b6,c02,c02 ++ MAD a5,b7,c03,c03 ++ MAD a5,b8,c04,c04 ++ ++ LD b1, 0 * SIZE(BO) ++ LD b2, 1 * SIZE(BO) ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MAD a6,b5,c05,c05 ++ MAD a6,b6,c06,c06 ++ MAD a6,b7,c07,c07 ++ MAD a6,b8,c08,c08 ++ ++ MAD a7,b5,c09,c09 ++ MAD a7,b6,c10,c10 ++ MAD a7,b7,c11,c11 ++ MAD a7,b8,c12,c12 ++ ++ MAD a8,b5,c13,c13 ++ MAD a8,b6,c14,c14 ++ 
MAD a8,b7,c15,c15 ++ MAD a8,b8,c16,c16 ++ ++ bne L, $L32 # continue K ++ ++$L35: ++ LD alpha_r, ALPHA_R # $f30==b8 ++#ifndef TRMMKERNEL ++ blbc K, $L38 # if(K&1) ++#else ++ blbc TEMP, $L38 ++#endif ++ ++$L36: ++ addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE ++ addl BO, 4*SIZE, BO ++ ++ MAD a1,b1,c01,c01 # a1*c1 ++ MAD a1,b2,c02,c02 # a1*d1 ++ MAD a1,b3,c03,c03 # a1*c2 ++ MAD a1,b4,c04,c04 # a1*d2 ++ ++ MAD a2,b1,c05,c05 # b1*c1 ++ MAD a2,b2,c06,c06 # b1*d1 ++ MAD a2,b3,c07,c07 # b1*c2 ++ MAD a2,b4,c08,c08 # b1*d2 ++ ++ MAD a3,b1,c09,c09 # a2*c1 ++ MAD a3,b2,c10,c10 # a2*d1 ++ MAD a3,b3,c11,c11 # a2*c2 ++ MAD a3,b4,c12,c12 # a2*d2 ++ ++ MAD a4,b1,c13,c13 # b2*c1 ++ MAD a4,b2,c14,c14 # b2*d1 ++ MAD a4,b3,c15,c15 # b2*c2 ++ MAD a4,b4,c16,c16 # b2*d2 ++ ++ ++ ++$L38: # Write back ++ LD alpha_i, ALPHA_I # $f29==b7 ++#ifndef TRMMKERNEL ++ ADD1 c01, c06, c01 # ac '+' bd ++ ADD1 c09, c14, c09 ++ ADD1 c03, c08, c03 # ++ ADD1 c11, c16, c11 ++ ++ ADD2 c05, c02, c02 # bc '+' ad ++ ADD2 c13, c10, c10 ++ ADD2 c07, c04, c04 ++ ADD2 c15, c12, c12 ++ ++ LD b1, 0 * SIZE(C1) ++ LD b2, 1 * SIZE(C1) ++ LD b3, 2 * SIZE(C1) ++ LD b4, 3 * SIZE(C1) ++ ++ LD a5, 0 * SIZE(C2) ++ LD a6, 1 * SIZE(C2) ++ LD a7, 2 * SIZE(C2) ++ LD a8, 3 * SIZE(C2) ++ ++ FMAD5 c01, alpha_r, b1, b1 ++ FMAD5 c09, alpha_r, b3, b3 ++ FMAD5 c03, alpha_r, a5, a5 ++ FMAD5 c11, alpha_r, a7, a7 ++ ++ FMAD7 c02, alpha_r, b2, b2 ++ FMAD7 c10, alpha_r, b4, b4 ++ FMAD7 c04, alpha_r, a6, a6 ++ FMAD7 c12, alpha_r, a8, a8 ++ ++ FMAD8 c02, alpha_i, b1, b1 ++ FMAD8 c10, alpha_i, b3, b3 ++ FMAD8 c04, alpha_i, a5, a5 ++ FMAD8 c12, alpha_i, a7, a7 ++ ++ FMAD6 c01, alpha_i, b2, b2 ++ FMAD6 c09, alpha_i, b4, b4 ++ FMAD6 c03, alpha_i, a6, a6 ++ FMAD6 c11, alpha_i, a8, a8 ++ ++ ST b1, 0 * SIZE(C1) ++ ST b2, 1 * SIZE(C1) ++ ST b3, 2 * SIZE(C1) ++ ST b4, 3 * SIZE(C1) ++ ++ ST a5, 0 * SIZE(C2) ++ ST a6, 1 * SIZE(C2) ++ ST a7, 2 * SIZE(C2) ++ ST a8, 3 * SIZE(C2) ++ ++#else ++ ++ ADD1 c01, c06, c01 # ac '+' bd ++ ADD1 c09, c14, c09 ++ ADD1 c03, c08, c03 # ++ ADD1 c11, c16, c11 ++ ++ ADD2 c05, c02, c02 # bc '+' ad ++ ADD2 c13, c10, c10 ++ ADD2 c07, c04, c04 ++ ADD2 c15, c12, c12 ++ ++ FMAD5 c01, alpha_r, $f31, b1 ++ FMAD5 c09, alpha_r, $f31, b3 ++ FMAD5 c03, alpha_r, $f31, a5 ++ FMAD5 c11, alpha_r, $f31, a7 ++ ++ FMAD7 c02, alpha_r, $f31, b2 ++ FMAD7 c10, alpha_r, $f31, b4 ++ FMAD7 c04, alpha_r, $f31, a6 ++ FMAD7 c12, alpha_r, $f31, a8 ++ ++ FMAD8 c02, alpha_i, b1, b1 ++ FMAD8 c10, alpha_i, b3, b3 ++ FMAD8 c04, alpha_i, a5, a5 ++ FMAD8 c12, alpha_i, a7, a7 ++ ++ FMAD6 c01, alpha_i, b2, b2 ++ FMAD6 c09, alpha_i, b4, b4 ++ FMAD6 c03, alpha_i, a6, a6 ++ FMAD6 c11, alpha_i, a8, a8 ++ ++ ST b1, 0 * SIZE(C1) ++ ST b2, 1 * SIZE(C1) ++ ST b3, 2 * SIZE(C1) ++ ST b4, 3 * SIZE(C1) ++ ++ ST a5, 0 * SIZE(C2) ++ ST a6, 1 * SIZE(C2) ++ ST a7, 2 * SIZE(C2) ++ ST a8, 3 * SIZE(C2) ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 2, TEMP ++#else ++ subl TEMP, 2, TEMP ++#endif ++ ++ sll TEMP, 1 + ZBASE_SHIFT, L ++ sll TEMP, 1 + ZBASE_SHIFT, TEMP ++ ++ addl AO, L, AO ++ addl BO, TEMP, BO ++#endif ++ ++#ifdef LEFT ++ addl KK, 2, KK ++#endif ++#endif ++ ++ addl C1, 4*SIZE, C1 ++ addl C2, 4*SIZE, C2 ++ ++ ++ .align 4 ++$L40: ++ and M, 1, I # I=M&1 ++ ble I, $L09 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B, BO ++ nop ++#else ++ sll KK, ZBASE_SHIFT, L # mr=1 ++ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2 ++ ++ addl AO, L, AO ++ addl B, TEMP, 
BO ++#endif ++ ++ fillcs 0*SIZE(C1) ++ fillcs 0*SIZE(C2) ++ ++ fclr c01 ++ fclr c02 ++ fclr c03 ++ fclr c04 ++ fclr c05 ++ fclr c06 ++ fclr c07 ++ fclr c08 ++ ++ LD b1, 0*SIZE(BO) # b1 real part ++ LD b2, 1*SIZE(BO) # b1 image part ++ LD b3, 2*SIZE(BO) # b2 real part ++ LD b4, 3*SIZE(BO) # b2 image part ++ ++ LD a1, 0*SIZE(AO) # a1 real part ++ LD a2, 1*SIZE(AO) # a1 image part ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl K, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 1, TEMP # mr=1 ++#else ++ addl KK, 2, TEMP # nr=2 ++#endif ++ sra TEMP, 1, L ++ ++ ble L, $L45 ++ ++#else ++ mov B, BO # Set B, (block A x panel Bj) ++ sra K, 1, L # Unroll K as 2 ++ ++ fillcs 0*SIZE(C1) ++ fillcs 0*SIZE(C2) ++ ++ fclr c01 ++ fclr c02 ++ fclr c03 ++ fclr c04 ++ fclr c05 ++ fclr c06 ++ fclr c07 ++ fclr c08 ++ ++ LD b1, 0*SIZE(BO) # b1 real part ++ LD b2, 1*SIZE(BO) # b1 image part ++ LD b3, 2*SIZE(BO) # b2 real part ++ LD b4, 3*SIZE(BO) # b2 image part ++ ++ LD a1, 0*SIZE(AO) # a1 real part ++ LD a2, 1*SIZE(AO) # a1 image part ++ ++ ble L, $L45 ++#endif ++ ++ .align 4 ++$L42: ++ MAD a1,b1,c01,c01 # C11 real part ++ MAD a1,b2,c02,c02 # C11 imag part ++ MAD a1,b3,c03,c03 # C21 real part ++ MAD a1,b4,c04,c04 # C21 imag part ++ ++ LD b5, 4 * SIZE(BO) # next B1R ++ LD b6, 5 * SIZE(BO) # next B1I ++ LD b7, 6 * SIZE(BO) # next B2R ++ LD b8, 7 * SIZE(BO) # next B2I ++ ++ LD a5, 2 * SIZE(AO) # next A1-A4 real part ++ LD a6, 3 * SIZE(AO) # next A1-A4 image part ++ ++ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE ++ MAD a2,b1,c05,c05 # C11 image part ++ MAD a2,b2,c06,c06 # C11 real part ++ MAD a2,b3,c07,c07 # C21 image part ++ MAD a2,b4,c08,c08 # C21 real part ++ ++ subl L, 1, L # ++ ++ addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE ++ MAD a5,b5,c01,c01 ++ MAD a5,b6,c02,c02 ++ MAD a5,b7,c03,c03 ++ MAD a5,b8,c04,c04 ++ ++ LD b1, 0 * SIZE(BO) ++ LD b2, 1 * SIZE(BO) ++ LD b3, 2 * SIZE(BO) ++ LD b4, 3 * SIZE(BO) ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MAD a6,b5,c05,c05 ++ MAD a6,b6,c06,c06 ++ MAD a6,b7,c07,c07 ++ MAD a6,b8,c08,c08 ++ ++ bne L, $L42 # continue K ++ ++$L45: ++ LD alpha_r, ALPHA_R # $f30==b8 ++#ifndef TRMMKERNEL ++ blbc K, $L48 # if(K&1) ++#else ++ blbc TEMP, $L48 ++#endif ++ ++$L46: ++ addl AO, 2*SIZE, AO # AO+=8mr*1kr*2px*SIZE ++ MAD a1,b1,c01,c01 # C11 real part ++ MAD a1,b2,c02,c02 # C11 imag part ++ MAD a1,b3,c03,c03 # C21 real part ++ MAD a1,b4,c04,c04 # C21 imag part ++ ++ addl BO, 4*SIZE, BO ++ MAD a2,b1,c05,c05 # C11 image part ++ MAD a2,b2,c06,c06 # C11 real part ++ MAD a2,b3,c07,c07 # C21 image part ++ MAD a2,b4,c08,c08 # C21 real part ++ ++ ++$L48: # Write back ++ LD alpha_i, ALPHA_I # $f29==b7 ++#ifndef TRMMKERNEL ++ ADD1 c01, c06, c01 ++ ADD1 c03, c08, c03 ++ ADD2 c05, c02, c02 ++ ADD2 c07, c04, c04 ++ ++ LD b1, 0 * SIZE(C1) ++ LD b2, 1 * SIZE(C1) ++ ++ LD a5, 0 * SIZE(C2) ++ LD a6, 1 * SIZE(C2) ++ ++ FMAD5 c01, alpha_r, b1, b1 ++ FMAD5 c03, alpha_r, a5, a5 ++ ++ FMAD7 c02, alpha_r, b2, b2 ++ FMAD7 c04, alpha_r, a6, a6 ++ ++ FMAD8 c02, alpha_i, b1, b1 ++ FMAD8 c04, alpha_i, a5, a5 ++ ++ FMAD6 c01, alpha_i, b2, b2 ++ FMAD6 c03, alpha_i, a6, a6 ++ ++ ST b1, 0 * SIZE(C1) ++ ST b2, 1 * SIZE(C1) ++ ++ ST a5, 0 * SIZE(C2) ++ ST a6, 1 * SIZE(C2) ++ ++#else ++ ++ ADD1 c01, c06, c01 ++ ADD1 c03, c08, c03 ++ ADD2 c05, c02, c02 ++ ADD2 c07, c04, c04 ++ ++ FMAD5 c01, alpha_r, $f31, b1 ++ FMAD5 c03, alpha_r, $f31, a5 ++ ++ FMAD7 c02, alpha_r, $f31, b2 ++ FMAD7 c04, alpha_r, $f31, a6 ++ ++ FMAD8 c02, alpha_i, b1, b1 ++ FMAD8 c04, alpha_i, a5, a5 ++ ++ FMAD6 
c01, alpha_i, b2, b2 ++ FMAD6 c03, alpha_i, a6, a6 ++ ++ ST b1, 0 * SIZE(C1) ++ ST b2, 1 * SIZE(C1) ++ ++ ST a5, 0 * SIZE(C2) ++ ST a6, 1 * SIZE(C2) ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 1, TEMP ++#else ++ subl TEMP, 2, TEMP ++#endif ++ ++ sll TEMP, ZBASE_SHIFT, L ++ sll TEMP, 1 + ZBASE_SHIFT, TEMP ++ ++ addl AO, L, AO ++ addl BO, TEMP,BO ++#endif ++ ++#ifdef LEFT ++ addl KK, 1, KK ++#endif ++#endif ++ ++ addl C1, 2*SIZE, C1 ++ addl C2, 2*SIZE, C2 ++ ++ ++ .align 4 ++ ++$L09: ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ addl KK, 2, KK # nr=2 ++ nop ++#endif ++ mov BO, B # Change B to next panel ++ subl J, 1, J # J-- ++ bgt J, $L01 ++ ++ ++ .align 4 ++$L50: ++ and N, 1, J ++ ble J, $L999 # Finish! ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ mov OFFSET, KK # reset KK ++#endif ++ ++ sra M, 3, I # I=M/8 ++ sll K, 1 + ZBASE_SHIFT, PREA ++ ++ mov C, C1 ++ mov A, AO # Reset A ++ ++ addl A, PREA, PREA ++ beq I, $L60 # GEMM_MR=8 ++ ++ ++$L51: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA))\ ++ || (!defined(LEFT) && !defined(TRANSA)) ++ mov B, BO ++#else ++ sll KK, 3 + ZBASE_SHIFT,L # mr=8 ++ sll KK, ZBASE_SHIFT,TEMP # nr=1 ++ ++ addl AO, L, AO ++ addl B, TEMP, BO ++#endif ++ ++ fillcs 0(C1) ++ fillcs 4*SIZE(C1) ++ fillcs 8*SIZE(C1) ++ fillcs 12*SIZE(C1) ++ fillcs 16*SIZE(C1) ++ ++ vcpys $f31,$f31,c01 # Clear result regs ++ vcpys $f31,$f31,c02 ++ ++ vcpys $f31,$f31,c05 ++ vcpys $f31,$f31,c06 ++ ++ vcpys $f31,$f31,c09 ++ vcpys $f31,$f31,c10 ++ ++ vcpys $f31,$f31,c13 ++ vcpys $f31,$f31,c14 ++ ++ LDDE b1, 0 * SIZE(BO) # B1R ++ LDDE b2, 1 * SIZE(BO) # B1I ++ ++ VLD a1, 0 * SIZE(AO) # A1, A2 ++ VLD a2, 4 * SIZE(AO) # A3, A4 ++ VLD a3, 8 * SIZE(AO) # A5, A6 ++ VLD a4,12 * SIZE(AO) # A7, A8 ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl K, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 8, TEMP # mr=8 ++#else ++ addl KK, 1, TEMP # nr=1 ++#endif ++ sra TEMP, 1, L ++ ble L, $L55 ++ ++#else ++ mov B, BO # Set B, (block A x panel Bj) ++ sra K, 1, L # Unroll K as 2 ++ ++ fillcs 0(C1) ++ fillcs 4*SIZE(C1) ++ fillcs 8*SIZE(C1) ++ fillcs 12*SIZE(C1) ++ fillcs 16*SIZE(C1) ++ ++ vcpys $f31,$f31,c01 # Clear result regs ++ vcpys $f31,$f31,c02 ++ ++ vcpys $f31,$f31,c05 ++ vcpys $f31,$f31,c06 ++ ++ vcpys $f31,$f31,c09 ++ vcpys $f31,$f31,c10 ++ ++ vcpys $f31,$f31,c13 ++ vcpys $f31,$f31,c14 ++ ++ LDDE b1, 0 * SIZE(BO) # B1R ++ LDDE b2, 1 * SIZE(BO) # B1I ++ ++ VLD a1, 0 * SIZE(AO) # A1, A2 ++ VLD a2, 4 * SIZE(AO) # A3, A4 ++ VLD a3, 8 * SIZE(AO) # A5, A6 ++ VLD a4,12 * SIZE(AO) # A7, A8 ++ ++ ble L, $L55 ++#endif ++ ++ .align 4 ++$L52: ++ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE ++ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) ++ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) ++ ++ LDDE b5, 2 * SIZE(BO) # next B1R ++ LDDE b6, 3 * SIZE(BO) # next B1I ++ ++ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE ++ VMAD a2,b1,c05,c05 # C31, C41 ++ VMAD a2,b2,c06,c06 # C31, C41 ++ ++ VLD a5, 0 * SIZE(AO) # next A1, A2, a5==a0 ++ VLD a6, 4 * SIZE(AO) # next A3, A4 ++ VLD a7, 8 * SIZE(AO) # next A5, A6 ++ VLD a8,12 * SIZE(AO) # next A7, A8 ++ ++ VMAD a3,b1,c09,c09 # C51, C61 ++ VMAD a3,b2,c10,c10 # C51, C61 ++ ++ fillcs 0(PREA) ++ VMAD a4,b1,c13,c13 # C71, C81 ++ VMAD a4,b2,c14,c14 # C71, C81 ++ ++ subl L, 1, L # ++ ++ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE ++ VMAD a5,b5,c01,c01 ++ VMAD a5,b6,c02,c02 ++ ++ addl PREA, 16*SIZE, PREA ++ LDDE b1, 0 * SIZE(BO) ++ LDDE b2, 1 * 
SIZE(BO) ++ ++ VMAD a6,b5,c05,c05 ++ VMAD a6,b6,c06,c06 ++ ++ VLD a1, 0 * SIZE(AO) ++ VLD a2, 4 * SIZE(AO) ++ VLD a3, 8 * SIZE(AO) ++ VLD a4,12 * SIZE(AO) ++ ++ VMAD a7,b5,c09,c09 ++ VMAD a7,b6,c10,c10 ++ ++ fillcs 0(PREA) ++ VMAD a8,b5,c13,c13 ++ VMAD a8,b6,c14,c14 ++ ++ addl PREA, 16*SIZE, PREA ++ bne L, $L52 # continue K ++ ++$L55: ++ LD alpha_r, ALPHA_R # $f30==b8 ++#ifndef TRMMKERNEL ++ blbc K, $L58 # if(K&1) ++#else ++ blbc TEMP, $L58 ++#endif ++ ++$L56: ++ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE ++ VMAD a1,b1,c01,c01 # C11R C21R ++ VMAD a1,b2,c02,c02 # C11I C21I ++ ++ addl BO, 2*SIZE, BO ++ VMAD a2,b1,c05,c05 # C31R C41R ++ VMAD a2,b2,c06,c06 # C31I C41I ++ ++ VMAD a3,b1,c09,c09 # C51R C61R ++ VMAD a3,b2,c10,c10 # C51I C61I ++ ++ VMAD a4,b1,c13,c13 # C71R C81R ++ VMAD a4,b2,c14,c14 # C71I C81I ++ ++$L58: # Write back ++ LD alpha_i, ALPHA_I # $f29==b7 ++#ifndef TRMMKERNEL ++ vextf c01, 0, a1 # a1=C11R_ac ++ vextf c01, 1, a2 # a2=C11I_bc ++ vextf c01, 2, a3 # a3=C21R_ac ++ vextf c01, 3, a4 # a4=C21I_bc ++ ++ vextf c02, 0, b1 # b1=C11I_ad ++ vextf c02, 1, b2 # b2=C11R_bd ++ vextf c02, 2, b3 # b3=C21I_ad ++ vextf c02, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 0 * SIZE(C1) ++ LD a2, 1 * SIZE(C1) ++ LD a3, 2 * SIZE(C1) ++ LD a4, 3 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 0 * SIZE(C1) ++ ST c01, 1 * SIZE(C1) ++ ST b6, 2 * SIZE(C1) ++ ST c02, 3 * SIZE(C1) ++ ++ vextf c05, 0, a1 # a1=C11R_ac ++ vextf c05, 1, a2 # a2=C11I_bc ++ vextf c05, 2, a3 # a3=C21R_ac ++ vextf c05, 3, a4 # a4=C21I_bc ++ ++ vextf c06, 0, b1 # b1=C11I_ad ++ vextf c06, 1, b2 # b2=C11R_bd ++ vextf c06, 2, b3 # b3=C21I_ad ++ vextf c06, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 4 * SIZE(C1) ++ LD a2, 5 * SIZE(C1) ++ LD a3, 6 * SIZE(C1) ++ LD a4, 7 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 4 * SIZE(C1) ++ ST c01, 5 * SIZE(C1) ++ ST b6, 6 * SIZE(C1) ++ ST c02, 7 * SIZE(C1) ++ ++ vextf c09, 0, a1 # a1=C11R_ac ++ vextf c09, 1, a2 # a2=C11I_bc ++ vextf c09, 2, a3 # a3=C21R_ac ++ vextf c09, 3, a4 # a4=C21I_bc ++ ++ vextf c10, 0, b1 # b1=C11I_ad ++ vextf c10, 1, b2 # b2=C11R_bd ++ vextf c10, 2, b3 # b3=C21I_ad ++ vextf c10, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 8 * SIZE(C1) ++ LD a2, 9 * SIZE(C1) ++ LD a3, 10 * SIZE(C1) ++ LD a4, 11 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 8 * SIZE(C1) ++ ST c01, 9 * SIZE(C1) ++ ST b6, 10 * SIZE(C1) ++ ST c02, 11 * SIZE(C1) ++ ++ vextf c13, 0, a1 # a1=C11R_ac ++ vextf c13, 1, a2 # a2=C11I_bc ++ vextf c13, 2, a3 # a3=C21R_ac ++ vextf c13, 3, a4 # a4=C21I_bc ++ ++ vextf c14, 0, b1 # b1=C11I_ad ++ vextf c14, 
1, b2 # b2=C11R_bd ++ vextf c14, 2, b3 # b3=C21I_ad ++ vextf c14, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 12 * SIZE(C1) ++ LD a2, 13 * SIZE(C1) ++ LD a3, 14 * SIZE(C1) ++ LD a4, 15 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 12 * SIZE(C1) ++ ST c01, 13 * SIZE(C1) ++ ST b6, 14 * SIZE(C1) ++ ST c02, 15 * SIZE(C1) ++ ++#else ++ ++ vextf c01, 0, a1 # a1=C11R_ac ++ vextf c01, 1, a2 # a2=C11I_bc ++ vextf c01, 2, a3 # a3=C21R_ac ++ vextf c01, 3, a4 # a4=C21I_bc ++ ++ vextf c02, 0, b1 # b1=C11I_ad ++ vextf c02, 1, b2 # b2=C11R_bd ++ vextf c02, 2, b3 # b3=C21I_ad ++ vextf c02, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 0 * SIZE(C1) ++ ST c01, 1 * SIZE(C1) ++ ST b6, 2 * SIZE(C1) ++ ST c02, 3 * SIZE(C1) ++ ++ vextf c05, 0, a1 # a1=C11R_ac ++ vextf c05, 1, a2 # a2=C11I_bc ++ vextf c05, 2, a3 # a3=C21R_ac ++ vextf c05, 3, a4 # a4=C21I_bc ++ ++ vextf c06, 0, b1 # b1=C11I_ad ++ vextf c06, 1, b2 # b2=C11R_bd ++ vextf c06, 2, b3 # b3=C21I_ad ++ vextf c06, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 4 * SIZE(C1) ++ ST c01, 5 * SIZE(C1) ++ ST b6, 6 * SIZE(C1) ++ ST c02, 7 * SIZE(C1) ++ ++ vextf c09, 0, a1 # a1=C11R_ac ++ vextf c09, 1, a2 # a2=C11I_bc ++ vextf c09, 2, a3 # a3=C21R_ac ++ vextf c09, 3, a4 # a4=C21I_bc ++ ++ vextf c10, 0, b1 # b1=C11I_ad ++ vextf c10, 1, b2 # b2=C11R_bd ++ vextf c10, 2, b3 # b3=C21I_ad ++ vextf c10, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 8 * SIZE(C1) ++ ST c01, 9 * SIZE(C1) ++ ST b6, 10 * SIZE(C1) ++ ST c02, 11 * SIZE(C1) ++ ++ vextf c13, 0, a1 # a1=C11R_ac ++ vextf c13, 1, a2 # a2=C11I_bc ++ vextf c13, 2, a3 # a3=C21R_ac ++ vextf c13, 3, a4 # a4=C21I_bc ++ ++ vextf c14, 0, b1 # b1=C11I_ad ++ vextf c14, 1, b2 # b2=C11R_bd ++ vextf c14, 2, b3 # b3=C21I_ad ++ vextf c14, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 12 * SIZE(C1) ++ ST c01, 13 * 
SIZE(C1) ++ ST b6, 14 * SIZE(C1) ++ ST c02, 15 * SIZE(C1) ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 8, TEMP ++#else ++ subl TEMP, 1, TEMP ++#endif ++ ++ sll TEMP, 3 + ZBASE_SHIFT,L ++ sll TEMP, ZBASE_SHIFT,TEMP ++ ++ addl AO, L, AO ++ addl BO, TEMP, BO ++#endif ++ ++#ifdef LEFT ++ addl KK, 8, KK ++#endif ++#endif ++ ++ jmp $L999 ++ ++ ++ .align 4 ++$L60: ++ and M, 4, I ++ ble I, $L70 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA))\ ++ || (!defined(LEFT) && !defined(TRANSA)) ++ mov B, BO ++#else ++ sll KK, 2 + ZBASE_SHIFT,L # mr=4 ++ sll KK, ZBASE_SHIFT,TEMP # nr=1 ++ ++ addl AO, L, AO ++ addl B, TEMP, BO ++#endif ++ ++ fillcs 0(C1) ++ fillcs 4*SIZE(C1) ++ fillcs 8*SIZE(C1) ++ ++ vcpys $f31,$f31,c01 # Clear result regs ++ vcpys $f31,$f31,c02 ++ ++ vcpys $f31,$f31,c05 ++ vcpys $f31,$f31,c06 ++ ++ LDDE b1, 0 * SIZE(BO) # B1R ++ LDDE b2, 1 * SIZE(BO) # B1I ++ ++ VLD a1, 0 * SIZE(AO) # A1, A2 ++ VLD a2, 4 * SIZE(AO) # A3, A4 ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl K, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 4, TEMP # mr=4 ++#else ++ addl KK, 1, TEMP # nr=1 ++#endif ++ sra TEMP, 1, L ++ ble L, $L65 ++ ++#else ++ ++ mov B, BO # Set B, (block A x panel Bj) ++ sra K, 1, L # Unroll K as 2 ++ ++ fillcs 0(C1) ++ fillcs 4*SIZE(C1) ++ fillcs 8*SIZE(C1) ++ ++ vcpys $f31,$f31,c01 # Clear result regs ++ vcpys $f31,$f31,c02 ++ ++ vcpys $f31,$f31,c05 ++ vcpys $f31,$f31,c06 ++ ++ LDDE b1, 0 * SIZE(BO) # B1R ++ LDDE b2, 1 * SIZE(BO) # B1I ++ ++ VLD a1, 0 * SIZE(AO) # A1, A2 ++ VLD a2, 4 * SIZE(AO) # A3, A4 ++ ++ ble L, $L65 ++#endif ++ ++ .align 4 ++$L62: ++ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc) ++ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd) ++ ++ LDDE b5, 2 * SIZE(BO) # next B1R ++ LDDE b6, 3 * SIZE(BO) # next B1I ++ ++ addl BO, 4*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE ++ VMAD a2,b1,c05,c05 # C31, C41 ++ VMAD a2,b2,c06,c06 # C31, C41 ++ ++ fillcs 0(PREA) ++ VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0 ++ VLD a6, 12 * SIZE(AO) # next A3, A4 ++ ++ subl L, 1, L # ++ ++ addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE ++ VMAD a5,b5,c01,c01 ++ VMAD a5,b6,c02,c02 ++ ++ addl PREA, 16*SIZE, PREA ++ LDDE b1, 0 * SIZE(BO) ++ LDDE b2, 1 * SIZE(BO) ++ ++ fillcs 0(PREA) ++ VMAD a6,b5,c05,c05 ++ VMAD a6,b6,c06,c06 ++ ++ VLD a1, 0 * SIZE(AO) ++ VLD a2, 4 * SIZE(AO) ++ ++ addl PREA, 16*SIZE, PREA ++ bne L, $L62 # continue K ++ ++$L65: ++ LD alpha_r, ALPHA_R # $f30==b8 ++#ifndef TRMMKERNEL ++ blbc K, $L68 # if(K&1) ++#else ++ blbc TEMP, $L68 ++#endif ++ ++$L66: ++ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE ++ VMAD a1,b1,c01,c01 # C11R C21R ++ VMAD a1,b2,c02,c02 # C11I C21I ++ ++ addl BO, 2*SIZE, BO ++ VMAD a2,b1,c05,c05 # C31R C41R ++ VMAD a2,b2,c06,c06 # C31I C41I ++ ++$L68: # Write back ++ LD alpha_i, ALPHA_I # $f29==b7 ++#ifndef TRMMKERNEL ++ vextf c01, 0, a1 # a1=C11R_ac ++ vextf c01, 1, a2 # a2=C11I_bc ++ vextf c01, 2, a3 # a3=C21R_ac ++ vextf c01, 3, a4 # a4=C21I_bc ++ ++ vextf c02, 0, b1 # b1=C11I_ad ++ vextf c02, 1, b2 # b2=C11R_bd ++ vextf c02, 2, b3 # b3=C21I_ad ++ vextf c02, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 0 * SIZE(C1) ++ LD a2, 1 * SIZE(C1) ++ LD a3, 2 * SIZE(C1) ++ LD a4, 3 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, 
b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 0 * SIZE(C1) ++ ST c01, 1 * SIZE(C1) ++ ST b6, 2 * SIZE(C1) ++ ST c02, 3 * SIZE(C1) ++ ++ vextf c05, 0, a1 # a1=C11R_ac ++ vextf c05, 1, a2 # a2=C11I_bc ++ vextf c05, 2, a3 # a3=C21R_ac ++ vextf c05, 3, a4 # a4=C21I_bc ++ ++ vextf c06, 0, b1 # b1=C11I_ad ++ vextf c06, 1, b2 # b2=C11R_bd ++ vextf c06, 2, b3 # b3=C21I_ad ++ vextf c06, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ LD a1, 4 * SIZE(C1) ++ LD a2, 5 * SIZE(C1) ++ LD a3, 6 * SIZE(C1) ++ LD a4, 7 * SIZE(C1) ++ ++ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, a3, a3 ++ FMAD7 a7, alpha_r, a2, a2 ++ FMAD7 a8, alpha_r, a4, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 4 * SIZE(C1) ++ ST c01, 5 * SIZE(C1) ++ ST b6, 6 * SIZE(C1) ++ ST c02, 7 * SIZE(C1) ++ ++#else ++ ++ vextf c01, 0, a1 # a1=C11R_ac ++ vextf c01, 1, a2 # a2=C11I_bc ++ vextf c01, 2, a3 # a3=C21R_ac ++ vextf c01, 3, a4 # a4=C21I_bc ++ ++ vextf c02, 0, b1 # b1=C11I_ad ++ vextf c02, 1, b2 # b2=C11R_bd ++ vextf c02, 2, b3 # b3=C21I_ad ++ vextf c02, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 0 * SIZE(C1) ++ ST c01, 1 * SIZE(C1) ++ ST b6, 2 * SIZE(C1) ++ ST c02, 3 * SIZE(C1) ++ ++ vextf c05, 0, a1 # a1=C11R_ac ++ vextf c05, 1, a2 # a2=C11I_bc ++ vextf c05, 2, a3 # a3=C21R_ac ++ vextf c05, 3, a4 # a4=C21I_bc ++ ++ vextf c06, 0, b1 # b1=C11I_ad ++ vextf c06, 1, b2 # b2=C11R_bd ++ vextf c06, 2, b3 # b3=C21I_ad ++ vextf c06, 3, b4 # b4=C21R_bd ++ ++ ADD1 a1, b2, b5 # ac '+' bd ++ ADD1 a3, b4, a6 ++ ADD2 a2, b1, a7 # bc '+' ad ++ ADD2 a4, b3, a8 ++ ++ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1 ++ FMAD5 a6, alpha_r, $f31, a3 ++ FMAD7 a7, alpha_r, $f31, a2 ++ FMAD7 a8, alpha_r, $f31, a4 ++ ++ FMAD8 a7, alpha_i, a1, b4 ++ FMAD8 a8, alpha_i, a3, b6 ++ FMAD6 b5, alpha_i, a2, c01 ++ FMAD6 a6, alpha_i, a4, c02 ++ ++ ST b4, 4 * SIZE(C1) ++ ST c01, 5 * SIZE(C1) ++ ST b6, 6 * SIZE(C1) ++ ST c02, 7 * SIZE(C1) ++ ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl K, KK,TEMP ++#ifdef LEFT ++ subl TEMP, 4, TEMP # mr=4 ++#else ++ subl TEMP, 1, TEMP # nr=1 ++#endif ++ ++ sll TEMP, 2 + ZBASE_SHIFT, L ++ sll TEMP, ZBASE_SHIFT,TEMP ++ ++ addl AO, L, AO ++ addl BO,TEMP, BO ++#endif ++ ++#ifdef LEFT ++ addl KK,4,KK ++#endif ++#endif ++ ++ addl C1, 8*SIZE, C1 ++ ++ ++ .align 4 ++$L70: ++ and M, 2, I # I=M&2 ++ ble I, $L80 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B, BO ++ nop ++#else ++ sll KK, 1 + ZBASE_SHIFT, L # mr=2 ++ sll KK, ZBASE_SHIFT,TEMP # nr=1 ++ ++ addl AO, L, AO ++ addl B, TEMP, BO ++#endif ++ ++ fillcs 0*SIZE(C1) ++ fillcs 4*SIZE(C1) ++ ++ fclr c01 ++ fclr c02 # CLEAR 8 register ++ fclr c03 ++ fclr c04 ++ fclr c05 ++ fclr c06 ++ fclr c07 ++ fclr c08 ++ ++ LD b1, 0*SIZE(BO) # b1 real part ++ LD b2, 1*SIZE(BO) # b1 image part ++ ++ LD a1, 0*SIZE(AO) # a1 real part ++ LD a2, 1*SIZE(AO) # a1 image part ++ LD a3, 2*SIZE(AO) # 
a2 real part ++ LD a4, 3*SIZE(AO) # a2 image part ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl K, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 2, TEMP # mr=2 ++#else ++ addl KK, 1, TEMP # nr=1 ++#endif ++ sra TEMP, 1, L ++ ble L, $L75 ++ ++#else ++ mov B, BO # Set B, (block A x panel Bj) ++ sra K, 1, L # Unroll K as 2 ++ ++ fillcs 0*SIZE(C1) ++ fillcs 4*SIZE(C1) ++ ++ fclr c01 ++ fclr c02 # CLEAR 8 register ++ fclr c03 ++ fclr c04 ++ fclr c05 ++ fclr c06 ++ fclr c07 ++ fclr c08 ++ ++ LD b1, 0*SIZE(BO) # b1 real part ++ LD b2, 1*SIZE(BO) # b1 image part ++ ++ LD a1, 0*SIZE(AO) # a1 real part ++ LD a2, 1*SIZE(AO) # a1 image part ++ LD a3, 2*SIZE(AO) # a2 real part ++ LD a4, 3*SIZE(AO) # a2 image part ++ ++ ble L, $L75 ++#endif ++ ++ .align 4 ++$L72: ++ MAD a1,b1,c01,c01 # C11 real part ++ MAD a1,b2,c02,c02 # C11 imag part ++ ++ LD b5, 2 * SIZE(BO) # next B1R ++ LD b6, 3 * SIZE(BO) # next B1I ++ ++ LD a5, 4 * SIZE(AO) # next A1-A4 real part ++ LD a6, 5 * SIZE(AO) # next A1-A4 image part ++ LD a7, 6 * SIZE(AO) ++ LD a8, 7 * SIZE(AO) ++ ++ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE ++ MAD a2,b1,c03,c03 # C11 image part ++ MAD a2,b2,c04,c04 # C11 real part ++ ++ MAD a3,b1,c05,c05 # C12 real part ++ MAD a3,b2,c06,c06 # C12 imag part ++ ++ MAD a4,b1,c07,c07 # C12 image part ++ MAD a4,b2,c08,c08 # C12 real part ++ ++ subl L, 1, L # ++ ++ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE ++ MAD a5,b5,c01,c01 ++ MAD a5,b6,c02,c02 ++ ++ LD b1, 0 * SIZE(BO) ++ LD b2, 1 * SIZE(BO) ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MAD a6,b5,c03,c03 ++ MAD a6,b6,c04,c04 ++ ++ MAD a7,b5,c05,c05 ++ MAD a7,b6,c06,c06 ++ ++ MAD a8,b5,c07,c07 ++ MAD a8,b6,c08,c08 ++ ++ bne L, $L72 # continue K ++ ++$L75: ++ LD alpha_r, ALPHA_R # $f30==b8 ++#ifndef TRMMKERNEL ++ blbc K, $L78 # if(K&1) ++#else ++ blbc TEMP, $L78 ++#endif ++ ++$L76: ++ addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE ++ MAD a1,b1,c01,c01 # C11 real part ++ MAD a1,b2,c02,c02 # C11 imag part ++ ++ addl BO, 4*SIZE, BO ++ MAD a2,b1,c03,c03 # C11 image part ++ MAD a2,b2,c04,c04 # C11 real part ++ ++ MAD a3,b1,c05,c05 # C12 real part ++ MAD a3,b2,c06,c06 # C12 imag part ++ ++ MAD a4,b1,c07,c07 # C12 image part ++ MAD a4,b2,c08,c08 # C12 real part ++ ++ ++ ++$L78: # Write back ++ LD alpha_i, ALPHA_I # $f29==b7 ++#ifndef TRMMKERNEL ++ ADD1 c01, c04, c01 ++ ADD1 c05, c08, c05 ++ ADD2 c03, c02, c02 ++ ADD2 c07, c06, c06 ++ ++ LD b1, 0 * SIZE(C1) ++ LD b2, 1 * SIZE(C1) ++ LD b3, 2 * SIZE(C1) ++ LD b4, 3 * SIZE(C1) ++ ++ FMAD5 c01, alpha_r, b1, b1 ++ FMAD5 c05, alpha_r, b3, b3 ++ FMAD7 c02, alpha_r, b2, b2 ++ FMAD7 c06, alpha_r, b4, b4 ++ ++ FMAD8 c02, alpha_i, b1, b1 ++ FMAD8 c06, alpha_i, b3, b3 ++ FMAD6 c01, alpha_i, b2, b2 ++ FMAD6 c05, alpha_i, b4, b4 ++ ++ ST b1, 0 * SIZE(C1) ++ ST b2, 1 * SIZE(C1) ++ ST b3, 2 * SIZE(C1) ++ ST b4, 3 * SIZE(C1) ++ ++#else ++ ++ ADD1 c01, c04, c01 ++ ADD1 c05, c08, c05 ++ ADD2 c03, c02, c02 ++ ADD2 c07, c06, c06 ++ ++ FMAD5 c01, alpha_r, $f31, b1 ++ FMAD5 c05, alpha_r, $f31, b3 ++ FMAD7 c02, alpha_r, $f31, b2 ++ FMAD7 c06, alpha_r, $f31, b4 ++ ++ FMAD8 c02, alpha_i, b1, b1 ++ FMAD8 c06, alpha_i, b3, b3 ++ FMAD6 c01, alpha_i, b2, b2 ++ FMAD6 c05, alpha_i, b4, b4 ++ ++ ST b1, 0 * SIZE(C1) ++ ST b2, 1 * SIZE(C1) ++ ST b3, 2 * SIZE(C1) ++ ST b4, 3 * SIZE(C1) ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 2, TEMP ++#else ++ subl TEMP, 1, TEMP ++#endif ++ 
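++/* Editor note (assumption, not stated in the original source): this looks like the */
++/* standard OpenBLAS TRMM tail bookkeeping for the mr=2, nr=1 complex tile. The two */
++/* shifts below scale the leftover K count by the complex element size so that AO */
++/* and BO are advanced past the panel data this tile did not consume. */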
++ sll TEMP, 1 + ZBASE_SHIFT, L ++ sll TEMP, ZBASE_SHIFT, TEMP ++ ++ addl AO, L, AO ++ addl BO, TEMP, BO ++#endif ++ ++#ifdef LEFT ++ addl KK, 2, KK ++#endif ++#endif ++ ++ addl C1, 4*SIZE, C1 ++ ++ ++ .align 4 ++$L80: ++ and M, 1, I # I=M&1 ++ ble I, $L999 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ mov B, BO ++ nop ++#else ++ sll KK, ZBASE_SHIFT, L # mr=1 ++ sll KK, ZBASE_SHIFT,TEMP # nr=1 ++ ++ addl AO, L, AO ++ addl B, TEMP, BO ++#endif ++ ++ fillcs 0*SIZE(C1) ++ ++ fclr c01 # CLEAR 8 register ++ fclr c02 ++ fclr c03 ++ fclr c04 ++ ++ LD b1, 0*SIZE(BO) # b1 real part ++ LD b2, 1*SIZE(BO) # b1 image part ++ ++ LD a1, 0*SIZE(AO) # a1 real part ++ LD a2, 1*SIZE(AO) # a1 image part ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ subl K, KK, TEMP ++#elif defined(LEFT) ++ addl KK, 1, TEMP # mr=1 ++#else ++ addl KK, 1, TEMP # nr=1 ++#endif ++ sra TEMP, 1, L ++ ble L, $L85 ++ ++#else ++ mov B, BO # Set B, (block A x panel Bj) ++ sra K, 1, L # Unroll K as 2 ++ ++ fillcs 0*SIZE(C1) ++ ++ fclr c01 # CLEAR 8 register ++ fclr c02 ++ fclr c03 ++ fclr c04 ++ ++ LD b1, 0*SIZE(BO) # b1 real part ++ LD b2, 1*SIZE(BO) # b1 image part ++ ++ LD a1, 0*SIZE(AO) # a1 real part ++ LD a2, 1*SIZE(AO) # a1 image part ++ ++ ble L, $L85 ++#endif ++ ++ .align 4 ++$L82: ++ MAD a1,b1,c01,c01 # C11 real part ++ MAD a1,b2,c02,c02 # C11 imag part ++ ++ LD b5, 2 * SIZE(BO) # next B1R ++ LD b6, 3 * SIZE(BO) # next B1I ++ ++ LD a5, 2 * SIZE(AO) # next A1-A4 real part ++ LD a6, 3 * SIZE(AO) # next A1-A4 image part ++ ++ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE ++ MAD a2,b1,c03,c03 # C11 image part ++ MAD a2,b2,c04,c04 # C11 real part ++ ++ subl L, 1, L # ++ ++ addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE ++ MAD a5,b5,c01,c01 ++ MAD a5,b6,c02,c02 ++ ++ LD b1, 0 * SIZE(BO) ++ LD b2, 1 * SIZE(BO) ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MAD a6,b5,c03,c03 ++ MAD a6,b6,c04,c04 ++ ++ bne L, $L82 # continue K ++ ++$L85: ++ LD alpha_r, ALPHA_R # $f30==b8 ++#ifndef TRMMKERNEL ++ blbc K, $L88 # if(K&1) ++#else ++ blbc TEMP, $L88 ++#endif ++ ++$L86: ++ addl AO, 2*SIZE, AO # AO+=8mr*1kr*2px*SIZE ++ MAD a1,b1,c01,c01 # C11 real part ++ MAD a1,b2,c02,c02 # C11 imag part ++ ++ addl BO, 2*SIZE, BO ++ MAD a2,b1,c03,c03 # C11 image part ++ MAD a2,b2,c04,c04 # C11 real part ++ ++$L88: # Write back ++ LD alpha_i, ALPHA_I # $f29==b7 ++#ifndef TRMMKERNEL ++ ADD1 c01, c04, c01 ++ ADD2 c03, c02, c02 ++ ++ LD b1, 0 * SIZE(C1) ++ LD b2, 1 * SIZE(C1) ++ ++ FMAD5 c01, alpha_r, b1, b1 ++ FMAD7 c02, alpha_r, b2, b2 ++ FMAD8 c02, alpha_i, b1, b1 ++ FMAD6 c01, alpha_i, b2, b2 ++ ++ ST b1, 0 * SIZE(C1) ++ ST b2, 1 * SIZE(C1) ++ ++#else ++ ++ ADD1 c01, c04, c01 ++ ADD2 c03, c02, c02 ++ ++ FMAD5 c01, alpha_r, $f31, b1 ++ FMAD7 c02, alpha_r, $f31, b2 ++ ++ FMAD8 c02, alpha_i, b1, b1 ++ FMAD6 c01, alpha_i, b2, b2 ++ ++ ST b1, 0 * SIZE(C1) ++ ST b2, 1 * SIZE(C1) ++ ++#if ( defined(LEFT) && defined(TRANSA)) || \ ++ (!defined(LEFT) && !defined(TRANSA)) ++ subl K, KK, TEMP ++#ifdef LEFT ++ subl TEMP, 1, TEMP ++#else ++ subl TEMP, 1, TEMP ++#endif ++ ++ sll TEMP, ZBASE_SHIFT, L ++ sll TEMP, ZBASE_SHIFT, TEMP ++ ++ addl AO, L, AO ++ addl BO, TEMP,BO ++#endif ++ ++#ifdef LEFT ++ addl KK, 1, KK ++#endif ++#endif ++ ++ addl C1, 2*SIZE, C1 ++ ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldl $9, 80($sp) ++ ldl 
$10,88($sp) ++ ldl $11,96($sp) ++ ldl $12,104($sp) ++ ldl $13,112($sp) ++ ldl $14,120($sp) ++ ++ clr $0 ++ ++ ldi $sp, STACKSIZE($sp) ++ ret $31,($26),1 # ++ ++ EPILOGUE +diff --git a/kernel/sw_64/zgemv_n.S b/kernel/sw_64/zgemv_n.S +new file mode 100644 +index 0000000..03d71ee +--- /dev/null ++++ b/kernel/sw_64/zgemv_n.S +@@ -0,0 +1,1040 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $21 ++#define LDA $18 ++ ++#define X $19 ++#define INCX $20 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define Y1 $4 ++#define A1 $5 ++#define A2 $6 ++ ++#define alpha_r $f19 ++#define alpha_i $f20 ++ ++#define alpha1 $f0 ++#define alpha2 $f1 ++#define alpha3 $f10 ++#define alpha4 $f11 ++ ++#define y0 $f12 ++#define y1 $f13 ++#define y2 $f14 ++#define y3 $f15 ++ ++#define y4 $f16 ++#define y5 $f17 ++#define y6 $f18 ++#define y7 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define t0 $f2 ++#define t1 $f3 ++#define t2 $f4 ++#define t3 $f5 ++ ++#if !defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#elif !defined(CONJ) && defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl LDA, 0 + STACKSIZE($sp) ++ ldl X, 8 + STACKSIZE($sp) ++ ldl INCX, 16 + STACKSIZE($sp) ++ ldl Y, 24 + STACKSIZE($sp) ++ ldl INCY, 32 + STACKSIZE($sp) ++ ldl BUFFER, 40 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ sll INCX, ZBASE_SHIFT, INCX ++ cmple N, 0, $1 ++ sll INCY, ZBASE_SHIFT, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ cmpeq INCY, 2 * SIZE, $0 ++ sll LDA, ZBASE_SHIFT,LDA ++ bne $0, $L10 ++ ++ mov BUFFER, Y1 ++ ++ mov Y, BUFFER ++ mov Y1, Y ++ ++ sra M, 2, I ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ ST $f31, 2 * SIZE(Y1) ++ ST $f31, 3 * SIZE(Y1) ++ ST $f31, 4 * SIZE(Y1) ++ ST $f31, 5 * SIZE(Y1) ++ ST $f31, 6 * SIZE(Y1) ++ ST $f31, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ ldi I, -1(I) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 3, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ addl Y1, 2 * SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ sra N, 1, J ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ LD alpha1, 0 * SIZE(X) ++ LD alpha2, 1 * SIZE(X) ++ addl X, INCX, X ++ LD alpha3, 0 * SIZE(X) ++ LD alpha4, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ MUL alpha_r, alpha1, y0 ++ MUL alpha_r, alpha2, y1 ++ MUL alpha_r, alpha3, y2 ++ MUL alpha_r, alpha4, y3 ++ ++ MUL alpha_i, alpha2, t0 ++ mov A, A1 ++ MUL alpha_i, alpha1, t1 ++ addl A, LDA, A2 ++ MUL alpha_i, alpha4, t2 ++ addl A2, LDA, A ++ MUL alpha_i, alpha3, t3 ++ mov Y, Y1 ++ ++#ifndef XCONJ ++ SUB y0, t0, alpha1 ++ ADD y1, t1, alpha2 ++ SUB y2, t2, alpha3 ++ ADD y3, t3, alpha4 ++#else ++ ADD y0, t0, alpha1 ++ SUB y1, t1, alpha2 ++ ADD y2, t2, alpha3 ++ SUB y3, t3, alpha4 ++#endif ++ ++ fillcs 4 * SIZE(X) ++ ++ sra M, 2, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ 
++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD1 y0, t0, $f6 ++ unop ++ MUL alpha3, a4, t0 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD2 y1, t1, $f7 ++ unop ++ MUL alpha3, a5, t1 ++ LD y5, 5 * SIZE(Y1) ++ ++ ADD1 y2, t2, $f8 ++ unop ++ MUL alpha3, a6, t2 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD2 y3, t3, $f9 ++ unop ++ MUL alpha3, a7, t3 ++ LD y7, 7 * SIZE(Y1) ++ ++ ADD1 $f6, t0, y0 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 5 * SIZE(A1) ++ ++ ADD2 $f7, t1, y1 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 4 * SIZE(A1) ++ ++ ADD1 $f8, t2, y2 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 7 * SIZE(A1) ++ ++ ADD2 $f9, t3, y3 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 6 * SIZE(A1) ++ ++ ADD3 y0, t0, $f6 ++ unop ++ MUL alpha4, a5, t0 ++ LD a5, 5 * SIZE(A2) ++ ++ ADD4 y1, t1, $f7 ++ unop ++ MUL alpha4, a4, t1 ++ LD a4, 4 * SIZE(A2) ++ ++ ADD3 y2, t2, $f8 ++ unop ++ MUL alpha4, a7, t2 ++ LD a7, 7 * SIZE(A2) ++ ++ ADD4 y3, t3, $f9 ++ unop ++ MUL alpha4, a6, t3 ++ LD a6, 6 * SIZE(A2) ++ ++ ADD3 $f6, t0, y0 ++ MUL alpha1, a0, t0 ++ ADD4 $f7, t1, y1 ++ MUL alpha1, a1, t1 ++ ++ ADD3 $f8, t2, y2 ++ unop ++ MUL alpha1, a2, t2 ++ unop ++ ++ ADD4 $f9, t3, y3 ++ ldi I, -1(I) ++ MUL alpha1, a3, t3 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD1 y4, t0, $f6 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ++ ADD2 y5, t1, $f7 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ ldi I, -1(I) ++ ++ ADD1 y6, t2, $f8 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop ++ ++ ADD2 y7, t3, $f9 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ unop ++ ++ ADD1 $f6, t0, y4 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 $f7, t1, y5 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD1 $f8, t2, y6 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD2 $f9, t3, y7 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD3 y4, t0, $f6 ++ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++ MUL alpha4, a5, t0 ++ LD a5, 9 * SIZE(A2) ++ ++ ADD4 y5, t1, $f7 ++ unop ++ MUL alpha4, a4, t1 ++ LD a4, 8 * SIZE(A2) ++ ++ ADD3 y6, t2, $f8 ++ unop ++ MUL alpha4, a7, t2 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD4 y7, t3, $f9 ++ unop ++ MUL alpha4, a6, t3 ++ LD a6, 10 * SIZE(A2) ++ ++ ADD3 $f6, t0, y4 ++ unop ++ MUL alpha1, a0, t0 ++ LD y0, 8 * SIZE(Y1) ++ ++ ADD4 $f7, t1, y5 ++ unop ++ MUL alpha1, a1, t1 ++ LD y1, 9 * SIZE(Y1) ++ ++ ADD3 $f8, t2, y6 ++ unop ++ MUL alpha1, a2, t2 ++ LD y2, 10 * SIZE(Y1) ++ ++ ADD4 $f9, t3, y7 ++ unop ++ MUL alpha1, a3, t3 ++ LD y3, 11 * SIZE(Y1) ++ ++ ADD1 y0, t0, $f6 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ ++ ADD2 y1, t1, $f7 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ unop ++ ++ ADD1 y2, t2, $f8 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop ++ ++ ADD2 y3, t3, $f9 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ ldi Y1, 8 * SIZE(Y1) ++ ++ ADD1 $f6, t0, y0 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 13 * SIZE(A1) ++ ++ ADD2 $f7, t1, y1 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 12 * SIZE(A1) ++ ++ ADD1 $f8, t2, y2 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 15 * SIZE(A1) ++ ++ ADD2 $f9, t3, y3 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 14 * SIZE(A1) ++ ++ ADD3 y0, t0, $f6 ++ unop ++ MUL alpha4, a5, t0 ++ LD a5, 13 * SIZE(A2) ++ ++ ADD4 y1, t1, $f7 ++ unop ++ MUL alpha4, a4, t1 ++ LD a4, 12 * SIZE(A2) ++ ++ ADD3 y2, t2, $f8 ++ unop ++ MUL alpha4, a7, t2 ++ LD a7, 15 * SIZE(A2) ++ ++ ADD4 y3, t3, $f9 ++ unop 
++ MUL alpha4, a6, t3 ++ LD a6, 14 * SIZE(A2) ++ ++ ADD3 $f6, t0, y0 ++ unop ++ MUL alpha1, a0, t0 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD4 $f7, t1, y1 ++ ldi A2, 8 * SIZE(A2) ++ MUL alpha1, a1, t1 ++ LD y5, 5 * SIZE(Y1) ++ ++ ADD3 $f8, t2, y2 ++ ldi A1, 8 * SIZE(A1) ++ MUL alpha1, a2, t2 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD4 $f9, t3, y3 ++ MUL alpha1, a3, t3 ++ LD y7, 7 * SIZE(Y1) ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD1 y4, t0, $f6 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ unop ++ ++ ADD2 y5, t1, $f7 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ unop ++ ++ ADD1 y6, t2, $f8 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop ++ ++ ADD2 y7, t3, $f9 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ unop ++ ++ ADD1 $f6, t0, y4 ++ MUL alpha2, a1, t0 ++ ADD2 $f7, t1, y5 ++ MUL alpha2, a0, t1 ++ ++ ADD1 $f8, t2, y6 ++ MUL alpha2, a3, t2 ++ ADD2 $f9, t3, y7 ++ MUL alpha2, a2, t3 ++ ++ ADD3 y4, t0, $f6 ++ MUL alpha4, a5, t0 ++ ADD4 y5, t1, $f7 ++ MUL alpha4, a4, t1 ++ ++ ADD3 y6, t2, $f8 ++ MUL alpha4, a7, t2 ++ ADD4 y7, t3, $f9 ++ MUL alpha4, a6, t3 ++ ++ ADD3 $f6, t0, y4 ++ ADD4 $f7, t1, y5 ++ ADD3 $f8, t2, y6 ++ ADD4 $f9, t3, y7 ++ ++ ST y4, 4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, 5 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) ++ ++ ST y6, 6 * SIZE(Y1) ++ unop ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L15: ++ and M, 2, I ++ ble I, $L17 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD1 y0, t0, $f6 ++ MUL alpha3, a4, t0 ++ ADD2 y1, t1, $f7 ++ MUL alpha3, a5, t1 ++ ADD1 y2, t2, $f8 ++ MUL alpha3, a6, t2 ++ ADD2 y3, t3, $f9 ++ MUL alpha3, a7, t3 ++ ++ ADD1 $f6, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 $f7, t1, y1 ++ MUL alpha2, a0, t1 ++ ++ ADD1 $f8, t2, y2 ++ MUL alpha2, a3, t2 ++ ADD2 $f9, t3, y3 ++ MUL alpha2, a2, t3 ++ ++ ADD3 y0, t0, $f6 ++ MUL alpha4, a5, t0 ++ ADD4 y1, t1, $f7 ++ MUL alpha4, a4, t1 ++ ++ ADD3 y2, t2, $f8 ++ MUL alpha4, a7, t2 ++ ADD4 y3, t3, $f9 ++ MUL alpha4, a6, t3 ++ ++ ADD3 $f6, t0, y0 ++ ADD4 $f7, t1, y1 ++ ADD3 $f8, t2, y2 ++ ADD4 $f9, t3, y3 ++ ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ++ ST y2, 2 * SIZE(Y1) ++ unop ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 ++ ++$L17: ++ blbc M, $L18 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ ++ MUL alpha1, a0, t0 ++ MUL alpha1, a1, t1 ++ ++ ADD1 y0, t0, $f6 ++ MUL alpha3, a2, t0 ++ ADD2 y1, t1, $f7 ++ MUL alpha3, a3, t1 ++ ++ ADD1 $f6, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 $f7, t1, y1 ++ MUL alpha2, a0, t1 ++ ++ ADD3 y0, t0, $f6 ++ MUL alpha4, a3, t0 ++ ADD4 y1, t1, $f7 ++ MUL alpha4, a2, t1 ++ ++ ADD3 $f6, t0, y0 ++ ADD4 $f7, t1, y1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ .align 4 ++ ++$L18: ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ blbc N, $L990 ++ ++ LD alpha1, 0 * SIZE(X) ++ LD alpha2, 1 * SIZE(X) ++ ++ MUL alpha_r, alpha1, y0 ++ MUL alpha_r, alpha2, y1 ++ ++ MUL alpha_i, alpha2, t0 ++ mov A, A1 ++ MUL alpha_i, alpha1, t1 ++ mov Y, Y1 ++ ++#ifndef XCONJ ++ SUB y0, t0, alpha1 ++ ADD y1, t1, alpha2 ++#else ++ ADD y0, t0, alpha1 ++ SUB y1, t1, alpha2 ++#endif ++ ++ sra M, 2, I ++ ble I, 
$L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ MUL alpha1, a0, t0 ++ LD a4, 4 * SIZE(A1) ++ MUL alpha1, a1, t1 ++ LD a5, 5 * SIZE(A1) ++ MUL alpha1, a2, t2 ++ LD a6, 6 * SIZE(A1) ++ MUL alpha1, a3, t3 ++ LD a7, 7 * SIZE(A1) ++ ++ ADD1 y0, t0, $f6 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 y1, t1, $f7 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD1 y2, t2, $f8 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD2 y3, t3, $f9 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD3 $f6, t0, y0 ++ unop ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, t0 ++ ++ ADD4 $f7, t1, y1 ++ unop ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, t1 ++ ++ ADD3 $f8, t2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, t2 ++ ldi I, -1(I) ++ ++ ADD4 $f9, t3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, t3 ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD1 y4, t0, $f6 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a5, t0 ++ LD a5, 13 * SIZE(A1) ++ ++ ADD2 y5, t1, $f7 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a4, t1 ++ LD a4, 12 * SIZE(A1) ++ ++ ADD1 y6, t2, $f8 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a7, t2 ++ LD a7, 15 * SIZE(A1) ++ ++ ADD2 y7, t3, $f9 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a6, t3 ++ LD a6, 14 * SIZE(A1) ++ ++ ADD3 $f6, t0, y4 ++ LD y0, 8 * SIZE(Y1) ++ MUL alpha1, a0, t0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ++ ADD4 $f7, t1, y5 ++ LD y1, 9 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ ldi I, -1(I) ++ ++ ADD3 $f8, t2, y6 ++ LD y2, 10 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ unop ++ ++ ADD4 $f9, t3, y7 ++ LD y3, 11 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ unop ++ ++ ADD1 y0, t0, $f6 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha2, a1, t0 ++ LD a1, 17 * SIZE(A1) ++ ++ ADD2 y1, t1, $f7 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha2, a0, t1 ++ LD a0, 16 * SIZE(A1) ++ ++ ADD1 y2, t2, $f8 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha2, a3, t2 ++ LD a3, 19 * SIZE(A1) ++ ++ ADD2 y3, t3, $f9 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha2, a2, t3 ++ LD a2, 18 * SIZE(A1) ++ ++ ADD3 $f6, t0, y0 ++ LD y4, 12 * SIZE(Y1) ++ MUL alpha1, a4, t0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++ ++ ADD4 $f7, t1, y1 ++ LD y5, 13 * SIZE(Y1) ++ MUL alpha1, a5, t1 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD3 $f8, t2, y2 ++ LD y6, 14 * SIZE(Y1) ++ MUL alpha1, a6, t2 ++ ldi Y1, 8 * SIZE(Y1) ++ ++ ADD4 $f9, t3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, t3 ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD1 y4, t0, $f6 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a5, t0 ++ unop ++ ++ ADD2 y5, t1, $f7 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a4, t1 ++ unop ++ ++ ADD1 y6, t2, $f8 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a7, t2 ++ unop ++ ++ ADD2 y7, t3, $f9 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a6, t3 ++ unop ++ ++ ADD3 $f6, t0, y4 ++ ADD4 $f7, t1, y5 ++ ADD3 $f8, t2, y6 ++ ADD4 $f9, t3, y7 ++ ++ ST y4, 4 * SIZE(Y1) ++ unop ++ ST y5, 5 * SIZE(Y1) ++ unop ++ ++ ST y6, 6 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L25: ++ and M, 2, I ++ ble I, $L27 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD1 y0, t0, $f6 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, $f7 ++ MUL alpha2, a0, t1 ++ ADD1 y2, t2, $f8 ++ MUL alpha2, a3, t2 ++ ADD2 y3, t3, $f9 ++ MUL alpha2, a2, t3 
++ ++ ADD3 $f6, t0, y0 ++ ADD4 $f7, t1, y1 ++ ADD3 $f8, t2, y2 ++ ADD4 $f9, t3, y3 ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ ++ ST y2, 2 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 ++ ++$L27: ++ blbc M, $L990 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ ++ ADD1 y0, t0, $f6 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, $f7 ++ MUL alpha2, a0, t1 ++ ++ ADD3 $f6, t0, y0 ++ ADD4 $f7, t1, y1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ .align 4 ++ ++$L990: ++ cmpeq INCY, 2 * SIZE, $0 ++ bne $0, $L999 ++ ++ mov BUFFER, Y1 ++ ++ sra M, 2, I ++ ble I, $L995 ++ .align 4 ++ ++$L992: ++ LD a0, 0 * SIZE(BUFFER) ++ LD a1, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a2, 0 * SIZE(BUFFER) ++ LD a3, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ LD y2, 2 * SIZE(Y) ++ LD y3, 3 * SIZE(Y) ++ ++ LD a4, 0 * SIZE(BUFFER) ++ LD a5, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a6, 0 * SIZE(BUFFER) ++ LD a7, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y4, 4 * SIZE(Y) ++ LD y5, 5 * SIZE(Y) ++ LD y6, 6 * SIZE(Y) ++ LD y7, 7 * SIZE(Y) ++ ++ ADD a0, y0, $f6 ++ ADD a1, y1, $f7 ++ ADD a2, y2, $f8 ++ ADD a3, y3, $f9 ++ ++ fmov $f6, a0 ++ fmov $f7, a1 ++ fmov $f8, a2 ++ fmov $f9, a3 ++ ++ ST a0, 0 * SIZE(Y1) ++ ADD a4, y4, $f6 ++ ST a1, 1 * SIZE(Y1) ++ ADD a5, y5, $f7 ++ addl Y1, INCY, Y1 ++ ++ ST a2, 0 * SIZE(Y1) ++ ADD a6, y6, $f8 ++ ST a3, 1 * SIZE(Y1) ++ ADD a7, y7, $f9 ++ addl Y1, INCY, Y1 ++ ++ fmov $f6, a4 ++ fmov $f7, a5 ++ fmov $f8, a6 ++ fmov $f9, a7 ++ ++ ST a4, 0 * SIZE(Y1) ++ ST a5, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a6, 0 * SIZE(Y1) ++ ST a7, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ ldi Y, 8 * SIZE(Y) ++ bgt I, $L992 ++ .align 4 ++ ++$L995: ++ and M, 3, I ++ ble I, $L999 ++ .align 4 ++ ++$L996: ++ LD a0, 0 * SIZE(BUFFER) ++ LD a1, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ ldi Y, 2 * SIZE(Y) ++ ++ ADD a0, y0, $f6 ++ ADD a1, y1, $f7 ++ ++ fmov $f6, a0 ++ fmov $f7, a1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L996 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zgemv_n.S.bak b/kernel/sw_64/zgemv_n.S.bak +new file mode 100644 +index 0000000..3dd482e +--- /dev/null ++++ b/kernel/sw_64/zgemv_n.S.bak +@@ -0,0 +1,1027 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. 
*/ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $21 ++#define LDA $18 ++ ++#define X $19 ++#define INCX $20 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define Y1 $4 ++#define A1 $5 ++#define A2 $6 ++ ++#define alpha_r $f19 ++#define alpha_i $f20 ++ ++#define alpha1 $f0 ++#define alpha2 $f1 ++#define alpha3 $f10 ++#define alpha4 $f11 ++ ++#define y0 $f12 ++#define y1 $f13 ++#define y2 $f14 ++#define y3 $f15 ++ ++#define y4 $f16 ++#define y5 $f17 ++#define y6 $f18 ++#define y7 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define t0 $f2 ++#define t1 $f3 ++#define t2 $f4 ++#define t3 $f5 ++ ++#if !defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#elif !defined(CONJ) && defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl LDA, 0 + STACKSIZE($sp) ++ ldl X, 8 + STACKSIZE($sp) ++ ldl INCX, 16 + STACKSIZE($sp) ++ ldl Y, 24 + STACKSIZE($sp) ++ ldl INCY, 32 + STACKSIZE($sp) ++ ldl BUFFER, 40 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ sll INCX, ZBASE_SHIFT, INCX ++ cmple N, 0, $1 ++ sll INCY, ZBASE_SHIFT, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ cmpeq INCY, 2 * SIZE, $0 ++ sll LDA, ZBASE_SHIFT,LDA ++ bne $0, $L10 ++ ++ mov BUFFER, Y1 ++ ++ mov Y, BUFFER ++ mov Y1, Y ++ ++ sra M, 2, I ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ ST $f31, 2 * SIZE(Y1) ++ ST $f31, 3 * SIZE(Y1) ++ ST $f31, 4 * SIZE(Y1) ++ ST $f31, 5 * SIZE(Y1) ++ ST $f31, 6 * SIZE(Y1) ++ ST $f31, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ ldi I, -1(I) ++ bgt I, 
$L02 ++ .align 4 ++ ++$L05: ++ and M, 3, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ ST $f31, 0 * SIZE(Y1) ++ ST $f31, 1 * SIZE(Y1) ++ addl Y1, 2 * SIZE, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ sra N, 1, J ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ LD alpha1, 0 * SIZE(X) ++ LD alpha2, 1 * SIZE(X) ++ addl X, INCX, X ++ LD alpha3, 0 * SIZE(X) ++ LD alpha4, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ MUL alpha_r, alpha1, y0 ++ MUL alpha_r, alpha2, y1 ++ MUL alpha_r, alpha3, y2 ++ MUL alpha_r, alpha4, y3 ++ ++ MUL alpha_i, alpha2, t0 ++ mov A, A1 ++ MUL alpha_i, alpha1, t1 ++ addl A, LDA, A2 ++ MUL alpha_i, alpha4, t2 ++ addl A2, LDA, A ++ MUL alpha_i, alpha3, t3 ++ mov Y, Y1 ++ ++#ifndef XCONJ ++ SUB y0, t0, alpha1 ++ ADD y1, t1, alpha2 ++ SUB y2, t2, alpha3 ++ ADD y3, t3, alpha4 ++#else ++ ADD y0, t0, alpha1 ++ SUB y1, t1, alpha2 ++ ADD y2, t2, alpha3 ++ SUB y3, t3, alpha4 ++#endif ++ ++ fillcs 4 * SIZE(X) ++ ++ sra M, 2, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ unop ++ MUL alpha3, a4, t0 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD2 y1, t1, y1 ++ unop ++ MUL alpha3, a5, t1 ++ LD y5, 5 * SIZE(Y1) ++ ++ ADD1 y2, t2, y2 ++ unop ++ MUL alpha3, a6, t2 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD2 y3, t3, y3 ++ unop ++ MUL alpha3, a7, t3 ++ LD y7, 7 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 5 * SIZE(A1) ++ ++ ADD2 y1, t1, y1 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 4 * SIZE(A1) ++ ++ ADD1 y2, t2, y2 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 7 * SIZE(A1) ++ ++ ADD2 y3, t3, y3 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 6 * SIZE(A1) ++ ++ ADD3 y0, t0, y0 ++ unop ++ MUL alpha4, a5, t0 ++ LD a5, 5 * SIZE(A2) ++ ++ ADD4 y1, t1, y1 ++ unop ++ MUL alpha4, a4, t1 ++ LD a4, 4 * SIZE(A2) ++ ++ ADD3 y2, t2, y2 ++ unop ++ MUL alpha4, a7, t2 ++ LD a7, 7 * SIZE(A2) ++ ++ ADD4 y3, t3, y3 ++ unop ++ MUL alpha4, a6, t3 ++ LD a6, 6 * SIZE(A2) ++ ++ ADD3 y0, t0, y0 ++ MUL alpha1, a0, t0 ++ ADD4 y1, t1, y1 ++ MUL alpha1, a1, t1 ++ ++ ADD3 y2, t2, y2 ++ unop ++ MUL alpha1, a2, t2 ++ unop ++ ++ ADD4 y3, t3, y3 ++ ldi I, -1(I) ++ MUL alpha1, a3, t3 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ ldi I, -1(I) ++ ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop ++ ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ unop ++ ++ ADD1 y4, t0, y4 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 y5, t1, y5 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD1 y6, t2, y6 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD2 y7, t3, y7 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD3 y4, t0, y4 ++ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++ MUL alpha4, a5, t0 ++ LD a5, 9 * SIZE(A2) ++ ++ ADD4 y5, t1, y5 ++ unop ++ MUL alpha4, a4, t1 ++ LD a4, 8 * SIZE(A2) ++ ++ ADD3 y6, t2, y6 ++ unop ++ MUL alpha4, a7, t2 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD4 y7, t3, y7 ++ unop ++ MUL alpha4, a6, t3 ++ LD a6, 10 * SIZE(A2) ++ ++ ADD3 y4, t0, y4 ++ unop ++ MUL alpha1, a0, t0 ++ LD y0, 8 * SIZE(Y1) ++ ++ ADD4 y5, t1, y5 
++ unop ++ MUL alpha1, a1, t1 ++ LD y1, 9 * SIZE(Y1) ++ ++ ADD3 y6, t2, y6 ++ unop ++ MUL alpha1, a2, t2 ++ LD y2, 10 * SIZE(Y1) ++ ++ ADD4 y7, t3, y7 ++ unop ++ MUL alpha1, a3, t3 ++ LD y3, 11 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ ++ ADD2 y1, t1, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ unop ++ ++ ADD1 y2, t2, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop ++ ++ ADD2 y3, t3, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ ldi Y1, 8 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 13 * SIZE(A1) ++ ++ ADD2 y1, t1, y1 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 12 * SIZE(A1) ++ ++ ADD1 y2, t2, y2 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 15 * SIZE(A1) ++ ++ ADD2 y3, t3, y3 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 14 * SIZE(A1) ++ ++ ADD3 y0, t0, y0 ++ unop ++ MUL alpha4, a5, t0 ++ LD a5, 13 * SIZE(A2) ++ ++ ADD4 y1, t1, y1 ++ unop ++ MUL alpha4, a4, t1 ++ LD a4, 12 * SIZE(A2) ++ ++ ADD3 y2, t2, y2 ++ unop ++ MUL alpha4, a7, t2 ++ LD a7, 15 * SIZE(A2) ++ ++ ADD4 y3, t3, y3 ++ unop ++ MUL alpha4, a6, t3 ++ LD a6, 14 * SIZE(A2) ++ ++ ADD3 y0, t0, y0 ++ unop ++ MUL alpha1, a0, t0 ++ LD y4, 4 * SIZE(Y1) ++ ++ ADD4 y1, t1, y1 ++ ldi A2, 8 * SIZE(A2) ++ MUL alpha1, a1, t1 ++ LD y5, 5 * SIZE(Y1) ++ ++ ADD3 y2, t2, y2 ++ ldi A1, 8 * SIZE(A1) ++ MUL alpha1, a2, t2 ++ LD y6, 6 * SIZE(Y1) ++ ++ ADD4 y3, t3, y3 ++ MUL alpha1, a3, t3 ++ LD y7, 7 * SIZE(Y1) ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha3, a4, t0 ++ unop ++ ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha3, a5, t1 ++ unop ++ ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha3, a6, t2 ++ unop ++ ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha3, a7, t3 ++ unop ++ ++ ADD1 y4, t0, y4 ++ MUL alpha2, a1, t0 ++ ADD2 y5, t1, y5 ++ MUL alpha2, a0, t1 ++ ++ ADD1 y6, t2, y6 ++ MUL alpha2, a3, t2 ++ ADD2 y7, t3, y7 ++ MUL alpha2, a2, t3 ++ ++ ADD3 y4, t0, y4 ++ MUL alpha4, a5, t0 ++ ADD4 y5, t1, y5 ++ MUL alpha4, a4, t1 ++ ++ ADD3 y6, t2, y6 ++ MUL alpha4, a7, t2 ++ ADD4 y7, t3, y7 ++ MUL alpha4, a6, t3 ++ ++ ADD3 y4, t0, y4 ++ ADD4 y5, t1, y5 ++ ADD3 y6, t2, y6 ++ ADD4 y7, t3, y7 ++ ++ ST y4, 4 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y5, 5 * SIZE(Y1) ++ ldi A2, 8 * SIZE(A2) ++ ++ ST y6, 6 * SIZE(Y1) ++ unop ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L15: ++ and M, 2, I ++ ble I, $L17 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD a4, 0 * SIZE(A2) ++ LD a5, 1 * SIZE(A2) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ MUL alpha3, a4, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha3, a5, t1 ++ ADD1 y2, t2, y2 ++ MUL alpha3, a6, t2 ++ ADD2 y3, t3, y3 ++ MUL alpha3, a7, t3 ++ ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 ++ ++ ADD1 y2, t2, y2 ++ MUL alpha2, a3, t2 ++ ADD2 y3, t3, y3 ++ MUL alpha2, a2, t3 ++ ++ ADD3 y0, t0, y0 ++ MUL alpha4, a5, t0 ++ ADD4 y1, t1, y1 ++ MUL alpha4, a4, t1 ++ ++ ADD3 y2, t2, y2 ++ MUL alpha4, a7, t2 ++ ADD4 y3, t3, y3 ++ MUL alpha4, a6, t3 ++ ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ADD3 y2, t2, y2 ++ ADD4 y3, t3, y3 ++ ++ ST y0, 0 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y1, 1 * SIZE(Y1) ++ ldi A2, 4 * SIZE(A2) ++ ++ ST y2, 2 * SIZE(Y1) ++ 
unop ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 ++ ++$L17: ++ blbc M, $L18 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ ++ MUL alpha1, a0, t0 ++ MUL alpha1, a1, t1 ++ ++ ADD1 y0, t0, y0 ++ MUL alpha3, a2, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha3, a3, t1 ++ ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 ++ ++ ADD3 y0, t0, y0 ++ MUL alpha4, a3, t0 ++ ADD4 y1, t1, y1 ++ MUL alpha4, a2, t1 ++ ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ .align 4 ++ ++$L18: ++ ldi J, -1(J) ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ blbc N, $L990 ++ ++ LD alpha1, 0 * SIZE(X) ++ LD alpha2, 1 * SIZE(X) ++ ++ MUL alpha_r, alpha1, y0 ++ MUL alpha_r, alpha2, y1 ++ ++ MUL alpha_i, alpha2, t0 ++ mov A, A1 ++ MUL alpha_i, alpha1, t1 ++ mov Y, Y1 ++ ++#ifndef XCONJ ++ SUB y0, t0, alpha1 ++ ADD y1, t1, alpha2 ++#else ++ ADD y0, t0, alpha1 ++ SUB y1, t1, alpha2 ++#endif ++ ++ sra M, 2, I ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ LD y0, 0 * SIZE(Y1) ++ LD y1, 1 * SIZE(Y1) ++ LD y2, 2 * SIZE(Y1) ++ LD y3, 3 * SIZE(Y1) ++ ++ MUL alpha1, a0, t0 ++ LD a4, 4 * SIZE(A1) ++ MUL alpha1, a1, t1 ++ LD a5, 5 * SIZE(A1) ++ MUL alpha1, a2, t2 ++ LD a6, 6 * SIZE(A1) ++ MUL alpha1, a3, t3 ++ LD a7, 7 * SIZE(A1) ++ ++ ADD1 y0, t0, y0 ++ unop ++ MUL alpha2, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 y1, t1, y1 ++ unop ++ MUL alpha2, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD1 y2, t2, y2 ++ unop ++ MUL alpha2, a3, t2 ++ LD a3, 11 * SIZE(A1) ++ ++ ADD2 y3, t3, y3 ++ unop ++ MUL alpha2, a2, t3 ++ LD a2, 10 * SIZE(A1) ++ ++ ADD3 y0, t0, y0 ++ unop ++ LD y4, 4 * SIZE(Y1) ++ MUL alpha1, a4, t0 ++ ++ ADD4 y1, t1, y1 ++ unop ++ LD y5, 5 * SIZE(Y1) ++ MUL alpha1, a5, t1 ++ ++ ADD3 y2, t2, y2 ++ LD y6, 6 * SIZE(Y1) ++ MUL alpha1, a6, t2 ++ ldi I, -1(I) ++ ++ ADD4 y3, t3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, t3 ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a5, t0 ++ LD a5, 13 * SIZE(A1) ++ ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a4, t1 ++ LD a4, 12 * SIZE(A1) ++ ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a7, t2 ++ LD a7, 15 * SIZE(A1) ++ ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a6, t3 ++ LD a6, 14 * SIZE(A1) ++ ++ ADD3 y4, t0, y4 ++ LD y0, 8 * SIZE(Y1) ++ MUL alpha1, a0, t0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ ++ ADD4 y5, t1, y5 ++ LD y1, 9 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ ldi I, -1(I) ++ ++ ADD3 y6, t2, y6 ++ LD y2, 10 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ unop ++ ++ ADD4 y7, t3, y7 ++ LD y3, 11 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ unop ++ ++ ADD1 y0, t0, y0 ++ ST y4, 4 * SIZE(Y1) ++ MUL alpha2, a1, t0 ++ LD a1, 17 * SIZE(A1) ++ ++ ADD2 y1, t1, y1 ++ ST y5, 5 * SIZE(Y1) ++ MUL alpha2, a0, t1 ++ LD a0, 16 * SIZE(A1) ++ ++ ADD1 y2, t2, y2 ++ ST y6, 6 * SIZE(Y1) ++ MUL alpha2, a3, t2 ++ LD a3, 19 * SIZE(A1) ++ ++ ADD2 y3, t3, y3 ++ ST y7, 7 * SIZE(Y1) ++ MUL alpha2, a2, t3 ++ LD a2, 18 * SIZE(A1) ++ ++ ADD3 y0, t0, y0 ++ LD y4, 12 * SIZE(Y1) ++ MUL alpha1, a4, t0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(Y1) ++ ++ ADD4 y1, t1, y1 ++ LD y5, 13 * SIZE(Y1) ++ MUL alpha1, a5, t1 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD3 y2, t2, y2 ++ LD y6, 14 * SIZE(Y1) ++ MUL alpha1, a6, t2 ++ ldi Y1, 8 * SIZE(Y1) ++ ++ ADD4 y3, t3, y3 ++ LD y7, 7 * SIZE(Y1) ++ MUL alpha1, a7, t3 ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ 
ADD1 y4, t0, y4 ++ ST y0, 0 * SIZE(Y1) ++ MUL alpha2, a5, t0 ++ unop ++ ++ ADD2 y5, t1, y5 ++ ST y1, 1 * SIZE(Y1) ++ MUL alpha2, a4, t1 ++ unop ++ ++ ADD1 y6, t2, y6 ++ ST y2, 2 * SIZE(Y1) ++ MUL alpha2, a7, t2 ++ unop ++ ++ ADD2 y7, t3, y7 ++ ST y3, 3 * SIZE(Y1) ++ MUL alpha2, a6, t3 ++ unop ++ ++ ADD3 y4, t0, y4 ++ ADD4 y5, t1, y5 ++ ADD3 y6, t2, y6 ++ ADD4 y7, t3, y7 ++ ++ ST y4, 4 * SIZE(Y1) ++ unop ++ ST y5, 5 * SIZE(Y1) ++ unop ++ ++ ST y6, 6 * SIZE(Y1) ++ ldi A1, 8 * SIZE(A1) ++ ST y7, 7 * SIZE(Y1) ++ ldi Y1, 8 * SIZE(Y1) ++ .align 4 ++ ++$L25: ++ and M, 2, I ++ ble I, $L27 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 2 * SIZE(A1) ++ LD a3, 3 * SIZE(A1) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ MUL alpha1, a2, t2 ++ LD y2, 2 * SIZE(Y1) ++ MUL alpha1, a3, t3 ++ LD y3, 3 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 ++ ADD1 y2, t2, y2 ++ MUL alpha2, a3, t2 ++ ADD2 y3, t3, y3 ++ MUL alpha2, a2, t3 ++ ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ADD3 y2, t2, y2 ++ ADD4 y3, t3, y3 ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ ++ ST y2, 2 * SIZE(Y1) ++ ldi A1, 4 * SIZE(A1) ++ ST y3, 3 * SIZE(Y1) ++ ldi Y1, 4 * SIZE(Y1) ++ .align 4 ++ ++$L27: ++ blbc M, $L990 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ ++ MUL alpha1, a0, t0 ++ LD y0, 0 * SIZE(Y1) ++ MUL alpha1, a1, t1 ++ LD y1, 1 * SIZE(Y1) ++ ++ ADD1 y0, t0, y0 ++ MUL alpha2, a1, t0 ++ ADD2 y1, t1, y1 ++ MUL alpha2, a0, t1 ++ ++ ADD3 y0, t0, y0 ++ ADD4 y1, t1, y1 ++ ++ ST y0, 0 * SIZE(Y1) ++ ST y1, 1 * SIZE(Y1) ++ .align 4 ++ ++$L990: ++ cmpeq INCY, 2 * SIZE, $0 ++ bne $0, $L999 ++ ++ mov BUFFER, Y1 ++ ++ sra M, 2, I ++ ble I, $L995 ++ .align 4 ++ ++$L992: ++ LD a0, 0 * SIZE(BUFFER) ++ LD a1, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a2, 0 * SIZE(BUFFER) ++ LD a3, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ LD y2, 2 * SIZE(Y) ++ LD y3, 3 * SIZE(Y) ++ ++ LD a4, 0 * SIZE(BUFFER) ++ LD a5, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ LD a6, 0 * SIZE(BUFFER) ++ LD a7, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y4, 4 * SIZE(Y) ++ LD y5, 5 * SIZE(Y) ++ LD y6, 6 * SIZE(Y) ++ LD y7, 7 * SIZE(Y) ++ ++ ADD a0, y0, a0 ++ ADD a1, y1, a1 ++ ADD a2, y2, a2 ++ ADD a3, y3, a3 ++ ++ ST a0, 0 * SIZE(Y1) ++ ADD a4, y4, a4 ++ ST a1, 1 * SIZE(Y1) ++ ADD a5, y5, a5 ++ addl Y1, INCY, Y1 ++ ++ ST a2, 0 * SIZE(Y1) ++ ADD a6, y6, a6 ++ ST a3, 1 * SIZE(Y1) ++ ADD a7, y7, a7 ++ addl Y1, INCY, Y1 ++ ++ ST a4, 0 * SIZE(Y1) ++ ST a5, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ST a6, 0 * SIZE(Y1) ++ ST a7, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ ldi Y, 8 * SIZE(Y) ++ bgt I, $L992 ++ .align 4 ++ ++$L995: ++ and M, 3, I ++ ble I, $L999 ++ .align 4 ++ ++$L996: ++ LD a0, 0 * SIZE(BUFFER) ++ LD a1, 1 * SIZE(BUFFER) ++ addl BUFFER, INCY, BUFFER ++ ++ LD y0, 0 * SIZE(Y) ++ LD y1, 1 * SIZE(Y) ++ ldi Y, 2 * SIZE(Y) ++ ++ ADD a0, y0, a0 ++ ADD a1, y1, a1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ldi I, -1(I) ++ bgt I, $L996 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zgemv_t.S b/kernel/sw_64/zgemv_t.S +new file mode 100644 +index 0000000..bf31cb4 +--- /dev/null ++++ b/kernel/sw_64/zgemv_t.S +@@ -0,0 +1,1047 @@ 
++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $21 ++#define LDA $18 ++ ++#define X $19 ++#define INCX $20 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define X1 $3 ++#define Y1 $4 ++#define A1 $5 ++#define A2 $6 ++ ++#define alpha_r $f19 ++#define alpha_i $f20 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++#if !defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif !defined(CONJ) && defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#elif defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl LDA, 0 + STACKSIZE($sp) ++ ldl X, 8 + STACKSIZE($sp) ++ ldl INCX, 16 + STACKSIZE($sp) ++ ldl Y, 24 + STACKSIZE($sp) ++ ldl INCY, 32 + STACKSIZE($sp) ++ ldl BUFFER, 40 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ sll INCX, ZBASE_SHIFT, INCX ++ cmple N, 0, $1 ++ sll INCY, ZBASE_SHIFT, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ cmpeq INCX, 2 * SIZE, $0 ++ mov X, X1 ++ sll LDA, ZBASE_SHIFT,LDA ++ bne $0, $L10 ++ ++ sra M, 2, I ++ mov BUFFER, Y1 ++ mov BUFFER, X ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ ldi I, -1(I) ++ ++ LD a0, 0 * SIZE(X1) ++ LD a1, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a2, 0 * SIZE(X1) ++ LD a3, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ST a2, 2 * SIZE(Y1) ++ ST a3, 3 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(X1) ++ LD a5, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a6, 0 * SIZE(X1) ++ LD a7, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a4, 4 * SIZE(Y1) ++ ST a5, 5 * SIZE(Y1) ++ ST a6, 6 * SIZE(Y1) ++ ST a7, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 3, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ LD a0, 0 * SIZE(X1) ++ LD a1, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ mov Y, Y1 ++ fclr t0 ++ unop ++ fclr t1 ++ ++ sra N, 1, J ++ fclr t2 ++ fclr t3 ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ mov A, A1 ++ fclr s0 ++ addl A, LDA, A2 ++ fclr s1 ++ ++ addl A2, LDA, A ++ unop ++ mov X, X1 ++ fillcs 3 * SIZE(Y) ++ ++ sra M, 2, I ++ fclr s2 ++ fclr s3 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 3 * SIZE(A1) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 4 * 
SIZE(A1) ++ LD a9, 5 * SIZE(A1) ++ LD a10, 4 * SIZE(A2) ++ LD a11, 5 * SIZE(A2) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 7 * SIZE(A1) ++ LD a14, 6 * SIZE(A2) ++ LD a15, 7 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ unop ++ ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x0, a2, t2 ++ unop ++ ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x1, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x1, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x1, a3, t2 ++ LD a3, 9 * SIZE(A2) ++ ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x1, a2, t3 ++ LD a2, 8 * SIZE(A2) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x2, a4, t0 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x2, a5, t1 ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ MUL x2, a6, t2 ++ ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x2, a7, t3 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x3, a5, t0 ++ LD a5, 11 * SIZE(A1) ++ ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x3, a4, t1 ++ LD a4, 10 * SIZE(A1) ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x3, a7, t2 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x3, a6, t3 ++ LD a6, 10 * SIZE(A2) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x0, a8, t0 ++ LD x3, 7 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x0, a9, t1 ++ unop ++ ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ ldi I, -1(I) ++ MUL x0, a10, t2 ++ unop ++ ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x0, a11, t3 ++ LD x0, 8 * SIZE(X1) ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x1, a9, t0 ++ LD a9, 13 * SIZE(A1) ++ ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x1, a8, t1 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a11, t2 ++ LD a11, 13 * SIZE(A2) ++ ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x1, a10, t3 ++ LD a10, 12 * SIZE(A2) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x2, a12, t0 ++ LD x1, 9 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ MUL x2, a13, t1 ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x2, a14, t2 ++ unop ++ ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x2, a15, t3 ++ LD x2, 10 * SIZE(X1) ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x3, a13, t0 ++ LD a13, 7 * SIZE(A1) ++ ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a12, t1 ++ LD a12, 6 * SIZE(A1) ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x3, a15, t2 ++ LD a15, 7 * SIZE(A2) ++ ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ MUL x3, a14, t3 ++ LD a14, 6 * SIZE(A2) ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x0, a1, t1 ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ MUL x0, a2, t2 ++ ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s0, t0, $f30 ++ 
fmov $f30, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x1, a0, t1 ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x1, a3, t2 ++ unop ++ ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a2, t3 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ MUL x2, a4, t0 ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x2, a5, t1 ++ ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x2, a6, t2 ++ unop ++ ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ ldi A2, 8 * SIZE(A2) ++ MUL x2, a7, t3 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ MUL x3, a5, t0 ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x3, a4, t1 ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x3, a7, t2 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x3, a6, t3 ++ LD x3, -1 * SIZE(X1) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ MUL x0, a8, t0 ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x0, a9, t1 ++ ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ MUL x0, a10, t2 ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ MUL x0, a11, t3 ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ MUL x1, a9, t0 ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x1, a8, t1 ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ MUL x1, a11, t2 ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ MUL x1, a10, t3 ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ MUL x2, a12, t0 ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x2, a13, t1 ++ ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ MUL x2, a14, t2 ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ MUL x2, a15, t3 ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ MUL x3, a13, t0 ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x3, a12, t1 ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ MUL x3, a15, t2 ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ MUL x3, a14, t3 ++ .align 4 ++ ++$L15: ++ and M, 3, I ++ ble I, $L18 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L17 ++ .align 4 ++ ++$L16: ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ ldi I, -1(I) ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x0, a1, t1 ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ MUL x0, a2, t2 ++ ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 2 * SIZE(X1) ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ ldi A2, 2 * SIZE(A2) ++ MUL x1, a1, t0 ++ LD a1, 3 * SIZE(A1) ++ ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ ldi X1, 2 * SIZE(X1) ++ MUL x1, a0, t1 ++ LD a0, 2 * SIZE(A1) ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ ldi A1, 2 * SIZE(A1) ++ MUL x1, a3, t2 ++ LD a3, 1 * SIZE(A2) ++ ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ MUL x1, a2, t3 ++ LD a2, 0 * SIZE(A2) ++ bgt I, $L16 ++ .align 4 ++ ++$L17: ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x0, a1, t1 ++ unop ++ ++ ADD3 s2, t2, $f30 ++ fmov $f30, s2 ++ MUL x0, a2, t2 ++ ADD4 s3, t3, $f30 ++ fmov $f30, s3 ++ MUL x0, a3, t3 ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x1, a0, t1 ++ ++ ADD1 s2, t2, $f30 ++ fmov $f30, s2 ++ MUL x1, a3, t2 ++ ADD2 s3, t3, $f30 ++ fmov $f30, s3 ++ MUL x1, a2, t3 ++ .align 4 ++ ++$L18: ++ LD a0, 0 * SIZE(Y) ++ unop ++ LD a1, 1 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ LD a2, 0 * SIZE(Y) ++ unop ++ LD a3, 1 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ ADD3 s0, t0, a8 ++ ADD4 s1, t1, a9 ++ ADD3 s2, t2, a10 ++ ADD4 s3, t3, a11 ++ ++ fmov a8, s0 ++ fmov a9, s1 ++ 
fmov a10, s2 ++ fmov a11, s3 ++ ++ MUL alpha_r, s0, t0 ++ MUL alpha_r, s1, t1 ++ MUL alpha_r, s2, t2 ++ MUL alpha_r, s3, t3 ++ ++ ADD a0, t0, a8 ++ MUL alpha_i, s1, t0 ++ ADD a1, t1, a9 ++ MUL alpha_i, s0, t1 ++ ADD a2, t2, a10 ++ MUL alpha_i, s3, t2 ++ ADD a3, t3, a11 ++ MUL alpha_i, s2, t3 ++ ++ SUB a8, t0, a0 ++ ADD a9, t1, a1 ++ SUB a10, t2, a2 ++ ADD a11, t3, a3 ++ ++ ST a0, 0 * SIZE(Y1) ++ fclr t0 ++ ST a1, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ST a2, 0 * SIZE(Y1) ++ fclr t1 ++ ST a3, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ fclr t2 ++ ldi J, -1(J) ++ fclr t3 ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ blbc N, $L999 ++ ++ mov A, A1 ++ fclr s0 ++ fclr s1 ++ mov X, X1 ++ ++ sra M, 2, I ++ fclr s2 ++ fclr s3 ++ ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 3 * SIZE(A1) ++ LD a8, 4 * SIZE(A1) ++ LD a9, 5 * SIZE(A1) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 7 * SIZE(A1) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x0, a1, t1 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s2, t0, $f30 ++ fmov $f30, s2 ++ ldi I, -1(I) ++ MUL x1, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 s3, t1, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x1, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x2, a4, t0 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x2, a5, t1 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s2, t0, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x3, a5, t0 ++ LD a5, 11 * SIZE(A1) ++ ++ ADD2 s3, t1, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x3, a4, t1 ++ LD a4, 10 * SIZE(A1) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x0, a8, t0 ++ LD x3, 7 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x0, a9, t1 ++ LD x0, 8 * SIZE(X1) ++ ++ ADD1 s2, t0, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x1, a9, t0 ++ LD a9, 13 * SIZE(A1) ++ ++ ADD2 s3, t1, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x1, a8, t1 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x2, a12, t0 ++ LD x1, 9 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x2, a13, t1 ++ LD x2, 10 * SIZE(X1) ++ ++ ADD1 s2, t0, $f30 ++ fmov $f30, s2 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a13, t0 ++ LD a13, 7 * SIZE(A1) ++ ++ ADD2 s3, t1, $f30 ++ fmov $f30, s3 ++ MUL x3, a12, t1 ++ LD a12, 6 * SIZE(A1) ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x0, a1, t1 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s2, t0, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x1, a1, t0 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD2 s3, t1, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x1, a0, t1 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x2, a4, t0 ++ unop ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x2, a5, t1 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s2, t0, $f30 ++ fmov $f30, s2 ++ unop ++ MUL x3, a5, t0 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD2 s3, t1, $f30 ++ fmov $f30, s3 ++ unop ++ MUL x3, a4, t1 ++ LD x3, -1 * SIZE(X1) ++ ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ MUL x0, a8, t0 ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x0, a9, t1 ++ ++ ADD1 s2, t0, $f30 ++ fmov $f30, s2 ++ MUL x1, a9, t0 ++ ADD2 s3, t1, $f30 ++ fmov $f30, s3 ++ MUL x1, a8, t1 ++ ++ ADD3 s0, 
t0, $f30 ++ fmov $f30, s0 ++ MUL x2, a12, t0 ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x2, a13, t1 ++ ++ ADD1 s2, t0, $f30 ++ fmov $f30, s2 ++ MUL x3, a13, t0 ++ ADD2 s3, t1, $f30 ++ fmov $f30, s3 ++ MUL x3, a12, t1 ++ .align 4 ++ ++$L25: ++ and M, 3, I ++ ble I, $L28 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L27 ++ .align 4 ++ ++$L26: ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ ldi A1, 2 * SIZE(A1) ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ ldi I, -1(I) ++ MUL x0, a1, t1 ++ LD x0, 2 * SIZE(X1) ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ ldi X1, 2 * SIZE(X1) ++ MUL x1, a1, t0 ++ LD a1, 1 * SIZE(A1) ++ ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x1, a0, t1 ++ LD a0, 0 * SIZE(A1) ++ bgt I, $L26 ++ .align 4 ++ ++$L27: ++ ADD3 s0, t0, $f30 ++ fmov $f30, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, $f30 ++ fmov $f30, s1 ++ unop ++ MUL x0, a1, t1 ++ unop ++ ++ ADD1 s0, t0, $f30 ++ fmov $f30, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, $f30 ++ fmov $f30, s1 ++ MUL x1, a0, t1 ++ .align 4 ++ ++$L28: ++ LD a0, 0 * SIZE(Y) ++ LD a1, 1 * SIZE(Y) ++ ++ ADD3 s0, t0, a8 ++ ADD4 s1, t1, a9 ++ ADD3 s2, t2, a10 ++ ADD4 s3, t3, a11 ++ ++ ADD a8, a10, s0 ++ ADD a9, a11, s1 ++ ++ MUL alpha_r, s0, t0 ++ MUL alpha_r, s1, t1 ++ ++ ADD a0, t0, a8 ++ MUL alpha_i, s1, t0 ++ ADD a1, t1, a9 ++ MUL alpha_i, s0, t1 ++ ++ SUB a8, t0, a0 ++ ADD a9, t1, a1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zgemv_t.S.bak b/kernel/sw_64/zgemv_t.S.bak +new file mode 100644 +index 0000000..f857fb7 +--- /dev/null ++++ b/kernel/sw_64/zgemv_t.S.bak +@@ -0,0 +1,922 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define STACKSIZE 64 ++#define PREFETCHSIZE 32 ++ ++#define M $16 ++#define N $17 ++#define A $21 ++#define LDA $18 ++ ++#define X $19 ++#define INCX $20 ++#define Y $22 ++#define INCY $23 ++ ++#define BUFFER $24 ++ ++#define I $25 ++#define J $27 ++ ++#define X1 $3 ++#define Y1 $4 ++#define A1 $5 ++#define A2 $6 ++ ++#define alpha_r $f19 ++#define alpha_i $f20 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f21 ++ ++#define a0 $f22 ++#define a1 $f23 ++#define a2 $f24 ++#define a3 $f25 ++#define a4 $f26 ++#define a5 $f27 ++#define a6 $f28 ++#define a7 $f29 ++ ++#define a8 $f2 ++#define a9 $f3 ++#define a10 $f4 ++#define a11 $f5 ++#define a12 $f6 ++#define a13 $f7 ++#define a14 $f8 ++#define a15 $f9 ++ ++#if !defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#elif !defined(CONJ) && defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#elif defined(CONJ) && !defined(XCONJ) ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#else ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 SUB ++#define ADD4 SUB ++#endif ++ ++ PROLOGUE ++ ++ ldi $sp, -STACKSIZE($sp) ++ ldl LDA, 0 + STACKSIZE($sp) ++ ldl X, 8 + STACKSIZE($sp) ++ ldl INCX, 16 + STACKSIZE($sp) ++ ldl Y, 24 + STACKSIZE($sp) ++ ldl INCY, 32 + STACKSIZE($sp) ++ ldl BUFFER, 40 + STACKSIZE($sp) ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ PROFCODE ++ ++ cmple M, 0, $0 ++ sll INCX, ZBASE_SHIFT, INCX ++ cmple N, 0, $1 ++ sll INCY, ZBASE_SHIFT, INCY ++ ++ or $0, $1, $0 ++ bne $0, $L999 ++ ++ cmpeq INCX, 2 * SIZE, $0 ++ mov X, X1 ++ sll LDA, ZBASE_SHIFT,LDA ++ bne $0, $L10 ++ ++ sra M, 2, I ++ mov BUFFER, Y1 ++ mov BUFFER, X ++ ble I, $L05 ++ .align 4 ++ ++$L02: ++ fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ ldi I, -1(I) ++ ++ LD a0, 0 * SIZE(X1) ++ LD a1, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a2, 0 * SIZE(X1) ++ LD a3, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ST a2, 2 * SIZE(Y1) ++ ST a3, 3 * SIZE(Y1) ++ ++ LD a4, 0 * SIZE(X1) ++ LD a5, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ LD a6, 0 * SIZE(X1) ++ LD a7, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a4, 4 * SIZE(Y1) ++ ST a5, 5 * SIZE(Y1) 
++ ST a6, 6 * SIZE(Y1) ++ ST a7, 7 * SIZE(Y1) ++ ++ ldi Y1, 8 * SIZE(Y1) ++ bgt I, $L02 ++ .align 4 ++ ++$L05: ++ and M, 3, I ++ ble I, $L10 ++ .align 4 ++ ++$L06: ++ LD a0, 0 * SIZE(X1) ++ LD a1, 1 * SIZE(X1) ++ addl X1, INCX, X1 ++ ++ ST a0, 0 * SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ ldi Y1, 2 * SIZE(Y1) ++ ++ ldi I, -1(I) ++ bgt I, $L06 ++ .align 4 ++ ++$L10: ++ mov Y, Y1 ++ fclr t0 ++ unop ++ fclr t1 ++ ++ sra N, 1, J ++ fclr t2 ++ fclr t3 ++ ble J, $L20 ++ .align 4 ++ ++$L11: ++ mov A, A1 ++ fclr s0 ++ addl A, LDA, A2 ++ fclr s1 ++ ++ addl A2, LDA, A ++ unop ++ mov X, X1 ++ fillcs 3 * SIZE(Y) ++ ++ sra M, 2, I ++ fclr s2 ++ fclr s3 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 3 * SIZE(A1) ++ LD a6, 2 * SIZE(A2) ++ LD a7, 3 * SIZE(A2) ++ ++ LD a8, 4 * SIZE(A1) ++ LD a9, 5 * SIZE(A1) ++ LD a10, 4 * SIZE(A2) ++ LD a11, 5 * SIZE(A2) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 7 * SIZE(A1) ++ LD a14, 6 * SIZE(A2) ++ LD a15, 7 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a1, t1 ++ unop ++ ++ ADD3 s2, t2, s2 ++ unop ++ MUL x0, a2, t2 ++ unop ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ unop ++ MUL x1, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ unop ++ MUL x1, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x1, a3, t2 ++ LD a3, 9 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ unop ++ MUL x1, a2, t3 ++ LD a2, 8 * SIZE(A2) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a4, t0 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ MUL x2, a5, t1 ++ ADD3 s2, t2, s2 ++ MUL x2, a6, t2 ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x2, a7, t3 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ unop ++ MUL x3, a5, t0 ++ LD a5, 11 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ unop ++ MUL x3, a4, t1 ++ LD a4, 10 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x3, a7, t2 ++ LD a7, 11 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ unop ++ MUL x3, a6, t3 ++ LD a6, 10 * SIZE(A2) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a8, t0 ++ LD x3, 7 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A2) ++ MUL x0, a9, t1 ++ unop ++ ++ ADD3 s2, t2, s2 ++ ldi I, -1(I) ++ MUL x0, a10, t2 ++ unop ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a11, t3 ++ LD x0, 8 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ unop ++ MUL x1, a9, t0 ++ LD a9, 13 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ unop ++ MUL x1, a8, t1 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a11, t2 ++ LD a11, 13 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ unop ++ MUL x1, a10, t3 ++ LD a10, 12 * SIZE(A2) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a12, t0 ++ LD x1, 9 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ fillcs (PREFETCHSIZE + 0) * SIZE(X1) ++ MUL x2, a13, t1 ++ ldi A2, 8 * SIZE(A2) ++ ++ ADD3 s2, t2, s2 ++ unop ++ MUL x2, a14, t2 ++ unop ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x2, a15, t3 ++ LD x2, 10 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ unop ++ MUL x3, a13, t0 ++ LD a13, 7 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a12, t1 ++ LD a12, 6 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x3, a15, t2 ++ LD a15, 7 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ MUL x3, a14, t3 ++ LD a14, 6 * SIZE(A2) ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * 
SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ MUL x0, a1, t1 ++ ADD3 s2, t2, s2 ++ MUL x0, a2, t2 ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x1, a3, t2 ++ unop ++ ++ ADD2 s3, t3, s3 ++ ldi A1, 8 * SIZE(A1) ++ MUL x1, a2, t3 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD3 s0, t0, s0 ++ MUL x2, a4, t0 ++ ADD4 s1, t1, s1 ++ MUL x2, a5, t1 ++ ++ ADD3 s2, t2, s2 ++ unop ++ MUL x2, a6, t2 ++ unop ++ ++ ADD4 s3, t3, s3 ++ ldi A2, 8 * SIZE(A2) ++ MUL x2, a7, t3 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ MUL x3, a5, t0 ++ ADD2 s1, t1, s1 ++ MUL x3, a4, t1 ++ ++ ADD1 s2, t2, s2 ++ unop ++ MUL x3, a7, t2 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD2 s3, t3, s3 ++ unop ++ MUL x3, a6, t3 ++ LD x3, -1 * SIZE(X1) ++ ++ ADD3 s0, t0, s0 ++ MUL x0, a8, t0 ++ ADD4 s1, t1, s1 ++ MUL x0, a9, t1 ++ ++ ADD3 s2, t2, s2 ++ MUL x0, a10, t2 ++ ADD4 s3, t3, s3 ++ MUL x0, a11, t3 ++ ++ ADD1 s0, t0, s0 ++ MUL x1, a9, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a8, t1 ++ ++ ADD1 s2, t2, s2 ++ MUL x1, a11, t2 ++ ADD2 s3, t3, s3 ++ MUL x1, a10, t3 ++ ++ ADD3 s0, t0, s0 ++ MUL x2, a12, t0 ++ ADD4 s1, t1, s1 ++ MUL x2, a13, t1 ++ ++ ADD3 s2, t2, s2 ++ MUL x2, a14, t2 ++ ADD4 s3, t3, s3 ++ MUL x2, a15, t3 ++ ++ ADD1 s0, t0, s0 ++ MUL x3, a13, t0 ++ ADD2 s1, t1, s1 ++ MUL x3, a12, t1 ++ ++ ADD1 s2, t2, s2 ++ MUL x3, a15, t2 ++ ADD2 s3, t3, s3 ++ MUL x3, a14, t3 ++ .align 4 ++ ++$L15: ++ and M, 3, I ++ ble I, $L18 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a2, 0 * SIZE(A2) ++ LD a3, 1 * SIZE(A2) ++ ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L17 ++ .align 4 ++ ++$L16: ++ ADD3 s0, t0, s0 ++ ldi I, -1(I) ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ MUL x0, a1, t1 ++ ADD3 s2, t2, s2 ++ MUL x0, a2, t2 ++ ++ ADD4 s3, t3, s3 ++ unop ++ MUL x0, a3, t3 ++ LD x0, 2 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ ldi A2, 2 * SIZE(A2) ++ MUL x1, a1, t0 ++ LD a1, 3 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ ldi X1, 2 * SIZE(X1) ++ MUL x1, a0, t1 ++ LD a0, 2 * SIZE(A1) ++ ++ ADD1 s2, t2, s2 ++ ldi A1, 2 * SIZE(A1) ++ MUL x1, a3, t2 ++ LD a3, 1 * SIZE(A2) ++ ++ ADD2 s3, t3, s3 ++ MUL x1, a2, t3 ++ LD a2, 0 * SIZE(A2) ++ bgt I, $L16 ++ .align 4 ++ ++$L17: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ unop ++ ++ ADD3 s2, t2, s2 ++ MUL x0, a2, t2 ++ ADD4 s3, t3, s3 ++ MUL x0, a3, t3 ++ ++ ADD1 s0, t0, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ ++ ADD1 s2, t2, s2 ++ MUL x1, a3, t2 ++ ADD2 s3, t3, s3 ++ MUL x1, a2, t3 ++ .align 4 ++ ++$L18: ++ LD a0, 0 * SIZE(Y) ++ unop ++ LD a1, 1 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ LD a2, 0 * SIZE(Y) ++ unop ++ LD a3, 1 * SIZE(Y) ++ addl Y, INCY, Y ++ ++ ADD3 s0, t0, s0 ++ ADD4 s1, t1, s1 ++ ADD3 s2, t2, s2 ++ ADD4 s3, t3, s3 ++ ++ MUL alpha_r, s0, t0 ++ MUL alpha_r, s1, t1 ++ MUL alpha_r, s2, t2 ++ MUL alpha_r, s3, t3 ++ ++ ADD a0, t0, a0 ++ MUL alpha_i, s1, t0 ++ ADD a1, t1, a1 ++ MUL alpha_i, s0, t1 ++ ADD a2, t2, a2 ++ MUL alpha_i, s3, t2 ++ ADD a3, t3, a3 ++ MUL alpha_i, s2, t3 ++ ++ SUB a0, t0, a0 ++ ADD a1, t1, a1 ++ SUB a2, t2, a2 ++ ADD a3, t3, a3 ++ ++ ST a0, 0 * SIZE(Y1) ++ fclr t0 ++ ST a1, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ ST a2, 0 * SIZE(Y1) ++ fclr t1 ++ ST a3, 1 * SIZE(Y1) ++ addl Y1, INCY, Y1 ++ ++ fclr t2 ++ ldi J, -1(J) ++ fclr t3 ++ bgt J, $L11 ++ .align 4 ++ ++$L20: ++ blbc N, $L999 ++ ++ mov A, A1 ++ fclr s0 ++ fclr s1 ++ mov X, X1 ++ ++ sra M, 2, I ++ fclr s2 ++ fclr s3 ++ 
ble I, $L25 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ LD a4, 2 * SIZE(A1) ++ LD a5, 3 * SIZE(A1) ++ LD a8, 4 * SIZE(A1) ++ LD a9, 5 * SIZE(A1) ++ LD a12, 6 * SIZE(A1) ++ LD a13, 7 * SIZE(A1) ++ ++ LD x0, 0 * SIZE(X1) ++ LD x1, 1 * SIZE(X1) ++ LD x2, 2 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L23 ++ .align 4 ++ ++$L22: ++ ADD3 s0, t0, s0 ++ fillcs (PREFETCHSIZE + 0) * SIZE(A1) ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ ldi I, -1(I) ++ MUL x1, a1, t0 ++ LD a1, 9 * SIZE(A1) ++ ++ ADD2 s3, t1, s3 ++ unop ++ MUL x1, a0, t1 ++ LD a0, 8 * SIZE(A1) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a4, t0 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x2, a5, t1 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ unop ++ MUL x3, a5, t0 ++ LD a5, 11 * SIZE(A1) ++ ++ ADD2 s3, t1, s3 ++ unop ++ MUL x3, a4, t1 ++ LD a4, 10 * SIZE(A1) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a8, t0 ++ LD x3, 7 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a9, t1 ++ LD x0, 8 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ unop ++ MUL x1, a9, t0 ++ LD a9, 13 * SIZE(A1) ++ ++ ADD2 s3, t1, s3 ++ unop ++ MUL x1, a8, t1 ++ LD a8, 12 * SIZE(A1) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a12, t0 ++ LD x1, 9 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ ldi A1, 8 * SIZE(A1) ++ MUL x2, a13, t1 ++ LD x2, 10 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ ldi X1, 8 * SIZE(X1) ++ MUL x3, a13, t0 ++ LD a13, 7 * SIZE(A1) ++ ++ ADD2 s3, t1, s3 ++ MUL x3, a12, t1 ++ LD a12, 6 * SIZE(A1) ++ bgt I, $L22 ++ .align 4 ++ ++$L23: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x3, 3 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ LD x0, 4 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ unop ++ MUL x1, a1, t0 ++ ldi A1, 8 * SIZE(A1) ++ ++ ADD2 s3, t1, s3 ++ unop ++ MUL x1, a0, t1 ++ LD x1, 5 * SIZE(X1) ++ ++ ADD3 s0, t0, s0 ++ unop ++ MUL x2, a4, t0 ++ unop ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x2, a5, t1 ++ LD x2, 6 * SIZE(X1) ++ ++ ADD1 s2, t0, s2 ++ unop ++ MUL x3, a5, t0 ++ ldi X1, 8 * SIZE(X1) ++ ++ ADD2 s3, t1, s3 ++ unop ++ MUL x3, a4, t1 ++ LD x3, -1 * SIZE(X1) ++ ++ ADD3 s0, t0, s0 ++ MUL x0, a8, t0 ++ ADD4 s1, t1, s1 ++ MUL x0, a9, t1 ++ ++ ADD1 s2, t0, s2 ++ MUL x1, a9, t0 ++ ADD2 s3, t1, s3 ++ MUL x1, a8, t1 ++ ++ ADD3 s0, t0, s0 ++ MUL x2, a12, t0 ++ ADD4 s1, t1, s1 ++ MUL x2, a13, t1 ++ ++ ADD1 s2, t0, s2 ++ MUL x3, a13, t0 ++ ADD2 s3, t1, s3 ++ MUL x3, a12, t1 ++ .align 4 ++ ++$L25: ++ and M, 3, I ++ ble I, $L28 ++ ++ LD a0, 0 * SIZE(A1) ++ LD a1, 1 * SIZE(A1) ++ ++ LD x0, 0 * SIZE(X1) ++ ++ ldi I, -1(I) ++ ble I, $L27 ++ .align 4 ++ ++$L26: ++ ADD3 s0, t0, s0 ++ ldi A1, 2 * SIZE(A1) ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ ldi I, -1(I) ++ MUL x0, a1, t1 ++ LD x0, 2 * SIZE(X1) ++ ++ ADD1 s0, t0, s0 ++ ldi X1, 2 * SIZE(X1) ++ MUL x1, a1, t0 ++ LD a1, 1 * SIZE(A1) ++ ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ LD a0, 0 * SIZE(A1) ++ bgt I, $L26 ++ .align 4 ++ ++$L27: ++ ADD3 s0, t0, s0 ++ unop ++ MUL x0, a0, t0 ++ LD x1, 1 * SIZE(X1) ++ ++ ADD4 s1, t1, s1 ++ unop ++ MUL x0, a1, t1 ++ unop ++ ++ ADD1 s0, t0, s0 ++ MUL x1, a1, t0 ++ ADD2 s1, t1, s1 ++ MUL x1, a0, t1 ++ .align 4 ++ ++$L28: ++ LD a0, 0 * SIZE(Y) ++ LD a1, 1 * SIZE(Y) ++ ++ ADD3 s0, t0, s0 ++ ADD4 s1, t1, s1 ++ ADD3 s2, t2, s2 ++ ADD4 s3, t3, s3 ++ ++ ADD s0, s2, s0 ++ ADD s1, s3, s1 ++ ++ MUL alpha_r, s0, t0 ++ MUL alpha_r, s1, t1 ++ ++ ADD a0, t0, a0 ++ MUL alpha_i, s1, t0 ++ ADD a1, t1, a1 ++ MUL alpha_i, s0, t1 ++ ++ SUB a0, t0, a0 ++ ADD a1, t1, a1 ++ ++ ST a0, 0 * 
SIZE(Y1) ++ ST a1, 1 * SIZE(Y1) ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ++ ldi $sp, STACKSIZE($sp) ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/znrm2.S b/kernel/sw_64/znrm2.S +new file mode 100644 +index 0000000..c1b7375 +--- /dev/null ++++ b/kernel/sw_64/znrm2.S +@@ -0,0 +1,441 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stl $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ sll INCX, ZBASE_SHIFT, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, 2 * SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, $f25 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, $f26 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, $f27 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, $f28 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd $f25, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd $f26, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd $f27, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd $f28, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, $f25 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, $f26 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, $f27 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ faddd a3, t3, $f28 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd $f25, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd $f26, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd $f27, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd $f28, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, $f25 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, $f26 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, $f27 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, $f28 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd $f25, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd $f26, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd $f27, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd $f28, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, $f25 ++ fmuld x0, x0, t0 ++ faddd a1, t1, $f26 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, $f27 ++ fmuld x2, x2, t2 ++ faddd a3, t3, $f28 ++ fmuld x3, x3, t3 ++ ++ faddd $f25, t0, a0 ++ fmuld x4, x4, t0 ++ faddd $f26, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd $f27, t2, a2 ++ fmuld x6, x6, t2 ++ faddd 
$f28, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a2, t2, $f27 ++ fmov $f27, a2 ++ faddd a3, t3, $f28 ++ fmov $f28, a3 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ LD x1, 1 * SIZE(X) ++ ++ ldi X, 2 * SIZE(X) ++ ++ faddd a0, t0, $f25 ++ fmov $f25, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, $f26 ++ fmov $f26, a1 ++ fmuld x1, x1, t1 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 2, I ++ fclr t1 ++ ble I, $L25 ++ ++ LD x0, 0 * SIZE(X) ++ fclr t2 ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ fclr t3 ++ LD x3, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x5, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x6, 0 * SIZE(X) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, $f25 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, $f26 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, $f27 ++ LD x1, 1 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, $f28 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ unop ++ ++ faddd $f25, t0, a0 ++ LD x3, 1 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd $f26, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ ldi I, -1(I) ++ ++ faddd $f27, t2, a2 ++ LD x5, 1 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd $f28, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, $f25 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, $f26 ++ fmuld x1, x1, t1 ++ faddd a2, t2, $f27 ++ fmuld x2, x2, t2 ++ ++ faddd a3, t3, $f28 ++ fmuld x3, x3, t3 ++ faddd $f25, t0, a0 ++ fmuld x4, x4, t0 ++ ++ faddd $f26, t1, a1 ++ fmuld x5, x5, t1 ++ faddd $f27, t2, a2 ++ fmuld x6, x6, t2 ++ ++ faddd $f28, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a2, t2, $f27 ++ fmov $f27, a2 ++ faddd a3, t3, $f28 ++ fmov $f28, a3 ++ .align 4 ++ ++$L25: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, $f25 ++ fmov $f25, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, $f26 ++ fmov $f26, a1 ++ fmuld x1, x1, t1 ++ ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, $f25 ++ faddd a1, t1, $f26 ++ fmov $f25, a0 ++ fmov $f26, a1 ++ ++ faddd a0, a1, $f25 ++ fmov $f25, a0 ++ faddd a2, a3, $f26 ++ fmov $f26, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr $26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, $f25 ++ fmov $f25, a0 ++ fsqrtd a0, $f25 ++ fmov $f25, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/znrm2.S.bak b/kernel/sw_64/znrm2.S.bak +new file mode 100644 +index 0000000..b2e80e0 +--- /dev/null ++++ b/kernel/sw_64/znrm2.S.bak +@@ -0,0 +1,426 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCH_SIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++#if defined(EV4) || defined(EV5) ++ .frame $30,16,$26,0 ++ .mask 0x4000000,-16 ++ ldih $29, 0($27) !gpdisp!1 ++ ldi $29, 0($29) !gpdisp!1 ++ ++ ldi $sp, -16($sp) ++ ldl $27, sqrt($29) !literal!2 ++ stq $26, 0($sp) ++ ++ PROFCODE ++ .prologue 1 ++#else ++ PROFCODE ++#endif ++ ++ fclr a0 ++ sll INCX, ZBASE_SHIFT, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, 2 * SIZE, $0 ++ fclr a3 ++ beq $0, $L20 ++ ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ faddd a0, t0, a0 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ mov X, XX ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ fmuld x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ 
faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ ldi I, -1(I) ++ fmuld x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ faddd a0, t0, a0 ++ mov X, XX ++ fmuld x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ faddd a3, t3, a3 ++ unop ++ fmuld x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ unop ++ fmuld x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ faddd a1, t1, a1 ++ unop ++ fmuld x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ faddd a2, t2, a2 ++ unop ++ fmuld x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ faddd a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ fmuld x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ LD x1, 1 * SIZE(X) ++ ++ ldi X, 2 * SIZE(X) ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 2, I ++ fclr t1 ++ ble I, $L25 ++ ++ LD x0, 0 * SIZE(X) ++ fclr t2 ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ fclr t3 ++ LD x3, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x5, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x6, 0 * SIZE(X) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ fmuld x1, x1, t1 ++ unop ++ ++ faddd a2, t2, a2 ++ LD x1, 1 * SIZE(X) ++ fmuld x2, x2, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ fmuld x3, x3, t3 ++ unop ++ ++ faddd a0, t0, a0 ++ LD x3, 1 * SIZE(X) ++ fmuld x4, x4, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ fmuld x5, x5, t1 ++ ldi I, -1(I) ++ ++ faddd a2, t2, a2 ++ LD x5, 1 * SIZE(X) ++ fmuld x6, x6, t2 ++ addl X, INCX, X ++ ++ faddd a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ fmuld x7, x7, t3 ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ faddd a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ fmuld x0, x0, t0 ++ addl X, INCX, X ++ ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ faddd a2, t2, a2 ++ fmuld x2, x2, t2 ++ ++ faddd a3, t3, a3 ++ fmuld x3, x3, t3 ++ faddd a0, t0, a0 ++ fmuld x4, x4, t0 ++ ++ faddd a1, t1, a1 ++ fmuld x5, x5, t1 ++ faddd a2, t2, a2 ++ fmuld x6, x6, t2 ++ ++ faddd a3, t3, a3 ++ fmuld x7, x7, t3 ++ faddd a2, t2, a2 ++ faddd a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ faddd a0, t0, a0 ++ fmuld x0, x0, t0 ++ faddd a1, t1, a1 ++ fmuld x1, x1, t1 ++ ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ faddd a0, t0, a0 ++ faddd a1, t1, a1 ++ ++ faddd a0, a1, a0 ++ faddd a2, a3, a2 ++ ++#if defined(EV4) || defined(EV5) ++ faddd a0, a2, $f16 ++ jsr 
$26, ($27), sqrt !lituse_jsr!2 ++ ++ ldih $29, 0($26) !gpdisp!3 ++ ldi $29, 0($29) !gpdisp!3 ++#else ++ faddd a0, a2, a0 ++ fsqrtd a0, a0 ++#endif ++ .align 4 ++ ++$L999: ++#if defined(EV4) || defined(EV5) ++ ldl $26, 0($sp) ++ ldi $sp, 16($sp) ++#endif ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/znrm2_simd.S b/kernel/sw_64/znrm2_simd.S +new file mode 100644 +index 0000000..5a509d4 +--- /dev/null ++++ b/kernel/sw_64/znrm2_simd.S +@@ -0,0 +1,492 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 80 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define XX $19 ++ ++#define I $0 ++ ++#define a0 $f0 ++#define a1 $f1 ++#define a2 $f10 ++#define a3 $f11 ++#define t0 $f12 ++#define t1 $f13 ++#define t2 $f14 ++#define t3 $f15 ++ ++#define x0 $f16 ++#define x1 $f17 ++#define x2 $f18 ++#define x3 $f19 ++#define x4 $f20 ++#define x5 $f21 ++#define x6 $f22 ++#define x7 $f23 ++ ++ PROLOGUE ++ ++ PROFCODE ++ ++ fclr a0 ++ sll INCX, ZBASE_SHIFT, INCX ++ fclr a1 ++ ble N, $L999 ++ ++ fclr a2 ++ cmpeq INCX, 2 * SIZE, $0 ++ fclr a3 ++ beq $0, $L20 #stride access ++ ++ ++/* test the address of X */ ++ and X, (VEC_LEN*SIZE-1), $3 ++ fclr t0 ++ fclr t1 ++ bne $3, $UnAlign_ACCESS ++/*Align access. Use simd instructions. 
Unloop 8 complex*/ ++ sra N, 3, I ++ ble I, $Remain ++ ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t0 #clear s0 vector ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t1 ++ ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t2 ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ vcpys $f31, $f31, t3 ++ ++ addl X, 16 * SIZE, X ++ subl I, 1, I ++ nop ++ ble I, $MainLoopEnd ++$MainLoop: ++ fillcs PREFETCHSIZE * SIZE(X) ++ VMAD a0, a0, t0, t0 ++ subl I, 1, I ++ VMAD a1, a1, t1, t1 ++ ++ addl X, 16 * SIZE, X ++ VMAD a2, a2, t2, t2 ++ nop ++ VMAD a3, a3, t3, t3 ++ ++ VLD a0, -4*VEC_LEN*SIZE(X) ++ VLD a1, -3*VEC_LEN*SIZE(X) ++ VLD a2, -2*VEC_LEN*SIZE(X) ++ VLD a3, -1*VEC_LEN*SIZE(X) ++ ++ bgt I, $MainLoop ++ .align 4 ++$MainLoopEnd: ++ VMAD a0, a0, t0, t0 ++ VMAD a1, a1, t1, t1 ++ VMAD a2, a2, t2, t2 ++ VMAD a3, a3, t3, t3 ++ ++ VADD t0, t1, a0 ++ VADD t2, t3, a1 ++ nop ++ VADD a0, a1, t0 ++ ++ vextf t0, 1, t1 ++ vextf t0, 2, t2 ++ vextf t0, 3, t3 ++ nop ++ ++ ADD t0, t1, a2 ++ ADD t2, t3, a3 ++ fclr t1 ++ ADD a2, a3, t0 ++ ++ .align 4 ++$Remain: ++ and N, 7, I ++ ble I, $End ++ .align 4 ++$RemainLoop: ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ ++ addl X, 2*SIZE, X ++ MAD a0, a0, t0, t0 ++ subl I, 1, I ++ MAD a1, a1, t1, t1 ++ ++ bgt I, $RemainLoop ++ .align 4 ++ ++ ADD t0, t1, t0 ++$End: ++ SQRT t0, a0 ++ ret ++ .align 4 ++ ++$UnAlign_ACCESS: ++ ++ fclr t0 ++ sra N, 3, I ++ fclr t1 ++ ble I, $L15 ++ ++ fclr t2 ++ LD x0, 0 * SIZE(X) ++ fclr t3 ++ LD x1, 1 * SIZE(X) ++ ++ LD x2, 2 * SIZE(X) ++ LD x3, 3 * SIZE(X) ++ LD x4, 4 * SIZE(X) ++ LD x5, 5 * SIZE(X) ++ LD x6, 6 * SIZE(X) ++ LD x7, 7 * SIZE(X) ++ ++ ldi I, -1(I) ++ ble I, $L12 ++ .align 4 ++ ++$L11: ++ ADD a0, t0, a0 ++ fillcs (PREFETCHSIZE) * SIZE(X) ++ MUL x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ ADD a1, t1, a1 ++ mov X, XX ++ MUL x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ ADD a3, t3, a3 ++ unop ++ MUL x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ ADD a0, t0, a0 ++ unop ++ MUL x4, x4, t0 ++ LD x4, 12 * SIZE(X) ++ ++ ADD a1, t1, a1 ++ unop ++ MUL x5, x5, t1 ++ LD x5, 13 * SIZE(X) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x6, x6, t2 ++ LD x6, 14 * SIZE(X) ++ ++ ADD a3, t3, a3 ++ unop ++ MUL x7, x7, t3 ++ LD x7, 15 * SIZE(X) ++ ++ ADD a0, t0, a0 ++ unop ++ MUL x0, x0, t0 ++ LD x0, 16 * SIZE(X) ++ ++ ADD a1, t1, a1 ++ ldi X, 16 * SIZE(X) ++ MUL x1, x1, t1 ++ LD x1, 17 * SIZE(XX) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x2, x2, t2 ++ LD x2, 18 * SIZE(XX) ++ ++ ADD a3, t3, a3 ++ unop ++ MUL x3, x3, t3 ++ LD x3, 19 * SIZE(XX) ++ ++ ADD a0, t0, a0 ++ unop ++ MUL x4, x4, t0 ++ LD x4, 20 * SIZE(XX) ++ ++ ADD a1, t1, a1 ++ ldi I, -1(I) ++ MUL x5, x5, t1 ++ LD x5, 21 * SIZE(XX) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x6, x6, t2 ++ LD x6, 22 * SIZE(XX) ++ ++ ADD a3, t3, a3 ++ MUL x7, x7, t3 ++ LD x7, 23 * SIZE(XX) ++ bgt I, $L11 ++ .align 4 ++ ++$L12: ++ ADD a0, t0, a0 ++ mov X, XX ++ MUL x0, x0, t0 ++ LD x0, 8 * SIZE(X) ++ ++ ADD a1, t1, a1 ++ unop ++ MUL x1, x1, t1 ++ LD x1, 9 * SIZE(X) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x2, x2, t2 ++ LD x2, 10 * SIZE(X) ++ ++ ADD a3, t3, a3 ++ unop ++ MUL x3, x3, t3 ++ LD x3, 11 * SIZE(X) ++ ++ ADD a0, t0, a0 ++ unop ++ MUL x4, x4, t0 ++ LD x4, 12 * SIZE(XX) ++ ++ ADD a1, t1, a1 ++ unop ++ MUL x5, x5, t1 ++ LD x5, 13 * SIZE(XX) ++ ++ ADD a2, t2, a2 ++ unop ++ MUL x6, x6, t2 ++ LD x6, 14 * SIZE(XX) ++ ++ ADD a3, t3, a3 ++ ldi X, 16 * SIZE(X) ++ MUL x7, x7, t3 ++ LD x7, 15 * SIZE(XX) ++ ++ ADD a0, t0, a0 ++ MUL x0, x0, t0 ++ ADD a1, t1, a1 ++ MUL x1, x1, t1 ++ ++ ADD a2, t2, a2 ++ MUL 
x2, x2, t2 ++ ADD a3, t3, a3 ++ MUL x3, x3, t3 ++ ++ ADD a0, t0, a0 ++ MUL x4, x4, t0 ++ ADD a1, t1, a1 ++ MUL x5, x5, t1 ++ ++ ADD a2, t2, a2 ++ MUL x6, x6, t2 ++ ADD a3, t3, a3 ++ MUL x7, x7, t3 ++ ++ ADD a2, t2, a2 ++ ADD a3, t3, a3 ++ .align 4 ++ ++$L15: ++ and N, 7, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD x0, 0 * SIZE(X) ++ LD x1, 1 * SIZE(X) ++ ++ ldi X, 2 * SIZE(X) ++ ++ ADD a0, t0, a0 ++ MUL x0, x0, t0 ++ ADD a1, t1, a1 ++ MUL x1, x1, t1 ++ ++ ldi I, -1(I) ++ bgt I, $L16 ++ bsr $31, $L998 ++ .align 4 ++ ++$L20: ++ fclr t0 ++ sra N, 2, I ++ fclr t1 ++ ble I, $L25 ++ ++ LD x0, 0 * SIZE(X) ++ fclr t2 ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ LD x2, 0 * SIZE(X) ++ fclr t3 ++ LD x3, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x4, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x5, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ LD x6, 0 * SIZE(X) ++ ble I, $L22 ++ .align 4 ++ ++$L21: ++ ADD a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ MUL x0, x0, t0 ++ addl X, INCX, X ++ ++ ADD a1, t1, a1 ++ LD x0, 0 * SIZE(X) ++ MUL x1, x1, t1 ++ unop ++ ++ ADD a2, t2, a2 ++ LD x1, 1 * SIZE(X) ++ MUL x2, x2, t2 ++ addl X, INCX, X ++ ++ ADD a3, t3, a3 ++ LD x2, 0 * SIZE(X) ++ MUL x3, x3, t3 ++ unop ++ ++ ADD a0, t0, a0 ++ LD x3, 1 * SIZE(X) ++ MUL x4, x4, t0 ++ addl X, INCX, X ++ ++ ADD a1, t1, a1 ++ LD x4, 0 * SIZE(X) ++ MUL x5, x5, t1 ++ ldi I, -1(I) ++ ++ ADD a2, t2, a2 ++ LD x5, 1 * SIZE(X) ++ MUL x6, x6, t2 ++ addl X, INCX, X ++ ++ ADD a3, t3, a3 ++ LD x6, 0 * SIZE(X) ++ MUL x7, x7, t3 ++ bgt I, $L21 ++ .align 4 ++ ++$L22: ++ ADD a0, t0, a0 ++ LD x7, 1 * SIZE(X) ++ MUL x0, x0, t0 ++ addl X, INCX, X ++ ++ ADD a1, t1, a1 ++ MUL x1, x1, t1 ++ ADD a2, t2, a2 ++ MUL x2, x2, t2 ++ ++ ADD a3, t3, a3 ++ MUL x3, x3, t3 ++ ADD a0, t0, a0 ++ MUL x4, x4, t0 ++ ++ ADD a1, t1, a1 ++ MUL x5, x5, t1 ++ ADD a2, t2, a2 ++ MUL x6, x6, t2 ++ ++ ADD a3, t3, a3 ++ MUL x7, x7, t3 ++ ADD a2, t2, a2 ++ ADD a3, t3, a3 ++ .align 4 ++ ++$L25: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L26: ++ LD x0, 0 * SIZE(X) ++ ldi I, -1(I) ++ LD x1, 1 * SIZE(X) ++ addl X, INCX, X ++ ++ ADD a0, t0, a0 ++ MUL x0, x0, t0 ++ ADD a1, t1, a1 ++ MUL x1, x1, t1 ++ ++ bgt I, $L26 ++ .align 4 ++ ++ ++$L998: ++ ADD a0, t0, a0 ++ ADD a1, t1, a1 ++ ++ ADD a0, a1, a0 ++ ADD a2, a3, a2 ++ ++ ++ ++ ADD a0, a2, a0 ++ SQRT a0, a0 ++ ++ .align 4 ++ ++$L999: ++ ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zrot.S b/kernel/sw_64/zrot.S +new file mode 100644 +index 0000000..9016a00 +--- /dev/null ++++ b/kernel/sw_64/zrot.S +@@ -0,0 +1,689 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 ++ ++#define b9 $f29 ++ ++#define C $f10 ++#define S $f11 ++ ++#define PREFETCH_SIZE 80 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fmov $f21, C ++ LD S, 0($sp) ++ ++ addl INCX, INCX, INCX ++ addl INCY, INCY, INCY ++ ++ cmpeq INCX, 2, $23 ++ cmpeq INCY, 2, $24 ++ ble N, $L998 ++ ++ and $23, $24, $23 ++ beq $23, $L50 ++ ++ sra N, 2, I ++ ble I, $L15 ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 ++ ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ MUL C, $f15, $f27 ++ ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ MUL C, $f16, $f21 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ unop ++ LD $f14, 5*SIZE(X) ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ ++ MUL C, $f17, $f23 ++ fillcs (PREFETCH_SIZE) * SIZE(Y) ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 
++ unop ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop ++ ++ ST $f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop ++ ++ ST $f26, -1*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ MUL C, $f16, $f21 ++ LD $f14, 5*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ LD $f18, 7*SIZE(X) ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f12, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ ++ MUL C, $f13, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ ++ MUL C, $f15, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f16, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) ++ .align 4 ++ ++ ++$L15: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 
++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ldi X, 2 * SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ ldi Y, 2 * SIZE(Y) ++ ++ bgt I, $L16 ++ .align 4 ++ ++$L998: ++ clr $0 ++ ret ++ .align 4 ++ ++$L50: ++ mov X, XX ++ mov Y, YY ++ ++ sra N, 2, I ++ ble I, $L55 ++ .align 4 ++ ++$L51: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 ++ ++$L55: ++ and N, 3, I ++ ble I, $L999 ++ .align 4 ++ ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, b9 ++ fmov b9, $f22 ++ SUB $f23, $f24, b9 ++ fmov b9, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, b9 ++ fmov b9, $f26 ++ SUB $f27, $f28, b9 ++ fmov b9, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ SXADDQ INCX, X, X 
++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 ++ .align 4 ++ ++$L999: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zrot.S.bak b/kernel/sw_64/zrot.S.bak +new file mode 100644 +index 0000000..83dd2b1 +--- /dev/null ++++ b/kernel/sw_64/zrot.S.bak +@@ -0,0 +1,631 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 ++ ++#define C $f10 ++#define S $f11 ++ ++#define PREFETCH_SIZE 80 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fmov $f21, C ++ LD S, 0($sp) ++ ++ addl INCX, INCX, INCX ++ addl INCY, INCY, INCY ++ ++ cmpeq INCX, 2, $23 ++ cmpeq INCY, 2, $24 ++ ble N, $L998 ++ ++ and $23, $24, $23 ++ beq $23, $L50 ++ ++ sra N, 2, I ++ ble I, $L15 ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 ++ ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, $f22 ++ MUL C, $f15, $f27 ++ ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ MUL C, $f16, $f21 ++ fillcs (PREFETCH_SIZE) * SIZE(X) ++ unop ++ LD $f14, 5*SIZE(X) ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ fillcs (PREFETCH_SIZE) * SIZE(Y) ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop ++ ++ ST $f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop ++ ++ ST $f26, -1*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, 
$f24 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ MUL C, $f16, $f21 ++ LD $f14, 5*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ LD $f18, 7*SIZE(X) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, $f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) ++ .align 4 ++ ++ ++$L15: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ldi X, 2 * SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ ldi Y, 2 * SIZE(Y) ++ ++ bgt I, $L16 ++ .align 4 ++ ++$L998: ++ clr $0 ++ ret ++ .align 4 ++ ++$L50: ++ mov X, XX ++ mov Y, YY ++ ++ sra N, 2, I ++ ble I, $L55 ++ .align 4 ++ ++$L51: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ 
ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 ++ ++$L55: ++ and N, 3, I ++ ble I, $L999 ++ .align 4 ++ ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 ++ .align 4 ++ ++$L999: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zrot_simd.S b/kernel/sw_64/zrot_simd.S +new file mode 100644 +index 0000000..9e00ebf +--- /dev/null ++++ b/kernel/sw_64/zrot_simd.S +@@ -0,0 +1,799 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define Y $19 ++#define INCY $20 ++#define I $21 ++#define XX $23 ++#define YY $24 ++ ++#define C $f10 ++#define S $f11 ++ ++#define x0 $f12 ++#define x1 $f14 ++#define x2 $f16 ++#define x3 $f18 ++ ++#define y0 $f13 ++#define y1 $f15 ++#define y2 $f17 ++#define y3 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++#define t4 $f24 ++#define t5 $f25 ++#define t6 $f26 ++#define t7 $f27 ++ ++#define PREFETCHSIZE 80 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ fmov $f21, C ++ LD S, 0($sp) ++ ++ addl INCX, INCX, INCX ++ addl INCY, INCY, INCY ++ ++ cmpeq INCX, 2, $23 ++ cmpeq INCY, 2, $24 ++ ble N, $L998 ++ ++ and $23, $24, $23 ++ beq $23, $L50 ++ ++/* test the address of X */ ++ and X, (VEC_LEN*SIZE-1), $3 ++ and Y, (VEC_LEN*SIZE-1), $4 ++ or $3, $4, $4 ++ bne $4, $UnAlign_ACCESS ++ ++/*Align Accessing*/ ++ sra N, 3, I ++ ble I, $Remain ++ ++ vcpyf C, C ++ vcpyf S, S ++ ++ VLD x0, 0*VEC_LEN*SIZE(X) ++ VLD x1, 1*VEC_LEN*SIZE(X) ++ VLD x2, 2*VEC_LEN*SIZE(X) ++ VLD x3, 3*VEC_LEN*SIZE(X) ++ ++ VLD y0, 0*VEC_LEN*SIZE(Y) ++ VLD y1, 1*VEC_LEN*SIZE(Y) ++ VLD y2, 2*VEC_LEN*SIZE(Y) ++ VLD y3, 3*VEC_LEN*SIZE(Y) ++ ++ addl X, 16 * SIZE, X ++ addl Y, 16 * SIZE, Y ++ subl I, 1, I ++ ble I, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ VMUL C, x0, t0 ++ fillcs (PREFETCHSIZE) * SIZE(X) ++ VMUL C, x1, t1 ++ fillcs (PREFETCHSIZE) * SIZE(Y) ++ ++ VMUL C, x2, t2 ++ subl I, 1, I ++ VMUL C, x3, t3 ++ nop ++ ++ VMUL S, x0, t4 ++ VLD x0, 0*VEC_LEN*SIZE(X) ++ VMUL S, x1, t5 ++ VLD x1, 1*VEC_LEN*SIZE(X) ++ ++ VMUL S, x2, t6 ++ VLD x2, 2*VEC_LEN*SIZE(X) ++ VMUL S, x3, t7 ++ VLD x3, 3*VEC_LEN*SIZE(X) ++ ++ VMAD S, y0, t0, t0 ++ VMAD S, y1, t1, t1 ++ VMAD S, y2, t2, t2 ++ VMAD S, y3, t3, t3 ++ ++ VMSUB C, y0, t4, t4 ++ VLD y0, 0*VEC_LEN*SIZE(Y) ++ VMSUB C, y1, t5, t5 ++ VLD y1, 1*VEC_LEN*SIZE(Y) ++ ++ VMSUB C, y2, t6, t6 ++ VLD y2, 2*VEC_LEN*SIZE(Y) ++ VMSUB C, y3, t7, t7 ++ VLD y3, 3*VEC_LEN*SIZE(Y) ++ ++ VST t0, -4*VEC_LEN*SIZE(X) ++ VST t1, -3*VEC_LEN*SIZE(X) ++ VST t2, -2*VEC_LEN*SIZE(X) ++ VST t3, -1*VEC_LEN*SIZE(X) ++ ++ VST t4, -4*VEC_LEN*SIZE(Y) ++ VST t5, -3*VEC_LEN*SIZE(Y) ++ VST t6, -2*VEC_LEN*SIZE(Y) ++ VST t7, -1*VEC_LEN*SIZE(Y) ++ ++ addl X, 16 * SIZE, X ++ addl Y, 16 * SIZE, Y ++ nop ++ bgt I, $MainLoop ++ .align 4 ++$MainLoopEnd: ++ VMUL C, x0, t0 ++ VMUL C, x1, t1 ++ VMUL C, x2, t2 ++ VMUL C, x3, t3 ++ ++ VMUL S, x0, t4 ++ VMUL S, x1, t5 ++ VMUL S, x2, 
t6 ++ VMUL S, x3, t7 ++ ++ VMAD S, y0, t0, t0 ++ VMAD S, y1, t1, t1 ++ VMAD S, y2, t2, t2 ++ VMAD S, y3, t3, t3 ++ ++ VMSUB C, y0, t4, t4 ++ VMSUB C, y1, t5, t5 ++ VMSUB C, y2, t6, t6 ++ VMSUB C, y3, t7, t7 ++ ++ VST t0, -4*VEC_LEN*SIZE(X) ++ VST t1, -3*VEC_LEN*SIZE(X) ++ VST t2, -2*VEC_LEN*SIZE(X) ++ VST t3, -1*VEC_LEN*SIZE(X) ++ ++ VST t4, -4*VEC_LEN*SIZE(Y) ++ VST t5, -3*VEC_LEN*SIZE(Y) ++ VST t6, -2*VEC_LEN*SIZE(Y) ++ VST t7, -1*VEC_LEN*SIZE(Y) ++ ++ .align 4 ++$Remain: ++ and N, 7, I ++ ble I, $End ++$RemainLoop: ++ LD x0, 0*SIZE(X) ++ LD y0, 0*SIZE(Y) ++ LD x1, 1*SIZE(X) ++ LD y1, 1*SIZE(Y) ++ ++ MUL C, x0, t0 ++ MUL S, x0, t4 ++ MAD S, y0, t0, t0 ++ MSUB C, y0, t4, t4 ++ ++ MUL C, x1, t1 ++ ldi I, -1(I) ++ MUL S, x1, t5 ++ ldi X, 2 * SIZE(X) ++ ++ MAD S, y1, t1, t1 ++ ldi Y, 2 * SIZE(Y) ++ MSUB C, y1, t5, t5 ++ nop ++ ++ ST t0, -2*SIZE(X) ++ ST t1, -1*SIZE(X) ++ ST t4, -2*SIZE(Y) ++ ST t5, -1*SIZE(Y) ++ ++ bgt I, $RemainLoop ++ .align 4 ++$End: ++ clr $0 ++ ret ++ .align 4 ++ ++$UnAlign_ACCESS: ++ sra N, 2, I ++ ble I, $L15 ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ LD $f16, 2*SIZE(X) ++ LD $f17, 2*SIZE(Y) ++ LD $f18, 3*SIZE(X) ++ LD $f19, 3*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ unop ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ ++ LD $f13, 4*SIZE(Y) ++ MUL S, $f12, $f24 ++ LD $f12, 4*SIZE(X) ++ MUL C, $f14, $f25 ++ ++ ldi I, -1(I) ++ MUL S, $f15, $f26 ++ ADD $f21, $f22, $f22 ++ MUL C, $f15, $f27 ++ ++ LD $f15, 5*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ MUL C, $f16, $f21 ++ fillcs (PREFETCHSIZE) * SIZE(X) ++ unop ++ LD $f14, 5*SIZE(X) ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ fillcs (PREFETCHSIZE) * SIZE(Y) ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 6*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 7*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 8*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 8*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 9*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ LD $f14, 9*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ LD $f17, 10*SIZE(Y) ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ LD $f16, 10*SIZE(X) ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ LD $f19, 11*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ ldi I, -1(I) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ LD $f18, 11*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 6*SIZE(X) ++ MUL S, $f13, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ LD $f13, 12*SIZE(Y) ++ ldi X, 8*SIZE(X) ++ unop ++ ++ ST 
$f24, 6*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ LD $f12, 4*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ unop ++ ++ ST $f26, -1*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ LD $f15, 5*SIZE(Y) ++ unop ++ unop ++ ++ ST $f28, -1*SIZE(Y) ++ MUL S, $f14, $f28 ++ SUB $f23, $f24, $f24 ++ bgt I, $L12 ++ .align 4 ++ ++$L13: ++ MUL C, $f16, $f21 ++ LD $f14, 5*SIZE(X) ++ unop ++ unop ++ ++ ST $f22, 0*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ LD $f17, 6*SIZE(Y) ++ ++ ST $f24, 0*SIZE(Y) ++ MUL S, $f16, $f24 ++ LD $f16, 6*SIZE(X) ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 1*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ LD $f19, 7*SIZE(Y) ++ ++ ST $f28, 1*SIZE(Y) ++ MUL S, $f18, $f28 ++ LD $f18, 7*SIZE(X) ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f12, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 2*SIZE(X) ++ unop ++ MUL S, $f13, $f22 ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f13, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 2*SIZE(Y) ++ MUL S, $f12, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f14, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 3*SIZE(X) ++ MUL S, $f15, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f15, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 3*SIZE(Y) ++ MUL S, $f14, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f16, $f21 ++ unop ++ unop ++ unop ++ ++ ST $f22, 4*SIZE(X) ++ MUL S, $f17, $f22 ++ unop ++ ADD $f25, $f26, $f26 ++ ++ MUL C, $f17, $f23 ++ unop ++ unop ++ unop ++ ++ ST $f24, 4*SIZE(Y) ++ MUL S, $f16, $f24 ++ unop ++ SUB $f27, $f28, $f28 ++ ++ MUL C, $f18, $f25 ++ unop ++ unop ++ unop ++ ++ ST $f26, 5*SIZE(X) ++ MUL S, $f19, $f26 ++ unop ++ ADD $f21, $f22, $f22 ++ ++ MUL C, $f19, $f27 ++ unop ++ unop ++ unop ++ ++ ST $f28, 5*SIZE(Y) ++ MUL S, $f18, $f28 ++ unop ++ SUB $f23, $f24, $f24 ++ ++ ST $f22, 6*SIZE(X) ++ ADD $f25, $f26, $f26 ++ ST $f24, 6*SIZE(Y) ++ SUB $f27, $f28, $f28 ++ ++ ST $f26, 7*SIZE(X) ++ ldi X, 8*SIZE(X) ++ ST $f28, 7*SIZE(Y) ++ ldi Y, 8*SIZE(Y) ++ .align 4 ++ ++ ++$L15: ++ and N, 3, I ++ ble I, $L998 ++ .align 4 ++ ++$L16: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ldi X, 2 * SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ ldi Y, 2 * SIZE(Y) ++ ++ bgt I, $L16 ++ .align 4 ++ ++$L998: ++ clr $0 ++ ret ++ .align 4 ++ ++$L50: ++ mov X, XX ++ mov Y, YY ++ ++ sra N, 2, I ++ ble I, $L55 ++ .align 4 ++ ++$L51: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, 
X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ SXADDQ INCX, X, X ++ LD $f15, 1*SIZE(Y) ++ SXADDQ INCY, Y, Y ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(XX) ++ ST $f24, 0*SIZE(YY) ++ ST $f26, 1*SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ST $f28, 1*SIZE(YY) ++ SXADDQ INCY, YY, YY ++ ++ ldi I, -1(I) ++ bgt I, $L51 ++ .align 4 ++ ++$L55: ++ and N, 3, I ++ ble I, $L999 ++ .align 4 ++ ++$L56: ++ LD $f12, 0*SIZE(X) ++ LD $f13, 0*SIZE(Y) ++ LD $f14, 1*SIZE(X) ++ LD $f15, 1*SIZE(Y) ++ ++ MUL C, $f12, $f21 ++ MUL S, $f13, $f22 ++ MUL C, $f13, $f23 ++ MUL S, $f12, $f24 ++ ++ ADD $f21, $f22, $f22 ++ SUB $f23, $f24, $f24 ++ ++ MUL C, $f14, $f25 ++ MUL S, $f15, $f26 ++ MUL C, $f15, $f27 ++ MUL S, $f14, $f28 ++ ++ ADD $f25, $f26, $f26 ++ SUB $f27, $f28, $f28 ++ ++ ST $f22, 0*SIZE(X) ++ ST $f24, 0*SIZE(Y) ++ ldi I, -1(I) ++ ++ ST $f26, 1*SIZE(X) ++ ST $f28, 1*SIZE(Y) ++ SXADDQ INCX, X, X ++ SXADDQ INCY, Y, Y ++ ++ bgt I, $L56 ++ .align 4 ++ ++$L999: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zscal.S b/kernel/sw_64/zscal.S +new file mode 100644 +index 0000000..9589624 +--- /dev/null ++++ b/kernel/sw_64/zscal.S +@@ -0,0 +1,255 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $21 ++#define INCX $17 ++ ++#define XX $18 ++#define I $19 ++ ++#define ALPHA_R $f19 ++#define ALPHA_I $f20 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 ++ ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 ++ ++#define t4 $f26 ++#define t5 $f27 ++#define t6 $f28 ++#define t7 $f29 ++ ++ PROLOGUE ++ PROFCODE ++ ++ ldl INCX, 0($sp) ++ mov X, XX ++ ble N, $L999 ++ ++ addl INCX, INCX, INCX ++ ++ sra N, 2, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a6, 0 * SIZE(X) ++ LD a7, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_R, t0 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_I, t1 ++ ++ MUL a2, ALPHA_I, t2 ++ LD a0, 0 * SIZE(X) ++ MUL a3, ALPHA_R, t3 ++ LD a1, 1 * SIZE(X) ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X ++ ++ MUL a4, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) ++ ++ MUL a4, ALPHA_I, t2 ++ LD a2, 0 * SIZE(X) ++ MUL a5, ALPHA_R, t3 ++ LD a3, 1 * SIZE(X) ++ ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 ++ SXADDQ INCX, X, X ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) ++ ++ MUL a6, ALPHA_I, t2 ++ LD a4, 0 * SIZE(X) ++ MUL a7, ALPHA_R, t3 ++ LD a5, 1 * SIZE(X) ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a1, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) ++ ++ MUL a0, ALPHA_I, t2 ++ LD a6, 0 * SIZE(X) ++ MUL a1, ALPHA_R, t3 ++ LD a7, 1 * SIZE(X) ++ ++ SUB t0, t1, t4 ++ ldi I, -1(I) ++ ADD t2, t3, t5 ++ SXADDQ INCX, XX, XX ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ MUL a2, ALPHA_R, t0 ++ MUL a3, ALPHA_I, t1 ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_I, t2 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ unop ++ ++ ST t6, 0 * 
SIZE(XX) ++ MUL a4, ALPHA_R, t0 ++ ST t7, 1 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ MUL a4, ALPHA_I, t2 ++ MUL a5, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 ++ unop ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) ++ ++ MUL a6, ALPHA_I, t2 ++ MUL a7, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ ++ ST t6, 0 * SIZE(XX) ++ ST t7, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ .align 4 ++ ++$L15: ++ and N, 3, I ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++ ++ ST t4, 0 * SIZE(XX) ++ ST t5, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zscal.S.bak b/kernel/sw_64/zscal.S.bak +new file mode 100644 +index 0000000..4525b56 +--- /dev/null ++++ b/kernel/sw_64/zscal.S.bak +@@ -0,0 +1,443 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $21 ++#define INCX $17 ++ ++#define XX $18 ++#define I $19 ++ ++#define ALPHA_R $f19 ++#define ALPHA_I $f20 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 ++ ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 ++ ++#define t4 $f26 ++#define t5 $f27 ++#define t6 $f28 ++#define t7 $f29 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++ ldl INCX, 0($sp) ++ mov X, XX ++ cmpeq INCX, 1, $0 ++ ble N, $L999 ++ ++ beq $0, $Sub ++ nop ++ ++/* ++ unloop 4 (4*2=8) ++*/ ++ sra N, 2, I ++ ble I, $Remain ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ LD a4, 4 * SIZE(X) ++ LD a5, 5 * SIZE(X) ++ ++ LD a6, 6 * SIZE(X) ++ LD a7, 7 * SIZE(X) ++ ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a0, ALPHA_I, t2 ++ ++ NMAD a1, ALPHA_I, t0, t4 ++ MAD a1, ALPHA_R, t2, t5 ++/* ++ MUL a1, ALPHA_I, t1 ++ MUL a1, ALPHA_R, t3 ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++*/ ++ ldi I, -1(I) ++ addl X, 8*SIZE, X ++ ++ ble I, $MainLoopEnd ++ .align 4 ++ ++$MainLoop: ++ MUL a2, ALPHA_R, t0 ++ ST t4, -8 * SIZE(X) ++ MUL a2, ALPHA_I, t2 ++ ST t5, -7 * SIZE(X) ++ ++ ++ NMAD a3, ALPHA_I, t0, t6 ++ LD a0, 0 * SIZE(X) ++ MAD a3, ALPHA_R, t2, t7 ++ LD a1, 1 * SIZE(X) ++ ++ ST t6, -6 * SIZE(X) ++ MUL a4, ALPHA_R, t0 ++ ST t7, -5 * SIZE(X) ++ MUL a4, ALPHA_I, t2 ++ ++ ++ NMAD a5, ALPHA_I, t0, t4 ++ LD a2, 2 * SIZE(X) ++ MAD a5, ALPHA_R, t2, t5 ++ LD a3, 3 * SIZE(X) ++/* ++ MUL a5, ALPHA_I, t1 ++ MUL a5, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++*/ ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, -4 * SIZE(X) ++ MUL a6, ALPHA_I, t2 ++ ST t5, -3 * SIZE(X) ++ ++ NMAD a7, ALPHA_I, t0, t6 ++ LD a4, 4 * SIZE(X) ++ MAD a7, ALPHA_R, t2, t7 ++ LD a5, 5 * SIZE(X) ++/* ++ ++ MUL a7, ALPHA_I, t1 ++ MUL a7, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ ADD t2, t3, t7 ++*/ ++ MUL a0, ALPHA_R, t0 ++ ST t6, -2 * SIZE(X) ++ MUL a0, ALPHA_I, t2 ++ ST t7, -1 * SIZE(X) ++ ++ NMAD a1, ALPHA_I, t0, t4 ++ LD a6, 6 * SIZE(X) ++ MAD a1, ALPHA_R, t2, t5 ++ LD a7, 7 * SIZE(X) ++ ++ ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ subl I, 1, I ++ addl X, 8*SIZE, X ++ bgt I, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++ MUL a2, ALPHA_R, t0 ++ ST t4, -8 * SIZE(X) ++ MUL a2, ALPHA_I, t2 ++ ST t5, -7 * SIZE(X) ++ ++ ++ NMAD a3, ALPHA_I, t0, t6 ++ MAD a3, ALPHA_R, t2, t7 ++ ++ ++ ST t6, -6 * SIZE(X) ++ MUL a4, ALPHA_R, t0 ++ ST t7, -5 * SIZE(X) ++ MUL a4, ALPHA_I, t2 ++ ++ ++ NMAD a5, ALPHA_I, t0, t4 ++ MAD a5, ALPHA_R, t2, t5 ++/* ++ MUL a5, ALPHA_I, t1 ++ MUL a5, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++*/ ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, -4 * SIZE(X) ++ MUL a6, ALPHA_I, t2 ++ ST t5, -3 * SIZE(X) ++ ++ NMAD a7, ALPHA_I, t0, t6 ++ MAD a7, ALPHA_R, t2, t7 ++/* ++ ++ MUL a7, ALPHA_I, t1 ++ MUL a7, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ ADD t2, t3, t7 ++*/ ++ ST t6, -2 * SIZE(X) ++ ST t7, -1 * SIZE(X) ++ ++ .align 4 ++$Remain: ++ and N, 3, I ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$RemainLoop: ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a0, ALPHA_I, t2 ++ ++ NMAD a1, ALPHA_I, t0, t4 ++ MAD a1, ALPHA_R, t2, t5 ++ ++/* ++ MUL a1, ALPHA_I, t1 ++ MUL a1, ALPHA_R, t3 ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++*/ ++ ST 
t4, 0 * SIZE(X) ++ ST t5, 1 * SIZE(X) ++ ++ addl X, 2*SIZE, X ++ ldi I, -1(I) ++ bne I, $RemainLoop ++ nop ++ ++ ret ++ .align 4 ++ ++$Sub: ++ addl INCX, INCX, INCX ++ ++ sra N, 2, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a6, 0 * SIZE(X) ++ LD a7, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_R, t0 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_I, t1 ++ ++ MUL a2, ALPHA_I, t2 ++ LD a0, 0 * SIZE(X) ++ MUL a3, ALPHA_R, t3 ++ LD a1, 1 * SIZE(X) ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X ++ ++ MUL a4, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) ++ ++ MUL a4, ALPHA_I, t2 ++ LD a2, 0 * SIZE(X) ++ MUL a5, ALPHA_R, t3 ++ LD a3, 1 * SIZE(X) ++ ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 ++ SXADDQ INCX, X, X ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) ++ ++ MUL a6, ALPHA_I, t2 ++ LD a4, 0 * SIZE(X) ++ MUL a7, ALPHA_R, t3 ++ LD a5, 1 * SIZE(X) ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a1, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) ++ ++ MUL a0, ALPHA_I, t2 ++ LD a6, 0 * SIZE(X) ++ MUL a1, ALPHA_R, t3 ++ LD a7, 1 * SIZE(X) ++ ++ SUB t0, t1, t4 ++ ldi I, -1(I) ++ ADD t2, t3, t5 ++ SXADDQ INCX, XX, XX ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ MUL a2, ALPHA_R, t0 ++ MUL a3, ALPHA_I, t1 ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_I, t2 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ unop ++ ++ ST t6, 0 * SIZE(XX) ++ MUL a4, ALPHA_R, t0 ++ ST t7, 1 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ MUL a4, ALPHA_I, t2 ++ MUL a5, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 ++ unop ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) ++ ++ MUL a6, ALPHA_I, t2 ++ MUL a7, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ ++ ST t6, 0 * SIZE(XX) ++ ST t7, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ .align 4 ++ ++$L15: ++ and N, 3, I ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++ ++ ST t4, 0 * SIZE(XX) ++ ST t5, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zscal_simd.S b/kernel/sw_64/zscal_simd.S +new file mode 100644 +index 0000000..09d2f38 +--- /dev/null ++++ b/kernel/sw_64/zscal_simd.S +@@ -0,0 +1,579 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. 
Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 96 ++ ++#define N $16 ++#define X $21 ++#define INCX $17 ++ ++#define XX $18 ++#define I $19 ++ ++#define ALPHA_R $f19 ++#define ALPHA_I $f20 ++ ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f21 ++ ++#define t0 $f22 ++#define t1 $f23 ++#define t2 $f24 ++#define t3 $f25 ++ ++#define t4 $f26 ++#define t5 $f27 ++#define t6 $f28 ++#define t7 $f29 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++ ldl INCX, 0($sp) ++ mov X, XX ++ cmpeq INCX, 1, $0 ++ ble N, $L999 ++ ++ beq $0, $Sub ++ .align 5 ++ ++ and X, (VEC_LEN*SIZE-1), $6 ++ bgt $6, $UnAlign_X_ACCESS ++ ++/* ++ Unloop 8 (8*2=16) ++*/ ++ sra N, 3, I ++ vcpyf ALPHA_R, ALPHA_R ++ vcpyf ALPHA_I, ALPHA_I ++ ble I, $Remain ++ ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ ++ subl I, 1, I ++ addl X, 16*SIZE, X ++ ble I, $MainLoopEnd ++ .align 4 ++ ++ ++$MainLoop: ++ ++ vextf a0, 1, a4 ++ vextf a0, 3, a5 ++ vextf a1, 0, a6 ++ vextf a1, 2, a7 ++ ++ vextf a2, 1, t0 ++ vextf a2, 3, t1 ++ vextf a3, 0, t2 ++ vextf a3, 2, t3 ++ ++ vinsf a4, a1, 0, a1 ++ vinsf a5, a1, 2, a1 ++ vinsf a6, a0, 1, a0 ++ vinsf a7, a0, 3, a0 ++ ++ vinsf t0, a3, 0, a3 ++ vinsf t1, a3, 2, a3 ++ vinsf t2, a2, 1, a2 ++ vinsf t3, a2, 3, a2 ++ ++ VMUL ALPHA_R, a0, t4 ++ VMUL ALPHA_I, a0, t5 ++ VMUL ALPHA_R, a2, t6 ++ VMUL ALPHA_I, a2, t7 ++ ++ VNMAD ALPHA_I, a1, t4, t0 ++ VLD a0, 0*VEC_LEN*SIZE(X) ++ VMAD ALPHA_R, a1, t5, t1 ++ VLD a1, 1*VEC_LEN*SIZE(X) ++ ++ VNMAD ALPHA_I, a3, t6, t2 ++ VLD a2, 2*VEC_LEN*SIZE(X) ++ VMAD ALPHA_R, a3, t7, t3 ++ VLD a3, 3*VEC_LEN*SIZE(X) ++ ++/*combine the real(t0,t2) & image(t1,t3) vector to complex vector*/ ++ vextf t0, 1, 
a4 ++ vextf t0, 3, a5 ++ vextf t1, 0, a6 ++ vextf t1, 2, a7 ++ ++ vextf t2, 1, s0 ++ vextf t2, 3, s1 ++ vextf t3, 0, s2 ++ vextf t3, 2, s3 ++ ++ vinsf a4, t1, 0, t1 ++ vinsf a5, t1, 2, t1 ++ vinsf a6, t0, 1, t0 ++ vinsf a7, t0, 3, t0 ++ ++ vinsf s0, t3, 0, t3 ++ vinsf s1, t3, 2, t3 ++ vinsf s2, t2, 1, t2 ++ vinsf s3, t2, 3, t2 ++ ++ VST t0, -4*VEC_LEN*SIZE(X) ++ VST t1, -3*VEC_LEN*SIZE(X) ++ VST t2, -2*VEC_LEN*SIZE(X) ++ VST t3, -1*VEC_LEN*SIZE(X) ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ subl I, 1, I ++ addl X, 16*SIZE, X ++ bgt I, $MainLoop ++ .align 4 ++ ++$MainLoopEnd: ++/*spilt the complex vector to real vector($f0) and image vector ($f1)*/ ++ vextf a0, 1, a4 ++ vextf a0, 3, a5 ++ vextf a1, 0, a6 ++ vextf a1, 2, a7 ++ ++ vextf a2, 1, t0 ++ vextf a2, 3, t1 ++ vextf a3, 0, t2 ++ vextf a3, 2, t3 ++ ++ vinsf a4, a1, 0, a1 ++ vinsf a5, a1, 2, a1 ++ vinsf a6, a0, 1, a0 ++ vinsf a7, a0, 3, a0 ++ ++ vinsf t0, a3, 0, a3 ++ vinsf t1, a3, 2, a3 ++ vinsf t2, a2, 1, a2 ++ vinsf t3, a2, 3, a2 ++ ++ VMUL ALPHA_R, a0, t4 ++ VMUL ALPHA_I, a0, t5 ++ VMUL ALPHA_R, a2, t6 ++ VMUL ALPHA_I, a2, t7 ++ ++ VNMAD ALPHA_I, a1, t4, t0 ++ VMAD ALPHA_R, a1, t5, t1 ++ VNMAD ALPHA_I, a3, t6, t2 ++ VMAD ALPHA_R, a3, t7, t3 ++ ++/*combine the real(t0,t2) & image(t1,t3) vector to complex vector*/ ++ vextf t0, 1, a4 ++ vextf t0, 3, a5 ++ vextf t1, 0, a6 ++ vextf t1, 2, a7 ++ ++ vextf t2, 1, s0 ++ vextf t2, 3, s1 ++ vextf t3, 0, s2 ++ vextf t3, 2, s3 ++ ++ vinsf a4, t1, 0, t1 ++ vinsf a5, t1, 2, t1 ++ vinsf a6, t0, 1, t0 ++ vinsf a7, t0, 3, t0 ++ ++ vinsf s0, t3, 0, t3 ++ vinsf s1, t3, 2, t3 ++ vinsf s2, t2, 1, t2 ++ vinsf s3, t2, 3, t2 ++ ++ VST t0, -4*VEC_LEN*SIZE(X) ++ VST t1, -3*VEC_LEN*SIZE(X) ++ VST t2, -2*VEC_LEN*SIZE(X) ++ VST t3, -1*VEC_LEN*SIZE(X) ++ ++$Remain: ++ and N, 7, I ++ unop ++ unop ++ ble I, $L999 ++ .align 5 ++ ++$Remain_loop: ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++ ST t4, 0 * SIZE(X) ++ ST t5, 1 * SIZE(X) ++ ++ addl X, 2*SIZE, X ++ ldi I, -1(I) ++ bne I, $Remain_loop ++ ret ++ .align 5 ++ ++$UnAlign_X_ACCESS: ++/* ++ unloop 4 (4*2=8) ++*/ ++ sra N, 2, I ++ ble I, $Unalign_Remain ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ ++ LD a2, 2 * SIZE(X) ++ LD a3, 3 * SIZE(X) ++ ++ LD a4, 4 * SIZE(X) ++ MUL a0, ALPHA_R, t0 ++ LD a5, 5 * SIZE(X) ++ MUL a0, ALPHA_I, t2 ++ ++ LD a6, 6 * SIZE(X) ++ NMAD a1, ALPHA_I, t0, t4 ++ LD a7, 7 * SIZE(X) ++ MAD a1, ALPHA_R, t2, t5 ++ ++ ++ ldi I, -1(I) ++ addl X, 8*SIZE, X ++ ble I, $Unalign_MainLoopEnd ++ .align 4 ++ ++$Unalign_MainLoop: ++ MUL a2, ALPHA_R, t0 ++ ST t4, -8 * SIZE(X) ++ MUL a2, ALPHA_I, t2 ++ ST t5, -7 * SIZE(X) ++ ++ ++ NMAD a3, ALPHA_I, t0, t6 ++ LD a0, 0 * SIZE(X) ++ MAD a3, ALPHA_R, t2, t7 ++ LD a1, 1 * SIZE(X) ++ ++ ST t6, -6 * SIZE(X) ++ MUL a4, ALPHA_R, t0 ++ ST t7, -5 * SIZE(X) ++ MUL a4, ALPHA_I, t2 ++ ++ ++ NMAD a5, ALPHA_I, t0, t4 ++ LD a2, 2 * SIZE(X) ++ MAD a5, ALPHA_R, t2, t5 ++ LD a3, 3 * SIZE(X) ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, -4 * SIZE(X) ++ MUL a6, ALPHA_I, t2 ++ ST t5, -3 * SIZE(X) ++ ++ NMAD a7, ALPHA_I, t0, t6 ++ LD a4, 4 * SIZE(X) ++ MAD a7, ALPHA_R, t2, t7 ++ LD a5, 5 * SIZE(X) ++ ++ MUL a0, ALPHA_R, t0 ++ ST t6, -2 * SIZE(X) ++ MUL a0, ALPHA_I, t2 ++ ST t7, -1 * SIZE(X) ++ ++ NMAD a1, ALPHA_I, t0, t4 ++ LD a6, 6 * SIZE(X) ++ MAD a1, ALPHA_R, t2, t5 ++ LD a7, 7 * SIZE(X) ++ ++ ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ subl I, 1, I ++ addl X, 8*SIZE, X ++ bgt I, $Unalign_MainLoop ++ .align 4 
++ ++$Unalign_MainLoopEnd: ++ MUL a2, ALPHA_R, t0 ++ ST t4, -8 * SIZE(X) ++ MUL a2, ALPHA_I, t2 ++ ST t5, -7 * SIZE(X) ++ ++ ++ NMAD a3, ALPHA_I, t0, t6 ++ MAD a3, ALPHA_R, t2, t7 ++ ++ ++ ST t6, -6 * SIZE(X) ++ MUL a4, ALPHA_R, t0 ++ ST t7, -5 * SIZE(X) ++ MUL a4, ALPHA_I, t2 ++ ++ ++ NMAD a5, ALPHA_I, t0, t4 ++ MAD a5, ALPHA_R, t2, t5 ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, -4 * SIZE(X) ++ MUL a6, ALPHA_I, t2 ++ ST t5, -3 * SIZE(X) ++ ++ NMAD a7, ALPHA_I, t0, t6 ++ MAD a7, ALPHA_R, t2, t7 ++ ST t6, -2 * SIZE(X) ++ ST t7, -1 * SIZE(X) ++ ++ .align 4 ++$Unalign_Remain: ++ and N, 3, I ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$Unalign_RemainLoop: ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a0, ALPHA_I, t2 ++ ++ NMAD a1, ALPHA_I, t0, t4 ++ MAD a1, ALPHA_R, t2, t5 ++ ++ ST t4, 0 * SIZE(X) ++ ST t5, 1 * SIZE(X) ++ ++ addl X, 2*SIZE, X ++ ldi I, -1(I) ++ bne I, $Unalign_RemainLoop ++ nop ++ ++ ret ++ .align 4 ++ ++$Sub: ++ addl INCX, INCX, INCX ++ ++ sra N, 2, I ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a2, 0 * SIZE(X) ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ LD a6, 0 * SIZE(X) ++ LD a7, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a1, ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++ ++ ldi I, -1(I) ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_R, t0 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_I, t1 ++ ++ MUL a2, ALPHA_I, t2 ++ LD a0, 0 * SIZE(X) ++ MUL a3, ALPHA_R, t3 ++ LD a1, 1 * SIZE(X) ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X ++ ++ MUL a4, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) ++ ++ MUL a4, ALPHA_I, t2 ++ LD a2, 0 * SIZE(X) ++ MUL a5, ALPHA_R, t3 ++ LD a3, 1 * SIZE(X) ++ ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 ++ SXADDQ INCX, X, X ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) ++ ++ MUL a6, ALPHA_I, t2 ++ LD a4, 0 * SIZE(X) ++ MUL a7, ALPHA_R, t3 ++ LD a5, 1 * SIZE(X) ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ ST t6, 0 * SIZE(XX) ++ MUL a1, ALPHA_I, t1 ++ ST t7, 1 * SIZE(XX) ++ ++ MUL a0, ALPHA_I, t2 ++ LD a6, 0 * SIZE(X) ++ MUL a1, ALPHA_R, t3 ++ LD a7, 1 * SIZE(X) ++ ++ SUB t0, t1, t4 ++ ldi I, -1(I) ++ ADD t2, t3, t5 ++ SXADDQ INCX, XX, XX ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ MUL a2, ALPHA_R, t0 ++ MUL a3, ALPHA_I, t1 ++ ST t4, 0 * SIZE(XX) ++ MUL a2, ALPHA_I, t2 ++ ST t5, 1 * SIZE(XX) ++ MUL a3, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ unop ++ ++ ST t6, 0 * SIZE(XX) ++ MUL a4, ALPHA_R, t0 ++ ST t7, 1 * SIZE(XX) ++ MUL a5, ALPHA_I, t1 ++ MUL a4, ALPHA_I, t2 ++ MUL a5, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t5 ++ unop ++ ++ MUL a6, ALPHA_R, t0 ++ ST t4, 0 * SIZE(XX) ++ MUL a7, ALPHA_I, t1 ++ ST t5, 1 * SIZE(XX) ++ ++ MUL a6, ALPHA_I, t2 ++ MUL a7, ALPHA_R, t3 ++ ++ SUB t0, t1, t6 ++ SXADDQ INCX, XX, XX ++ ADD t2, t3, t7 ++ ++ ST t6, 0 * SIZE(XX) ++ ST t7, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ .align 4 ++ ++$L15: ++ and N, 3, I ++ unop ++ unop ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ LD a0, 0 * SIZE(X) ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ MUL a0, ALPHA_R, t0 ++ MUL a1, 
ALPHA_I, t1 ++ MUL a0, ALPHA_I, t2 ++ MUL a1, ALPHA_R, t3 ++ ++ SUB t0, t1, t4 ++ ADD t2, t3, t5 ++ ++ ST t4, 0 * SIZE(XX) ++ ST t5, 1 * SIZE(XX) ++ SXADDQ INCX, XX, XX ++ ++ ldi I, -1(I) ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zsum.S b/kernel/sw_64/zsum.S +new file mode 100644 +index 0000000..7b8570c +--- /dev/null ++++ b/kernel/sw_64/zsum.S +@@ -0,0 +1,234 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 88 ++ ++#define N $16 ++#define X $17 ++#define INCX $18 ++#define I $19 ++ ++#define s0 $f0 ++#define s1 $f1 ++#define s2 $f10 ++#define s3 $f11 ++ ++#define a0 $f12 ++#define a1 $f13 ++#define a2 $f14 ++#define a3 $f15 ++#define a4 $f16 ++#define a5 $f17 ++#define a6 $f18 ++#define a7 $f19 ++ ++#define t0 $f20 ++#define t1 $f21 ++#define t2 $f22 ++#define t3 $f23 ++ ++ PROLOGUE ++ PROFCODE ++ ++ fclr s0 ++ unop ++ fclr t0 ++ addw INCX, INCX, $20 ++ mov $20,INCX ++ ++ fclr s1 ++ unop ++ fclr t1 ++ ble N, $L999 ++ ++ fclr s2 ++ sra N, 2, I ++ fclr s3 ++ ble I, $L15 ++ ++ LD a0, 0 * SIZE(X) ++ fclr t2 ++ LD a1, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a2, 0 * SIZE(X) ++ fclr t3 ++ LD a3, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ++ LD a4, 0 * SIZE(X) ++ LD a5, 1 * SIZE(X) ++ SXADDQ INCX, X, X ++ ldi I, -1(I) ++ ++ ble I, $L13 ++ .align 4 ++ ++$L12: ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ ldl $31, PREFETCHSIZE * SIZE(X) ++ fmov a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ LD a6, 0 * SIZE(X) ++ fmov a1, t1 ++ unop ++ ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ LD a7, 1 * SIZE(X) ++ fmov a2, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ LD a0, 0 * SIZE(X) ++ fmov a3, t3 ++ unop ++ ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ LD a1, 1 * SIZE(X) ++ fmov a4, t0 ++ SXADDQ INCX, X, X ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ LD a2, 0 * SIZE(X) ++ fmov a5, t1 ++ unop ++ ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ LD a3, 1 * SIZE(X) ++ fmov a6, t2 ++ SXADDQ INCX, X, X ++ ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ LD a4, 0 * SIZE(X) ++ fmov a7, t3 ++ unop ++ ++ LD a5, 1 * SIZE(X) ++ unop ++ SXADDQ INCX, X, X ++ bne I, $L12 ++ .align 4 ++ ++$L13: ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ LD a6, 0 * SIZE(X) ++ fmov a0, t0 ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ LD a7, 1 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X ++ ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ fmov a2, t2 ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ fmov a3, t3 ++ ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ fmov a4, t0 ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ fmov a5, t1 ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ fmov a6, t2 ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ fmov a7, t3 ++ ++ ADD s2, t2, $f24 ++ fmov $f24,s2 ++ ADD s3, t3, $f24 ++ fmov $f24,s3 ++ ++ .align 4 ++ ++$L15: ++ ADD s0, s2, $f24 ++ fmov $f24,s0 ++ and N, 3, I ++ ADD s1, s3, $f24 ++ fmov $f24,s1 ++ ble I, $L999 ++ .align 4 ++ ++$L17: ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ LD a0, 0 * SIZE(X) ++ fmov a0, t0 ++ ldi I, -1(I) ++ ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ LD a1, 1 * SIZE(X) ++ fmov a1, t1 ++ SXADDQ INCX, X, X ++ ++ bne I, $L17 ++ .align 4 ++ ++$L999: ++ ADD s0, t0, $f24 ++ fmov $f24,s0 ++ ADD s1, t1, $f24 ++ fmov $f24,s1 ++ ++ ADD s0, s1, $f24 ++ fmov $f24,s0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zswap.S.bak b/kernel/sw_64/zswap.S.bak +new file mode 100644 +index 0000000..f0b19dd +--- /dev/null ++++ b/kernel/sw_64/zswap.S.bak +@@ -0,0 +1,244 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. 
Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++ mov $21, $17 ++ ldl $18, 0($sp) ++ ldl $19, 8($sp) ++ ldl $20, 16($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ble $16, $SubEnd # if n <= 0 goto $End ++ ++ cmpeq $18, 1, $1 ++ addl $18, $18, $18 ++ cmpeq $20, 1, $2 ++ addl $20, $20, $20 ++ ++ sra $16, 2, $21 ++ and $1, $2, $1 ++ and $16, 3, $22 ++ beq $1, $Sub ++ ++ ble $21, $MainRemain ++ .align 4 ++ ++$MainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f12, 2*SIZE($19) ++ LD $f13, 3*SIZE($19) ++ LD $f14, 4*SIZE($19) ++ LD $f15, 5*SIZE($19) ++ LD $f16, 6*SIZE($19) ++ LD $f17, 7*SIZE($19) ++ ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ LD $f22, 2*SIZE($17) ++ LD $f23, 3*SIZE($17) ++ LD $f24, 4*SIZE($17) ++ LD $f25, 5*SIZE($17) ++ LD $f26, 6*SIZE($17) ++ LD $f27, 7*SIZE($17) ++ ++ fillcs 16*SIZE($17) ++ unop ++ fillcs 16*SIZE($19) ++ subl $21, 1, $21 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f12, 2*SIZE($17) ++ ST $f13, 3*SIZE($17) ++ ST $f14, 4*SIZE($17) ++ ST $f15, 5*SIZE($17) ++ ST $f16, 6*SIZE($17) ++ ST $f17, 7*SIZE($17) ++ ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) ++ ST $f22, 2*SIZE($19) ++ ST $f23, 3*SIZE($19) ++ ST $f24, 4*SIZE($19) ++ ST $f25, 5*SIZE($19) ++ ST $f26, 6*SIZE($19) ++ ST $f27, 7*SIZE($19) ++ ++ ldi $17, 8*SIZE($17) ++ ldi $19, 8*SIZE($19) ++ bgt $21, $MainLoop ++ .align 4 ++ ++$MainRemain: ++ ble $22, $MainEnd ++ .align 4 ++ ++$MainRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ ++ ldi $17, 2*SIZE($17) ++ ldi $19, 2*SIZE($19) ++ subl $22, 1, $22 ++ ST $f10, -2*SIZE($17) ++ ST $f11, -1*SIZE($17) ++ ST $f20, -2*SIZE($19) ++ ST $f21, -1*SIZE($19) ++ bgt $22, $MainRemainLoop ++ .align 4 ++ ++$MainEnd: ++ clr $0 ++ ret ++ .align 4 ++ ++$Sub: ++ mov $17, $23 ++ mov $19, $24 ++ ble 
$21, $SubRemain ++ .align 4 ++ ++$SubLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f12, 0*SIZE($19) ++ LD $f13, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f14, 0*SIZE($19) ++ LD $f15, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f16, 0*SIZE($19) ++ LD $f17, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f22, 0*SIZE($17) ++ LD $f23, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f24, 0*SIZE($17) ++ LD $f25, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f26, 0*SIZE($17) ++ LD $f27, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ ST $f10, 0*SIZE($23) ++ ST $f11, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f12, 0*SIZE($23) ++ ST $f13, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f14, 0*SIZE($23) ++ ST $f15, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f16, 0*SIZE($23) ++ ST $f17, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f20, 0*SIZE($24) ++ ST $f21, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f22, 0*SIZE($24) ++ ST $f23, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f24, 0*SIZE($24) ++ ST $f25, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f26, 0*SIZE($24) ++ ST $f27, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ subl $21, 1, $21 ++ bgt $21, $SubLoop ++ .align 4 ++ ++$SubRemain: ++ ble $22, $SubEnd ++ .align 4 ++ ++$SubRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ ++ subl $22, 1, $22 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) ++ ++ SXADDQ $18, $17, $17 ++ SXADDQ $20, $19, $19 ++ bgt $22, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/zswap.c b/kernel/sw_64/zswap.c +new file mode 100644 +index 0000000..ae4760a +--- /dev/null ++++ b/kernel/sw_64/zswap.c +@@ -0,0 +1,72 @@ ++/*************************************************************************** ++Copyright (c) 2013, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*****************************************************************************/
++
++/**************************************************************************************
++* 2013/09/14 Saar
++* BLASTEST float : OK
++* BLASTEST double : OK
++* CTEST : OK
++* TEST : OK
++*
++**************************************************************************************/
++
++#include "common.h"
++#include <stdio.h>
++
++int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
++{
++ BLASLONG i=0;
++ BLASLONG ix=0,iy=0;
++ FLOAT temp[2];
++ BLASLONG inc_x2;
++ BLASLONG inc_y2;
++
++ if ( n < 0 ) return(0);
++
++ inc_x2 = 2 * inc_x;
++ inc_y2 = 2 * inc_y;
++
++ while(i < n)
++ {
++
++ temp[0] = x[ix] ;
++ temp[1] = x[ix+1] ;
++ x[ix] = y[iy] ;
++ x[ix+1] = y[iy+1] ;
++ y[iy] = temp[0] ;
++ y[iy+1] = temp[1] ;
++
++ ix += inc_x2 ;
++ iy += inc_y2 ;
++ i++ ;
++
++ }
++ return(0);
++
++}
++
++
+diff --git a/kernel/sw_64/zswap_simd.S b/kernel/sw_64/zswap_simd.S
+new file mode 100644
+index 0000000..e49c95b
+--- /dev/null
++++ b/kernel/sw_64/zswap_simd.S
+@@ -0,0 +1,306 @@
++/*********************************************************************/
++/* Copyright 2009, 2010 The University of Texas at Austin. */
++/* All rights reserved. */
++/* */
++/* Redistribution and use in source and binary forms, with or */
++/* without modification, are permitted provided that the following */
++/* conditions are met: */
++/* */
++/* 1. Redistributions of source code must retain the above */
++/* copyright notice, this list of conditions and the following */
++/* disclaimer. */
++/* */
++/* 2. Redistributions in binary form must reproduce the above */
++/* copyright notice, this list of conditions and the following */
++/* disclaimer in the documentation and/or other materials */
++/* provided with the distribution. */
++/* */
++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
++/* POSSIBILITY OF SUCH DAMAGE. */
++/* */
++/* The views and conclusions contained in the software and */
++/* documentation are those of the authors and should not be */
++/* interpreted as representing official policies, either expressed */
++/* or implied, of The University of Texas at Austin.
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#define PREFETCHSIZE 64 ++#define X $17 ++#define Y $19 ++ ++ PROLOGUE ++ PROFCODE ++ .frame $sp, 0, $26, 0 ++ ++ mov $21, $17 ++ ldl $18, 0($sp) ++ ldl $19, 8($sp) ++ ldl $20, 16($sp) ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ble $16, $SubEnd # if n <= 0 goto $End ++ ++ cmpeq $18, 1, $1 ++ addl $18, $18, $18 ++ cmpeq $20, 1, $2 ++ addl $20, $20, $20 ++ ++/* ++ Unloop 8 complex, 16 real ++*/ ++ ++ sra $16, 3, $21 ++ and $1, $2, $1 ++ and $16, 7, $22 ++ beq $1, $Sub ++ ++/* ++ test the address of Y & X ++*/ ++ and Y, (VEC_LEN*SIZE-1), $4 ++ and X, (VEC_LEN*SIZE-1), $3 ++ or $3, $4, $4 ++ bne $4, $UnAlign_ACCESS ++ ++/* align access*/ ++ ++ ble $21, $MainRemain ++ .align 4 ++ ++$MainLoop: ++ VLD $f10, 0*VEC_LEN*SIZE(Y) ++ VLD $f11, 1*VEC_LEN*SIZE(Y) ++ VLD $f12, 2*VEC_LEN*SIZE(Y) ++ VLD $f13, 3*VEC_LEN*SIZE(Y) ++ ++ VLD $f20, 0*VEC_LEN*SIZE(X) ++ VLD $f21, 1*VEC_LEN*SIZE(X) ++ VLD $f22, 2*VEC_LEN*SIZE(X) ++ VLD $f23, 3*VEC_LEN*SIZE(X) ++ ++ fillcs PREFETCHSIZE * SIZE(X) ++ unop ++ fillcs PREFETCHSIZE * SIZE(Y) ++ subl $21, 1, $21 ++ ++ VST $f10, 0*VEC_LEN*SIZE(X) ++ VST $f11, 1*VEC_LEN*SIZE(X) ++ VST $f12, 2*VEC_LEN*SIZE(X) ++ VST $f13, 3*VEC_LEN*SIZE(X) ++ ++ VST $f20, 0*VEC_LEN*SIZE(Y) ++ VST $f21, 1*VEC_LEN*SIZE(Y) ++ VST $f22, 2*VEC_LEN*SIZE(Y) ++ VST $f23, 3*VEC_LEN*SIZE(Y) ++ ++ ldi $17, 16*SIZE(X) ++ ldi $19, 16*SIZE(Y) ++ bgt $21, $MainLoop ++ .align 4 ++ ++ jmp $MainRemain ++ .align 4 ++ ++$UnAlign_ACCESS: ++ sra $16, 2, $21 ++ and $16, 3, $22 ++ nop ++ ble $21, $MainRemain ++ .align 4 ++$UnAlign_ACCESS_MainLoop: ++ ++ LD $f10, 0*SIZE(Y) ++ LD $f11, 1*SIZE(Y) ++ LD $f12, 2*SIZE(Y) ++ LD $f13, 3*SIZE(Y) ++ LD $f14, 4*SIZE(Y) ++ LD $f15, 5*SIZE(Y) ++ LD $f16, 6*SIZE(Y) ++ LD $f17, 7*SIZE(Y) ++ ++ LD $f20, 0*SIZE(X) ++ LD $f21, 1*SIZE(X) ++ LD $f22, 2*SIZE(X) ++ LD $f23, 3*SIZE(X) ++ LD $f24, 4*SIZE(X) ++ LD $f25, 5*SIZE(X) ++ LD $f26, 6*SIZE(X) ++ LD $f27, 7*SIZE(X) ++ ++ fillcs 16*SIZE(X) ++ unop ++ fillcs 16*SIZE(Y) ++ subl $21, 1, $21 ++ ++ ST $f10, 0*SIZE(X) ++ ST $f11, 1*SIZE(X) ++ ST $f12, 2*SIZE(X) ++ ST $f13, 3*SIZE(X) ++ ST $f14, 4*SIZE(X) ++ ST $f15, 5*SIZE(X) ++ ST $f16, 6*SIZE(X) ++ ST $f17, 7*SIZE(X) ++ ++ ST $f20, 0*SIZE(Y) ++ ST $f21, 1*SIZE(Y) ++ ST $f22, 2*SIZE(Y) ++ ST $f23, 3*SIZE(Y) ++ ST $f24, 4*SIZE(Y) ++ ST $f25, 5*SIZE(Y) ++ ST $f26, 6*SIZE(Y) ++ ST $f27, 7*SIZE(Y) ++ ++ ldi X, 8*SIZE(X) ++ ldi Y, 8*SIZE(Y) ++ bgt $21, $UnAlign_ACCESS_MainLoop ++ .align 4 ++ ++$MainRemain: ++ ble $22, $MainEnd ++ .align 4 ++ ++$MainRemainLoop: ++ LD $f10, 0*SIZE(Y) ++ LD $f11, 1*SIZE(Y) ++ LD $f20, 0*SIZE(X) ++ LD $f21, 1*SIZE(X) ++ ++ ldi X, 2*SIZE(X) ++ ldi Y, 2*SIZE(Y) ++ subl $22, 1, $22 ++ ST $f10, -2*SIZE(X) ++ ST $f11, -1*SIZE(X) ++ ST $f20, -2*SIZE(Y) ++ ST $f21, -1*SIZE(Y) ++ bgt $22, $MainRemainLoop ++ .align 4 ++ ++$MainEnd: ++ clr $0 ++ ret ++ .align 4 ++ ++$Sub: ++ sra $16, 2, $21 ++ and $16, 3, $22 ++ ++ mov $17, $23 ++ mov $19, $24 ++ ble $21, $SubRemain ++ .align 4 ++ ++$SubLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f12, 0*SIZE($19) ++ LD $f13, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f14, 0*SIZE($19) ++ LD $f15, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f16, 0*SIZE($19) ++ LD $f17, 1*SIZE($19) ++ SXADDQ $20, $19, $19 ++ ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f22, 0*SIZE($17) ++ LD $f23, 
1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f24, 0*SIZE($17) ++ LD $f25, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ LD $f26, 0*SIZE($17) ++ LD $f27, 1*SIZE($17) ++ SXADDQ $18, $17, $17 ++ ++ ST $f10, 0*SIZE($23) ++ ST $f11, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f12, 0*SIZE($23) ++ ST $f13, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f14, 0*SIZE($23) ++ ST $f15, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f16, 0*SIZE($23) ++ ST $f17, 1*SIZE($23) ++ SXADDQ $18, $23, $23 ++ ++ ST $f20, 0*SIZE($24) ++ ST $f21, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f22, 0*SIZE($24) ++ ST $f23, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f24, 0*SIZE($24) ++ ST $f25, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ ST $f26, 0*SIZE($24) ++ ST $f27, 1*SIZE($24) ++ SXADDQ $20, $24, $24 ++ ++ subl $21, 1, $21 ++ bgt $21, $SubLoop ++ .align 4 ++ ++$SubRemain: ++ ble $22, $SubEnd ++ .align 4 ++ ++$SubRemainLoop: ++ LD $f10, 0*SIZE($19) ++ LD $f11, 1*SIZE($19) ++ LD $f20, 0*SIZE($17) ++ LD $f21, 1*SIZE($17) ++ ++ subl $22, 1, $22 ++ ++ ST $f10, 0*SIZE($17) ++ ST $f11, 1*SIZE($17) ++ ST $f20, 0*SIZE($19) ++ ST $f21, 1*SIZE($19) ++ ++ SXADDQ $18, $17, $17 ++ SXADDQ $20, $19, $19 ++ bgt $22, $SubRemainLoop ++ .align 4 ++ ++$SubEnd: ++ clr $0 ++ ret ++ EPILOGUE +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S b/kernel/sw_64/ztrsm_kernel_2x2_LN.S +new file mode 100644 +index 0000000..3a14e58 +--- /dev/null ++++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S +@@ -0,0 +1,2593 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. 
*/ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 48 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++ .set noat ++ .set noreorder ++ .arch sw6a ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 88 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define tmp $9 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++#if defined(LN) || defined(LT) ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#else ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#endif ++ ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++ ldl OFFSET, 24 + STACKSIZE($sp) ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ stl tmp, 72($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ addl M, M, TMP2 ++ mull TMP2, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ TMP2, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ addl TMP1, TMP1, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 1, J ++ ble J, $L30 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and 
M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ble I, $L20 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++// unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++// unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++// unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++// unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++// unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++// unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++// unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++// unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++// unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++// unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++// unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++// unop ++ IFMOVD tmp, b5 ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ IFMOVD tmp, b5 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++// unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++// unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++// unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++// unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++// unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++// unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++// unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ MUL a2, 
b1, t2 ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ADD c09, c14, b5 ++ fmov b5, c09 ++ ADD c10, c13, b5 ++ fmov b5, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c10, b5 ++ fmov b5, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c10, b5 ++ fmov b5, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c09, t3, b5 ++ fmov b5, c09 ++ ADD6 c10, t4, b5 ++ fmov b5, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, b5 ++ fmov b5, c09 ++ ADD5 c10, t2, b5 ++ fmov b5, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 
0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L20: ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L29 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(KK) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++// unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++// unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++// unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++/* 2 */ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++// unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++// unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++// unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++// unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++// unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++// unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++// unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++// unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++// unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, b5 ++ fmov 
b5, c07 ++// unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++// unop ++ IFMOVD tmp, b5 ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ ldi L, -2(L) ++ IFMOVD tmp, b5 ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++// unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++// unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++// unop ++ IFMOVD tmp, b5 ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++// unop ++ IFMOVD tmp, b5 ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++// unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++// unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++// unop ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++// unop ++ ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++// unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++// unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++// unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++// unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++// unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++// unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++// unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++// unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++// unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++// unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++// unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++// unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++// unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++// unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++// unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, t3 ++ ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ MUL b3, a1, t1 ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ MUL b3, a2, t2 ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++ MUL b4, a2, t3 ++ ++ ADD4 c13, t4, 
b5 ++ fmov b5, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1,b5 ++ fmov b5, c09 ++ MUL b3, a3, t1 ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++ MUL b4, a4, t3 ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ADD c03, c08, b5 ++ fmov b5, c03 ++ ADD c04, c07, b5 ++ fmov b5, c04 ++ ++ ADD c09, c14, b5 ++ fmov b5, c09 ++ ADD c10, c13, b5 ++ fmov b5, c10 ++ ADD c11, c16, b5 ++ fmov b5, c11 ++ ADD c12, c15, b5 ++ fmov b5, c12 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c10, b5 ++ fmov b5, c10 ++ ++ SUB b1, c03, b5 ++ fmov b5, c03 ++ SUB b2, c04, b5 ++ fmov b5, c04 ++ SUB b3, c11, b5 ++ fmov b5, c11 ++ SUB b4, c12, b5 ++ fmov b5, c12 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++ ++ SUB b1, c09, b5 ++ fmov b5, c09 ++ SUB b2, c10, b5 ++ fmov b5, c10 ++ SUB b3, c11, b5 ++ fmov b5, c11 ++ SUB b4, c12, b5 ++ fmov b5, c12 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c10, t4, b5 ++ fmov b5, c10 ++ ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ADD6 c09, t3, b5 ++ fmov b5, c09 ++ ADD5 c10, t4, b5 ++ fmov b5, c10 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c09, t3, b5 ++ fmov b5, c09 ++ ADD6 c10, t4, b5 ++ fmov b5, c10 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ 
MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c09, t3, b5 ++ fmov b5, c09 ++ ADD6 c10, t4, b5 ++ fmov b5, c10 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c04, t2, b5 ++ fmov b5, c04 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, b5 ++ fmov b5, c03 ++ ADD5 c04, t2, b5 ++ fmov b5, c04 ++ ADD6 c11, t3, b5 ++ fmov b5, c11 ++ ADD5 c12, t4, b5 ++ fmov b5, c12 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c03, t3, b5 ++ fmov b5, c03 ++ ADD6 c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, b5 ++ fmov b5, c09 ++ ADD5 c10, t2, b5 ++ fmov b5, c10 ++ ADD6 c11, t3, b5 ++ fmov b5, c11 ++ ADD5 c12, t4, b5 ++ fmov b5, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ADD6 c03, t3, b5 ++ fmov b5, c03 ++ ADD5 c04, t4, b5 ++ fmov b5, c04 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c03, t3, b5 ++ fmov b5, c03 ++ ADD6 c04, t4, b5 ++ fmov b5, c04 ++#endif ++ ++#if 
defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 ++ .align 4 ++ ++$L29: ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C1 ++ subl C, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and M, 1, I ++ ble I, $L50 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L58 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L58 ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++// unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++// unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++// unop ++ MUL a3, b4, t3 
++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++// unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++// unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++// unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L57: ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b2, t3 ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ++$L58: ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L50: ++ sra M, 1, I ++ ble I, $L59 ++ .align 4 ++ ++$L41: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(KK) ++ fclr c04 ++ fclr c08 ++ ++ ble KK, $L48 ++ ble L, $L45 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, 
ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ++ ble TMP1, $L48 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++// unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++// unop ++ ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++// unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++// unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++// unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++// unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++// unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++// unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++// unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++// unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++// unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++// unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++// unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L47 ++#else ++ blbs TMP1, $L47 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++// unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++// unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++// unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++// unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L47: ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ADD 
c03, c08, b5 ++ fmov b5, c03 ++ ADD c04, c07, b5 ++ fmov b5, c04 ++ ++$L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c04, t2, b5 ++ fmov b5, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, b5 ++ fmov b5, c03 ++ ADD5 c04, t2, b5 ++ fmov b5, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c03, t3, b5 ++ fmov b5, c03 ++ ADD6 c04, t4, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, 
$L41 ++ .align 4 ++ ++$L59: ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldl tmp, 72($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak +new file mode 100644 +index 0000000..71202d8 +--- /dev/null ++++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak +@@ -0,0 +1,2230 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." 
++#endif ++ ++#ifdef SW2B ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ .set noat ++ .set noreorder ++ .arch ev6 ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++#if defined(LN) || defined(LT) ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#else ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#endif ++ ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++ ldl OFFSET, 24 + STACKSIZE($sp) ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ addl M, M, TMP2 ++ mull TMP2, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ TMP2, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ addl TMP1, TMP1, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 1, J ++ ble J, $L30 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ble I, $L20 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * 
SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD4 c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 
2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L20: ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L29 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(KK) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) 
++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, c11 ++ unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD 
a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD3 c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD2 c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD4 c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD3 c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD4 c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ ADD c11, c16, c11 ++ ADD c12, c15, c12 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++ ++ SUB b1, c03, c03 ++ SUB b2, c04, c04 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c09, t3, c09 ++ SUB c10, t4, c10 ++ ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c09, t3, c09 ++ ADD5 c10, t4, c10 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ 
LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c03, t3, c03 ++ ADD5 c04, t4, c04 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) 
++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 ++ .align 4 ++ ++$L29: ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C1 ++ subl C, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ and M, 1, I ++ ble I, $L50 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L58 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L58 ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L57: ++ ADD3 c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi 
AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ++$L58: ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L50: ++ sra M, 1, I ++ ble I, $L59 ++ .align 4 ++ ++$L41: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(KK) ++ fclr c04 ++ fclr c08 ++ ++ ble KK, $L48 ++ ble L, $L45 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ++ ble TMP1, $L48 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ unop ++ MUL 
a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, c05 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L47 ++#else ++ blbs TMP1, $L47 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L47: ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++$L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, 
t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L41 ++ .align 4 ++ ++$L59: ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S b/kernel/sw_64/ztrsm_kernel_2x2_LT.S +new file mode 100644 +index 0000000..bb38b56 +--- /dev/null ++++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S +@@ -0,0 +1,2624 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 48 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++ .set noat ++ .set noreorder ++ .arch sw6a ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 88 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define tmp $9 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++#if defined(LN) || defined(LT) ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#else ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#endif ++ ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++ ldl OFFSET, 24 + STACKSIZE($sp) ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) 
++ stl tmp, 72($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ addl M, M, TMP2 ++ mull TMP2, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ TMP2, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ addl TMP1, TMP1, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 1, J ++ ble J, $L30 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(KK) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ FIMOVD b5, tmp ++/* 2 */ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 
* SIZE(BO) ++ ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ ldi L, -2(L) ++ IFMOVD tmp, b5 ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, t3 ++ ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ MUL b3, a1, t1 ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ MUL b3, 
a2, t2 ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++ MUL b4, a2, t3 ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ MUL b3, a3, t1 ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++ MUL b4, a4, t3 ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ADD c03, c08, b5 ++ fmov b5, c03 ++ ADD c04, c07, b5 ++ fmov b5, c04 ++ ++ ADD c09, c14, b5 ++ fmov b5, c09 ++ ADD c10, c13, b5 ++ fmov b5, c10 ++ ADD c11, c16, b5 ++ fmov b5, c11 ++ ADD c12, c15, b5 ++ fmov b5, c12 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c10, b5 ++ fmov b5, c10 ++ ++ SUB b1, c03, b5 ++ fmov b5, c03 ++ SUB b2, c04, b5 ++ fmov b5, c04 ++ SUB b3, c11, b5 ++ fmov b5, c11 ++ SUB b4, c12, b5 ++ fmov b5, c12 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++ ++ SUB b1, c09, b5 ++ fmov b5, c09 ++ SUB b2, c10, b5 ++ fmov b5, c10 ++ SUB b3, c11, b5 ++ fmov b5, c11 ++ SUB b4, c12, b5 ++ fmov b5, c12 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c10, t4, b5 ++ fmov b5, c10 ++ ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ADD6 c09, t3, b5 ++ fmov b5, c09 ++ ADD5 c10, t4, b5 ++ fmov b5, c10 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c09, t3, b5 ++ fmov b5, c09 ++ ADD6 c10, t4, b5 ++ fmov b5, c10 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD 
a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c09, t3, b5 ++ fmov b5, c09 ++ ADD6 c10, t4, b5 ++ fmov b5, c10 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c04, t2, b5 ++ fmov b5, c04 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, b5 ++ fmov b5, c03 ++ ADD5 c04, t2, b5 ++ fmov b5, c04 ++ ADD6 c11, t3, b5 ++ fmov b5, c11 ++ ADD5 c12, t4, b5 ++ fmov b5, c12 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c03, t3, b5 ++ fmov b5, c03 ++ ADD6 c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, b5 ++ fmov b5, c09 ++ ADD5 c10, t2, b5 ++ fmov b5, c10 ++ ADD6 c11, t3, b5 ++ fmov b5, c11 ++ ADD5 c12, t4, b5 ++ fmov b5, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ADD6 c03, t3, b5 ++ fmov b5, c03 ++ ADD5 c04, t4, b5 ++ fmov b5, 
c04 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c03, t3, b5 ++ fmov b5, c03 ++ ADD6 c04, t4, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 1, I ++ ble I, $L29 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a4, b1, t2 
++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ unop ++ IFMOVD tmp, b5 ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ IFMOVD tmp, b5 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ MUL a2, b1, t2 ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ADD c09, c14, b5 ++ fmov b5, c09 ++ ADD c10, c13, b5 ++ fmov b5, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c10, b5 ++ fmov b5, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c10, b5 ++ fmov b5, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c09, t3, b5 ++ fmov b5, c09 ++ ADD6 c10, t4, b5 ++ fmov b5, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * 
SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, b5 ++ fmov b5, c09 ++ ADD5 c10, t2, b5 ++ fmov b5, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L29: ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C1 ++ subl C, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ ble I, $L50 ++ .align 4 ++ ++$L41: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(KK) ++ fclr c04 ++ fclr c08 ++ ++ ble KK, $L48 ++ ble L, $L45 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, 
TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ++ ble TMP1, $L48 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L47 ++#else ++ blbs TMP1, $L47 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L47: ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ADD 
c03, c08, b5 ++ fmov b5, c03 ++ ADD c04, c07, b5 ++ fmov b5, c04 ++ ++$L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c04, t2, b5 ++ fmov b5, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, b5 ++ fmov b5, c03 ++ ADD5 c04, t2, b5 ++ fmov b5, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c03, t3, b5 ++ fmov b5, c03 ++ ADD6 c04, t4, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO 
++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L41 ++ .align 4 ++ ++$L50: ++ and M, 1, I ++ ble I, $L59 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L58 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L58 ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L57: ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b2, t3 ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ++$L58: ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ 
fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L59: ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldl tmp, 72($sp) ++ ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak +new file mode 100644 +index 0000000..f4a2c13 +--- /dev/null ++++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak +@@ -0,0 +1,2222 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW2B ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ .set noat ++ .set noreorder ++ .arch ev6 ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++#if defined(LN) || defined(LT) ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#else ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#endif ++ ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++ ldl OFFSET, 24 + STACKSIZE($sp) ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ addl M, M, TMP2 ++ mull 
TMP2, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ TMP2, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ addl TMP1, TMP1, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ sra N, 1, J ++ ble J, $L30 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(KK) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, 
c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, c11 ++ unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD3 c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD2 c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD4 c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD3 c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD4 c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ ADD c11, c16, c11 ++ ADD c12, c15, c12 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 
++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++ ++ SUB b1, c03, c03 ++ SUB b2, c04, c04 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c09, t3, c09 ++ SUB c10, t4, c10 ++ ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c09, t3, c09 ++ ADD5 c10, t4, c10 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ 
LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c03, t3, c03 ++ ADD5 c04, t4, c04 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 1, I ++ ble I, $L29 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ 
unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD4 c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ 
++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L29: ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L30: ++ and N, 1, J ++ ble J, $L999 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C1 ++ subl C, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ ble I, $L50 ++ .align 4 ++ ++$L41: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(KK) ++ fclr c04 ++ fclr c08 ++ ++ ble KK, $L48 ++ ble L, $L45 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ++ ble TMP1, $L48 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ 
MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, c05 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L47 ++#else ++ blbs TMP1, $L47 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L47: ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++$L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ 
LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L41 ++ .align 4 ++ ++$L50: ++ and M, 1, I ++ ble I, $L59 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L58 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L58 ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) 
|| defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L57: ++ ADD3 c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ++$L58: ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L59: ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S b/kernel/sw_64/ztrsm_kernel_2x2_RT.S +new file mode 100644 +index 0000000..97dbc16 +--- /dev/null ++++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S +@@ -0,0 +1,2623 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. 
Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++#if !defined(EV4) && !defined(EV5) && !defined(SW6) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW6 ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++#ifdef EV5 ++#define PREFETCHSIZE 48 ++#define UNOP ++#endif ++ ++#ifdef EV4 ++#define UNOP ++#endif ++ ++ .set noat ++ .set noreorder ++ .arch sw6a ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 88 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define tmp $9 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++#if defined(LN) || defined(LT) ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#else ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#endif ++ ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ 
++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++ ldl OFFSET, 24 + STACKSIZE($sp) ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ stl tmp, 72($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ addl M, M, TMP2 ++ mull TMP2, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ TMP2, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ addl TMP1, TMP1, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ and N, 1, J ++ ble J, $L30 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C1 ++ subl C, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ ble I, $L50 ++ .align 4 ++ ++$L41: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(KK) ++ fclr c04 ++ fclr c08 ++ ++ ble KK, $L48 ++ ble L, $L45 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ++ ble TMP1, $L48 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 
c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L47 ++#else ++ blbs TMP1, $L47 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ MUL a4, b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L47: ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, b5 ++ fmov b5, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, b5 ++ fmov b5, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, b5 ++ fmov b5, c05 ++ ADD2 c06, t2, b5 ++ fmov b5, c06 ++ ADD4 c07, t3, b5 ++ fmov b5, c07 ++ ADD2 c08, t4, b5 ++ fmov b5, c08 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ADD c03, c08, b5 ++ fmov b5, c03 ++ ADD c04, c07, b5 ++ fmov b5, c04 ++ ++$L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#ifdef LT 
++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c04, t2, b5 ++ fmov b5, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, b5 ++ fmov b5, c03 ++ ADD5 c04, t2, b5 ++ fmov b5, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c03, t3, b5 ++ fmov b5, c03 ++ ADD6 c04, t4, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L41 ++ .align 4 ++ ++$L50: ++ and M, 1, I ++ ble I, $L59 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L58 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L58 ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ LD b2, 3 * 
SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L57: ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b2, t3 ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ++$L58: ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L59: ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L30: ++ sra N, 1, J ++ ble J, $L999 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C ++#else ++ mov 
C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(KK) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ FIMOVD b5, tmp ++/* 2 */ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ ldi L, -2(L) ++ IFMOVD tmp, b5 ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ IFMOVD 
tmp, b5 ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ IFMOVD tmp, b5 ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, b5 ++ fmov b5, c06 ++ MUL b2, a4, t3 ++ ++ ADD4 c05, t4, b5 ++ fmov b5, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, b5 ++ fmov b5, c03 ++ MUL b3, a1, t1 ++ ++ ADD3 c04, t2, b5 ++ fmov b5, c04 ++ MUL b3, a2, t2 ++ ADD2 c08, t3, b5 ++ fmov b5, c08 ++ MUL b4, a2, t3 ++ ++ ADD4 c13, t4, b5 ++ fmov b5, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ MUL b3, a3, t1 ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, b5 ++ fmov b5, c14 ++ MUL b4, a4, t3 ++ ++ ADD4 c07, t4, b5 ++ fmov b5, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c11, t1, b5 ++ fmov b5, c11 ++ ADD3 c12, t2, b5 ++ fmov b5, c12 ++ ADD2 c16, t3, b5 ++ fmov b5, c16 ++ ADD4 c15, t4, b5 ++ fmov b5, c15 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ADD c03, c08, b5 
++ fmov b5, c03 ++ ADD c04, c07, b5 ++ fmov b5, c04 ++ ++ ADD c09, c14, b5 ++ fmov b5, c09 ++ ADD c10, c13, b5 ++ fmov b5, c10 ++ ADD c11, c16, b5 ++ fmov b5, c11 ++ ADD c12, c15, b5 ++ fmov b5, c12 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c10, b5 ++ fmov b5, c10 ++ ++ SUB b1, c03, b5 ++ fmov b5, c03 ++ SUB b2, c04, b5 ++ fmov b5, c04 ++ SUB b3, c11, b5 ++ fmov b5, c11 ++ SUB b4, c12, b5 ++ fmov b5, c12 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c03, b5 ++ fmov b5, c03 ++ SUB a4, c04, b5 ++ fmov b5, c04 ++ ++ SUB b1, c09, b5 ++ fmov b5, c09 ++ SUB b2, c10, b5 ++ fmov b5, c10 ++ SUB b3, c11, b5 ++ fmov b5, c11 ++ SUB b4, c12, b5 ++ fmov b5, c12 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c09, t3, b5 ++ fmov b5, c09 ++ SUB c10, t4, b5 ++ fmov b5, c10 ++ ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ADD6 c09, t3, b5 ++ fmov b5, c09 ++ ADD5 c10, t4, b5 ++ fmov b5, c10 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c09, t3, b5 ++ fmov b5, c09 ++ ADD6 c10, t4, b5 ++ fmov b5, c10 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c09, t3, b5 ++ fmov b5, c09 ++ ADD6 c10, t4, b5 ++ fmov b5, c10 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 ++ ++ SUB c03, t1, b5 ++ fmov b5, c03 ++ SUB c04, t2, b5 ++ fmov b5, c04 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 
++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, b5 ++ fmov b5, c03 ++ ADD5 c04, t2, b5 ++ fmov b5, c04 ++ ADD6 c11, t3, b5 ++ fmov b5, c11 ++ ADD5 c12, t4, b5 ++ fmov b5, c12 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ ADD5 c03, t1, b5 ++ fmov b5, c03 ++ ADD6 c04, t2, b5 ++ fmov b5, c04 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c03, t3, b5 ++ fmov b5, c03 ++ ADD6 c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ SUB c11, t3, b5 ++ fmov b5, c11 ++ SUB c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, b5 ++ fmov b5, c09 ++ ADD5 c10, t2, b5 ++ fmov b5, c10 ++ ADD6 c11, t3, b5 ++ fmov b5, c11 ++ ADD5 c12, t4, b5 ++ fmov b5, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ MUL a1, c11, b5 ++ fmov b5, c11 ++ MUL a1, c12, b5 ++ fmov b5, c12 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++ ADD5 c11, t3, b5 ++ fmov b5, c11 ++ ADD6 c12, t4, b5 ++ fmov b5, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ SUB c03, t3, b5 ++ fmov b5, c03 ++ SUB c04, t4, b5 ++ fmov b5, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ADD6 c03, t3, b5 ++ fmov b5, c03 ++ ADD5 c04, t4, b5 ++ fmov b5, c04 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c03, b5 ++ fmov b5, c03 ++ MUL a1, c04, b5 ++ fmov b5, c04 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c03, t3, b5 ++ fmov b5, c03 ++ ADD6 c04, t4, b5 ++ fmov b5, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * 
SIZE(BO) ++ ST c12, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 1, I ++ ble I, $L29 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ FIMOVD b5, tmp ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ unop ++ IFMOVD tmp, b5 ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ IFMOVD tmp, b5 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ 
ADD1 c09, t1, b5 ++ fmov b5, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ MUL a2, b1, t2 ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, b5 ++ fmov b5, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, b5 ++ fmov b5, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, b5 ++ fmov b5, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, b5 ++ fmov b5, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, b5 ++ fmov b5, c09 ++ ADD3 c10, t2, b5 ++ fmov b5, c10 ++ ADD4 c13, t3, b5 ++ fmov b5, c13 ++ ADD2 c14, t4, b5 ++ fmov b5, c14 ++ ++ ADD c01, c06, b5 ++ fmov b5, c01 ++ ADD c02, c05, b5 ++ fmov b5, c02 ++ ADD c09, c14, b5 ++ fmov b5, c09 ++ ADD c10, c13, b5 ++ fmov b5, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c10, b5 ++ fmov b5, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, b5 ++ fmov b5, c01 ++ SUB a2, c02, b5 ++ fmov b5, c02 ++ SUB a3, c09, b5 ++ fmov b5, c09 ++ SUB a4, c10, b5 ++ fmov b5, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ADD5 c09, t3, b5 ++ fmov b5, c09 ++ ADD6 c10, t4, b5 ++ fmov b5, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, b5 ++ fmov b5, c09 ++ SUB c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, b5 ++ fmov b5, c09 ++ ADD5 c10, t2, b5 ++ fmov b5, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, 
c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, b5 ++ fmov b5, c09 ++ MUL a1, c10, b5 ++ fmov b5, c10 ++ ++ ADD5 c09, t1, b5 ++ fmov b5, c09 ++ ADD6 c10, t2, b5 ++ fmov b5, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, b5 ++ fmov b5, c01 ++ SUB c02, t2, b5 ++ fmov b5, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, b5 ++ fmov b5, c01 ++ ADD5 c02, t2, b5 ++ fmov b5, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, b5 ++ fmov b5, c01 ++ MUL a1, c02, b5 ++ fmov b5, c02 ++ ++ ADD5 c01, t1, b5 ++ fmov b5, c01 ++ ADD6 c02, t2, b5 ++ fmov b5, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L29: ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ ldl tmp, 72($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak +new file mode 100644 +index 0000000..4d4f59d +--- /dev/null ++++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak +@@ -0,0 +1,2223 @@ ++/*********************************************************************/ ++/* Copyright 2009, 2010 The University of Texas at Austin. */ ++/* All rights reserved. */ ++/* */ ++/* Redistribution and use in source and binary forms, with or */ ++/* without modification, are permitted provided that the following */ ++/* conditions are met: */ ++/* */ ++/* 1. Redistributions of source code must retain the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer. */ ++/* */ ++/* 2. Redistributions in binary form must reproduce the above */ ++/* copyright notice, this list of conditions and the following */ ++/* disclaimer in the documentation and/or other materials */ ++/* provided with the distribution. */ ++/* */ ++/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ ++/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ ++/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ ++/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ ++/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ ++/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ ++/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ ++/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ ++/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ ++/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ ++/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ ++/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ ++/* POSSIBILITY OF SUCH DAMAGE. */ ++/* */ ++/* The views and conclusions contained in the software and */ ++/* documentation are those of the authors and should not be */ ++/* interpreted as representing official policies, either expressed */ ++/* or implied, of The University of Texas at Austin. */ ++/*********************************************************************/ ++ ++#define ASSEMBLER ++#include "common.h" ++#include "version.h" ++ ++ ++#if !defined(SW2B) ++#error "Architecture is not specified." ++#endif ++ ++#ifdef SW2B ++#define PREFETCHSIZE 56 ++#define UNOP unop ++#endif ++ ++ ++ .set noat ++ .set noreorder ++ .arch ev6 ++ ++.text ++ .align 5 ++ .globl CNAME ++ .ent CNAME ++ ++#define STACKSIZE 80 ++ ++#define M $16 ++#define N $17 ++#define K $18 ++#define A $21 ++#define B $22 ++#define C $20 ++#define LDC $23 ++ ++#define C1 $19 ++#define C2 $24 ++ ++#define AO $at ++#define BO $5 ++#define I $6 ++#define J $7 ++#define L $8 ++ ++#define a1 $f16 ++#define a2 $f17 ++#define a3 $f18 ++#define a4 $f19 ++ ++#define b1 $f20 ++#define b2 $f21 ++#define b3 $f22 ++#define b4 $f23 ++ ++#define t1 $f24 ++#define t2 $f25 ++#define t3 $f26 ++#define t4 $f27 ++ ++#define a5 $f28 ++#define a6 $f30 ++#define b5 $f29 ++ ++#define alpha_i $f29 ++#define alpha_r $f30 ++ ++#define c01 $f0 ++#define c02 $f1 ++#define c03 $f2 ++#define c04 $f3 ++ ++#define c05 $f4 ++#define c06 $f5 ++#define c07 $f6 ++#define c08 $f7 ++ ++#define c09 $f8 ++#define c10 $f9 ++#define c11 $f10 ++#define c12 $f11 ++ ++#define c13 $f12 ++#define c14 $f13 ++#define c15 $f14 ++#define c16 $f15 ++ ++#define TMP1 $0 ++#define TMP2 $1 ++#define KK $2 ++#define AORIG $3 ++#define OFFSET $4 ++ ++#if defined(LN) || defined(LT) ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 SUB ++#define ADD4 ADD ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#else ++#ifndef CONJ ++#define ADD1 ADD ++#define ADD2 SUB ++#define ADD3 ADD ++#define ADD4 ADD ++#define ADD5 SUB ++#define ADD6 ADD ++#else ++#define ADD1 ADD ++#define ADD2 ADD ++#define ADD3 ADD ++#define ADD4 SUB ++#define ADD5 ADD ++#define ADD6 SUB ++#endif ++#endif ++ ++ ++CNAME: ++ .frame $sp, STACKSIZE, $26, 0 ++ ++#ifdef PROFILE ++ ldgp $gp, 0($27) ++ ldi $at, _mcount ++ jsr $at, ($at), _mcount ++#endif ++ ++#ifndef PROFILE ++ .prologue 0 ++#else ++ .prologue 1 ++#endif ++ ++ ldi $sp, -STACKSIZE($sp) ++ ++ ldl B, 0 + STACKSIZE($sp) ++ ldl C, 8 + STACKSIZE($sp) ++ ldl LDC, 16 + STACKSIZE($sp) ++ ldl OFFSET, 24 + STACKSIZE($sp) ++ ++ sll LDC, ZBASE_SHIFT, LDC ++ ++ fstd $f2, 0($sp) ++ fstd $f3, 8($sp) ++ fstd $f4, 16($sp) ++ fstd $f5, 24($sp) ++ fstd $f6, 32($sp) ++ fstd $f7, 40($sp) ++ fstd $f8, 48($sp) ++ fstd $f9, 56($sp) ++ ++ cmple M, 0, $0 ++ cmple N, 0, $1 ++ cmple K, 0, $2 ++ ++ or $0, $1, $0 ++ or $0, $2, $0 ++ bne $0, $L999 ++ ++#ifdef LN ++ addl M, M, TMP2 ++ mull 
TMP2, K, TMP1 ++ SXADDQ TMP1, A, A ++ SXADDQ TMP2, C, C ++#endif ++ ++#ifdef RN ++ negl OFFSET, KK ++#endif ++ ++#ifdef RT ++ mull N, K, TMP1 ++ addl TMP1, TMP1, TMP1 ++ SXADDQ TMP1, B, B ++ ++ mull N, LDC, TMP1 ++ addl TMP1, C, C ++ ++ subl N, OFFSET, KK ++#endif ++ ++ and N, 1, J ++ ble J, $L30 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C1 ++ subl C, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ ble I, $L50 ++ .align 4 ++ ++$L41: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(B) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(KK) ++ fclr c04 ++ fclr c08 ++ ++ ble KK, $L48 ++ ble L, $L45 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi BO, 2 * SIZE(BO) ++ fclr c03 ++ ldi AO, 4 * SIZE(AO) ++ fclr c07 ++ ++ ldi L, -2(TMP1) ++ fclr c04 ++ fclr c08 ++ ++ ble TMP1, $L48 ++ ble L, $L45 ++#endif ++ .align 5 ++ ++$L42: ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD2 c06, t2, c06 ++ ldi L, -2(L) ++ MUL a2, b1, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b1, t3 ++ unop ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ ldi BO, 4 * SIZE(BO) ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ unop ++ MUL a4, b2, t4 ++ LD a5, 3 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ unop ++ MUL a1, b3, t1 ++ LD b2, -1 * SIZE(BO) ++ ++ ADD2 c06, t2, c06 ++ unop ++ MUL a2, b3, t2 ++ unop ++ ++ ADD4 c07, t3, c07 ++ unop ++ MUL a3, b3, t3 ++ ldi AO, 8 * SIZE(AO) ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a5, b3, t4 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b4, t1 ++ LD a1, -4 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b4, t2 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ LD a4, -1 * SIZE(AO) ++ MUL a3, b4, t3 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a5, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ bgt L, $L42 ++ .align 4 ++ ++$L45: ++ ADD4 c05, t1, c05 ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L47 ++#else ++ blbs TMP1, $L47 ++#endif ++ .align 4 ++ ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ unop ++ MUL a4, b1, t4 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b2, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b2, t2 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c03, t3, c03 ++ unop ++ MUL a3, b2, t3 ++ LD a3, 2 * SIZE(AO) ++ ++ ADD3 c04, t4, c04 ++ MUL a4, 
b2, t4 ++ LD a4, 3 * SIZE(AO) ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD4 c05, t1, c05 ++ LD b2, 1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 2 * SIZE(BO) ++ .align 4 ++ ++$L47: ++ ADD2 c06, t2, c06 ++ MUL a2, b1, t2 ++ ADD4 c07, t3, c07 ++ MUL a3, b1, t3 ++ ++ ADD2 c08, t4, c08 ++ MUL a4, b1, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b2, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b2, t2 ++ ADD1 c03, t3, c03 ++ MUL a3, b2, t3 ++ ++ ADD3 c04, t4, c04 ++ ldi AO, 4 * SIZE(AO) ++ MUL a4, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD4 c05, t1, c05 ++ ADD2 c06, t2, c06 ++ ADD4 c07, t3, c07 ++ ADD2 c08, t4, c08 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, c03 ++ ADD c04, c07, c04 ++ ++$L48: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 1, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c03, 2 * SIZE(BO) ++ ST c04, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO 
++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ ++ ldi I, -1(I) ++ bgt I, $L41 ++ .align 4 ++ ++$L50: ++ and M, 1, I ++ ble I, $L59 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c01 ++ LD b2, 1 * SIZE(B) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c02 ++ LD b4, 3 * SIZE(B) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L58 ++ ble L, $L55 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr t1 ++ LD a2, 1 * SIZE(AO) ++ fclr t2 ++ LD a3, 2 * SIZE(AO) ++ fclr t3 ++ LD a4, 3 * SIZE(AO) ++ fclr t4 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c01 ++ LD b2, 1 * SIZE(BO) ++ fclr c05 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c02 ++ LD b4, 3 * SIZE(BO) ++ fclr c06 ++ ++ ldi AO, 2 * SIZE(AO) ++ ldi BO, 2 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L58 ++ ble L, $L55 ++#endif ++ .align 5 ++ ++$L52: ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ ldi AO, 4 * SIZE(AO) ++ MUL a2, b1, t2 ++ LD b1, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi L, -2(L) ++ MUL a1, b2, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, 3 * SIZE(BO) ++ MUL a3, b3, t1 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b4, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b4, t4 ++ LD b4, 1 * SIZE(BO) ++ unop ++ ++ LD a4, 1 * SIZE(AO) ++ unop ++ unop ++ bgt L, $L52 ++ .align 4 ++ ++$L55: ++ ADD1 c01, t1, c01 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L57 ++#else ++ blbs TMP1, $L57 ++#endif ++ .align 4 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ ldi BO, 2 * SIZE(BO) ++ MUL a1, b2, t3 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b2, t4 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD1 c01, t1, c01 ++ LD b2, -1 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi AO, 2 * SIZE(AO) ++ .align 4 ++ ++$L57: ++ ADD3 c02, t2, c02 ++ MUL a2, b1, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b2, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b2, t4 ++ ldi BO, 2 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ADD3 c02, t2, c02 ++ ADD4 c05, t3, c05 ++ ADD2 c06, t4, c06 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ++$L58: ++#if defined(LN) || defined(RT) ++ subl KK, 1, TMP1 ++ ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -2 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(RN) || defined(RT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, 
c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L59: ++#ifdef LN ++ sll K, ZBASE_SHIFT, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 1, KK ++#endif ++ ++#ifdef RT ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L30: ++ sra N, 1, J ++ ble J, $L999 ++ .align 4 ++ ++$L01: ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl B, TMP1, B ++ ++ subl C, LDC, C2 ++ subl C2, LDC, C1 ++ subl C2, LDC, C ++#else ++ mov C, C1 ++ addl C, LDC, C2 ++ addl C2, LDC, C ++#endif ++ ++#ifdef LN ++ addl M, OFFSET, KK ++#endif ++ ++#ifdef LT ++ mov OFFSET, KK ++#endif ++ ++#if defined(LN) || defined(RT) ++ mov A, AORIG ++#else ++ mov A, AO ++#endif ++ ++ sra M, 1, I ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++ fclr c01 ++ fclr c05 ++ ++ ble I, $L20 ++ .align 4 ++ ++$L11: ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ fclr c03 ++ LD b4, 3 * SIZE(B) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(B) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(KK) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble KK, $L18 ++ ble L, $L15 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AO ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ fclr c03 ++ LD b4, 3 * SIZE(BO) ++ fclr c07 ++ ++ ldi BO, 4 * SIZE(BO) ++ fclr c11 ++ ldi AO, 4 * SIZE(AO) ++ fclr c15 ++ ++ fillcs 4 * SIZE(C1) ++ fclr c04 ++ ldi L, -2(TMP1) ++ fclr c08 ++ ++ fillcs 4 * SIZE(C2) ++ fclr c12 ++ fclr c16 ++ ble TMP1, $L18 ++ ble L, $L15 ++#endif ++ .align 5 ++ ++$L12: ++/* 1 */ ++ ADD1 c11, t1, c11 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(AO) ++#else ++ unop ++#endif ++ MUL b1, a1, t1 ++#ifndef EV4 ++ fillcs PREFETCHSIZE * SIZE(BO) ++#else ++ unop ++#endif ++ ++ ADD3 c12, t2, c12 ++ unop ++ MUL b1, a2, t2 ++ unop ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ LD a5, 0 * SIZE(AO) ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a1, t4 ++ LD b5, 0 * SIZE(BO) ++ ++/* 2 */ ++ ADD1 c01, t1, c01 ++ UNOP ++ MUL b1, a3, t1 ++ UNOP ++ ++ ADD3 c02, t2, c02 ++ UNOP ++ MUL b1, a4, t2 ++ UNOP ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a1, t4 ++ unop ++ ++/* 3 */ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ 
ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++/* 4 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ LD a6, 2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, 3 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD b4, 3 * SIZE(BO) ++ ++/* 5 */ ++ ADD1 c11, t1, c11 ++ unop ++ MUL b5, a5, t1 ++ LD a1, 4 * SIZE(AO) ++ ++ ADD3 c12, t2, c12 ++ ldi L, -2(L) ++ MUL b5, a2, t2 ++ LD b1, 4 * SIZE(BO) ++ ++ ADD2 c16, t3, c16 ++ unop ++ MUL b2, a2, t3 ++ unop ++ ++ ADD4 c15, t4, c15 ++ unop ++ MUL b2, a5, t4 ++ unop ++ ++/* 6 */ ++ ADD1 c01, t1, c01 ++ unop ++ MUL b5, a6, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b5, a4, t2 ++ unop ++ ++ ADD2 c06, t3, c06 ++ unop ++ MUL b2, a4, t3 ++ unop ++ ++ ADD4 c05, t4, c05 ++ unop ++ MUL b4, a5, t4 ++ unop ++ ++/* 7 */ ++ ADD1 c03, t1, c03 ++ ldi AO, 8 * SIZE(AO) ++ MUL b3, a5, t1 ++ unop ++ ++ ADD3 c04, t2, c04 ++ ldi BO, 8 * SIZE(BO) ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, -3 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a6, t4 ++ LD b2, -3 * SIZE(BO) ++ ++/* 8 */ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a6, t1 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ MUL b4, a6, t4 ++ LD b4, -1 * SIZE(BO) ++ bgt L, $L12 ++ .align 4 ++ ++$L15: ++ ADD1 c11, t1, c11 ++ unop ++ MUL b1, a1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L17 ++#else ++ blbs TMP1, $L17 ++#endif ++ .align 4 ++ ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL b1, a4, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ++ ADD1 c03, t1, c03 ++ unop ++ MUL b3, a1, t1 ++ LD a1, 0 * SIZE(AO) ++ ++ ADD3 c04, t2, c04 ++ unop ++ MUL b3, a2, t2 ++ unop ++ ++ ADD2 c08, t3, c08 ++ unop ++ MUL b4, a2, t3 ++ LD a2, 1 * SIZE(AO) ++ ++ ADD4 c13, t4, c13 ++ unop ++ MUL b2, a3, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL b3, a3, t1 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL b3, a4, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD2 c14, t3, c14 ++ unop ++ MUL b4, a4, t3 ++ LD a4, -1 * SIZE(AO) ++ ++ ADD4 c07, t4, c07 ++ unop ++ MUL b4, a3, t4 ++ LD a3, -2 * SIZE(AO) ++ ++ ADD1 c11, t1, c11 ++ LD b4, 3 * SIZE(BO) ++ MUL b1, a1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L17: ++ ADD3 c12, t2, c12 ++ MUL b1, a2, t2 ++ ADD2 c16, t3, c16 ++ MUL b2, a2, t3 ++ ++ ADD4 c15, t4, c15 ++ MUL b2, a1, t4 ++ ADD1 c01, t1, c01 ++ MUL b1, a3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL b1, a4, t2 ++ ADD2 c06, t3, c06 ++ MUL b2, a4, t3 ++ ++ ADD4 c05, t4, c05 ++ MUL b4, a1, t4 ++ ADD1 c03, t1, c03 ++ MUL b3, a1, t1 ++ ++ ADD3 c04, t2, c04 ++ MUL b3, a2, t2 ++ ADD2 c08, t3, c08 ++ MUL b4, a2, t3 ++ ++ ADD4 c13, t4, c13 ++ MUL b2, a3, t4 ++ ADD1 c09, t1, c09 ++ MUL b3, a3, t1 ++ ++ ADD3 c10, t2, c10 ++ MUL b3, a4, t2 ++ ADD2 c14, t3, c14 ++ MUL b4, a4, t3 ++ ++ ADD4 c07, t4, c07 ++ ldi AO, 4 * SIZE(AO) ++ MUL b4, a3, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c11, t1, c11 ++ ADD3 c12, t2, c12 ++ ADD2 c16, t3, c16 ++ ADD4 c15, t4, c15 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c03, c08, 
c03 ++ ADD c04, c07, c04 ++ ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ ADD c11, c16, c11 ++ ADD c12, c15, c12 ++ .align 4 ++ ++$L18: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 2, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -4 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ LD b1, 4 * SIZE(BO) ++ LD b2, 5 * SIZE(BO) ++ LD b3, 6 * SIZE(BO) ++ LD b4, 7 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++ ++ SUB b1, c03, c03 ++ SUB b2, c04, c04 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ LD b1, 4 * SIZE(AO) ++ LD b2, 5 * SIZE(AO) ++ LD b3, 6 * SIZE(AO) ++ LD b4, 7 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c03, c03 ++ SUB a4, c04, c04 ++ ++ SUB b1, c09, c09 ++ SUB b2, c10, c10 ++ SUB b3, c11, c11 ++ SUB b4, c12, c12 ++#endif ++ ++#ifdef LN ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ LD a3, 4 * SIZE(AO) ++ LD a4, 5 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c03, t1 ++ MUL a3, c04, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c09, t3, c09 ++ SUB c10, t4, c10 ++ ++ MUL a4, c04, t1 ++ MUL a4, c03, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c09, t3, c09 ++ ADD5 c10, t4, c10 ++ ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef LT ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ MUL a3, c09, t3 ++ MUL a3, c10, t4 ++ ++ SUB c03, t1, c03 ++ SUB c04, t2, c04 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c10, t3 ++ MUL a4, c09, t4 ++ ++ ADD6 c03, t1, c03 ++ ADD5 c04, t2, c04 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(AO) ++ LD a2, 7 * SIZE(AO) ++ ++ MUL a2, c04, t1 ++ MUL a2, c03, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c03, t1, c03 ++ ADD6 c04, t2, c04 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, 
t2 ++ MUL a3, c03, t3 ++ MUL a3, c04, t4 ++ ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ SUB c11, t3, c11 ++ SUB c12, t4, c12 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ MUL a4, c04, t3 ++ MUL a4, c03, t4 ++ ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ADD6 c11, t3, c11 ++ ADD5 c12, t4, c12 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a2, c12, t3 ++ MUL a2, c11, t4 ++ ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ MUL a1, c11, c11 ++ MUL a1, c12, c12 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ADD5 c11, t3, c11 ++ ADD6 c12, t4, c12 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ MUL a3, c11, t3 ++ MUL a3, c12, t4 ++ ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ SUB c03, t3, c03 ++ SUB c04, t4, c04 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ MUL a4, c12, t3 ++ MUL a4, c11, t4 ++ ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ADD6 c03, t3, c03 ++ ADD5 c04, t4, c04 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a2, c04, t3 ++ MUL a2, c03, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c03, c03 ++ MUL a1, c04, c04 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c03, t3, c03 ++ ADD6 c04, t4, c04 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++ ++ ST c03, 4 * SIZE(BO) ++ ST c04, 5 * SIZE(BO) ++ ST c11, 6 * SIZE(BO) ++ ST c12, 7 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c03, 2 * SIZE(AO) ++ ST c04, 3 * SIZE(AO) ++ ++ ST c09, 4 * SIZE(AO) ++ ST c10, 5 * SIZE(AO) ++ ST c11, 6 * SIZE(AO) ++ ST c12, 7 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -4 * SIZE(C1) ++ ldi C2, -4 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c03, 2 * SIZE(C1) ++ ST c04, 3 * SIZE(C1) ++ ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ST c11, 2 * SIZE(C2) ++ ST c12, 3 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 4 * SIZE(C1) ++ ldi C2, 4 * SIZE(C2) ++#endif ++ ++ fclr t1 ++ fclr t2 ++ fclr t3 ++ fclr t4 ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 1, TMP1 ++ addl AO, TMP1, AO ++ addl BO, TMP1, BO ++#endif ++ ++#ifdef LT ++ addl KK, 2, KK ++#endif ++ ++#ifdef LN ++ subl KK, 2, KK ++#endif ++ fclr c01 ++ fclr c05 ++ ++ ldi I, -1(I) ++ bgt I, $L11 ++ .align 4 ++ ++$L20: ++ and M, 1, I ++ ble I, $L29 ++ ++#if defined(LT) || defined(RN) ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(B) ++ fclr c10 ++ LD b2, 1 * SIZE(B) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(B) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(B) ++ ldi BO, 4 * SIZE(B) ++ ++ ldi L, -2(KK) ++ ++ ble KK, $L28 ++ ble L, $L25 ++#else ++#ifdef LN ++ sll K, ZBASE_SHIFT + 0, TMP1 ++ subl AORIG, TMP1, AORIG ++#endif ++ ++ sll KK, ZBASE_SHIFT + 0, TMP1 ++ addl AORIG, TMP1, AO ++ sll KK, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, BO ++ ++ subl K, KK, TMP1 ++ ++ LD a1, 0 * SIZE(AO) ++ fclr c09 ++ LD a2, 1 * SIZE(AO) ++ fclr c13 ++ ++ LD a3, 
2 * SIZE(AO) ++ fclr c02 ++ LD a4, 3 * SIZE(AO) ++ fclr c06 ++ ++ LD b1, 0 * SIZE(BO) ++ fclr c10 ++ LD b2, 1 * SIZE(BO) ++ fclr c14 ++ ++ LD b3, 2 * SIZE(BO) ++ ldi AO, 2 * SIZE(AO) ++ LD b4, 3 * SIZE(BO) ++ ldi BO, 4 * SIZE(BO) ++ ++ ldi L, -2(TMP1) ++ ++ ble TMP1, $L28 ++ ble L, $L25 ++#endif ++ .align 5 ++ ++$L22: ++ ADD1 c09, t1, c09 ++ unop ++ MUL a1, b1, t1 ++ unop ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ ldi BO, 8 * SIZE(BO) ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, -7 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ unop ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, -6 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, 2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a2, b4, t4 ++ LD b5, -5 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ unop ++ MUL a3, b1, t1 ++ LD a2, 3 * SIZE(AO) ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a4, b1, t2 ++ LD b1, -4 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a3, b2, t3 ++ ldi AO, 4 * SIZE(AO) ++ ++ ADD2 c14, t4, c14 ++ MUL a4, b2, t4 ++ LD b2, -3 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ ldi L, -2(L) ++ MUL a3, b3, t1 ++ LD b4, -1 * SIZE(BO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a4, b3, t2 ++ LD b3, -2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a3, b5, t3 ++ LD a3, 0 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ MUL a4, b5, t4 ++ LD a4, 1 * SIZE(AO) ++ bgt L, $L22 ++ .align 4 ++ ++$L25: ++ ADD1 c09, t1, c09 ++ MUL a1, b1, t1 ++#if defined(LT) || defined(RN) ++ blbs KK, $L27 ++#else ++ blbs TMP1, $L27 ++#endif ++ .align 4 ++ ++ ADD3 c10, t2, c10 ++ unop ++ MUL a2, b1, t2 ++ LD b1, 0 * SIZE(BO) ++ ++ ADD4 c13, t3, c13 ++ unop ++ MUL a1, b2, t3 ++ unop ++ ++ ADD2 c14, t4, c14 ++ unop ++ MUL a2, b2, t4 ++ LD b2, 1 * SIZE(BO) ++ ++ ADD1 c01, t1, c01 ++ unop ++ MUL a1, b3, t1 ++ ldi AO, 2 * SIZE(AO) ++ ++ ADD3 c02, t2, c02 ++ unop ++ MUL a2, b3, t2 ++ LD b3, 2 * SIZE(BO) ++ ++ ADD4 c05, t3, c05 ++ unop ++ MUL a1, b4, t3 ++ LD a1, -2 * SIZE(AO) ++ ++ ADD2 c06, t4, c06 ++ unop ++ MUL a2, b4, t4 ++ LD a2, -1 * SIZE(AO) ++ ++ ADD1 c09, t1, c09 ++ LD b4, 3 * SIZE(BO) ++ MUL a1, b1, t1 ++ ldi BO, 4 * SIZE(BO) ++ .align 4 ++ ++$L27: ++ ADD3 c10, t2, c10 ++ MUL a2, b1, t2 ++ ADD4 c13, t3, c13 ++ MUL a1, b2, t3 ++ ++ ADD2 c14, t4, c14 ++ MUL a2, b2, t4 ++ ADD1 c01, t1, c01 ++ MUL a1, b3, t1 ++ ++ ADD3 c02, t2, c02 ++ MUL a2, b3, t2 ++ ADD4 c05, t3, c05 ++ MUL a1, b4, t3 ++ ++ ADD2 c06, t4, c06 ++ ldi AO, 2 * SIZE(AO) ++ MUL a2, b4, t4 ++ ldi BO, 4 * SIZE(BO) ++ ++ ADD1 c09, t1, c09 ++ ADD3 c10, t2, c10 ++ ADD4 c13, t3, c13 ++ ADD2 c14, t4, c14 ++ ++ ADD c01, c06, c01 ++ ADD c02, c05, c02 ++ ADD c09, c14, c09 ++ ADD c10, c13, c10 ++ .align 4 ++ ++$L28: ++#if defined(LN) || defined(RT) ++#ifdef LN ++ subl KK, 1, TMP1 ++#else ++ subl KK, 2, TMP1 ++#endif ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AORIG, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl B, TMP2, BO ++#else ++ ldi AO, -2 * SIZE(AO) ++ ldi BO, -4 * SIZE(BO) ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#else ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ LD a3, 2 * SIZE(AO) ++ LD a4, 3 * SIZE(AO) ++ ++ SUB a1, c01, c01 ++ SUB a2, c02, c02 ++ SUB a3, c09, c09 ++ SUB a4, c10, c10 ++#endif ++ ++#if defined(LN) || defined(LT) ++ LD a1, 0 * SIZE(AO) ++ LD a2, 1 * SIZE(AO) ++ ++ MUL a2, c02, t1 ++ 
MUL a2, c01, t2 ++ MUL a2, c10, t3 ++ MUL a2, c09, t4 ++ ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ADD5 c09, t3, c09 ++ ADD6 c10, t4, c10 ++#endif ++ ++#ifdef RN ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ LD a3, 2 * SIZE(BO) ++ LD a4, 3 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++ ++ MUL a3, c01, t1 ++ MUL a3, c02, t2 ++ SUB c09, t1, c09 ++ SUB c10, t2, c10 ++ ++ MUL a4, c02, t1 ++ MUL a4, c01, t2 ++ ADD6 c09, t1, c09 ++ ADD5 c10, t2, c10 ++ ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++#endif ++ ++#ifdef RT ++ LD a1, 6 * SIZE(BO) ++ LD a2, 7 * SIZE(BO) ++ LD a3, 4 * SIZE(BO) ++ LD a4, 5 * SIZE(BO) ++ ++ MUL a2, c10, t1 ++ MUL a2, c09, t2 ++ MUL a1, c09, c09 ++ MUL a1, c10, c10 ++ ++ ADD5 c09, t1, c09 ++ ADD6 c10, t2, c10 ++ ++ MUL a3, c09, t1 ++ MUL a3, c10, t2 ++ SUB c01, t1, c01 ++ SUB c02, t2, c02 ++ ++ MUL a4, c10, t1 ++ MUL a4, c09, t2 ++ ADD6 c01, t1, c01 ++ ADD5 c02, t2, c02 ++ ++ LD a1, 0 * SIZE(BO) ++ LD a2, 1 * SIZE(BO) ++ ++ MUL a2, c02, t1 ++ MUL a2, c01, t2 ++ MUL a1, c01, c01 ++ MUL a1, c02, c02 ++ ++ ADD5 c01, t1, c01 ++ ADD6 c02, t2, c02 ++#endif ++ ++#if defined(LN) || defined(LT) ++ ST c01, 0 * SIZE(BO) ++ ST c02, 1 * SIZE(BO) ++ ST c09, 2 * SIZE(BO) ++ ST c10, 3 * SIZE(BO) ++#else ++ ST c01, 0 * SIZE(AO) ++ ST c02, 1 * SIZE(AO) ++ ST c09, 2 * SIZE(AO) ++ ST c10, 3 * SIZE(AO) ++#endif ++ ++#ifdef LN ++ ldi C1, -2 * SIZE(C1) ++ ldi C2, -2 * SIZE(C2) ++#endif ++ ++ ST c01, 0 * SIZE(C1) ++ ST c02, 1 * SIZE(C1) ++ ST c09, 0 * SIZE(C2) ++ ST c10, 1 * SIZE(C2) ++ ++#ifndef LN ++ ldi C1, 2 * SIZE(C1) ++ ldi C2, 2 * SIZE(C2) ++#endif ++ ++#ifdef RT ++ sll K, ZBASE_SHIFT, TMP1 ++ addl AORIG, TMP1, AORIG ++#endif ++ ++#if defined(LT) || defined(RN) ++ subl K, KK, TMP1 ++ sll TMP1, ZBASE_SHIFT + 0, TMP2 ++ addl AO, TMP2, AO ++ sll TMP1, ZBASE_SHIFT + 1, TMP2 ++ addl BO, TMP2, BO ++#endif ++ ++#ifdef LT ++ addl KK, 1, KK ++#endif ++ ++#ifdef LN ++ subl KK, 1, KK ++#endif ++ .align 4 ++ ++$L29: ++#ifdef LN ++ sll K, ZBASE_SHIFT + 1, TMP1 ++ addl B, TMP1, B ++#endif ++ ++#if defined(LT) || defined(RN) ++ mov BO, B ++#endif ++ ++#ifdef RN ++ addl KK, 2, KK ++#endif ++ ++#ifdef RT ++ subl KK, 2, KK ++#endif ++ ++ ldi J, -1(J) ++ bgt J, $L01 ++ .align 4 ++ ++$L999: ++ fldd $f2, 0($sp) ++ fldd $f3, 8($sp) ++ fldd $f4, 16($sp) ++ fldd $f5, 24($sp) ++ fldd $f6, 32($sp) ++ fldd $f7, 40($sp) ++ fldd $f8, 48($sp) ++ fldd $f9, 56($sp) ++ clr $0 ++ ldi $sp, STACKSIZE($sp) ++ ret ++ .ident VERSION ++ .end CNAME +diff --git a/lapack/laswp/sw_64/Makefile b/lapack/laswp/sw_64/Makefile +new file mode 100644 +index 0000000..af1f019 +--- /dev/null ++++ b/lapack/laswp/sw_64/Makefile +@@ -0,0 +1,8 @@ ++TOPDIR = ../../.. ++include ../../../Makefile.system ++ ++LASWP = ../generic/laswp_k_1.c ++ZLASWP = ../generic/zlaswp_k_1.c ++ ++include ../generic/Makefile ++ +diff --git a/param.h b/param.h +index ee4640f..1a5f361 100644 +--- a/param.h ++++ b/param.h +@@ -2128,7 +2128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + #endif + +-#if defined(EV4) || defined(EV5) || defined(EV6) ++#if defined(EV4) || defined(EV5) || defined(SW6) + + #ifdef EV4 + #define SNUMOPT 1 +@@ -2140,7 +2140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + #define GEMM_DEFAULT_OFFSET_A 512 + #define GEMM_DEFAULT_OFFSET_B 512 +-#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL ++#define GEMM_DEFAULT_ALIGN 0x0ffffUL ++//#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL + + #define SGEMM_DEFAULT_UNROLL_M 4 + #define SGEMM_DEFAULT_UNROLL_N 4 +@@ -2185,7 +2186,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #define ZGEMM_DEFAULT_Q 64 + #endif + +-#ifdef EV6 ++#ifdef SW6 + #define SGEMM_DEFAULT_P 256 + #define SGEMM_DEFAULT_Q 512 + +-- +2.31.1 + diff --git a/openblas.spec b/openblas.spec index 28e6d17e1f3aec8ec92b9b7207ade0d390f42f11..de2c629053df1bbc06b1819e34550de589f194fb 100644 --- a/openblas.spec +++ b/openblas.spec @@ -1,4 +1,4 @@ -%define anolis_release 2 +%define anolis_release 3 %bcond_with system_lapack %global lapackver 3.11.0 @@ -209,6 +209,9 @@ cd OpenBLAS-%{version} %patch0003 -p1 -b .tests %patch0004 -p2 #%patch0004 -p1 -b .Add-opt-for-LoongArch64 +%ifarch sw_64 +%patch0005 -p1 +%endif find -name \*.f -exec chmod 644 {} \; @@ -351,7 +354,11 @@ make -C serial USE_THREAD=0 PREFIX=%{buildroot} OPENBLAS_LIBRARY_DIR=%{buildroot cp -a %{_includedir}/lapacke %{buildroot}%{_includedir}/%{name} %endif +%ifarch sw_64 +suffix="_sw6" +%else suffix="" +%endif slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a if [[ "$suffix" != "" ]]; then @@ -528,6 +535,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Wed Apr 23 2025 Chunchao Zhang - 0.3.28-3 +- Add sw_64 support + * Thu Mar 27 2025 Wenlong Zhang - 0.3.28-2 - Fixed the undefined reference to blas_set_parameter